diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,128833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0286513694778567, + "eval_steps": 500, + "global_step": 18400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00016591314446887055, + "grad_norm": 85.19542448446617, + "learning_rate": 3.0000000000000004e-08, + "loss": 8.5863, + "step": 1 + }, + { + "epoch": 0.0003318262889377411, + "grad_norm": 82.32409405573297, + "learning_rate": 6.000000000000001e-08, + "loss": 8.473, + "step": 2 + }, + { + "epoch": 0.0004977394334066117, + "grad_norm": 87.13591882926195, + "learning_rate": 9e-08, + "loss": 8.6763, + "step": 3 + }, + { + "epoch": 0.0006636525778754822, + "grad_norm": 84.18601565903892, + "learning_rate": 1.2000000000000002e-07, + "loss": 8.4935, + "step": 4 + }, + { + "epoch": 0.0008295657223443527, + "grad_norm": 84.14667729364332, + "learning_rate": 1.5e-07, + "loss": 8.5695, + "step": 5 + }, + { + "epoch": 0.0009954788668132234, + "grad_norm": 84.10731463288944, + "learning_rate": 1.8e-07, + "loss": 8.5366, + "step": 6 + }, + { + "epoch": 0.0011613920112820938, + "grad_norm": 83.28625999969219, + "learning_rate": 2.1000000000000003e-07, + "loss": 8.5134, + "step": 7 + }, + { + "epoch": 0.0013273051557509644, + "grad_norm": 84.44997731146951, + "learning_rate": 2.4000000000000003e-07, + "loss": 8.5326, + "step": 8 + }, + { + "epoch": 0.0014932183002198348, + "grad_norm": 89.14194809970635, + "learning_rate": 2.7e-07, + "loss": 8.722, + "step": 9 + }, + { + "epoch": 0.0016591314446887055, + "grad_norm": 84.83466343564557, + "learning_rate": 3e-07, + "loss": 8.555, + "step": 10 + }, + { + "epoch": 0.001825044589157576, + "grad_norm": 81.56216222466577, + "learning_rate": 3.3e-07, + "loss": 8.4491, + "step": 11 + }, + { + "epoch": 0.0019909577336264467, + "grad_norm": 86.81209990901495, + "learning_rate": 3.6e-07, + "loss": 8.6371, + "step": 12 + }, + { + "epoch": 0.002156870878095317, + "grad_norm": 85.71908305594017, + "learning_rate": 3.9e-07, + "loss": 8.5535, + "step": 13 + }, + { + "epoch": 0.0023227840225641875, + "grad_norm": 86.9623893470603, + "learning_rate": 4.2000000000000006e-07, + "loss": 8.6119, + "step": 14 + }, + { + "epoch": 0.002488697167033058, + "grad_norm": 82.49495302426142, + "learning_rate": 4.5000000000000003e-07, + "loss": 8.441, + "step": 15 + }, + { + "epoch": 0.002654610311501929, + "grad_norm": 81.33695392158589, + "learning_rate": 4.800000000000001e-07, + "loss": 8.4148, + "step": 16 + }, + { + "epoch": 0.0028205234559707994, + "grad_norm": 84.88167396340097, + "learning_rate": 5.1e-07, + "loss": 8.5835, + "step": 17 + }, + { + "epoch": 0.0029864366004396696, + "grad_norm": 79.05682174134483, + "learning_rate": 5.4e-07, + "loss": 8.3251, + "step": 18 + }, + { + "epoch": 0.0031523497449085403, + "grad_norm": 78.38741271121032, + "learning_rate": 5.7e-07, + "loss": 8.3005, + "step": 19 + }, + { + "epoch": 0.003318262889377411, + "grad_norm": 82.4891623112013, + "learning_rate": 6e-07, + "loss": 8.4621, + "step": 20 + }, + { + "epoch": 0.0034841760338462815, + "grad_norm": 79.1659385995681, + "learning_rate": 6.300000000000001e-07, + "loss": 8.3031, + "step": 21 + }, + { + "epoch": 0.003650089178315152, + "grad_norm": 77.06598892419889, + "learning_rate": 6.6e-07, + "loss": 8.2303, + "step": 22 + }, + { + "epoch": 0.0038160023227840224, + "grad_norm": 82.36739641908615, + "learning_rate": 6.9e-07, + "loss": 8.2115, + "step": 23 + }, + { + "epoch": 0.003981915467252893, + "grad_norm": 76.36757247944843, + "learning_rate": 7.2e-07, + "loss": 8.1456, + "step": 24 + }, + { + "epoch": 0.004147828611721764, + "grad_norm": 68.98126779127293, + "learning_rate": 7.5e-07, + "loss": 7.8079, + "step": 25 + }, + { + "epoch": 0.004313741756190634, + "grad_norm": 69.00876196915596, + "learning_rate": 7.8e-07, + "loss": 7.8411, + "step": 26 + }, + { + "epoch": 0.004479654900659505, + "grad_norm": 67.5391863634817, + "learning_rate": 8.1e-07, + "loss": 7.7531, + "step": 27 + }, + { + "epoch": 0.004645568045128375, + "grad_norm": 68.05473702316927, + "learning_rate": 8.400000000000001e-07, + "loss": 7.7816, + "step": 28 + }, + { + "epoch": 0.004811481189597246, + "grad_norm": 67.38920732223586, + "learning_rate": 8.7e-07, + "loss": 7.7137, + "step": 29 + }, + { + "epoch": 0.004977394334066116, + "grad_norm": 66.22135170564259, + "learning_rate": 9.000000000000001e-07, + "loss": 7.6523, + "step": 30 + }, + { + "epoch": 0.0051433074785349866, + "grad_norm": 65.24262292310318, + "learning_rate": 9.3e-07, + "loss": 7.6215, + "step": 31 + }, + { + "epoch": 0.005309220623003858, + "grad_norm": 66.75573857246084, + "learning_rate": 9.600000000000001e-07, + "loss": 7.6052, + "step": 32 + }, + { + "epoch": 0.005475133767472728, + "grad_norm": 66.14125778432644, + "learning_rate": 9.9e-07, + "loss": 7.4641, + "step": 33 + }, + { + "epoch": 0.005641046911941599, + "grad_norm": 62.44214008796182, + "learning_rate": 1.02e-06, + "loss": 7.0586, + "step": 34 + }, + { + "epoch": 0.005806960056410469, + "grad_norm": 59.29700311739906, + "learning_rate": 1.0500000000000001e-06, + "loss": 6.8323, + "step": 35 + }, + { + "epoch": 0.005972873200879339, + "grad_norm": 59.53983946181781, + "learning_rate": 1.08e-06, + "loss": 6.8458, + "step": 36 + }, + { + "epoch": 0.00613878634534821, + "grad_norm": 58.40955787137384, + "learning_rate": 1.11e-06, + "loss": 6.749, + "step": 37 + }, + { + "epoch": 0.0063046994898170805, + "grad_norm": 58.46669985481344, + "learning_rate": 1.14e-06, + "loss": 6.7341, + "step": 38 + }, + { + "epoch": 0.006470612634285952, + "grad_norm": 57.85754721296723, + "learning_rate": 1.17e-06, + "loss": 6.679, + "step": 39 + }, + { + "epoch": 0.006636525778754822, + "grad_norm": 61.03757307715009, + "learning_rate": 1.2e-06, + "loss": 6.6835, + "step": 40 + }, + { + "epoch": 0.006802438923223692, + "grad_norm": 58.679250167783295, + "learning_rate": 1.2299999999999999e-06, + "loss": 6.6456, + "step": 41 + }, + { + "epoch": 0.006968352067692563, + "grad_norm": 58.420472121350315, + "learning_rate": 1.2600000000000002e-06, + "loss": 6.5997, + "step": 42 + }, + { + "epoch": 0.007134265212161433, + "grad_norm": 57.989410831374606, + "learning_rate": 1.2900000000000001e-06, + "loss": 6.477, + "step": 43 + }, + { + "epoch": 0.007300178356630304, + "grad_norm": 57.29419183703238, + "learning_rate": 1.32e-06, + "loss": 6.3482, + "step": 44 + }, + { + "epoch": 0.0074660915010991745, + "grad_norm": 57.64245355217973, + "learning_rate": 1.35e-06, + "loss": 6.2579, + "step": 45 + }, + { + "epoch": 0.007632004645568045, + "grad_norm": 56.91570071531264, + "learning_rate": 1.38e-06, + "loss": 6.1127, + "step": 46 + }, + { + "epoch": 0.007797917790036916, + "grad_norm": 57.511790120005976, + "learning_rate": 1.41e-06, + "loss": 6.0203, + "step": 47 + }, + { + "epoch": 0.007963830934505787, + "grad_norm": 58.13021570410771, + "learning_rate": 1.44e-06, + "loss": 5.9262, + "step": 48 + }, + { + "epoch": 0.008129744078974656, + "grad_norm": 57.938125335380285, + "learning_rate": 1.4700000000000001e-06, + "loss": 5.8124, + "step": 49 + }, + { + "epoch": 0.008295657223443527, + "grad_norm": 57.39264712573751, + "learning_rate": 1.5e-06, + "loss": 5.6758, + "step": 50 + }, + { + "epoch": 0.008461570367912398, + "grad_norm": 57.79052254778045, + "learning_rate": 1.5300000000000002e-06, + "loss": 5.6168, + "step": 51 + }, + { + "epoch": 0.008627483512381268, + "grad_norm": 58.18250706411147, + "learning_rate": 1.56e-06, + "loss": 5.527, + "step": 52 + }, + { + "epoch": 0.008793396656850139, + "grad_norm": 56.91519882334982, + "learning_rate": 1.59e-06, + "loss": 5.4117, + "step": 53 + }, + { + "epoch": 0.00895930980131901, + "grad_norm": 58.80496017496223, + "learning_rate": 1.62e-06, + "loss": 5.3978, + "step": 54 + }, + { + "epoch": 0.00912522294578788, + "grad_norm": 57.514939390872016, + "learning_rate": 1.6499999999999999e-06, + "loss": 5.2806, + "step": 55 + }, + { + "epoch": 0.00929113609025675, + "grad_norm": 58.261230393043675, + "learning_rate": 1.6800000000000002e-06, + "loss": 5.2305, + "step": 56 + }, + { + "epoch": 0.009457049234725621, + "grad_norm": 58.853719732787475, + "learning_rate": 1.7100000000000001e-06, + "loss": 5.1747, + "step": 57 + }, + { + "epoch": 0.009622962379194492, + "grad_norm": 58.44206227327712, + "learning_rate": 1.74e-06, + "loss": 5.0768, + "step": 58 + }, + { + "epoch": 0.009788875523663362, + "grad_norm": 58.39912900993697, + "learning_rate": 1.77e-06, + "loss": 4.9874, + "step": 59 + }, + { + "epoch": 0.009954788668132233, + "grad_norm": 58.19859876404236, + "learning_rate": 1.8000000000000001e-06, + "loss": 4.8936, + "step": 60 + }, + { + "epoch": 0.010120701812601104, + "grad_norm": 58.931458451778994, + "learning_rate": 1.83e-06, + "loss": 4.8333, + "step": 61 + }, + { + "epoch": 0.010286614957069973, + "grad_norm": 58.566598779309345, + "learning_rate": 1.86e-06, + "loss": 4.7316, + "step": 62 + }, + { + "epoch": 0.010452528101538844, + "grad_norm": 59.43204190036222, + "learning_rate": 1.89e-06, + "loss": 4.6782, + "step": 63 + }, + { + "epoch": 0.010618441246007715, + "grad_norm": 59.32237678351381, + "learning_rate": 1.9200000000000003e-06, + "loss": 4.5875, + "step": 64 + }, + { + "epoch": 0.010784354390476586, + "grad_norm": 58.22281010507284, + "learning_rate": 1.95e-06, + "loss": 4.4805, + "step": 65 + }, + { + "epoch": 0.010950267534945456, + "grad_norm": 58.89565065173638, + "learning_rate": 1.98e-06, + "loss": 4.4417, + "step": 66 + }, + { + "epoch": 0.011116180679414327, + "grad_norm": 59.219564213854724, + "learning_rate": 2.01e-06, + "loss": 4.3646, + "step": 67 + }, + { + "epoch": 0.011282093823883198, + "grad_norm": 60.084020905213414, + "learning_rate": 2.04e-06, + "loss": 4.3387, + "step": 68 + }, + { + "epoch": 0.011448006968352067, + "grad_norm": 59.930063371072634, + "learning_rate": 2.07e-06, + "loss": 4.2256, + "step": 69 + }, + { + "epoch": 0.011613920112820938, + "grad_norm": 59.443610975279206, + "learning_rate": 2.1000000000000002e-06, + "loss": 4.1584, + "step": 70 + }, + { + "epoch": 0.01177983325728981, + "grad_norm": 60.46768457622464, + "learning_rate": 2.13e-06, + "loss": 4.1046, + "step": 71 + }, + { + "epoch": 0.011945746401758679, + "grad_norm": 60.923838757674325, + "learning_rate": 2.16e-06, + "loss": 4.0402, + "step": 72 + }, + { + "epoch": 0.01211165954622755, + "grad_norm": 60.56706226907599, + "learning_rate": 2.19e-06, + "loss": 3.9675, + "step": 73 + }, + { + "epoch": 0.01227757269069642, + "grad_norm": 60.52251526885924, + "learning_rate": 2.22e-06, + "loss": 3.934, + "step": 74 + }, + { + "epoch": 0.012443485835165292, + "grad_norm": 60.64687995134201, + "learning_rate": 2.25e-06, + "loss": 3.8682, + "step": 75 + }, + { + "epoch": 0.012609398979634161, + "grad_norm": 59.74345013299483, + "learning_rate": 2.28e-06, + "loss": 3.7878, + "step": 76 + }, + { + "epoch": 0.012775312124103032, + "grad_norm": 62.7371580672841, + "learning_rate": 2.31e-06, + "loss": 3.7883, + "step": 77 + }, + { + "epoch": 0.012941225268571903, + "grad_norm": 61.120581572512556, + "learning_rate": 2.34e-06, + "loss": 3.6993, + "step": 78 + }, + { + "epoch": 0.013107138413040773, + "grad_norm": 61.57800519844952, + "learning_rate": 2.3699999999999998e-06, + "loss": 3.6369, + "step": 79 + }, + { + "epoch": 0.013273051557509644, + "grad_norm": 61.54876895013541, + "learning_rate": 2.4e-06, + "loss": 3.5785, + "step": 80 + }, + { + "epoch": 0.013438964701978515, + "grad_norm": 60.1358841900405, + "learning_rate": 2.43e-06, + "loss": 3.514, + "step": 81 + }, + { + "epoch": 0.013604877846447384, + "grad_norm": 61.297281772956765, + "learning_rate": 2.4599999999999997e-06, + "loss": 3.4742, + "step": 82 + }, + { + "epoch": 0.013770790990916255, + "grad_norm": 61.30923571729013, + "learning_rate": 2.4900000000000003e-06, + "loss": 3.4433, + "step": 83 + }, + { + "epoch": 0.013936704135385126, + "grad_norm": 61.23947604523023, + "learning_rate": 2.5200000000000004e-06, + "loss": 3.3705, + "step": 84 + }, + { + "epoch": 0.014102617279853997, + "grad_norm": 62.39595603092295, + "learning_rate": 2.55e-06, + "loss": 3.3502, + "step": 85 + }, + { + "epoch": 0.014268530424322867, + "grad_norm": 61.53318479023394, + "learning_rate": 2.5800000000000003e-06, + "loss": 3.2943, + "step": 86 + }, + { + "epoch": 0.014434443568791738, + "grad_norm": 61.212619109385486, + "learning_rate": 2.61e-06, + "loss": 3.2354, + "step": 87 + }, + { + "epoch": 0.014600356713260609, + "grad_norm": 60.68820969945481, + "learning_rate": 2.64e-06, + "loss": 3.1867, + "step": 88 + }, + { + "epoch": 0.014766269857729478, + "grad_norm": 61.161255238888366, + "learning_rate": 2.6700000000000003e-06, + "loss": 3.1363, + "step": 89 + }, + { + "epoch": 0.014932183002198349, + "grad_norm": 61.39181889008794, + "learning_rate": 2.7e-06, + "loss": 3.106, + "step": 90 + }, + { + "epoch": 0.01509809614666722, + "grad_norm": 61.510960534577144, + "learning_rate": 2.73e-06, + "loss": 3.0709, + "step": 91 + }, + { + "epoch": 0.01526400929113609, + "grad_norm": 61.403101355512064, + "learning_rate": 2.76e-06, + "loss": 3.0295, + "step": 92 + }, + { + "epoch": 0.01542992243560496, + "grad_norm": 60.92480004727436, + "learning_rate": 2.79e-06, + "loss": 2.9853, + "step": 93 + }, + { + "epoch": 0.015595835580073832, + "grad_norm": 60.38494778358348, + "learning_rate": 2.82e-06, + "loss": 2.9374, + "step": 94 + }, + { + "epoch": 0.015761748724542703, + "grad_norm": 61.24204912886601, + "learning_rate": 2.85e-06, + "loss": 2.9142, + "step": 95 + }, + { + "epoch": 0.015927661869011574, + "grad_norm": 61.79448720849728, + "learning_rate": 2.88e-06, + "loss": 2.8738, + "step": 96 + }, + { + "epoch": 0.01609357501348044, + "grad_norm": 60.44596322113541, + "learning_rate": 2.9099999999999997e-06, + "loss": 2.8393, + "step": 97 + }, + { + "epoch": 0.016259488157949312, + "grad_norm": 61.69496692909939, + "learning_rate": 2.9400000000000002e-06, + "loss": 2.8091, + "step": 98 + }, + { + "epoch": 0.016425401302418183, + "grad_norm": 61.042506818121446, + "learning_rate": 2.9700000000000004e-06, + "loss": 2.7632, + "step": 99 + }, + { + "epoch": 0.016591314446887055, + "grad_norm": 59.83708084484388, + "learning_rate": 3e-06, + "loss": 2.7211, + "step": 100 + }, + { + "epoch": 0.016757227591355926, + "grad_norm": 60.76096104413392, + "learning_rate": 3.0300000000000002e-06, + "loss": 2.6877, + "step": 101 + }, + { + "epoch": 0.016923140735824797, + "grad_norm": 60.503095248644414, + "learning_rate": 3.0600000000000003e-06, + "loss": 2.6464, + "step": 102 + }, + { + "epoch": 0.017089053880293668, + "grad_norm": 60.022580078973974, + "learning_rate": 3.09e-06, + "loss": 2.6246, + "step": 103 + }, + { + "epoch": 0.017254967024762535, + "grad_norm": 60.7786406432538, + "learning_rate": 3.12e-06, + "loss": 2.5835, + "step": 104 + }, + { + "epoch": 0.017420880169231406, + "grad_norm": 60.0134194819714, + "learning_rate": 3.15e-06, + "loss": 2.548, + "step": 105 + }, + { + "epoch": 0.017586793313700277, + "grad_norm": 59.59173672650453, + "learning_rate": 3.18e-06, + "loss": 2.5352, + "step": 106 + }, + { + "epoch": 0.01775270645816915, + "grad_norm": 59.365953677840324, + "learning_rate": 3.21e-06, + "loss": 2.5026, + "step": 107 + }, + { + "epoch": 0.01791861960263802, + "grad_norm": 59.03666897099401, + "learning_rate": 3.24e-06, + "loss": 2.4604, + "step": 108 + }, + { + "epoch": 0.01808453274710689, + "grad_norm": 58.7798482726752, + "learning_rate": 3.27e-06, + "loss": 2.4486, + "step": 109 + }, + { + "epoch": 0.01825044589157576, + "grad_norm": 58.114483374139134, + "learning_rate": 3.2999999999999997e-06, + "loss": 2.4179, + "step": 110 + }, + { + "epoch": 0.01841635903604463, + "grad_norm": 59.80520961291876, + "learning_rate": 3.33e-06, + "loss": 2.392, + "step": 111 + }, + { + "epoch": 0.0185822721805135, + "grad_norm": 58.14478216556001, + "learning_rate": 3.3600000000000004e-06, + "loss": 2.3406, + "step": 112 + }, + { + "epoch": 0.01874818532498237, + "grad_norm": 59.18483586241932, + "learning_rate": 3.3899999999999997e-06, + "loss": 2.3231, + "step": 113 + }, + { + "epoch": 0.018914098469451242, + "grad_norm": 57.28442984470624, + "learning_rate": 3.4200000000000003e-06, + "loss": 2.2941, + "step": 114 + }, + { + "epoch": 0.019080011613920114, + "grad_norm": 57.357039771400295, + "learning_rate": 3.4500000000000004e-06, + "loss": 2.2816, + "step": 115 + }, + { + "epoch": 0.019245924758388985, + "grad_norm": 57.111232218047554, + "learning_rate": 3.48e-06, + "loss": 2.2342, + "step": 116 + }, + { + "epoch": 0.019411837902857852, + "grad_norm": 57.18649743743891, + "learning_rate": 3.5100000000000003e-06, + "loss": 2.2274, + "step": 117 + }, + { + "epoch": 0.019577751047326723, + "grad_norm": 56.32726893984759, + "learning_rate": 3.54e-06, + "loss": 2.188, + "step": 118 + }, + { + "epoch": 0.019743664191795594, + "grad_norm": 56.11126674497503, + "learning_rate": 3.57e-06, + "loss": 2.1688, + "step": 119 + }, + { + "epoch": 0.019909577336264465, + "grad_norm": 56.452361895827636, + "learning_rate": 3.6000000000000003e-06, + "loss": 2.1257, + "step": 120 + }, + { + "epoch": 0.020075490480733336, + "grad_norm": 54.87188045608243, + "learning_rate": 3.63e-06, + "loss": 2.1082, + "step": 121 + }, + { + "epoch": 0.020241403625202208, + "grad_norm": 55.85330145328899, + "learning_rate": 3.66e-06, + "loss": 2.1085, + "step": 122 + }, + { + "epoch": 0.02040731676967108, + "grad_norm": 54.697357042805265, + "learning_rate": 3.69e-06, + "loss": 2.0609, + "step": 123 + }, + { + "epoch": 0.020573229914139946, + "grad_norm": 54.39711063453628, + "learning_rate": 3.72e-06, + "loss": 2.0577, + "step": 124 + }, + { + "epoch": 0.020739143058608817, + "grad_norm": 54.63336965202357, + "learning_rate": 3.75e-06, + "loss": 2.0359, + "step": 125 + }, + { + "epoch": 0.02090505620307769, + "grad_norm": 54.2613857002437, + "learning_rate": 3.78e-06, + "loss": 2.0223, + "step": 126 + }, + { + "epoch": 0.02107096934754656, + "grad_norm": 54.55239593692058, + "learning_rate": 3.81e-06, + "loss": 1.9907, + "step": 127 + }, + { + "epoch": 0.02123688249201543, + "grad_norm": 54.163024439012574, + "learning_rate": 3.8400000000000005e-06, + "loss": 1.9572, + "step": 128 + }, + { + "epoch": 0.0214027956364843, + "grad_norm": 52.39556608669111, + "learning_rate": 3.87e-06, + "loss": 1.9547, + "step": 129 + }, + { + "epoch": 0.021568708780953173, + "grad_norm": 52.88053635834076, + "learning_rate": 3.9e-06, + "loss": 1.9174, + "step": 130 + }, + { + "epoch": 0.02173462192542204, + "grad_norm": 52.27647350963412, + "learning_rate": 3.93e-06, + "loss": 1.9085, + "step": 131 + }, + { + "epoch": 0.02190053506989091, + "grad_norm": 51.8188123723821, + "learning_rate": 3.96e-06, + "loss": 1.8921, + "step": 132 + }, + { + "epoch": 0.022066448214359782, + "grad_norm": 51.13956466153071, + "learning_rate": 3.99e-06, + "loss": 1.8695, + "step": 133 + }, + { + "epoch": 0.022232361358828653, + "grad_norm": 50.880997478246, + "learning_rate": 4.02e-06, + "loss": 1.8723, + "step": 134 + }, + { + "epoch": 0.022398274503297524, + "grad_norm": 50.17800750552662, + "learning_rate": 4.05e-06, + "loss": 1.8362, + "step": 135 + }, + { + "epoch": 0.022564187647766396, + "grad_norm": 50.25731092529157, + "learning_rate": 4.08e-06, + "loss": 1.8273, + "step": 136 + }, + { + "epoch": 0.022730100792235263, + "grad_norm": 49.557602243937815, + "learning_rate": 4.1100000000000005e-06, + "loss": 1.7923, + "step": 137 + }, + { + "epoch": 0.022896013936704134, + "grad_norm": 49.06910514435632, + "learning_rate": 4.14e-06, + "loss": 1.7823, + "step": 138 + }, + { + "epoch": 0.023061927081173005, + "grad_norm": 48.124932183722606, + "learning_rate": 4.17e-06, + "loss": 1.7748, + "step": 139 + }, + { + "epoch": 0.023227840225641876, + "grad_norm": 48.134667233726226, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.7639, + "step": 140 + }, + { + "epoch": 0.023393753370110747, + "grad_norm": 47.557312841261506, + "learning_rate": 4.23e-06, + "loss": 1.7324, + "step": 141 + }, + { + "epoch": 0.02355966651457962, + "grad_norm": 47.70768250047664, + "learning_rate": 4.26e-06, + "loss": 1.7218, + "step": 142 + }, + { + "epoch": 0.02372557965904849, + "grad_norm": 46.50696652735735, + "learning_rate": 4.2900000000000004e-06, + "loss": 1.6973, + "step": 143 + }, + { + "epoch": 0.023891492803517357, + "grad_norm": 46.08588423958666, + "learning_rate": 4.32e-06, + "loss": 1.6985, + "step": 144 + }, + { + "epoch": 0.024057405947986228, + "grad_norm": 45.56462043474931, + "learning_rate": 4.35e-06, + "loss": 1.6685, + "step": 145 + }, + { + "epoch": 0.0242233190924551, + "grad_norm": 45.97216272149993, + "learning_rate": 4.38e-06, + "loss": 1.6579, + "step": 146 + }, + { + "epoch": 0.02438923223692397, + "grad_norm": 45.46498190474296, + "learning_rate": 4.41e-06, + "loss": 1.64, + "step": 147 + }, + { + "epoch": 0.02455514538139284, + "grad_norm": 44.854534417908845, + "learning_rate": 4.44e-06, + "loss": 1.624, + "step": 148 + }, + { + "epoch": 0.024721058525861712, + "grad_norm": 43.83569267105489, + "learning_rate": 4.4699999999999996e-06, + "loss": 1.6165, + "step": 149 + }, + { + "epoch": 0.024886971670330583, + "grad_norm": 43.48566112246643, + "learning_rate": 4.5e-06, + "loss": 1.5971, + "step": 150 + }, + { + "epoch": 0.02505288481479945, + "grad_norm": 42.91276059636016, + "learning_rate": 4.53e-06, + "loss": 1.5792, + "step": 151 + }, + { + "epoch": 0.025218797959268322, + "grad_norm": 42.606137245138356, + "learning_rate": 4.56e-06, + "loss": 1.5553, + "step": 152 + }, + { + "epoch": 0.025384711103737193, + "grad_norm": 41.4393639959328, + "learning_rate": 4.59e-06, + "loss": 1.5775, + "step": 153 + }, + { + "epoch": 0.025550624248206064, + "grad_norm": 41.54145609070485, + "learning_rate": 4.62e-06, + "loss": 1.5479, + "step": 154 + }, + { + "epoch": 0.025716537392674935, + "grad_norm": 41.887845194447074, + "learning_rate": 4.65e-06, + "loss": 1.5296, + "step": 155 + }, + { + "epoch": 0.025882450537143806, + "grad_norm": 40.933178238844356, + "learning_rate": 4.68e-06, + "loss": 1.5297, + "step": 156 + }, + { + "epoch": 0.026048363681612677, + "grad_norm": 39.94971992763078, + "learning_rate": 4.71e-06, + "loss": 1.5151, + "step": 157 + }, + { + "epoch": 0.026214276826081545, + "grad_norm": 39.35709938410145, + "learning_rate": 4.7399999999999995e-06, + "loss": 1.5054, + "step": 158 + }, + { + "epoch": 0.026380189970550416, + "grad_norm": 38.69670975999757, + "learning_rate": 4.77e-06, + "loss": 1.4814, + "step": 159 + }, + { + "epoch": 0.026546103115019287, + "grad_norm": 38.62419347835826, + "learning_rate": 4.8e-06, + "loss": 1.4754, + "step": 160 + }, + { + "epoch": 0.02671201625948816, + "grad_norm": 37.77998094940736, + "learning_rate": 4.8299999999999995e-06, + "loss": 1.462, + "step": 161 + }, + { + "epoch": 0.02687792940395703, + "grad_norm": 38.187533465091185, + "learning_rate": 4.86e-06, + "loss": 1.4476, + "step": 162 + }, + { + "epoch": 0.0270438425484259, + "grad_norm": 36.94958450784839, + "learning_rate": 4.89e-06, + "loss": 1.4472, + "step": 163 + }, + { + "epoch": 0.027209755692894768, + "grad_norm": 36.82539602249903, + "learning_rate": 4.9199999999999995e-06, + "loss": 1.4243, + "step": 164 + }, + { + "epoch": 0.02737566883736364, + "grad_norm": 35.84655716782342, + "learning_rate": 4.950000000000001e-06, + "loss": 1.4311, + "step": 165 + }, + { + "epoch": 0.02754158198183251, + "grad_norm": 35.976742015954045, + "learning_rate": 4.980000000000001e-06, + "loss": 1.4, + "step": 166 + }, + { + "epoch": 0.02770749512630138, + "grad_norm": 35.117919170195876, + "learning_rate": 5.01e-06, + "loss": 1.3998, + "step": 167 + }, + { + "epoch": 0.027873408270770252, + "grad_norm": 35.170733706617995, + "learning_rate": 5.040000000000001e-06, + "loss": 1.3726, + "step": 168 + }, + { + "epoch": 0.028039321415239123, + "grad_norm": 34.47083347313095, + "learning_rate": 5.070000000000001e-06, + "loss": 1.381, + "step": 169 + }, + { + "epoch": 0.028205234559707994, + "grad_norm": 33.79057750635065, + "learning_rate": 5.1e-06, + "loss": 1.3674, + "step": 170 + }, + { + "epoch": 0.028371147704176862, + "grad_norm": 32.91789621932166, + "learning_rate": 5.13e-06, + "loss": 1.3759, + "step": 171 + }, + { + "epoch": 0.028537060848645733, + "grad_norm": 33.56814465795624, + "learning_rate": 5.1600000000000006e-06, + "loss": 1.3391, + "step": 172 + }, + { + "epoch": 0.028702973993114604, + "grad_norm": 32.6150380329613, + "learning_rate": 5.19e-06, + "loss": 1.3314, + "step": 173 + }, + { + "epoch": 0.028868887137583475, + "grad_norm": 31.955882800699367, + "learning_rate": 5.22e-06, + "loss": 1.3339, + "step": 174 + }, + { + "epoch": 0.029034800282052346, + "grad_norm": 31.40356236669499, + "learning_rate": 5.2500000000000006e-06, + "loss": 1.3184, + "step": 175 + }, + { + "epoch": 0.029200713426521217, + "grad_norm": 30.681143239334805, + "learning_rate": 5.28e-06, + "loss": 1.3139, + "step": 176 + }, + { + "epoch": 0.02936662657099009, + "grad_norm": 30.21502637398018, + "learning_rate": 5.31e-06, + "loss": 1.3107, + "step": 177 + }, + { + "epoch": 0.029532539715458956, + "grad_norm": 29.92516372692031, + "learning_rate": 5.3400000000000005e-06, + "loss": 1.3068, + "step": 178 + }, + { + "epoch": 0.029698452859927827, + "grad_norm": 30.018033153252357, + "learning_rate": 5.37e-06, + "loss": 1.2877, + "step": 179 + }, + { + "epoch": 0.029864366004396698, + "grad_norm": 29.63626860215389, + "learning_rate": 5.4e-06, + "loss": 1.2757, + "step": 180 + }, + { + "epoch": 0.03003027914886557, + "grad_norm": 29.205952566984713, + "learning_rate": 5.4300000000000005e-06, + "loss": 1.2624, + "step": 181 + }, + { + "epoch": 0.03019619229333444, + "grad_norm": 28.323326976872018, + "learning_rate": 5.46e-06, + "loss": 1.2636, + "step": 182 + }, + { + "epoch": 0.03036210543780331, + "grad_norm": 28.327391694947394, + "learning_rate": 5.49e-06, + "loss": 1.2445, + "step": 183 + }, + { + "epoch": 0.03052801858227218, + "grad_norm": 27.538465732946584, + "learning_rate": 5.52e-06, + "loss": 1.2553, + "step": 184 + }, + { + "epoch": 0.03069393172674105, + "grad_norm": 27.86388136213645, + "learning_rate": 5.55e-06, + "loss": 1.2187, + "step": 185 + }, + { + "epoch": 0.03085984487120992, + "grad_norm": 27.136153982898104, + "learning_rate": 5.58e-06, + "loss": 1.2264, + "step": 186 + }, + { + "epoch": 0.031025758015678792, + "grad_norm": 26.542801786628125, + "learning_rate": 5.61e-06, + "loss": 1.2271, + "step": 187 + }, + { + "epoch": 0.031191671160147663, + "grad_norm": 26.229276061707733, + "learning_rate": 5.64e-06, + "loss": 1.2213, + "step": 188 + }, + { + "epoch": 0.031357584304616534, + "grad_norm": 25.655150823462403, + "learning_rate": 5.67e-06, + "loss": 1.2131, + "step": 189 + }, + { + "epoch": 0.031523497449085405, + "grad_norm": 25.43762363935692, + "learning_rate": 5.7e-06, + "loss": 1.2009, + "step": 190 + }, + { + "epoch": 0.031689410593554276, + "grad_norm": 24.606418616190336, + "learning_rate": 5.73e-06, + "loss": 1.2105, + "step": 191 + }, + { + "epoch": 0.03185532373802315, + "grad_norm": 24.297360245690335, + "learning_rate": 5.76e-06, + "loss": 1.1902, + "step": 192 + }, + { + "epoch": 0.03202123688249202, + "grad_norm": 24.37416138144368, + "learning_rate": 5.79e-06, + "loss": 1.183, + "step": 193 + }, + { + "epoch": 0.03218715002696088, + "grad_norm": 23.58889140972946, + "learning_rate": 5.819999999999999e-06, + "loss": 1.1951, + "step": 194 + }, + { + "epoch": 0.032353063171429754, + "grad_norm": 23.192738038885675, + "learning_rate": 5.850000000000001e-06, + "loss": 1.1846, + "step": 195 + }, + { + "epoch": 0.032518976315898625, + "grad_norm": 23.067808075903493, + "learning_rate": 5.8800000000000005e-06, + "loss": 1.189, + "step": 196 + }, + { + "epoch": 0.032684889460367496, + "grad_norm": 22.129640030521287, + "learning_rate": 5.909999999999999e-06, + "loss": 1.1902, + "step": 197 + }, + { + "epoch": 0.03285080260483637, + "grad_norm": 22.580085955752832, + "learning_rate": 5.940000000000001e-06, + "loss": 1.142, + "step": 198 + }, + { + "epoch": 0.03301671574930524, + "grad_norm": 22.001422436219375, + "learning_rate": 5.9700000000000004e-06, + "loss": 1.1451, + "step": 199 + }, + { + "epoch": 0.03318262889377411, + "grad_norm": 21.511559365669424, + "learning_rate": 6e-06, + "loss": 1.157, + "step": 200 + }, + { + "epoch": 0.03334854203824298, + "grad_norm": 21.350738678737287, + "learning_rate": 6.030000000000001e-06, + "loss": 1.1548, + "step": 201 + }, + { + "epoch": 0.03351445518271185, + "grad_norm": 21.213322714991193, + "learning_rate": 6.0600000000000004e-06, + "loss": 1.1302, + "step": 202 + }, + { + "epoch": 0.03368036832718072, + "grad_norm": 20.96284799616787, + "learning_rate": 6.09e-06, + "loss": 1.13, + "step": 203 + }, + { + "epoch": 0.03384628147164959, + "grad_norm": 20.239673660306355, + "learning_rate": 6.120000000000001e-06, + "loss": 1.1364, + "step": 204 + }, + { + "epoch": 0.034012194616118464, + "grad_norm": 20.140657358498693, + "learning_rate": 6.15e-06, + "loss": 1.1313, + "step": 205 + }, + { + "epoch": 0.034178107760587335, + "grad_norm": 19.752608255172916, + "learning_rate": 6.18e-06, + "loss": 1.1187, + "step": 206 + }, + { + "epoch": 0.034344020905056206, + "grad_norm": 19.351629911713136, + "learning_rate": 6.21e-06, + "loss": 1.1194, + "step": 207 + }, + { + "epoch": 0.03450993404952507, + "grad_norm": 19.1529582255177, + "learning_rate": 6.24e-06, + "loss": 1.1175, + "step": 208 + }, + { + "epoch": 0.03467584719399394, + "grad_norm": 18.921466270786127, + "learning_rate": 6.27e-06, + "loss": 1.1145, + "step": 209 + }, + { + "epoch": 0.03484176033846281, + "grad_norm": 18.439640178944842, + "learning_rate": 6.3e-06, + "loss": 1.106, + "step": 210 + }, + { + "epoch": 0.035007673482931684, + "grad_norm": 18.592213045939108, + "learning_rate": 6.33e-06, + "loss": 1.0832, + "step": 211 + }, + { + "epoch": 0.035173586627400555, + "grad_norm": 18.02782578467052, + "learning_rate": 6.36e-06, + "loss": 1.1019, + "step": 212 + }, + { + "epoch": 0.035339499771869426, + "grad_norm": 18.0166852155525, + "learning_rate": 6.39e-06, + "loss": 1.0737, + "step": 213 + }, + { + "epoch": 0.0355054129163383, + "grad_norm": 17.781239753798864, + "learning_rate": 6.42e-06, + "loss": 1.0775, + "step": 214 + }, + { + "epoch": 0.03567132606080717, + "grad_norm": 17.221262259530782, + "learning_rate": 6.45e-06, + "loss": 1.1112, + "step": 215 + }, + { + "epoch": 0.03583723920527604, + "grad_norm": 16.865632842715378, + "learning_rate": 6.48e-06, + "loss": 1.0933, + "step": 216 + }, + { + "epoch": 0.03600315234974491, + "grad_norm": 16.90995396668004, + "learning_rate": 6.51e-06, + "loss": 1.0786, + "step": 217 + }, + { + "epoch": 0.03616906549421378, + "grad_norm": 16.502587446764466, + "learning_rate": 6.54e-06, + "loss": 1.0824, + "step": 218 + }, + { + "epoch": 0.03633497863868265, + "grad_norm": 16.396720584098915, + "learning_rate": 6.57e-06, + "loss": 1.0666, + "step": 219 + }, + { + "epoch": 0.03650089178315152, + "grad_norm": 16.0958491226786, + "learning_rate": 6.5999999999999995e-06, + "loss": 1.0587, + "step": 220 + }, + { + "epoch": 0.03666680492762039, + "grad_norm": 15.781394542209787, + "learning_rate": 6.63e-06, + "loss": 1.0913, + "step": 221 + }, + { + "epoch": 0.03683271807208926, + "grad_norm": 15.526150669252083, + "learning_rate": 6.66e-06, + "loss": 1.0685, + "step": 222 + }, + { + "epoch": 0.03699863121655813, + "grad_norm": 15.295714129778606, + "learning_rate": 6.6899999999999995e-06, + "loss": 1.0688, + "step": 223 + }, + { + "epoch": 0.037164544361027, + "grad_norm": 14.962595843836096, + "learning_rate": 6.720000000000001e-06, + "loss": 1.0663, + "step": 224 + }, + { + "epoch": 0.03733045750549587, + "grad_norm": 14.845441469454471, + "learning_rate": 6.75e-06, + "loss": 1.0487, + "step": 225 + }, + { + "epoch": 0.03749637064996474, + "grad_norm": 14.669509278037967, + "learning_rate": 6.7799999999999995e-06, + "loss": 1.058, + "step": 226 + }, + { + "epoch": 0.037662283794433614, + "grad_norm": 14.550072253222602, + "learning_rate": 6.810000000000001e-06, + "loss": 1.0425, + "step": 227 + }, + { + "epoch": 0.037828196938902485, + "grad_norm": 14.382438880655664, + "learning_rate": 6.840000000000001e-06, + "loss": 1.0367, + "step": 228 + }, + { + "epoch": 0.037994110083371356, + "grad_norm": 14.274435292405178, + "learning_rate": 6.87e-06, + "loss": 1.0414, + "step": 229 + }, + { + "epoch": 0.03816002322784023, + "grad_norm": 14.07020130770808, + "learning_rate": 6.900000000000001e-06, + "loss": 1.0264, + "step": 230 + }, + { + "epoch": 0.0383259363723091, + "grad_norm": 13.49479267804676, + "learning_rate": 6.9300000000000006e-06, + "loss": 1.0438, + "step": 231 + }, + { + "epoch": 0.03849184951677797, + "grad_norm": 13.554935453999114, + "learning_rate": 6.96e-06, + "loss": 1.0426, + "step": 232 + }, + { + "epoch": 0.03865776266124684, + "grad_norm": 13.55923875969836, + "learning_rate": 6.99e-06, + "loss": 0.9961, + "step": 233 + }, + { + "epoch": 0.038823675805715704, + "grad_norm": 12.9140401944659, + "learning_rate": 7.0200000000000006e-06, + "loss": 1.0416, + "step": 234 + }, + { + "epoch": 0.038989588950184575, + "grad_norm": 13.324650059397225, + "learning_rate": 7.05e-06, + "loss": 0.9825, + "step": 235 + }, + { + "epoch": 0.03915550209465345, + "grad_norm": 12.937116536824265, + "learning_rate": 7.08e-06, + "loss": 1.0103, + "step": 236 + }, + { + "epoch": 0.03932141523912232, + "grad_norm": 12.744639401075105, + "learning_rate": 7.1100000000000005e-06, + "loss": 1.0198, + "step": 237 + }, + { + "epoch": 0.03948732838359119, + "grad_norm": 12.472143264741263, + "learning_rate": 7.14e-06, + "loss": 1.0117, + "step": 238 + }, + { + "epoch": 0.03965324152806006, + "grad_norm": 12.428804263701728, + "learning_rate": 7.17e-06, + "loss": 0.9982, + "step": 239 + }, + { + "epoch": 0.03981915467252893, + "grad_norm": 12.136592772982283, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.0175, + "step": 240 + }, + { + "epoch": 0.0399850678169978, + "grad_norm": 12.053549043031309, + "learning_rate": 7.23e-06, + "loss": 1.0027, + "step": 241 + }, + { + "epoch": 0.04015098096146667, + "grad_norm": 11.843388086833002, + "learning_rate": 7.26e-06, + "loss": 1.0132, + "step": 242 + }, + { + "epoch": 0.040316894105935544, + "grad_norm": 11.628322273315547, + "learning_rate": 7.2900000000000005e-06, + "loss": 1.0017, + "step": 243 + }, + { + "epoch": 0.040482807250404415, + "grad_norm": 11.553787753691592, + "learning_rate": 7.32e-06, + "loss": 0.9988, + "step": 244 + }, + { + "epoch": 0.040648720394873286, + "grad_norm": 11.308136922354493, + "learning_rate": 7.35e-06, + "loss": 1.0267, + "step": 245 + }, + { + "epoch": 0.04081463353934216, + "grad_norm": 11.038599061997045, + "learning_rate": 7.38e-06, + "loss": 1.0145, + "step": 246 + }, + { + "epoch": 0.04098054668381103, + "grad_norm": 11.181030653605738, + "learning_rate": 7.41e-06, + "loss": 0.9889, + "step": 247 + }, + { + "epoch": 0.04114645982827989, + "grad_norm": 10.86832281657928, + "learning_rate": 7.44e-06, + "loss": 0.9852, + "step": 248 + }, + { + "epoch": 0.04131237297274876, + "grad_norm": 10.554939136339671, + "learning_rate": 7.47e-06, + "loss": 0.982, + "step": 249 + }, + { + "epoch": 0.041478286117217635, + "grad_norm": 10.568145863035252, + "learning_rate": 7.5e-06, + "loss": 0.9821, + "step": 250 + }, + { + "epoch": 0.041644199261686506, + "grad_norm": 10.308386741012788, + "learning_rate": 7.53e-06, + "loss": 0.9996, + "step": 251 + }, + { + "epoch": 0.04181011240615538, + "grad_norm": 10.372751036409273, + "learning_rate": 7.56e-06, + "loss": 0.9551, + "step": 252 + }, + { + "epoch": 0.04197602555062425, + "grad_norm": 10.072016333214728, + "learning_rate": 7.590000000000001e-06, + "loss": 0.9793, + "step": 253 + }, + { + "epoch": 0.04214193869509312, + "grad_norm": 9.971211572091233, + "learning_rate": 7.62e-06, + "loss": 0.9783, + "step": 254 + }, + { + "epoch": 0.04230785183956199, + "grad_norm": 9.805606829989866, + "learning_rate": 7.65e-06, + "loss": 0.9777, + "step": 255 + }, + { + "epoch": 0.04247376498403086, + "grad_norm": 9.511714368090708, + "learning_rate": 7.680000000000001e-06, + "loss": 0.9896, + "step": 256 + }, + { + "epoch": 0.04263967812849973, + "grad_norm": 9.793257406261617, + "learning_rate": 7.71e-06, + "loss": 0.9592, + "step": 257 + }, + { + "epoch": 0.0428055912729686, + "grad_norm": 9.489765564713549, + "learning_rate": 7.74e-06, + "loss": 0.9677, + "step": 258 + }, + { + "epoch": 0.042971504417437474, + "grad_norm": 9.522682824223127, + "learning_rate": 7.77e-06, + "loss": 0.9522, + "step": 259 + }, + { + "epoch": 0.043137417561906345, + "grad_norm": 9.31316057534312, + "learning_rate": 7.8e-06, + "loss": 0.9523, + "step": 260 + }, + { + "epoch": 0.04330333070637521, + "grad_norm": 9.191006657018935, + "learning_rate": 7.83e-06, + "loss": 0.9491, + "step": 261 + }, + { + "epoch": 0.04346924385084408, + "grad_norm": 9.005774575637536, + "learning_rate": 7.86e-06, + "loss": 0.9776, + "step": 262 + }, + { + "epoch": 0.04363515699531295, + "grad_norm": 8.944376972994409, + "learning_rate": 7.89e-06, + "loss": 0.9724, + "step": 263 + }, + { + "epoch": 0.04380107013978182, + "grad_norm": 8.880616489633164, + "learning_rate": 7.92e-06, + "loss": 0.9509, + "step": 264 + }, + { + "epoch": 0.043966983284250694, + "grad_norm": 10.295937531650397, + "learning_rate": 7.95e-06, + "loss": 0.9555, + "step": 265 + }, + { + "epoch": 0.044132896428719565, + "grad_norm": 8.742470979369012, + "learning_rate": 7.98e-06, + "loss": 0.9365, + "step": 266 + }, + { + "epoch": 0.044298809573188436, + "grad_norm": 8.59286130501365, + "learning_rate": 8.01e-06, + "loss": 0.9479, + "step": 267 + }, + { + "epoch": 0.04446472271765731, + "grad_norm": 8.522351957971027, + "learning_rate": 8.04e-06, + "loss": 0.9424, + "step": 268 + }, + { + "epoch": 0.04463063586212618, + "grad_norm": 8.330738085855412, + "learning_rate": 8.069999999999999e-06, + "loss": 0.9485, + "step": 269 + }, + { + "epoch": 0.04479654900659505, + "grad_norm": 8.18707701163648, + "learning_rate": 8.1e-06, + "loss": 0.9649, + "step": 270 + }, + { + "epoch": 0.04496246215106392, + "grad_norm": 8.222407203836411, + "learning_rate": 8.13e-06, + "loss": 0.9468, + "step": 271 + }, + { + "epoch": 0.04512837529553279, + "grad_norm": 8.094229801443008, + "learning_rate": 8.16e-06, + "loss": 0.9418, + "step": 272 + }, + { + "epoch": 0.04529428844000166, + "grad_norm": 7.940256424590958, + "learning_rate": 8.190000000000001e-06, + "loss": 0.9553, + "step": 273 + }, + { + "epoch": 0.045460201584470526, + "grad_norm": 7.813320194657039, + "learning_rate": 8.220000000000001e-06, + "loss": 0.9505, + "step": 274 + }, + { + "epoch": 0.0456261147289394, + "grad_norm": 7.854878419977774, + "learning_rate": 8.25e-06, + "loss": 0.9382, + "step": 275 + }, + { + "epoch": 0.04579202787340827, + "grad_norm": 7.774670072265875, + "learning_rate": 8.28e-06, + "loss": 0.9364, + "step": 276 + }, + { + "epoch": 0.04595794101787714, + "grad_norm": 7.567068884700512, + "learning_rate": 8.31e-06, + "loss": 0.9388, + "step": 277 + }, + { + "epoch": 0.04612385416234601, + "grad_norm": 7.605183779660865, + "learning_rate": 8.34e-06, + "loss": 0.9403, + "step": 278 + }, + { + "epoch": 0.04628976730681488, + "grad_norm": 7.566454167028441, + "learning_rate": 8.370000000000001e-06, + "loss": 0.9218, + "step": 279 + }, + { + "epoch": 0.04645568045128375, + "grad_norm": 7.3469120826750265, + "learning_rate": 8.400000000000001e-06, + "loss": 0.932, + "step": 280 + }, + { + "epoch": 0.046621593595752624, + "grad_norm": 7.26408565369186, + "learning_rate": 8.43e-06, + "loss": 0.9332, + "step": 281 + }, + { + "epoch": 0.046787506740221495, + "grad_norm": 7.126714812653672, + "learning_rate": 8.46e-06, + "loss": 0.9408, + "step": 282 + }, + { + "epoch": 0.046953419884690366, + "grad_norm": 7.160875540112923, + "learning_rate": 8.49e-06, + "loss": 0.9326, + "step": 283 + }, + { + "epoch": 0.04711933302915924, + "grad_norm": 6.989719202314339, + "learning_rate": 8.52e-06, + "loss": 0.9258, + "step": 284 + }, + { + "epoch": 0.04728524617362811, + "grad_norm": 6.897188941855704, + "learning_rate": 8.55e-06, + "loss": 0.9464, + "step": 285 + }, + { + "epoch": 0.04745115931809698, + "grad_norm": 6.841146949131663, + "learning_rate": 8.580000000000001e-06, + "loss": 0.9452, + "step": 286 + }, + { + "epoch": 0.04761707246256585, + "grad_norm": 6.841546310249179, + "learning_rate": 8.61e-06, + "loss": 0.9178, + "step": 287 + }, + { + "epoch": 0.047782985607034714, + "grad_norm": 6.544761686641858, + "learning_rate": 8.64e-06, + "loss": 0.9603, + "step": 288 + }, + { + "epoch": 0.047948898751503585, + "grad_norm": 6.614087506733643, + "learning_rate": 8.67e-06, + "loss": 0.916, + "step": 289 + }, + { + "epoch": 0.048114811895972456, + "grad_norm": 6.6124048640237945, + "learning_rate": 8.7e-06, + "loss": 0.9129, + "step": 290 + }, + { + "epoch": 0.04828072504044133, + "grad_norm": 6.408146101594846, + "learning_rate": 8.73e-06, + "loss": 0.9225, + "step": 291 + }, + { + "epoch": 0.0484466381849102, + "grad_norm": 6.502364355003014, + "learning_rate": 8.76e-06, + "loss": 0.9031, + "step": 292 + }, + { + "epoch": 0.04861255132937907, + "grad_norm": 6.331848917593811, + "learning_rate": 8.79e-06, + "loss": 0.9251, + "step": 293 + }, + { + "epoch": 0.04877846447384794, + "grad_norm": 6.312668466628936, + "learning_rate": 8.82e-06, + "loss": 0.9046, + "step": 294 + }, + { + "epoch": 0.04894437761831681, + "grad_norm": 6.120718833347466, + "learning_rate": 8.85e-06, + "loss": 0.9387, + "step": 295 + }, + { + "epoch": 0.04911029076278568, + "grad_norm": 6.146017531230814, + "learning_rate": 8.88e-06, + "loss": 0.9065, + "step": 296 + }, + { + "epoch": 0.049276203907254554, + "grad_norm": 6.2298783704404475, + "learning_rate": 8.91e-06, + "loss": 0.8897, + "step": 297 + }, + { + "epoch": 0.049442117051723425, + "grad_norm": 6.01881306966524, + "learning_rate": 8.939999999999999e-06, + "loss": 0.9179, + "step": 298 + }, + { + "epoch": 0.049608030196192296, + "grad_norm": 6.018783760260834, + "learning_rate": 8.97e-06, + "loss": 0.898, + "step": 299 + }, + { + "epoch": 0.04977394334066117, + "grad_norm": 5.909905610679987, + "learning_rate": 9e-06, + "loss": 0.9119, + "step": 300 + }, + { + "epoch": 0.04993985648513003, + "grad_norm": 5.734121028769744, + "learning_rate": 8.999999975052319e-06, + "loss": 0.9342, + "step": 301 + }, + { + "epoch": 0.0501057696295989, + "grad_norm": 5.78200438759076, + "learning_rate": 8.999999900209277e-06, + "loss": 0.9036, + "step": 302 + }, + { + "epoch": 0.05027168277406777, + "grad_norm": 5.667277372988322, + "learning_rate": 8.999999775470875e-06, + "loss": 0.9075, + "step": 303 + }, + { + "epoch": 0.050437595918536644, + "grad_norm": 5.686905178208374, + "learning_rate": 8.999999600837113e-06, + "loss": 0.9196, + "step": 304 + }, + { + "epoch": 0.050603509063005515, + "grad_norm": 5.589145090418338, + "learning_rate": 8.999999376307994e-06, + "loss": 0.9139, + "step": 305 + }, + { + "epoch": 0.050769422207474386, + "grad_norm": 5.53466927124937, + "learning_rate": 8.99999910188352e-06, + "loss": 0.9135, + "step": 306 + }, + { + "epoch": 0.05093533535194326, + "grad_norm": 9.215054382271381, + "learning_rate": 8.999998777563696e-06, + "loss": 0.9075, + "step": 307 + }, + { + "epoch": 0.05110124849641213, + "grad_norm": 5.50940374670424, + "learning_rate": 8.99999840334852e-06, + "loss": 0.906, + "step": 308 + }, + { + "epoch": 0.051267161640881, + "grad_norm": 5.315125426110582, + "learning_rate": 8.999997979238004e-06, + "loss": 0.9362, + "step": 309 + }, + { + "epoch": 0.05143307478534987, + "grad_norm": 5.456218191120494, + "learning_rate": 8.999997505232148e-06, + "loss": 0.8974, + "step": 310 + }, + { + "epoch": 0.05159898792981874, + "grad_norm": 5.3458253139310905, + "learning_rate": 8.999996981330957e-06, + "loss": 0.9052, + "step": 311 + }, + { + "epoch": 0.05176490107428761, + "grad_norm": 5.428852344895628, + "learning_rate": 8.999996407534438e-06, + "loss": 0.8927, + "step": 312 + }, + { + "epoch": 0.051930814218756484, + "grad_norm": 5.362891578705491, + "learning_rate": 8.999995783842599e-06, + "loss": 0.8913, + "step": 313 + }, + { + "epoch": 0.052096727363225355, + "grad_norm": 5.2337978422263385, + "learning_rate": 8.999995110255444e-06, + "loss": 0.9175, + "step": 314 + }, + { + "epoch": 0.05226264050769422, + "grad_norm": 5.176887043314268, + "learning_rate": 8.999994386772981e-06, + "loss": 0.9003, + "step": 315 + }, + { + "epoch": 0.05242855365216309, + "grad_norm": 5.184446115994752, + "learning_rate": 8.999993613395219e-06, + "loss": 0.8943, + "step": 316 + }, + { + "epoch": 0.05259446679663196, + "grad_norm": 5.159248082806763, + "learning_rate": 8.999992790122167e-06, + "loss": 0.8893, + "step": 317 + }, + { + "epoch": 0.05276037994110083, + "grad_norm": 5.083647157877446, + "learning_rate": 8.999991916953832e-06, + "loss": 0.8921, + "step": 318 + }, + { + "epoch": 0.0529262930855697, + "grad_norm": 5.0793184864339045, + "learning_rate": 8.999990993890227e-06, + "loss": 0.8891, + "step": 319 + }, + { + "epoch": 0.053092206230038574, + "grad_norm": 4.969318514135246, + "learning_rate": 8.99999002093136e-06, + "loss": 0.9008, + "step": 320 + }, + { + "epoch": 0.053258119374507445, + "grad_norm": 4.866162400058146, + "learning_rate": 8.999988998077239e-06, + "loss": 0.9074, + "step": 321 + }, + { + "epoch": 0.05342403251897632, + "grad_norm": 4.871718580190094, + "learning_rate": 8.999987925327882e-06, + "loss": 0.9105, + "step": 322 + }, + { + "epoch": 0.05358994566344519, + "grad_norm": 4.950770420419117, + "learning_rate": 8.999986802683295e-06, + "loss": 0.8792, + "step": 323 + }, + { + "epoch": 0.05375585880791406, + "grad_norm": 4.8879460632967735, + "learning_rate": 8.999985630143494e-06, + "loss": 0.8691, + "step": 324 + }, + { + "epoch": 0.05392177195238293, + "grad_norm": 4.776820682760442, + "learning_rate": 8.999984407708489e-06, + "loss": 0.9004, + "step": 325 + }, + { + "epoch": 0.0540876850968518, + "grad_norm": 4.734078577215334, + "learning_rate": 8.999983135378296e-06, + "loss": 0.8811, + "step": 326 + }, + { + "epoch": 0.05425359824132067, + "grad_norm": 4.702414692785676, + "learning_rate": 8.99998181315293e-06, + "loss": 0.9019, + "step": 327 + }, + { + "epoch": 0.054419511385789536, + "grad_norm": 4.552715310423402, + "learning_rate": 8.999980441032402e-06, + "loss": 0.9042, + "step": 328 + }, + { + "epoch": 0.05458542453025841, + "grad_norm": 4.507915660100879, + "learning_rate": 8.999979019016731e-06, + "loss": 0.9119, + "step": 329 + }, + { + "epoch": 0.05475133767472728, + "grad_norm": 4.523492938568341, + "learning_rate": 8.99997754710593e-06, + "loss": 0.8932, + "step": 330 + }, + { + "epoch": 0.05491725081919615, + "grad_norm": 4.6297463894007365, + "learning_rate": 8.999976025300017e-06, + "loss": 0.8477, + "step": 331 + }, + { + "epoch": 0.05508316396366502, + "grad_norm": 4.51195869056777, + "learning_rate": 8.999974453599007e-06, + "loss": 0.8758, + "step": 332 + }, + { + "epoch": 0.05524907710813389, + "grad_norm": 4.477645446414166, + "learning_rate": 8.99997283200292e-06, + "loss": 0.8538, + "step": 333 + }, + { + "epoch": 0.05541499025260276, + "grad_norm": 4.393730342387632, + "learning_rate": 8.999971160511772e-06, + "loss": 0.8964, + "step": 334 + }, + { + "epoch": 0.05558090339707163, + "grad_norm": 4.332577691208558, + "learning_rate": 8.999969439125582e-06, + "loss": 0.9002, + "step": 335 + }, + { + "epoch": 0.055746816541540505, + "grad_norm": 4.272904462844758, + "learning_rate": 8.999967667844369e-06, + "loss": 0.8936, + "step": 336 + }, + { + "epoch": 0.055912729686009376, + "grad_norm": 4.284138641618394, + "learning_rate": 8.999965846668153e-06, + "loss": 0.8806, + "step": 337 + }, + { + "epoch": 0.05607864283047825, + "grad_norm": 4.245997951478823, + "learning_rate": 8.999963975596955e-06, + "loss": 0.8933, + "step": 338 + }, + { + "epoch": 0.05624455597494712, + "grad_norm": 4.262350308738906, + "learning_rate": 8.999962054630794e-06, + "loss": 0.8672, + "step": 339 + }, + { + "epoch": 0.05641046911941599, + "grad_norm": 4.218320423707499, + "learning_rate": 8.999960083769692e-06, + "loss": 0.8697, + "step": 340 + }, + { + "epoch": 0.05657638226388485, + "grad_norm": 4.161434572377091, + "learning_rate": 8.999958063013673e-06, + "loss": 0.8878, + "step": 341 + }, + { + "epoch": 0.056742295408353724, + "grad_norm": 4.0848883777831855, + "learning_rate": 8.999955992362754e-06, + "loss": 0.9028, + "step": 342 + }, + { + "epoch": 0.056908208552822595, + "grad_norm": 4.087674741684919, + "learning_rate": 8.999953871816964e-06, + "loss": 0.884, + "step": 343 + }, + { + "epoch": 0.057074121697291466, + "grad_norm": 4.077627298421648, + "learning_rate": 8.999951701376323e-06, + "loss": 0.8733, + "step": 344 + }, + { + "epoch": 0.05724003484176034, + "grad_norm": 4.014185881607656, + "learning_rate": 8.999949481040857e-06, + "loss": 0.873, + "step": 345 + }, + { + "epoch": 0.05740594798622921, + "grad_norm": 4.048242476037428, + "learning_rate": 8.999947210810587e-06, + "loss": 0.8612, + "step": 346 + }, + { + "epoch": 0.05757186113069808, + "grad_norm": 3.9790776791909286, + "learning_rate": 8.999944890685543e-06, + "loss": 0.8745, + "step": 347 + }, + { + "epoch": 0.05773777427516695, + "grad_norm": 3.9412637498937935, + "learning_rate": 8.999942520665747e-06, + "loss": 0.8751, + "step": 348 + }, + { + "epoch": 0.05790368741963582, + "grad_norm": 3.9243342606372216, + "learning_rate": 8.999940100751228e-06, + "loss": 0.8705, + "step": 349 + }, + { + "epoch": 0.05806960056410469, + "grad_norm": 3.8615778390716713, + "learning_rate": 8.99993763094201e-06, + "loss": 0.8912, + "step": 350 + }, + { + "epoch": 0.058235513708573564, + "grad_norm": 3.859140301595901, + "learning_rate": 8.999935111238122e-06, + "loss": 0.8694, + "step": 351 + }, + { + "epoch": 0.058401426853042435, + "grad_norm": 3.822341835829405, + "learning_rate": 8.999932541639593e-06, + "loss": 0.8742, + "step": 352 + }, + { + "epoch": 0.058567339997511306, + "grad_norm": 3.8184180233248943, + "learning_rate": 8.99992992214645e-06, + "loss": 0.8676, + "step": 353 + }, + { + "epoch": 0.05873325314198018, + "grad_norm": 3.801210717065968, + "learning_rate": 8.999927252758723e-06, + "loss": 0.8662, + "step": 354 + }, + { + "epoch": 0.05889916628644904, + "grad_norm": 3.780756273238635, + "learning_rate": 8.99992453347644e-06, + "loss": 0.8649, + "step": 355 + }, + { + "epoch": 0.05906507943091791, + "grad_norm": 3.7474392141127906, + "learning_rate": 8.999921764299633e-06, + "loss": 0.8583, + "step": 356 + }, + { + "epoch": 0.05923099257538678, + "grad_norm": 3.730060223515627, + "learning_rate": 8.999918945228331e-06, + "loss": 0.8589, + "step": 357 + }, + { + "epoch": 0.059396905719855654, + "grad_norm": 3.6811758989495567, + "learning_rate": 8.999916076262566e-06, + "loss": 0.8568, + "step": 358 + }, + { + "epoch": 0.059562818864324525, + "grad_norm": 3.6726514292781505, + "learning_rate": 8.999913157402371e-06, + "loss": 0.8573, + "step": 359 + }, + { + "epoch": 0.059728732008793396, + "grad_norm": 3.595206525156515, + "learning_rate": 8.999910188647777e-06, + "loss": 0.8742, + "step": 360 + }, + { + "epoch": 0.05989464515326227, + "grad_norm": 3.471677475337036, + "learning_rate": 8.999907169998819e-06, + "loss": 0.8869, + "step": 361 + }, + { + "epoch": 0.06006055829773114, + "grad_norm": 3.5659572607511163, + "learning_rate": 8.999904101455525e-06, + "loss": 0.8576, + "step": 362 + }, + { + "epoch": 0.06022647144220001, + "grad_norm": 3.494068972585631, + "learning_rate": 8.999900983017934e-06, + "loss": 0.8626, + "step": 363 + }, + { + "epoch": 0.06039238458666888, + "grad_norm": 3.5366577459151984, + "learning_rate": 8.99989781468608e-06, + "loss": 0.8641, + "step": 364 + }, + { + "epoch": 0.06055829773113775, + "grad_norm": 3.4640552235900026, + "learning_rate": 8.999894596459998e-06, + "loss": 0.8498, + "step": 365 + }, + { + "epoch": 0.06072421087560662, + "grad_norm": 3.4627533254161778, + "learning_rate": 8.999891328339722e-06, + "loss": 0.8701, + "step": 366 + }, + { + "epoch": 0.060890124020075494, + "grad_norm": 3.4543538438907455, + "learning_rate": 8.99988801032529e-06, + "loss": 0.8525, + "step": 367 + }, + { + "epoch": 0.06105603716454436, + "grad_norm": 3.4310968757128895, + "learning_rate": 8.999884642416736e-06, + "loss": 0.8653, + "step": 368 + }, + { + "epoch": 0.06122195030901323, + "grad_norm": 3.3538182281557583, + "learning_rate": 8.999881224614101e-06, + "loss": 0.8908, + "step": 369 + }, + { + "epoch": 0.0613878634534821, + "grad_norm": 3.3905994838285163, + "learning_rate": 8.999877756917422e-06, + "loss": 0.8521, + "step": 370 + }, + { + "epoch": 0.06155377659795097, + "grad_norm": 3.3231583523257604, + "learning_rate": 8.999874239326737e-06, + "loss": 0.865, + "step": 371 + }, + { + "epoch": 0.06171968974241984, + "grad_norm": 3.3486181559656334, + "learning_rate": 8.999870671842085e-06, + "loss": 0.8565, + "step": 372 + }, + { + "epoch": 0.06188560288688871, + "grad_norm": 3.391890189649907, + "learning_rate": 8.999867054463503e-06, + "loss": 0.8344, + "step": 373 + }, + { + "epoch": 0.062051516031357584, + "grad_norm": 3.2932669339414287, + "learning_rate": 8.999863387191034e-06, + "loss": 0.8668, + "step": 374 + }, + { + "epoch": 0.062217429175826455, + "grad_norm": 3.350955000113915, + "learning_rate": 8.99985967002472e-06, + "loss": 0.8253, + "step": 375 + }, + { + "epoch": 0.062383342320295326, + "grad_norm": 3.2330710537829304, + "learning_rate": 8.9998559029646e-06, + "loss": 0.8684, + "step": 376 + }, + { + "epoch": 0.0625492554647642, + "grad_norm": 3.2098638665770243, + "learning_rate": 8.999852086010716e-06, + "loss": 0.8628, + "step": 377 + }, + { + "epoch": 0.06271516860923307, + "grad_norm": 3.1892699537908156, + "learning_rate": 8.99984821916311e-06, + "loss": 0.8696, + "step": 378 + }, + { + "epoch": 0.06288108175370194, + "grad_norm": 3.2334974851490363, + "learning_rate": 8.999844302421825e-06, + "loss": 0.8387, + "step": 379 + }, + { + "epoch": 0.06304699489817081, + "grad_norm": 3.238152271931643, + "learning_rate": 8.999840335786906e-06, + "loss": 0.8276, + "step": 380 + }, + { + "epoch": 0.06321290804263968, + "grad_norm": 3.1331529105356974, + "learning_rate": 8.999836319258393e-06, + "loss": 0.8512, + "step": 381 + }, + { + "epoch": 0.06337882118710855, + "grad_norm": 3.152663308943621, + "learning_rate": 8.999832252836337e-06, + "loss": 0.8384, + "step": 382 + }, + { + "epoch": 0.06354473433157742, + "grad_norm": 3.084432904970283, + "learning_rate": 8.999828136520776e-06, + "loss": 0.8584, + "step": 383 + }, + { + "epoch": 0.0637106474760463, + "grad_norm": 3.1120289343036838, + "learning_rate": 8.999823970311762e-06, + "loss": 0.8443, + "step": 384 + }, + { + "epoch": 0.06387656062051517, + "grad_norm": 3.1157944117751373, + "learning_rate": 8.999819754209336e-06, + "loss": 0.8355, + "step": 385 + }, + { + "epoch": 0.06404247376498404, + "grad_norm": 3.016422516063702, + "learning_rate": 8.999815488213547e-06, + "loss": 0.8718, + "step": 386 + }, + { + "epoch": 0.06420838690945291, + "grad_norm": 3.0280804164647277, + "learning_rate": 8.999811172324443e-06, + "loss": 0.8525, + "step": 387 + }, + { + "epoch": 0.06437430005392177, + "grad_norm": 3.036473880069474, + "learning_rate": 8.999806806542072e-06, + "loss": 0.8644, + "step": 388 + }, + { + "epoch": 0.06454021319839064, + "grad_norm": 2.991716780189957, + "learning_rate": 8.99980239086648e-06, + "loss": 0.8474, + "step": 389 + }, + { + "epoch": 0.06470612634285951, + "grad_norm": 3.0744448937466635, + "learning_rate": 8.999797925297717e-06, + "loss": 0.8251, + "step": 390 + }, + { + "epoch": 0.06487203948732838, + "grad_norm": 2.970959395794443, + "learning_rate": 8.999793409835835e-06, + "loss": 0.8497, + "step": 391 + }, + { + "epoch": 0.06503795263179725, + "grad_norm": 2.942697893932714, + "learning_rate": 8.99978884448088e-06, + "loss": 0.8497, + "step": 392 + }, + { + "epoch": 0.06520386577626612, + "grad_norm": 2.9217535841307374, + "learning_rate": 8.999784229232905e-06, + "loss": 0.8638, + "step": 393 + }, + { + "epoch": 0.06536977892073499, + "grad_norm": 2.9558083426986483, + "learning_rate": 8.999779564091961e-06, + "loss": 0.8274, + "step": 394 + }, + { + "epoch": 0.06553569206520386, + "grad_norm": 2.885361223118148, + "learning_rate": 8.999774849058102e-06, + "loss": 0.8583, + "step": 395 + }, + { + "epoch": 0.06570160520967273, + "grad_norm": 2.8800300061377473, + "learning_rate": 8.999770084131374e-06, + "loss": 0.8499, + "step": 396 + }, + { + "epoch": 0.0658675183541416, + "grad_norm": 2.8665971529078695, + "learning_rate": 8.999765269311836e-06, + "loss": 0.8489, + "step": 397 + }, + { + "epoch": 0.06603343149861048, + "grad_norm": 2.8344547998127103, + "learning_rate": 8.999760404599538e-06, + "loss": 0.8432, + "step": 398 + }, + { + "epoch": 0.06619934464307935, + "grad_norm": 2.839791425822409, + "learning_rate": 8.999755489994537e-06, + "loss": 0.8407, + "step": 399 + }, + { + "epoch": 0.06636525778754822, + "grad_norm": 2.828150592505063, + "learning_rate": 8.999750525496884e-06, + "loss": 0.8217, + "step": 400 + }, + { + "epoch": 0.06585511054543962, + "grad_norm": 2.791449185667969, + "learning_rate": 8.99975071828438e-06, + "loss": 0.8651, + "step": 401 + }, + { + "epoch": 0.06601933775378235, + "grad_norm": 2.799198588104831, + "learning_rate": 8.999745757622507e-06, + "loss": 0.8615, + "step": 402 + }, + { + "epoch": 0.0661835649621251, + "grad_norm": 2.731196238394043, + "learning_rate": 8.99974074808897e-06, + "loss": 0.87, + "step": 403 + }, + { + "epoch": 0.06634779217046784, + "grad_norm": 2.7669579734667407, + "learning_rate": 8.999735689683818e-06, + "loss": 0.8304, + "step": 404 + }, + { + "epoch": 0.06651201937881059, + "grad_norm": 2.6933917829612226, + "learning_rate": 8.999730582407112e-06, + "loss": 0.8645, + "step": 405 + }, + { + "epoch": 0.06667624658715332, + "grad_norm": 2.7037937388444493, + "learning_rate": 8.999725426258905e-06, + "loss": 0.8522, + "step": 406 + }, + { + "epoch": 0.06684047379549607, + "grad_norm": 2.6529365860710947, + "learning_rate": 8.999720221239252e-06, + "loss": 0.8665, + "step": 407 + }, + { + "epoch": 0.0670047010038388, + "grad_norm": 2.6958361564015623, + "learning_rate": 8.999714967348209e-06, + "loss": 0.8421, + "step": 408 + }, + { + "epoch": 0.06716892821218155, + "grad_norm": 2.679623752008986, + "learning_rate": 8.999709664585836e-06, + "loss": 0.8444, + "step": 409 + }, + { + "epoch": 0.0673331554205243, + "grad_norm": 2.6846073911898247, + "learning_rate": 8.999704312952188e-06, + "loss": 0.8225, + "step": 410 + }, + { + "epoch": 0.06749738262886704, + "grad_norm": 2.6357646834424733, + "learning_rate": 8.999698912447324e-06, + "loss": 0.8428, + "step": 411 + }, + { + "epoch": 0.06766160983720979, + "grad_norm": 2.5864163075261515, + "learning_rate": 8.999693463071303e-06, + "loss": 0.8647, + "step": 412 + }, + { + "epoch": 0.06782583704555252, + "grad_norm": 2.649716139853906, + "learning_rate": 8.999687964824184e-06, + "loss": 0.8336, + "step": 413 + }, + { + "epoch": 0.06799006425389527, + "grad_norm": 2.6697160350910982, + "learning_rate": 8.999682417706028e-06, + "loss": 0.8315, + "step": 414 + }, + { + "epoch": 0.068154291462238, + "grad_norm": 2.614544893051611, + "learning_rate": 8.999676821716893e-06, + "loss": 0.833, + "step": 415 + }, + { + "epoch": 0.06831851867058075, + "grad_norm": 2.605311057811457, + "learning_rate": 8.99967117685684e-06, + "loss": 0.8354, + "step": 416 + }, + { + "epoch": 0.06848274587892349, + "grad_norm": 2.512994946017452, + "learning_rate": 8.999665483125932e-06, + "loss": 0.8527, + "step": 417 + }, + { + "epoch": 0.06864697308726624, + "grad_norm": 2.532102472565359, + "learning_rate": 8.999659740524227e-06, + "loss": 0.8526, + "step": 418 + }, + { + "epoch": 0.06881120029560897, + "grad_norm": 2.537552292844787, + "learning_rate": 8.999653949051792e-06, + "loss": 0.8593, + "step": 419 + }, + { + "epoch": 0.06897542750395172, + "grad_norm": 2.520981683215179, + "learning_rate": 8.999648108708689e-06, + "loss": 0.8409, + "step": 420 + }, + { + "epoch": 0.06913965471229445, + "grad_norm": 2.5307279062339947, + "learning_rate": 8.999642219494979e-06, + "loss": 0.8544, + "step": 421 + }, + { + "epoch": 0.0693038819206372, + "grad_norm": 2.513981724612271, + "learning_rate": 8.999636281410728e-06, + "loss": 0.838, + "step": 422 + }, + { + "epoch": 0.06946810912897994, + "grad_norm": 2.5196092111609443, + "learning_rate": 8.999630294456001e-06, + "loss": 0.822, + "step": 423 + }, + { + "epoch": 0.06963233633732269, + "grad_norm": 2.501508735280606, + "learning_rate": 8.99962425863086e-06, + "loss": 0.814, + "step": 424 + }, + { + "epoch": 0.06979656354566542, + "grad_norm": 2.463190118948678, + "learning_rate": 8.999618173935375e-06, + "loss": 0.821, + "step": 425 + }, + { + "epoch": 0.06996079075400817, + "grad_norm": 2.485170388269591, + "learning_rate": 8.999612040369608e-06, + "loss": 0.8129, + "step": 426 + }, + { + "epoch": 0.07012501796235092, + "grad_norm": 2.4535480317174696, + "learning_rate": 8.999605857933626e-06, + "loss": 0.8204, + "step": 427 + }, + { + "epoch": 0.07028924517069365, + "grad_norm": 2.4297331854286934, + "learning_rate": 8.999599626627498e-06, + "loss": 0.8228, + "step": 428 + }, + { + "epoch": 0.0704534723790364, + "grad_norm": 2.412420279962346, + "learning_rate": 8.999593346451292e-06, + "loss": 0.8317, + "step": 429 + }, + { + "epoch": 0.07061769958737914, + "grad_norm": 2.3821068613707364, + "learning_rate": 8.999587017405074e-06, + "loss": 0.8496, + "step": 430 + }, + { + "epoch": 0.07078192679572189, + "grad_norm": 2.4590557883994086, + "learning_rate": 8.999580639488916e-06, + "loss": 0.8142, + "step": 431 + }, + { + "epoch": 0.07094615400406462, + "grad_norm": 2.4087171883288763, + "learning_rate": 8.999574212702883e-06, + "loss": 0.8401, + "step": 432 + }, + { + "epoch": 0.07111038121240737, + "grad_norm": 2.375466734189583, + "learning_rate": 8.99956773704705e-06, + "loss": 0.8161, + "step": 433 + }, + { + "epoch": 0.0712746084207501, + "grad_norm": 2.377908447736706, + "learning_rate": 8.999561212521482e-06, + "loss": 0.8236, + "step": 434 + }, + { + "epoch": 0.07143883562909285, + "grad_norm": 2.3169519503998264, + "learning_rate": 8.999554639126252e-06, + "loss": 0.8426, + "step": 435 + }, + { + "epoch": 0.07160306283743559, + "grad_norm": 2.3362968144154816, + "learning_rate": 8.999548016861435e-06, + "loss": 0.8359, + "step": 436 + }, + { + "epoch": 0.07176729004577834, + "grad_norm": 2.386407267700876, + "learning_rate": 8.999541345727095e-06, + "loss": 0.8029, + "step": 437 + }, + { + "epoch": 0.07193151725412107, + "grad_norm": 2.3200737690092303, + "learning_rate": 8.999534625723312e-06, + "loss": 0.8193, + "step": 438 + }, + { + "epoch": 0.07209574446246382, + "grad_norm": 2.340620712080502, + "learning_rate": 8.999527856850155e-06, + "loss": 0.8179, + "step": 439 + }, + { + "epoch": 0.07225997167080656, + "grad_norm": 2.3005341648037216, + "learning_rate": 8.999521039107699e-06, + "loss": 0.8352, + "step": 440 + }, + { + "epoch": 0.0724241988791493, + "grad_norm": 2.268310807131279, + "learning_rate": 8.999514172496018e-06, + "loss": 0.8584, + "step": 441 + }, + { + "epoch": 0.07258842608749204, + "grad_norm": 2.280231628453319, + "learning_rate": 8.999507257015185e-06, + "loss": 0.8265, + "step": 442 + }, + { + "epoch": 0.07275265329583479, + "grad_norm": 2.2563491434376366, + "learning_rate": 8.999500292665276e-06, + "loss": 0.8438, + "step": 443 + }, + { + "epoch": 0.07291688050417754, + "grad_norm": 2.2687198666635093, + "learning_rate": 8.999493279446368e-06, + "loss": 0.8342, + "step": 444 + }, + { + "epoch": 0.07308110771252027, + "grad_norm": 2.230140831740877, + "learning_rate": 8.999486217358537e-06, + "loss": 0.8238, + "step": 445 + }, + { + "epoch": 0.07324533492086302, + "grad_norm": 2.210105023067696, + "learning_rate": 8.999479106401858e-06, + "loss": 0.8281, + "step": 446 + }, + { + "epoch": 0.07340956212920575, + "grad_norm": 2.22721070539442, + "learning_rate": 8.999471946576406e-06, + "loss": 0.8255, + "step": 447 + }, + { + "epoch": 0.0735737893375485, + "grad_norm": 2.1865500694065423, + "learning_rate": 8.999464737882265e-06, + "loss": 0.8383, + "step": 448 + }, + { + "epoch": 0.07373801654589124, + "grad_norm": 2.2143774580363145, + "learning_rate": 8.999457480319511e-06, + "loss": 0.8283, + "step": 449 + }, + { + "epoch": 0.07390224375423399, + "grad_norm": 2.1718648151396462, + "learning_rate": 8.99945017388822e-06, + "loss": 0.8467, + "step": 450 + }, + { + "epoch": 0.07406647096257672, + "grad_norm": 2.206728470233497, + "learning_rate": 8.999442818588473e-06, + "loss": 0.8038, + "step": 451 + }, + { + "epoch": 0.07423069817091947, + "grad_norm": 2.1780216266278853, + "learning_rate": 8.99943541442035e-06, + "loss": 0.8327, + "step": 452 + }, + { + "epoch": 0.0743949253792622, + "grad_norm": 2.1845803500643184, + "learning_rate": 8.999427961383933e-06, + "loss": 0.7931, + "step": 453 + }, + { + "epoch": 0.07455915258760495, + "grad_norm": 2.13549126985634, + "learning_rate": 8.9994204594793e-06, + "loss": 0.8301, + "step": 454 + }, + { + "epoch": 0.07472337979594769, + "grad_norm": 2.1297114580768404, + "learning_rate": 8.999412908706536e-06, + "loss": 0.8271, + "step": 455 + }, + { + "epoch": 0.07488760700429044, + "grad_norm": 2.1538771659712497, + "learning_rate": 8.99940530906572e-06, + "loss": 0.8335, + "step": 456 + }, + { + "epoch": 0.07505183421263317, + "grad_norm": 2.15071828742556, + "learning_rate": 8.999397660556935e-06, + "loss": 0.8057, + "step": 457 + }, + { + "epoch": 0.07521606142097592, + "grad_norm": 2.093120448722487, + "learning_rate": 8.999389963180265e-06, + "loss": 0.8367, + "step": 458 + }, + { + "epoch": 0.07538028862931866, + "grad_norm": 2.085757684625511, + "learning_rate": 8.999382216935793e-06, + "loss": 0.8278, + "step": 459 + }, + { + "epoch": 0.0755445158376614, + "grad_norm": 2.102360562017555, + "learning_rate": 8.999374421823603e-06, + "loss": 0.8206, + "step": 460 + }, + { + "epoch": 0.07570874304600414, + "grad_norm": 2.061123490022161, + "learning_rate": 8.99936657784378e-06, + "loss": 0.8178, + "step": 461 + }, + { + "epoch": 0.07587297025434689, + "grad_norm": 2.040202576510708, + "learning_rate": 8.99935868499641e-06, + "loss": 0.8234, + "step": 462 + }, + { + "epoch": 0.07603719746268964, + "grad_norm": 2.0925459619084807, + "learning_rate": 8.999350743281578e-06, + "loss": 0.8011, + "step": 463 + }, + { + "epoch": 0.07620142467103237, + "grad_norm": 2.0618958956130506, + "learning_rate": 8.999342752699368e-06, + "loss": 0.8064, + "step": 464 + }, + { + "epoch": 0.07636565187937512, + "grad_norm": 2.0446614226644275, + "learning_rate": 8.999334713249872e-06, + "loss": 0.8143, + "step": 465 + }, + { + "epoch": 0.07652987908771786, + "grad_norm": 2.009335333061511, + "learning_rate": 8.999326624933172e-06, + "loss": 0.8326, + "step": 466 + }, + { + "epoch": 0.0766941062960606, + "grad_norm": 2.0489481976521797, + "learning_rate": 8.999318487749358e-06, + "loss": 0.7827, + "step": 467 + }, + { + "epoch": 0.07685833350440334, + "grad_norm": 2.0221916630599304, + "learning_rate": 8.99931030169852e-06, + "loss": 0.804, + "step": 468 + }, + { + "epoch": 0.07702256071274609, + "grad_norm": 2.029438896224542, + "learning_rate": 8.999302066780746e-06, + "loss": 0.8087, + "step": 469 + }, + { + "epoch": 0.07718678792108882, + "grad_norm": 1.978319845152218, + "learning_rate": 8.999293782996124e-06, + "loss": 0.821, + "step": 470 + }, + { + "epoch": 0.07735101512943157, + "grad_norm": 1.9918589527926724, + "learning_rate": 8.999285450344744e-06, + "loss": 0.8201, + "step": 471 + }, + { + "epoch": 0.0775152423377743, + "grad_norm": 2.0023287957387836, + "learning_rate": 8.999277068826698e-06, + "loss": 0.8114, + "step": 472 + }, + { + "epoch": 0.07767946954611706, + "grad_norm": 1.9735825072492768, + "learning_rate": 8.999268638442074e-06, + "loss": 0.81, + "step": 473 + }, + { + "epoch": 0.07784369675445979, + "grad_norm": 1.9457430616789873, + "learning_rate": 8.99926015919097e-06, + "loss": 0.8161, + "step": 474 + }, + { + "epoch": 0.07800792396280254, + "grad_norm": 1.9501504831433791, + "learning_rate": 8.999251631073472e-06, + "loss": 0.8233, + "step": 475 + }, + { + "epoch": 0.07817215117114527, + "grad_norm": 1.9587112324999745, + "learning_rate": 8.999243054089675e-06, + "loss": 0.8289, + "step": 476 + }, + { + "epoch": 0.07833637837948802, + "grad_norm": 1.9321723467173395, + "learning_rate": 8.999234428239671e-06, + "loss": 0.81, + "step": 477 + }, + { + "epoch": 0.07850060558783076, + "grad_norm": 1.8961717334570505, + "learning_rate": 8.999225753523554e-06, + "loss": 0.8286, + "step": 478 + }, + { + "epoch": 0.0786648327961735, + "grad_norm": 1.8988900894264487, + "learning_rate": 8.99921702994142e-06, + "loss": 0.8257, + "step": 479 + }, + { + "epoch": 0.07882906000451625, + "grad_norm": 1.917437492909705, + "learning_rate": 8.999208257493363e-06, + "loss": 0.796, + "step": 480 + }, + { + "epoch": 0.07899328721285899, + "grad_norm": 1.9162103223989222, + "learning_rate": 8.999199436179476e-06, + "loss": 0.8117, + "step": 481 + }, + { + "epoch": 0.07915751442120174, + "grad_norm": 1.8900698273277583, + "learning_rate": 8.999190565999858e-06, + "loss": 0.8188, + "step": 482 + }, + { + "epoch": 0.07932174162954447, + "grad_norm": 1.8766604711654709, + "learning_rate": 8.999181646954602e-06, + "loss": 0.8152, + "step": 483 + }, + { + "epoch": 0.07948596883788722, + "grad_norm": 1.8868541925904996, + "learning_rate": 8.999172679043808e-06, + "loss": 0.812, + "step": 484 + }, + { + "epoch": 0.07965019604622996, + "grad_norm": 1.8559987483245017, + "learning_rate": 8.999163662267572e-06, + "loss": 0.8223, + "step": 485 + }, + { + "epoch": 0.0798144232545727, + "grad_norm": 1.8423384119854995, + "learning_rate": 8.999154596625992e-06, + "loss": 0.8195, + "step": 486 + }, + { + "epoch": 0.07997865046291544, + "grad_norm": 1.8755155601469158, + "learning_rate": 8.999145482119168e-06, + "loss": 0.789, + "step": 487 + }, + { + "epoch": 0.08014287767125819, + "grad_norm": 1.8578743443690677, + "learning_rate": 8.999136318747195e-06, + "loss": 0.817, + "step": 488 + }, + { + "epoch": 0.08030710487960092, + "grad_norm": 1.8149426139291804, + "learning_rate": 8.999127106510175e-06, + "loss": 0.8298, + "step": 489 + }, + { + "epoch": 0.08047133208794367, + "grad_norm": 1.8071557080745544, + "learning_rate": 8.999117845408209e-06, + "loss": 0.8156, + "step": 490 + }, + { + "epoch": 0.08063555929628641, + "grad_norm": 1.7903215084168274, + "learning_rate": 8.999108535441398e-06, + "loss": 0.8213, + "step": 491 + }, + { + "epoch": 0.08079978650462916, + "grad_norm": 1.8039616892674522, + "learning_rate": 8.99909917660984e-06, + "loss": 0.8086, + "step": 492 + }, + { + "epoch": 0.08096401371297189, + "grad_norm": 1.8228172637206808, + "learning_rate": 8.99908976891364e-06, + "loss": 0.806, + "step": 493 + }, + { + "epoch": 0.08112824092131464, + "grad_norm": 1.808137095151838, + "learning_rate": 8.999080312352896e-06, + "loss": 0.7975, + "step": 494 + }, + { + "epoch": 0.08129246812965737, + "grad_norm": 1.7910413085027803, + "learning_rate": 8.999070806927716e-06, + "loss": 0.7961, + "step": 495 + }, + { + "epoch": 0.08145669533800012, + "grad_norm": 1.8391374609956732, + "learning_rate": 8.9990612526382e-06, + "loss": 0.7667, + "step": 496 + }, + { + "epoch": 0.08162092254634287, + "grad_norm": 1.7825821027741344, + "learning_rate": 8.999051649484451e-06, + "loss": 0.8082, + "step": 497 + }, + { + "epoch": 0.08178514975468561, + "grad_norm": 1.7535944210904948, + "learning_rate": 8.999041997466575e-06, + "loss": 0.806, + "step": 498 + }, + { + "epoch": 0.08194937696302836, + "grad_norm": 1.7940946696819353, + "learning_rate": 8.999032296584675e-06, + "loss": 0.7794, + "step": 499 + }, + { + "epoch": 0.08211360417137109, + "grad_norm": 1.7675061166108443, + "learning_rate": 8.999022546838862e-06, + "loss": 0.8007, + "step": 500 + }, + { + "epoch": 0.08227783137971384, + "grad_norm": 1.7466093753207794, + "learning_rate": 8.999012748229235e-06, + "loss": 0.8085, + "step": 501 + }, + { + "epoch": 0.08244205858805657, + "grad_norm": 1.7384682546038235, + "learning_rate": 8.999002900755904e-06, + "loss": 0.8029, + "step": 502 + }, + { + "epoch": 0.08260628579639932, + "grad_norm": 1.749204038070269, + "learning_rate": 8.998993004418973e-06, + "loss": 0.8065, + "step": 503 + }, + { + "epoch": 0.08277051300474206, + "grad_norm": 1.731827509189124, + "learning_rate": 8.998983059218553e-06, + "loss": 0.7904, + "step": 504 + }, + { + "epoch": 0.0829347402130848, + "grad_norm": 1.713406755440384, + "learning_rate": 8.998973065154751e-06, + "loss": 0.7929, + "step": 505 + }, + { + "epoch": 0.08309896742142754, + "grad_norm": 1.7370353467699013, + "learning_rate": 8.998963022227676e-06, + "loss": 0.7979, + "step": 506 + }, + { + "epoch": 0.08326319462977029, + "grad_norm": 1.722144711965804, + "learning_rate": 8.998952930437434e-06, + "loss": 0.7864, + "step": 507 + }, + { + "epoch": 0.08342742183811303, + "grad_norm": 1.6850288461926786, + "learning_rate": 8.998942789784138e-06, + "loss": 0.7966, + "step": 508 + }, + { + "epoch": 0.08359164904645577, + "grad_norm": 1.7125487489335667, + "learning_rate": 8.998932600267896e-06, + "loss": 0.789, + "step": 509 + }, + { + "epoch": 0.08375587625479851, + "grad_norm": 1.666553952443347, + "learning_rate": 8.998922361888821e-06, + "loss": 0.8144, + "step": 510 + }, + { + "epoch": 0.08392010346314126, + "grad_norm": 1.6766351515133946, + "learning_rate": 8.998912074647022e-06, + "loss": 0.8144, + "step": 511 + }, + { + "epoch": 0.08408433067148399, + "grad_norm": 1.6676136142415157, + "learning_rate": 8.998901738542612e-06, + "loss": 0.8086, + "step": 512 + }, + { + "epoch": 0.08424855787982674, + "grad_norm": 1.6939153245378111, + "learning_rate": 8.998891353575703e-06, + "loss": 0.7887, + "step": 513 + }, + { + "epoch": 0.08441278508816948, + "grad_norm": 1.6367960342037091, + "learning_rate": 8.998880919746406e-06, + "loss": 0.8217, + "step": 514 + }, + { + "epoch": 0.08457701229651222, + "grad_norm": 1.656541292788681, + "learning_rate": 8.998870437054837e-06, + "loss": 0.7918, + "step": 515 + }, + { + "epoch": 0.08474123950485497, + "grad_norm": 1.649968215675082, + "learning_rate": 8.99885990550111e-06, + "loss": 0.7948, + "step": 516 + }, + { + "epoch": 0.08490546671319771, + "grad_norm": 1.637068861746453, + "learning_rate": 8.998849325085336e-06, + "loss": 0.8022, + "step": 517 + }, + { + "epoch": 0.08506969392154046, + "grad_norm": 1.6598278261447834, + "learning_rate": 8.998838695807632e-06, + "loss": 0.7767, + "step": 518 + }, + { + "epoch": 0.08523392112988319, + "grad_norm": 1.6497075059531359, + "learning_rate": 8.998828017668115e-06, + "loss": 0.7843, + "step": 519 + }, + { + "epoch": 0.08539814833822594, + "grad_norm": 1.6048685678000418, + "learning_rate": 8.9988172906669e-06, + "loss": 0.8041, + "step": 520 + }, + { + "epoch": 0.08556237554656868, + "grad_norm": 1.6169953267344268, + "learning_rate": 8.998806514804102e-06, + "loss": 0.7884, + "step": 521 + }, + { + "epoch": 0.08572660275491142, + "grad_norm": 1.6177878656916025, + "learning_rate": 8.998795690079838e-06, + "loss": 0.7828, + "step": 522 + }, + { + "epoch": 0.08589082996325416, + "grad_norm": 1.605566301566824, + "learning_rate": 8.998784816494227e-06, + "loss": 0.7877, + "step": 523 + }, + { + "epoch": 0.08605505717159691, + "grad_norm": 1.5887379483469, + "learning_rate": 8.99877389404739e-06, + "loss": 0.8035, + "step": 524 + }, + { + "epoch": 0.08621928437993964, + "grad_norm": 1.605034282881469, + "learning_rate": 8.998762922739438e-06, + "loss": 0.793, + "step": 525 + }, + { + "epoch": 0.08638351158828239, + "grad_norm": 1.5765227198544196, + "learning_rate": 8.998751902570496e-06, + "loss": 0.796, + "step": 526 + }, + { + "epoch": 0.08654773879662513, + "grad_norm": 1.5863705667950556, + "learning_rate": 8.998740833540683e-06, + "loss": 0.7796, + "step": 527 + }, + { + "epoch": 0.08671196600496787, + "grad_norm": 1.5650967154526163, + "learning_rate": 8.998729715650116e-06, + "loss": 0.7964, + "step": 528 + }, + { + "epoch": 0.08687619321331061, + "grad_norm": 1.5973499387442838, + "learning_rate": 8.99871854889892e-06, + "loss": 0.7778, + "step": 529 + }, + { + "epoch": 0.08704042042165336, + "grad_norm": 1.5399366650437916, + "learning_rate": 8.998707333287214e-06, + "loss": 0.8062, + "step": 530 + }, + { + "epoch": 0.0872046476299961, + "grad_norm": 1.5877605974589395, + "learning_rate": 8.99869606881512e-06, + "loss": 0.7861, + "step": 531 + }, + { + "epoch": 0.08736887483833884, + "grad_norm": 1.5404243830810433, + "learning_rate": 8.998684755482762e-06, + "loss": 0.7904, + "step": 532 + }, + { + "epoch": 0.08753310204668159, + "grad_norm": 1.5293995565013145, + "learning_rate": 8.99867339329026e-06, + "loss": 0.8052, + "step": 533 + }, + { + "epoch": 0.08769732925502433, + "grad_norm": 1.5285645822160316, + "learning_rate": 8.99866198223774e-06, + "loss": 0.8057, + "step": 534 + }, + { + "epoch": 0.08786155646336707, + "grad_norm": 1.5317827095316812, + "learning_rate": 8.998650522325322e-06, + "loss": 0.7905, + "step": 535 + }, + { + "epoch": 0.08802578367170981, + "grad_norm": 1.5487961117886297, + "learning_rate": 8.998639013553136e-06, + "loss": 0.7814, + "step": 536 + }, + { + "epoch": 0.08819001088005256, + "grad_norm": 1.512615812975723, + "learning_rate": 8.998627455921304e-06, + "loss": 0.794, + "step": 537 + }, + { + "epoch": 0.08835423808839529, + "grad_norm": 1.524531472522993, + "learning_rate": 8.998615849429952e-06, + "loss": 0.7853, + "step": 538 + }, + { + "epoch": 0.08851846529673804, + "grad_norm": 1.5113713334392895, + "learning_rate": 8.998604194079204e-06, + "loss": 0.7985, + "step": 539 + }, + { + "epoch": 0.08868269250508078, + "grad_norm": 1.513587345447187, + "learning_rate": 8.99859248986919e-06, + "loss": 0.7818, + "step": 540 + }, + { + "epoch": 0.08884691971342353, + "grad_norm": 1.5025263128659543, + "learning_rate": 8.998580736800035e-06, + "loss": 0.7828, + "step": 541 + }, + { + "epoch": 0.08901114692176626, + "grad_norm": 1.50559719285815, + "learning_rate": 8.998568934871868e-06, + "loss": 0.7903, + "step": 542 + }, + { + "epoch": 0.08917537413010901, + "grad_norm": 1.4894498383121608, + "learning_rate": 8.998557084084815e-06, + "loss": 0.7731, + "step": 543 + }, + { + "epoch": 0.08933960133845174, + "grad_norm": 1.4952299462037122, + "learning_rate": 8.998545184439007e-06, + "loss": 0.7702, + "step": 544 + }, + { + "epoch": 0.08950382854679449, + "grad_norm": 1.4653177315472807, + "learning_rate": 8.99853323593457e-06, + "loss": 0.7988, + "step": 545 + }, + { + "epoch": 0.08966805575513723, + "grad_norm": 1.459127859543925, + "learning_rate": 8.99852123857164e-06, + "loss": 0.8138, + "step": 546 + }, + { + "epoch": 0.08983228296347998, + "grad_norm": 1.4651643994973669, + "learning_rate": 8.998509192350342e-06, + "loss": 0.7876, + "step": 547 + }, + { + "epoch": 0.08999651017182271, + "grad_norm": 1.4676671777660628, + "learning_rate": 8.998497097270808e-06, + "loss": 0.7933, + "step": 548 + }, + { + "epoch": 0.09016073738016546, + "grad_norm": 1.4376367008426452, + "learning_rate": 8.998484953333168e-06, + "loss": 0.8033, + "step": 549 + }, + { + "epoch": 0.0903249645885082, + "grad_norm": 1.4627566923252446, + "learning_rate": 8.998472760537557e-06, + "loss": 0.7791, + "step": 550 + }, + { + "epoch": 0.09048919179685094, + "grad_norm": 1.4303444060969528, + "learning_rate": 8.998460518884106e-06, + "loss": 0.7941, + "step": 551 + }, + { + "epoch": 0.09065341900519369, + "grad_norm": 1.520429285468019, + "learning_rate": 8.998448228372947e-06, + "loss": 0.7875, + "step": 552 + }, + { + "epoch": 0.09081764621353643, + "grad_norm": 1.4467982423594512, + "learning_rate": 8.998435889004217e-06, + "loss": 0.7599, + "step": 553 + }, + { + "epoch": 0.09098187342187918, + "grad_norm": 1.4542459927929825, + "learning_rate": 8.998423500778045e-06, + "loss": 0.76, + "step": 554 + }, + { + "epoch": 0.09114610063022191, + "grad_norm": 1.4334949641597166, + "learning_rate": 8.998411063694569e-06, + "loss": 0.7816, + "step": 555 + }, + { + "epoch": 0.09131032783856466, + "grad_norm": 1.4376273667298758, + "learning_rate": 8.99839857775392e-06, + "loss": 0.78, + "step": 556 + }, + { + "epoch": 0.0914745550469074, + "grad_norm": 1.4244926116912235, + "learning_rate": 8.99838604295624e-06, + "loss": 0.7887, + "step": 557 + }, + { + "epoch": 0.09163878225525014, + "grad_norm": 1.4180874831462564, + "learning_rate": 8.99837345930166e-06, + "loss": 0.7668, + "step": 558 + }, + { + "epoch": 0.09180300946359288, + "grad_norm": 1.402879410935121, + "learning_rate": 8.998360826790319e-06, + "loss": 0.7842, + "step": 559 + }, + { + "epoch": 0.09196723667193563, + "grad_norm": 1.4040836140296928, + "learning_rate": 8.998348145422355e-06, + "loss": 0.7859, + "step": 560 + }, + { + "epoch": 0.09213146388027836, + "grad_norm": 1.4219797527782458, + "learning_rate": 8.998335415197903e-06, + "loss": 0.7706, + "step": 561 + }, + { + "epoch": 0.09229569108862111, + "grad_norm": 1.4503951930362755, + "learning_rate": 8.998322636117103e-06, + "loss": 0.7518, + "step": 562 + }, + { + "epoch": 0.09245991829696384, + "grad_norm": 1.4170028168044975, + "learning_rate": 8.998309808180093e-06, + "loss": 0.7806, + "step": 563 + }, + { + "epoch": 0.0926241455053066, + "grad_norm": 1.3835900975245248, + "learning_rate": 8.998296931387013e-06, + "loss": 0.7776, + "step": 564 + }, + { + "epoch": 0.09278837271364933, + "grad_norm": 1.3880650749493229, + "learning_rate": 8.998284005738002e-06, + "loss": 0.7716, + "step": 565 + }, + { + "epoch": 0.09295259992199208, + "grad_norm": 1.3904515187341628, + "learning_rate": 8.998271031233202e-06, + "loss": 0.7868, + "step": 566 + }, + { + "epoch": 0.09311682713033481, + "grad_norm": 1.3873774301491548, + "learning_rate": 8.998258007872753e-06, + "loss": 0.7768, + "step": 567 + }, + { + "epoch": 0.09328105433867756, + "grad_norm": 1.370753815387451, + "learning_rate": 8.998244935656797e-06, + "loss": 0.7658, + "step": 568 + }, + { + "epoch": 0.09344528154702031, + "grad_norm": 1.4026344155479453, + "learning_rate": 8.998231814585473e-06, + "loss": 0.7526, + "step": 569 + }, + { + "epoch": 0.09360950875536304, + "grad_norm": 1.344243720958496, + "learning_rate": 8.998218644658926e-06, + "loss": 0.7794, + "step": 570 + }, + { + "epoch": 0.09377373596370579, + "grad_norm": 1.3829097281740133, + "learning_rate": 8.9982054258773e-06, + "loss": 0.7875, + "step": 571 + }, + { + "epoch": 0.09393796317204853, + "grad_norm": 1.3604310447903412, + "learning_rate": 8.998192158240738e-06, + "loss": 0.7869, + "step": 572 + }, + { + "epoch": 0.09410219038039128, + "grad_norm": 1.3581920491158632, + "learning_rate": 8.998178841749382e-06, + "loss": 0.7821, + "step": 573 + }, + { + "epoch": 0.09426641758873401, + "grad_norm": 1.3407414437715726, + "learning_rate": 8.998165476403378e-06, + "loss": 0.78, + "step": 574 + }, + { + "epoch": 0.09443064479707676, + "grad_norm": 1.323211309690318, + "learning_rate": 8.998152062202874e-06, + "loss": 0.7974, + "step": 575 + }, + { + "epoch": 0.0945948720054195, + "grad_norm": 1.3516519692522833, + "learning_rate": 8.998138599148012e-06, + "loss": 0.7649, + "step": 576 + }, + { + "epoch": 0.09475909921376224, + "grad_norm": 1.3229131845618651, + "learning_rate": 8.998125087238938e-06, + "loss": 0.7962, + "step": 577 + }, + { + "epoch": 0.09492332642210498, + "grad_norm": 1.307882279240173, + "learning_rate": 8.9981115264758e-06, + "loss": 0.806, + "step": 578 + }, + { + "epoch": 0.09508755363044773, + "grad_norm": 1.3352279874733834, + "learning_rate": 8.998097916858747e-06, + "loss": 0.774, + "step": 579 + }, + { + "epoch": 0.09525178083879046, + "grad_norm": 1.311690507687563, + "learning_rate": 8.998084258387923e-06, + "loss": 0.7928, + "step": 580 + }, + { + "epoch": 0.09541600804713321, + "grad_norm": 1.316169384327891, + "learning_rate": 8.99807055106348e-06, + "loss": 0.8016, + "step": 581 + }, + { + "epoch": 0.09558023525547595, + "grad_norm": 1.3115286012581258, + "learning_rate": 8.998056794885563e-06, + "loss": 0.7687, + "step": 582 + }, + { + "epoch": 0.0957444624638187, + "grad_norm": 1.2973876639193667, + "learning_rate": 8.998042989854326e-06, + "loss": 0.7957, + "step": 583 + }, + { + "epoch": 0.09590868967216143, + "grad_norm": 1.3034852678205227, + "learning_rate": 8.998029135969917e-06, + "loss": 0.7694, + "step": 584 + }, + { + "epoch": 0.09607291688050418, + "grad_norm": 1.2921967384130368, + "learning_rate": 8.998015233232484e-06, + "loss": 0.7673, + "step": 585 + }, + { + "epoch": 0.09623714408884693, + "grad_norm": 1.2906042190034313, + "learning_rate": 8.998001281642182e-06, + "loss": 0.7768, + "step": 586 + }, + { + "epoch": 0.09640137129718966, + "grad_norm": 1.2940931141309069, + "learning_rate": 8.99798728119916e-06, + "loss": 0.7844, + "step": 587 + }, + { + "epoch": 0.09656559850553241, + "grad_norm": 1.284649039797445, + "learning_rate": 8.99797323190357e-06, + "loss": 0.7789, + "step": 588 + }, + { + "epoch": 0.09672982571387514, + "grad_norm": 1.283737139541954, + "learning_rate": 8.997959133755566e-06, + "loss": 0.7677, + "step": 589 + }, + { + "epoch": 0.0968940529222179, + "grad_norm": 1.2736857154419794, + "learning_rate": 8.997944986755302e-06, + "loss": 0.778, + "step": 590 + }, + { + "epoch": 0.09705828013056063, + "grad_norm": 1.2769757024108, + "learning_rate": 8.997930790902928e-06, + "loss": 0.7819, + "step": 591 + }, + { + "epoch": 0.09722250733890338, + "grad_norm": 1.262465276945493, + "learning_rate": 8.997916546198599e-06, + "loss": 0.7783, + "step": 592 + }, + { + "epoch": 0.09738673454724611, + "grad_norm": 1.2663433484817244, + "learning_rate": 8.997902252642474e-06, + "loss": 0.7661, + "step": 593 + }, + { + "epoch": 0.09755096175558886, + "grad_norm": 1.2748735844470442, + "learning_rate": 8.997887910234704e-06, + "loss": 0.7871, + "step": 594 + }, + { + "epoch": 0.0977151889639316, + "grad_norm": 1.2627194359703184, + "learning_rate": 8.997873518975445e-06, + "loss": 0.7697, + "step": 595 + }, + { + "epoch": 0.09787941617227434, + "grad_norm": 1.2458601207727036, + "learning_rate": 8.997859078864856e-06, + "loss": 0.7761, + "step": 596 + }, + { + "epoch": 0.09804364338061708, + "grad_norm": 1.2685292083003645, + "learning_rate": 8.99784458990309e-06, + "loss": 0.7595, + "step": 597 + }, + { + "epoch": 0.09820787058895983, + "grad_norm": 1.2423763258616558, + "learning_rate": 8.997830052090308e-06, + "loss": 0.7569, + "step": 598 + }, + { + "epoch": 0.09837209779730256, + "grad_norm": 1.2284843126416907, + "learning_rate": 8.997815465426666e-06, + "loss": 0.7691, + "step": 599 + }, + { + "epoch": 0.09853632500564531, + "grad_norm": 1.2380214027521046, + "learning_rate": 8.997800829912323e-06, + "loss": 0.7736, + "step": 600 + }, + { + "epoch": 0.09870055221398805, + "grad_norm": 1.2319079389161247, + "learning_rate": 8.997786145547437e-06, + "loss": 0.7714, + "step": 601 + }, + { + "epoch": 0.0988647794223308, + "grad_norm": 1.2327228774043395, + "learning_rate": 8.997771412332169e-06, + "loss": 0.7796, + "step": 602 + }, + { + "epoch": 0.09902900663067353, + "grad_norm": 1.2188386522371906, + "learning_rate": 8.997756630266679e-06, + "loss": 0.7831, + "step": 603 + }, + { + "epoch": 0.09919323383901628, + "grad_norm": 1.198537872248901, + "learning_rate": 8.997741799351126e-06, + "loss": 0.7919, + "step": 604 + }, + { + "epoch": 0.09935746104735903, + "grad_norm": 1.2216480771422678, + "learning_rate": 8.997726919585671e-06, + "loss": 0.7551, + "step": 605 + }, + { + "epoch": 0.09952168825570176, + "grad_norm": 1.225034221391949, + "learning_rate": 8.997711990970478e-06, + "loss": 0.7663, + "step": 606 + }, + { + "epoch": 0.09968591546404451, + "grad_norm": 1.2000663875670825, + "learning_rate": 8.997697013505707e-06, + "loss": 0.7677, + "step": 607 + }, + { + "epoch": 0.09985014267238725, + "grad_norm": 1.2053600381758947, + "learning_rate": 8.997681987191521e-06, + "loss": 0.7685, + "step": 608 + }, + { + "epoch": 0.10001436988073, + "grad_norm": 1.2058066147722575, + "learning_rate": 8.997666912028083e-06, + "loss": 0.7591, + "step": 609 + }, + { + "epoch": 0.10017859708907273, + "grad_norm": 1.201513478456253, + "learning_rate": 8.997651788015559e-06, + "loss": 0.7716, + "step": 610 + }, + { + "epoch": 0.10034282429741548, + "grad_norm": 1.2182501923516547, + "learning_rate": 8.997636615154109e-06, + "loss": 0.741, + "step": 611 + }, + { + "epoch": 0.10050705150575821, + "grad_norm": 1.1980299500822917, + "learning_rate": 8.997621393443901e-06, + "loss": 0.759, + "step": 612 + }, + { + "epoch": 0.10067127871410096, + "grad_norm": 1.2021512642384815, + "learning_rate": 8.997606122885102e-06, + "loss": 0.7686, + "step": 613 + }, + { + "epoch": 0.1008355059224437, + "grad_norm": 1.1891403994893042, + "learning_rate": 8.997590803477872e-06, + "loss": 0.7735, + "step": 614 + }, + { + "epoch": 0.10099973313078645, + "grad_norm": 1.1847699209247626, + "learning_rate": 8.997575435222384e-06, + "loss": 0.7709, + "step": 615 + }, + { + "epoch": 0.10116396033912918, + "grad_norm": 1.1887640824690933, + "learning_rate": 8.9975600181188e-06, + "loss": 0.7463, + "step": 616 + }, + { + "epoch": 0.10132818754747193, + "grad_norm": 1.1738353175527652, + "learning_rate": 8.997544552167289e-06, + "loss": 0.7775, + "step": 617 + }, + { + "epoch": 0.10149241475581466, + "grad_norm": 1.181133780761317, + "learning_rate": 8.99752903736802e-06, + "loss": 0.7607, + "step": 618 + }, + { + "epoch": 0.10165664196415741, + "grad_norm": 1.1674676030003306, + "learning_rate": 8.99751347372116e-06, + "loss": 0.7616, + "step": 619 + }, + { + "epoch": 0.10182086917250015, + "grad_norm": 1.168583211529951, + "learning_rate": 8.99749786122688e-06, + "loss": 0.7683, + "step": 620 + }, + { + "epoch": 0.1019850963808429, + "grad_norm": 1.1622822735095926, + "learning_rate": 8.997482199885346e-06, + "loss": 0.768, + "step": 621 + }, + { + "epoch": 0.10214932358918564, + "grad_norm": 1.1471729076856725, + "learning_rate": 8.997466489696732e-06, + "loss": 0.7604, + "step": 622 + }, + { + "epoch": 0.10231355079752838, + "grad_norm": 1.145057169808049, + "learning_rate": 8.997450730661206e-06, + "loss": 0.7578, + "step": 623 + }, + { + "epoch": 0.10247777800587113, + "grad_norm": 1.1557515410421166, + "learning_rate": 8.997434922778942e-06, + "loss": 0.7672, + "step": 624 + }, + { + "epoch": 0.10264200521421386, + "grad_norm": 1.1355739695048033, + "learning_rate": 8.99741906605011e-06, + "loss": 0.7653, + "step": 625 + }, + { + "epoch": 0.10280623242255661, + "grad_norm": 1.1538565942915397, + "learning_rate": 8.99740316047488e-06, + "loss": 0.7394, + "step": 626 + }, + { + "epoch": 0.10297045963089935, + "grad_norm": 1.1446329691976465, + "learning_rate": 8.997387206053427e-06, + "loss": 0.7499, + "step": 627 + }, + { + "epoch": 0.1031346868392421, + "grad_norm": 1.1537159517502786, + "learning_rate": 8.997371202785925e-06, + "loss": 0.7422, + "step": 628 + }, + { + "epoch": 0.10329891404758483, + "grad_norm": 1.1205865491670322, + "learning_rate": 8.997355150672548e-06, + "loss": 0.7799, + "step": 629 + }, + { + "epoch": 0.10346314125592758, + "grad_norm": 1.138552041683983, + "learning_rate": 8.997339049713468e-06, + "loss": 0.7533, + "step": 630 + }, + { + "epoch": 0.10362736846427031, + "grad_norm": 1.1208591898037377, + "learning_rate": 8.99732289990886e-06, + "loss": 0.7685, + "step": 631 + }, + { + "epoch": 0.10379159567261306, + "grad_norm": 1.122828390596003, + "learning_rate": 8.997306701258903e-06, + "loss": 0.7533, + "step": 632 + }, + { + "epoch": 0.1039558228809558, + "grad_norm": 1.12283473302057, + "learning_rate": 8.997290453763768e-06, + "loss": 0.748, + "step": 633 + }, + { + "epoch": 0.10412005008929855, + "grad_norm": 1.1034866068092488, + "learning_rate": 8.997274157423636e-06, + "loss": 0.7512, + "step": 634 + }, + { + "epoch": 0.10428427729764128, + "grad_norm": 1.1165031140298034, + "learning_rate": 8.99725781223868e-06, + "loss": 0.7536, + "step": 635 + }, + { + "epoch": 0.10444850450598403, + "grad_norm": 1.12908873229894, + "learning_rate": 8.99724141820908e-06, + "loss": 0.7437, + "step": 636 + }, + { + "epoch": 0.10461273171432676, + "grad_norm": 1.122551438784602, + "learning_rate": 8.997224975335015e-06, + "loss": 0.7525, + "step": 637 + }, + { + "epoch": 0.10477695892266951, + "grad_norm": 1.093230994266884, + "learning_rate": 8.99720848361666e-06, + "loss": 0.7871, + "step": 638 + }, + { + "epoch": 0.10494118613101226, + "grad_norm": 1.1022741460479262, + "learning_rate": 8.997191943054198e-06, + "loss": 0.768, + "step": 639 + }, + { + "epoch": 0.105105413339355, + "grad_norm": 1.0910181528443548, + "learning_rate": 8.997175353647806e-06, + "loss": 0.7549, + "step": 640 + }, + { + "epoch": 0.10526964054769775, + "grad_norm": 1.1062114372442224, + "learning_rate": 8.997158715397665e-06, + "loss": 0.7539, + "step": 641 + }, + { + "epoch": 0.10543386775604048, + "grad_norm": 1.103586593071817, + "learning_rate": 8.997142028303955e-06, + "loss": 0.7544, + "step": 642 + }, + { + "epoch": 0.10559809496438323, + "grad_norm": 1.1071233361706303, + "learning_rate": 8.99712529236686e-06, + "loss": 0.764, + "step": 643 + }, + { + "epoch": 0.10576232217272596, + "grad_norm": 1.094482530155215, + "learning_rate": 8.99710850758656e-06, + "loss": 0.7468, + "step": 644 + }, + { + "epoch": 0.10592654938106871, + "grad_norm": 1.078601443848298, + "learning_rate": 8.997091673963234e-06, + "loss": 0.7576, + "step": 645 + }, + { + "epoch": 0.10609077658941145, + "grad_norm": 1.124326466230655, + "learning_rate": 8.99707479149707e-06, + "loss": 0.7488, + "step": 646 + }, + { + "epoch": 0.1062550037977542, + "grad_norm": 1.1065751826100323, + "learning_rate": 8.99705786018825e-06, + "loss": 0.7552, + "step": 647 + }, + { + "epoch": 0.10641923100609693, + "grad_norm": 1.067142366456579, + "learning_rate": 8.997040880036956e-06, + "loss": 0.771, + "step": 648 + }, + { + "epoch": 0.10658345821443968, + "grad_norm": 1.0713690882822862, + "learning_rate": 8.997023851043372e-06, + "loss": 0.7527, + "step": 649 + }, + { + "epoch": 0.10674768542278242, + "grad_norm": 1.0664071164503175, + "learning_rate": 8.997006773207687e-06, + "loss": 0.7405, + "step": 650 + }, + { + "epoch": 0.10691191263112516, + "grad_norm": 1.0590095194012845, + "learning_rate": 8.996989646530083e-06, + "loss": 0.7711, + "step": 651 + }, + { + "epoch": 0.1070761398394679, + "grad_norm": 1.0157260991175592, + "learning_rate": 8.996972471010747e-06, + "loss": 0.8044, + "step": 652 + }, + { + "epoch": 0.10724036704781065, + "grad_norm": 1.0702230692833752, + "learning_rate": 8.996955246649866e-06, + "loss": 0.7708, + "step": 653 + }, + { + "epoch": 0.10740459425615338, + "grad_norm": 1.0415420000470876, + "learning_rate": 8.996937973447627e-06, + "loss": 0.7648, + "step": 654 + }, + { + "epoch": 0.10756882146449613, + "grad_norm": 1.0502831812393232, + "learning_rate": 8.996920651404215e-06, + "loss": 0.7514, + "step": 655 + }, + { + "epoch": 0.10773304867283887, + "grad_norm": 1.0643751230664344, + "learning_rate": 8.996903280519821e-06, + "loss": 0.7432, + "step": 656 + }, + { + "epoch": 0.10789727588118161, + "grad_norm": 1.0611243032791475, + "learning_rate": 8.996885860794635e-06, + "loss": 0.75, + "step": 657 + }, + { + "epoch": 0.10806150308952436, + "grad_norm": 1.0614565864005756, + "learning_rate": 8.996868392228843e-06, + "loss": 0.7704, + "step": 658 + }, + { + "epoch": 0.1082257302978671, + "grad_norm": 1.0522356645726656, + "learning_rate": 8.996850874822636e-06, + "loss": 0.7514, + "step": 659 + }, + { + "epoch": 0.10838995750620985, + "grad_norm": 1.0396641601794014, + "learning_rate": 8.996833308576205e-06, + "loss": 0.7443, + "step": 660 + }, + { + "epoch": 0.10855418471455258, + "grad_norm": 1.0352100207650416, + "learning_rate": 8.996815693489738e-06, + "loss": 0.7444, + "step": 661 + }, + { + "epoch": 0.10871841192289533, + "grad_norm": 1.0440907953492387, + "learning_rate": 8.99679802956343e-06, + "loss": 0.7588, + "step": 662 + }, + { + "epoch": 0.10888263913123807, + "grad_norm": 1.0237415973036992, + "learning_rate": 8.99678031679747e-06, + "loss": 0.743, + "step": 663 + }, + { + "epoch": 0.10904686633958081, + "grad_norm": 1.038219778138841, + "learning_rate": 8.996762555192053e-06, + "loss": 0.7696, + "step": 664 + }, + { + "epoch": 0.10921109354792355, + "grad_norm": 1.0371317655068988, + "learning_rate": 8.99674474474737e-06, + "loss": 0.741, + "step": 665 + }, + { + "epoch": 0.1093753207562663, + "grad_norm": 1.015958023377164, + "learning_rate": 8.996726885463612e-06, + "loss": 0.7632, + "step": 666 + }, + { + "epoch": 0.10953954796460903, + "grad_norm": 1.025225602279085, + "learning_rate": 8.996708977340979e-06, + "loss": 0.7559, + "step": 667 + }, + { + "epoch": 0.10970377517295178, + "grad_norm": 1.0102874868738583, + "learning_rate": 8.996691020379662e-06, + "loss": 0.7528, + "step": 668 + }, + { + "epoch": 0.10986800238129452, + "grad_norm": 1.0171496292001285, + "learning_rate": 8.996673014579855e-06, + "loss": 0.7476, + "step": 669 + }, + { + "epoch": 0.11003222958963726, + "grad_norm": 1.0133586146958062, + "learning_rate": 8.996654959941757e-06, + "loss": 0.7406, + "step": 670 + }, + { + "epoch": 0.11019645679798, + "grad_norm": 1.012184525401541, + "learning_rate": 8.99663685646556e-06, + "loss": 0.7323, + "step": 671 + }, + { + "epoch": 0.11036068400632275, + "grad_norm": 1.0204812192417894, + "learning_rate": 8.996618704151464e-06, + "loss": 0.7231, + "step": 672 + }, + { + "epoch": 0.11052491121466548, + "grad_norm": 1.0394114945935236, + "learning_rate": 8.996600502999663e-06, + "loss": 0.7318, + "step": 673 + }, + { + "epoch": 0.11068913842300823, + "grad_norm": 1.0071324286292476, + "learning_rate": 8.99658225301036e-06, + "loss": 0.7371, + "step": 674 + }, + { + "epoch": 0.11085336563135098, + "grad_norm": 1.0071241514792235, + "learning_rate": 8.996563954183747e-06, + "loss": 0.7263, + "step": 675 + }, + { + "epoch": 0.11101759283969372, + "grad_norm": 0.9849565269875252, + "learning_rate": 8.996545606520026e-06, + "loss": 0.7488, + "step": 676 + }, + { + "epoch": 0.11118182004803646, + "grad_norm": 0.9857903891350793, + "learning_rate": 8.996527210019395e-06, + "loss": 0.7781, + "step": 677 + }, + { + "epoch": 0.1113460472563792, + "grad_norm": 1.0008451419132542, + "learning_rate": 8.996508764682056e-06, + "loss": 0.7335, + "step": 678 + }, + { + "epoch": 0.11151027446472195, + "grad_norm": 0.9904057469198645, + "learning_rate": 8.996490270508207e-06, + "loss": 0.7509, + "step": 679 + }, + { + "epoch": 0.11167450167306468, + "grad_norm": 1.0154428424155522, + "learning_rate": 8.99647172749805e-06, + "loss": 0.7438, + "step": 680 + }, + { + "epoch": 0.11183872888140743, + "grad_norm": 0.9859511271051146, + "learning_rate": 8.996453135651785e-06, + "loss": 0.7311, + "step": 681 + }, + { + "epoch": 0.11200295608975017, + "grad_norm": 1.0172995678678334, + "learning_rate": 8.996434494969617e-06, + "loss": 0.7522, + "step": 682 + }, + { + "epoch": 0.11216718329809292, + "grad_norm": 0.9934227384074349, + "learning_rate": 8.996415805451744e-06, + "loss": 0.748, + "step": 683 + }, + { + "epoch": 0.11233141050643565, + "grad_norm": 0.9809402738647511, + "learning_rate": 8.996397067098373e-06, + "loss": 0.7384, + "step": 684 + }, + { + "epoch": 0.1124956377147784, + "grad_norm": 0.9627480326356829, + "learning_rate": 8.996378279909707e-06, + "loss": 0.7475, + "step": 685 + }, + { + "epoch": 0.11265986492312113, + "grad_norm": 0.9823816568610797, + "learning_rate": 8.996359443885948e-06, + "loss": 0.7546, + "step": 686 + }, + { + "epoch": 0.11282409213146388, + "grad_norm": 0.982802515323171, + "learning_rate": 8.9963405590273e-06, + "loss": 0.7489, + "step": 687 + }, + { + "epoch": 0.11298831933980662, + "grad_norm": 0.9965019206725246, + "learning_rate": 8.996321625333972e-06, + "loss": 0.7285, + "step": 688 + }, + { + "epoch": 0.11315254654814937, + "grad_norm": 1.0211395532769991, + "learning_rate": 8.996302642806166e-06, + "loss": 0.7529, + "step": 689 + }, + { + "epoch": 0.1133167737564921, + "grad_norm": 0.9443212451181419, + "learning_rate": 8.996283611444091e-06, + "loss": 0.7526, + "step": 690 + }, + { + "epoch": 0.11348100096483485, + "grad_norm": 0.9626893407176333, + "learning_rate": 8.99626453124795e-06, + "loss": 0.7228, + "step": 691 + }, + { + "epoch": 0.11364522817317758, + "grad_norm": 0.9730358742605486, + "learning_rate": 8.996245402217954e-06, + "loss": 0.744, + "step": 692 + }, + { + "epoch": 0.11380945538152033, + "grad_norm": 0.9563252686051095, + "learning_rate": 8.99622622435431e-06, + "loss": 0.7431, + "step": 693 + }, + { + "epoch": 0.11397368258986308, + "grad_norm": 0.9554055649452712, + "learning_rate": 8.996206997657225e-06, + "loss": 0.7432, + "step": 694 + }, + { + "epoch": 0.11413790979820582, + "grad_norm": 0.9467956137652699, + "learning_rate": 8.996187722126907e-06, + "loss": 0.743, + "step": 695 + }, + { + "epoch": 0.11430213700654857, + "grad_norm": 0.9489039079960571, + "learning_rate": 8.996168397763568e-06, + "loss": 0.7494, + "step": 696 + }, + { + "epoch": 0.1144663642148913, + "grad_norm": 0.9387244200911926, + "learning_rate": 8.996149024567416e-06, + "loss": 0.7428, + "step": 697 + }, + { + "epoch": 0.11463059142323405, + "grad_norm": 0.9501341353371187, + "learning_rate": 8.996129602538664e-06, + "loss": 0.7108, + "step": 698 + }, + { + "epoch": 0.11479481863157678, + "grad_norm": 0.962206616789228, + "learning_rate": 8.996110131677517e-06, + "loss": 0.7195, + "step": 699 + }, + { + "epoch": 0.11495904583991953, + "grad_norm": 0.9243568911177664, + "learning_rate": 8.996090611984192e-06, + "loss": 0.7557, + "step": 700 + }, + { + "epoch": 0.11512327304826227, + "grad_norm": 0.953170841709303, + "learning_rate": 8.996071043458902e-06, + "loss": 0.719, + "step": 701 + }, + { + "epoch": 0.11528750025660502, + "grad_norm": 0.9386842014299721, + "learning_rate": 8.996051426101855e-06, + "loss": 0.7436, + "step": 702 + }, + { + "epoch": 0.11545172746494775, + "grad_norm": 0.9274541909232491, + "learning_rate": 8.996031759913265e-06, + "loss": 0.7385, + "step": 703 + }, + { + "epoch": 0.1156159546732905, + "grad_norm": 0.9207183074144455, + "learning_rate": 8.996012044893347e-06, + "loss": 0.7592, + "step": 704 + }, + { + "epoch": 0.11578018188163323, + "grad_norm": 0.9062916445772407, + "learning_rate": 8.995992281042315e-06, + "loss": 0.7615, + "step": 705 + }, + { + "epoch": 0.11594440908997598, + "grad_norm": 0.9736278432415078, + "learning_rate": 8.995972468360383e-06, + "loss": 0.74, + "step": 706 + }, + { + "epoch": 0.11610863629831872, + "grad_norm": 0.9065148273263071, + "learning_rate": 8.995952606847767e-06, + "loss": 0.7568, + "step": 707 + }, + { + "epoch": 0.11627286350666147, + "grad_norm": 0.9193357664661844, + "learning_rate": 8.995932696504683e-06, + "loss": 0.7486, + "step": 708 + }, + { + "epoch": 0.1164370907150042, + "grad_norm": 0.9195501174668326, + "learning_rate": 8.995912737331345e-06, + "loss": 0.7532, + "step": 709 + }, + { + "epoch": 0.11660131792334695, + "grad_norm": 0.9507039973612097, + "learning_rate": 8.995892729327973e-06, + "loss": 0.7613, + "step": 710 + }, + { + "epoch": 0.1167655451316897, + "grad_norm": 0.9261363835047515, + "learning_rate": 8.995872672494781e-06, + "loss": 0.74, + "step": 711 + }, + { + "epoch": 0.11692977234003243, + "grad_norm": 0.9060734346127396, + "learning_rate": 8.99585256683199e-06, + "loss": 0.7368, + "step": 712 + }, + { + "epoch": 0.11709399954837518, + "grad_norm": 0.8977667506848039, + "learning_rate": 8.995832412339815e-06, + "loss": 0.7516, + "step": 713 + }, + { + "epoch": 0.11725822675671792, + "grad_norm": 0.8955651223846078, + "learning_rate": 8.995812209018479e-06, + "loss": 0.7456, + "step": 714 + }, + { + "epoch": 0.11742245396506067, + "grad_norm": 0.9030569385935932, + "learning_rate": 8.995791956868197e-06, + "loss": 0.7176, + "step": 715 + }, + { + "epoch": 0.1175866811734034, + "grad_norm": 0.8942305604018785, + "learning_rate": 8.995771655889192e-06, + "loss": 0.7384, + "step": 716 + }, + { + "epoch": 0.11775090838174615, + "grad_norm": 0.9027311362567876, + "learning_rate": 8.995751306081683e-06, + "loss": 0.7405, + "step": 717 + }, + { + "epoch": 0.11791513559008888, + "grad_norm": 0.885311457940408, + "learning_rate": 8.995730907445893e-06, + "loss": 0.7568, + "step": 718 + }, + { + "epoch": 0.11807936279843163, + "grad_norm": 0.9012435000861214, + "learning_rate": 8.99571045998204e-06, + "loss": 0.7371, + "step": 719 + }, + { + "epoch": 0.11824359000677437, + "grad_norm": 0.907146848660689, + "learning_rate": 8.99568996369035e-06, + "loss": 0.7292, + "step": 720 + }, + { + "epoch": 0.11840781721511712, + "grad_norm": 1.0063300221437657, + "learning_rate": 8.995669418571042e-06, + "loss": 0.7459, + "step": 721 + }, + { + "epoch": 0.11857204442345985, + "grad_norm": 0.9072193733429393, + "learning_rate": 8.995648824624344e-06, + "loss": 0.7195, + "step": 722 + }, + { + "epoch": 0.1187362716318026, + "grad_norm": 0.9224186169335864, + "learning_rate": 8.995628181850475e-06, + "loss": 0.7263, + "step": 723 + }, + { + "epoch": 0.11890049884014534, + "grad_norm": 0.8730762718625776, + "learning_rate": 8.99560749024966e-06, + "loss": 0.7508, + "step": 724 + }, + { + "epoch": 0.11906472604848808, + "grad_norm": 0.8982887153940247, + "learning_rate": 8.995586749822124e-06, + "loss": 0.7269, + "step": 725 + }, + { + "epoch": 0.11922895325683082, + "grad_norm": 0.9068289642355488, + "learning_rate": 8.995565960568095e-06, + "loss": 0.7465, + "step": 726 + }, + { + "epoch": 0.11939318046517357, + "grad_norm": 0.8875535394981413, + "learning_rate": 8.995545122487797e-06, + "loss": 0.72, + "step": 727 + }, + { + "epoch": 0.11955740767351632, + "grad_norm": 0.8679336550118347, + "learning_rate": 8.995524235581455e-06, + "loss": 0.7391, + "step": 728 + }, + { + "epoch": 0.11972163488185905, + "grad_norm": 0.8826045646205907, + "learning_rate": 8.995503299849294e-06, + "loss": 0.7378, + "step": 729 + }, + { + "epoch": 0.1198858620902018, + "grad_norm": 0.8701194777603063, + "learning_rate": 8.995482315291548e-06, + "loss": 0.7364, + "step": 730 + }, + { + "epoch": 0.12005008929854454, + "grad_norm": 0.8893542492932401, + "learning_rate": 8.99546128190844e-06, + "loss": 0.7361, + "step": 731 + }, + { + "epoch": 0.12021431650688728, + "grad_norm": 0.8765486658546501, + "learning_rate": 8.995440199700199e-06, + "loss": 0.7239, + "step": 732 + }, + { + "epoch": 0.12037854371523002, + "grad_norm": 0.8761211470503778, + "learning_rate": 8.995419068667056e-06, + "loss": 0.7118, + "step": 733 + }, + { + "epoch": 0.12054277092357277, + "grad_norm": 0.8784561432542338, + "learning_rate": 8.995397888809236e-06, + "loss": 0.7252, + "step": 734 + }, + { + "epoch": 0.1207069981319155, + "grad_norm": 0.885958895044403, + "learning_rate": 8.995376660126975e-06, + "loss": 0.7289, + "step": 735 + }, + { + "epoch": 0.12087122534025825, + "grad_norm": 1.3628303359719938, + "learning_rate": 8.995355382620499e-06, + "loss": 0.7363, + "step": 736 + }, + { + "epoch": 0.12103545254860099, + "grad_norm": 0.8620243858978899, + "learning_rate": 8.995334056290042e-06, + "loss": 0.7345, + "step": 737 + }, + { + "epoch": 0.12119967975694373, + "grad_norm": 0.8616681510393484, + "learning_rate": 8.995312681135833e-06, + "loss": 0.7369, + "step": 738 + }, + { + "epoch": 0.12136390696528647, + "grad_norm": 0.8536335524173874, + "learning_rate": 8.995291257158106e-06, + "loss": 0.7301, + "step": 739 + }, + { + "epoch": 0.12152813417362922, + "grad_norm": 0.8890164010727863, + "learning_rate": 8.995269784357094e-06, + "loss": 0.7259, + "step": 740 + }, + { + "epoch": 0.12169236138197195, + "grad_norm": 0.8584462694033533, + "learning_rate": 8.995248262733028e-06, + "loss": 0.7211, + "step": 741 + }, + { + "epoch": 0.1218565885903147, + "grad_norm": 0.8473041107598641, + "learning_rate": 8.995226692286144e-06, + "loss": 0.7446, + "step": 742 + }, + { + "epoch": 0.12202081579865744, + "grad_norm": 0.857983951564626, + "learning_rate": 8.995205073016676e-06, + "loss": 0.7486, + "step": 743 + }, + { + "epoch": 0.12218504300700019, + "grad_norm": 0.8499021660743089, + "learning_rate": 8.995183404924856e-06, + "loss": 0.7252, + "step": 744 + }, + { + "epoch": 0.12234927021534292, + "grad_norm": 0.8432649022601657, + "learning_rate": 8.995161688010924e-06, + "loss": 0.7361, + "step": 745 + }, + { + "epoch": 0.12251349742368567, + "grad_norm": 0.8349936223743508, + "learning_rate": 8.995139922275112e-06, + "loss": 0.728, + "step": 746 + }, + { + "epoch": 0.12267772463202842, + "grad_norm": 0.8359575337444805, + "learning_rate": 8.995118107717658e-06, + "loss": 0.7339, + "step": 747 + }, + { + "epoch": 0.12284195184037115, + "grad_norm": 0.8491789703211692, + "learning_rate": 8.995096244338799e-06, + "loss": 0.7273, + "step": 748 + }, + { + "epoch": 0.1230061790487139, + "grad_norm": 0.8343756782871875, + "learning_rate": 8.995074332138771e-06, + "loss": 0.7376, + "step": 749 + }, + { + "epoch": 0.12317040625705664, + "grad_norm": 0.8177523292597683, + "learning_rate": 8.995052371117816e-06, + "loss": 0.7498, + "step": 750 + }, + { + "epoch": 0.12333463346539938, + "grad_norm": 0.8275634000310962, + "learning_rate": 8.995030361276167e-06, + "loss": 0.7181, + "step": 751 + }, + { + "epoch": 0.12349886067374212, + "grad_norm": 0.8339533594185612, + "learning_rate": 8.995008302614069e-06, + "loss": 0.7265, + "step": 752 + }, + { + "epoch": 0.12366308788208487, + "grad_norm": 0.8279085022505449, + "learning_rate": 8.994986195131755e-06, + "loss": 0.7269, + "step": 753 + }, + { + "epoch": 0.1238273150904276, + "grad_norm": 0.8346885779308945, + "learning_rate": 8.99496403882947e-06, + "loss": 0.7239, + "step": 754 + }, + { + "epoch": 0.12399154229877035, + "grad_norm": 0.8535778930786089, + "learning_rate": 8.994941833707452e-06, + "loss": 0.7335, + "step": 755 + }, + { + "epoch": 0.12415576950711309, + "grad_norm": 0.8433517095759425, + "learning_rate": 8.994919579765944e-06, + "loss": 0.7262, + "step": 756 + }, + { + "epoch": 0.12431999671545584, + "grad_norm": 0.8115026773168339, + "learning_rate": 8.994897277005187e-06, + "loss": 0.7543, + "step": 757 + }, + { + "epoch": 0.12448422392379857, + "grad_norm": 0.8237859019603188, + "learning_rate": 8.994874925425423e-06, + "loss": 0.696, + "step": 758 + }, + { + "epoch": 0.12464845113214132, + "grad_norm": 0.9508056714088163, + "learning_rate": 8.994852525026896e-06, + "loss": 0.7313, + "step": 759 + }, + { + "epoch": 0.12481267834048405, + "grad_norm": 0.8253401090339175, + "learning_rate": 8.994830075809846e-06, + "loss": 0.7333, + "step": 760 + }, + { + "epoch": 0.1249769055488268, + "grad_norm": 0.8141935870360409, + "learning_rate": 8.99480757777452e-06, + "loss": 0.7333, + "step": 761 + }, + { + "epoch": 0.12514113275716954, + "grad_norm": 0.8136553415480036, + "learning_rate": 8.994785030921164e-06, + "loss": 0.7293, + "step": 762 + }, + { + "epoch": 0.1253053599655123, + "grad_norm": 0.8063988080685636, + "learning_rate": 8.994762435250018e-06, + "loss": 0.7398, + "step": 763 + }, + { + "epoch": 0.12546958717385504, + "grad_norm": 0.8024743885585705, + "learning_rate": 8.994739790761329e-06, + "loss": 0.7318, + "step": 764 + }, + { + "epoch": 0.12563381438219778, + "grad_norm": 0.8210883509073227, + "learning_rate": 8.994717097455346e-06, + "loss": 0.7128, + "step": 765 + }, + { + "epoch": 0.1257980415905405, + "grad_norm": 0.8081208044654137, + "learning_rate": 8.994694355332311e-06, + "loss": 0.7216, + "step": 766 + }, + { + "epoch": 0.12596226879888325, + "grad_norm": 0.8043457567543434, + "learning_rate": 8.994671564392472e-06, + "loss": 0.729, + "step": 767 + }, + { + "epoch": 0.126126496007226, + "grad_norm": 0.8332594138537104, + "learning_rate": 8.994648724636081e-06, + "loss": 0.7152, + "step": 768 + }, + { + "epoch": 0.12629072321556875, + "grad_norm": 0.796400256746438, + "learning_rate": 8.994625836063381e-06, + "loss": 0.7337, + "step": 769 + }, + { + "epoch": 0.12645495042391147, + "grad_norm": 0.7970766048853704, + "learning_rate": 8.99460289867462e-06, + "loss": 0.7385, + "step": 770 + }, + { + "epoch": 0.12661917763225422, + "grad_norm": 0.8168189102976473, + "learning_rate": 8.994579912470054e-06, + "loss": 0.7207, + "step": 771 + }, + { + "epoch": 0.12678340484059697, + "grad_norm": 0.8210337972298519, + "learning_rate": 8.994556877449926e-06, + "loss": 0.7173, + "step": 772 + }, + { + "epoch": 0.12694763204893972, + "grad_norm": 0.7928567401168414, + "learning_rate": 8.994533793614486e-06, + "loss": 0.7299, + "step": 773 + }, + { + "epoch": 0.12711185925728244, + "grad_norm": 0.7867827344107035, + "learning_rate": 8.99451066096399e-06, + "loss": 0.7327, + "step": 774 + }, + { + "epoch": 0.1272760864656252, + "grad_norm": 0.8101031512180691, + "learning_rate": 8.994487479498686e-06, + "loss": 0.7201, + "step": 775 + }, + { + "epoch": 0.12744031367396794, + "grad_norm": 0.8149592408788043, + "learning_rate": 8.994464249218824e-06, + "loss": 0.7059, + "step": 776 + }, + { + "epoch": 0.12760454088231069, + "grad_norm": 0.7976670057578504, + "learning_rate": 8.99444097012466e-06, + "loss": 0.7234, + "step": 777 + }, + { + "epoch": 0.1277687680906534, + "grad_norm": 0.7848442475451943, + "learning_rate": 8.994417642216445e-06, + "loss": 0.7224, + "step": 778 + }, + { + "epoch": 0.12793299529899615, + "grad_norm": 0.7831415532113032, + "learning_rate": 8.994394265494434e-06, + "loss": 0.7116, + "step": 779 + }, + { + "epoch": 0.1280972225073389, + "grad_norm": 0.8042009917600411, + "learning_rate": 8.994370839958876e-06, + "loss": 0.7249, + "step": 780 + }, + { + "epoch": 0.12826144971568165, + "grad_norm": 0.7823737809741357, + "learning_rate": 8.994347365610032e-06, + "loss": 0.7148, + "step": 781 + }, + { + "epoch": 0.1284256769240244, + "grad_norm": 0.7915106748965738, + "learning_rate": 8.994323842448152e-06, + "loss": 0.7328, + "step": 782 + }, + { + "epoch": 0.12858990413236712, + "grad_norm": 0.7721916884200549, + "learning_rate": 8.994300270473493e-06, + "loss": 0.7112, + "step": 783 + }, + { + "epoch": 0.12875413134070987, + "grad_norm": 0.7720807666798172, + "learning_rate": 8.994276649686313e-06, + "loss": 0.7276, + "step": 784 + }, + { + "epoch": 0.12891835854905262, + "grad_norm": 0.7812178056014114, + "learning_rate": 8.994252980086865e-06, + "loss": 0.7125, + "step": 785 + }, + { + "epoch": 0.12908258575739537, + "grad_norm": 0.7552866535606362, + "learning_rate": 8.994229261675408e-06, + "loss": 0.7384, + "step": 786 + }, + { + "epoch": 0.1292468129657381, + "grad_norm": 0.7736822903510917, + "learning_rate": 8.9942054944522e-06, + "loss": 0.7231, + "step": 787 + }, + { + "epoch": 0.12941104017408084, + "grad_norm": 0.7863166996807561, + "learning_rate": 8.9941816784175e-06, + "loss": 0.7125, + "step": 788 + }, + { + "epoch": 0.1295752673824236, + "grad_norm": 0.7584645168372531, + "learning_rate": 8.994157813571563e-06, + "loss": 0.7218, + "step": 789 + }, + { + "epoch": 0.12973949459076634, + "grad_norm": 0.7591392100122629, + "learning_rate": 8.99413389991465e-06, + "loss": 0.7196, + "step": 790 + }, + { + "epoch": 0.12990372179910906, + "grad_norm": 0.7638964528596006, + "learning_rate": 8.994109937447023e-06, + "loss": 0.742, + "step": 791 + }, + { + "epoch": 0.1300679490074518, + "grad_norm": 0.7638417423029302, + "learning_rate": 8.994085926168942e-06, + "loss": 0.7144, + "step": 792 + }, + { + "epoch": 0.13023217621579455, + "grad_norm": 0.7531125912433023, + "learning_rate": 8.994061866080662e-06, + "loss": 0.7181, + "step": 793 + }, + { + "epoch": 0.1303964034241373, + "grad_norm": 0.7638871871583925, + "learning_rate": 8.994037757182452e-06, + "loss": 0.7263, + "step": 794 + }, + { + "epoch": 0.13056063063248002, + "grad_norm": 0.7712535656244461, + "learning_rate": 8.99401359947457e-06, + "loss": 0.7203, + "step": 795 + }, + { + "epoch": 0.13072485784082277, + "grad_norm": 0.7534551679287208, + "learning_rate": 8.993989392957277e-06, + "loss": 0.7233, + "step": 796 + }, + { + "epoch": 0.13088908504916552, + "grad_norm": 0.7541686256824629, + "learning_rate": 8.99396513763084e-06, + "loss": 0.7191, + "step": 797 + }, + { + "epoch": 0.13105331225750827, + "grad_norm": 0.7558776359791881, + "learning_rate": 8.993940833495517e-06, + "loss": 0.7189, + "step": 798 + }, + { + "epoch": 0.13121753946585102, + "grad_norm": 0.7618419103671177, + "learning_rate": 8.993916480551577e-06, + "loss": 0.7279, + "step": 799 + }, + { + "epoch": 0.13138176667419374, + "grad_norm": 0.768280234616204, + "learning_rate": 8.993892078799284e-06, + "loss": 0.7126, + "step": 800 + }, + { + "epoch": 0.1315459938825365, + "grad_norm": 0.7394145525562601, + "learning_rate": 8.993867628238901e-06, + "loss": 0.7446, + "step": 801 + }, + { + "epoch": 0.13171022109087924, + "grad_norm": 0.7615299280436207, + "learning_rate": 8.993843128870692e-06, + "loss": 0.6987, + "step": 802 + }, + { + "epoch": 0.13187444829922199, + "grad_norm": 0.7465231091292661, + "learning_rate": 8.993818580694929e-06, + "loss": 0.7378, + "step": 803 + }, + { + "epoch": 0.1320386755075647, + "grad_norm": 0.7491298101718952, + "learning_rate": 8.993793983711872e-06, + "loss": 0.6964, + "step": 804 + }, + { + "epoch": 0.13220290271590746, + "grad_norm": 0.7326662321957933, + "learning_rate": 8.99376933792179e-06, + "loss": 0.7346, + "step": 805 + }, + { + "epoch": 0.1323671299242502, + "grad_norm": 0.7405957735369632, + "learning_rate": 8.993744643324954e-06, + "loss": 0.7056, + "step": 806 + }, + { + "epoch": 0.13253135713259295, + "grad_norm": 0.7309108095519508, + "learning_rate": 8.993719899921629e-06, + "loss": 0.7229, + "step": 807 + }, + { + "epoch": 0.13269558434093567, + "grad_norm": 0.7965004013613755, + "learning_rate": 8.993695107712085e-06, + "loss": 0.7272, + "step": 808 + }, + { + "epoch": 0.13285981154927842, + "grad_norm": 0.7405587765052916, + "learning_rate": 8.993670266696592e-06, + "loss": 0.7033, + "step": 809 + }, + { + "epoch": 0.13302403875762117, + "grad_norm": 0.738947917878739, + "learning_rate": 8.993645376875417e-06, + "loss": 0.7037, + "step": 810 + }, + { + "epoch": 0.13318826596596392, + "grad_norm": 0.7266519422397545, + "learning_rate": 8.993620438248834e-06, + "loss": 0.7105, + "step": 811 + }, + { + "epoch": 0.13335249317430664, + "grad_norm": 0.7321881237545829, + "learning_rate": 8.99359545081711e-06, + "loss": 0.719, + "step": 812 + }, + { + "epoch": 0.1335167203826494, + "grad_norm": 0.7343760502140673, + "learning_rate": 8.99357041458052e-06, + "loss": 0.6994, + "step": 813 + }, + { + "epoch": 0.13368094759099214, + "grad_norm": 0.7360965253226606, + "learning_rate": 8.993545329539331e-06, + "loss": 0.7087, + "step": 814 + }, + { + "epoch": 0.1338451747993349, + "grad_norm": 0.7312711387682936, + "learning_rate": 8.993520195693823e-06, + "loss": 0.703, + "step": 815 + }, + { + "epoch": 0.1340094020076776, + "grad_norm": 0.7354108968862768, + "learning_rate": 8.99349501304426e-06, + "loss": 0.7163, + "step": 816 + }, + { + "epoch": 0.13417362921602036, + "grad_norm": 0.7263178056025598, + "learning_rate": 8.993469781590924e-06, + "loss": 0.7257, + "step": 817 + }, + { + "epoch": 0.1343378564243631, + "grad_norm": 0.7416571311831478, + "learning_rate": 8.993444501334085e-06, + "loss": 0.7083, + "step": 818 + }, + { + "epoch": 0.13450208363270585, + "grad_norm": 0.7849234501566686, + "learning_rate": 8.993419172274016e-06, + "loss": 0.7128, + "step": 819 + }, + { + "epoch": 0.1346663108410486, + "grad_norm": 0.7178383001551933, + "learning_rate": 8.993393794410995e-06, + "loss": 0.7131, + "step": 820 + }, + { + "epoch": 0.13483053804939132, + "grad_norm": 0.7241642838444384, + "learning_rate": 8.993368367745295e-06, + "loss": 0.7138, + "step": 821 + }, + { + "epoch": 0.13499476525773407, + "grad_norm": 0.7168737582699622, + "learning_rate": 8.993342892277194e-06, + "loss": 0.7322, + "step": 822 + }, + { + "epoch": 0.13515899246607682, + "grad_norm": 0.7120308237386352, + "learning_rate": 8.99331736800697e-06, + "loss": 0.6974, + "step": 823 + }, + { + "epoch": 0.13532321967441957, + "grad_norm": 0.7321566228567261, + "learning_rate": 8.993291794934896e-06, + "loss": 0.6999, + "step": 824 + }, + { + "epoch": 0.1354874468827623, + "grad_norm": 0.7438779269308436, + "learning_rate": 8.993266173061255e-06, + "loss": 0.6944, + "step": 825 + }, + { + "epoch": 0.13565167409110504, + "grad_norm": 0.7118594659041331, + "learning_rate": 8.993240502386322e-06, + "loss": 0.7189, + "step": 826 + }, + { + "epoch": 0.1358159012994478, + "grad_norm": 0.7361564635340935, + "learning_rate": 8.993214782910375e-06, + "loss": 0.6926, + "step": 827 + }, + { + "epoch": 0.13598012850779054, + "grad_norm": 0.711244373700878, + "learning_rate": 8.993189014633696e-06, + "loss": 0.7185, + "step": 828 + }, + { + "epoch": 0.13614435571613326, + "grad_norm": 0.7385931746846988, + "learning_rate": 8.993163197556563e-06, + "loss": 0.6904, + "step": 829 + }, + { + "epoch": 0.136308582924476, + "grad_norm": 0.7135997031691055, + "learning_rate": 8.993137331679257e-06, + "loss": 0.7108, + "step": 830 + }, + { + "epoch": 0.13647281013281876, + "grad_norm": 0.7246704678281216, + "learning_rate": 8.993111417002059e-06, + "loss": 0.7113, + "step": 831 + }, + { + "epoch": 0.1366370373411615, + "grad_norm": 0.7091228636214885, + "learning_rate": 8.993085453525251e-06, + "loss": 0.7166, + "step": 832 + }, + { + "epoch": 0.13680126454950423, + "grad_norm": 0.698307172462183, + "learning_rate": 8.993059441249113e-06, + "loss": 0.7131, + "step": 833 + }, + { + "epoch": 0.13696549175784697, + "grad_norm": 0.7302212267089161, + "learning_rate": 8.99303338017393e-06, + "loss": 0.7153, + "step": 834 + }, + { + "epoch": 0.13712971896618972, + "grad_norm": 0.7133967428699537, + "learning_rate": 8.993007270299985e-06, + "loss": 0.7199, + "step": 835 + }, + { + "epoch": 0.13729394617453247, + "grad_norm": 0.6999377770634104, + "learning_rate": 8.992981111627558e-06, + "loss": 0.7037, + "step": 836 + }, + { + "epoch": 0.13745817338287522, + "grad_norm": 0.7036896301199365, + "learning_rate": 8.992954904156937e-06, + "loss": 0.702, + "step": 837 + }, + { + "epoch": 0.13762240059121794, + "grad_norm": 0.6896374085707714, + "learning_rate": 8.992928647888405e-06, + "loss": 0.7106, + "step": 838 + }, + { + "epoch": 0.1377866277995607, + "grad_norm": 0.7269208950900521, + "learning_rate": 8.992902342822247e-06, + "loss": 0.7068, + "step": 839 + }, + { + "epoch": 0.13795085500790344, + "grad_norm": 0.7046937086091937, + "learning_rate": 8.99287598895875e-06, + "loss": 0.703, + "step": 840 + }, + { + "epoch": 0.1381150822162462, + "grad_norm": 0.7121950225185115, + "learning_rate": 8.9928495862982e-06, + "loss": 0.7152, + "step": 841 + }, + { + "epoch": 0.1382793094245889, + "grad_norm": 0.7026225935912559, + "learning_rate": 8.992823134840882e-06, + "loss": 0.6939, + "step": 842 + }, + { + "epoch": 0.13844353663293166, + "grad_norm": 0.6970719648209621, + "learning_rate": 8.992796634587084e-06, + "loss": 0.7043, + "step": 843 + }, + { + "epoch": 0.1386077638412744, + "grad_norm": 0.6804608164044504, + "learning_rate": 8.992770085537095e-06, + "loss": 0.7077, + "step": 844 + }, + { + "epoch": 0.13877199104961715, + "grad_norm": 0.6956774935332672, + "learning_rate": 8.992743487691202e-06, + "loss": 0.706, + "step": 845 + }, + { + "epoch": 0.13893621825795988, + "grad_norm": 0.6965453226724493, + "learning_rate": 8.992716841049694e-06, + "loss": 0.6805, + "step": 846 + }, + { + "epoch": 0.13910044546630262, + "grad_norm": 0.6834149944374464, + "learning_rate": 8.992690145612864e-06, + "loss": 0.7184, + "step": 847 + }, + { + "epoch": 0.13926467267464537, + "grad_norm": 0.6878178766526616, + "learning_rate": 8.992663401380995e-06, + "loss": 0.6993, + "step": 848 + }, + { + "epoch": 0.13942889988298812, + "grad_norm": 0.6872283526043412, + "learning_rate": 8.992636608354383e-06, + "loss": 0.6862, + "step": 849 + }, + { + "epoch": 0.13959312709133084, + "grad_norm": 0.6963162299008429, + "learning_rate": 8.992609766533318e-06, + "loss": 0.6932, + "step": 850 + }, + { + "epoch": 0.1397573542996736, + "grad_norm": 0.6895674941257026, + "learning_rate": 8.992582875918089e-06, + "loss": 0.7016, + "step": 851 + }, + { + "epoch": 0.13992158150801634, + "grad_norm": 0.6984241181890183, + "learning_rate": 8.992555936508991e-06, + "loss": 0.7067, + "step": 852 + }, + { + "epoch": 0.1400858087163591, + "grad_norm": 0.6905598521116458, + "learning_rate": 8.992528948306314e-06, + "loss": 0.6937, + "step": 853 + }, + { + "epoch": 0.14025003592470184, + "grad_norm": 0.685496447106375, + "learning_rate": 8.992501911310354e-06, + "loss": 0.6928, + "step": 854 + }, + { + "epoch": 0.14041426313304456, + "grad_norm": 0.7186605931410012, + "learning_rate": 8.992474825521403e-06, + "loss": 0.6888, + "step": 855 + }, + { + "epoch": 0.1405784903413873, + "grad_norm": 0.6926681031006332, + "learning_rate": 8.992447690939756e-06, + "loss": 0.6945, + "step": 856 + }, + { + "epoch": 0.14074271754973006, + "grad_norm": 0.6687413046389367, + "learning_rate": 8.992420507565707e-06, + "loss": 0.6915, + "step": 857 + }, + { + "epoch": 0.1409069447580728, + "grad_norm": 0.6750901909722651, + "learning_rate": 8.99239327539955e-06, + "loss": 0.6972, + "step": 858 + }, + { + "epoch": 0.14107117196641553, + "grad_norm": 0.7056162625158083, + "learning_rate": 8.992365994441583e-06, + "loss": 0.6981, + "step": 859 + }, + { + "epoch": 0.14123539917475827, + "grad_norm": 0.6767374335973468, + "learning_rate": 8.992338664692101e-06, + "loss": 0.704, + "step": 860 + }, + { + "epoch": 0.14139962638310102, + "grad_norm": 0.6824925357290131, + "learning_rate": 8.992311286151403e-06, + "loss": 0.7126, + "step": 861 + }, + { + "epoch": 0.14156385359144377, + "grad_norm": 0.6611734657558349, + "learning_rate": 8.992283858819784e-06, + "loss": 0.7142, + "step": 862 + }, + { + "epoch": 0.1417280807997865, + "grad_norm": 0.6632695371380769, + "learning_rate": 8.992256382697539e-06, + "loss": 0.7232, + "step": 863 + }, + { + "epoch": 0.14189230800812924, + "grad_norm": 0.6626421044508288, + "learning_rate": 8.992228857784975e-06, + "loss": 0.7306, + "step": 864 + }, + { + "epoch": 0.142056535216472, + "grad_norm": 0.6765717769175646, + "learning_rate": 8.992201284082383e-06, + "loss": 0.6982, + "step": 865 + }, + { + "epoch": 0.14222076242481474, + "grad_norm": 0.6708039962729273, + "learning_rate": 8.992173661590065e-06, + "loss": 0.7088, + "step": 866 + }, + { + "epoch": 0.14238498963315746, + "grad_norm": 0.6793311754143073, + "learning_rate": 8.992145990308323e-06, + "loss": 0.7004, + "step": 867 + }, + { + "epoch": 0.1425492168415002, + "grad_norm": 0.6657408958013594, + "learning_rate": 8.992118270237455e-06, + "loss": 0.6799, + "step": 868 + }, + { + "epoch": 0.14271344404984296, + "grad_norm": 0.6540758971441215, + "learning_rate": 8.992090501377762e-06, + "loss": 0.7228, + "step": 869 + }, + { + "epoch": 0.1428776712581857, + "grad_norm": 0.6643906221820899, + "learning_rate": 8.992062683729548e-06, + "loss": 0.7104, + "step": 870 + }, + { + "epoch": 0.14304189846652846, + "grad_norm": 0.6989964341191656, + "learning_rate": 8.992034817293112e-06, + "loss": 0.6858, + "step": 871 + }, + { + "epoch": 0.14320612567487118, + "grad_norm": 0.6812360050752753, + "learning_rate": 8.99200690206876e-06, + "loss": 0.7117, + "step": 872 + }, + { + "epoch": 0.14337035288321393, + "grad_norm": 0.6949187113631994, + "learning_rate": 8.991978938056793e-06, + "loss": 0.7152, + "step": 873 + }, + { + "epoch": 0.14353458009155667, + "grad_norm": 0.6577059262682897, + "learning_rate": 8.991950925257517e-06, + "loss": 0.711, + "step": 874 + }, + { + "epoch": 0.14369880729989942, + "grad_norm": 0.662738282110258, + "learning_rate": 8.991922863671232e-06, + "loss": 0.6881, + "step": 875 + }, + { + "epoch": 0.14386303450824214, + "grad_norm": 0.6782128483184631, + "learning_rate": 8.991894753298245e-06, + "loss": 0.6662, + "step": 876 + }, + { + "epoch": 0.1440272617165849, + "grad_norm": 0.6571616469132535, + "learning_rate": 8.991866594138862e-06, + "loss": 0.7077, + "step": 877 + }, + { + "epoch": 0.14419148892492764, + "grad_norm": 0.6516355933069562, + "learning_rate": 8.991838386193388e-06, + "loss": 0.7009, + "step": 878 + }, + { + "epoch": 0.1443557161332704, + "grad_norm": 0.6556308700827564, + "learning_rate": 8.991810129462132e-06, + "loss": 0.7031, + "step": 879 + }, + { + "epoch": 0.1445199433416131, + "grad_norm": 0.6627545749856372, + "learning_rate": 8.991781823945398e-06, + "loss": 0.6717, + "step": 880 + }, + { + "epoch": 0.14468417054995586, + "grad_norm": 0.6482910836902056, + "learning_rate": 8.991753469643493e-06, + "loss": 0.7217, + "step": 881 + }, + { + "epoch": 0.1448483977582986, + "grad_norm": 0.7686915515107932, + "learning_rate": 8.991725066556726e-06, + "loss": 0.7087, + "step": 882 + }, + { + "epoch": 0.14501262496664136, + "grad_norm": 0.633391471242811, + "learning_rate": 8.991696614685406e-06, + "loss": 0.7121, + "step": 883 + }, + { + "epoch": 0.14517685217498408, + "grad_norm": 0.6592509022850245, + "learning_rate": 8.99166811402984e-06, + "loss": 0.6901, + "step": 884 + }, + { + "epoch": 0.14534107938332683, + "grad_norm": 0.6462500462113238, + "learning_rate": 8.991639564590342e-06, + "loss": 0.6892, + "step": 885 + }, + { + "epoch": 0.14550530659166958, + "grad_norm": 0.6449101927159558, + "learning_rate": 8.991610966367217e-06, + "loss": 0.6986, + "step": 886 + }, + { + "epoch": 0.14566953380001232, + "grad_norm": 0.6513519292471238, + "learning_rate": 8.991582319360779e-06, + "loss": 0.6952, + "step": 887 + }, + { + "epoch": 0.14583376100835507, + "grad_norm": 0.6351156522282531, + "learning_rate": 8.991553623571336e-06, + "loss": 0.6947, + "step": 888 + }, + { + "epoch": 0.1459979882166978, + "grad_norm": 0.6463441730942351, + "learning_rate": 8.991524878999202e-06, + "loss": 0.6855, + "step": 889 + }, + { + "epoch": 0.14616221542504054, + "grad_norm": 0.6296524118090603, + "learning_rate": 8.99149608564469e-06, + "loss": 0.6988, + "step": 890 + }, + { + "epoch": 0.1463264426333833, + "grad_norm": 0.6527468073618038, + "learning_rate": 8.991467243508111e-06, + "loss": 0.6997, + "step": 891 + }, + { + "epoch": 0.14649066984172604, + "grad_norm": 0.6337043319103454, + "learning_rate": 8.991438352589779e-06, + "loss": 0.6963, + "step": 892 + }, + { + "epoch": 0.14665489705006876, + "grad_norm": 0.6566657563406406, + "learning_rate": 8.991409412890008e-06, + "loss": 0.7111, + "step": 893 + }, + { + "epoch": 0.1468191242584115, + "grad_norm": 0.6858464246143825, + "learning_rate": 8.991380424409111e-06, + "loss": 0.7123, + "step": 894 + }, + { + "epoch": 0.14698335146675426, + "grad_norm": 0.6529635045203046, + "learning_rate": 8.991351387147404e-06, + "loss": 0.7247, + "step": 895 + }, + { + "epoch": 0.147147578675097, + "grad_norm": 0.6511476919346777, + "learning_rate": 8.991322301105202e-06, + "loss": 0.7022, + "step": 896 + }, + { + "epoch": 0.14731180588343973, + "grad_norm": 0.6335117395711469, + "learning_rate": 8.99129316628282e-06, + "loss": 0.6852, + "step": 897 + }, + { + "epoch": 0.14747603309178248, + "grad_norm": 0.6231917174435052, + "learning_rate": 8.991263982680576e-06, + "loss": 0.7049, + "step": 898 + }, + { + "epoch": 0.14764026030012523, + "grad_norm": 0.6470589103422413, + "learning_rate": 8.991234750298787e-06, + "loss": 0.7089, + "step": 899 + }, + { + "epoch": 0.14780448750846797, + "grad_norm": 0.6358772641933567, + "learning_rate": 8.99120546913777e-06, + "loss": 0.6738, + "step": 900 + }, + { + "epoch": 0.1479687147168107, + "grad_norm": 0.6221324653432067, + "learning_rate": 8.991176139197841e-06, + "loss": 0.7093, + "step": 901 + }, + { + "epoch": 0.14813294192515344, + "grad_norm": 0.6215558642133383, + "learning_rate": 8.991146760479323e-06, + "loss": 0.6978, + "step": 902 + }, + { + "epoch": 0.1482971691334962, + "grad_norm": 0.6263701610045573, + "learning_rate": 8.99111733298253e-06, + "loss": 0.7108, + "step": 903 + }, + { + "epoch": 0.14846139634183894, + "grad_norm": 0.6296632177864455, + "learning_rate": 8.991087856707785e-06, + "loss": 0.7151, + "step": 904 + }, + { + "epoch": 0.14862562355018166, + "grad_norm": 0.6178781456844099, + "learning_rate": 8.991058331655408e-06, + "loss": 0.6921, + "step": 905 + }, + { + "epoch": 0.1487898507585244, + "grad_norm": 0.6426241622321025, + "learning_rate": 8.991028757825718e-06, + "loss": 0.6661, + "step": 906 + }, + { + "epoch": 0.14895407796686716, + "grad_norm": 0.6315217002269438, + "learning_rate": 8.990999135219037e-06, + "loss": 0.7029, + "step": 907 + }, + { + "epoch": 0.1491183051752099, + "grad_norm": 0.6221289953237676, + "learning_rate": 8.990969463835688e-06, + "loss": 0.6854, + "step": 908 + }, + { + "epoch": 0.14928253238355266, + "grad_norm": 0.6351780842503867, + "learning_rate": 8.99093974367599e-06, + "loss": 0.6746, + "step": 909 + }, + { + "epoch": 0.14944675959189538, + "grad_norm": 0.6162843131557986, + "learning_rate": 8.990909974740271e-06, + "loss": 0.7063, + "step": 910 + }, + { + "epoch": 0.14961098680023813, + "grad_norm": 0.6438716140978041, + "learning_rate": 8.990880157028849e-06, + "loss": 0.668, + "step": 911 + }, + { + "epoch": 0.14977521400858088, + "grad_norm": 0.6738550733623182, + "learning_rate": 8.99085029054205e-06, + "loss": 0.6723, + "step": 912 + }, + { + "epoch": 0.14993944121692362, + "grad_norm": 0.6236064060080636, + "learning_rate": 8.990820375280199e-06, + "loss": 0.6933, + "step": 913 + }, + { + "epoch": 0.15010366842526635, + "grad_norm": 0.8878180053952123, + "learning_rate": 8.990790411243622e-06, + "loss": 0.7072, + "step": 914 + }, + { + "epoch": 0.1502678956336091, + "grad_norm": 0.6040213602808409, + "learning_rate": 8.99076039843264e-06, + "loss": 0.7265, + "step": 915 + }, + { + "epoch": 0.15043212284195184, + "grad_norm": 0.5993272996617645, + "learning_rate": 8.990730336847584e-06, + "loss": 0.6735, + "step": 916 + }, + { + "epoch": 0.1505963500502946, + "grad_norm": 0.6196667818949094, + "learning_rate": 8.990700226488775e-06, + "loss": 0.683, + "step": 917 + }, + { + "epoch": 0.1507605772586373, + "grad_norm": 0.611151140998212, + "learning_rate": 8.990670067356546e-06, + "loss": 0.6855, + "step": 918 + }, + { + "epoch": 0.15092480446698006, + "grad_norm": 0.6124554833223387, + "learning_rate": 8.990639859451223e-06, + "loss": 0.7316, + "step": 919 + }, + { + "epoch": 0.1510890316753228, + "grad_norm": 0.8671036907456195, + "learning_rate": 8.99060960277313e-06, + "loss": 0.6754, + "step": 920 + }, + { + "epoch": 0.15125325888366556, + "grad_norm": 0.604961084181521, + "learning_rate": 8.9905792973226e-06, + "loss": 0.6957, + "step": 921 + }, + { + "epoch": 0.15141748609200828, + "grad_norm": 0.6367685614450236, + "learning_rate": 8.99054894309996e-06, + "loss": 0.6878, + "step": 922 + }, + { + "epoch": 0.15158171330035103, + "grad_norm": 0.6291160157774949, + "learning_rate": 8.99051854010554e-06, + "loss": 0.6758, + "step": 923 + }, + { + "epoch": 0.15174594050869378, + "grad_norm": 0.6186538967618891, + "learning_rate": 8.990488088339673e-06, + "loss": 0.6851, + "step": 924 + }, + { + "epoch": 0.15191016771703653, + "grad_norm": 0.6043423314180405, + "learning_rate": 8.990457587802685e-06, + "loss": 0.7029, + "step": 925 + }, + { + "epoch": 0.15207439492537927, + "grad_norm": 0.6057826193851099, + "learning_rate": 8.99042703849491e-06, + "loss": 0.6894, + "step": 926 + }, + { + "epoch": 0.152238622133722, + "grad_norm": 0.6415804974916999, + "learning_rate": 8.990396440416682e-06, + "loss": 0.6895, + "step": 927 + }, + { + "epoch": 0.15240284934206474, + "grad_norm": 0.590171420782872, + "learning_rate": 8.990365793568326e-06, + "loss": 0.7064, + "step": 928 + }, + { + "epoch": 0.1525670765504075, + "grad_norm": 0.5991023181384277, + "learning_rate": 8.990335097950184e-06, + "loss": 0.7034, + "step": 929 + }, + { + "epoch": 0.15273130375875024, + "grad_norm": 0.5925124030029137, + "learning_rate": 8.990304353562582e-06, + "loss": 0.6722, + "step": 930 + }, + { + "epoch": 0.15289553096709296, + "grad_norm": 0.5879098836185569, + "learning_rate": 8.990273560405858e-06, + "loss": 0.6778, + "step": 931 + }, + { + "epoch": 0.1530597581754357, + "grad_norm": 0.6081890261292409, + "learning_rate": 8.990242718480345e-06, + "loss": 0.6977, + "step": 932 + }, + { + "epoch": 0.15322398538377846, + "grad_norm": 0.5943707439558413, + "learning_rate": 8.990211827786377e-06, + "loss": 0.6833, + "step": 933 + }, + { + "epoch": 0.1533882125921212, + "grad_norm": 0.5957073787044868, + "learning_rate": 8.990180888324293e-06, + "loss": 0.6769, + "step": 934 + }, + { + "epoch": 0.15355243980046393, + "grad_norm": 0.5990803867181086, + "learning_rate": 8.990149900094426e-06, + "loss": 0.6947, + "step": 935 + }, + { + "epoch": 0.15371666700880668, + "grad_norm": 0.5762901620148804, + "learning_rate": 8.990118863097113e-06, + "loss": 0.6989, + "step": 936 + }, + { + "epoch": 0.15388089421714943, + "grad_norm": 0.583179708719448, + "learning_rate": 8.990087777332693e-06, + "loss": 0.7021, + "step": 937 + }, + { + "epoch": 0.15404512142549218, + "grad_norm": 0.5957217968883289, + "learning_rate": 8.9900566428015e-06, + "loss": 0.6581, + "step": 938 + }, + { + "epoch": 0.1542093486338349, + "grad_norm": 0.5768688535477488, + "learning_rate": 8.990025459503875e-06, + "loss": 0.6909, + "step": 939 + }, + { + "epoch": 0.15437357584217765, + "grad_norm": 0.5863263337089208, + "learning_rate": 8.989994227440156e-06, + "loss": 0.6903, + "step": 940 + }, + { + "epoch": 0.1545378030505204, + "grad_norm": 1.449174264009128, + "learning_rate": 8.989962946610682e-06, + "loss": 0.6777, + "step": 941 + }, + { + "epoch": 0.15470203025886314, + "grad_norm": 0.5857095155638259, + "learning_rate": 8.989931617015794e-06, + "loss": 0.6847, + "step": 942 + }, + { + "epoch": 0.1548662574672059, + "grad_norm": 0.6045629484192497, + "learning_rate": 8.98990023865583e-06, + "loss": 0.6973, + "step": 943 + }, + { + "epoch": 0.1550304846755486, + "grad_norm": 0.6271020723122404, + "learning_rate": 8.989868811531133e-06, + "loss": 0.6822, + "step": 944 + }, + { + "epoch": 0.15519471188389136, + "grad_norm": 0.5974097511017354, + "learning_rate": 8.989837335642041e-06, + "loss": 0.6932, + "step": 945 + }, + { + "epoch": 0.1553589390922341, + "grad_norm": 0.6165658939960453, + "learning_rate": 8.989805810988899e-06, + "loss": 0.7028, + "step": 946 + }, + { + "epoch": 0.15552316630057686, + "grad_norm": 0.5750032115880928, + "learning_rate": 8.98977423757205e-06, + "loss": 0.699, + "step": 947 + }, + { + "epoch": 0.15568739350891958, + "grad_norm": 0.5930114131715402, + "learning_rate": 8.989742615391835e-06, + "loss": 0.6946, + "step": 948 + }, + { + "epoch": 0.15585162071726233, + "grad_norm": 0.5821912545770899, + "learning_rate": 8.989710944448598e-06, + "loss": 0.6944, + "step": 949 + }, + { + "epoch": 0.15601584792560508, + "grad_norm": 0.5998707680711169, + "learning_rate": 8.989679224742682e-06, + "loss": 0.6817, + "step": 950 + }, + { + "epoch": 0.15618007513394783, + "grad_norm": 0.784960727758064, + "learning_rate": 8.989647456274432e-06, + "loss": 0.6667, + "step": 951 + }, + { + "epoch": 0.15634430234229055, + "grad_norm": 0.5905550333586939, + "learning_rate": 8.989615639044194e-06, + "loss": 0.6909, + "step": 952 + }, + { + "epoch": 0.1565085295506333, + "grad_norm": 0.5813775463869184, + "learning_rate": 8.989583773052312e-06, + "loss": 0.6741, + "step": 953 + }, + { + "epoch": 0.15667275675897605, + "grad_norm": 0.5859001004681035, + "learning_rate": 8.989551858299135e-06, + "loss": 0.675, + "step": 954 + }, + { + "epoch": 0.1568369839673188, + "grad_norm": 0.5789497437415951, + "learning_rate": 8.989519894785007e-06, + "loss": 0.6924, + "step": 955 + }, + { + "epoch": 0.15700121117566151, + "grad_norm": 0.576924083966074, + "learning_rate": 8.989487882510275e-06, + "loss": 0.6951, + "step": 956 + }, + { + "epoch": 0.15716543838400426, + "grad_norm": 0.5860897339311051, + "learning_rate": 8.98945582147529e-06, + "loss": 0.6837, + "step": 957 + }, + { + "epoch": 0.157329665592347, + "grad_norm": 0.567962554350338, + "learning_rate": 8.989423711680394e-06, + "loss": 0.6718, + "step": 958 + }, + { + "epoch": 0.15749389280068976, + "grad_norm": 0.580251872407634, + "learning_rate": 8.989391553125943e-06, + "loss": 0.6766, + "step": 959 + }, + { + "epoch": 0.1576581200090325, + "grad_norm": 0.5835185656150728, + "learning_rate": 8.98935934581228e-06, + "loss": 0.6656, + "step": 960 + }, + { + "epoch": 0.15782234721737523, + "grad_norm": 0.5680557698537251, + "learning_rate": 8.989327089739759e-06, + "loss": 0.6886, + "step": 961 + }, + { + "epoch": 0.15798657442571798, + "grad_norm": 0.5822249108751449, + "learning_rate": 8.98929478490873e-06, + "loss": 0.6793, + "step": 962 + }, + { + "epoch": 0.15815080163406073, + "grad_norm": 0.6920777043397142, + "learning_rate": 8.989262431319541e-06, + "loss": 0.68, + "step": 963 + }, + { + "epoch": 0.15831502884240348, + "grad_norm": 0.5697926374238244, + "learning_rate": 8.989230028972546e-06, + "loss": 0.6902, + "step": 964 + }, + { + "epoch": 0.1584792560507462, + "grad_norm": 0.5648702262985533, + "learning_rate": 8.989197577868095e-06, + "loss": 0.6785, + "step": 965 + }, + { + "epoch": 0.15864348325908895, + "grad_norm": 0.574611072826276, + "learning_rate": 8.989165078006542e-06, + "loss": 0.702, + "step": 966 + }, + { + "epoch": 0.1588077104674317, + "grad_norm": 0.5653515769606229, + "learning_rate": 8.989132529388239e-06, + "loss": 0.6805, + "step": 967 + }, + { + "epoch": 0.15897193767577444, + "grad_norm": 0.5677479447859182, + "learning_rate": 8.989099932013541e-06, + "loss": 0.6899, + "step": 968 + }, + { + "epoch": 0.15913616488411716, + "grad_norm": 0.5600255179636507, + "learning_rate": 8.989067285882801e-06, + "loss": 0.6952, + "step": 969 + }, + { + "epoch": 0.1593003920924599, + "grad_norm": 0.5645642174349392, + "learning_rate": 8.989034590996373e-06, + "loss": 0.6797, + "step": 970 + }, + { + "epoch": 0.15946461930080266, + "grad_norm": 0.5661098243057704, + "learning_rate": 8.989001847354614e-06, + "loss": 0.6769, + "step": 971 + }, + { + "epoch": 0.1596288465091454, + "grad_norm": 0.5608033372773416, + "learning_rate": 8.988969054957878e-06, + "loss": 0.6786, + "step": 972 + }, + { + "epoch": 0.15979307371748813, + "grad_norm": 0.5780246426412402, + "learning_rate": 8.98893621380652e-06, + "loss": 0.6693, + "step": 973 + }, + { + "epoch": 0.15995730092583088, + "grad_norm": 0.5786252288148208, + "learning_rate": 8.9889033239009e-06, + "loss": 0.6862, + "step": 974 + }, + { + "epoch": 0.16012152813417363, + "grad_norm": 0.7542693728723142, + "learning_rate": 8.988870385241371e-06, + "loss": 0.6678, + "step": 975 + }, + { + "epoch": 0.16028575534251638, + "grad_norm": 0.6306239933124853, + "learning_rate": 8.988837397828296e-06, + "loss": 0.6757, + "step": 976 + }, + { + "epoch": 0.16044998255085913, + "grad_norm": 0.5939312755402465, + "learning_rate": 8.988804361662029e-06, + "loss": 0.6713, + "step": 977 + }, + { + "epoch": 0.16061420975920185, + "grad_norm": 0.5633583256765683, + "learning_rate": 8.98877127674293e-06, + "loss": 0.6673, + "step": 978 + }, + { + "epoch": 0.1607784369675446, + "grad_norm": 0.5800987462350472, + "learning_rate": 8.988738143071359e-06, + "loss": 0.6605, + "step": 979 + }, + { + "epoch": 0.16094266417588735, + "grad_norm": 0.5643582604641704, + "learning_rate": 8.988704960647677e-06, + "loss": 0.7013, + "step": 980 + }, + { + "epoch": 0.1611068913842301, + "grad_norm": 0.5835933494518621, + "learning_rate": 8.988671729472241e-06, + "loss": 0.6782, + "step": 981 + }, + { + "epoch": 0.16127111859257282, + "grad_norm": 0.6788940707481684, + "learning_rate": 8.988638449545415e-06, + "loss": 0.6844, + "step": 982 + }, + { + "epoch": 0.16143534580091556, + "grad_norm": 0.5471636693606157, + "learning_rate": 8.988605120867557e-06, + "loss": 0.6786, + "step": 983 + }, + { + "epoch": 0.1615995730092583, + "grad_norm": 0.5734962283482052, + "learning_rate": 8.988571743439032e-06, + "loss": 0.6819, + "step": 984 + }, + { + "epoch": 0.16176380021760106, + "grad_norm": 0.6161812954255436, + "learning_rate": 8.988538317260203e-06, + "loss": 0.7011, + "step": 985 + }, + { + "epoch": 0.16192802742594378, + "grad_norm": 0.5586729255122475, + "learning_rate": 8.988504842331431e-06, + "loss": 0.6924, + "step": 986 + }, + { + "epoch": 0.16209225463428653, + "grad_norm": 0.5589443674472094, + "learning_rate": 8.988471318653081e-06, + "loss": 0.6688, + "step": 987 + }, + { + "epoch": 0.16225648184262928, + "grad_norm": 0.5725349388200487, + "learning_rate": 8.988437746225515e-06, + "loss": 0.6859, + "step": 988 + }, + { + "epoch": 0.16242070905097203, + "grad_norm": 0.5386632021604499, + "learning_rate": 8.9884041250491e-06, + "loss": 0.7098, + "step": 989 + }, + { + "epoch": 0.16258493625931475, + "grad_norm": 0.5588338765541392, + "learning_rate": 8.9883704551242e-06, + "loss": 0.6869, + "step": 990 + }, + { + "epoch": 0.1627491634676575, + "grad_norm": 0.5620172479210048, + "learning_rate": 8.98833673645118e-06, + "loss": 0.6801, + "step": 991 + }, + { + "epoch": 0.16291339067600025, + "grad_norm": 0.578537129368564, + "learning_rate": 8.988302969030409e-06, + "loss": 0.6802, + "step": 992 + }, + { + "epoch": 0.163077617884343, + "grad_norm": 0.5788833235484687, + "learning_rate": 8.98826915286225e-06, + "loss": 0.6838, + "step": 993 + }, + { + "epoch": 0.16324184509268574, + "grad_norm": 0.5455036420454441, + "learning_rate": 8.988235287947074e-06, + "loss": 0.684, + "step": 994 + }, + { + "epoch": 0.16340607230102847, + "grad_norm": 0.5469684799799748, + "learning_rate": 8.988201374285246e-06, + "loss": 0.6732, + "step": 995 + }, + { + "epoch": 0.16357029950937121, + "grad_norm": 0.5659737004114193, + "learning_rate": 8.988167411877134e-06, + "loss": 0.6774, + "step": 996 + }, + { + "epoch": 0.16373452671771396, + "grad_norm": 0.5472964921114717, + "learning_rate": 8.98813340072311e-06, + "loss": 0.6561, + "step": 997 + }, + { + "epoch": 0.1638987539260567, + "grad_norm": 0.5313159381480278, + "learning_rate": 8.98809934082354e-06, + "loss": 0.6924, + "step": 998 + }, + { + "epoch": 0.16406298113439943, + "grad_norm": 0.5792443966133527, + "learning_rate": 8.988065232178799e-06, + "loss": 0.6711, + "step": 999 + }, + { + "epoch": 0.16422720834274218, + "grad_norm": 0.5545515494905078, + "learning_rate": 8.98803107478925e-06, + "loss": 0.6831, + "step": 1000 + }, + { + "epoch": 0.16439143555108493, + "grad_norm": 0.5573621935545905, + "learning_rate": 8.98799686865527e-06, + "loss": 0.6991, + "step": 1001 + }, + { + "epoch": 0.16455566275942768, + "grad_norm": 0.5491556801369158, + "learning_rate": 8.987962613777226e-06, + "loss": 0.6638, + "step": 1002 + }, + { + "epoch": 0.1647198899677704, + "grad_norm": 0.5392445583149836, + "learning_rate": 8.987928310155495e-06, + "loss": 0.6652, + "step": 1003 + }, + { + "epoch": 0.16488411717611315, + "grad_norm": 0.5348090900057155, + "learning_rate": 8.987893957790447e-06, + "loss": 0.6828, + "step": 1004 + }, + { + "epoch": 0.1650483443844559, + "grad_norm": 0.5577106888054703, + "learning_rate": 8.987859556682454e-06, + "loss": 0.6719, + "step": 1005 + }, + { + "epoch": 0.16521257159279865, + "grad_norm": 0.5363067993234555, + "learning_rate": 8.98782510683189e-06, + "loss": 0.6803, + "step": 1006 + }, + { + "epoch": 0.16537679880114137, + "grad_norm": 0.5603737925264757, + "learning_rate": 8.987790608239131e-06, + "loss": 0.6825, + "step": 1007 + }, + { + "epoch": 0.16554102600948412, + "grad_norm": 0.5411605815062192, + "learning_rate": 8.98775606090455e-06, + "loss": 0.6625, + "step": 1008 + }, + { + "epoch": 0.16570525321782686, + "grad_norm": 0.5489945681101899, + "learning_rate": 8.987721464828525e-06, + "loss": 0.6695, + "step": 1009 + }, + { + "epoch": 0.1658694804261696, + "grad_norm": 0.54397334030031, + "learning_rate": 8.987686820011428e-06, + "loss": 0.6667, + "step": 1010 + }, + { + "epoch": 0.16603370763451233, + "grad_norm": 0.5321484388094383, + "learning_rate": 8.987652126453636e-06, + "loss": 0.691, + "step": 1011 + }, + { + "epoch": 0.16619793484285508, + "grad_norm": 0.5317184209836602, + "learning_rate": 8.987617384155527e-06, + "loss": 0.6477, + "step": 1012 + }, + { + "epoch": 0.16636216205119783, + "grad_norm": 0.5341564431442047, + "learning_rate": 8.987582593117478e-06, + "loss": 0.6473, + "step": 1013 + }, + { + "epoch": 0.16652638925954058, + "grad_norm": 0.5787953947276587, + "learning_rate": 8.987547753339868e-06, + "loss": 0.6788, + "step": 1014 + }, + { + "epoch": 0.16669061646788333, + "grad_norm": 0.5338083228144461, + "learning_rate": 8.987512864823073e-06, + "loss": 0.6783, + "step": 1015 + }, + { + "epoch": 0.16685484367622605, + "grad_norm": 0.5431111870614626, + "learning_rate": 8.987477927567475e-06, + "loss": 0.6665, + "step": 1016 + }, + { + "epoch": 0.1670190708845688, + "grad_norm": 0.5460094365072707, + "learning_rate": 8.987442941573448e-06, + "loss": 0.665, + "step": 1017 + }, + { + "epoch": 0.16718329809291155, + "grad_norm": 0.5427687593109538, + "learning_rate": 8.98740790684138e-06, + "loss": 0.6562, + "step": 1018 + }, + { + "epoch": 0.1673475253012543, + "grad_norm": 0.533396167621092, + "learning_rate": 8.987372823371644e-06, + "loss": 0.6684, + "step": 1019 + }, + { + "epoch": 0.16751175250959702, + "grad_norm": 0.5320714216988802, + "learning_rate": 8.987337691164625e-06, + "loss": 0.6519, + "step": 1020 + }, + { + "epoch": 0.16767597971793977, + "grad_norm": 0.5456380984656337, + "learning_rate": 8.987302510220704e-06, + "loss": 0.686, + "step": 1021 + }, + { + "epoch": 0.16784020692628251, + "grad_norm": 0.5224293963323513, + "learning_rate": 8.987267280540264e-06, + "loss": 0.6697, + "step": 1022 + }, + { + "epoch": 0.16800443413462526, + "grad_norm": 0.5243433616305909, + "learning_rate": 8.987232002123685e-06, + "loss": 0.6921, + "step": 1023 + }, + { + "epoch": 0.16816866134296798, + "grad_norm": 0.5328025582851148, + "learning_rate": 8.987196674971352e-06, + "loss": 0.6555, + "step": 1024 + }, + { + "epoch": 0.16833288855131073, + "grad_norm": 0.5198078640352478, + "learning_rate": 8.987161299083647e-06, + "loss": 0.6379, + "step": 1025 + }, + { + "epoch": 0.16849711575965348, + "grad_norm": 0.5882832847154114, + "learning_rate": 8.987125874460957e-06, + "loss": 0.6701, + "step": 1026 + }, + { + "epoch": 0.16866134296799623, + "grad_norm": 0.5295752447874579, + "learning_rate": 8.987090401103665e-06, + "loss": 0.6944, + "step": 1027 + }, + { + "epoch": 0.16882557017633895, + "grad_norm": 0.5326690138378312, + "learning_rate": 8.987054879012156e-06, + "loss": 0.6812, + "step": 1028 + }, + { + "epoch": 0.1689897973846817, + "grad_norm": 0.5279524083193049, + "learning_rate": 8.987019308186818e-06, + "loss": 0.664, + "step": 1029 + }, + { + "epoch": 0.16915402459302445, + "grad_norm": 0.5696406663324558, + "learning_rate": 8.986983688628034e-06, + "loss": 0.6615, + "step": 1030 + }, + { + "epoch": 0.1693182518013672, + "grad_norm": 0.5237891547003628, + "learning_rate": 8.986948020336192e-06, + "loss": 0.6643, + "step": 1031 + }, + { + "epoch": 0.16948247900970995, + "grad_norm": 0.5398581475126638, + "learning_rate": 8.986912303311682e-06, + "loss": 0.6935, + "step": 1032 + }, + { + "epoch": 0.16964670621805267, + "grad_norm": 0.5257870387792046, + "learning_rate": 8.986876537554889e-06, + "loss": 0.6688, + "step": 1033 + }, + { + "epoch": 0.16981093342639542, + "grad_norm": 0.5358931293606296, + "learning_rate": 8.986840723066202e-06, + "loss": 0.6756, + "step": 1034 + }, + { + "epoch": 0.16997516063473816, + "grad_norm": 0.5429766773598853, + "learning_rate": 8.98680485984601e-06, + "loss": 0.6706, + "step": 1035 + }, + { + "epoch": 0.1701393878430809, + "grad_norm": 0.5115483229740615, + "learning_rate": 8.986768947894704e-06, + "loss": 0.665, + "step": 1036 + }, + { + "epoch": 0.17030361505142363, + "grad_norm": 0.5223361541452932, + "learning_rate": 8.98673298721267e-06, + "loss": 0.6848, + "step": 1037 + }, + { + "epoch": 0.17046784225976638, + "grad_norm": 0.8292942856556398, + "learning_rate": 8.986696977800305e-06, + "loss": 0.6611, + "step": 1038 + }, + { + "epoch": 0.17063206946810913, + "grad_norm": 0.5594058489983992, + "learning_rate": 8.986660919657995e-06, + "loss": 0.6699, + "step": 1039 + }, + { + "epoch": 0.17079629667645188, + "grad_norm": 0.5191782674532504, + "learning_rate": 8.986624812786133e-06, + "loss": 0.6463, + "step": 1040 + }, + { + "epoch": 0.1709605238847946, + "grad_norm": 0.5065778655698908, + "learning_rate": 8.986588657185112e-06, + "loss": 0.6804, + "step": 1041 + }, + { + "epoch": 0.17112475109313735, + "grad_norm": 0.5140861111946393, + "learning_rate": 8.986552452855323e-06, + "loss": 0.6425, + "step": 1042 + }, + { + "epoch": 0.1712889783014801, + "grad_norm": 0.5472505082789985, + "learning_rate": 8.98651619979716e-06, + "loss": 0.654, + "step": 1043 + }, + { + "epoch": 0.17145320550982285, + "grad_norm": 0.5101925390931323, + "learning_rate": 8.986479898011019e-06, + "loss": 0.685, + "step": 1044 + }, + { + "epoch": 0.17161743271816557, + "grad_norm": 0.5206702278219933, + "learning_rate": 8.98644354749729e-06, + "loss": 0.6693, + "step": 1045 + }, + { + "epoch": 0.17178165992650832, + "grad_norm": 0.5126143393833118, + "learning_rate": 8.986407148256372e-06, + "loss": 0.6799, + "step": 1046 + }, + { + "epoch": 0.17194588713485107, + "grad_norm": 0.5331986314895121, + "learning_rate": 8.986370700288655e-06, + "loss": 0.6638, + "step": 1047 + }, + { + "epoch": 0.17211011434319382, + "grad_norm": 0.5073509933218882, + "learning_rate": 8.98633420359454e-06, + "loss": 0.6802, + "step": 1048 + }, + { + "epoch": 0.17227434155153656, + "grad_norm": 0.508473575804819, + "learning_rate": 8.986297658174423e-06, + "loss": 0.6571, + "step": 1049 + }, + { + "epoch": 0.17243856875987928, + "grad_norm": 0.49990662654253826, + "learning_rate": 8.986261064028698e-06, + "loss": 0.668, + "step": 1050 + }, + { + "epoch": 0.17260279596822203, + "grad_norm": 0.5180145085098473, + "learning_rate": 8.986224421157764e-06, + "loss": 0.6652, + "step": 1051 + }, + { + "epoch": 0.17276702317656478, + "grad_norm": 0.5008280735732593, + "learning_rate": 8.98618772956202e-06, + "loss": 0.6635, + "step": 1052 + }, + { + "epoch": 0.17293125038490753, + "grad_norm": 0.525862217902218, + "learning_rate": 8.986150989241863e-06, + "loss": 0.6604, + "step": 1053 + }, + { + "epoch": 0.17309547759325025, + "grad_norm": 0.5041642087839215, + "learning_rate": 8.986114200197692e-06, + "loss": 0.6637, + "step": 1054 + }, + { + "epoch": 0.173259704801593, + "grad_norm": 0.5755431185436396, + "learning_rate": 8.986077362429908e-06, + "loss": 0.6724, + "step": 1055 + }, + { + "epoch": 0.17342393200993575, + "grad_norm": 0.5355683129486976, + "learning_rate": 8.986040475938908e-06, + "loss": 0.6549, + "step": 1056 + }, + { + "epoch": 0.1735881592182785, + "grad_norm": 0.5041556885697243, + "learning_rate": 8.986003540725098e-06, + "loss": 0.6591, + "step": 1057 + }, + { + "epoch": 0.17375238642662122, + "grad_norm": 0.511207002198651, + "learning_rate": 8.985966556788873e-06, + "loss": 0.6426, + "step": 1058 + }, + { + "epoch": 0.17391661363496397, + "grad_norm": 0.49542003624935754, + "learning_rate": 8.985929524130638e-06, + "loss": 0.6586, + "step": 1059 + }, + { + "epoch": 0.17408084084330672, + "grad_norm": 0.5639426152691353, + "learning_rate": 8.985892442750796e-06, + "loss": 0.6463, + "step": 1060 + }, + { + "epoch": 0.17424506805164947, + "grad_norm": 0.5156251092751705, + "learning_rate": 8.985855312649749e-06, + "loss": 0.701, + "step": 1061 + }, + { + "epoch": 0.1744092952599922, + "grad_norm": 0.5004678577815013, + "learning_rate": 8.985818133827898e-06, + "loss": 0.6876, + "step": 1062 + }, + { + "epoch": 0.17457352246833494, + "grad_norm": 0.5368872269813518, + "learning_rate": 8.98578090628565e-06, + "loss": 0.6573, + "step": 1063 + }, + { + "epoch": 0.17473774967667768, + "grad_norm": 0.5012298661884161, + "learning_rate": 8.985743630023406e-06, + "loss": 0.675, + "step": 1064 + }, + { + "epoch": 0.17490197688502043, + "grad_norm": 0.586654914508436, + "learning_rate": 8.985706305041575e-06, + "loss": 0.6746, + "step": 1065 + }, + { + "epoch": 0.17506620409336318, + "grad_norm": 0.5265854279573517, + "learning_rate": 8.98566893134056e-06, + "loss": 0.6883, + "step": 1066 + }, + { + "epoch": 0.1752304313017059, + "grad_norm": 0.49981491800267486, + "learning_rate": 8.985631508920767e-06, + "loss": 0.6485, + "step": 1067 + }, + { + "epoch": 0.17539465851004865, + "grad_norm": 0.4938509609458083, + "learning_rate": 8.985594037782602e-06, + "loss": 0.6846, + "step": 1068 + }, + { + "epoch": 0.1755588857183914, + "grad_norm": 0.5031409162616062, + "learning_rate": 8.985556517926472e-06, + "loss": 0.6357, + "step": 1069 + }, + { + "epoch": 0.17572311292673415, + "grad_norm": 0.5700620669276998, + "learning_rate": 8.985518949352786e-06, + "loss": 0.6849, + "step": 1070 + }, + { + "epoch": 0.17588734013507687, + "grad_norm": 0.5145274475647432, + "learning_rate": 8.98548133206195e-06, + "loss": 0.6694, + "step": 1071 + }, + { + "epoch": 0.17605156734341962, + "grad_norm": 0.5063946464405173, + "learning_rate": 8.985443666054375e-06, + "loss": 0.678, + "step": 1072 + }, + { + "epoch": 0.17621579455176237, + "grad_norm": 0.4916761388400217, + "learning_rate": 8.985405951330468e-06, + "loss": 0.6829, + "step": 1073 + }, + { + "epoch": 0.17638002176010512, + "grad_norm": 0.5289628393025868, + "learning_rate": 8.98536818789064e-06, + "loss": 0.6685, + "step": 1074 + }, + { + "epoch": 0.17654424896844784, + "grad_norm": 0.5643139132504469, + "learning_rate": 8.985330375735298e-06, + "loss": 0.6627, + "step": 1075 + }, + { + "epoch": 0.17670847617679059, + "grad_norm": 0.4814303227261524, + "learning_rate": 8.985292514864859e-06, + "loss": 0.6559, + "step": 1076 + }, + { + "epoch": 0.17687270338513333, + "grad_norm": 0.4974900220408461, + "learning_rate": 8.985254605279726e-06, + "loss": 0.6716, + "step": 1077 + }, + { + "epoch": 0.17703693059347608, + "grad_norm": 0.48748470280103556, + "learning_rate": 8.985216646980318e-06, + "loss": 0.6554, + "step": 1078 + }, + { + "epoch": 0.1772011578018188, + "grad_norm": 0.476700155245185, + "learning_rate": 8.985178639967044e-06, + "loss": 0.6633, + "step": 1079 + }, + { + "epoch": 0.17736538501016155, + "grad_norm": 0.48611980044750863, + "learning_rate": 8.985140584240317e-06, + "loss": 0.6477, + "step": 1080 + }, + { + "epoch": 0.1775296122185043, + "grad_norm": 0.4744923322444135, + "learning_rate": 8.985102479800551e-06, + "loss": 0.6889, + "step": 1081 + }, + { + "epoch": 0.17769383942684705, + "grad_norm": 0.492363316728015, + "learning_rate": 8.985064326648157e-06, + "loss": 0.6646, + "step": 1082 + }, + { + "epoch": 0.1778580666351898, + "grad_norm": 0.5034362431677137, + "learning_rate": 8.985026124783554e-06, + "loss": 0.6495, + "step": 1083 + }, + { + "epoch": 0.17802229384353252, + "grad_norm": 0.5305230978210335, + "learning_rate": 8.984987874207156e-06, + "loss": 0.6578, + "step": 1084 + }, + { + "epoch": 0.17818652105187527, + "grad_norm": 0.49163982241906173, + "learning_rate": 8.984949574919374e-06, + "loss": 0.67, + "step": 1085 + }, + { + "epoch": 0.17835074826021802, + "grad_norm": 0.48484339846155095, + "learning_rate": 8.984911226920629e-06, + "loss": 0.6795, + "step": 1086 + }, + { + "epoch": 0.17851497546856077, + "grad_norm": 0.4746707284972163, + "learning_rate": 8.984872830211335e-06, + "loss": 0.6693, + "step": 1087 + }, + { + "epoch": 0.1786792026769035, + "grad_norm": 0.4815131650072882, + "learning_rate": 8.984834384791908e-06, + "loss": 0.6627, + "step": 1088 + }, + { + "epoch": 0.17884342988524624, + "grad_norm": 0.49934183398618404, + "learning_rate": 8.984795890662768e-06, + "loss": 0.6705, + "step": 1089 + }, + { + "epoch": 0.17900765709358898, + "grad_norm": 0.46901602497900924, + "learning_rate": 8.984757347824334e-06, + "loss": 0.6822, + "step": 1090 + }, + { + "epoch": 0.17917188430193173, + "grad_norm": 0.4841196036749678, + "learning_rate": 8.984718756277019e-06, + "loss": 0.6612, + "step": 1091 + }, + { + "epoch": 0.17933611151027445, + "grad_norm": 0.495840243932423, + "learning_rate": 8.984680116021248e-06, + "loss": 0.6459, + "step": 1092 + }, + { + "epoch": 0.1795003387186172, + "grad_norm": 0.4745846787795327, + "learning_rate": 8.98464142705744e-06, + "loss": 0.663, + "step": 1093 + }, + { + "epoch": 0.17966456592695995, + "grad_norm": 0.4880415845630486, + "learning_rate": 8.984602689386013e-06, + "loss": 0.6527, + "step": 1094 + }, + { + "epoch": 0.1798287931353027, + "grad_norm": 0.48962473221363556, + "learning_rate": 8.984563903007389e-06, + "loss": 0.6714, + "step": 1095 + }, + { + "epoch": 0.17999302034364542, + "grad_norm": 0.483069837873451, + "learning_rate": 8.984525067921987e-06, + "loss": 0.6776, + "step": 1096 + }, + { + "epoch": 0.18015724755198817, + "grad_norm": 0.4767229218630196, + "learning_rate": 8.984486184130231e-06, + "loss": 0.6615, + "step": 1097 + }, + { + "epoch": 0.18032147476033092, + "grad_norm": 0.5064791581734579, + "learning_rate": 8.984447251632543e-06, + "loss": 0.6765, + "step": 1098 + }, + { + "epoch": 0.18048570196867367, + "grad_norm": 0.5345397900086212, + "learning_rate": 8.984408270429348e-06, + "loss": 0.6513, + "step": 1099 + }, + { + "epoch": 0.1806499291770164, + "grad_norm": 0.5296402491323332, + "learning_rate": 8.984369240521063e-06, + "loss": 0.6468, + "step": 1100 + }, + { + "epoch": 0.18081415638535914, + "grad_norm": 0.46787695321689876, + "learning_rate": 8.984330161908119e-06, + "loss": 0.6772, + "step": 1101 + }, + { + "epoch": 0.18097838359370189, + "grad_norm": 0.48925568695329985, + "learning_rate": 8.984291034590937e-06, + "loss": 0.6857, + "step": 1102 + }, + { + "epoch": 0.18114261080204463, + "grad_norm": 0.4790066192561788, + "learning_rate": 8.984251858569943e-06, + "loss": 0.6709, + "step": 1103 + }, + { + "epoch": 0.18130683801038738, + "grad_norm": 0.469440234272998, + "learning_rate": 8.98421263384556e-06, + "loss": 0.6627, + "step": 1104 + }, + { + "epoch": 0.1814710652187301, + "grad_norm": 0.49923220336592083, + "learning_rate": 8.984173360418219e-06, + "loss": 0.6747, + "step": 1105 + }, + { + "epoch": 0.18163529242707285, + "grad_norm": 0.49630959834059857, + "learning_rate": 8.98413403828834e-06, + "loss": 0.6774, + "step": 1106 + }, + { + "epoch": 0.1817995196354156, + "grad_norm": 0.4766594175572563, + "learning_rate": 8.984094667456355e-06, + "loss": 0.6526, + "step": 1107 + }, + { + "epoch": 0.18196374684375835, + "grad_norm": 0.47797237085992367, + "learning_rate": 8.98405524792269e-06, + "loss": 0.6377, + "step": 1108 + }, + { + "epoch": 0.18212797405210107, + "grad_norm": 0.6046432859375188, + "learning_rate": 8.984015779687773e-06, + "loss": 0.6396, + "step": 1109 + }, + { + "epoch": 0.18229220126044382, + "grad_norm": 0.479492420455775, + "learning_rate": 8.983976262752034e-06, + "loss": 0.6621, + "step": 1110 + }, + { + "epoch": 0.18245642846878657, + "grad_norm": 0.4843117988504339, + "learning_rate": 8.9839366971159e-06, + "loss": 0.6606, + "step": 1111 + }, + { + "epoch": 0.18262065567712932, + "grad_norm": 0.487141594800104, + "learning_rate": 8.983897082779804e-06, + "loss": 0.6644, + "step": 1112 + }, + { + "epoch": 0.18278488288547204, + "grad_norm": 0.4785509249745807, + "learning_rate": 8.983857419744173e-06, + "loss": 0.6592, + "step": 1113 + }, + { + "epoch": 0.1829491100938148, + "grad_norm": 0.4779896385199476, + "learning_rate": 8.983817708009438e-06, + "loss": 0.6618, + "step": 1114 + }, + { + "epoch": 0.18311333730215754, + "grad_norm": 0.49850559987657705, + "learning_rate": 8.983777947576032e-06, + "loss": 0.6579, + "step": 1115 + }, + { + "epoch": 0.18327756451050028, + "grad_norm": 0.4635978888264068, + "learning_rate": 8.983738138444387e-06, + "loss": 0.6794, + "step": 1116 + }, + { + "epoch": 0.183441791718843, + "grad_norm": 0.46979893505545445, + "learning_rate": 8.98369828061493e-06, + "loss": 0.6773, + "step": 1117 + }, + { + "epoch": 0.18360601892718575, + "grad_norm": 0.5249699786325431, + "learning_rate": 8.983658374088103e-06, + "loss": 0.66, + "step": 1118 + }, + { + "epoch": 0.1837702461355285, + "grad_norm": 0.6266089286333181, + "learning_rate": 8.983618418864334e-06, + "loss": 0.6625, + "step": 1119 + }, + { + "epoch": 0.18393447334387125, + "grad_norm": 0.47135356277133833, + "learning_rate": 8.983578414944056e-06, + "loss": 0.6742, + "step": 1120 + }, + { + "epoch": 0.184098700552214, + "grad_norm": 0.4939184853353077, + "learning_rate": 8.983538362327707e-06, + "loss": 0.661, + "step": 1121 + }, + { + "epoch": 0.18426292776055672, + "grad_norm": 0.5556973801833033, + "learning_rate": 8.98349826101572e-06, + "loss": 0.6765, + "step": 1122 + }, + { + "epoch": 0.18442715496889947, + "grad_norm": 0.59081209705095, + "learning_rate": 8.983458111008528e-06, + "loss": 0.6667, + "step": 1123 + }, + { + "epoch": 0.18459138217724222, + "grad_norm": 0.6127369186330817, + "learning_rate": 8.983417912306573e-06, + "loss": 0.6463, + "step": 1124 + }, + { + "epoch": 0.18475560938558497, + "grad_norm": 0.4725990782989587, + "learning_rate": 8.983377664910287e-06, + "loss": 0.6547, + "step": 1125 + }, + { + "epoch": 0.1849198365939277, + "grad_norm": 0.459266379649527, + "learning_rate": 8.98333736882011e-06, + "loss": 0.6441, + "step": 1126 + }, + { + "epoch": 0.18508406380227044, + "grad_norm": 0.5021589556181871, + "learning_rate": 8.983297024036475e-06, + "loss": 0.6618, + "step": 1127 + }, + { + "epoch": 0.1852482910106132, + "grad_norm": 0.4731458077349534, + "learning_rate": 8.983256630559826e-06, + "loss": 0.6343, + "step": 1128 + }, + { + "epoch": 0.18541251821895594, + "grad_norm": 0.46076652722107214, + "learning_rate": 8.983216188390598e-06, + "loss": 0.6571, + "step": 1129 + }, + { + "epoch": 0.18557674542729866, + "grad_norm": 0.4729510638590709, + "learning_rate": 8.98317569752923e-06, + "loss": 0.6551, + "step": 1130 + }, + { + "epoch": 0.1857409726356414, + "grad_norm": 0.4672436396771921, + "learning_rate": 8.983135157976166e-06, + "loss": 0.6526, + "step": 1131 + }, + { + "epoch": 0.18590519984398415, + "grad_norm": 0.5362518055595471, + "learning_rate": 8.983094569731842e-06, + "loss": 0.6495, + "step": 1132 + }, + { + "epoch": 0.1860694270523269, + "grad_norm": 0.4974174360111951, + "learning_rate": 8.9830539327967e-06, + "loss": 0.6796, + "step": 1133 + }, + { + "epoch": 0.18623365426066962, + "grad_norm": 0.5038650088289387, + "learning_rate": 8.983013247171182e-06, + "loss": 0.6395, + "step": 1134 + }, + { + "epoch": 0.18639788146901237, + "grad_norm": 0.46062899373893684, + "learning_rate": 8.98297251285573e-06, + "loss": 0.692, + "step": 1135 + }, + { + "epoch": 0.18656210867735512, + "grad_norm": 0.5124254676856769, + "learning_rate": 8.982931729850786e-06, + "loss": 0.6724, + "step": 1136 + }, + { + "epoch": 0.18672633588569787, + "grad_norm": 0.4507316596529217, + "learning_rate": 8.98289089815679e-06, + "loss": 0.6534, + "step": 1137 + }, + { + "epoch": 0.18689056309404062, + "grad_norm": 0.5255609623079383, + "learning_rate": 8.98285001777419e-06, + "loss": 0.6297, + "step": 1138 + }, + { + "epoch": 0.18705479030238334, + "grad_norm": 0.4753157263680939, + "learning_rate": 8.98280908870343e-06, + "loss": 0.6602, + "step": 1139 + }, + { + "epoch": 0.1872190175107261, + "grad_norm": 0.4699605583978582, + "learning_rate": 8.98276811094495e-06, + "loss": 0.6736, + "step": 1140 + }, + { + "epoch": 0.18738324471906884, + "grad_norm": 0.46174823365599904, + "learning_rate": 8.9827270844992e-06, + "loss": 0.6678, + "step": 1141 + }, + { + "epoch": 0.18754747192741159, + "grad_norm": 0.46428061282059035, + "learning_rate": 8.982686009366622e-06, + "loss": 0.6789, + "step": 1142 + }, + { + "epoch": 0.1877116991357543, + "grad_norm": 0.4699014887684446, + "learning_rate": 8.982644885547666e-06, + "loss": 0.6732, + "step": 1143 + }, + { + "epoch": 0.18787592634409706, + "grad_norm": 0.4811773724914709, + "learning_rate": 8.982603713042773e-06, + "loss": 0.66, + "step": 1144 + }, + { + "epoch": 0.1880401535524398, + "grad_norm": 0.45321971267257205, + "learning_rate": 8.982562491852394e-06, + "loss": 0.6625, + "step": 1145 + }, + { + "epoch": 0.18820438076078255, + "grad_norm": 0.6330573920154635, + "learning_rate": 8.982521221976978e-06, + "loss": 0.6692, + "step": 1146 + }, + { + "epoch": 0.18836860796912527, + "grad_norm": 0.45518725569387647, + "learning_rate": 8.98247990341697e-06, + "loss": 0.6435, + "step": 1147 + }, + { + "epoch": 0.18853283517746802, + "grad_norm": 0.47309733075163013, + "learning_rate": 8.982438536172819e-06, + "loss": 0.6296, + "step": 1148 + }, + { + "epoch": 0.18869706238581077, + "grad_norm": 0.475995730507989, + "learning_rate": 8.982397120244977e-06, + "loss": 0.6272, + "step": 1149 + }, + { + "epoch": 0.18886128959415352, + "grad_norm": 0.4547023805476552, + "learning_rate": 8.982355655633892e-06, + "loss": 0.6477, + "step": 1150 + }, + { + "epoch": 0.18902551680249624, + "grad_norm": 0.4725682510670995, + "learning_rate": 8.982314142340014e-06, + "loss": 0.6653, + "step": 1151 + }, + { + "epoch": 0.189189744010839, + "grad_norm": 0.5627057072719744, + "learning_rate": 8.982272580363796e-06, + "loss": 0.6545, + "step": 1152 + }, + { + "epoch": 0.18935397121918174, + "grad_norm": 0.5006530922999425, + "learning_rate": 8.982230969705685e-06, + "loss": 0.6545, + "step": 1153 + }, + { + "epoch": 0.1895181984275245, + "grad_norm": 0.4547374521034954, + "learning_rate": 8.982189310366138e-06, + "loss": 0.6717, + "step": 1154 + }, + { + "epoch": 0.18968242563586724, + "grad_norm": 0.445107036433399, + "learning_rate": 8.982147602345605e-06, + "loss": 0.661, + "step": 1155 + }, + { + "epoch": 0.18984665284420996, + "grad_norm": 0.4561286392537167, + "learning_rate": 8.982105845644539e-06, + "loss": 0.6447, + "step": 1156 + }, + { + "epoch": 0.1900108800525527, + "grad_norm": 0.5216727923195004, + "learning_rate": 8.982064040263394e-06, + "loss": 0.6586, + "step": 1157 + }, + { + "epoch": 0.19017510726089545, + "grad_norm": 0.45996861833590036, + "learning_rate": 8.982022186202623e-06, + "loss": 0.6618, + "step": 1158 + }, + { + "epoch": 0.1903393344692382, + "grad_norm": 0.44939371658844723, + "learning_rate": 8.981980283462681e-06, + "loss": 0.6705, + "step": 1159 + }, + { + "epoch": 0.19050356167758092, + "grad_norm": 0.48425712732679077, + "learning_rate": 8.981938332044024e-06, + "loss": 0.6684, + "step": 1160 + }, + { + "epoch": 0.19066778888592367, + "grad_norm": 0.4645137102326744, + "learning_rate": 8.981896331947108e-06, + "loss": 0.6696, + "step": 1161 + }, + { + "epoch": 0.19083201609426642, + "grad_norm": 0.4695085651143102, + "learning_rate": 8.981854283172386e-06, + "loss": 0.6632, + "step": 1162 + }, + { + "epoch": 0.19099624330260917, + "grad_norm": 0.45627726789169143, + "learning_rate": 8.981812185720319e-06, + "loss": 0.6774, + "step": 1163 + }, + { + "epoch": 0.1911604705109519, + "grad_norm": 0.4508546900179419, + "learning_rate": 8.98177003959136e-06, + "loss": 0.6495, + "step": 1164 + }, + { + "epoch": 0.19132469771929464, + "grad_norm": 0.4676794625199336, + "learning_rate": 8.981727844785972e-06, + "loss": 0.6505, + "step": 1165 + }, + { + "epoch": 0.1914889249276374, + "grad_norm": 0.46561161225429254, + "learning_rate": 8.981685601304608e-06, + "loss": 0.639, + "step": 1166 + }, + { + "epoch": 0.19165315213598014, + "grad_norm": 0.46003161609163923, + "learning_rate": 8.98164330914773e-06, + "loss": 0.659, + "step": 1167 + }, + { + "epoch": 0.19181737934432286, + "grad_norm": 0.4750628926671919, + "learning_rate": 8.981600968315796e-06, + "loss": 0.6494, + "step": 1168 + }, + { + "epoch": 0.1919816065526656, + "grad_norm": 0.4766948623132289, + "learning_rate": 8.981558578809265e-06, + "loss": 0.6408, + "step": 1169 + }, + { + "epoch": 0.19214583376100836, + "grad_norm": 0.5949849501521395, + "learning_rate": 8.9815161406286e-06, + "loss": 0.6377, + "step": 1170 + }, + { + "epoch": 0.1923100609693511, + "grad_norm": 0.4315864051018545, + "learning_rate": 8.98147365377426e-06, + "loss": 0.657, + "step": 1171 + }, + { + "epoch": 0.19247428817769385, + "grad_norm": 0.48620239608886057, + "learning_rate": 8.981431118246707e-06, + "loss": 0.6484, + "step": 1172 + }, + { + "epoch": 0.19263851538603657, + "grad_norm": 0.48009521513705844, + "learning_rate": 8.981388534046403e-06, + "loss": 0.661, + "step": 1173 + }, + { + "epoch": 0.19280274259437932, + "grad_norm": 0.46661786912772535, + "learning_rate": 8.981345901173812e-06, + "loss": 0.6553, + "step": 1174 + }, + { + "epoch": 0.19296696980272207, + "grad_norm": 0.4670500707072574, + "learning_rate": 8.981303219629392e-06, + "loss": 0.6443, + "step": 1175 + }, + { + "epoch": 0.19313119701106482, + "grad_norm": 0.4409917364955242, + "learning_rate": 8.981260489413613e-06, + "loss": 0.6619, + "step": 1176 + }, + { + "epoch": 0.19329542421940754, + "grad_norm": 0.4711657126739569, + "learning_rate": 8.981217710526935e-06, + "loss": 0.635, + "step": 1177 + }, + { + "epoch": 0.1934596514277503, + "grad_norm": 0.5696321006657865, + "learning_rate": 8.981174882969823e-06, + "loss": 0.6625, + "step": 1178 + }, + { + "epoch": 0.19362387863609304, + "grad_norm": 0.46026048794687807, + "learning_rate": 8.981132006742745e-06, + "loss": 0.6392, + "step": 1179 + }, + { + "epoch": 0.1937881058444358, + "grad_norm": 0.44618034160175857, + "learning_rate": 8.981089081846164e-06, + "loss": 0.6793, + "step": 1180 + }, + { + "epoch": 0.1939523330527785, + "grad_norm": 0.466485326789076, + "learning_rate": 8.981046108280545e-06, + "loss": 0.6483, + "step": 1181 + }, + { + "epoch": 0.19411656026112126, + "grad_norm": 0.48215706406724307, + "learning_rate": 8.981003086046358e-06, + "loss": 0.6493, + "step": 1182 + }, + { + "epoch": 0.194280787469464, + "grad_norm": 0.44683277849330777, + "learning_rate": 8.980960015144068e-06, + "loss": 0.6471, + "step": 1183 + }, + { + "epoch": 0.19444501467780675, + "grad_norm": 0.4492007111991035, + "learning_rate": 8.980916895574143e-06, + "loss": 0.6608, + "step": 1184 + }, + { + "epoch": 0.19460924188614948, + "grad_norm": 0.43166191142444144, + "learning_rate": 8.980873727337053e-06, + "loss": 0.6674, + "step": 1185 + }, + { + "epoch": 0.19477346909449222, + "grad_norm": 0.43998146007104594, + "learning_rate": 8.980830510433266e-06, + "loss": 0.6579, + "step": 1186 + }, + { + "epoch": 0.19493769630283497, + "grad_norm": 0.48354840890327583, + "learning_rate": 8.98078724486325e-06, + "loss": 0.6475, + "step": 1187 + }, + { + "epoch": 0.19510192351117772, + "grad_norm": 0.48479443389772425, + "learning_rate": 8.980743930627477e-06, + "loss": 0.6576, + "step": 1188 + }, + { + "epoch": 0.19526615071952047, + "grad_norm": 0.46889288135317325, + "learning_rate": 8.980700567726415e-06, + "loss": 0.6674, + "step": 1189 + }, + { + "epoch": 0.1954303779278632, + "grad_norm": 0.4363016948383347, + "learning_rate": 8.980657156160538e-06, + "loss": 0.6426, + "step": 1190 + }, + { + "epoch": 0.19559460513620594, + "grad_norm": 0.4856980442738892, + "learning_rate": 8.980613695930315e-06, + "loss": 0.6763, + "step": 1191 + }, + { + "epoch": 0.1957588323445487, + "grad_norm": 0.43424616977986746, + "learning_rate": 8.98057018703622e-06, + "loss": 0.6549, + "step": 1192 + }, + { + "epoch": 0.19592305955289144, + "grad_norm": 0.4315225641477105, + "learning_rate": 8.980526629478724e-06, + "loss": 0.633, + "step": 1193 + }, + { + "epoch": 0.19608728676123416, + "grad_norm": 0.44353238665548256, + "learning_rate": 8.9804830232583e-06, + "loss": 0.6314, + "step": 1194 + }, + { + "epoch": 0.1962515139695769, + "grad_norm": 0.42772453875967764, + "learning_rate": 8.980439368375423e-06, + "loss": 0.6608, + "step": 1195 + }, + { + "epoch": 0.19641574117791966, + "grad_norm": 0.47642585813722443, + "learning_rate": 8.980395664830566e-06, + "loss": 0.6469, + "step": 1196 + }, + { + "epoch": 0.1965799683862624, + "grad_norm": 0.4585709399028893, + "learning_rate": 8.980351912624204e-06, + "loss": 0.6409, + "step": 1197 + }, + { + "epoch": 0.19674419559460513, + "grad_norm": 0.4683407067017671, + "learning_rate": 8.980308111756812e-06, + "loss": 0.6343, + "step": 1198 + }, + { + "epoch": 0.19690842280294787, + "grad_norm": 0.45598990783869575, + "learning_rate": 8.980264262228865e-06, + "loss": 0.6723, + "step": 1199 + }, + { + "epoch": 0.19707265001129062, + "grad_norm": 0.4441138325488558, + "learning_rate": 8.980220364040843e-06, + "loss": 0.642, + "step": 1200 + }, + { + "epoch": 0.19723687721963337, + "grad_norm": 0.4372372641941166, + "learning_rate": 8.980176417193217e-06, + "loss": 0.6613, + "step": 1201 + }, + { + "epoch": 0.1974011044279761, + "grad_norm": 0.43527778983518434, + "learning_rate": 8.980132421686467e-06, + "loss": 0.6413, + "step": 1202 + }, + { + "epoch": 0.19756533163631884, + "grad_norm": 0.4318187764560961, + "learning_rate": 8.980088377521073e-06, + "loss": 0.6488, + "step": 1203 + }, + { + "epoch": 0.1977295588446616, + "grad_norm": 0.45481144116691763, + "learning_rate": 8.98004428469751e-06, + "loss": 0.6527, + "step": 1204 + }, + { + "epoch": 0.19789378605300434, + "grad_norm": 0.5522025822262004, + "learning_rate": 8.98000014321626e-06, + "loss": 0.6489, + "step": 1205 + }, + { + "epoch": 0.19805801326134706, + "grad_norm": 0.41859957738652587, + "learning_rate": 8.9799559530778e-06, + "loss": 0.6368, + "step": 1206 + }, + { + "epoch": 0.1982222404696898, + "grad_norm": 0.45326825381870833, + "learning_rate": 8.97991171428261e-06, + "loss": 0.6596, + "step": 1207 + }, + { + "epoch": 0.19838646767803256, + "grad_norm": 0.44391403918832606, + "learning_rate": 8.979867426831171e-06, + "loss": 0.6476, + "step": 1208 + }, + { + "epoch": 0.1985506948863753, + "grad_norm": 0.43255445491586364, + "learning_rate": 8.979823090723966e-06, + "loss": 0.6301, + "step": 1209 + }, + { + "epoch": 0.19871492209471806, + "grad_norm": 0.428228400413717, + "learning_rate": 8.979778705961471e-06, + "loss": 0.6779, + "step": 1210 + }, + { + "epoch": 0.19887914930306078, + "grad_norm": 0.461878401668105, + "learning_rate": 8.979734272544175e-06, + "loss": 0.648, + "step": 1211 + }, + { + "epoch": 0.19904337651140352, + "grad_norm": 0.4311866229558285, + "learning_rate": 8.979689790472556e-06, + "loss": 0.6456, + "step": 1212 + }, + { + "epoch": 0.19920760371974627, + "grad_norm": 0.41531054144954355, + "learning_rate": 8.979645259747098e-06, + "loss": 0.6407, + "step": 1213 + }, + { + "epoch": 0.19937183092808902, + "grad_norm": 0.4262317469216339, + "learning_rate": 8.979600680368286e-06, + "loss": 0.6243, + "step": 1214 + }, + { + "epoch": 0.19953605813643174, + "grad_norm": 0.4314545771479194, + "learning_rate": 8.979556052336605e-06, + "loss": 0.6493, + "step": 1215 + }, + { + "epoch": 0.1997002853447745, + "grad_norm": 0.44815070308467875, + "learning_rate": 8.979511375652535e-06, + "loss": 0.6438, + "step": 1216 + }, + { + "epoch": 0.19986451255311724, + "grad_norm": 0.4422403594628592, + "learning_rate": 8.979466650316565e-06, + "loss": 0.669, + "step": 1217 + }, + { + "epoch": 0.20002873976146, + "grad_norm": 0.42824265290355684, + "learning_rate": 8.97942187632918e-06, + "loss": 0.6579, + "step": 1218 + }, + { + "epoch": 0.2001929669698027, + "grad_norm": 0.543273528497163, + "learning_rate": 8.979377053690867e-06, + "loss": 0.6373, + "step": 1219 + }, + { + "epoch": 0.20035719417814546, + "grad_norm": 0.4230607481140885, + "learning_rate": 8.979332182402111e-06, + "loss": 0.6431, + "step": 1220 + }, + { + "epoch": 0.2005214213864882, + "grad_norm": 0.4362433490235076, + "learning_rate": 8.979287262463403e-06, + "loss": 0.6501, + "step": 1221 + }, + { + "epoch": 0.20068564859483096, + "grad_norm": 0.4451428401814509, + "learning_rate": 8.979242293875225e-06, + "loss": 0.6557, + "step": 1222 + }, + { + "epoch": 0.20084987580317368, + "grad_norm": 0.4352292485824094, + "learning_rate": 8.979197276638071e-06, + "loss": 0.6598, + "step": 1223 + }, + { + "epoch": 0.20101410301151643, + "grad_norm": 0.4261401748689992, + "learning_rate": 8.979152210752427e-06, + "loss": 0.6318, + "step": 1224 + }, + { + "epoch": 0.20117833021985917, + "grad_norm": 0.41345651752110774, + "learning_rate": 8.979107096218781e-06, + "loss": 0.6701, + "step": 1225 + }, + { + "epoch": 0.20134255742820192, + "grad_norm": 0.4257173704920539, + "learning_rate": 8.979061933037629e-06, + "loss": 0.6462, + "step": 1226 + }, + { + "epoch": 0.20150678463654467, + "grad_norm": 0.4402291803372949, + "learning_rate": 8.979016721209456e-06, + "loss": 0.6405, + "step": 1227 + }, + { + "epoch": 0.2016710118448874, + "grad_norm": 0.4149097427684321, + "learning_rate": 8.978971460734753e-06, + "loss": 0.6203, + "step": 1228 + }, + { + "epoch": 0.20183523905323014, + "grad_norm": 0.48308972389337906, + "learning_rate": 8.978926151614014e-06, + "loss": 0.6642, + "step": 1229 + }, + { + "epoch": 0.2019994662615729, + "grad_norm": 0.4331467017777629, + "learning_rate": 8.978880793847732e-06, + "loss": 0.6395, + "step": 1230 + }, + { + "epoch": 0.20216369346991564, + "grad_norm": 0.43266826431991046, + "learning_rate": 8.978835387436396e-06, + "loss": 0.6434, + "step": 1231 + }, + { + "epoch": 0.20232792067825836, + "grad_norm": 0.4460728101045431, + "learning_rate": 8.978789932380501e-06, + "loss": 0.6286, + "step": 1232 + }, + { + "epoch": 0.2024921478866011, + "grad_norm": 0.43293087715759737, + "learning_rate": 8.978744428680543e-06, + "loss": 0.6546, + "step": 1233 + }, + { + "epoch": 0.20265637509494386, + "grad_norm": 0.42924541630533913, + "learning_rate": 8.978698876337011e-06, + "loss": 0.6446, + "step": 1234 + }, + { + "epoch": 0.2028206023032866, + "grad_norm": 0.4151254920163802, + "learning_rate": 8.978653275350405e-06, + "loss": 0.6631, + "step": 1235 + }, + { + "epoch": 0.20298482951162933, + "grad_norm": 0.44360392632935175, + "learning_rate": 8.978607625721219e-06, + "loss": 0.6303, + "step": 1236 + }, + { + "epoch": 0.20314905671997208, + "grad_norm": 0.4277824314710478, + "learning_rate": 8.978561927449946e-06, + "loss": 0.6467, + "step": 1237 + }, + { + "epoch": 0.20331328392831483, + "grad_norm": 0.4366290288251897, + "learning_rate": 8.978516180537083e-06, + "loss": 0.6476, + "step": 1238 + }, + { + "epoch": 0.20347751113665757, + "grad_norm": 0.42651600201106177, + "learning_rate": 8.97847038498313e-06, + "loss": 0.6514, + "step": 1239 + }, + { + "epoch": 0.2036417383450003, + "grad_norm": 0.4251031339705655, + "learning_rate": 8.978424540788583e-06, + "loss": 0.6734, + "step": 1240 + }, + { + "epoch": 0.20380596555334304, + "grad_norm": 0.549156680303186, + "learning_rate": 8.978378647953937e-06, + "loss": 0.6291, + "step": 1241 + }, + { + "epoch": 0.2039701927616858, + "grad_norm": 0.4224197425754509, + "learning_rate": 8.978332706479694e-06, + "loss": 0.6516, + "step": 1242 + }, + { + "epoch": 0.20413441997002854, + "grad_norm": 0.42435648487983524, + "learning_rate": 8.978286716366352e-06, + "loss": 0.6401, + "step": 1243 + }, + { + "epoch": 0.2042986471783713, + "grad_norm": 0.4218890161029301, + "learning_rate": 8.97824067761441e-06, + "loss": 0.6332, + "step": 1244 + }, + { + "epoch": 0.204462874386714, + "grad_norm": 0.4598458015905371, + "learning_rate": 8.978194590224367e-06, + "loss": 0.636, + "step": 1245 + }, + { + "epoch": 0.20462710159505676, + "grad_norm": 0.4494394320075245, + "learning_rate": 8.978148454196728e-06, + "loss": 0.6628, + "step": 1246 + }, + { + "epoch": 0.2047913288033995, + "grad_norm": 0.45629173674361884, + "learning_rate": 8.978102269531988e-06, + "loss": 0.6434, + "step": 1247 + }, + { + "epoch": 0.20495555601174226, + "grad_norm": 0.42740668248808533, + "learning_rate": 8.978056036230651e-06, + "loss": 0.6408, + "step": 1248 + }, + { + "epoch": 0.20511978322008498, + "grad_norm": 0.45719887031605844, + "learning_rate": 8.978009754293221e-06, + "loss": 0.6407, + "step": 1249 + }, + { + "epoch": 0.20528401042842773, + "grad_norm": 0.4352265634714779, + "learning_rate": 8.9779634237202e-06, + "loss": 0.6371, + "step": 1250 + }, + { + "epoch": 0.20544823763677048, + "grad_norm": 0.42210549828374605, + "learning_rate": 8.97791704451209e-06, + "loss": 0.6408, + "step": 1251 + }, + { + "epoch": 0.20561246484511322, + "grad_norm": 0.40181174335862413, + "learning_rate": 8.977870616669395e-06, + "loss": 0.6494, + "step": 1252 + }, + { + "epoch": 0.20577669205345595, + "grad_norm": 0.40188184471409283, + "learning_rate": 8.97782414019262e-06, + "loss": 0.6593, + "step": 1253 + }, + { + "epoch": 0.2059409192617987, + "grad_norm": 0.410659768636049, + "learning_rate": 8.977777615082268e-06, + "loss": 0.6452, + "step": 1254 + }, + { + "epoch": 0.20610514647014144, + "grad_norm": 0.4235122270844502, + "learning_rate": 8.977731041338847e-06, + "loss": 0.6438, + "step": 1255 + }, + { + "epoch": 0.2062693736784842, + "grad_norm": 0.44846490809871864, + "learning_rate": 8.97768441896286e-06, + "loss": 0.6454, + "step": 1256 + }, + { + "epoch": 0.2064336008868269, + "grad_norm": 0.42416716591523856, + "learning_rate": 8.977637747954815e-06, + "loss": 0.6342, + "step": 1257 + }, + { + "epoch": 0.20659782809516966, + "grad_norm": 0.4198893083929536, + "learning_rate": 8.977591028315221e-06, + "loss": 0.6533, + "step": 1258 + }, + { + "epoch": 0.2067620553035124, + "grad_norm": 0.4455138571329966, + "learning_rate": 8.97754426004458e-06, + "loss": 0.6586, + "step": 1259 + }, + { + "epoch": 0.20692628251185516, + "grad_norm": 0.4142993504597112, + "learning_rate": 8.977497443143405e-06, + "loss": 0.6525, + "step": 1260 + }, + { + "epoch": 0.2070905097201979, + "grad_norm": 0.42824370833772935, + "learning_rate": 8.9774505776122e-06, + "loss": 0.6228, + "step": 1261 + }, + { + "epoch": 0.20725473692854063, + "grad_norm": 0.424782198551827, + "learning_rate": 8.977403663451478e-06, + "loss": 0.6432, + "step": 1262 + }, + { + "epoch": 0.20741896413688338, + "grad_norm": 0.41832663825833505, + "learning_rate": 8.977356700661749e-06, + "loss": 0.6567, + "step": 1263 + }, + { + "epoch": 0.20758319134522613, + "grad_norm": 0.45774058372764936, + "learning_rate": 8.977309689243519e-06, + "loss": 0.6594, + "step": 1264 + }, + { + "epoch": 0.20774741855356887, + "grad_norm": 0.4315936364515443, + "learning_rate": 8.9772626291973e-06, + "loss": 0.6348, + "step": 1265 + }, + { + "epoch": 0.2079116457619116, + "grad_norm": 0.4070764707847252, + "learning_rate": 8.977215520523605e-06, + "loss": 0.6427, + "step": 1266 + }, + { + "epoch": 0.20807587297025434, + "grad_norm": 0.4162334688331903, + "learning_rate": 8.977168363222944e-06, + "loss": 0.6605, + "step": 1267 + }, + { + "epoch": 0.2082401001785971, + "grad_norm": 0.4053266968546352, + "learning_rate": 8.977121157295831e-06, + "loss": 0.6579, + "step": 1268 + }, + { + "epoch": 0.20840432738693984, + "grad_norm": 0.43978146816965014, + "learning_rate": 8.977073902742775e-06, + "loss": 0.6334, + "step": 1269 + }, + { + "epoch": 0.20856855459528256, + "grad_norm": 0.40390746582339854, + "learning_rate": 8.977026599564294e-06, + "loss": 0.6526, + "step": 1270 + }, + { + "epoch": 0.2087327818036253, + "grad_norm": 0.3978119140838034, + "learning_rate": 8.976979247760898e-06, + "loss": 0.6379, + "step": 1271 + }, + { + "epoch": 0.20889700901196806, + "grad_norm": 0.400050972879593, + "learning_rate": 8.976931847333104e-06, + "loss": 0.634, + "step": 1272 + }, + { + "epoch": 0.2090612362203108, + "grad_norm": 0.43290124573046884, + "learning_rate": 8.976884398281424e-06, + "loss": 0.6635, + "step": 1273 + }, + { + "epoch": 0.20922546342865353, + "grad_norm": 0.4087546614663093, + "learning_rate": 8.976836900606375e-06, + "loss": 0.6506, + "step": 1274 + }, + { + "epoch": 0.20938969063699628, + "grad_norm": 0.3974013674929746, + "learning_rate": 8.976789354308471e-06, + "loss": 0.6425, + "step": 1275 + }, + { + "epoch": 0.20955391784533903, + "grad_norm": 0.519176836639908, + "learning_rate": 8.976741759388233e-06, + "loss": 0.6533, + "step": 1276 + }, + { + "epoch": 0.20971814505368178, + "grad_norm": 0.4212920607770516, + "learning_rate": 8.976694115846174e-06, + "loss": 0.6414, + "step": 1277 + }, + { + "epoch": 0.20988237226202452, + "grad_norm": 0.42102429097091515, + "learning_rate": 8.976646423682811e-06, + "loss": 0.6277, + "step": 1278 + }, + { + "epoch": 0.21004659947036725, + "grad_norm": 0.4265483499282997, + "learning_rate": 8.976598682898665e-06, + "loss": 0.6527, + "step": 1279 + }, + { + "epoch": 0.21021082667871, + "grad_norm": 0.44527848666988024, + "learning_rate": 8.976550893494252e-06, + "loss": 0.6275, + "step": 1280 + }, + { + "epoch": 0.21037505388705274, + "grad_norm": 0.4074471380026174, + "learning_rate": 8.976503055470093e-06, + "loss": 0.6325, + "step": 1281 + }, + { + "epoch": 0.2105392810953955, + "grad_norm": 0.42635054736052974, + "learning_rate": 8.976455168826705e-06, + "loss": 0.6404, + "step": 1282 + }, + { + "epoch": 0.2107035083037382, + "grad_norm": 0.5478927850514103, + "learning_rate": 8.97640723356461e-06, + "loss": 0.6493, + "step": 1283 + }, + { + "epoch": 0.21086773551208096, + "grad_norm": 0.4516746938788672, + "learning_rate": 8.976359249684329e-06, + "loss": 0.6431, + "step": 1284 + }, + { + "epoch": 0.2110319627204237, + "grad_norm": 0.4591186560356215, + "learning_rate": 8.976311217186384e-06, + "loss": 0.6636, + "step": 1285 + }, + { + "epoch": 0.21119618992876646, + "grad_norm": 0.42161671968749, + "learning_rate": 8.976263136071294e-06, + "loss": 0.6388, + "step": 1286 + }, + { + "epoch": 0.21136041713710918, + "grad_norm": 0.4294902286359102, + "learning_rate": 8.97621500633958e-06, + "loss": 0.6307, + "step": 1287 + }, + { + "epoch": 0.21152464434545193, + "grad_norm": 0.47729600920882603, + "learning_rate": 8.97616682799177e-06, + "loss": 0.664, + "step": 1288 + }, + { + "epoch": 0.21168887155379468, + "grad_norm": 0.40274636860562796, + "learning_rate": 8.976118601028382e-06, + "loss": 0.6676, + "step": 1289 + }, + { + "epoch": 0.21185309876213743, + "grad_norm": 0.41214219381854394, + "learning_rate": 8.976070325449942e-06, + "loss": 0.6405, + "step": 1290 + }, + { + "epoch": 0.21201732597048015, + "grad_norm": 0.41848751377730636, + "learning_rate": 8.976022001256977e-06, + "loss": 0.6483, + "step": 1291 + }, + { + "epoch": 0.2121815531788229, + "grad_norm": 0.42500778911619475, + "learning_rate": 8.975973628450006e-06, + "loss": 0.6406, + "step": 1292 + }, + { + "epoch": 0.21234578038716564, + "grad_norm": 0.4119091349723487, + "learning_rate": 8.97592520702956e-06, + "loss": 0.6458, + "step": 1293 + }, + { + "epoch": 0.2125100075955084, + "grad_norm": 0.44255627064099584, + "learning_rate": 8.975876736996163e-06, + "loss": 0.64, + "step": 1294 + }, + { + "epoch": 0.21267423480385111, + "grad_norm": 0.43848087034148364, + "learning_rate": 8.97582821835034e-06, + "loss": 0.6463, + "step": 1295 + }, + { + "epoch": 0.21283846201219386, + "grad_norm": 0.42192516605366814, + "learning_rate": 8.975779651092618e-06, + "loss": 0.6413, + "step": 1296 + }, + { + "epoch": 0.2130026892205366, + "grad_norm": 0.4323607525943481, + "learning_rate": 8.975731035223526e-06, + "loss": 0.6407, + "step": 1297 + }, + { + "epoch": 0.21316691642887936, + "grad_norm": 0.3968768309143591, + "learning_rate": 8.975682370743592e-06, + "loss": 0.626, + "step": 1298 + }, + { + "epoch": 0.2133311436372221, + "grad_norm": 0.40895837748550445, + "learning_rate": 8.975633657653344e-06, + "loss": 0.6436, + "step": 1299 + }, + { + "epoch": 0.21349537084556483, + "grad_norm": 0.4110881048866647, + "learning_rate": 8.97558489595331e-06, + "loss": 0.6084, + "step": 1300 + }, + { + "epoch": 0.21365959805390758, + "grad_norm": 0.7065067394013022, + "learning_rate": 8.975536085644022e-06, + "loss": 0.6485, + "step": 1301 + }, + { + "epoch": 0.21382382526225033, + "grad_norm": 0.5630818303342476, + "learning_rate": 8.975487226726007e-06, + "loss": 0.6415, + "step": 1302 + }, + { + "epoch": 0.21398805247059308, + "grad_norm": 0.42991690147388806, + "learning_rate": 8.975438319199798e-06, + "loss": 0.6244, + "step": 1303 + }, + { + "epoch": 0.2141522796789358, + "grad_norm": 0.3916322356920595, + "learning_rate": 8.975389363065928e-06, + "loss": 0.6474, + "step": 1304 + }, + { + "epoch": 0.21431650688727855, + "grad_norm": 0.4040633626617589, + "learning_rate": 8.975340358324925e-06, + "loss": 0.6482, + "step": 1305 + }, + { + "epoch": 0.2144807340956213, + "grad_norm": 0.39442006030905546, + "learning_rate": 8.97529130497732e-06, + "loss": 0.6329, + "step": 1306 + }, + { + "epoch": 0.21464496130396404, + "grad_norm": 0.628544271221126, + "learning_rate": 8.975242203023652e-06, + "loss": 0.6529, + "step": 1307 + }, + { + "epoch": 0.21480918851230676, + "grad_norm": 0.38401898520257327, + "learning_rate": 8.97519305246445e-06, + "loss": 0.6534, + "step": 1308 + }, + { + "epoch": 0.2149734157206495, + "grad_norm": 0.39648552275477433, + "learning_rate": 8.975143853300246e-06, + "loss": 0.6424, + "step": 1309 + }, + { + "epoch": 0.21513764292899226, + "grad_norm": 0.3998211133640743, + "learning_rate": 8.975094605531577e-06, + "loss": 0.6135, + "step": 1310 + }, + { + "epoch": 0.215301870137335, + "grad_norm": 0.41497948541509916, + "learning_rate": 8.975045309158978e-06, + "loss": 0.6564, + "step": 1311 + }, + { + "epoch": 0.21546609734567773, + "grad_norm": 0.4072266315384676, + "learning_rate": 8.974995964182987e-06, + "loss": 0.6239, + "step": 1312 + }, + { + "epoch": 0.21563032455402048, + "grad_norm": 0.4158843054553293, + "learning_rate": 8.974946570604135e-06, + "loss": 0.6197, + "step": 1313 + }, + { + "epoch": 0.21579455176236323, + "grad_norm": 0.42555213454131813, + "learning_rate": 8.97489712842296e-06, + "loss": 0.6357, + "step": 1314 + }, + { + "epoch": 0.21595877897070598, + "grad_norm": 0.4344847335024281, + "learning_rate": 8.97484763764e-06, + "loss": 0.6538, + "step": 1315 + }, + { + "epoch": 0.21612300617904873, + "grad_norm": 0.40231338795547006, + "learning_rate": 8.974798098255793e-06, + "loss": 0.6438, + "step": 1316 + }, + { + "epoch": 0.21628723338739145, + "grad_norm": 0.401018844856692, + "learning_rate": 8.974748510270874e-06, + "loss": 0.6219, + "step": 1317 + }, + { + "epoch": 0.2164514605957342, + "grad_norm": 0.3879562862518085, + "learning_rate": 8.974698873685786e-06, + "loss": 0.6432, + "step": 1318 + }, + { + "epoch": 0.21661568780407695, + "grad_norm": 0.3916001285671798, + "learning_rate": 8.974649188501065e-06, + "loss": 0.6266, + "step": 1319 + }, + { + "epoch": 0.2167799150124197, + "grad_norm": 0.43067968795936457, + "learning_rate": 8.974599454717248e-06, + "loss": 0.6507, + "step": 1320 + }, + { + "epoch": 0.21694414222076241, + "grad_norm": 0.3984580712524322, + "learning_rate": 8.974549672334883e-06, + "loss": 0.6419, + "step": 1321 + }, + { + "epoch": 0.21710836942910516, + "grad_norm": 0.46140477849198946, + "learning_rate": 8.974499841354504e-06, + "loss": 0.6416, + "step": 1322 + }, + { + "epoch": 0.2172725966374479, + "grad_norm": 0.459126770981474, + "learning_rate": 8.974449961776656e-06, + "loss": 0.6478, + "step": 1323 + }, + { + "epoch": 0.21743682384579066, + "grad_norm": 0.46092127260237753, + "learning_rate": 8.974400033601878e-06, + "loss": 0.6424, + "step": 1324 + }, + { + "epoch": 0.21760105105413338, + "grad_norm": 0.39361748300881266, + "learning_rate": 8.974350056830712e-06, + "loss": 0.6444, + "step": 1325 + }, + { + "epoch": 0.21776527826247613, + "grad_norm": 0.4046619648292167, + "learning_rate": 8.974300031463704e-06, + "loss": 0.6576, + "step": 1326 + }, + { + "epoch": 0.21792950547081888, + "grad_norm": 0.37804314723939025, + "learning_rate": 8.974249957501395e-06, + "loss": 0.6309, + "step": 1327 + }, + { + "epoch": 0.21809373267916163, + "grad_norm": 0.5167184663185815, + "learning_rate": 8.97419983494433e-06, + "loss": 0.643, + "step": 1328 + }, + { + "epoch": 0.21825795988750435, + "grad_norm": 0.37870371535857766, + "learning_rate": 8.974149663793053e-06, + "loss": 0.6322, + "step": 1329 + }, + { + "epoch": 0.2184221870958471, + "grad_norm": 0.39915021911898146, + "learning_rate": 8.974099444048108e-06, + "loss": 0.6457, + "step": 1330 + }, + { + "epoch": 0.21858641430418985, + "grad_norm": 0.4006828964084616, + "learning_rate": 8.97404917571004e-06, + "loss": 0.6247, + "step": 1331 + }, + { + "epoch": 0.2187506415125326, + "grad_norm": 0.40900509712567223, + "learning_rate": 8.973998858779397e-06, + "loss": 0.6388, + "step": 1332 + }, + { + "epoch": 0.21891486872087534, + "grad_norm": 0.454939516696798, + "learning_rate": 8.973948493256727e-06, + "loss": 0.629, + "step": 1333 + }, + { + "epoch": 0.21907909592921807, + "grad_norm": 0.37761379656488037, + "learning_rate": 8.973898079142573e-06, + "loss": 0.6359, + "step": 1334 + }, + { + "epoch": 0.2192433231375608, + "grad_norm": 0.4672563224075992, + "learning_rate": 8.973847616437483e-06, + "loss": 0.6387, + "step": 1335 + }, + { + "epoch": 0.21940755034590356, + "grad_norm": 0.3981227122118326, + "learning_rate": 8.973797105142005e-06, + "loss": 0.6317, + "step": 1336 + }, + { + "epoch": 0.2195717775542463, + "grad_norm": 0.45785924142754797, + "learning_rate": 8.973746545256692e-06, + "loss": 0.6275, + "step": 1337 + }, + { + "epoch": 0.21973600476258903, + "grad_norm": 0.40208418946268193, + "learning_rate": 8.973695936782088e-06, + "loss": 0.6649, + "step": 1338 + }, + { + "epoch": 0.21990023197093178, + "grad_norm": 0.4196465854058847, + "learning_rate": 8.973645279718746e-06, + "loss": 0.65, + "step": 1339 + }, + { + "epoch": 0.22006445917927453, + "grad_norm": 0.39709401064649014, + "learning_rate": 8.973594574067214e-06, + "loss": 0.6219, + "step": 1340 + }, + { + "epoch": 0.22022868638761728, + "grad_norm": 0.4317979361442743, + "learning_rate": 8.973543819828042e-06, + "loss": 0.6352, + "step": 1341 + }, + { + "epoch": 0.22039291359596, + "grad_norm": 0.3996012626147995, + "learning_rate": 8.973493017001785e-06, + "loss": 0.6504, + "step": 1342 + }, + { + "epoch": 0.22055714080430275, + "grad_norm": 0.401605897362349, + "learning_rate": 8.973442165588993e-06, + "loss": 0.6564, + "step": 1343 + }, + { + "epoch": 0.2207213680126455, + "grad_norm": 0.4211815055879324, + "learning_rate": 8.973391265590215e-06, + "loss": 0.6377, + "step": 1344 + }, + { + "epoch": 0.22088559522098825, + "grad_norm": 0.4147590208542743, + "learning_rate": 8.97334031700601e-06, + "loss": 0.6358, + "step": 1345 + }, + { + "epoch": 0.22104982242933097, + "grad_norm": 0.4041410755841317, + "learning_rate": 8.973289319836924e-06, + "loss": 0.6345, + "step": 1346 + }, + { + "epoch": 0.22121404963767372, + "grad_norm": 0.4419733735874853, + "learning_rate": 8.973238274083517e-06, + "loss": 0.6309, + "step": 1347 + }, + { + "epoch": 0.22137827684601646, + "grad_norm": 0.4438396420085765, + "learning_rate": 8.973187179746341e-06, + "loss": 0.6456, + "step": 1348 + }, + { + "epoch": 0.2215425040543592, + "grad_norm": 0.40548180198977596, + "learning_rate": 8.973136036825952e-06, + "loss": 0.6448, + "step": 1349 + }, + { + "epoch": 0.22170673126270196, + "grad_norm": 0.3776693195760448, + "learning_rate": 8.973084845322905e-06, + "loss": 0.6267, + "step": 1350 + }, + { + "epoch": 0.22187095847104468, + "grad_norm": 0.3914134341854077, + "learning_rate": 8.973033605237754e-06, + "loss": 0.6064, + "step": 1351 + }, + { + "epoch": 0.22203518567938743, + "grad_norm": 0.4008878109833888, + "learning_rate": 8.972982316571059e-06, + "loss": 0.6412, + "step": 1352 + }, + { + "epoch": 0.22219941288773018, + "grad_norm": 0.3947443304069219, + "learning_rate": 8.972930979323373e-06, + "loss": 0.6296, + "step": 1353 + }, + { + "epoch": 0.22236364009607293, + "grad_norm": 0.40428731395639134, + "learning_rate": 8.972879593495257e-06, + "loss": 0.6468, + "step": 1354 + }, + { + "epoch": 0.22252786730441565, + "grad_norm": 0.42559973840758325, + "learning_rate": 8.972828159087268e-06, + "loss": 0.636, + "step": 1355 + }, + { + "epoch": 0.2226920945127584, + "grad_norm": 0.38558193013481934, + "learning_rate": 8.972776676099965e-06, + "loss": 0.6267, + "step": 1356 + }, + { + "epoch": 0.22285632172110115, + "grad_norm": 0.3942742258512122, + "learning_rate": 8.972725144533905e-06, + "loss": 0.6095, + "step": 1357 + }, + { + "epoch": 0.2230205489294439, + "grad_norm": 0.39813829783312527, + "learning_rate": 8.972673564389651e-06, + "loss": 0.6206, + "step": 1358 + }, + { + "epoch": 0.22318477613778662, + "grad_norm": 0.424281039810817, + "learning_rate": 8.972621935667763e-06, + "loss": 0.6455, + "step": 1359 + }, + { + "epoch": 0.22334900334612937, + "grad_norm": 0.3908176395636074, + "learning_rate": 8.972570258368797e-06, + "loss": 0.6242, + "step": 1360 + }, + { + "epoch": 0.22351323055447211, + "grad_norm": 0.4052328537533125, + "learning_rate": 8.972518532493319e-06, + "loss": 0.6209, + "step": 1361 + }, + { + "epoch": 0.22367745776281486, + "grad_norm": 0.39202858379796474, + "learning_rate": 8.97246675804189e-06, + "loss": 0.6256, + "step": 1362 + }, + { + "epoch": 0.22384168497115758, + "grad_norm": 0.3800992157787325, + "learning_rate": 8.97241493501507e-06, + "loss": 0.6232, + "step": 1363 + }, + { + "epoch": 0.22400591217950033, + "grad_norm": 0.3889099579798512, + "learning_rate": 8.972363063413424e-06, + "loss": 0.642, + "step": 1364 + }, + { + "epoch": 0.22417013938784308, + "grad_norm": 0.43843337554692713, + "learning_rate": 8.972311143237516e-06, + "loss": 0.6309, + "step": 1365 + }, + { + "epoch": 0.22433436659618583, + "grad_norm": 0.38555050369799626, + "learning_rate": 8.972259174487908e-06, + "loss": 0.6175, + "step": 1366 + }, + { + "epoch": 0.22449859380452858, + "grad_norm": 0.3954594559012571, + "learning_rate": 8.972207157165167e-06, + "loss": 0.6332, + "step": 1367 + }, + { + "epoch": 0.2246628210128713, + "grad_norm": 0.397471975775458, + "learning_rate": 8.972155091269854e-06, + "loss": 0.6291, + "step": 1368 + }, + { + "epoch": 0.22482704822121405, + "grad_norm": 0.3954234981146683, + "learning_rate": 8.972102976802537e-06, + "loss": 0.6497, + "step": 1369 + }, + { + "epoch": 0.2249912754295568, + "grad_norm": 0.4398435558984372, + "learning_rate": 8.972050813763783e-06, + "loss": 0.6282, + "step": 1370 + }, + { + "epoch": 0.22515550263789955, + "grad_norm": 1.1371437397465023, + "learning_rate": 8.971998602154156e-06, + "loss": 0.6283, + "step": 1371 + }, + { + "epoch": 0.22531972984624227, + "grad_norm": 0.42734572284380834, + "learning_rate": 8.971946341974225e-06, + "loss": 0.6498, + "step": 1372 + }, + { + "epoch": 0.22548395705458502, + "grad_norm": 0.372006683716092, + "learning_rate": 8.971894033224556e-06, + "loss": 0.6317, + "step": 1373 + }, + { + "epoch": 0.22564818426292776, + "grad_norm": 0.5365999450195464, + "learning_rate": 8.97184167590572e-06, + "loss": 0.6398, + "step": 1374 + }, + { + "epoch": 0.2258124114712705, + "grad_norm": 0.4513109754424501, + "learning_rate": 8.971789270018282e-06, + "loss": 0.6516, + "step": 1375 + }, + { + "epoch": 0.22597663867961323, + "grad_norm": 0.3876777628138284, + "learning_rate": 8.971736815562813e-06, + "loss": 0.6239, + "step": 1376 + }, + { + "epoch": 0.22614086588795598, + "grad_norm": 0.39030172173320044, + "learning_rate": 8.971684312539884e-06, + "loss": 0.6362, + "step": 1377 + }, + { + "epoch": 0.22630509309629873, + "grad_norm": 0.5022741874530742, + "learning_rate": 8.971631760950062e-06, + "loss": 0.6391, + "step": 1378 + }, + { + "epoch": 0.22646932030464148, + "grad_norm": 0.3966665973762089, + "learning_rate": 8.971579160793921e-06, + "loss": 0.6279, + "step": 1379 + }, + { + "epoch": 0.2266335475129842, + "grad_norm": 0.46552566540421336, + "learning_rate": 8.971526512072028e-06, + "loss": 0.633, + "step": 1380 + }, + { + "epoch": 0.22679777472132695, + "grad_norm": 0.3887481021652706, + "learning_rate": 8.971473814784961e-06, + "loss": 0.6316, + "step": 1381 + }, + { + "epoch": 0.2269620019296697, + "grad_norm": 0.3981403897977426, + "learning_rate": 8.971421068933289e-06, + "loss": 0.6297, + "step": 1382 + }, + { + "epoch": 0.22712622913801245, + "grad_norm": 0.3760925511936663, + "learning_rate": 8.971368274517584e-06, + "loss": 0.6518, + "step": 1383 + }, + { + "epoch": 0.22729045634635517, + "grad_norm": 0.41634568057723575, + "learning_rate": 8.971315431538419e-06, + "loss": 0.6507, + "step": 1384 + }, + { + "epoch": 0.22745468355469792, + "grad_norm": 0.3767648106691863, + "learning_rate": 8.971262539996371e-06, + "loss": 0.6219, + "step": 1385 + }, + { + "epoch": 0.22761891076304067, + "grad_norm": 0.37280796908592456, + "learning_rate": 8.971209599892012e-06, + "loss": 0.63, + "step": 1386 + }, + { + "epoch": 0.22778313797138341, + "grad_norm": 0.39063293180757414, + "learning_rate": 8.971156611225918e-06, + "loss": 0.6354, + "step": 1387 + }, + { + "epoch": 0.22794736517972616, + "grad_norm": 0.4130871584775029, + "learning_rate": 8.971103573998664e-06, + "loss": 0.6215, + "step": 1388 + }, + { + "epoch": 0.22811159238806888, + "grad_norm": 0.3697250690243639, + "learning_rate": 8.971050488210827e-06, + "loss": 0.6509, + "step": 1389 + }, + { + "epoch": 0.22827581959641163, + "grad_norm": 0.40985315624852925, + "learning_rate": 8.97099735386298e-06, + "loss": 0.6223, + "step": 1390 + }, + { + "epoch": 0.22844004680475438, + "grad_norm": 0.41753388559467336, + "learning_rate": 8.970944170955705e-06, + "loss": 0.6308, + "step": 1391 + }, + { + "epoch": 0.22860427401309713, + "grad_norm": 0.37172418439885546, + "learning_rate": 8.970890939489577e-06, + "loss": 0.6378, + "step": 1392 + }, + { + "epoch": 0.22876850122143985, + "grad_norm": 0.3799761140290008, + "learning_rate": 8.970837659465175e-06, + "loss": 0.62, + "step": 1393 + }, + { + "epoch": 0.2289327284297826, + "grad_norm": 0.3637974123542842, + "learning_rate": 8.970784330883077e-06, + "loss": 0.6414, + "step": 1394 + }, + { + "epoch": 0.22909695563812535, + "grad_norm": 0.39145712168539487, + "learning_rate": 8.970730953743865e-06, + "loss": 0.6259, + "step": 1395 + }, + { + "epoch": 0.2292611828464681, + "grad_norm": 0.3747433458407329, + "learning_rate": 8.970677528048112e-06, + "loss": 0.629, + "step": 1396 + }, + { + "epoch": 0.22942541005481082, + "grad_norm": 0.40593368367010124, + "learning_rate": 8.970624053796405e-06, + "loss": 0.6545, + "step": 1397 + }, + { + "epoch": 0.22958963726315357, + "grad_norm": 0.3814870268904145, + "learning_rate": 8.970570530989322e-06, + "loss": 0.6356, + "step": 1398 + }, + { + "epoch": 0.22975386447149632, + "grad_norm": 0.37393431918398545, + "learning_rate": 8.970516959627445e-06, + "loss": 0.6149, + "step": 1399 + }, + { + "epoch": 0.22991809167983907, + "grad_norm": 0.4234768187758134, + "learning_rate": 8.970463339711354e-06, + "loss": 0.6213, + "step": 1400 + }, + { + "epoch": 0.23008231888818179, + "grad_norm": 0.39010381977649883, + "learning_rate": 8.970409671241635e-06, + "loss": 0.6239, + "step": 1401 + }, + { + "epoch": 0.23024654609652453, + "grad_norm": 0.4059667118236035, + "learning_rate": 8.970355954218866e-06, + "loss": 0.6457, + "step": 1402 + }, + { + "epoch": 0.23041077330486728, + "grad_norm": 0.36347685311176176, + "learning_rate": 8.970302188643634e-06, + "loss": 0.6366, + "step": 1403 + }, + { + "epoch": 0.23057500051321003, + "grad_norm": 0.4016516068154081, + "learning_rate": 8.970248374516523e-06, + "loss": 0.6155, + "step": 1404 + }, + { + "epoch": 0.23073922772155278, + "grad_norm": 0.3738165759888122, + "learning_rate": 8.970194511838116e-06, + "loss": 0.6297, + "step": 1405 + }, + { + "epoch": 0.2309034549298955, + "grad_norm": 0.38775548720178016, + "learning_rate": 8.970140600608998e-06, + "loss": 0.631, + "step": 1406 + }, + { + "epoch": 0.23106768213823825, + "grad_norm": 0.36651163646368323, + "learning_rate": 8.970086640829755e-06, + "loss": 0.6235, + "step": 1407 + }, + { + "epoch": 0.231231909346581, + "grad_norm": 0.388177161793234, + "learning_rate": 8.970032632500974e-06, + "loss": 0.6447, + "step": 1408 + }, + { + "epoch": 0.23139613655492375, + "grad_norm": 0.38342963166847494, + "learning_rate": 8.96997857562324e-06, + "loss": 0.6446, + "step": 1409 + }, + { + "epoch": 0.23156036376326647, + "grad_norm": 0.35965242348385945, + "learning_rate": 8.969924470197141e-06, + "loss": 0.6211, + "step": 1410 + }, + { + "epoch": 0.23172459097160922, + "grad_norm": 0.40581904246250583, + "learning_rate": 8.969870316223264e-06, + "loss": 0.6089, + "step": 1411 + }, + { + "epoch": 0.23188881817995197, + "grad_norm": 0.36406413748044436, + "learning_rate": 8.969816113702198e-06, + "loss": 0.6302, + "step": 1412 + }, + { + "epoch": 0.23205304538829472, + "grad_norm": 0.36016891310093596, + "learning_rate": 8.969761862634532e-06, + "loss": 0.6301, + "step": 1413 + }, + { + "epoch": 0.23221727259663744, + "grad_norm": 0.38497253061331427, + "learning_rate": 8.969707563020854e-06, + "loss": 0.6381, + "step": 1414 + }, + { + "epoch": 0.23238149980498018, + "grad_norm": 0.3728601291907777, + "learning_rate": 8.969653214861753e-06, + "loss": 0.6181, + "step": 1415 + }, + { + "epoch": 0.23254572701332293, + "grad_norm": 0.3667182768200828, + "learning_rate": 8.969598818157824e-06, + "loss": 0.6175, + "step": 1416 + }, + { + "epoch": 0.23270995422166568, + "grad_norm": 0.40991591837455477, + "learning_rate": 8.969544372909651e-06, + "loss": 0.6357, + "step": 1417 + }, + { + "epoch": 0.2328741814300084, + "grad_norm": 0.3830261589112887, + "learning_rate": 8.96948987911783e-06, + "loss": 0.6263, + "step": 1418 + }, + { + "epoch": 0.23303840863835115, + "grad_norm": 0.37342955461632715, + "learning_rate": 8.969435336782951e-06, + "loss": 0.651, + "step": 1419 + }, + { + "epoch": 0.2332026358466939, + "grad_norm": 0.38302602639530275, + "learning_rate": 8.969380745905607e-06, + "loss": 0.6147, + "step": 1420 + }, + { + "epoch": 0.23336686305503665, + "grad_norm": 0.3801283833275303, + "learning_rate": 8.969326106486392e-06, + "loss": 0.5939, + "step": 1421 + }, + { + "epoch": 0.2335310902633794, + "grad_norm": 0.3829133639241409, + "learning_rate": 8.969271418525897e-06, + "loss": 0.6029, + "step": 1422 + }, + { + "epoch": 0.23369531747172212, + "grad_norm": 0.37812529413156654, + "learning_rate": 8.969216682024718e-06, + "loss": 0.6403, + "step": 1423 + }, + { + "epoch": 0.23385954468006487, + "grad_norm": 0.37094348293191975, + "learning_rate": 8.969161896983448e-06, + "loss": 0.6171, + "step": 1424 + }, + { + "epoch": 0.23402377188840762, + "grad_norm": 0.35854811021498156, + "learning_rate": 8.969107063402682e-06, + "loss": 0.6213, + "step": 1425 + }, + { + "epoch": 0.23418799909675037, + "grad_norm": 0.3600943551683208, + "learning_rate": 8.969052181283017e-06, + "loss": 0.6146, + "step": 1426 + }, + { + "epoch": 0.2343522263050931, + "grad_norm": 0.37828891860958147, + "learning_rate": 8.968997250625048e-06, + "loss": 0.6205, + "step": 1427 + }, + { + "epoch": 0.23451645351343584, + "grad_norm": 0.3753629445499872, + "learning_rate": 8.968942271429375e-06, + "loss": 0.6267, + "step": 1428 + }, + { + "epoch": 0.23468068072177858, + "grad_norm": 0.35650819622201213, + "learning_rate": 8.968887243696589e-06, + "loss": 0.6282, + "step": 1429 + }, + { + "epoch": 0.23484490793012133, + "grad_norm": 0.38875352987788536, + "learning_rate": 8.96883216742729e-06, + "loss": 0.6156, + "step": 1430 + }, + { + "epoch": 0.23500913513846405, + "grad_norm": 0.3817434000607439, + "learning_rate": 8.96877704262208e-06, + "loss": 0.6388, + "step": 1431 + }, + { + "epoch": 0.2351733623468068, + "grad_norm": 0.3873016030270038, + "learning_rate": 8.968721869281552e-06, + "loss": 0.6163, + "step": 1432 + }, + { + "epoch": 0.23533758955514955, + "grad_norm": 0.4079351706434482, + "learning_rate": 8.96866664740631e-06, + "loss": 0.6485, + "step": 1433 + }, + { + "epoch": 0.2355018167634923, + "grad_norm": 0.38317014691327517, + "learning_rate": 8.968611376996949e-06, + "loss": 0.5968, + "step": 1434 + }, + { + "epoch": 0.23566604397183502, + "grad_norm": 0.3862892534452918, + "learning_rate": 8.968556058054075e-06, + "loss": 0.6228, + "step": 1435 + }, + { + "epoch": 0.23583027118017777, + "grad_norm": 0.3513593263777787, + "learning_rate": 8.968500690578285e-06, + "loss": 0.6188, + "step": 1436 + }, + { + "epoch": 0.23599449838852052, + "grad_norm": 0.36867544120784373, + "learning_rate": 8.968445274570179e-06, + "loss": 0.6263, + "step": 1437 + }, + { + "epoch": 0.23615872559686327, + "grad_norm": 0.36938216239379995, + "learning_rate": 8.968389810030362e-06, + "loss": 0.6164, + "step": 1438 + }, + { + "epoch": 0.23632295280520602, + "grad_norm": 0.3727449831108178, + "learning_rate": 8.968334296959436e-06, + "loss": 0.6085, + "step": 1439 + }, + { + "epoch": 0.23648718001354874, + "grad_norm": 0.3662499381596673, + "learning_rate": 8.968278735358003e-06, + "loss": 0.6023, + "step": 1440 + }, + { + "epoch": 0.23665140722189149, + "grad_norm": 0.40628718930364516, + "learning_rate": 8.968223125226667e-06, + "loss": 0.6171, + "step": 1441 + }, + { + "epoch": 0.23681563443023423, + "grad_norm": 0.37037082262637283, + "learning_rate": 8.96816746656603e-06, + "loss": 0.6368, + "step": 1442 + }, + { + "epoch": 0.23697986163857698, + "grad_norm": 0.3549645143515296, + "learning_rate": 8.968111759376699e-06, + "loss": 0.6334, + "step": 1443 + }, + { + "epoch": 0.2371440888469197, + "grad_norm": 0.3798618217383637, + "learning_rate": 8.96805600365928e-06, + "loss": 0.6191, + "step": 1444 + }, + { + "epoch": 0.23730831605526245, + "grad_norm": 0.36686801549322573, + "learning_rate": 8.968000199414376e-06, + "loss": 0.6356, + "step": 1445 + }, + { + "epoch": 0.2374725432636052, + "grad_norm": 0.35942437209207495, + "learning_rate": 8.967944346642592e-06, + "loss": 0.6144, + "step": 1446 + }, + { + "epoch": 0.23763677047194795, + "grad_norm": 0.36600431669189964, + "learning_rate": 8.96788844534454e-06, + "loss": 0.6198, + "step": 1447 + }, + { + "epoch": 0.23780099768029067, + "grad_norm": 0.367127385380099, + "learning_rate": 8.967832495520822e-06, + "loss": 0.6204, + "step": 1448 + }, + { + "epoch": 0.23796522488863342, + "grad_norm": 0.37739288988592845, + "learning_rate": 8.967776497172046e-06, + "loss": 0.6286, + "step": 1449 + }, + { + "epoch": 0.23812945209697617, + "grad_norm": 0.3694809371169283, + "learning_rate": 8.967720450298822e-06, + "loss": 0.6329, + "step": 1450 + }, + { + "epoch": 0.23829367930531892, + "grad_norm": 0.39873053139399506, + "learning_rate": 8.967664354901759e-06, + "loss": 0.607, + "step": 1451 + }, + { + "epoch": 0.23845790651366164, + "grad_norm": 0.37174093067137187, + "learning_rate": 8.967608210981466e-06, + "loss": 0.6297, + "step": 1452 + }, + { + "epoch": 0.2386221337220044, + "grad_norm": 0.3655257783754846, + "learning_rate": 8.967552018538552e-06, + "loss": 0.614, + "step": 1453 + }, + { + "epoch": 0.23878636093034714, + "grad_norm": 0.3486334444422297, + "learning_rate": 8.967495777573626e-06, + "loss": 0.612, + "step": 1454 + }, + { + "epoch": 0.23895058813868988, + "grad_norm": 0.38653414135810354, + "learning_rate": 8.967439488087303e-06, + "loss": 0.6373, + "step": 1455 + }, + { + "epoch": 0.23911481534703263, + "grad_norm": 0.4166657864417276, + "learning_rate": 8.967383150080191e-06, + "loss": 0.6172, + "step": 1456 + }, + { + "epoch": 0.23927904255537535, + "grad_norm": 0.36625495836800076, + "learning_rate": 8.967326763552901e-06, + "loss": 0.6105, + "step": 1457 + }, + { + "epoch": 0.2394432697637181, + "grad_norm": 0.35693092319100034, + "learning_rate": 8.96727032850605e-06, + "loss": 0.6361, + "step": 1458 + }, + { + "epoch": 0.23960749697206085, + "grad_norm": 0.4668329077344547, + "learning_rate": 8.967213844940246e-06, + "loss": 0.6507, + "step": 1459 + }, + { + "epoch": 0.2397717241804036, + "grad_norm": 0.36293669801834133, + "learning_rate": 8.967157312856105e-06, + "loss": 0.629, + "step": 1460 + }, + { + "epoch": 0.23993595138874632, + "grad_norm": 0.39226552096477374, + "learning_rate": 8.96710073225424e-06, + "loss": 0.6246, + "step": 1461 + }, + { + "epoch": 0.24010017859708907, + "grad_norm": 0.3767099803854531, + "learning_rate": 8.967044103135266e-06, + "loss": 0.6157, + "step": 1462 + }, + { + "epoch": 0.24026440580543182, + "grad_norm": 0.3738464526239834, + "learning_rate": 8.966987425499798e-06, + "loss": 0.6338, + "step": 1463 + }, + { + "epoch": 0.24042863301377457, + "grad_norm": 0.3924322055790276, + "learning_rate": 8.966930699348453e-06, + "loss": 0.6097, + "step": 1464 + }, + { + "epoch": 0.2405928602221173, + "grad_norm": 0.40128496672548014, + "learning_rate": 8.966873924681845e-06, + "loss": 0.6409, + "step": 1465 + }, + { + "epoch": 0.24075708743046004, + "grad_norm": 0.35113853348291707, + "learning_rate": 8.96681710150059e-06, + "loss": 0.6347, + "step": 1466 + }, + { + "epoch": 0.24092131463880279, + "grad_norm": 0.3820894843457969, + "learning_rate": 8.96676022980531e-06, + "loss": 0.6058, + "step": 1467 + }, + { + "epoch": 0.24108554184714553, + "grad_norm": 0.36199118035878114, + "learning_rate": 8.966703309596615e-06, + "loss": 0.6356, + "step": 1468 + }, + { + "epoch": 0.24124976905548826, + "grad_norm": 0.3540001836120606, + "learning_rate": 8.966646340875129e-06, + "loss": 0.619, + "step": 1469 + }, + { + "epoch": 0.241413996263831, + "grad_norm": 0.38920950337400373, + "learning_rate": 8.96658932364147e-06, + "loss": 0.6122, + "step": 1470 + }, + { + "epoch": 0.24157822347217375, + "grad_norm": 0.35862870218740506, + "learning_rate": 8.966532257896256e-06, + "loss": 0.5999, + "step": 1471 + }, + { + "epoch": 0.2417424506805165, + "grad_norm": 0.40346544693249303, + "learning_rate": 8.966475143640108e-06, + "loss": 0.6389, + "step": 1472 + }, + { + "epoch": 0.24190667788885925, + "grad_norm": 0.35802924018053983, + "learning_rate": 8.966417980873644e-06, + "loss": 0.656, + "step": 1473 + }, + { + "epoch": 0.24207090509720197, + "grad_norm": 0.35416797749163403, + "learning_rate": 8.966360769597487e-06, + "loss": 0.5871, + "step": 1474 + }, + { + "epoch": 0.24223513230554472, + "grad_norm": 0.38394310761980127, + "learning_rate": 8.966303509812259e-06, + "loss": 0.6278, + "step": 1475 + }, + { + "epoch": 0.24239935951388747, + "grad_norm": 0.37542608512409903, + "learning_rate": 8.966246201518577e-06, + "loss": 0.6451, + "step": 1476 + }, + { + "epoch": 0.24256358672223022, + "grad_norm": 0.50798389472191, + "learning_rate": 8.96618884471707e-06, + "loss": 0.6169, + "step": 1477 + }, + { + "epoch": 0.24272781393057294, + "grad_norm": 0.37174062893917387, + "learning_rate": 8.966131439408357e-06, + "loss": 0.6373, + "step": 1478 + }, + { + "epoch": 0.2428920411389157, + "grad_norm": 0.4209079011176253, + "learning_rate": 8.966073985593063e-06, + "loss": 0.6235, + "step": 1479 + }, + { + "epoch": 0.24305626834725844, + "grad_norm": 0.41783270170305736, + "learning_rate": 8.966016483271813e-06, + "loss": 0.6291, + "step": 1480 + }, + { + "epoch": 0.24322049555560118, + "grad_norm": 0.3578979030166597, + "learning_rate": 8.965958932445228e-06, + "loss": 0.6075, + "step": 1481 + }, + { + "epoch": 0.2433847227639439, + "grad_norm": 0.37588742363710526, + "learning_rate": 8.965901333113936e-06, + "loss": 0.6172, + "step": 1482 + }, + { + "epoch": 0.24354894997228665, + "grad_norm": 0.43019135611502246, + "learning_rate": 8.965843685278561e-06, + "loss": 0.6152, + "step": 1483 + }, + { + "epoch": 0.2437131771806294, + "grad_norm": 0.3713307328258515, + "learning_rate": 8.965785988939728e-06, + "loss": 0.633, + "step": 1484 + }, + { + "epoch": 0.24387740438897215, + "grad_norm": 0.38538363095385886, + "learning_rate": 8.96572824409807e-06, + "loss": 0.6103, + "step": 1485 + }, + { + "epoch": 0.24404163159731487, + "grad_norm": 0.42959307234492405, + "learning_rate": 8.965670450754205e-06, + "loss": 0.6159, + "step": 1486 + }, + { + "epoch": 0.24420585880565762, + "grad_norm": 0.3677298238888804, + "learning_rate": 8.965612608908767e-06, + "loss": 0.6242, + "step": 1487 + }, + { + "epoch": 0.24437008601400037, + "grad_norm": 0.3700124245084724, + "learning_rate": 8.965554718562383e-06, + "loss": 0.632, + "step": 1488 + }, + { + "epoch": 0.24453431322234312, + "grad_norm": 0.37162697278412393, + "learning_rate": 8.965496779715681e-06, + "loss": 0.6291, + "step": 1489 + }, + { + "epoch": 0.24469854043068584, + "grad_norm": 0.5932246129868997, + "learning_rate": 8.965438792369291e-06, + "loss": 0.6218, + "step": 1490 + }, + { + "epoch": 0.2448627676390286, + "grad_norm": 0.35796971024046653, + "learning_rate": 8.965380756523842e-06, + "loss": 0.6301, + "step": 1491 + }, + { + "epoch": 0.24502699484737134, + "grad_norm": 0.3888508026013573, + "learning_rate": 8.965322672179964e-06, + "loss": 0.607, + "step": 1492 + }, + { + "epoch": 0.2451912220557141, + "grad_norm": 0.3782937892080753, + "learning_rate": 8.96526453933829e-06, + "loss": 0.6456, + "step": 1493 + }, + { + "epoch": 0.24535544926405684, + "grad_norm": 0.3799106905153972, + "learning_rate": 8.965206357999449e-06, + "loss": 0.6294, + "step": 1494 + }, + { + "epoch": 0.24551967647239956, + "grad_norm": 0.39762851482762696, + "learning_rate": 8.965148128164074e-06, + "loss": 0.6399, + "step": 1495 + }, + { + "epoch": 0.2456839036807423, + "grad_norm": 0.4416226219612057, + "learning_rate": 8.965089849832796e-06, + "loss": 0.6251, + "step": 1496 + }, + { + "epoch": 0.24584813088908505, + "grad_norm": 0.3889843392209306, + "learning_rate": 8.96503152300625e-06, + "loss": 0.6497, + "step": 1497 + }, + { + "epoch": 0.2460123580974278, + "grad_norm": 0.35215446037387954, + "learning_rate": 8.964973147685069e-06, + "loss": 0.633, + "step": 1498 + }, + { + "epoch": 0.24617658530577052, + "grad_norm": 0.45650337654446665, + "learning_rate": 8.964914723869886e-06, + "loss": 0.6042, + "step": 1499 + }, + { + "epoch": 0.24634081251411327, + "grad_norm": 0.36286389615728926, + "learning_rate": 8.964856251561336e-06, + "loss": 0.6341, + "step": 1500 + }, + { + "epoch": 0.24650503972245602, + "grad_norm": 0.3593756064119613, + "learning_rate": 8.964797730760055e-06, + "loss": 0.6197, + "step": 1501 + }, + { + "epoch": 0.24666926693079877, + "grad_norm": 0.3771052035230812, + "learning_rate": 8.964739161466678e-06, + "loss": 0.6242, + "step": 1502 + }, + { + "epoch": 0.2468334941391415, + "grad_norm": 0.3605637264989002, + "learning_rate": 8.96468054368184e-06, + "loss": 0.6094, + "step": 1503 + }, + { + "epoch": 0.24699772134748424, + "grad_norm": 0.3626333911931281, + "learning_rate": 8.964621877406181e-06, + "loss": 0.6348, + "step": 1504 + }, + { + "epoch": 0.247161948555827, + "grad_norm": 0.38088499252044117, + "learning_rate": 8.964563162640334e-06, + "loss": 0.6268, + "step": 1505 + }, + { + "epoch": 0.24732617576416974, + "grad_norm": 0.36603008668404546, + "learning_rate": 8.964504399384938e-06, + "loss": 0.6265, + "step": 1506 + }, + { + "epoch": 0.24749040297251246, + "grad_norm": 0.3627778415298624, + "learning_rate": 8.964445587640633e-06, + "loss": 0.6557, + "step": 1507 + }, + { + "epoch": 0.2476546301808552, + "grad_norm": 0.3976038856166393, + "learning_rate": 8.964386727408055e-06, + "loss": 0.6295, + "step": 1508 + }, + { + "epoch": 0.24781885738919796, + "grad_norm": 0.35128896141586324, + "learning_rate": 8.964327818687847e-06, + "loss": 0.629, + "step": 1509 + }, + { + "epoch": 0.2479830845975407, + "grad_norm": 0.3408244422741637, + "learning_rate": 8.964268861480645e-06, + "loss": 0.614, + "step": 1510 + }, + { + "epoch": 0.24814731180588345, + "grad_norm": 0.35929482815511576, + "learning_rate": 8.964209855787091e-06, + "loss": 0.5934, + "step": 1511 + }, + { + "epoch": 0.24831153901422617, + "grad_norm": 0.46183852703188355, + "learning_rate": 8.964150801607825e-06, + "loss": 0.6054, + "step": 1512 + }, + { + "epoch": 0.24847576622256892, + "grad_norm": 0.37059564147077584, + "learning_rate": 8.96409169894349e-06, + "loss": 0.6184, + "step": 1513 + }, + { + "epoch": 0.24863999343091167, + "grad_norm": 0.3629530929389863, + "learning_rate": 8.964032547794728e-06, + "loss": 0.6242, + "step": 1514 + }, + { + "epoch": 0.24880422063925442, + "grad_norm": 0.36798673186434855, + "learning_rate": 8.96397334816218e-06, + "loss": 0.6034, + "step": 1515 + }, + { + "epoch": 0.24896844784759714, + "grad_norm": 0.3943059155021281, + "learning_rate": 8.963914100046489e-06, + "loss": 0.6471, + "step": 1516 + }, + { + "epoch": 0.2491326750559399, + "grad_norm": 0.4348402454752103, + "learning_rate": 8.963854803448301e-06, + "loss": 0.5869, + "step": 1517 + }, + { + "epoch": 0.24929690226428264, + "grad_norm": 0.400012642981447, + "learning_rate": 8.963795458368254e-06, + "loss": 0.6282, + "step": 1518 + }, + { + "epoch": 0.2494611294726254, + "grad_norm": 0.39046182025528064, + "learning_rate": 8.963736064807e-06, + "loss": 0.6466, + "step": 1519 + }, + { + "epoch": 0.2496253566809681, + "grad_norm": 0.35844392595007163, + "learning_rate": 8.963676622765179e-06, + "loss": 0.6256, + "step": 1520 + }, + { + "epoch": 0.24978958388931086, + "grad_norm": 0.35242439852874236, + "learning_rate": 8.963617132243439e-06, + "loss": 0.5967, + "step": 1521 + }, + { + "epoch": 0.2499538110976536, + "grad_norm": 0.35656035849610795, + "learning_rate": 8.963557593242424e-06, + "loss": 0.6198, + "step": 1522 + }, + { + "epoch": 0.2501180383059963, + "grad_norm": 0.38132355048447236, + "learning_rate": 8.963498005762783e-06, + "loss": 0.6431, + "step": 1523 + }, + { + "epoch": 0.2502822655143391, + "grad_norm": 0.3537836522540743, + "learning_rate": 8.963438369805163e-06, + "loss": 0.6209, + "step": 1524 + }, + { + "epoch": 0.2504464927226818, + "grad_norm": 0.3807854751881578, + "learning_rate": 8.963378685370209e-06, + "loss": 0.614, + "step": 1525 + }, + { + "epoch": 0.2506107199310246, + "grad_norm": 0.3506963314976677, + "learning_rate": 8.963318952458571e-06, + "loss": 0.607, + "step": 1526 + }, + { + "epoch": 0.2507749471393673, + "grad_norm": 0.3478461020759691, + "learning_rate": 8.9632591710709e-06, + "loss": 0.641, + "step": 1527 + }, + { + "epoch": 0.25093917434771007, + "grad_norm": 0.3530580403738584, + "learning_rate": 8.963199341207842e-06, + "loss": 0.6202, + "step": 1528 + }, + { + "epoch": 0.2511034015560528, + "grad_norm": 0.39033985727780796, + "learning_rate": 8.963139462870049e-06, + "loss": 0.6342, + "step": 1529 + }, + { + "epoch": 0.25126762876439557, + "grad_norm": 0.34915595980919556, + "learning_rate": 8.963079536058168e-06, + "loss": 0.6036, + "step": 1530 + }, + { + "epoch": 0.25143185597273826, + "grad_norm": 0.41289312093272673, + "learning_rate": 8.963019560772856e-06, + "loss": 0.6, + "step": 1531 + }, + { + "epoch": 0.251596083181081, + "grad_norm": 0.3579650538537075, + "learning_rate": 8.962959537014757e-06, + "loss": 0.6271, + "step": 1532 + }, + { + "epoch": 0.25176031038942376, + "grad_norm": 0.358747837059559, + "learning_rate": 8.962899464784528e-06, + "loss": 0.6079, + "step": 1533 + }, + { + "epoch": 0.2519245375977665, + "grad_norm": 0.3630698663213305, + "learning_rate": 8.962839344082818e-06, + "loss": 0.6221, + "step": 1534 + }, + { + "epoch": 0.25208876480610926, + "grad_norm": 0.3587173079907215, + "learning_rate": 8.962779174910283e-06, + "loss": 0.6049, + "step": 1535 + }, + { + "epoch": 0.252252992014452, + "grad_norm": 0.3627910673327825, + "learning_rate": 8.962718957267576e-06, + "loss": 0.627, + "step": 1536 + }, + { + "epoch": 0.25241721922279475, + "grad_norm": 0.3543705245438461, + "learning_rate": 8.962658691155351e-06, + "loss": 0.6004, + "step": 1537 + }, + { + "epoch": 0.2525814464311375, + "grad_norm": 0.36309273648911605, + "learning_rate": 8.96259837657426e-06, + "loss": 0.6173, + "step": 1538 + }, + { + "epoch": 0.2527456736394802, + "grad_norm": 0.3777501728160246, + "learning_rate": 8.962538013524963e-06, + "loss": 0.5753, + "step": 1539 + }, + { + "epoch": 0.25290990084782294, + "grad_norm": 0.3592769791035358, + "learning_rate": 8.96247760200811e-06, + "loss": 0.6148, + "step": 1540 + }, + { + "epoch": 0.2530741280561657, + "grad_norm": 0.36539569711437503, + "learning_rate": 8.96241714202436e-06, + "loss": 0.636, + "step": 1541 + }, + { + "epoch": 0.25323835526450844, + "grad_norm": 0.3581273145789663, + "learning_rate": 8.962356633574368e-06, + "loss": 0.5825, + "step": 1542 + }, + { + "epoch": 0.2534025824728512, + "grad_norm": 0.4135080963610115, + "learning_rate": 8.962296076658795e-06, + "loss": 0.6252, + "step": 1543 + }, + { + "epoch": 0.25356680968119394, + "grad_norm": 0.33935214150807924, + "learning_rate": 8.962235471278298e-06, + "loss": 0.6465, + "step": 1544 + }, + { + "epoch": 0.2537310368895367, + "grad_norm": 0.4749931745427001, + "learning_rate": 8.962174817433531e-06, + "loss": 0.6142, + "step": 1545 + }, + { + "epoch": 0.25389526409787944, + "grad_norm": 0.42031769394767504, + "learning_rate": 8.962114115125154e-06, + "loss": 0.63, + "step": 1546 + }, + { + "epoch": 0.2540594913062222, + "grad_norm": 0.3804360525221743, + "learning_rate": 8.962053364353831e-06, + "loss": 0.6312, + "step": 1547 + }, + { + "epoch": 0.2542237185145649, + "grad_norm": 0.3524668319377424, + "learning_rate": 8.961992565120216e-06, + "loss": 0.6088, + "step": 1548 + }, + { + "epoch": 0.2543879457229076, + "grad_norm": 0.3458219025452087, + "learning_rate": 8.961931717424973e-06, + "loss": 0.6192, + "step": 1549 + }, + { + "epoch": 0.2545521729312504, + "grad_norm": 0.35274115975192566, + "learning_rate": 8.96187082126876e-06, + "loss": 0.6129, + "step": 1550 + }, + { + "epoch": 0.2547164001395931, + "grad_norm": 0.3626793254051729, + "learning_rate": 8.96180987665224e-06, + "loss": 0.6305, + "step": 1551 + }, + { + "epoch": 0.2548806273479359, + "grad_norm": 0.3848176233620632, + "learning_rate": 8.961748883576077e-06, + "loss": 0.589, + "step": 1552 + }, + { + "epoch": 0.2550448545562786, + "grad_norm": 0.366125994685682, + "learning_rate": 8.96168784204093e-06, + "loss": 0.6528, + "step": 1553 + }, + { + "epoch": 0.25520908176462137, + "grad_norm": 0.3322249818553472, + "learning_rate": 8.961626752047464e-06, + "loss": 0.641, + "step": 1554 + }, + { + "epoch": 0.2553733089729641, + "grad_norm": 0.39701211769550127, + "learning_rate": 8.96156561359634e-06, + "loss": 0.6347, + "step": 1555 + }, + { + "epoch": 0.2555375361813068, + "grad_norm": 0.331778398473829, + "learning_rate": 8.961504426688226e-06, + "loss": 0.5913, + "step": 1556 + }, + { + "epoch": 0.25570176338964956, + "grad_norm": 0.35888908022149046, + "learning_rate": 8.961443191323783e-06, + "loss": 0.6081, + "step": 1557 + }, + { + "epoch": 0.2558659905979923, + "grad_norm": 0.3535893722927766, + "learning_rate": 8.961381907503678e-06, + "loss": 0.6385, + "step": 1558 + }, + { + "epoch": 0.25603021780633506, + "grad_norm": 0.33617576269857896, + "learning_rate": 8.961320575228577e-06, + "loss": 0.6137, + "step": 1559 + }, + { + "epoch": 0.2561944450146778, + "grad_norm": 0.4050222822206187, + "learning_rate": 8.961259194499144e-06, + "loss": 0.6156, + "step": 1560 + }, + { + "epoch": 0.25635867222302056, + "grad_norm": 0.35052205808117587, + "learning_rate": 8.961197765316048e-06, + "loss": 0.599, + "step": 1561 + }, + { + "epoch": 0.2565228994313633, + "grad_norm": 0.39858862462836475, + "learning_rate": 8.961136287679955e-06, + "loss": 0.611, + "step": 1562 + }, + { + "epoch": 0.25668712663970605, + "grad_norm": 0.3749625887686892, + "learning_rate": 8.96107476159153e-06, + "loss": 0.6183, + "step": 1563 + }, + { + "epoch": 0.2568513538480488, + "grad_norm": 0.36936476849359495, + "learning_rate": 8.961013187051448e-06, + "loss": 0.6082, + "step": 1564 + }, + { + "epoch": 0.2570155810563915, + "grad_norm": 0.34295965567698833, + "learning_rate": 8.96095156406037e-06, + "loss": 0.6076, + "step": 1565 + }, + { + "epoch": 0.25717980826473424, + "grad_norm": 0.35256410051497583, + "learning_rate": 8.960889892618972e-06, + "loss": 0.6208, + "step": 1566 + }, + { + "epoch": 0.257344035473077, + "grad_norm": 0.5784333725990156, + "learning_rate": 8.960828172727918e-06, + "loss": 0.6236, + "step": 1567 + }, + { + "epoch": 0.25750826268141974, + "grad_norm": 0.3489605554526544, + "learning_rate": 8.960766404387882e-06, + "loss": 0.64, + "step": 1568 + }, + { + "epoch": 0.2576724898897625, + "grad_norm": 0.3415995007524165, + "learning_rate": 8.960704587599537e-06, + "loss": 0.638, + "step": 1569 + }, + { + "epoch": 0.25783671709810524, + "grad_norm": 0.3820929372774659, + "learning_rate": 8.960642722363548e-06, + "loss": 0.6488, + "step": 1570 + }, + { + "epoch": 0.258000944306448, + "grad_norm": 0.38911326483657277, + "learning_rate": 8.960580808680592e-06, + "loss": 0.5972, + "step": 1571 + }, + { + "epoch": 0.25816517151479074, + "grad_norm": 0.47819526662868234, + "learning_rate": 8.96051884655134e-06, + "loss": 0.6315, + "step": 1572 + }, + { + "epoch": 0.25832939872313343, + "grad_norm": 0.38675435762044147, + "learning_rate": 8.960456835976463e-06, + "loss": 0.6121, + "step": 1573 + }, + { + "epoch": 0.2584936259314762, + "grad_norm": 0.36691627542081623, + "learning_rate": 8.96039477695664e-06, + "loss": 0.6224, + "step": 1574 + }, + { + "epoch": 0.2586578531398189, + "grad_norm": 0.3273576946150552, + "learning_rate": 8.960332669492536e-06, + "loss": 0.6082, + "step": 1575 + }, + { + "epoch": 0.2588220803481617, + "grad_norm": 0.3771741764322628, + "learning_rate": 8.960270513584835e-06, + "loss": 0.6154, + "step": 1576 + }, + { + "epoch": 0.2589863075565044, + "grad_norm": 0.3504930691326926, + "learning_rate": 8.960208309234205e-06, + "loss": 0.6115, + "step": 1577 + }, + { + "epoch": 0.2591505347648472, + "grad_norm": 0.3221176517577065, + "learning_rate": 8.960146056441327e-06, + "loss": 0.6233, + "step": 1578 + }, + { + "epoch": 0.2593147619731899, + "grad_norm": 0.34553595989460767, + "learning_rate": 8.960083755206874e-06, + "loss": 0.6314, + "step": 1579 + }, + { + "epoch": 0.25947898918153267, + "grad_norm": 0.35748357758110466, + "learning_rate": 8.960021405531523e-06, + "loss": 0.6069, + "step": 1580 + }, + { + "epoch": 0.2596432163898754, + "grad_norm": 0.4473671425734154, + "learning_rate": 8.959959007415951e-06, + "loss": 0.6417, + "step": 1581 + }, + { + "epoch": 0.2598074435982181, + "grad_norm": 0.3510174264321415, + "learning_rate": 8.959896560860838e-06, + "loss": 0.629, + "step": 1582 + }, + { + "epoch": 0.25997167080656086, + "grad_norm": 0.36410783759248944, + "learning_rate": 8.959834065866857e-06, + "loss": 0.5946, + "step": 1583 + }, + { + "epoch": 0.2601358980149036, + "grad_norm": 0.3512116883938849, + "learning_rate": 8.959771522434693e-06, + "loss": 0.609, + "step": 1584 + }, + { + "epoch": 0.26030012522324636, + "grad_norm": 0.36875418431071966, + "learning_rate": 8.959708930565021e-06, + "loss": 0.6243, + "step": 1585 + }, + { + "epoch": 0.2604643524315891, + "grad_norm": 0.40873019656468806, + "learning_rate": 8.959646290258523e-06, + "loss": 0.6009, + "step": 1586 + }, + { + "epoch": 0.26062857963993186, + "grad_norm": 0.33934926723741654, + "learning_rate": 8.959583601515878e-06, + "loss": 0.6143, + "step": 1587 + }, + { + "epoch": 0.2607928068482746, + "grad_norm": 0.36594161286713034, + "learning_rate": 8.959520864337769e-06, + "loss": 0.5963, + "step": 1588 + }, + { + "epoch": 0.26095703405661735, + "grad_norm": 0.35446975771035094, + "learning_rate": 8.959458078724875e-06, + "loss": 0.6133, + "step": 1589 + }, + { + "epoch": 0.26112126126496005, + "grad_norm": 0.33119122466208173, + "learning_rate": 8.959395244677878e-06, + "loss": 0.6029, + "step": 1590 + }, + { + "epoch": 0.2612854884733028, + "grad_norm": 0.3558849986573231, + "learning_rate": 8.959332362197461e-06, + "loss": 0.5961, + "step": 1591 + }, + { + "epoch": 0.26144971568164554, + "grad_norm": 0.4300437747977983, + "learning_rate": 8.959269431284309e-06, + "loss": 0.6265, + "step": 1592 + }, + { + "epoch": 0.2616139428899883, + "grad_norm": 0.33313374885784347, + "learning_rate": 8.959206451939102e-06, + "loss": 0.6335, + "step": 1593 + }, + { + "epoch": 0.26177817009833104, + "grad_norm": 0.36400831389369626, + "learning_rate": 8.959143424162526e-06, + "loss": 0.6284, + "step": 1594 + }, + { + "epoch": 0.2619423973066738, + "grad_norm": 0.3464235716370396, + "learning_rate": 8.959080347955264e-06, + "loss": 0.6023, + "step": 1595 + }, + { + "epoch": 0.26210662451501654, + "grad_norm": 0.39376900324065406, + "learning_rate": 8.959017223318005e-06, + "loss": 0.6145, + "step": 1596 + }, + { + "epoch": 0.2622708517233593, + "grad_norm": 0.34821762378354826, + "learning_rate": 8.95895405025143e-06, + "loss": 0.6114, + "step": 1597 + }, + { + "epoch": 0.26243507893170204, + "grad_norm": 0.3463710949430958, + "learning_rate": 8.958890828756229e-06, + "loss": 0.6386, + "step": 1598 + }, + { + "epoch": 0.26259930614004473, + "grad_norm": 0.3247606140038734, + "learning_rate": 8.958827558833084e-06, + "loss": 0.6303, + "step": 1599 + }, + { + "epoch": 0.2627635333483875, + "grad_norm": 0.3406923782871825, + "learning_rate": 8.958764240482686e-06, + "loss": 0.5948, + "step": 1600 + }, + { + "epoch": 0.26292776055673023, + "grad_norm": 0.3627730588083312, + "learning_rate": 8.958700873705721e-06, + "loss": 0.6232, + "step": 1601 + }, + { + "epoch": 0.263091987765073, + "grad_norm": 0.35207234573189816, + "learning_rate": 8.958637458502879e-06, + "loss": 0.6102, + "step": 1602 + }, + { + "epoch": 0.2632562149734157, + "grad_norm": 0.3569459596649669, + "learning_rate": 8.958573994874846e-06, + "loss": 0.6313, + "step": 1603 + }, + { + "epoch": 0.2634204421817585, + "grad_norm": 0.4495054405643325, + "learning_rate": 8.958510482822314e-06, + "loss": 0.6, + "step": 1604 + }, + { + "epoch": 0.2635846693901012, + "grad_norm": 0.38731324914519233, + "learning_rate": 8.95844692234597e-06, + "loss": 0.6405, + "step": 1605 + }, + { + "epoch": 0.26374889659844397, + "grad_norm": 0.3338592407113862, + "learning_rate": 8.958383313446508e-06, + "loss": 0.6234, + "step": 1606 + }, + { + "epoch": 0.26391312380678666, + "grad_norm": 0.33659403725843934, + "learning_rate": 8.958319656124615e-06, + "loss": 0.6163, + "step": 1607 + }, + { + "epoch": 0.2640773510151294, + "grad_norm": 0.34801294714515657, + "learning_rate": 8.958255950380986e-06, + "loss": 0.6201, + "step": 1608 + }, + { + "epoch": 0.26424157822347216, + "grad_norm": 0.33754657169064073, + "learning_rate": 8.958192196216309e-06, + "loss": 0.6342, + "step": 1609 + }, + { + "epoch": 0.2644058054318149, + "grad_norm": 0.3224965637155876, + "learning_rate": 8.958128393631279e-06, + "loss": 0.6122, + "step": 1610 + }, + { + "epoch": 0.26457003264015766, + "grad_norm": 0.33135459212029433, + "learning_rate": 8.958064542626589e-06, + "loss": 0.5957, + "step": 1611 + }, + { + "epoch": 0.2647342598485004, + "grad_norm": 0.34651185946628477, + "learning_rate": 8.958000643202932e-06, + "loss": 0.5911, + "step": 1612 + }, + { + "epoch": 0.26489848705684316, + "grad_norm": 0.3637450672410526, + "learning_rate": 8.957936695361001e-06, + "loss": 0.6292, + "step": 1613 + }, + { + "epoch": 0.2650627142651859, + "grad_norm": 0.34550569205067416, + "learning_rate": 8.957872699101492e-06, + "loss": 0.6216, + "step": 1614 + }, + { + "epoch": 0.26522694147352865, + "grad_norm": 0.3360151957193412, + "learning_rate": 8.9578086544251e-06, + "loss": 0.6141, + "step": 1615 + }, + { + "epoch": 0.26539116868187135, + "grad_norm": 0.3322104258097254, + "learning_rate": 8.957744561332521e-06, + "loss": 0.6043, + "step": 1616 + }, + { + "epoch": 0.2655553958902141, + "grad_norm": 0.3507509351350694, + "learning_rate": 8.957680419824448e-06, + "loss": 0.6475, + "step": 1617 + }, + { + "epoch": 0.26571962309855685, + "grad_norm": 0.3477772878053762, + "learning_rate": 8.95761622990158e-06, + "loss": 0.601, + "step": 1618 + }, + { + "epoch": 0.2658838503068996, + "grad_norm": 0.3976960245557338, + "learning_rate": 8.957551991564617e-06, + "loss": 0.6343, + "step": 1619 + }, + { + "epoch": 0.26604807751524234, + "grad_norm": 0.34982742819917306, + "learning_rate": 8.957487704814252e-06, + "loss": 0.6665, + "step": 1620 + }, + { + "epoch": 0.2662123047235851, + "grad_norm": 0.3406041837951626, + "learning_rate": 8.957423369651183e-06, + "loss": 0.6283, + "step": 1621 + }, + { + "epoch": 0.26637653193192784, + "grad_norm": 0.3279996924316886, + "learning_rate": 8.957358986076113e-06, + "loss": 0.6057, + "step": 1622 + }, + { + "epoch": 0.2665407591402706, + "grad_norm": 0.3672845856678108, + "learning_rate": 8.957294554089738e-06, + "loss": 0.6201, + "step": 1623 + }, + { + "epoch": 0.2667049863486133, + "grad_norm": 0.39245027772202284, + "learning_rate": 8.957230073692759e-06, + "loss": 0.6082, + "step": 1624 + }, + { + "epoch": 0.26686921355695603, + "grad_norm": 0.40103050047766803, + "learning_rate": 8.957165544885875e-06, + "loss": 0.5781, + "step": 1625 + }, + { + "epoch": 0.2670334407652988, + "grad_norm": 0.33885876819639205, + "learning_rate": 8.957100967669791e-06, + "loss": 0.6334, + "step": 1626 + }, + { + "epoch": 0.26719766797364153, + "grad_norm": 0.3583568872434457, + "learning_rate": 8.957036342045203e-06, + "loss": 0.6053, + "step": 1627 + }, + { + "epoch": 0.2673618951819843, + "grad_norm": 0.3518879810969012, + "learning_rate": 8.956971668012817e-06, + "loss": 0.6216, + "step": 1628 + }, + { + "epoch": 0.267526122390327, + "grad_norm": 0.3381543177932053, + "learning_rate": 8.95690694557333e-06, + "loss": 0.6232, + "step": 1629 + }, + { + "epoch": 0.2676903495986698, + "grad_norm": 0.441450413777482, + "learning_rate": 8.95684217472745e-06, + "loss": 0.6225, + "step": 1630 + }, + { + "epoch": 0.2678545768070125, + "grad_norm": 0.35541320952449645, + "learning_rate": 8.956777355475881e-06, + "loss": 0.6389, + "step": 1631 + }, + { + "epoch": 0.2680188040153552, + "grad_norm": 0.4207779627283632, + "learning_rate": 8.956712487819323e-06, + "loss": 0.632, + "step": 1632 + }, + { + "epoch": 0.26818303122369797, + "grad_norm": 0.3320596414586848, + "learning_rate": 8.956647571758485e-06, + "loss": 0.6146, + "step": 1633 + }, + { + "epoch": 0.2683472584320407, + "grad_norm": 0.3565184586735187, + "learning_rate": 8.956582607294067e-06, + "loss": 0.6196, + "step": 1634 + }, + { + "epoch": 0.26851148564038346, + "grad_norm": 0.32988657768521945, + "learning_rate": 8.956517594426778e-06, + "loss": 0.6184, + "step": 1635 + }, + { + "epoch": 0.2686757128487262, + "grad_norm": 0.33061850241523516, + "learning_rate": 8.956452533157325e-06, + "loss": 0.5958, + "step": 1636 + }, + { + "epoch": 0.26883994005706896, + "grad_norm": 0.3852551983617042, + "learning_rate": 8.95638742348641e-06, + "loss": 0.6365, + "step": 1637 + }, + { + "epoch": 0.2690041672654117, + "grad_norm": 0.35206933111559585, + "learning_rate": 8.956322265414746e-06, + "loss": 0.6004, + "step": 1638 + }, + { + "epoch": 0.26916839447375446, + "grad_norm": 0.3427069503065173, + "learning_rate": 8.956257058943036e-06, + "loss": 0.6185, + "step": 1639 + }, + { + "epoch": 0.2693326216820972, + "grad_norm": 0.3406012616435544, + "learning_rate": 8.95619180407199e-06, + "loss": 0.606, + "step": 1640 + }, + { + "epoch": 0.2694968488904399, + "grad_norm": 0.3597673860314228, + "learning_rate": 8.956126500802318e-06, + "loss": 0.6304, + "step": 1641 + }, + { + "epoch": 0.26966107609878265, + "grad_norm": 0.3146073190261586, + "learning_rate": 8.956061149134725e-06, + "loss": 0.6177, + "step": 1642 + }, + { + "epoch": 0.2698253033071254, + "grad_norm": 0.3424703951143541, + "learning_rate": 8.955995749069926e-06, + "loss": 0.6211, + "step": 1643 + }, + { + "epoch": 0.26998953051546815, + "grad_norm": 0.35081835808818557, + "learning_rate": 8.955930300608629e-06, + "loss": 0.598, + "step": 1644 + }, + { + "epoch": 0.2701537577238109, + "grad_norm": 0.4085725713510005, + "learning_rate": 8.955864803751546e-06, + "loss": 0.6, + "step": 1645 + }, + { + "epoch": 0.27031798493215364, + "grad_norm": 0.3245057016062214, + "learning_rate": 8.955799258499384e-06, + "loss": 0.6016, + "step": 1646 + }, + { + "epoch": 0.2704822121404964, + "grad_norm": 0.3429782149104829, + "learning_rate": 8.95573366485286e-06, + "loss": 0.6029, + "step": 1647 + }, + { + "epoch": 0.27064643934883914, + "grad_norm": 0.3637562624540342, + "learning_rate": 8.955668022812687e-06, + "loss": 0.6142, + "step": 1648 + }, + { + "epoch": 0.27081066655718183, + "grad_norm": 0.4237638762114716, + "learning_rate": 8.955602332379572e-06, + "loss": 0.5921, + "step": 1649 + }, + { + "epoch": 0.2709748937655246, + "grad_norm": 0.39003068117908357, + "learning_rate": 8.955536593554232e-06, + "loss": 0.5899, + "step": 1650 + }, + { + "epoch": 0.27113912097386733, + "grad_norm": 0.37048027690848173, + "learning_rate": 8.955470806337382e-06, + "loss": 0.6276, + "step": 1651 + }, + { + "epoch": 0.2713033481822101, + "grad_norm": 0.3597344209193968, + "learning_rate": 8.955404970729736e-06, + "loss": 0.6175, + "step": 1652 + }, + { + "epoch": 0.27146757539055283, + "grad_norm": 0.3367753913122933, + "learning_rate": 8.955339086732009e-06, + "loss": 0.6233, + "step": 1653 + }, + { + "epoch": 0.2716318025988956, + "grad_norm": 0.3767695965340454, + "learning_rate": 8.955273154344914e-06, + "loss": 0.6275, + "step": 1654 + }, + { + "epoch": 0.2717960298072383, + "grad_norm": 0.35115066088714475, + "learning_rate": 8.95520717356917e-06, + "loss": 0.6262, + "step": 1655 + }, + { + "epoch": 0.2719602570155811, + "grad_norm": 0.35017560014066323, + "learning_rate": 8.955141144405493e-06, + "loss": 0.5956, + "step": 1656 + }, + { + "epoch": 0.2721244842239238, + "grad_norm": 0.33377410466349333, + "learning_rate": 8.9550750668546e-06, + "loss": 0.59, + "step": 1657 + }, + { + "epoch": 0.2722887114322665, + "grad_norm": 0.3590803976413437, + "learning_rate": 8.955008940917208e-06, + "loss": 0.6097, + "step": 1658 + }, + { + "epoch": 0.27245293864060927, + "grad_norm": 0.36554099156071396, + "learning_rate": 8.954942766594036e-06, + "loss": 0.6201, + "step": 1659 + }, + { + "epoch": 0.272617165848952, + "grad_norm": 0.3378974550030977, + "learning_rate": 8.954876543885802e-06, + "loss": 0.5875, + "step": 1660 + }, + { + "epoch": 0.27278139305729476, + "grad_norm": 0.32781104553889123, + "learning_rate": 8.954810272793227e-06, + "loss": 0.6002, + "step": 1661 + }, + { + "epoch": 0.2729456202656375, + "grad_norm": 0.398398331687394, + "learning_rate": 8.954743953317029e-06, + "loss": 0.62, + "step": 1662 + }, + { + "epoch": 0.27310984747398026, + "grad_norm": 0.5063519634001101, + "learning_rate": 8.95467758545793e-06, + "loss": 0.604, + "step": 1663 + }, + { + "epoch": 0.273274074682323, + "grad_norm": 0.3998293658682676, + "learning_rate": 8.954611169216646e-06, + "loss": 0.6273, + "step": 1664 + }, + { + "epoch": 0.27343830189066576, + "grad_norm": 0.6269815667257993, + "learning_rate": 8.954544704593904e-06, + "loss": 0.6311, + "step": 1665 + }, + { + "epoch": 0.27360252909900845, + "grad_norm": 0.32155955625909083, + "learning_rate": 8.954478191590425e-06, + "loss": 0.6042, + "step": 1666 + }, + { + "epoch": 0.2737667563073512, + "grad_norm": 0.35671629044279696, + "learning_rate": 8.95441163020693e-06, + "loss": 0.6325, + "step": 1667 + }, + { + "epoch": 0.27393098351569395, + "grad_norm": 0.3193642963859553, + "learning_rate": 8.954345020444141e-06, + "loss": 0.6272, + "step": 1668 + }, + { + "epoch": 0.2740952107240367, + "grad_norm": 0.3555838716484768, + "learning_rate": 8.954278362302783e-06, + "loss": 0.5956, + "step": 1669 + }, + { + "epoch": 0.27425943793237945, + "grad_norm": 0.49064088111583565, + "learning_rate": 8.954211655783579e-06, + "loss": 0.6205, + "step": 1670 + }, + { + "epoch": 0.2744236651407222, + "grad_norm": 0.3711693322859996, + "learning_rate": 8.954144900887255e-06, + "loss": 0.6149, + "step": 1671 + }, + { + "epoch": 0.27458789234906494, + "grad_norm": 0.33828977223496476, + "learning_rate": 8.954078097614534e-06, + "loss": 0.595, + "step": 1672 + }, + { + "epoch": 0.2747521195574077, + "grad_norm": 0.3603675345755279, + "learning_rate": 8.954011245966145e-06, + "loss": 0.6123, + "step": 1673 + }, + { + "epoch": 0.27491634676575044, + "grad_norm": 0.3342987882526401, + "learning_rate": 8.953944345942809e-06, + "loss": 0.6064, + "step": 1674 + }, + { + "epoch": 0.27508057397409313, + "grad_norm": 0.33840786908343246, + "learning_rate": 8.953877397545255e-06, + "loss": 0.6147, + "step": 1675 + }, + { + "epoch": 0.2752448011824359, + "grad_norm": 0.3361362202891087, + "learning_rate": 8.953810400774213e-06, + "loss": 0.612, + "step": 1676 + }, + { + "epoch": 0.27540902839077863, + "grad_norm": 0.3615623201654782, + "learning_rate": 8.953743355630406e-06, + "loss": 0.6197, + "step": 1677 + }, + { + "epoch": 0.2755732555991214, + "grad_norm": 0.34209041694780395, + "learning_rate": 8.953676262114565e-06, + "loss": 0.6493, + "step": 1678 + }, + { + "epoch": 0.27573748280746413, + "grad_norm": 0.325938108900595, + "learning_rate": 8.95360912022742e-06, + "loss": 0.6057, + "step": 1679 + }, + { + "epoch": 0.2759017100158069, + "grad_norm": 0.38745143087531064, + "learning_rate": 8.953541929969696e-06, + "loss": 0.6295, + "step": 1680 + }, + { + "epoch": 0.2760659372241496, + "grad_norm": 0.3810022825723158, + "learning_rate": 8.953474691342126e-06, + "loss": 0.5934, + "step": 1681 + }, + { + "epoch": 0.2762301644324924, + "grad_norm": 0.39295072393329594, + "learning_rate": 8.953407404345437e-06, + "loss": 0.6321, + "step": 1682 + }, + { + "epoch": 0.27639439164083507, + "grad_norm": 0.35390570166315877, + "learning_rate": 8.953340068980363e-06, + "loss": 0.5874, + "step": 1683 + }, + { + "epoch": 0.2765586188491778, + "grad_norm": 0.34976927367936533, + "learning_rate": 8.953272685247636e-06, + "loss": 0.6143, + "step": 1684 + }, + { + "epoch": 0.27672284605752057, + "grad_norm": 0.3327787798882458, + "learning_rate": 8.953205253147985e-06, + "loss": 0.6057, + "step": 1685 + }, + { + "epoch": 0.2768870732658633, + "grad_norm": 0.3415951518427282, + "learning_rate": 8.953137772682144e-06, + "loss": 0.6334, + "step": 1686 + }, + { + "epoch": 0.27705130047420606, + "grad_norm": 0.3384035832213004, + "learning_rate": 8.953070243850843e-06, + "loss": 0.6103, + "step": 1687 + }, + { + "epoch": 0.2772155276825488, + "grad_norm": 0.3495169277940791, + "learning_rate": 8.953002666654822e-06, + "loss": 0.6097, + "step": 1688 + }, + { + "epoch": 0.27737975489089156, + "grad_norm": 0.3416538910156951, + "learning_rate": 8.952935041094809e-06, + "loss": 0.5985, + "step": 1689 + }, + { + "epoch": 0.2775439820992343, + "grad_norm": 0.3451977176545778, + "learning_rate": 8.95286736717154e-06, + "loss": 0.5766, + "step": 1690 + }, + { + "epoch": 0.27770820930757706, + "grad_norm": 0.32525419888437135, + "learning_rate": 8.95279964488575e-06, + "loss": 0.6011, + "step": 1691 + }, + { + "epoch": 0.27787243651591975, + "grad_norm": 0.39000414414804097, + "learning_rate": 8.952731874238176e-06, + "loss": 0.6029, + "step": 1692 + }, + { + "epoch": 0.2780366637242625, + "grad_norm": 0.3393591338584082, + "learning_rate": 8.952664055229553e-06, + "loss": 0.5948, + "step": 1693 + }, + { + "epoch": 0.27820089093260525, + "grad_norm": 0.3986866139500602, + "learning_rate": 8.952596187860617e-06, + "loss": 0.6006, + "step": 1694 + }, + { + "epoch": 0.278365118140948, + "grad_norm": 0.3098858325727656, + "learning_rate": 8.952528272132107e-06, + "loss": 0.6008, + "step": 1695 + }, + { + "epoch": 0.27852934534929075, + "grad_norm": 0.3572611724356876, + "learning_rate": 8.952460308044756e-06, + "loss": 0.5998, + "step": 1696 + }, + { + "epoch": 0.2786935725576335, + "grad_norm": 0.4291924390919209, + "learning_rate": 8.952392295599309e-06, + "loss": 0.6133, + "step": 1697 + }, + { + "epoch": 0.27885779976597624, + "grad_norm": 0.34816840940610366, + "learning_rate": 8.9523242347965e-06, + "loss": 0.5985, + "step": 1698 + }, + { + "epoch": 0.279022026974319, + "grad_norm": 0.34022234855467215, + "learning_rate": 8.952256125637069e-06, + "loss": 0.6103, + "step": 1699 + }, + { + "epoch": 0.2791862541826617, + "grad_norm": 0.3639374654161306, + "learning_rate": 8.952187968121755e-06, + "loss": 0.6076, + "step": 1700 + }, + { + "epoch": 0.27935048139100443, + "grad_norm": 0.3338247300715134, + "learning_rate": 8.952119762251299e-06, + "loss": 0.6114, + "step": 1701 + }, + { + "epoch": 0.2795147085993472, + "grad_norm": 0.31947936911146907, + "learning_rate": 8.952051508026443e-06, + "loss": 0.616, + "step": 1702 + }, + { + "epoch": 0.27967893580768993, + "grad_norm": 0.34279451006593414, + "learning_rate": 8.951983205447928e-06, + "loss": 0.6179, + "step": 1703 + }, + { + "epoch": 0.2798431630160327, + "grad_norm": 0.45582975634168327, + "learning_rate": 8.951914854516495e-06, + "loss": 0.6173, + "step": 1704 + }, + { + "epoch": 0.28000739022437543, + "grad_norm": 0.3419181751540804, + "learning_rate": 8.951846455232888e-06, + "loss": 0.6024, + "step": 1705 + }, + { + "epoch": 0.2801716174327182, + "grad_norm": 0.3593734235508695, + "learning_rate": 8.951778007597848e-06, + "loss": 0.5989, + "step": 1706 + }, + { + "epoch": 0.2803358446410609, + "grad_norm": 0.3511409384421343, + "learning_rate": 8.951709511612116e-06, + "loss": 0.592, + "step": 1707 + }, + { + "epoch": 0.2805000718494037, + "grad_norm": 0.3501672881668432, + "learning_rate": 8.95164096727644e-06, + "loss": 0.5928, + "step": 1708 + }, + { + "epoch": 0.28066429905774637, + "grad_norm": 0.3727919134182136, + "learning_rate": 8.951572374591564e-06, + "loss": 0.5893, + "step": 1709 + }, + { + "epoch": 0.2808285262660891, + "grad_norm": 0.3280328400507155, + "learning_rate": 8.951503733558232e-06, + "loss": 0.6162, + "step": 1710 + }, + { + "epoch": 0.28099275347443187, + "grad_norm": 0.3775465194021973, + "learning_rate": 8.951435044177191e-06, + "loss": 0.6215, + "step": 1711 + }, + { + "epoch": 0.2811569806827746, + "grad_norm": 0.33441012730815717, + "learning_rate": 8.951366306449184e-06, + "loss": 0.5886, + "step": 1712 + }, + { + "epoch": 0.28132120789111736, + "grad_norm": 0.34134012357709204, + "learning_rate": 8.95129752037496e-06, + "loss": 0.5977, + "step": 1713 + }, + { + "epoch": 0.2814854350994601, + "grad_norm": 0.33093093198335205, + "learning_rate": 8.951228685955265e-06, + "loss": 0.6345, + "step": 1714 + }, + { + "epoch": 0.28164966230780286, + "grad_norm": 0.40993061330630076, + "learning_rate": 8.951159803190848e-06, + "loss": 0.6178, + "step": 1715 + }, + { + "epoch": 0.2818138895161456, + "grad_norm": 0.3963971197606208, + "learning_rate": 8.951090872082457e-06, + "loss": 0.6059, + "step": 1716 + }, + { + "epoch": 0.2819781167244883, + "grad_norm": 0.34012540670581837, + "learning_rate": 8.951021892630839e-06, + "loss": 0.6247, + "step": 1717 + }, + { + "epoch": 0.28214234393283105, + "grad_norm": 0.32786048540396723, + "learning_rate": 8.950952864836743e-06, + "loss": 0.6105, + "step": 1718 + }, + { + "epoch": 0.2823065711411738, + "grad_norm": 0.5082922315473971, + "learning_rate": 8.95088378870092e-06, + "loss": 0.6049, + "step": 1719 + }, + { + "epoch": 0.28247079834951655, + "grad_norm": 0.3094071280048266, + "learning_rate": 8.95081466422412e-06, + "loss": 0.5871, + "step": 1720 + }, + { + "epoch": 0.2826350255578593, + "grad_norm": 0.32426351466574627, + "learning_rate": 8.950745491407095e-06, + "loss": 0.5966, + "step": 1721 + }, + { + "epoch": 0.28279925276620205, + "grad_norm": 0.31977137099864866, + "learning_rate": 8.950676270250593e-06, + "loss": 0.6026, + "step": 1722 + }, + { + "epoch": 0.2829634799745448, + "grad_norm": 0.341677966380917, + "learning_rate": 8.95060700075537e-06, + "loss": 0.6146, + "step": 1723 + }, + { + "epoch": 0.28312770718288754, + "grad_norm": 0.34463367244022197, + "learning_rate": 8.950537682922175e-06, + "loss": 0.6037, + "step": 1724 + }, + { + "epoch": 0.2832919343912303, + "grad_norm": 0.3238216529925343, + "learning_rate": 8.950468316751763e-06, + "loss": 0.605, + "step": 1725 + }, + { + "epoch": 0.283456161599573, + "grad_norm": 0.3491065555826677, + "learning_rate": 8.950398902244884e-06, + "loss": 0.6054, + "step": 1726 + }, + { + "epoch": 0.28362038880791574, + "grad_norm": 0.3248877879087777, + "learning_rate": 8.950329439402296e-06, + "loss": 0.5972, + "step": 1727 + }, + { + "epoch": 0.2837846160162585, + "grad_norm": 0.3444958318564493, + "learning_rate": 8.950259928224753e-06, + "loss": 0.5953, + "step": 1728 + }, + { + "epoch": 0.28394884322460123, + "grad_norm": 0.32530493542474337, + "learning_rate": 8.950190368713007e-06, + "loss": 0.6184, + "step": 1729 + }, + { + "epoch": 0.284113070432944, + "grad_norm": 0.3321681672983501, + "learning_rate": 8.950120760867817e-06, + "loss": 0.598, + "step": 1730 + }, + { + "epoch": 0.28427729764128673, + "grad_norm": 0.33431613907622554, + "learning_rate": 8.950051104689933e-06, + "loss": 0.6233, + "step": 1731 + }, + { + "epoch": 0.2844415248496295, + "grad_norm": 0.33533763383067433, + "learning_rate": 8.94998140018012e-06, + "loss": 0.6041, + "step": 1732 + }, + { + "epoch": 0.2846057520579722, + "grad_norm": 0.3504748214822058, + "learning_rate": 8.949911647339128e-06, + "loss": 0.6167, + "step": 1733 + }, + { + "epoch": 0.2847699792663149, + "grad_norm": 0.4093185842862643, + "learning_rate": 8.94984184616772e-06, + "loss": 0.5901, + "step": 1734 + }, + { + "epoch": 0.28493420647465767, + "grad_norm": 0.3544777274138171, + "learning_rate": 8.94977199666665e-06, + "loss": 0.6031, + "step": 1735 + }, + { + "epoch": 0.2850984336830004, + "grad_norm": 0.3149036493882386, + "learning_rate": 8.949702098836679e-06, + "loss": 0.5978, + "step": 1736 + }, + { + "epoch": 0.28526266089134317, + "grad_norm": 0.32806889942804984, + "learning_rate": 8.949632152678564e-06, + "loss": 0.6175, + "step": 1737 + }, + { + "epoch": 0.2854268880996859, + "grad_norm": 0.33769469010145153, + "learning_rate": 8.949562158193067e-06, + "loss": 0.6051, + "step": 1738 + }, + { + "epoch": 0.28559111530802866, + "grad_norm": 0.321457597624339, + "learning_rate": 8.949492115380947e-06, + "loss": 0.6147, + "step": 1739 + }, + { + "epoch": 0.2857553425163714, + "grad_norm": 0.35122328128449154, + "learning_rate": 8.949422024242963e-06, + "loss": 0.5977, + "step": 1740 + }, + { + "epoch": 0.28591956972471416, + "grad_norm": 0.3853149359679044, + "learning_rate": 8.949351884779882e-06, + "loss": 0.6193, + "step": 1741 + }, + { + "epoch": 0.2860837969330569, + "grad_norm": 0.4647759735760696, + "learning_rate": 8.949281696992459e-06, + "loss": 0.6024, + "step": 1742 + }, + { + "epoch": 0.2862480241413996, + "grad_norm": 0.31943303091838704, + "learning_rate": 8.94921146088146e-06, + "loss": 0.5835, + "step": 1743 + }, + { + "epoch": 0.28641225134974235, + "grad_norm": 0.3386314610382678, + "learning_rate": 8.949141176447648e-06, + "loss": 0.6309, + "step": 1744 + }, + { + "epoch": 0.2865764785580851, + "grad_norm": 0.3223619040890537, + "learning_rate": 8.949070843691785e-06, + "loss": 0.6206, + "step": 1745 + }, + { + "epoch": 0.28674070576642785, + "grad_norm": 0.3460822464622039, + "learning_rate": 8.949000462614634e-06, + "loss": 0.5956, + "step": 1746 + }, + { + "epoch": 0.2869049329747706, + "grad_norm": 0.3360222306588388, + "learning_rate": 8.948930033216963e-06, + "loss": 0.5946, + "step": 1747 + }, + { + "epoch": 0.28706916018311335, + "grad_norm": 0.35259283103540384, + "learning_rate": 8.948859555499533e-06, + "loss": 0.596, + "step": 1748 + }, + { + "epoch": 0.2872333873914561, + "grad_norm": 0.34166466778665, + "learning_rate": 8.948789029463112e-06, + "loss": 0.5831, + "step": 1749 + }, + { + "epoch": 0.28739761459979885, + "grad_norm": 0.31738289979786777, + "learning_rate": 8.948718455108464e-06, + "loss": 0.6179, + "step": 1750 + }, + { + "epoch": 0.28756184180814154, + "grad_norm": 0.32852270318439086, + "learning_rate": 8.948647832436357e-06, + "loss": 0.6015, + "step": 1751 + }, + { + "epoch": 0.2877260690164843, + "grad_norm": 0.38106270430454836, + "learning_rate": 8.948577161447558e-06, + "loss": 0.6165, + "step": 1752 + }, + { + "epoch": 0.28789029622482704, + "grad_norm": 0.34254010965813875, + "learning_rate": 8.948506442142834e-06, + "loss": 0.6043, + "step": 1753 + }, + { + "epoch": 0.2880545234331698, + "grad_norm": 0.3124040810333103, + "learning_rate": 8.948435674522954e-06, + "loss": 0.5979, + "step": 1754 + }, + { + "epoch": 0.28821875064151253, + "grad_norm": 0.3204292251504406, + "learning_rate": 8.948364858588684e-06, + "loss": 0.5847, + "step": 1755 + }, + { + "epoch": 0.2883829778498553, + "grad_norm": 0.32046035935302963, + "learning_rate": 8.948293994340797e-06, + "loss": 0.6032, + "step": 1756 + }, + { + "epoch": 0.28854720505819803, + "grad_norm": 0.3408352541517514, + "learning_rate": 8.948223081780062e-06, + "loss": 0.6159, + "step": 1757 + }, + { + "epoch": 0.2887114322665408, + "grad_norm": 0.5290664262676616, + "learning_rate": 8.948152120907245e-06, + "loss": 0.6169, + "step": 1758 + }, + { + "epoch": 0.28887565947488353, + "grad_norm": 0.3400090749950359, + "learning_rate": 8.948081111723122e-06, + "loss": 0.6176, + "step": 1759 + }, + { + "epoch": 0.2890398866832262, + "grad_norm": 0.48512254987990944, + "learning_rate": 8.94801005422846e-06, + "loss": 0.6045, + "step": 1760 + }, + { + "epoch": 0.28920411389156897, + "grad_norm": 0.32696720216752834, + "learning_rate": 8.947938948424033e-06, + "loss": 0.611, + "step": 1761 + }, + { + "epoch": 0.2893683410999117, + "grad_norm": 0.3348572131814443, + "learning_rate": 8.947867794310612e-06, + "loss": 0.5976, + "step": 1762 + }, + { + "epoch": 0.28953256830825447, + "grad_norm": 0.3463666631002641, + "learning_rate": 8.947796591888971e-06, + "loss": 0.6093, + "step": 1763 + }, + { + "epoch": 0.2896967955165972, + "grad_norm": 0.37184836370878455, + "learning_rate": 8.947725341159884e-06, + "loss": 0.577, + "step": 1764 + }, + { + "epoch": 0.28986102272493997, + "grad_norm": 0.3554557818232329, + "learning_rate": 8.947654042124124e-06, + "loss": 0.628, + "step": 1765 + }, + { + "epoch": 0.2900252499332827, + "grad_norm": 0.34625018438648913, + "learning_rate": 8.947582694782464e-06, + "loss": 0.5874, + "step": 1766 + }, + { + "epoch": 0.29018947714162546, + "grad_norm": 0.3405480851791619, + "learning_rate": 8.947511299135681e-06, + "loss": 0.6213, + "step": 1767 + }, + { + "epoch": 0.29035370434996816, + "grad_norm": 0.33999734574851553, + "learning_rate": 8.947439855184548e-06, + "loss": 0.5855, + "step": 1768 + }, + { + "epoch": 0.2905179315583109, + "grad_norm": 0.33180325232357516, + "learning_rate": 8.947368362929844e-06, + "loss": 0.6249, + "step": 1769 + }, + { + "epoch": 0.29068215876665365, + "grad_norm": 0.33592557868625156, + "learning_rate": 8.947296822372344e-06, + "loss": 0.6044, + "step": 1770 + }, + { + "epoch": 0.2908463859749964, + "grad_norm": 0.3836129375299099, + "learning_rate": 8.947225233512824e-06, + "loss": 0.6128, + "step": 1771 + }, + { + "epoch": 0.29101061318333915, + "grad_norm": 0.35129704586375776, + "learning_rate": 8.947153596352064e-06, + "loss": 0.6037, + "step": 1772 + }, + { + "epoch": 0.2911748403916819, + "grad_norm": 0.3359681523693811, + "learning_rate": 8.947081910890838e-06, + "loss": 0.5972, + "step": 1773 + }, + { + "epoch": 0.29133906760002465, + "grad_norm": 0.4025206425833003, + "learning_rate": 8.947010177129929e-06, + "loss": 0.6035, + "step": 1774 + }, + { + "epoch": 0.2915032948083674, + "grad_norm": 0.3219472625707683, + "learning_rate": 8.946938395070115e-06, + "loss": 0.6012, + "step": 1775 + }, + { + "epoch": 0.29166752201671015, + "grad_norm": 0.34762377131835476, + "learning_rate": 8.946866564712174e-06, + "loss": 0.5764, + "step": 1776 + }, + { + "epoch": 0.29183174922505284, + "grad_norm": 0.33217069383444225, + "learning_rate": 8.946794686056886e-06, + "loss": 0.6019, + "step": 1777 + }, + { + "epoch": 0.2919959764333956, + "grad_norm": 0.31309510597308177, + "learning_rate": 8.946722759105034e-06, + "loss": 0.5969, + "step": 1778 + }, + { + "epoch": 0.29216020364173834, + "grad_norm": 0.3426072418753475, + "learning_rate": 8.946650783857395e-06, + "loss": 0.5952, + "step": 1779 + }, + { + "epoch": 0.2923244308500811, + "grad_norm": 0.3422905765735522, + "learning_rate": 8.946578760314758e-06, + "loss": 0.5999, + "step": 1780 + }, + { + "epoch": 0.29248865805842383, + "grad_norm": 0.32223052931910556, + "learning_rate": 8.946506688477896e-06, + "loss": 0.5891, + "step": 1781 + }, + { + "epoch": 0.2926528852667666, + "grad_norm": 0.34014514460328144, + "learning_rate": 8.9464345683476e-06, + "loss": 0.6022, + "step": 1782 + }, + { + "epoch": 0.29281711247510933, + "grad_norm": 0.4222426807348207, + "learning_rate": 8.94636239992465e-06, + "loss": 0.5949, + "step": 1783 + }, + { + "epoch": 0.2929813396834521, + "grad_norm": 1.4806624577700644, + "learning_rate": 8.946290183209829e-06, + "loss": 0.6048, + "step": 1784 + }, + { + "epoch": 0.2931455668917948, + "grad_norm": 0.43374222283990266, + "learning_rate": 8.946217918203922e-06, + "loss": 0.6213, + "step": 1785 + }, + { + "epoch": 0.2933097941001375, + "grad_norm": 0.33943359501404785, + "learning_rate": 8.946145604907712e-06, + "loss": 0.5849, + "step": 1786 + }, + { + "epoch": 0.29347402130848027, + "grad_norm": 0.34687657406918043, + "learning_rate": 8.94607324332199e-06, + "loss": 0.5989, + "step": 1787 + }, + { + "epoch": 0.293638248516823, + "grad_norm": 0.35147792904951136, + "learning_rate": 8.946000833447535e-06, + "loss": 0.6301, + "step": 1788 + }, + { + "epoch": 0.29380247572516577, + "grad_norm": 0.3120533026712586, + "learning_rate": 8.945928375285139e-06, + "loss": 0.6007, + "step": 1789 + }, + { + "epoch": 0.2939667029335085, + "grad_norm": 0.32762054284041, + "learning_rate": 8.945855868835584e-06, + "loss": 0.6263, + "step": 1790 + }, + { + "epoch": 0.29413093014185127, + "grad_norm": 0.319177490918817, + "learning_rate": 8.945783314099663e-06, + "loss": 0.5937, + "step": 1791 + }, + { + "epoch": 0.294295157350194, + "grad_norm": 0.3426538853948598, + "learning_rate": 8.94571071107816e-06, + "loss": 0.5915, + "step": 1792 + }, + { + "epoch": 0.29445938455853676, + "grad_norm": 0.3534302275740654, + "learning_rate": 8.945638059771864e-06, + "loss": 0.6086, + "step": 1793 + }, + { + "epoch": 0.29462361176687946, + "grad_norm": 0.4119987675520305, + "learning_rate": 8.945565360181566e-06, + "loss": 0.5817, + "step": 1794 + }, + { + "epoch": 0.2947878389752222, + "grad_norm": 0.33739850349835476, + "learning_rate": 8.945492612308053e-06, + "loss": 0.6046, + "step": 1795 + }, + { + "epoch": 0.29495206618356495, + "grad_norm": 0.31484597834060024, + "learning_rate": 8.945419816152118e-06, + "loss": 0.6109, + "step": 1796 + }, + { + "epoch": 0.2951162933919077, + "grad_norm": 0.3359396360573391, + "learning_rate": 8.945346971714548e-06, + "loss": 0.6, + "step": 1797 + }, + { + "epoch": 0.29528052060025045, + "grad_norm": 0.3273419658246956, + "learning_rate": 8.945274078996139e-06, + "loss": 0.5885, + "step": 1798 + }, + { + "epoch": 0.2954447478085932, + "grad_norm": 0.361277697562596, + "learning_rate": 8.945201137997677e-06, + "loss": 0.5726, + "step": 1799 + }, + { + "epoch": 0.29560897501693595, + "grad_norm": 0.3692868752877909, + "learning_rate": 8.94512814871996e-06, + "loss": 0.593, + "step": 1800 + }, + { + "epoch": 0.2957732022252787, + "grad_norm": 0.4303230153835513, + "learning_rate": 8.945055111163776e-06, + "loss": 0.5879, + "step": 1801 + }, + { + "epoch": 0.2959374294336214, + "grad_norm": 0.36640985236501644, + "learning_rate": 8.94498202532992e-06, + "loss": 0.6146, + "step": 1802 + }, + { + "epoch": 0.29610165664196414, + "grad_norm": 0.38608955906589143, + "learning_rate": 8.944908891219187e-06, + "loss": 0.5973, + "step": 1803 + }, + { + "epoch": 0.2962658838503069, + "grad_norm": 0.345799158653238, + "learning_rate": 8.94483570883237e-06, + "loss": 0.5899, + "step": 1804 + }, + { + "epoch": 0.29643011105864964, + "grad_norm": 0.38703871745196605, + "learning_rate": 8.944762478170264e-06, + "loss": 0.6125, + "step": 1805 + }, + { + "epoch": 0.2965943382669924, + "grad_norm": 0.4328536946290052, + "learning_rate": 8.944689199233665e-06, + "loss": 0.6108, + "step": 1806 + }, + { + "epoch": 0.29675856547533513, + "grad_norm": 0.41970722512068465, + "learning_rate": 8.944615872023367e-06, + "loss": 0.616, + "step": 1807 + }, + { + "epoch": 0.2969227926836779, + "grad_norm": 0.36376878975146204, + "learning_rate": 8.944542496540167e-06, + "loss": 0.5927, + "step": 1808 + }, + { + "epoch": 0.29708701989202063, + "grad_norm": 0.3833035639036213, + "learning_rate": 8.944469072784864e-06, + "loss": 0.5912, + "step": 1809 + }, + { + "epoch": 0.2972512471003633, + "grad_norm": 0.36238546383545633, + "learning_rate": 8.944395600758255e-06, + "loss": 0.6036, + "step": 1810 + }, + { + "epoch": 0.2974154743087061, + "grad_norm": 0.3480258093184134, + "learning_rate": 8.944322080461137e-06, + "loss": 0.616, + "step": 1811 + }, + { + "epoch": 0.2975797015170488, + "grad_norm": 0.3480417118481595, + "learning_rate": 8.944248511894307e-06, + "loss": 0.6033, + "step": 1812 + }, + { + "epoch": 0.29774392872539157, + "grad_norm": 0.33712664821835, + "learning_rate": 8.944174895058567e-06, + "loss": 0.5945, + "step": 1813 + }, + { + "epoch": 0.2979081559337343, + "grad_norm": 0.313027470384797, + "learning_rate": 8.944101229954714e-06, + "loss": 0.5916, + "step": 1814 + }, + { + "epoch": 0.29807238314207707, + "grad_norm": 0.3439276047463889, + "learning_rate": 8.94402751658355e-06, + "loss": 0.6182, + "step": 1815 + }, + { + "epoch": 0.2982366103504198, + "grad_norm": 0.36083957952146445, + "learning_rate": 8.943953754945874e-06, + "loss": 0.6012, + "step": 1816 + }, + { + "epoch": 0.29840083755876257, + "grad_norm": 0.3693169795087396, + "learning_rate": 8.943879945042488e-06, + "loss": 0.6028, + "step": 1817 + }, + { + "epoch": 0.2985650647671053, + "grad_norm": 0.35025691555948874, + "learning_rate": 8.943806086874195e-06, + "loss": 0.6178, + "step": 1818 + }, + { + "epoch": 0.298729291975448, + "grad_norm": 0.30773657041387825, + "learning_rate": 8.943732180441794e-06, + "loss": 0.6097, + "step": 1819 + }, + { + "epoch": 0.29889351918379076, + "grad_norm": 0.3707759522224546, + "learning_rate": 8.94365822574609e-06, + "loss": 0.5885, + "step": 1820 + }, + { + "epoch": 0.2990577463921335, + "grad_norm": 0.35508095599295475, + "learning_rate": 8.943584222787888e-06, + "loss": 0.6205, + "step": 1821 + }, + { + "epoch": 0.29922197360047625, + "grad_norm": 0.33789169443655837, + "learning_rate": 8.943510171567986e-06, + "loss": 0.6159, + "step": 1822 + }, + { + "epoch": 0.299386200808819, + "grad_norm": 0.37893216238689686, + "learning_rate": 8.943436072087195e-06, + "loss": 0.6163, + "step": 1823 + }, + { + "epoch": 0.29955042801716175, + "grad_norm": 0.3149645776504594, + "learning_rate": 8.943361924346313e-06, + "loss": 0.5986, + "step": 1824 + }, + { + "epoch": 0.2997146552255045, + "grad_norm": 0.2981620792000357, + "learning_rate": 8.943287728346151e-06, + "loss": 0.5902, + "step": 1825 + }, + { + "epoch": 0.29987888243384725, + "grad_norm": 0.3527275097586558, + "learning_rate": 8.943213484087512e-06, + "loss": 0.6147, + "step": 1826 + }, + { + "epoch": 0.30004310964218994, + "grad_norm": 0.3273132809209595, + "learning_rate": 8.943139191571203e-06, + "loss": 0.6007, + "step": 1827 + }, + { + "epoch": 0.3002073368505327, + "grad_norm": 0.37250810652301003, + "learning_rate": 8.943064850798031e-06, + "loss": 0.5959, + "step": 1828 + }, + { + "epoch": 0.30037156405887544, + "grad_norm": 0.33906589441259377, + "learning_rate": 8.942990461768805e-06, + "loss": 0.5965, + "step": 1829 + }, + { + "epoch": 0.3005357912672182, + "grad_norm": 0.313977528622849, + "learning_rate": 8.94291602448433e-06, + "loss": 0.5909, + "step": 1830 + }, + { + "epoch": 0.30070001847556094, + "grad_norm": 0.35447541373215136, + "learning_rate": 8.942841538945415e-06, + "loss": 0.5942, + "step": 1831 + }, + { + "epoch": 0.3008642456839037, + "grad_norm": 0.3184211563502309, + "learning_rate": 8.94276700515287e-06, + "loss": 0.6229, + "step": 1832 + }, + { + "epoch": 0.30102847289224643, + "grad_norm": 0.3279512969542947, + "learning_rate": 8.942692423107506e-06, + "loss": 0.5918, + "step": 1833 + }, + { + "epoch": 0.3011927001005892, + "grad_norm": 0.3862938159831846, + "learning_rate": 8.94261779281013e-06, + "loss": 0.631, + "step": 1834 + }, + { + "epoch": 0.30135692730893193, + "grad_norm": 0.9431248577218457, + "learning_rate": 8.942543114261552e-06, + "loss": 0.5946, + "step": 1835 + }, + { + "epoch": 0.3015211545172746, + "grad_norm": 0.37485001496427883, + "learning_rate": 8.942468387462588e-06, + "loss": 0.6025, + "step": 1836 + }, + { + "epoch": 0.3016853817256174, + "grad_norm": 0.32292549370149765, + "learning_rate": 8.942393612414045e-06, + "loss": 0.6107, + "step": 1837 + }, + { + "epoch": 0.3018496089339601, + "grad_norm": 0.3677915274414714, + "learning_rate": 8.942318789116736e-06, + "loss": 0.5722, + "step": 1838 + }, + { + "epoch": 0.30201383614230287, + "grad_norm": 0.31430531461192124, + "learning_rate": 8.942243917571474e-06, + "loss": 0.5984, + "step": 1839 + }, + { + "epoch": 0.3021780633506456, + "grad_norm": 0.320090326229002, + "learning_rate": 8.942168997779075e-06, + "loss": 0.5669, + "step": 1840 + }, + { + "epoch": 0.30234229055898837, + "grad_norm": 0.4661611655990814, + "learning_rate": 8.942094029740347e-06, + "loss": 0.5949, + "step": 1841 + }, + { + "epoch": 0.3025065177673311, + "grad_norm": 0.35595856183511837, + "learning_rate": 8.94201901345611e-06, + "loss": 0.6392, + "step": 1842 + }, + { + "epoch": 0.30267074497567387, + "grad_norm": 0.34468323175899396, + "learning_rate": 8.941943948927175e-06, + "loss": 0.6319, + "step": 1843 + }, + { + "epoch": 0.30283497218401656, + "grad_norm": 0.3283822883044266, + "learning_rate": 8.94186883615436e-06, + "loss": 0.633, + "step": 1844 + }, + { + "epoch": 0.3029991993923593, + "grad_norm": 0.3211390353728499, + "learning_rate": 8.941793675138477e-06, + "loss": 0.5856, + "step": 1845 + }, + { + "epoch": 0.30316342660070206, + "grad_norm": 0.38373627857810344, + "learning_rate": 8.941718465880344e-06, + "loss": 0.6072, + "step": 1846 + }, + { + "epoch": 0.3033276538090448, + "grad_norm": 0.4349614186299253, + "learning_rate": 8.941643208380781e-06, + "loss": 0.5782, + "step": 1847 + }, + { + "epoch": 0.30349188101738755, + "grad_norm": 0.33020652969405023, + "learning_rate": 8.941567902640602e-06, + "loss": 0.5849, + "step": 1848 + }, + { + "epoch": 0.3036561082257303, + "grad_norm": 0.3768588711697932, + "learning_rate": 8.941492548660625e-06, + "loss": 0.5899, + "step": 1849 + }, + { + "epoch": 0.30382033543407305, + "grad_norm": 0.3160245478293392, + "learning_rate": 8.94141714644167e-06, + "loss": 0.5968, + "step": 1850 + }, + { + "epoch": 0.3039845626424158, + "grad_norm": 0.34318955817414776, + "learning_rate": 8.941341695984554e-06, + "loss": 0.6093, + "step": 1851 + }, + { + "epoch": 0.30414878985075855, + "grad_norm": 0.44061105391992383, + "learning_rate": 8.941266197290098e-06, + "loss": 0.5914, + "step": 1852 + }, + { + "epoch": 0.30431301705910124, + "grad_norm": 0.32870210797503463, + "learning_rate": 8.941190650359121e-06, + "loss": 0.624, + "step": 1853 + }, + { + "epoch": 0.304477244267444, + "grad_norm": 0.30637297274135705, + "learning_rate": 8.941115055192444e-06, + "loss": 0.6058, + "step": 1854 + }, + { + "epoch": 0.30464147147578674, + "grad_norm": 0.32762818726888737, + "learning_rate": 8.941039411790888e-06, + "loss": 0.6015, + "step": 1855 + }, + { + "epoch": 0.3048056986841295, + "grad_norm": 0.3731775719566123, + "learning_rate": 8.940963720155276e-06, + "loss": 0.603, + "step": 1856 + }, + { + "epoch": 0.30496992589247224, + "grad_norm": 0.346190637525773, + "learning_rate": 8.940887980286428e-06, + "loss": 0.5789, + "step": 1857 + }, + { + "epoch": 0.305134153100815, + "grad_norm": 0.36066973926340856, + "learning_rate": 8.940812192185166e-06, + "loss": 0.6123, + "step": 1858 + }, + { + "epoch": 0.30529838030915774, + "grad_norm": 0.34442243351119006, + "learning_rate": 8.940736355852316e-06, + "loss": 0.5858, + "step": 1859 + }, + { + "epoch": 0.3054626075175005, + "grad_norm": 0.30796542041483227, + "learning_rate": 8.9406604712887e-06, + "loss": 0.6035, + "step": 1860 + }, + { + "epoch": 0.3056268347258432, + "grad_norm": 0.3052034273976577, + "learning_rate": 8.94058453849514e-06, + "loss": 0.5995, + "step": 1861 + }, + { + "epoch": 0.3057910619341859, + "grad_norm": 0.3136793732786086, + "learning_rate": 8.940508557472466e-06, + "loss": 0.6061, + "step": 1862 + }, + { + "epoch": 0.3059552891425287, + "grad_norm": 0.3088594677892597, + "learning_rate": 8.940432528221499e-06, + "loss": 0.5707, + "step": 1863 + }, + { + "epoch": 0.3061195163508714, + "grad_norm": 0.33833389134184366, + "learning_rate": 8.940356450743065e-06, + "loss": 0.5839, + "step": 1864 + }, + { + "epoch": 0.30628374355921417, + "grad_norm": 0.3889948728445, + "learning_rate": 8.940280325037992e-06, + "loss": 0.606, + "step": 1865 + }, + { + "epoch": 0.3064479707675569, + "grad_norm": 0.30634134506963684, + "learning_rate": 8.940204151107106e-06, + "loss": 0.5845, + "step": 1866 + }, + { + "epoch": 0.30661219797589967, + "grad_norm": 0.33450251258079644, + "learning_rate": 8.940127928951235e-06, + "loss": 0.615, + "step": 1867 + }, + { + "epoch": 0.3067764251842424, + "grad_norm": 0.33318859011481017, + "learning_rate": 8.940051658571205e-06, + "loss": 0.5912, + "step": 1868 + }, + { + "epoch": 0.30694065239258517, + "grad_norm": 0.33002590261509496, + "learning_rate": 8.939975339967846e-06, + "loss": 0.6006, + "step": 1869 + }, + { + "epoch": 0.30710487960092786, + "grad_norm": 0.381727301651218, + "learning_rate": 8.939898973141987e-06, + "loss": 0.5911, + "step": 1870 + }, + { + "epoch": 0.3072691068092706, + "grad_norm": 0.3523516962800485, + "learning_rate": 8.939822558094456e-06, + "loss": 0.6086, + "step": 1871 + }, + { + "epoch": 0.30743333401761336, + "grad_norm": 0.3417375521609287, + "learning_rate": 8.939746094826085e-06, + "loss": 0.5858, + "step": 1872 + }, + { + "epoch": 0.3075975612259561, + "grad_norm": 0.3338860766579545, + "learning_rate": 8.939669583337703e-06, + "loss": 0.61, + "step": 1873 + }, + { + "epoch": 0.30776178843429886, + "grad_norm": 0.32800363065689425, + "learning_rate": 8.93959302363014e-06, + "loss": 0.6232, + "step": 1874 + }, + { + "epoch": 0.3079260156426416, + "grad_norm": 0.3791074578049409, + "learning_rate": 8.939516415704231e-06, + "loss": 0.6074, + "step": 1875 + }, + { + "epoch": 0.30809024285098435, + "grad_norm": 0.32136934807801004, + "learning_rate": 8.939439759560805e-06, + "loss": 0.6395, + "step": 1876 + }, + { + "epoch": 0.3082544700593271, + "grad_norm": 0.3308724841591282, + "learning_rate": 8.939363055200693e-06, + "loss": 0.5923, + "step": 1877 + }, + { + "epoch": 0.3084186972676698, + "grad_norm": 0.3039170961510034, + "learning_rate": 8.939286302624734e-06, + "loss": 0.5927, + "step": 1878 + }, + { + "epoch": 0.30858292447601254, + "grad_norm": 0.3139854071425278, + "learning_rate": 8.939209501833755e-06, + "loss": 0.6029, + "step": 1879 + }, + { + "epoch": 0.3087471516843553, + "grad_norm": 0.32362328635050663, + "learning_rate": 8.939132652828596e-06, + "loss": 0.5967, + "step": 1880 + }, + { + "epoch": 0.30891137889269804, + "grad_norm": 0.30376582477672504, + "learning_rate": 8.939055755610087e-06, + "loss": 0.6067, + "step": 1881 + }, + { + "epoch": 0.3090756061010408, + "grad_norm": 0.3088804302395554, + "learning_rate": 8.938978810179066e-06, + "loss": 0.5968, + "step": 1882 + }, + { + "epoch": 0.30923983330938354, + "grad_norm": 0.3182885237046062, + "learning_rate": 8.938901816536367e-06, + "loss": 0.6052, + "step": 1883 + }, + { + "epoch": 0.3094040605177263, + "grad_norm": 0.334401122678781, + "learning_rate": 8.938824774682829e-06, + "loss": 0.5915, + "step": 1884 + }, + { + "epoch": 0.30956828772606904, + "grad_norm": 0.33028895160076666, + "learning_rate": 8.938747684619284e-06, + "loss": 0.609, + "step": 1885 + }, + { + "epoch": 0.3097325149344118, + "grad_norm": 0.3116708839296472, + "learning_rate": 8.938670546346574e-06, + "loss": 0.5787, + "step": 1886 + }, + { + "epoch": 0.3098967421427545, + "grad_norm": 0.29246076671142657, + "learning_rate": 8.938593359865533e-06, + "loss": 0.5822, + "step": 1887 + }, + { + "epoch": 0.3100609693510972, + "grad_norm": 0.3222058679707449, + "learning_rate": 8.938516125177002e-06, + "loss": 0.6057, + "step": 1888 + }, + { + "epoch": 0.31022519655944, + "grad_norm": 0.3983985286357778, + "learning_rate": 8.938438842281819e-06, + "loss": 0.61, + "step": 1889 + }, + { + "epoch": 0.3103894237677827, + "grad_norm": 0.360508351280119, + "learning_rate": 8.938361511180823e-06, + "loss": 0.5955, + "step": 1890 + }, + { + "epoch": 0.3105536509761255, + "grad_norm": 0.333327409913908, + "learning_rate": 8.938284131874856e-06, + "loss": 0.5741, + "step": 1891 + }, + { + "epoch": 0.3107178781844682, + "grad_norm": 0.32873570321422285, + "learning_rate": 8.938206704364754e-06, + "loss": 0.5719, + "step": 1892 + }, + { + "epoch": 0.31088210539281097, + "grad_norm": 0.33211514766374295, + "learning_rate": 8.938129228651361e-06, + "loss": 0.5712, + "step": 1893 + }, + { + "epoch": 0.3110463326011537, + "grad_norm": 0.37887895546631883, + "learning_rate": 8.938051704735521e-06, + "loss": 0.5768, + "step": 1894 + }, + { + "epoch": 0.3112105598094964, + "grad_norm": 0.3034162951190907, + "learning_rate": 8.93797413261807e-06, + "loss": 0.6036, + "step": 1895 + }, + { + "epoch": 0.31137478701783916, + "grad_norm": 0.2969354168845284, + "learning_rate": 8.937896512299854e-06, + "loss": 0.5692, + "step": 1896 + }, + { + "epoch": 0.3115390142261819, + "grad_norm": 0.3212729936642514, + "learning_rate": 8.937818843781717e-06, + "loss": 0.5753, + "step": 1897 + }, + { + "epoch": 0.31170324143452466, + "grad_norm": 0.31974316763883187, + "learning_rate": 8.9377411270645e-06, + "loss": 0.593, + "step": 1898 + }, + { + "epoch": 0.3118674686428674, + "grad_norm": 0.427602217890422, + "learning_rate": 8.937663362149048e-06, + "loss": 0.6011, + "step": 1899 + }, + { + "epoch": 0.31203169585121016, + "grad_norm": 0.32321832135134826, + "learning_rate": 8.937585549036207e-06, + "loss": 0.5819, + "step": 1900 + }, + { + "epoch": 0.3121959230595529, + "grad_norm": 0.3977562088177384, + "learning_rate": 8.93750768772682e-06, + "loss": 0.5982, + "step": 1901 + }, + { + "epoch": 0.31236015026789565, + "grad_norm": 0.34126358407384527, + "learning_rate": 8.937429778221733e-06, + "loss": 0.6179, + "step": 1902 + }, + { + "epoch": 0.3125243774762384, + "grad_norm": 0.34038697596903955, + "learning_rate": 8.937351820521793e-06, + "loss": 0.5826, + "step": 1903 + }, + { + "epoch": 0.3126886046845811, + "grad_norm": 0.382868319329483, + "learning_rate": 8.937273814627848e-06, + "loss": 0.5834, + "step": 1904 + }, + { + "epoch": 0.31285283189292384, + "grad_norm": 0.32282662203740864, + "learning_rate": 8.937195760540742e-06, + "loss": 0.5991, + "step": 1905 + }, + { + "epoch": 0.3130170591012666, + "grad_norm": 0.319661355229016, + "learning_rate": 8.937117658261326e-06, + "loss": 0.5726, + "step": 1906 + }, + { + "epoch": 0.31318128630960934, + "grad_norm": 0.34430211300551833, + "learning_rate": 8.937039507790446e-06, + "loss": 0.5983, + "step": 1907 + }, + { + "epoch": 0.3133455135179521, + "grad_norm": 0.38993287400775706, + "learning_rate": 8.936961309128951e-06, + "loss": 0.5928, + "step": 1908 + }, + { + "epoch": 0.31350974072629484, + "grad_norm": 0.4499590084430486, + "learning_rate": 8.93688306227769e-06, + "loss": 0.6112, + "step": 1909 + }, + { + "epoch": 0.3136739679346376, + "grad_norm": 0.35068628407856295, + "learning_rate": 8.936804767237515e-06, + "loss": 0.582, + "step": 1910 + }, + { + "epoch": 0.31383819514298034, + "grad_norm": 0.32678889633656605, + "learning_rate": 8.936726424009275e-06, + "loss": 0.5945, + "step": 1911 + }, + { + "epoch": 0.31400242235132303, + "grad_norm": 0.3427217699640625, + "learning_rate": 8.93664803259382e-06, + "loss": 0.6034, + "step": 1912 + }, + { + "epoch": 0.3141666495596658, + "grad_norm": 0.3750821944895772, + "learning_rate": 8.936569592992003e-06, + "loss": 0.5906, + "step": 1913 + }, + { + "epoch": 0.3143308767680085, + "grad_norm": 0.3733784173782264, + "learning_rate": 8.936491105204675e-06, + "loss": 0.6176, + "step": 1914 + }, + { + "epoch": 0.3144951039763513, + "grad_norm": 0.3463030952400915, + "learning_rate": 8.936412569232689e-06, + "loss": 0.5843, + "step": 1915 + }, + { + "epoch": 0.314659331184694, + "grad_norm": 0.3750041213901857, + "learning_rate": 8.936333985076898e-06, + "loss": 0.6075, + "step": 1916 + }, + { + "epoch": 0.3148235583930368, + "grad_norm": 0.3245755348831032, + "learning_rate": 8.936255352738155e-06, + "loss": 0.5803, + "step": 1917 + }, + { + "epoch": 0.3149877856013795, + "grad_norm": 0.3287569137043314, + "learning_rate": 8.936176672217314e-06, + "loss": 0.5633, + "step": 1918 + }, + { + "epoch": 0.31515201280972227, + "grad_norm": 0.34395092289289697, + "learning_rate": 8.936097943515229e-06, + "loss": 0.6011, + "step": 1919 + }, + { + "epoch": 0.315316240018065, + "grad_norm": 0.339337024599154, + "learning_rate": 8.936019166632757e-06, + "loss": 0.5962, + "step": 1920 + }, + { + "epoch": 0.3154804672264077, + "grad_norm": 0.38368448835939173, + "learning_rate": 8.935940341570752e-06, + "loss": 0.5859, + "step": 1921 + }, + { + "epoch": 0.31564469443475046, + "grad_norm": 0.36731857860519107, + "learning_rate": 8.93586146833007e-06, + "loss": 0.5962, + "step": 1922 + }, + { + "epoch": 0.3158089216430932, + "grad_norm": 0.32553923326385664, + "learning_rate": 8.935782546911568e-06, + "loss": 0.5815, + "step": 1923 + }, + { + "epoch": 0.31597314885143596, + "grad_norm": 0.32593913183393547, + "learning_rate": 8.935703577316105e-06, + "loss": 0.5998, + "step": 1924 + }, + { + "epoch": 0.3161373760597787, + "grad_norm": 0.3369591114430366, + "learning_rate": 8.935624559544534e-06, + "loss": 0.5987, + "step": 1925 + }, + { + "epoch": 0.31630160326812146, + "grad_norm": 0.34102164790399514, + "learning_rate": 8.935545493597719e-06, + "loss": 0.5898, + "step": 1926 + }, + { + "epoch": 0.3164658304764642, + "grad_norm": 0.44631235074440223, + "learning_rate": 8.935466379476515e-06, + "loss": 0.6025, + "step": 1927 + }, + { + "epoch": 0.31663005768480695, + "grad_norm": 0.3522044856839867, + "learning_rate": 8.93538721718178e-06, + "loss": 0.6083, + "step": 1928 + }, + { + "epoch": 0.31679428489314965, + "grad_norm": 0.3472773544596263, + "learning_rate": 8.93530800671438e-06, + "loss": 0.5754, + "step": 1929 + }, + { + "epoch": 0.3169585121014924, + "grad_norm": 0.4215209510654946, + "learning_rate": 8.93522874807517e-06, + "loss": 0.6199, + "step": 1930 + }, + { + "epoch": 0.31712273930983514, + "grad_norm": 0.33209722753968446, + "learning_rate": 8.935149441265012e-06, + "loss": 0.6037, + "step": 1931 + }, + { + "epoch": 0.3172869665181779, + "grad_norm": 0.5349339542080866, + "learning_rate": 8.935070086284766e-06, + "loss": 0.587, + "step": 1932 + }, + { + "epoch": 0.31745119372652064, + "grad_norm": 0.3410062528635434, + "learning_rate": 8.934990683135297e-06, + "loss": 0.5932, + "step": 1933 + }, + { + "epoch": 0.3176154209348634, + "grad_norm": 0.3408786107621934, + "learning_rate": 8.934911231817464e-06, + "loss": 0.609, + "step": 1934 + }, + { + "epoch": 0.31777964814320614, + "grad_norm": 0.3219443504322176, + "learning_rate": 8.934831732332133e-06, + "loss": 0.5974, + "step": 1935 + }, + { + "epoch": 0.3179438753515489, + "grad_norm": 0.40622542122388705, + "learning_rate": 8.934752184680166e-06, + "loss": 0.5862, + "step": 1936 + }, + { + "epoch": 0.31810810255989164, + "grad_norm": 0.32480511259589073, + "learning_rate": 8.934672588862426e-06, + "loss": 0.564, + "step": 1937 + }, + { + "epoch": 0.31827232976823433, + "grad_norm": 0.3076637039890817, + "learning_rate": 8.93459294487978e-06, + "loss": 0.5945, + "step": 1938 + }, + { + "epoch": 0.3184365569765771, + "grad_norm": 0.33152318749385223, + "learning_rate": 8.934513252733091e-06, + "loss": 0.5637, + "step": 1939 + }, + { + "epoch": 0.3186007841849198, + "grad_norm": 0.3318923807049237, + "learning_rate": 8.934433512423224e-06, + "loss": 0.6095, + "step": 1940 + }, + { + "epoch": 0.3187650113932626, + "grad_norm": 0.37782491111307404, + "learning_rate": 8.934353723951049e-06, + "loss": 0.6057, + "step": 1941 + }, + { + "epoch": 0.3189292386016053, + "grad_norm": 0.36785165188784136, + "learning_rate": 8.934273887317427e-06, + "loss": 0.6034, + "step": 1942 + }, + { + "epoch": 0.3190934658099481, + "grad_norm": 0.3129470298856957, + "learning_rate": 8.93419400252323e-06, + "loss": 0.6043, + "step": 1943 + }, + { + "epoch": 0.3192576930182908, + "grad_norm": 0.36195379839111386, + "learning_rate": 8.934114069569321e-06, + "loss": 0.5935, + "step": 1944 + }, + { + "epoch": 0.31942192022663357, + "grad_norm": 0.34969714763824405, + "learning_rate": 8.934034088456573e-06, + "loss": 0.582, + "step": 1945 + }, + { + "epoch": 0.31958614743497626, + "grad_norm": 0.3143958189825889, + "learning_rate": 8.93395405918585e-06, + "loss": 0.5904, + "step": 1946 + }, + { + "epoch": 0.319750374643319, + "grad_norm": 0.3676003778377262, + "learning_rate": 8.933873981758026e-06, + "loss": 0.5981, + "step": 1947 + }, + { + "epoch": 0.31991460185166176, + "grad_norm": 0.29483469959922937, + "learning_rate": 8.933793856173966e-06, + "loss": 0.6051, + "step": 1948 + }, + { + "epoch": 0.3200788290600045, + "grad_norm": 0.32753721487164505, + "learning_rate": 8.933713682434545e-06, + "loss": 0.5924, + "step": 1949 + }, + { + "epoch": 0.32024305626834726, + "grad_norm": 0.33750303418011973, + "learning_rate": 8.93363346054063e-06, + "loss": 0.5933, + "step": 1950 + }, + { + "epoch": 0.32040728347669, + "grad_norm": 0.31414417859725996, + "learning_rate": 8.933553190493092e-06, + "loss": 0.5742, + "step": 1951 + }, + { + "epoch": 0.32057151068503276, + "grad_norm": 0.3123828711273527, + "learning_rate": 8.933472872292805e-06, + "loss": 0.5843, + "step": 1952 + }, + { + "epoch": 0.3207357378933755, + "grad_norm": 0.4409976229317403, + "learning_rate": 8.933392505940643e-06, + "loss": 0.5793, + "step": 1953 + }, + { + "epoch": 0.32089996510171825, + "grad_norm": 0.30482982633884265, + "learning_rate": 8.933312091437473e-06, + "loss": 0.5888, + "step": 1954 + }, + { + "epoch": 0.32106419231006095, + "grad_norm": 0.4277019006668714, + "learning_rate": 8.933231628784174e-06, + "loss": 0.5929, + "step": 1955 + }, + { + "epoch": 0.3212284195184037, + "grad_norm": 0.35975629174938006, + "learning_rate": 8.933151117981617e-06, + "loss": 0.5996, + "step": 1956 + }, + { + "epoch": 0.32139264672674644, + "grad_norm": 0.33774220319360165, + "learning_rate": 8.933070559030678e-06, + "loss": 0.6049, + "step": 1957 + }, + { + "epoch": 0.3215568739350892, + "grad_norm": 0.34493943664636884, + "learning_rate": 8.93298995193223e-06, + "loss": 0.5892, + "step": 1958 + }, + { + "epoch": 0.32172110114343194, + "grad_norm": 0.3969135510220126, + "learning_rate": 8.93290929668715e-06, + "loss": 0.598, + "step": 1959 + }, + { + "epoch": 0.3218853283517747, + "grad_norm": 0.3308507690751659, + "learning_rate": 8.932828593296315e-06, + "loss": 0.5872, + "step": 1960 + }, + { + "epoch": 0.32204955556011744, + "grad_norm": 0.33602909569345507, + "learning_rate": 8.9327478417606e-06, + "loss": 0.5769, + "step": 1961 + }, + { + "epoch": 0.3222137827684602, + "grad_norm": 0.3928903161584532, + "learning_rate": 8.932667042080881e-06, + "loss": 0.5826, + "step": 1962 + }, + { + "epoch": 0.3223780099768029, + "grad_norm": 0.3503559259230569, + "learning_rate": 8.932586194258038e-06, + "loss": 0.5986, + "step": 1963 + }, + { + "epoch": 0.32254223718514563, + "grad_norm": 0.3413850568357385, + "learning_rate": 8.932505298292945e-06, + "loss": 0.572, + "step": 1964 + }, + { + "epoch": 0.3227064643934884, + "grad_norm": 0.33703606476691533, + "learning_rate": 8.932424354186486e-06, + "loss": 0.5834, + "step": 1965 + }, + { + "epoch": 0.32287069160183113, + "grad_norm": 0.3213463785765971, + "learning_rate": 8.932343361939538e-06, + "loss": 0.601, + "step": 1966 + }, + { + "epoch": 0.3230349188101739, + "grad_norm": 0.30437587809999017, + "learning_rate": 8.93226232155298e-06, + "loss": 0.591, + "step": 1967 + }, + { + "epoch": 0.3231991460185166, + "grad_norm": 0.44379718365544524, + "learning_rate": 8.932181233027692e-06, + "loss": 0.573, + "step": 1968 + }, + { + "epoch": 0.3233633732268594, + "grad_norm": 0.44330142263154393, + "learning_rate": 8.932100096364554e-06, + "loss": 0.6035, + "step": 1969 + }, + { + "epoch": 0.3235276004352021, + "grad_norm": 0.3358028448333551, + "learning_rate": 8.93201891156445e-06, + "loss": 0.5983, + "step": 1970 + }, + { + "epoch": 0.32369182764354487, + "grad_norm": 0.366694044935094, + "learning_rate": 8.931937678628258e-06, + "loss": 0.6014, + "step": 1971 + }, + { + "epoch": 0.32385605485188756, + "grad_norm": 0.31338557422441116, + "learning_rate": 8.931856397556865e-06, + "loss": 0.5766, + "step": 1972 + }, + { + "epoch": 0.3240202820602303, + "grad_norm": 0.2903187762457536, + "learning_rate": 8.931775068351149e-06, + "loss": 0.5789, + "step": 1973 + }, + { + "epoch": 0.32418450926857306, + "grad_norm": 0.3254260429214987, + "learning_rate": 8.931693691011998e-06, + "loss": 0.5892, + "step": 1974 + }, + { + "epoch": 0.3243487364769158, + "grad_norm": 0.3428275678284653, + "learning_rate": 8.931612265540291e-06, + "loss": 0.5984, + "step": 1975 + }, + { + "epoch": 0.32451296368525856, + "grad_norm": 0.30397612315020456, + "learning_rate": 8.931530791936915e-06, + "loss": 0.5831, + "step": 1976 + }, + { + "epoch": 0.3246771908936013, + "grad_norm": 0.4572089137256494, + "learning_rate": 8.931449270202756e-06, + "loss": 0.5819, + "step": 1977 + }, + { + "epoch": 0.32484141810194406, + "grad_norm": 0.36966275950806693, + "learning_rate": 8.931367700338696e-06, + "loss": 0.5889, + "step": 1978 + }, + { + "epoch": 0.3250056453102868, + "grad_norm": 0.3415927987872072, + "learning_rate": 8.931286082345625e-06, + "loss": 0.5923, + "step": 1979 + }, + { + "epoch": 0.3251698725186295, + "grad_norm": 0.3627702364467653, + "learning_rate": 8.931204416224426e-06, + "loss": 0.6089, + "step": 1980 + }, + { + "epoch": 0.32533409972697225, + "grad_norm": 0.340328174457014, + "learning_rate": 8.931122701975987e-06, + "loss": 0.5923, + "step": 1981 + }, + { + "epoch": 0.325498326935315, + "grad_norm": 0.32978423355398195, + "learning_rate": 8.931040939601196e-06, + "loss": 0.5914, + "step": 1982 + }, + { + "epoch": 0.32566255414365775, + "grad_norm": 0.3543467620738116, + "learning_rate": 8.930959129100941e-06, + "loss": 0.5821, + "step": 1983 + }, + { + "epoch": 0.3258267813520005, + "grad_norm": 0.5390093529285701, + "learning_rate": 8.930877270476112e-06, + "loss": 0.5872, + "step": 1984 + }, + { + "epoch": 0.32599100856034324, + "grad_norm": 0.3290275508548655, + "learning_rate": 8.930795363727595e-06, + "loss": 0.588, + "step": 1985 + }, + { + "epoch": 0.326155235768686, + "grad_norm": 0.31186173084525426, + "learning_rate": 8.930713408856281e-06, + "loss": 0.6012, + "step": 1986 + }, + { + "epoch": 0.32631946297702874, + "grad_norm": 0.32941053882022925, + "learning_rate": 8.930631405863059e-06, + "loss": 0.5816, + "step": 1987 + }, + { + "epoch": 0.3264836901853715, + "grad_norm": 0.3222654111456884, + "learning_rate": 8.930549354748822e-06, + "loss": 0.5898, + "step": 1988 + }, + { + "epoch": 0.3266479173937142, + "grad_norm": 0.3292664467169063, + "learning_rate": 8.930467255514461e-06, + "loss": 0.5889, + "step": 1989 + }, + { + "epoch": 0.32681214460205693, + "grad_norm": 0.3743999278918246, + "learning_rate": 8.930385108160867e-06, + "loss": 0.6068, + "step": 1990 + }, + { + "epoch": 0.3269763718103997, + "grad_norm": 0.31297732477605195, + "learning_rate": 8.93030291268893e-06, + "loss": 0.603, + "step": 1991 + }, + { + "epoch": 0.32714059901874243, + "grad_norm": 0.33774822533754145, + "learning_rate": 8.930220669099544e-06, + "loss": 0.5755, + "step": 1992 + }, + { + "epoch": 0.3273048262270852, + "grad_norm": 0.3403085046460094, + "learning_rate": 8.930138377393604e-06, + "loss": 0.5925, + "step": 1993 + }, + { + "epoch": 0.3274690534354279, + "grad_norm": 0.48517931840712203, + "learning_rate": 8.930056037572002e-06, + "loss": 0.5873, + "step": 1994 + }, + { + "epoch": 0.3276332806437707, + "grad_norm": 0.30851335436470095, + "learning_rate": 8.929973649635633e-06, + "loss": 0.5701, + "step": 1995 + }, + { + "epoch": 0.3277975078521134, + "grad_norm": 0.3451118082788753, + "learning_rate": 8.929891213585391e-06, + "loss": 0.5815, + "step": 1996 + }, + { + "epoch": 0.3279617350604561, + "grad_norm": 0.3343156174433525, + "learning_rate": 8.929808729422172e-06, + "loss": 0.5835, + "step": 1997 + }, + { + "epoch": 0.32812596226879887, + "grad_norm": 0.31129871158266786, + "learning_rate": 8.929726197146872e-06, + "loss": 0.5938, + "step": 1998 + }, + { + "epoch": 0.3282901894771416, + "grad_norm": 0.30126632572026746, + "learning_rate": 8.92964361676039e-06, + "loss": 0.5936, + "step": 1999 + }, + { + "epoch": 0.32845441668548436, + "grad_norm": 0.5638600322215224, + "learning_rate": 8.929560988263617e-06, + "loss": 0.5845, + "step": 2000 + }, + { + "epoch": 0.3286186438938271, + "grad_norm": 0.3554185177476191, + "learning_rate": 8.929478311657455e-06, + "loss": 0.5588, + "step": 2001 + }, + { + "epoch": 0.32878287110216986, + "grad_norm": 0.33610749379303184, + "learning_rate": 8.9293955869428e-06, + "loss": 0.5824, + "step": 2002 + }, + { + "epoch": 0.3289470983105126, + "grad_norm": 0.8752215708398972, + "learning_rate": 8.929312814120551e-06, + "loss": 0.6125, + "step": 2003 + }, + { + "epoch": 0.32911132551885536, + "grad_norm": 0.3846513114226988, + "learning_rate": 8.929229993191608e-06, + "loss": 0.5778, + "step": 2004 + }, + { + "epoch": 0.32927555272719805, + "grad_norm": 0.3111081213681579, + "learning_rate": 8.929147124156869e-06, + "loss": 0.5772, + "step": 2005 + }, + { + "epoch": 0.3294397799355408, + "grad_norm": 0.3280468513105951, + "learning_rate": 8.929064207017233e-06, + "loss": 0.5855, + "step": 2006 + }, + { + "epoch": 0.32960400714388355, + "grad_norm": 0.32684896198790714, + "learning_rate": 8.928981241773603e-06, + "loss": 0.5722, + "step": 2007 + }, + { + "epoch": 0.3297682343522263, + "grad_norm": 0.32752500433087867, + "learning_rate": 8.92889822842688e-06, + "loss": 0.5977, + "step": 2008 + }, + { + "epoch": 0.32993246156056905, + "grad_norm": 0.3142469749217832, + "learning_rate": 8.928815166977964e-06, + "loss": 0.5972, + "step": 2009 + }, + { + "epoch": 0.3300966887689118, + "grad_norm": 0.34216065189885336, + "learning_rate": 8.928732057427757e-06, + "loss": 0.6017, + "step": 2010 + }, + { + "epoch": 0.33026091597725454, + "grad_norm": 0.33522533076340355, + "learning_rate": 8.928648899777165e-06, + "loss": 0.5883, + "step": 2011 + }, + { + "epoch": 0.3304251431855973, + "grad_norm": 0.4104881167330931, + "learning_rate": 8.928565694027086e-06, + "loss": 0.5868, + "step": 2012 + }, + { + "epoch": 0.33058937039394004, + "grad_norm": 0.323178569938277, + "learning_rate": 8.928482440178428e-06, + "loss": 0.6021, + "step": 2013 + }, + { + "epoch": 0.33075359760228273, + "grad_norm": 0.3731243392644556, + "learning_rate": 8.928399138232094e-06, + "loss": 0.577, + "step": 2014 + }, + { + "epoch": 0.3309178248106255, + "grad_norm": 0.30986603573501476, + "learning_rate": 8.928315788188989e-06, + "loss": 0.6136, + "step": 2015 + }, + { + "epoch": 0.33108205201896823, + "grad_norm": 0.33958219563081765, + "learning_rate": 8.928232390050015e-06, + "loss": 0.5737, + "step": 2016 + }, + { + "epoch": 0.331246279227311, + "grad_norm": 0.328627168041723, + "learning_rate": 8.928148943816084e-06, + "loss": 0.5868, + "step": 2017 + }, + { + "epoch": 0.33141050643565373, + "grad_norm": 0.3176779588770702, + "learning_rate": 8.928065449488096e-06, + "loss": 0.5769, + "step": 2018 + }, + { + "epoch": 0.3315747336439965, + "grad_norm": 0.41809285685467873, + "learning_rate": 8.927981907066961e-06, + "loss": 0.5899, + "step": 2019 + }, + { + "epoch": 0.3317389608523392, + "grad_norm": 0.36709251595651227, + "learning_rate": 8.927898316553586e-06, + "loss": 0.604, + "step": 2020 + }, + { + "epoch": 0.331903188060682, + "grad_norm": 0.3274955175203132, + "learning_rate": 8.927814677948879e-06, + "loss": 0.6012, + "step": 2021 + }, + { + "epoch": 0.33206741526902467, + "grad_norm": 0.2832864673661248, + "learning_rate": 8.92773099125375e-06, + "loss": 0.5942, + "step": 2022 + }, + { + "epoch": 0.3322316424773674, + "grad_norm": 0.30603924362675944, + "learning_rate": 8.927647256469104e-06, + "loss": 0.5925, + "step": 2023 + }, + { + "epoch": 0.33239586968571017, + "grad_norm": 0.33180800499144236, + "learning_rate": 8.927563473595853e-06, + "loss": 0.5823, + "step": 2024 + }, + { + "epoch": 0.3325600968940529, + "grad_norm": 0.31278550416452017, + "learning_rate": 8.927479642634906e-06, + "loss": 0.5746, + "step": 2025 + }, + { + "epoch": 0.33272432410239566, + "grad_norm": 0.6370773833935756, + "learning_rate": 8.927395763587175e-06, + "loss": 0.5944, + "step": 2026 + }, + { + "epoch": 0.3328885513107384, + "grad_norm": 0.3793789597731592, + "learning_rate": 8.927311836453569e-06, + "loss": 0.6057, + "step": 2027 + }, + { + "epoch": 0.33305277851908116, + "grad_norm": 0.32422037386343944, + "learning_rate": 8.927227861235002e-06, + "loss": 0.5886, + "step": 2028 + }, + { + "epoch": 0.3332170057274239, + "grad_norm": 0.41822762137690095, + "learning_rate": 8.927143837932384e-06, + "loss": 0.5775, + "step": 2029 + }, + { + "epoch": 0.33338123293576666, + "grad_norm": 0.3339592815475642, + "learning_rate": 8.927059766546627e-06, + "loss": 0.5943, + "step": 2030 + }, + { + "epoch": 0.33354546014410935, + "grad_norm": 0.4645135259033762, + "learning_rate": 8.926975647078648e-06, + "loss": 0.6036, + "step": 2031 + }, + { + "epoch": 0.3337096873524521, + "grad_norm": 0.30158131377366604, + "learning_rate": 8.926891479529356e-06, + "loss": 0.5819, + "step": 2032 + }, + { + "epoch": 0.33387391456079485, + "grad_norm": 0.46237162147584415, + "learning_rate": 8.926807263899665e-06, + "loss": 0.5877, + "step": 2033 + }, + { + "epoch": 0.3340381417691376, + "grad_norm": 0.3169681472015116, + "learning_rate": 8.926723000190496e-06, + "loss": 0.59, + "step": 2034 + }, + { + "epoch": 0.33420236897748035, + "grad_norm": 0.3515972769278677, + "learning_rate": 8.926638688402759e-06, + "loss": 0.5732, + "step": 2035 + }, + { + "epoch": 0.3343665961858231, + "grad_norm": 0.32805529347869533, + "learning_rate": 8.926554328537368e-06, + "loss": 0.5825, + "step": 2036 + }, + { + "epoch": 0.33453082339416584, + "grad_norm": 0.32908781830163236, + "learning_rate": 8.926469920595243e-06, + "loss": 0.5955, + "step": 2037 + }, + { + "epoch": 0.3346950506025086, + "grad_norm": 0.8047963527089921, + "learning_rate": 8.9263854645773e-06, + "loss": 0.5999, + "step": 2038 + }, + { + "epoch": 0.3348592778108513, + "grad_norm": 0.2904460170060317, + "learning_rate": 8.926300960484457e-06, + "loss": 0.585, + "step": 2039 + }, + { + "epoch": 0.33502350501919403, + "grad_norm": 0.32512733927564513, + "learning_rate": 8.92621640831763e-06, + "loss": 0.6103, + "step": 2040 + }, + { + "epoch": 0.3351877322275368, + "grad_norm": 0.620086893708396, + "learning_rate": 8.926131808077737e-06, + "loss": 0.613, + "step": 2041 + }, + { + "epoch": 0.33535195943587953, + "grad_norm": 0.3439753681608022, + "learning_rate": 8.926047159765699e-06, + "loss": 0.5954, + "step": 2042 + }, + { + "epoch": 0.3355161866442223, + "grad_norm": 0.30302759955620057, + "learning_rate": 8.925962463382433e-06, + "loss": 0.588, + "step": 2043 + }, + { + "epoch": 0.33568041385256503, + "grad_norm": 0.34048898804704547, + "learning_rate": 8.92587771892886e-06, + "loss": 0.6006, + "step": 2044 + }, + { + "epoch": 0.3358446410609078, + "grad_norm": 0.35733144461693095, + "learning_rate": 8.925792926405903e-06, + "loss": 0.5708, + "step": 2045 + }, + { + "epoch": 0.3360088682692505, + "grad_norm": 0.3100918297713711, + "learning_rate": 8.925708085814478e-06, + "loss": 0.579, + "step": 2046 + }, + { + "epoch": 0.3361730954775933, + "grad_norm": 0.31010557216776774, + "learning_rate": 8.925623197155508e-06, + "loss": 0.6043, + "step": 2047 + }, + { + "epoch": 0.33633732268593597, + "grad_norm": 0.3207717425849682, + "learning_rate": 8.925538260429919e-06, + "loss": 0.5791, + "step": 2048 + }, + { + "epoch": 0.3365015498942787, + "grad_norm": 0.46303927724673305, + "learning_rate": 8.925453275638628e-06, + "loss": 0.5981, + "step": 2049 + }, + { + "epoch": 0.33666577710262147, + "grad_norm": 0.32519157260027265, + "learning_rate": 8.925368242782562e-06, + "loss": 0.5773, + "step": 2050 + }, + { + "epoch": 0.3368300043109642, + "grad_norm": 0.30398409620669065, + "learning_rate": 8.925283161862642e-06, + "loss": 0.5894, + "step": 2051 + }, + { + "epoch": 0.33699423151930696, + "grad_norm": 0.2974031839313766, + "learning_rate": 8.925198032879793e-06, + "loss": 0.5957, + "step": 2052 + }, + { + "epoch": 0.3371584587276497, + "grad_norm": 0.3404861792754053, + "learning_rate": 8.925112855834939e-06, + "loss": 0.5979, + "step": 2053 + }, + { + "epoch": 0.33732268593599246, + "grad_norm": 0.3542777008958305, + "learning_rate": 8.925027630729007e-06, + "loss": 0.5731, + "step": 2054 + }, + { + "epoch": 0.3374869131443352, + "grad_norm": 0.29838686215896404, + "learning_rate": 8.92494235756292e-06, + "loss": 0.5914, + "step": 2055 + }, + { + "epoch": 0.3376511403526779, + "grad_norm": 0.30489826458776687, + "learning_rate": 8.924857036337606e-06, + "loss": 0.5885, + "step": 2056 + }, + { + "epoch": 0.33781536756102065, + "grad_norm": 0.34373941171524325, + "learning_rate": 8.924771667053991e-06, + "loss": 0.5691, + "step": 2057 + }, + { + "epoch": 0.3379795947693634, + "grad_norm": 0.3235745424873555, + "learning_rate": 8.924686249713002e-06, + "loss": 0.5791, + "step": 2058 + }, + { + "epoch": 0.33814382197770615, + "grad_norm": 0.45911687221923925, + "learning_rate": 8.924600784315568e-06, + "loss": 0.6007, + "step": 2059 + }, + { + "epoch": 0.3383080491860489, + "grad_norm": 0.3303513848889647, + "learning_rate": 8.924515270862615e-06, + "loss": 0.5877, + "step": 2060 + }, + { + "epoch": 0.33847227639439165, + "grad_norm": 0.3146634573861978, + "learning_rate": 8.924429709355075e-06, + "loss": 0.5985, + "step": 2061 + }, + { + "epoch": 0.3386365036027344, + "grad_norm": 0.35605464455064945, + "learning_rate": 8.924344099793873e-06, + "loss": 0.5658, + "step": 2062 + }, + { + "epoch": 0.33880073081107714, + "grad_norm": 0.3348179585377794, + "learning_rate": 8.924258442179942e-06, + "loss": 0.6065, + "step": 2063 + }, + { + "epoch": 0.3389649580194199, + "grad_norm": 0.3194465381879545, + "learning_rate": 8.924172736514213e-06, + "loss": 0.5856, + "step": 2064 + }, + { + "epoch": 0.3391291852277626, + "grad_norm": 0.36449000613032345, + "learning_rate": 8.924086982797612e-06, + "loss": 0.5774, + "step": 2065 + }, + { + "epoch": 0.33929341243610533, + "grad_norm": 0.3880607217082634, + "learning_rate": 8.924001181031077e-06, + "loss": 0.566, + "step": 2066 + }, + { + "epoch": 0.3394576396444481, + "grad_norm": 0.287273007995822, + "learning_rate": 8.923915331215534e-06, + "loss": 0.5615, + "step": 2067 + }, + { + "epoch": 0.33962186685279083, + "grad_norm": 0.3312610820471845, + "learning_rate": 8.923829433351919e-06, + "loss": 0.5675, + "step": 2068 + }, + { + "epoch": 0.3397860940611336, + "grad_norm": 0.35310045879408936, + "learning_rate": 8.923743487441164e-06, + "loss": 0.5948, + "step": 2069 + }, + { + "epoch": 0.33995032126947633, + "grad_norm": 0.33574837233858923, + "learning_rate": 8.923657493484203e-06, + "loss": 0.5896, + "step": 2070 + }, + { + "epoch": 0.3401145484778191, + "grad_norm": 0.29104720814747725, + "learning_rate": 8.923571451481967e-06, + "loss": 0.5728, + "step": 2071 + }, + { + "epoch": 0.3402787756861618, + "grad_norm": 0.3733366968858962, + "learning_rate": 8.923485361435397e-06, + "loss": 0.5897, + "step": 2072 + }, + { + "epoch": 0.3404430028945045, + "grad_norm": 0.31009524646895975, + "learning_rate": 8.92339922334542e-06, + "loss": 0.5529, + "step": 2073 + }, + { + "epoch": 0.34060723010284727, + "grad_norm": 0.2956517683672337, + "learning_rate": 8.923313037212977e-06, + "loss": 0.6152, + "step": 2074 + }, + { + "epoch": 0.34077145731119, + "grad_norm": 0.3332125958408152, + "learning_rate": 8.923226803039e-06, + "loss": 0.6156, + "step": 2075 + }, + { + "epoch": 0.34093568451953277, + "grad_norm": 0.3180632021721819, + "learning_rate": 8.923140520824432e-06, + "loss": 0.583, + "step": 2076 + }, + { + "epoch": 0.3410999117278755, + "grad_norm": 0.35586058964375455, + "learning_rate": 8.923054190570204e-06, + "loss": 0.5819, + "step": 2077 + }, + { + "epoch": 0.34126413893621826, + "grad_norm": 0.3998068688551633, + "learning_rate": 8.922967812277256e-06, + "loss": 0.5787, + "step": 2078 + }, + { + "epoch": 0.341428366144561, + "grad_norm": 0.30357534998154256, + "learning_rate": 8.922881385946526e-06, + "loss": 0.5593, + "step": 2079 + }, + { + "epoch": 0.34159259335290376, + "grad_norm": 0.5561696146995321, + "learning_rate": 8.922794911578954e-06, + "loss": 0.5973, + "step": 2080 + }, + { + "epoch": 0.3417568205612465, + "grad_norm": 0.40185317262477926, + "learning_rate": 8.922708389175476e-06, + "loss": 0.5947, + "step": 2081 + }, + { + "epoch": 0.3419210477695892, + "grad_norm": 0.30390626963582257, + "learning_rate": 8.922621818737033e-06, + "loss": 0.5678, + "step": 2082 + }, + { + "epoch": 0.34208527497793195, + "grad_norm": 0.3413233665139855, + "learning_rate": 8.922535200264568e-06, + "loss": 0.5982, + "step": 2083 + }, + { + "epoch": 0.3422495021862747, + "grad_norm": 0.28579448814830777, + "learning_rate": 8.922448533759017e-06, + "loss": 0.5848, + "step": 2084 + }, + { + "epoch": 0.34241372939461745, + "grad_norm": 0.3204345352536137, + "learning_rate": 8.922361819221326e-06, + "loss": 0.5757, + "step": 2085 + }, + { + "epoch": 0.3425779566029602, + "grad_norm": 0.3142059988489119, + "learning_rate": 8.922275056652434e-06, + "loss": 0.5934, + "step": 2086 + }, + { + "epoch": 0.34274218381130295, + "grad_norm": 0.3307567711086455, + "learning_rate": 8.922188246053284e-06, + "loss": 0.5934, + "step": 2087 + }, + { + "epoch": 0.3429064110196457, + "grad_norm": 0.31046467756377955, + "learning_rate": 8.922101387424818e-06, + "loss": 0.5882, + "step": 2088 + }, + { + "epoch": 0.34307063822798844, + "grad_norm": 0.3820278829809434, + "learning_rate": 8.922014480767981e-06, + "loss": 0.5954, + "step": 2089 + }, + { + "epoch": 0.34323486543633114, + "grad_norm": 0.335264516354752, + "learning_rate": 8.921927526083716e-06, + "loss": 0.5987, + "step": 2090 + }, + { + "epoch": 0.3433990926446739, + "grad_norm": 0.35109540453615623, + "learning_rate": 8.921840523372967e-06, + "loss": 0.5886, + "step": 2091 + }, + { + "epoch": 0.34356331985301664, + "grad_norm": 0.3404371948804148, + "learning_rate": 8.92175347263668e-06, + "loss": 0.612, + "step": 2092 + }, + { + "epoch": 0.3437275470613594, + "grad_norm": 0.3226414596776337, + "learning_rate": 8.9216663738758e-06, + "loss": 0.5992, + "step": 2093 + }, + { + "epoch": 0.34389177426970213, + "grad_norm": 0.3638859412166717, + "learning_rate": 8.921579227091272e-06, + "loss": 0.5853, + "step": 2094 + }, + { + "epoch": 0.3440560014780449, + "grad_norm": 0.29899585682019175, + "learning_rate": 8.921492032284043e-06, + "loss": 0.5919, + "step": 2095 + }, + { + "epoch": 0.34422022868638763, + "grad_norm": 0.3039192178553265, + "learning_rate": 8.921404789455061e-06, + "loss": 0.5763, + "step": 2096 + }, + { + "epoch": 0.3443844558947304, + "grad_norm": 0.3465925369057025, + "learning_rate": 8.921317498605274e-06, + "loss": 0.593, + "step": 2097 + }, + { + "epoch": 0.34454868310307313, + "grad_norm": 0.3332403460427041, + "learning_rate": 8.921230159735627e-06, + "loss": 0.6263, + "step": 2098 + }, + { + "epoch": 0.3447129103114158, + "grad_norm": 0.3097195369061601, + "learning_rate": 8.921142772847073e-06, + "loss": 0.5772, + "step": 2099 + }, + { + "epoch": 0.34487713751975857, + "grad_norm": 0.2961247616477229, + "learning_rate": 8.921055337940556e-06, + "loss": 0.5517, + "step": 2100 + }, + { + "epoch": 0.3450413647281013, + "grad_norm": 0.3645824498707989, + "learning_rate": 8.92096785501703e-06, + "loss": 0.568, + "step": 2101 + }, + { + "epoch": 0.34520559193644407, + "grad_norm": 0.3490365788059674, + "learning_rate": 8.920880324077443e-06, + "loss": 0.5953, + "step": 2102 + }, + { + "epoch": 0.3453698191447868, + "grad_norm": 0.3786183967790828, + "learning_rate": 8.920792745122747e-06, + "loss": 0.6042, + "step": 2103 + }, + { + "epoch": 0.34553404635312956, + "grad_norm": 0.3290400716459685, + "learning_rate": 8.92070511815389e-06, + "loss": 0.5824, + "step": 2104 + }, + { + "epoch": 0.3456982735614723, + "grad_norm": 0.34834661911635995, + "learning_rate": 8.920617443171828e-06, + "loss": 0.5908, + "step": 2105 + }, + { + "epoch": 0.34586250076981506, + "grad_norm": 0.37268445468777833, + "learning_rate": 8.920529720177512e-06, + "loss": 0.5783, + "step": 2106 + }, + { + "epoch": 0.34602672797815776, + "grad_norm": 0.31664534703004893, + "learning_rate": 8.92044194917189e-06, + "loss": 0.5798, + "step": 2107 + }, + { + "epoch": 0.3461909551865005, + "grad_norm": 0.3582045293008781, + "learning_rate": 8.920354130155924e-06, + "loss": 0.5913, + "step": 2108 + }, + { + "epoch": 0.34635518239484325, + "grad_norm": 0.5543690484261287, + "learning_rate": 8.92026626313056e-06, + "loss": 0.5881, + "step": 2109 + }, + { + "epoch": 0.346519409603186, + "grad_norm": 0.3208990175255881, + "learning_rate": 8.920178348096756e-06, + "loss": 0.5996, + "step": 2110 + }, + { + "epoch": 0.34668363681152875, + "grad_norm": 0.3016388152725111, + "learning_rate": 8.920090385055468e-06, + "loss": 0.6002, + "step": 2111 + }, + { + "epoch": 0.3468478640198715, + "grad_norm": 0.3602146787451213, + "learning_rate": 8.920002374007648e-06, + "loss": 0.5964, + "step": 2112 + }, + { + "epoch": 0.34701209122821425, + "grad_norm": 0.35331652465559116, + "learning_rate": 8.919914314954255e-06, + "loss": 0.5855, + "step": 2113 + }, + { + "epoch": 0.347176318436557, + "grad_norm": 0.30569388964646355, + "learning_rate": 8.919826207896243e-06, + "loss": 0.6103, + "step": 2114 + }, + { + "epoch": 0.34734054564489975, + "grad_norm": 0.3339711679529885, + "learning_rate": 8.919738052834569e-06, + "loss": 0.5792, + "step": 2115 + }, + { + "epoch": 0.34750477285324244, + "grad_norm": 0.31919676404834385, + "learning_rate": 8.919649849770193e-06, + "loss": 0.5673, + "step": 2116 + }, + { + "epoch": 0.3476690000615852, + "grad_norm": 0.3503739553239841, + "learning_rate": 8.91956159870407e-06, + "loss": 0.586, + "step": 2117 + }, + { + "epoch": 0.34783322726992794, + "grad_norm": 0.3263957410349147, + "learning_rate": 8.919473299637159e-06, + "loss": 0.5933, + "step": 2118 + }, + { + "epoch": 0.3479974544782707, + "grad_norm": 0.3206121735433827, + "learning_rate": 8.919384952570423e-06, + "loss": 0.5619, + "step": 2119 + }, + { + "epoch": 0.34816168168661343, + "grad_norm": 0.3672111356514065, + "learning_rate": 8.919296557504816e-06, + "loss": 0.5986, + "step": 2120 + }, + { + "epoch": 0.3483259088949562, + "grad_norm": 0.31992702542573537, + "learning_rate": 8.9192081144413e-06, + "loss": 0.5877, + "step": 2121 + }, + { + "epoch": 0.34849013610329893, + "grad_norm": 0.5187842764216102, + "learning_rate": 8.919119623380837e-06, + "loss": 0.5838, + "step": 2122 + }, + { + "epoch": 0.3486543633116417, + "grad_norm": 0.28609000136348256, + "learning_rate": 8.919031084324387e-06, + "loss": 0.5767, + "step": 2123 + }, + { + "epoch": 0.3488185905199844, + "grad_norm": 0.3176274746766672, + "learning_rate": 8.918942497272911e-06, + "loss": 0.5679, + "step": 2124 + }, + { + "epoch": 0.3489828177283271, + "grad_norm": 0.4791596535673938, + "learning_rate": 8.918853862227372e-06, + "loss": 0.5807, + "step": 2125 + }, + { + "epoch": 0.34914704493666987, + "grad_norm": 0.3121921995990465, + "learning_rate": 8.918765179188733e-06, + "loss": 0.5492, + "step": 2126 + }, + { + "epoch": 0.3493112721450126, + "grad_norm": 0.34081370981264214, + "learning_rate": 8.918676448157957e-06, + "loss": 0.5876, + "step": 2127 + }, + { + "epoch": 0.34947549935335537, + "grad_norm": 0.36611718567205803, + "learning_rate": 8.918587669136007e-06, + "loss": 0.5729, + "step": 2128 + }, + { + "epoch": 0.3496397265616981, + "grad_norm": 0.3122800636082036, + "learning_rate": 8.918498842123846e-06, + "loss": 0.568, + "step": 2129 + }, + { + "epoch": 0.34980395377004087, + "grad_norm": 0.33777800341019615, + "learning_rate": 8.918409967122443e-06, + "loss": 0.5903, + "step": 2130 + }, + { + "epoch": 0.3499681809783836, + "grad_norm": 0.3359295272643457, + "learning_rate": 8.91832104413276e-06, + "loss": 0.5659, + "step": 2131 + }, + { + "epoch": 0.35013240818672636, + "grad_norm": 0.3429887840973727, + "learning_rate": 8.918232073155762e-06, + "loss": 0.581, + "step": 2132 + }, + { + "epoch": 0.35029663539506906, + "grad_norm": 0.33042086580264834, + "learning_rate": 8.918143054192417e-06, + "loss": 0.592, + "step": 2133 + }, + { + "epoch": 0.3504608626034118, + "grad_norm": 0.3318582273308789, + "learning_rate": 8.918053987243692e-06, + "loss": 0.5953, + "step": 2134 + }, + { + "epoch": 0.35062508981175455, + "grad_norm": 0.35486222754880553, + "learning_rate": 8.917964872310555e-06, + "loss": 0.5634, + "step": 2135 + }, + { + "epoch": 0.3507893170200973, + "grad_norm": 0.2850475030435862, + "learning_rate": 8.91787570939397e-06, + "loss": 0.5726, + "step": 2136 + }, + { + "epoch": 0.35095354422844005, + "grad_norm": 0.47896412101886765, + "learning_rate": 8.917786498494912e-06, + "loss": 0.5956, + "step": 2137 + }, + { + "epoch": 0.3511177714367828, + "grad_norm": 0.3116157303094224, + "learning_rate": 8.917697239614343e-06, + "loss": 0.5734, + "step": 2138 + }, + { + "epoch": 0.35128199864512555, + "grad_norm": 0.30719351202702805, + "learning_rate": 8.917607932753237e-06, + "loss": 0.5558, + "step": 2139 + }, + { + "epoch": 0.3514462258534683, + "grad_norm": 0.3747383902637673, + "learning_rate": 8.917518577912562e-06, + "loss": 0.5724, + "step": 2140 + }, + { + "epoch": 0.351610453061811, + "grad_norm": 0.33882358226497106, + "learning_rate": 8.91742917509329e-06, + "loss": 0.5962, + "step": 2141 + }, + { + "epoch": 0.35177468027015374, + "grad_norm": 0.36082268139826695, + "learning_rate": 8.917339724296391e-06, + "loss": 0.5844, + "step": 2142 + }, + { + "epoch": 0.3519389074784965, + "grad_norm": 0.30132427096962017, + "learning_rate": 8.917250225522834e-06, + "loss": 0.5746, + "step": 2143 + }, + { + "epoch": 0.35210313468683924, + "grad_norm": 0.3129525632905135, + "learning_rate": 8.917160678773597e-06, + "loss": 0.5752, + "step": 2144 + }, + { + "epoch": 0.352267361895182, + "grad_norm": 0.3219843814868087, + "learning_rate": 8.917071084049647e-06, + "loss": 0.5968, + "step": 2145 + }, + { + "epoch": 0.35243158910352473, + "grad_norm": 0.3258109865865092, + "learning_rate": 8.91698144135196e-06, + "loss": 0.6133, + "step": 2146 + }, + { + "epoch": 0.3525958163118675, + "grad_norm": 0.3090823359585868, + "learning_rate": 8.916891750681508e-06, + "loss": 0.5866, + "step": 2147 + }, + { + "epoch": 0.35276004352021023, + "grad_norm": 0.32487573794202323, + "learning_rate": 8.916802012039267e-06, + "loss": 0.5595, + "step": 2148 + }, + { + "epoch": 0.352924270728553, + "grad_norm": 0.31033278772936684, + "learning_rate": 8.916712225426208e-06, + "loss": 0.5872, + "step": 2149 + }, + { + "epoch": 0.3530884979368957, + "grad_norm": 0.2833943755067742, + "learning_rate": 8.916622390843312e-06, + "loss": 0.5687, + "step": 2150 + }, + { + "epoch": 0.3532527251452384, + "grad_norm": 0.33627350008595963, + "learning_rate": 8.916532508291549e-06, + "loss": 0.5878, + "step": 2151 + }, + { + "epoch": 0.35341695235358117, + "grad_norm": 0.3469802742069985, + "learning_rate": 8.9164425777719e-06, + "loss": 0.6043, + "step": 2152 + }, + { + "epoch": 0.3535811795619239, + "grad_norm": 0.31575761311555894, + "learning_rate": 8.916352599285338e-06, + "loss": 0.5815, + "step": 2153 + }, + { + "epoch": 0.35374540677026667, + "grad_norm": 0.3045497966721819, + "learning_rate": 8.916262572832842e-06, + "loss": 0.5663, + "step": 2154 + }, + { + "epoch": 0.3539096339786094, + "grad_norm": 0.31121959786989556, + "learning_rate": 8.916172498415389e-06, + "loss": 0.5729, + "step": 2155 + }, + { + "epoch": 0.35407386118695217, + "grad_norm": 0.3627412802197949, + "learning_rate": 8.916082376033958e-06, + "loss": 0.5644, + "step": 2156 + }, + { + "epoch": 0.3542380883952949, + "grad_norm": 0.31693514999285666, + "learning_rate": 8.915992205689529e-06, + "loss": 0.5936, + "step": 2157 + }, + { + "epoch": 0.3544023156036376, + "grad_norm": 0.3855064420166456, + "learning_rate": 8.915901987383078e-06, + "loss": 0.5771, + "step": 2158 + }, + { + "epoch": 0.35456654281198036, + "grad_norm": 0.32831398862385386, + "learning_rate": 8.915811721115588e-06, + "loss": 0.6016, + "step": 2159 + }, + { + "epoch": 0.3547307700203231, + "grad_norm": 0.32057668168581144, + "learning_rate": 8.915721406888037e-06, + "loss": 0.582, + "step": 2160 + }, + { + "epoch": 0.35489499722866585, + "grad_norm": 0.3029317139589128, + "learning_rate": 8.915631044701408e-06, + "loss": 0.5853, + "step": 2161 + }, + { + "epoch": 0.3550592244370086, + "grad_norm": 0.3683147589988735, + "learning_rate": 8.915540634556681e-06, + "loss": 0.5618, + "step": 2162 + }, + { + "epoch": 0.35522345164535135, + "grad_norm": 0.32250564191378006, + "learning_rate": 8.915450176454838e-06, + "loss": 0.5949, + "step": 2163 + }, + { + "epoch": 0.3553876788536941, + "grad_norm": 0.34769282222308573, + "learning_rate": 8.915359670396863e-06, + "loss": 0.6056, + "step": 2164 + }, + { + "epoch": 0.35555190606203685, + "grad_norm": 0.32794507485016755, + "learning_rate": 8.915269116383736e-06, + "loss": 0.5667, + "step": 2165 + }, + { + "epoch": 0.3557161332703796, + "grad_norm": 0.3592878992842856, + "learning_rate": 8.915178514416443e-06, + "loss": 0.5734, + "step": 2166 + }, + { + "epoch": 0.3558803604787223, + "grad_norm": 0.31418949050424083, + "learning_rate": 8.91508786449597e-06, + "loss": 0.5647, + "step": 2167 + }, + { + "epoch": 0.35604458768706504, + "grad_norm": 0.41684345813587714, + "learning_rate": 8.914997166623295e-06, + "loss": 0.5661, + "step": 2168 + }, + { + "epoch": 0.3562088148954078, + "grad_norm": 0.33388499752938794, + "learning_rate": 8.91490642079941e-06, + "loss": 0.5724, + "step": 2169 + }, + { + "epoch": 0.35637304210375054, + "grad_norm": 0.30404975902522224, + "learning_rate": 8.914815627025295e-06, + "loss": 0.5754, + "step": 2170 + }, + { + "epoch": 0.3565372693120933, + "grad_norm": 0.3150577755264073, + "learning_rate": 8.914724785301942e-06, + "loss": 0.5769, + "step": 2171 + }, + { + "epoch": 0.35670149652043603, + "grad_norm": 0.3201787714157396, + "learning_rate": 8.91463389563033e-06, + "loss": 0.6045, + "step": 2172 + }, + { + "epoch": 0.3568657237287788, + "grad_norm": 0.32275307016076826, + "learning_rate": 8.914542958011454e-06, + "loss": 0.5873, + "step": 2173 + }, + { + "epoch": 0.35702995093712153, + "grad_norm": 0.29979459104094425, + "learning_rate": 8.914451972446297e-06, + "loss": 0.5786, + "step": 2174 + }, + { + "epoch": 0.3571941781454642, + "grad_norm": 0.625963371230108, + "learning_rate": 8.914360938935849e-06, + "loss": 0.5633, + "step": 2175 + }, + { + "epoch": 0.357358405353807, + "grad_norm": 0.3087502313684936, + "learning_rate": 8.914269857481098e-06, + "loss": 0.5823, + "step": 2176 + }, + { + "epoch": 0.3575226325621497, + "grad_norm": 0.32683326534937834, + "learning_rate": 8.914178728083031e-06, + "loss": 0.5778, + "step": 2177 + }, + { + "epoch": 0.35768685977049247, + "grad_norm": 0.7880735018196897, + "learning_rate": 8.914087550742643e-06, + "loss": 0.5851, + "step": 2178 + }, + { + "epoch": 0.3578510869788352, + "grad_norm": 0.3278097326230743, + "learning_rate": 8.91399632546092e-06, + "loss": 0.5914, + "step": 2179 + }, + { + "epoch": 0.35801531418717797, + "grad_norm": 0.35430290321922725, + "learning_rate": 8.913905052238854e-06, + "loss": 0.6058, + "step": 2180 + }, + { + "epoch": 0.3581795413955207, + "grad_norm": 1.0396310534287811, + "learning_rate": 8.913813731077437e-06, + "loss": 0.583, + "step": 2181 + }, + { + "epoch": 0.35834376860386347, + "grad_norm": 0.3060473871714537, + "learning_rate": 8.91372236197766e-06, + "loss": 0.5871, + "step": 2182 + }, + { + "epoch": 0.3585079958122062, + "grad_norm": 0.33881404244223196, + "learning_rate": 8.913630944940516e-06, + "loss": 0.5872, + "step": 2183 + }, + { + "epoch": 0.3586722230205489, + "grad_norm": 0.2973344128005135, + "learning_rate": 8.913539479966997e-06, + "loss": 0.5696, + "step": 2184 + }, + { + "epoch": 0.35883645022889166, + "grad_norm": 0.35184436201426394, + "learning_rate": 8.913447967058097e-06, + "loss": 0.5965, + "step": 2185 + }, + { + "epoch": 0.3590006774372344, + "grad_norm": 0.3020841641575663, + "learning_rate": 8.913356406214809e-06, + "loss": 0.562, + "step": 2186 + }, + { + "epoch": 0.35916490464557715, + "grad_norm": 0.4027338067146939, + "learning_rate": 8.91326479743813e-06, + "loss": 0.5637, + "step": 2187 + }, + { + "epoch": 0.3593291318539199, + "grad_norm": 0.297962563186807, + "learning_rate": 8.913173140729051e-06, + "loss": 0.584, + "step": 2188 + }, + { + "epoch": 0.35949335906226265, + "grad_norm": 0.49567057600017395, + "learning_rate": 8.913081436088573e-06, + "loss": 0.5698, + "step": 2189 + }, + { + "epoch": 0.3596575862706054, + "grad_norm": 0.3139466119859544, + "learning_rate": 8.912989683517686e-06, + "loss": 0.581, + "step": 2190 + }, + { + "epoch": 0.35982181347894815, + "grad_norm": 0.3222467409727742, + "learning_rate": 8.91289788301739e-06, + "loss": 0.5692, + "step": 2191 + }, + { + "epoch": 0.35998604068729084, + "grad_norm": 0.3132899288065772, + "learning_rate": 8.912806034588682e-06, + "loss": 0.5716, + "step": 2192 + }, + { + "epoch": 0.3601502678956336, + "grad_norm": 0.311666768458647, + "learning_rate": 8.912714138232558e-06, + "loss": 0.5707, + "step": 2193 + }, + { + "epoch": 0.36031449510397634, + "grad_norm": 0.318265739153915, + "learning_rate": 8.912622193950016e-06, + "loss": 0.6014, + "step": 2194 + }, + { + "epoch": 0.3604787223123191, + "grad_norm": 0.33405275857775724, + "learning_rate": 8.912530201742057e-06, + "loss": 0.5796, + "step": 2195 + }, + { + "epoch": 0.36064294952066184, + "grad_norm": 0.32054434013299965, + "learning_rate": 8.912438161609678e-06, + "loss": 0.5737, + "step": 2196 + }, + { + "epoch": 0.3608071767290046, + "grad_norm": 0.4906329903361965, + "learning_rate": 8.912346073553882e-06, + "loss": 0.5959, + "step": 2197 + }, + { + "epoch": 0.36097140393734733, + "grad_norm": 0.6300646881591123, + "learning_rate": 8.912253937575663e-06, + "loss": 0.5996, + "step": 2198 + }, + { + "epoch": 0.3611356311456901, + "grad_norm": 0.32082175087580206, + "learning_rate": 8.912161753676026e-06, + "loss": 0.5648, + "step": 2199 + }, + { + "epoch": 0.3612998583540328, + "grad_norm": 0.3143112057835378, + "learning_rate": 8.912069521855971e-06, + "loss": 0.5699, + "step": 2200 + }, + { + "epoch": 0.3614640855623755, + "grad_norm": 0.3295077023756852, + "learning_rate": 8.911977242116502e-06, + "loss": 0.5756, + "step": 2201 + }, + { + "epoch": 0.3616283127707183, + "grad_norm": 0.33517369326913704, + "learning_rate": 8.911884914458618e-06, + "loss": 0.5851, + "step": 2202 + }, + { + "epoch": 0.361792539979061, + "grad_norm": 0.33051215135799833, + "learning_rate": 8.911792538883323e-06, + "loss": 0.5722, + "step": 2203 + }, + { + "epoch": 0.36195676718740377, + "grad_norm": 0.43832424126361785, + "learning_rate": 8.91170011539162e-06, + "loss": 0.5514, + "step": 2204 + }, + { + "epoch": 0.3621209943957465, + "grad_norm": 0.37071372638553013, + "learning_rate": 8.911607643984513e-06, + "loss": 0.5964, + "step": 2205 + }, + { + "epoch": 0.36228522160408927, + "grad_norm": 0.4086726046467989, + "learning_rate": 8.91151512466301e-06, + "loss": 0.5983, + "step": 2206 + }, + { + "epoch": 0.362449448812432, + "grad_norm": 0.3148352832312435, + "learning_rate": 8.91142255742811e-06, + "loss": 0.5952, + "step": 2207 + }, + { + "epoch": 0.36261367602077477, + "grad_norm": 0.30697433895023185, + "learning_rate": 8.91132994228082e-06, + "loss": 0.5793, + "step": 2208 + }, + { + "epoch": 0.36277790322911746, + "grad_norm": 0.3825985510743253, + "learning_rate": 8.911237279222148e-06, + "loss": 0.5774, + "step": 2209 + }, + { + "epoch": 0.3629421304374602, + "grad_norm": 0.34276141854757924, + "learning_rate": 8.911144568253097e-06, + "loss": 0.5758, + "step": 2210 + }, + { + "epoch": 0.36310635764580296, + "grad_norm": 0.3142215471342378, + "learning_rate": 8.911051809374677e-06, + "loss": 0.5792, + "step": 2211 + }, + { + "epoch": 0.3632705848541457, + "grad_norm": 0.30903065304902716, + "learning_rate": 8.910959002587895e-06, + "loss": 0.5866, + "step": 2212 + }, + { + "epoch": 0.36343481206248845, + "grad_norm": 0.31135335714393814, + "learning_rate": 8.910866147893758e-06, + "loss": 0.5868, + "step": 2213 + }, + { + "epoch": 0.3635990392708312, + "grad_norm": 0.3764236674444635, + "learning_rate": 8.910773245293275e-06, + "loss": 0.5889, + "step": 2214 + }, + { + "epoch": 0.36376326647917395, + "grad_norm": 0.40469774842863443, + "learning_rate": 8.910680294787455e-06, + "loss": 0.566, + "step": 2215 + }, + { + "epoch": 0.3639274936875167, + "grad_norm": 0.3145755387386865, + "learning_rate": 8.910587296377308e-06, + "loss": 0.5788, + "step": 2216 + }, + { + "epoch": 0.3640917208958594, + "grad_norm": 0.35885193000909843, + "learning_rate": 8.910494250063844e-06, + "loss": 0.5677, + "step": 2217 + }, + { + "epoch": 0.36425594810420214, + "grad_norm": 0.3651850799070969, + "learning_rate": 8.910401155848072e-06, + "loss": 0.5735, + "step": 2218 + }, + { + "epoch": 0.3644201753125449, + "grad_norm": 0.29711733435878707, + "learning_rate": 8.910308013731004e-06, + "loss": 0.5525, + "step": 2219 + }, + { + "epoch": 0.36458440252088764, + "grad_norm": 0.3236062586888815, + "learning_rate": 8.910214823713652e-06, + "loss": 0.592, + "step": 2220 + }, + { + "epoch": 0.3647486297292304, + "grad_norm": 0.33402951253417174, + "learning_rate": 8.910121585797028e-06, + "loss": 0.5915, + "step": 2221 + }, + { + "epoch": 0.36491285693757314, + "grad_norm": 0.29300268294595966, + "learning_rate": 8.910028299982145e-06, + "loss": 0.5813, + "step": 2222 + }, + { + "epoch": 0.3650770841459159, + "grad_norm": 0.3091653000574013, + "learning_rate": 8.909934966270016e-06, + "loss": 0.5804, + "step": 2223 + }, + { + "epoch": 0.36524131135425864, + "grad_norm": 0.3079107397662537, + "learning_rate": 8.909841584661654e-06, + "loss": 0.5748, + "step": 2224 + }, + { + "epoch": 0.3654055385626014, + "grad_norm": 0.34259914622682275, + "learning_rate": 8.909748155158074e-06, + "loss": 0.5796, + "step": 2225 + }, + { + "epoch": 0.3655697657709441, + "grad_norm": 0.4153545557740463, + "learning_rate": 8.90965467776029e-06, + "loss": 0.5734, + "step": 2226 + }, + { + "epoch": 0.3657339929792868, + "grad_norm": 0.31071013419276294, + "learning_rate": 8.909561152469317e-06, + "loss": 0.5905, + "step": 2227 + }, + { + "epoch": 0.3658982201876296, + "grad_norm": 0.3471544762094001, + "learning_rate": 8.909467579286173e-06, + "loss": 0.5755, + "step": 2228 + }, + { + "epoch": 0.3660624473959723, + "grad_norm": 0.3216553400318287, + "learning_rate": 8.909373958211872e-06, + "loss": 0.5566, + "step": 2229 + }, + { + "epoch": 0.3662266746043151, + "grad_norm": 0.3292521251192245, + "learning_rate": 8.909280289247431e-06, + "loss": 0.5803, + "step": 2230 + }, + { + "epoch": 0.3663909018126578, + "grad_norm": 0.3344410080798811, + "learning_rate": 8.90918657239387e-06, + "loss": 0.5777, + "step": 2231 + }, + { + "epoch": 0.36655512902100057, + "grad_norm": 0.2941625541245194, + "learning_rate": 8.909092807652202e-06, + "loss": 0.6021, + "step": 2232 + }, + { + "epoch": 0.3667193562293433, + "grad_norm": 0.35239833707095725, + "learning_rate": 8.908998995023449e-06, + "loss": 0.5848, + "step": 2233 + }, + { + "epoch": 0.366883583437686, + "grad_norm": 0.34132808655561586, + "learning_rate": 8.908905134508631e-06, + "loss": 0.5662, + "step": 2234 + }, + { + "epoch": 0.36704781064602876, + "grad_norm": 0.5185030895359983, + "learning_rate": 8.908811226108764e-06, + "loss": 0.5913, + "step": 2235 + }, + { + "epoch": 0.3672120378543715, + "grad_norm": 0.30832542073870384, + "learning_rate": 8.90871726982487e-06, + "loss": 0.6005, + "step": 2236 + }, + { + "epoch": 0.36737626506271426, + "grad_norm": 0.32855997531791475, + "learning_rate": 8.908623265657966e-06, + "loss": 0.5756, + "step": 2237 + }, + { + "epoch": 0.367540492271057, + "grad_norm": 0.30127305901284296, + "learning_rate": 8.90852921360908e-06, + "loss": 0.5881, + "step": 2238 + }, + { + "epoch": 0.36770471947939976, + "grad_norm": 0.2993924136214141, + "learning_rate": 8.908435113679226e-06, + "loss": 0.5682, + "step": 2239 + }, + { + "epoch": 0.3678689466877425, + "grad_norm": 0.3406570591462043, + "learning_rate": 8.908340965869432e-06, + "loss": 0.5739, + "step": 2240 + }, + { + "epoch": 0.36803317389608525, + "grad_norm": 0.33425544277809804, + "learning_rate": 8.908246770180716e-06, + "loss": 0.5857, + "step": 2241 + }, + { + "epoch": 0.368197401104428, + "grad_norm": 0.3471600230497452, + "learning_rate": 8.908152526614104e-06, + "loss": 0.5689, + "step": 2242 + }, + { + "epoch": 0.3683616283127707, + "grad_norm": 0.31874708068845486, + "learning_rate": 8.90805823517062e-06, + "loss": 0.5846, + "step": 2243 + }, + { + "epoch": 0.36852585552111344, + "grad_norm": 0.3397332860106541, + "learning_rate": 8.907963895851282e-06, + "loss": 0.5983, + "step": 2244 + }, + { + "epoch": 0.3686900827294562, + "grad_norm": 0.3454421184183436, + "learning_rate": 8.907869508657122e-06, + "loss": 0.5703, + "step": 2245 + }, + { + "epoch": 0.36885430993779894, + "grad_norm": 0.31162260048303864, + "learning_rate": 8.907775073589163e-06, + "loss": 0.5857, + "step": 2246 + }, + { + "epoch": 0.3690185371461417, + "grad_norm": 0.326196691003036, + "learning_rate": 8.907680590648429e-06, + "loss": 0.5868, + "step": 2247 + }, + { + "epoch": 0.36918276435448444, + "grad_norm": 0.3186926099650805, + "learning_rate": 8.907586059835948e-06, + "loss": 0.6031, + "step": 2248 + }, + { + "epoch": 0.3693469915628272, + "grad_norm": 0.36454682926659043, + "learning_rate": 8.907491481152747e-06, + "loss": 0.601, + "step": 2249 + }, + { + "epoch": 0.36951121877116994, + "grad_norm": 0.3407203715992103, + "learning_rate": 8.90739685459985e-06, + "loss": 0.5824, + "step": 2250 + }, + { + "epoch": 0.36967544597951263, + "grad_norm": 0.27972540167378696, + "learning_rate": 8.907302180178286e-06, + "loss": 0.5686, + "step": 2251 + }, + { + "epoch": 0.3698396731878554, + "grad_norm": 0.3208480967722726, + "learning_rate": 8.907207457889087e-06, + "loss": 0.5886, + "step": 2252 + }, + { + "epoch": 0.3700039003961981, + "grad_norm": 0.3041652148393244, + "learning_rate": 8.907112687733278e-06, + "loss": 0.582, + "step": 2253 + }, + { + "epoch": 0.3701681276045409, + "grad_norm": 0.3174281025630536, + "learning_rate": 8.907017869711888e-06, + "loss": 0.5669, + "step": 2254 + }, + { + "epoch": 0.3703323548128836, + "grad_norm": 0.2875806634656067, + "learning_rate": 8.906923003825949e-06, + "loss": 0.5741, + "step": 2255 + }, + { + "epoch": 0.3704965820212264, + "grad_norm": 0.3209729400410392, + "learning_rate": 8.90682809007649e-06, + "loss": 0.5761, + "step": 2256 + }, + { + "epoch": 0.3706608092295691, + "grad_norm": 0.3300546032520837, + "learning_rate": 8.906733128464541e-06, + "loss": 0.5587, + "step": 2257 + }, + { + "epoch": 0.37082503643791187, + "grad_norm": 0.291617247856832, + "learning_rate": 8.906638118991137e-06, + "loss": 0.5801, + "step": 2258 + }, + { + "epoch": 0.3709892636462546, + "grad_norm": 0.35873974625749094, + "learning_rate": 8.906543061657306e-06, + "loss": 0.5652, + "step": 2259 + }, + { + "epoch": 0.3711534908545973, + "grad_norm": 0.2995194684429335, + "learning_rate": 8.906447956464082e-06, + "loss": 0.5818, + "step": 2260 + }, + { + "epoch": 0.37131771806294006, + "grad_norm": 0.28613397814477437, + "learning_rate": 8.906352803412499e-06, + "loss": 0.5497, + "step": 2261 + }, + { + "epoch": 0.3714819452712828, + "grad_norm": 0.35714670764867573, + "learning_rate": 8.906257602503589e-06, + "loss": 0.5854, + "step": 2262 + }, + { + "epoch": 0.37164617247962556, + "grad_norm": 0.314102556822799, + "learning_rate": 8.906162353738385e-06, + "loss": 0.5927, + "step": 2263 + }, + { + "epoch": 0.3718103996879683, + "grad_norm": 0.31461753863692804, + "learning_rate": 8.906067057117924e-06, + "loss": 0.559, + "step": 2264 + }, + { + "epoch": 0.37197462689631106, + "grad_norm": 0.3202526289895275, + "learning_rate": 8.905971712643238e-06, + "loss": 0.5908, + "step": 2265 + }, + { + "epoch": 0.3721388541046538, + "grad_norm": 0.29543179263334973, + "learning_rate": 8.905876320315367e-06, + "loss": 0.5434, + "step": 2266 + }, + { + "epoch": 0.37230308131299655, + "grad_norm": 0.3062964276019493, + "learning_rate": 8.905780880135343e-06, + "loss": 0.5745, + "step": 2267 + }, + { + "epoch": 0.37246730852133925, + "grad_norm": 0.28951540649656626, + "learning_rate": 8.905685392104203e-06, + "loss": 0.587, + "step": 2268 + }, + { + "epoch": 0.372631535729682, + "grad_norm": 0.3192492106256812, + "learning_rate": 8.905589856222985e-06, + "loss": 0.5776, + "step": 2269 + }, + { + "epoch": 0.37279576293802474, + "grad_norm": 0.33482973901264274, + "learning_rate": 8.905494272492728e-06, + "loss": 0.5691, + "step": 2270 + }, + { + "epoch": 0.3729599901463675, + "grad_norm": 0.2875150584599915, + "learning_rate": 8.905398640914468e-06, + "loss": 0.5927, + "step": 2271 + }, + { + "epoch": 0.37312421735471024, + "grad_norm": 0.3076711414341829, + "learning_rate": 8.905302961489245e-06, + "loss": 0.5755, + "step": 2272 + }, + { + "epoch": 0.373288444563053, + "grad_norm": 0.29479734170962885, + "learning_rate": 8.905207234218098e-06, + "loss": 0.5966, + "step": 2273 + }, + { + "epoch": 0.37345267177139574, + "grad_norm": 0.28667253454249403, + "learning_rate": 8.905111459102065e-06, + "loss": 0.5847, + "step": 2274 + }, + { + "epoch": 0.3736168989797385, + "grad_norm": 0.37883062566583625, + "learning_rate": 8.905015636142189e-06, + "loss": 0.5881, + "step": 2275 + }, + { + "epoch": 0.37378112618808124, + "grad_norm": 0.29767160803116754, + "learning_rate": 8.904919765339508e-06, + "loss": 0.5627, + "step": 2276 + }, + { + "epoch": 0.37394535339642393, + "grad_norm": 0.3155297745770699, + "learning_rate": 8.904823846695065e-06, + "loss": 0.5693, + "step": 2277 + }, + { + "epoch": 0.3741095806047667, + "grad_norm": 0.30626645197578384, + "learning_rate": 8.904727880209902e-06, + "loss": 0.5739, + "step": 2278 + }, + { + "epoch": 0.3742738078131094, + "grad_norm": 0.2870421361283258, + "learning_rate": 8.904631865885059e-06, + "loss": 0.564, + "step": 2279 + }, + { + "epoch": 0.3744380350214522, + "grad_norm": 0.3041713714865128, + "learning_rate": 8.904535803721581e-06, + "loss": 0.581, + "step": 2280 + }, + { + "epoch": 0.3746022622297949, + "grad_norm": 0.3342741300499172, + "learning_rate": 8.904439693720511e-06, + "loss": 0.5672, + "step": 2281 + }, + { + "epoch": 0.3747664894381377, + "grad_norm": 0.29876866247680856, + "learning_rate": 8.904343535882892e-06, + "loss": 0.5563, + "step": 2282 + }, + { + "epoch": 0.3749307166464804, + "grad_norm": 0.3272388991501618, + "learning_rate": 8.90424733020977e-06, + "loss": 0.5733, + "step": 2283 + }, + { + "epoch": 0.37509494385482317, + "grad_norm": 0.29504478377135096, + "learning_rate": 8.904151076702187e-06, + "loss": 0.5582, + "step": 2284 + }, + { + "epoch": 0.37525917106316586, + "grad_norm": 0.32151438651497766, + "learning_rate": 8.904054775361191e-06, + "loss": 0.5808, + "step": 2285 + }, + { + "epoch": 0.3754233982715086, + "grad_norm": 0.29408465592424793, + "learning_rate": 8.903958426187827e-06, + "loss": 0.6031, + "step": 2286 + }, + { + "epoch": 0.37558762547985136, + "grad_norm": 0.39603717369803254, + "learning_rate": 8.903862029183144e-06, + "loss": 0.5646, + "step": 2287 + }, + { + "epoch": 0.3757518526881941, + "grad_norm": 0.27728565310231973, + "learning_rate": 8.903765584348183e-06, + "loss": 0.5764, + "step": 2288 + }, + { + "epoch": 0.37591607989653686, + "grad_norm": 0.3228980017218485, + "learning_rate": 8.903669091683996e-06, + "loss": 0.5758, + "step": 2289 + }, + { + "epoch": 0.3760803071048796, + "grad_norm": 0.27396890144473407, + "learning_rate": 8.903572551191633e-06, + "loss": 0.5904, + "step": 2290 + }, + { + "epoch": 0.37624453431322236, + "grad_norm": 0.33219915905771785, + "learning_rate": 8.903475962872135e-06, + "loss": 0.588, + "step": 2291 + }, + { + "epoch": 0.3764087615215651, + "grad_norm": 0.29442976439850005, + "learning_rate": 8.903379326726559e-06, + "loss": 0.5726, + "step": 2292 + }, + { + "epoch": 0.37657298872990785, + "grad_norm": 0.27379664077338145, + "learning_rate": 8.903282642755948e-06, + "loss": 0.5569, + "step": 2293 + }, + { + "epoch": 0.37673721593825055, + "grad_norm": 0.309011494993213, + "learning_rate": 8.903185910961358e-06, + "loss": 0.5866, + "step": 2294 + }, + { + "epoch": 0.3769014431465933, + "grad_norm": 0.34027174363674284, + "learning_rate": 8.903089131343835e-06, + "loss": 0.5794, + "step": 2295 + }, + { + "epoch": 0.37706567035493604, + "grad_norm": 0.2904502113115749, + "learning_rate": 8.902992303904435e-06, + "loss": 0.6026, + "step": 2296 + }, + { + "epoch": 0.3772298975632788, + "grad_norm": 0.38170797705184983, + "learning_rate": 8.902895428644203e-06, + "loss": 0.5842, + "step": 2297 + }, + { + "epoch": 0.37739412477162154, + "grad_norm": 0.31501550061600586, + "learning_rate": 8.902798505564198e-06, + "loss": 0.5805, + "step": 2298 + }, + { + "epoch": 0.3775583519799643, + "grad_norm": 0.33472044182482413, + "learning_rate": 8.902701534665467e-06, + "loss": 0.596, + "step": 2299 + }, + { + "epoch": 0.37772257918830704, + "grad_norm": 0.29453287123397076, + "learning_rate": 8.902604515949067e-06, + "loss": 0.5808, + "step": 2300 + }, + { + "epoch": 0.3778868063966498, + "grad_norm": 0.3003032769783216, + "learning_rate": 8.90250744941605e-06, + "loss": 0.595, + "step": 2301 + }, + { + "epoch": 0.3780510336049925, + "grad_norm": 0.33000419779183837, + "learning_rate": 8.90241033506747e-06, + "loss": 0.5461, + "step": 2302 + }, + { + "epoch": 0.37821526081333523, + "grad_norm": 0.44537302571064064, + "learning_rate": 8.902313172904383e-06, + "loss": 0.5843, + "step": 2303 + }, + { + "epoch": 0.378379488021678, + "grad_norm": 0.27528935880347627, + "learning_rate": 8.902215962927844e-06, + "loss": 0.5868, + "step": 2304 + }, + { + "epoch": 0.3785437152300207, + "grad_norm": 0.30776693033578106, + "learning_rate": 8.902118705138908e-06, + "loss": 0.5892, + "step": 2305 + }, + { + "epoch": 0.3787079424383635, + "grad_norm": 0.4128017693757589, + "learning_rate": 8.90202139953863e-06, + "loss": 0.5512, + "step": 2306 + }, + { + "epoch": 0.3788721696467062, + "grad_norm": 0.29928028112716526, + "learning_rate": 8.901924046128072e-06, + "loss": 0.5752, + "step": 2307 + }, + { + "epoch": 0.379036396855049, + "grad_norm": 0.2854681450585411, + "learning_rate": 8.901826644908287e-06, + "loss": 0.5643, + "step": 2308 + }, + { + "epoch": 0.3792006240633917, + "grad_norm": 0.33566978961389754, + "learning_rate": 8.901729195880332e-06, + "loss": 0.5894, + "step": 2309 + }, + { + "epoch": 0.37936485127173447, + "grad_norm": 0.3031308669451596, + "learning_rate": 8.90163169904527e-06, + "loss": 0.5769, + "step": 2310 + }, + { + "epoch": 0.37952907848007716, + "grad_norm": 0.3232660855580926, + "learning_rate": 8.901534154404154e-06, + "loss": 0.5928, + "step": 2311 + }, + { + "epoch": 0.3796933056884199, + "grad_norm": 0.3708012017505623, + "learning_rate": 8.901436561958048e-06, + "loss": 0.5831, + "step": 2312 + }, + { + "epoch": 0.37985753289676266, + "grad_norm": 0.31052685319187673, + "learning_rate": 8.90133892170801e-06, + "loss": 0.5587, + "step": 2313 + }, + { + "epoch": 0.3800217601051054, + "grad_norm": 0.32187830125576383, + "learning_rate": 8.901241233655103e-06, + "loss": 0.5717, + "step": 2314 + }, + { + "epoch": 0.38018598731344816, + "grad_norm": 0.43759240032581914, + "learning_rate": 8.901143497800383e-06, + "loss": 0.5554, + "step": 2315 + }, + { + "epoch": 0.3803502145217909, + "grad_norm": 0.44601495555487336, + "learning_rate": 8.901045714144916e-06, + "loss": 0.5956, + "step": 2316 + }, + { + "epoch": 0.38051444173013366, + "grad_norm": 0.33538126500137966, + "learning_rate": 8.900947882689763e-06, + "loss": 0.567, + "step": 2317 + }, + { + "epoch": 0.3806786689384764, + "grad_norm": 0.31306894560268406, + "learning_rate": 8.900850003435985e-06, + "loss": 0.5624, + "step": 2318 + }, + { + "epoch": 0.3808428961468191, + "grad_norm": 0.3111850571560804, + "learning_rate": 8.900752076384648e-06, + "loss": 0.5789, + "step": 2319 + }, + { + "epoch": 0.38100712335516185, + "grad_norm": 0.3030241102758526, + "learning_rate": 8.900654101536811e-06, + "loss": 0.5693, + "step": 2320 + }, + { + "epoch": 0.3811713505635046, + "grad_norm": 0.33699935182531765, + "learning_rate": 8.900556078893542e-06, + "loss": 0.5959, + "step": 2321 + }, + { + "epoch": 0.38133557777184734, + "grad_norm": 0.3544044560119914, + "learning_rate": 8.900458008455905e-06, + "loss": 0.5713, + "step": 2322 + }, + { + "epoch": 0.3814998049801901, + "grad_norm": 0.3177507153530851, + "learning_rate": 8.900359890224965e-06, + "loss": 0.5984, + "step": 2323 + }, + { + "epoch": 0.38166403218853284, + "grad_norm": 0.3175167714364075, + "learning_rate": 8.900261724201786e-06, + "loss": 0.5915, + "step": 2324 + }, + { + "epoch": 0.3818282593968756, + "grad_norm": 0.4009151151987348, + "learning_rate": 8.900163510387436e-06, + "loss": 0.5518, + "step": 2325 + }, + { + "epoch": 0.38199248660521834, + "grad_norm": 0.3754271825257931, + "learning_rate": 8.900065248782981e-06, + "loss": 0.6039, + "step": 2326 + }, + { + "epoch": 0.3821567138135611, + "grad_norm": 0.32691754192502154, + "learning_rate": 8.899966939389488e-06, + "loss": 0.5507, + "step": 2327 + }, + { + "epoch": 0.3823209410219038, + "grad_norm": 0.28998835197412, + "learning_rate": 8.899868582208024e-06, + "loss": 0.5652, + "step": 2328 + }, + { + "epoch": 0.38248516823024653, + "grad_norm": 0.3133390970464335, + "learning_rate": 8.89977017723966e-06, + "loss": 0.5767, + "step": 2329 + }, + { + "epoch": 0.3826493954385893, + "grad_norm": 0.27878076670015955, + "learning_rate": 8.899671724485463e-06, + "loss": 0.5876, + "step": 2330 + }, + { + "epoch": 0.38281362264693203, + "grad_norm": 0.3268019065018182, + "learning_rate": 8.899573223946502e-06, + "loss": 0.5673, + "step": 2331 + }, + { + "epoch": 0.3829778498552748, + "grad_norm": 0.373397306799112, + "learning_rate": 8.899474675623847e-06, + "loss": 0.5799, + "step": 2332 + }, + { + "epoch": 0.3831420770636175, + "grad_norm": 0.2975325766072855, + "learning_rate": 8.89937607951857e-06, + "loss": 0.5762, + "step": 2333 + }, + { + "epoch": 0.3833063042719603, + "grad_norm": 0.28508597358780635, + "learning_rate": 8.899277435631738e-06, + "loss": 0.5858, + "step": 2334 + }, + { + "epoch": 0.383470531480303, + "grad_norm": 0.4014745772227359, + "learning_rate": 8.899178743964426e-06, + "loss": 0.5544, + "step": 2335 + }, + { + "epoch": 0.3836347586886457, + "grad_norm": 0.32360223924724896, + "learning_rate": 8.899080004517704e-06, + "loss": 0.5718, + "step": 2336 + }, + { + "epoch": 0.38379898589698846, + "grad_norm": 0.33739313880409866, + "learning_rate": 8.898981217292645e-06, + "loss": 0.5623, + "step": 2337 + }, + { + "epoch": 0.3839632131053312, + "grad_norm": 0.3172906694171004, + "learning_rate": 8.898882382290323e-06, + "loss": 0.5796, + "step": 2338 + }, + { + "epoch": 0.38412744031367396, + "grad_norm": 0.35624301408414016, + "learning_rate": 8.89878349951181e-06, + "loss": 0.5828, + "step": 2339 + }, + { + "epoch": 0.3842916675220167, + "grad_norm": 0.3434168572205274, + "learning_rate": 8.89868456895818e-06, + "loss": 0.5946, + "step": 2340 + }, + { + "epoch": 0.38445589473035946, + "grad_norm": 0.3153538586937991, + "learning_rate": 8.898585590630508e-06, + "loss": 0.593, + "step": 2341 + }, + { + "epoch": 0.3846201219387022, + "grad_norm": 0.296511438451358, + "learning_rate": 8.89848656452987e-06, + "loss": 0.5789, + "step": 2342 + }, + { + "epoch": 0.38478434914704496, + "grad_norm": 0.42450348545740313, + "learning_rate": 8.898387490657339e-06, + "loss": 0.5797, + "step": 2343 + }, + { + "epoch": 0.3849485763553877, + "grad_norm": 0.33579453521532565, + "learning_rate": 8.898288369013993e-06, + "loss": 0.5962, + "step": 2344 + }, + { + "epoch": 0.3851128035637304, + "grad_norm": 0.30616008807308986, + "learning_rate": 8.898189199600907e-06, + "loss": 0.5813, + "step": 2345 + }, + { + "epoch": 0.38527703077207315, + "grad_norm": 0.36008697096086706, + "learning_rate": 8.89808998241916e-06, + "loss": 0.5833, + "step": 2346 + }, + { + "epoch": 0.3854412579804159, + "grad_norm": 0.30608114375905615, + "learning_rate": 8.897990717469828e-06, + "loss": 0.5581, + "step": 2347 + }, + { + "epoch": 0.38560548518875865, + "grad_norm": 0.29685505491002356, + "learning_rate": 8.89789140475399e-06, + "loss": 0.577, + "step": 2348 + }, + { + "epoch": 0.3857697123971014, + "grad_norm": 0.3025244608028047, + "learning_rate": 8.897792044272724e-06, + "loss": 0.5567, + "step": 2349 + }, + { + "epoch": 0.38593393960544414, + "grad_norm": 0.2933770229759006, + "learning_rate": 8.897692636027112e-06, + "loss": 0.5592, + "step": 2350 + }, + { + "epoch": 0.3860981668137869, + "grad_norm": 0.3232920174640066, + "learning_rate": 8.89759318001823e-06, + "loss": 0.5668, + "step": 2351 + }, + { + "epoch": 0.38626239402212964, + "grad_norm": 0.3288617268332227, + "learning_rate": 8.897493676247158e-06, + "loss": 0.5708, + "step": 2352 + }, + { + "epoch": 0.38642662123047233, + "grad_norm": 0.36540336033645326, + "learning_rate": 8.897394124714979e-06, + "loss": 0.5605, + "step": 2353 + }, + { + "epoch": 0.3865908484388151, + "grad_norm": 0.38478628819551197, + "learning_rate": 8.897294525422773e-06, + "loss": 0.5824, + "step": 2354 + }, + { + "epoch": 0.38675507564715783, + "grad_norm": 0.3239640598548664, + "learning_rate": 8.897194878371623e-06, + "loss": 0.5794, + "step": 2355 + }, + { + "epoch": 0.3869193028555006, + "grad_norm": 0.31349793588556835, + "learning_rate": 8.897095183562609e-06, + "loss": 0.5636, + "step": 2356 + }, + { + "epoch": 0.38708353006384333, + "grad_norm": 0.3038538132161622, + "learning_rate": 8.896995440996816e-06, + "loss": 0.5791, + "step": 2357 + }, + { + "epoch": 0.3872477572721861, + "grad_norm": 0.32051112577468094, + "learning_rate": 8.896895650675327e-06, + "loss": 0.5791, + "step": 2358 + }, + { + "epoch": 0.3874119844805288, + "grad_norm": 0.44838209737281975, + "learning_rate": 8.896795812599224e-06, + "loss": 0.5915, + "step": 2359 + }, + { + "epoch": 0.3875762116888716, + "grad_norm": 0.34659678851515363, + "learning_rate": 8.896695926769594e-06, + "loss": 0.5726, + "step": 2360 + }, + { + "epoch": 0.3877404388972143, + "grad_norm": 0.2887359708809979, + "learning_rate": 8.89659599318752e-06, + "loss": 0.5854, + "step": 2361 + }, + { + "epoch": 0.387904666105557, + "grad_norm": 0.3166935243544558, + "learning_rate": 8.896496011854087e-06, + "loss": 0.582, + "step": 2362 + }, + { + "epoch": 0.38806889331389977, + "grad_norm": 0.2931378659260637, + "learning_rate": 8.896395982770382e-06, + "loss": 0.5595, + "step": 2363 + }, + { + "epoch": 0.3882331205222425, + "grad_norm": 0.35930294339419266, + "learning_rate": 8.896295905937492e-06, + "loss": 0.5803, + "step": 2364 + }, + { + "epoch": 0.38839734773058526, + "grad_norm": 0.3112712071807427, + "learning_rate": 8.896195781356502e-06, + "loss": 0.5941, + "step": 2365 + }, + { + "epoch": 0.388561574938928, + "grad_norm": 0.2942703416825157, + "learning_rate": 8.896095609028501e-06, + "loss": 0.5783, + "step": 2366 + }, + { + "epoch": 0.38872580214727076, + "grad_norm": 0.3352361065283778, + "learning_rate": 8.895995388954577e-06, + "loss": 0.559, + "step": 2367 + }, + { + "epoch": 0.3888900293556135, + "grad_norm": 0.2917202118240349, + "learning_rate": 8.895895121135819e-06, + "loss": 0.5731, + "step": 2368 + }, + { + "epoch": 0.38905425656395626, + "grad_norm": 0.30769072743532866, + "learning_rate": 8.895794805573313e-06, + "loss": 0.5623, + "step": 2369 + }, + { + "epoch": 0.38921848377229895, + "grad_norm": 0.2893103915184653, + "learning_rate": 8.89569444226815e-06, + "loss": 0.5903, + "step": 2370 + }, + { + "epoch": 0.3893827109806417, + "grad_norm": 0.29857635371328933, + "learning_rate": 8.895594031221423e-06, + "loss": 0.5766, + "step": 2371 + }, + { + "epoch": 0.38954693818898445, + "grad_norm": 0.5167094917444112, + "learning_rate": 8.895493572434218e-06, + "loss": 0.5776, + "step": 2372 + }, + { + "epoch": 0.3897111653973272, + "grad_norm": 0.358184481637941, + "learning_rate": 8.89539306590763e-06, + "loss": 0.595, + "step": 2373 + }, + { + "epoch": 0.38987539260566995, + "grad_norm": 1.3947990141046052, + "learning_rate": 8.895292511642748e-06, + "loss": 0.5791, + "step": 2374 + }, + { + "epoch": 0.3900396198140127, + "grad_norm": 0.33588084377988386, + "learning_rate": 8.895191909640665e-06, + "loss": 0.5948, + "step": 2375 + }, + { + "epoch": 0.39020384702235544, + "grad_norm": 0.42192446454016386, + "learning_rate": 8.895091259902472e-06, + "loss": 0.5804, + "step": 2376 + }, + { + "epoch": 0.3903680742306982, + "grad_norm": 0.3417045707043655, + "learning_rate": 8.894990562429265e-06, + "loss": 0.5712, + "step": 2377 + }, + { + "epoch": 0.39053230143904094, + "grad_norm": 0.32637787783707956, + "learning_rate": 8.894889817222138e-06, + "loss": 0.59, + "step": 2378 + }, + { + "epoch": 0.39069652864738363, + "grad_norm": 0.3589500141121288, + "learning_rate": 8.894789024282181e-06, + "loss": 0.578, + "step": 2379 + }, + { + "epoch": 0.3908607558557264, + "grad_norm": 0.3401379685320087, + "learning_rate": 8.894688183610494e-06, + "loss": 0.5698, + "step": 2380 + }, + { + "epoch": 0.39102498306406913, + "grad_norm": 0.31926923165948984, + "learning_rate": 8.894587295208167e-06, + "loss": 0.5883, + "step": 2381 + }, + { + "epoch": 0.3911892102724119, + "grad_norm": 0.3215207946196231, + "learning_rate": 8.8944863590763e-06, + "loss": 0.5943, + "step": 2382 + }, + { + "epoch": 0.39135343748075463, + "grad_norm": 0.3453997776935591, + "learning_rate": 8.894385375215987e-06, + "loss": 0.5744, + "step": 2383 + }, + { + "epoch": 0.3915176646890974, + "grad_norm": 0.3461759429970046, + "learning_rate": 8.894284343628326e-06, + "loss": 0.5682, + "step": 2384 + }, + { + "epoch": 0.3916818918974401, + "grad_norm": 0.33305618278830995, + "learning_rate": 8.89418326431441e-06, + "loss": 0.6009, + "step": 2385 + }, + { + "epoch": 0.3918461191057829, + "grad_norm": 0.34941470397732544, + "learning_rate": 8.894082137275344e-06, + "loss": 0.5922, + "step": 2386 + }, + { + "epoch": 0.39201034631412557, + "grad_norm": 0.33931275242971576, + "learning_rate": 8.893980962512224e-06, + "loss": 0.5559, + "step": 2387 + }, + { + "epoch": 0.3921745735224683, + "grad_norm": 0.43197181831368625, + "learning_rate": 8.893879740026146e-06, + "loss": 0.5432, + "step": 2388 + }, + { + "epoch": 0.39233880073081107, + "grad_norm": 0.4445672617305598, + "learning_rate": 8.893778469818211e-06, + "loss": 0.5849, + "step": 2389 + }, + { + "epoch": 0.3925030279391538, + "grad_norm": 0.30662557274934144, + "learning_rate": 8.893677151889517e-06, + "loss": 0.5479, + "step": 2390 + }, + { + "epoch": 0.39266725514749656, + "grad_norm": 0.35586055709357794, + "learning_rate": 8.893575786241168e-06, + "loss": 0.5713, + "step": 2391 + }, + { + "epoch": 0.3928314823558393, + "grad_norm": 0.3416283784802478, + "learning_rate": 8.893474372874264e-06, + "loss": 0.5692, + "step": 2392 + }, + { + "epoch": 0.39299570956418206, + "grad_norm": 0.37964865251282476, + "learning_rate": 8.893372911789904e-06, + "loss": 0.5615, + "step": 2393 + }, + { + "epoch": 0.3931599367725248, + "grad_norm": 0.30654274690443206, + "learning_rate": 8.893271402989193e-06, + "loss": 0.5878, + "step": 2394 + }, + { + "epoch": 0.3933241639808675, + "grad_norm": 0.3098855741105631, + "learning_rate": 8.893169846473233e-06, + "loss": 0.586, + "step": 2395 + }, + { + "epoch": 0.39348839118921025, + "grad_norm": 0.3284469381158421, + "learning_rate": 8.893068242243126e-06, + "loss": 0.5656, + "step": 2396 + }, + { + "epoch": 0.393652618397553, + "grad_norm": 0.3457270508908126, + "learning_rate": 8.892966590299975e-06, + "loss": 0.5804, + "step": 2397 + }, + { + "epoch": 0.39381684560589575, + "grad_norm": 0.3824770008300639, + "learning_rate": 8.892864890644882e-06, + "loss": 0.5445, + "step": 2398 + }, + { + "epoch": 0.3939810728142385, + "grad_norm": 0.2890234530985488, + "learning_rate": 8.892763143278958e-06, + "loss": 0.5591, + "step": 2399 + }, + { + "epoch": 0.39414530002258125, + "grad_norm": 0.2999344078752705, + "learning_rate": 8.892661348203304e-06, + "loss": 0.5752, + "step": 2400 + }, + { + "epoch": 0.394309527230924, + "grad_norm": 0.2966419849742581, + "learning_rate": 8.892559505419023e-06, + "loss": 0.5673, + "step": 2401 + }, + { + "epoch": 0.39447375443926674, + "grad_norm": 0.3193420409937522, + "learning_rate": 8.892457614927228e-06, + "loss": 0.5612, + "step": 2402 + }, + { + "epoch": 0.3946379816476095, + "grad_norm": 0.3291083690324307, + "learning_rate": 8.89235567672902e-06, + "loss": 0.5676, + "step": 2403 + }, + { + "epoch": 0.3948022088559522, + "grad_norm": 0.3398061099802675, + "learning_rate": 8.892253690825507e-06, + "loss": 0.5447, + "step": 2404 + }, + { + "epoch": 0.39496643606429493, + "grad_norm": 0.436882175126899, + "learning_rate": 8.892151657217799e-06, + "loss": 0.5661, + "step": 2405 + }, + { + "epoch": 0.3951306632726377, + "grad_norm": 0.9383550668483664, + "learning_rate": 8.892049575907003e-06, + "loss": 0.5763, + "step": 2406 + }, + { + "epoch": 0.39529489048098043, + "grad_norm": 0.3557356759695701, + "learning_rate": 8.891947446894224e-06, + "loss": 0.5721, + "step": 2407 + }, + { + "epoch": 0.3954591176893232, + "grad_norm": 0.40604062140883007, + "learning_rate": 8.891845270180578e-06, + "loss": 0.577, + "step": 2408 + }, + { + "epoch": 0.39562334489766593, + "grad_norm": 0.3646391321735781, + "learning_rate": 8.89174304576717e-06, + "loss": 0.5777, + "step": 2409 + }, + { + "epoch": 0.3957875721060087, + "grad_norm": 0.3310293733714123, + "learning_rate": 8.891640773655112e-06, + "loss": 0.5613, + "step": 2410 + }, + { + "epoch": 0.3959517993143514, + "grad_norm": 0.4039936546858141, + "learning_rate": 8.891538453845515e-06, + "loss": 0.5776, + "step": 2411 + }, + { + "epoch": 0.3961160265226941, + "grad_norm": 0.39043339053206516, + "learning_rate": 8.891436086339489e-06, + "loss": 0.5647, + "step": 2412 + }, + { + "epoch": 0.39628025373103687, + "grad_norm": 0.3213534464575592, + "learning_rate": 8.891333671138146e-06, + "loss": 0.5711, + "step": 2413 + }, + { + "epoch": 0.3964444809393796, + "grad_norm": 0.31102891837891017, + "learning_rate": 8.8912312082426e-06, + "loss": 0.5837, + "step": 2414 + }, + { + "epoch": 0.39660870814772237, + "grad_norm": 0.3046339082276663, + "learning_rate": 8.891128697653962e-06, + "loss": 0.5554, + "step": 2415 + }, + { + "epoch": 0.3967729353560651, + "grad_norm": 0.31346347835751853, + "learning_rate": 8.891026139373347e-06, + "loss": 0.5792, + "step": 2416 + }, + { + "epoch": 0.39693716256440786, + "grad_norm": 0.3290758388183897, + "learning_rate": 8.890923533401866e-06, + "loss": 0.5893, + "step": 2417 + }, + { + "epoch": 0.3971013897727506, + "grad_norm": 0.32363239445426245, + "learning_rate": 8.890820879740636e-06, + "loss": 0.5926, + "step": 2418 + }, + { + "epoch": 0.39726561698109336, + "grad_norm": 0.3397402640059377, + "learning_rate": 8.890718178390772e-06, + "loss": 0.5573, + "step": 2419 + }, + { + "epoch": 0.3974298441894361, + "grad_norm": 0.33337105589432475, + "learning_rate": 8.89061542935339e-06, + "loss": 0.5788, + "step": 2420 + }, + { + "epoch": 0.3975940713977788, + "grad_norm": 0.3040636722728571, + "learning_rate": 8.890512632629603e-06, + "loss": 0.5735, + "step": 2421 + }, + { + "epoch": 0.39775829860612155, + "grad_norm": 0.3486589140163701, + "learning_rate": 8.89040978822053e-06, + "loss": 0.5698, + "step": 2422 + }, + { + "epoch": 0.3979225258144643, + "grad_norm": 0.33530465006472016, + "learning_rate": 8.890306896127285e-06, + "loss": 0.5824, + "step": 2423 + }, + { + "epoch": 0.39808675302280705, + "grad_norm": 0.416663784834214, + "learning_rate": 8.890203956350989e-06, + "loss": 0.5673, + "step": 2424 + }, + { + "epoch": 0.3982509802311498, + "grad_norm": 0.31480558408985104, + "learning_rate": 8.89010096889276e-06, + "loss": 0.5696, + "step": 2425 + }, + { + "epoch": 0.39841520743949255, + "grad_norm": 0.3087118076140605, + "learning_rate": 8.889997933753713e-06, + "loss": 0.5764, + "step": 2426 + }, + { + "epoch": 0.3985794346478353, + "grad_norm": 0.33372436198265837, + "learning_rate": 8.88989485093497e-06, + "loss": 0.5898, + "step": 2427 + }, + { + "epoch": 0.39874366185617804, + "grad_norm": 0.35881328155788444, + "learning_rate": 8.88979172043765e-06, + "loss": 0.5731, + "step": 2428 + }, + { + "epoch": 0.39890788906452074, + "grad_norm": 0.30641486846057925, + "learning_rate": 8.889688542262872e-06, + "loss": 0.5621, + "step": 2429 + }, + { + "epoch": 0.3990721162728635, + "grad_norm": 0.3188642436177513, + "learning_rate": 8.889585316411759e-06, + "loss": 0.5958, + "step": 2430 + }, + { + "epoch": 0.39923634348120624, + "grad_norm": 0.3586864652327105, + "learning_rate": 8.88948204288543e-06, + "loss": 0.5696, + "step": 2431 + }, + { + "epoch": 0.399400570689549, + "grad_norm": 0.32446607347051026, + "learning_rate": 8.889378721685008e-06, + "loss": 0.5801, + "step": 2432 + }, + { + "epoch": 0.39956479789789173, + "grad_norm": 0.3091050200854687, + "learning_rate": 8.889275352811614e-06, + "loss": 0.5882, + "step": 2433 + }, + { + "epoch": 0.3997290251062345, + "grad_norm": 0.4026782667136338, + "learning_rate": 8.889171936266373e-06, + "loss": 0.5563, + "step": 2434 + }, + { + "epoch": 0.39989325231457723, + "grad_norm": 0.3830299878740365, + "learning_rate": 8.889068472050405e-06, + "loss": 0.5828, + "step": 2435 + }, + { + "epoch": 0.40005747952292, + "grad_norm": 0.35592783342450557, + "learning_rate": 8.888964960164833e-06, + "loss": 0.5734, + "step": 2436 + }, + { + "epoch": 0.4002217067312627, + "grad_norm": 0.31061412781069053, + "learning_rate": 8.888861400610786e-06, + "loss": 0.5967, + "step": 2437 + }, + { + "epoch": 0.4003859339396054, + "grad_norm": 0.35688423964889243, + "learning_rate": 8.888757793389384e-06, + "loss": 0.5738, + "step": 2438 + }, + { + "epoch": 0.40055016114794817, + "grad_norm": 0.30702503029090233, + "learning_rate": 8.888654138501756e-06, + "loss": 0.5848, + "step": 2439 + }, + { + "epoch": 0.4007143883562909, + "grad_norm": 0.4147640287492882, + "learning_rate": 8.888550435949027e-06, + "loss": 0.5759, + "step": 2440 + }, + { + "epoch": 0.40087861556463367, + "grad_norm": 0.3049971946792247, + "learning_rate": 8.888446685732321e-06, + "loss": 0.5723, + "step": 2441 + }, + { + "epoch": 0.4010428427729764, + "grad_norm": 0.2979275672901426, + "learning_rate": 8.888342887852767e-06, + "loss": 0.5836, + "step": 2442 + }, + { + "epoch": 0.40120706998131916, + "grad_norm": 0.31789230808393515, + "learning_rate": 8.88823904231149e-06, + "loss": 0.5758, + "step": 2443 + }, + { + "epoch": 0.4013712971896619, + "grad_norm": 0.2969921000323856, + "learning_rate": 8.888135149109623e-06, + "loss": 0.5829, + "step": 2444 + }, + { + "epoch": 0.40153552439800466, + "grad_norm": 0.32598594902859673, + "learning_rate": 8.888031208248288e-06, + "loss": 0.5533, + "step": 2445 + }, + { + "epoch": 0.40169975160634736, + "grad_norm": 0.3055143376194248, + "learning_rate": 8.887927219728618e-06, + "loss": 0.5745, + "step": 2446 + }, + { + "epoch": 0.4018639788146901, + "grad_norm": 0.3290788171475274, + "learning_rate": 8.887823183551741e-06, + "loss": 0.5758, + "step": 2447 + }, + { + "epoch": 0.40202820602303285, + "grad_norm": 0.3092126257798601, + "learning_rate": 8.887719099718788e-06, + "loss": 0.5568, + "step": 2448 + }, + { + "epoch": 0.4021924332313756, + "grad_norm": 0.31991904996752984, + "learning_rate": 8.887614968230888e-06, + "loss": 0.5766, + "step": 2449 + }, + { + "epoch": 0.40235666043971835, + "grad_norm": 0.2857049219979474, + "learning_rate": 8.887510789089173e-06, + "loss": 0.5905, + "step": 2450 + }, + { + "epoch": 0.4025208876480611, + "grad_norm": 0.3171922371979184, + "learning_rate": 8.887406562294774e-06, + "loss": 0.5537, + "step": 2451 + }, + { + "epoch": 0.40268511485640385, + "grad_norm": 0.3223877063261562, + "learning_rate": 8.887302287848822e-06, + "loss": 0.578, + "step": 2452 + }, + { + "epoch": 0.4028493420647466, + "grad_norm": 0.2973553918001752, + "learning_rate": 8.887197965752452e-06, + "loss": 0.5664, + "step": 2453 + }, + { + "epoch": 0.40301356927308934, + "grad_norm": 0.2895585733719346, + "learning_rate": 8.887093596006794e-06, + "loss": 0.5725, + "step": 2454 + }, + { + "epoch": 0.40317779648143204, + "grad_norm": 0.33434995224312225, + "learning_rate": 8.886989178612985e-06, + "loss": 0.5889, + "step": 2455 + }, + { + "epoch": 0.4033420236897748, + "grad_norm": 0.31888170721341874, + "learning_rate": 8.886884713572157e-06, + "loss": 0.5661, + "step": 2456 + }, + { + "epoch": 0.40350625089811754, + "grad_norm": 0.2937884224933003, + "learning_rate": 8.886780200885444e-06, + "loss": 0.5522, + "step": 2457 + }, + { + "epoch": 0.4036704781064603, + "grad_norm": 0.3394152732680013, + "learning_rate": 8.886675640553981e-06, + "loss": 0.5622, + "step": 2458 + }, + { + "epoch": 0.40383470531480303, + "grad_norm": 0.32314358205589594, + "learning_rate": 8.886571032578906e-06, + "loss": 0.5775, + "step": 2459 + }, + { + "epoch": 0.4039989325231458, + "grad_norm": 0.40212521213858143, + "learning_rate": 8.886466376961355e-06, + "loss": 0.5769, + "step": 2460 + }, + { + "epoch": 0.40416315973148853, + "grad_norm": 0.3058600193689216, + "learning_rate": 8.886361673702463e-06, + "loss": 0.582, + "step": 2461 + }, + { + "epoch": 0.4043273869398313, + "grad_norm": 0.27062081866814136, + "learning_rate": 8.886256922803368e-06, + "loss": 0.5452, + "step": 2462 + }, + { + "epoch": 0.404491614148174, + "grad_norm": 0.3095302292598992, + "learning_rate": 8.886152124265205e-06, + "loss": 0.5701, + "step": 2463 + }, + { + "epoch": 0.4046558413565167, + "grad_norm": 0.3196683063197642, + "learning_rate": 8.886047278089117e-06, + "loss": 0.5688, + "step": 2464 + }, + { + "epoch": 0.40482006856485947, + "grad_norm": 0.3747769365378931, + "learning_rate": 8.885942384276238e-06, + "loss": 0.5602, + "step": 2465 + }, + { + "epoch": 0.4049842957732022, + "grad_norm": 0.3314657584893606, + "learning_rate": 8.88583744282771e-06, + "loss": 0.568, + "step": 2466 + }, + { + "epoch": 0.40514852298154497, + "grad_norm": 0.2921241951052027, + "learning_rate": 8.885732453744673e-06, + "loss": 0.5694, + "step": 2467 + }, + { + "epoch": 0.4053127501898877, + "grad_norm": 0.31684539227316827, + "learning_rate": 8.885627417028266e-06, + "loss": 0.5706, + "step": 2468 + }, + { + "epoch": 0.40547697739823046, + "grad_norm": 0.29919740238742754, + "learning_rate": 8.885522332679632e-06, + "loss": 0.5611, + "step": 2469 + }, + { + "epoch": 0.4056412046065732, + "grad_norm": 0.2927552985408856, + "learning_rate": 8.88541720069991e-06, + "loss": 0.5614, + "step": 2470 + }, + { + "epoch": 0.40580543181491596, + "grad_norm": 0.33353867719732694, + "learning_rate": 8.885312021090242e-06, + "loss": 0.5638, + "step": 2471 + }, + { + "epoch": 0.40596965902325866, + "grad_norm": 0.2925862099678924, + "learning_rate": 8.885206793851771e-06, + "loss": 0.5577, + "step": 2472 + }, + { + "epoch": 0.4061338862316014, + "grad_norm": 0.31469465854680573, + "learning_rate": 8.88510151898564e-06, + "loss": 0.5658, + "step": 2473 + }, + { + "epoch": 0.40629811343994415, + "grad_norm": 0.3254583303924738, + "learning_rate": 8.884996196492992e-06, + "loss": 0.5728, + "step": 2474 + }, + { + "epoch": 0.4064623406482869, + "grad_norm": 0.3475809377362272, + "learning_rate": 8.88489082637497e-06, + "loss": 0.5621, + "step": 2475 + }, + { + "epoch": 0.40662656785662965, + "grad_norm": 0.29271987278075334, + "learning_rate": 8.88478540863272e-06, + "loss": 0.5867, + "step": 2476 + }, + { + "epoch": 0.4067907950649724, + "grad_norm": 0.29606614439458234, + "learning_rate": 8.884679943267387e-06, + "loss": 0.5621, + "step": 2477 + }, + { + "epoch": 0.40695502227331515, + "grad_norm": 0.2990525558360778, + "learning_rate": 8.884574430280117e-06, + "loss": 0.5771, + "step": 2478 + }, + { + "epoch": 0.4071192494816579, + "grad_norm": 0.34870302304095335, + "learning_rate": 8.884468869672053e-06, + "loss": 0.5587, + "step": 2479 + }, + { + "epoch": 0.4072834766900006, + "grad_norm": 0.3581627456016973, + "learning_rate": 8.884363261444344e-06, + "loss": 0.566, + "step": 2480 + }, + { + "epoch": 0.40744770389834334, + "grad_norm": 0.35359658435859415, + "learning_rate": 8.884257605598137e-06, + "loss": 0.5745, + "step": 2481 + }, + { + "epoch": 0.4076119311066861, + "grad_norm": 0.31570900922366557, + "learning_rate": 8.884151902134578e-06, + "loss": 0.5746, + "step": 2482 + }, + { + "epoch": 0.40777615831502884, + "grad_norm": 0.35211170928524965, + "learning_rate": 8.884046151054815e-06, + "loss": 0.5819, + "step": 2483 + }, + { + "epoch": 0.4079403855233716, + "grad_norm": 0.3246151812151191, + "learning_rate": 8.883940352359998e-06, + "loss": 0.5808, + "step": 2484 + }, + { + "epoch": 0.40810461273171433, + "grad_norm": 0.29286391071193524, + "learning_rate": 8.883834506051277e-06, + "loss": 0.5618, + "step": 2485 + }, + { + "epoch": 0.4082688399400571, + "grad_norm": 0.38869425964352805, + "learning_rate": 8.883728612129799e-06, + "loss": 0.5944, + "step": 2486 + }, + { + "epoch": 0.40843306714839983, + "grad_norm": 0.3357922498523062, + "learning_rate": 8.883622670596715e-06, + "loss": 0.5485, + "step": 2487 + }, + { + "epoch": 0.4085972943567426, + "grad_norm": 0.3037126347902224, + "learning_rate": 8.883516681453177e-06, + "loss": 0.5801, + "step": 2488 + }, + { + "epoch": 0.4087615215650853, + "grad_norm": 0.32469023718462664, + "learning_rate": 8.883410644700335e-06, + "loss": 0.5674, + "step": 2489 + }, + { + "epoch": 0.408925748773428, + "grad_norm": 0.31429830880564624, + "learning_rate": 8.88330456033934e-06, + "loss": 0.5506, + "step": 2490 + }, + { + "epoch": 0.40908997598177077, + "grad_norm": 0.28926492543203663, + "learning_rate": 8.883198428371346e-06, + "loss": 0.584, + "step": 2491 + }, + { + "epoch": 0.4092542031901135, + "grad_norm": 0.33964002500396295, + "learning_rate": 8.883092248797503e-06, + "loss": 0.5619, + "step": 2492 + }, + { + "epoch": 0.40941843039845627, + "grad_norm": 0.30098148533924857, + "learning_rate": 8.882986021618967e-06, + "loss": 0.5504, + "step": 2493 + }, + { + "epoch": 0.409582657606799, + "grad_norm": 0.3575580059528442, + "learning_rate": 8.88287974683689e-06, + "loss": 0.5601, + "step": 2494 + }, + { + "epoch": 0.40974688481514177, + "grad_norm": 0.32069671607176764, + "learning_rate": 8.882773424452427e-06, + "loss": 0.5295, + "step": 2495 + }, + { + "epoch": 0.4099111120234845, + "grad_norm": 0.30232051278302446, + "learning_rate": 8.882667054466731e-06, + "loss": 0.5796, + "step": 2496 + }, + { + "epoch": 0.4100753392318272, + "grad_norm": 0.3708374336609579, + "learning_rate": 8.88256063688096e-06, + "loss": 0.5743, + "step": 2497 + }, + { + "epoch": 0.41023956644016996, + "grad_norm": 0.3032029622742079, + "learning_rate": 8.88245417169627e-06, + "loss": 0.5754, + "step": 2498 + }, + { + "epoch": 0.4104037936485127, + "grad_norm": 0.2867613437909591, + "learning_rate": 8.882347658913814e-06, + "loss": 0.5713, + "step": 2499 + }, + { + "epoch": 0.41056802085685545, + "grad_norm": 0.32926370873977717, + "learning_rate": 8.882241098534751e-06, + "loss": 0.5838, + "step": 2500 + }, + { + "epoch": 0.4107322480651982, + "grad_norm": 0.3428683835495346, + "learning_rate": 8.88213449056024e-06, + "loss": 0.5651, + "step": 2501 + }, + { + "epoch": 0.41089647527354095, + "grad_norm": 0.28973504149209034, + "learning_rate": 8.882027834991435e-06, + "loss": 0.5739, + "step": 2502 + }, + { + "epoch": 0.4110607024818837, + "grad_norm": 0.2797815273126128, + "learning_rate": 8.881921131829497e-06, + "loss": 0.5723, + "step": 2503 + }, + { + "epoch": 0.41122492969022645, + "grad_norm": 0.4145755184754829, + "learning_rate": 8.881814381075583e-06, + "loss": 0.5595, + "step": 2504 + }, + { + "epoch": 0.4113891568985692, + "grad_norm": 0.3331349868005999, + "learning_rate": 8.881707582730855e-06, + "loss": 0.5694, + "step": 2505 + }, + { + "epoch": 0.4115533841069119, + "grad_norm": 0.30051933126756675, + "learning_rate": 8.881600736796473e-06, + "loss": 0.5705, + "step": 2506 + }, + { + "epoch": 0.41171761131525464, + "grad_norm": 0.33450947684165244, + "learning_rate": 8.881493843273595e-06, + "loss": 0.5733, + "step": 2507 + }, + { + "epoch": 0.4118818385235974, + "grad_norm": 0.32963924885381335, + "learning_rate": 8.881386902163382e-06, + "loss": 0.5658, + "step": 2508 + }, + { + "epoch": 0.41204606573194014, + "grad_norm": 0.35930224711053116, + "learning_rate": 8.881279913466997e-06, + "loss": 0.5658, + "step": 2509 + }, + { + "epoch": 0.4122102929402829, + "grad_norm": 0.3704447240612607, + "learning_rate": 8.881172877185601e-06, + "loss": 0.5525, + "step": 2510 + }, + { + "epoch": 0.41237452014862563, + "grad_norm": 0.30949854959121004, + "learning_rate": 8.881065793320358e-06, + "loss": 0.5693, + "step": 2511 + }, + { + "epoch": 0.4125387473569684, + "grad_norm": 0.32021726342404155, + "learning_rate": 8.880958661872431e-06, + "loss": 0.5729, + "step": 2512 + }, + { + "epoch": 0.41270297456531113, + "grad_norm": 0.298710533844321, + "learning_rate": 8.88085148284298e-06, + "loss": 0.5395, + "step": 2513 + }, + { + "epoch": 0.4128672017736538, + "grad_norm": 0.3740569254074185, + "learning_rate": 8.880744256233175e-06, + "loss": 0.5472, + "step": 2514 + }, + { + "epoch": 0.4130314289819966, + "grad_norm": 0.4018513304426621, + "learning_rate": 8.880636982044176e-06, + "loss": 0.5624, + "step": 2515 + }, + { + "epoch": 0.4131956561903393, + "grad_norm": 0.40970239846974127, + "learning_rate": 8.88052966027715e-06, + "loss": 0.5564, + "step": 2516 + }, + { + "epoch": 0.41335988339868207, + "grad_norm": 0.3117304225668632, + "learning_rate": 8.88042229093326e-06, + "loss": 0.54, + "step": 2517 + }, + { + "epoch": 0.4135241106070248, + "grad_norm": 0.28437155849049806, + "learning_rate": 8.880314874013674e-06, + "loss": 0.5711, + "step": 2518 + }, + { + "epoch": 0.41368833781536757, + "grad_norm": 0.3332562664647129, + "learning_rate": 8.88020740951956e-06, + "loss": 0.5559, + "step": 2519 + }, + { + "epoch": 0.4138525650237103, + "grad_norm": 0.5483444520242049, + "learning_rate": 8.880099897452086e-06, + "loss": 0.5722, + "step": 2520 + }, + { + "epoch": 0.41401679223205307, + "grad_norm": 0.3011635778727716, + "learning_rate": 8.879992337812416e-06, + "loss": 0.5684, + "step": 2521 + }, + { + "epoch": 0.4141810194403958, + "grad_norm": 0.31450829980721595, + "learning_rate": 8.879884730601718e-06, + "loss": 0.5698, + "step": 2522 + }, + { + "epoch": 0.4143452466487385, + "grad_norm": 0.31001996453014075, + "learning_rate": 8.879777075821165e-06, + "loss": 0.5342, + "step": 2523 + }, + { + "epoch": 0.41450947385708126, + "grad_norm": 0.3139993732413725, + "learning_rate": 8.879669373471923e-06, + "loss": 0.5567, + "step": 2524 + }, + { + "epoch": 0.414673701065424, + "grad_norm": 0.3203420354656011, + "learning_rate": 8.879561623555163e-06, + "loss": 0.5657, + "step": 2525 + }, + { + "epoch": 0.41483792827376675, + "grad_norm": 0.3108136022021269, + "learning_rate": 8.879453826072055e-06, + "loss": 0.5815, + "step": 2526 + }, + { + "epoch": 0.4150021554821095, + "grad_norm": 0.30562850254753904, + "learning_rate": 8.879345981023769e-06, + "loss": 0.5765, + "step": 2527 + }, + { + "epoch": 0.41516638269045225, + "grad_norm": 0.3901832252063882, + "learning_rate": 8.879238088411476e-06, + "loss": 0.5606, + "step": 2528 + }, + { + "epoch": 0.415330609898795, + "grad_norm": 0.3017152351908569, + "learning_rate": 8.879130148236351e-06, + "loss": 0.6051, + "step": 2529 + }, + { + "epoch": 0.41549483710713775, + "grad_norm": 0.37648168466183224, + "learning_rate": 8.879022160499563e-06, + "loss": 0.557, + "step": 2530 + }, + { + "epoch": 0.41565906431548044, + "grad_norm": 0.31899551450864727, + "learning_rate": 8.878914125202287e-06, + "loss": 0.5608, + "step": 2531 + }, + { + "epoch": 0.4158232915238232, + "grad_norm": 0.2937937843776762, + "learning_rate": 8.878806042345693e-06, + "loss": 0.5608, + "step": 2532 + }, + { + "epoch": 0.41598751873216594, + "grad_norm": 0.3343905076548578, + "learning_rate": 8.878697911930959e-06, + "loss": 0.5666, + "step": 2533 + }, + { + "epoch": 0.4161517459405087, + "grad_norm": 0.37676843413898936, + "learning_rate": 8.878589733959256e-06, + "loss": 0.554, + "step": 2534 + }, + { + "epoch": 0.41631597314885144, + "grad_norm": 0.3158506997271894, + "learning_rate": 8.878481508431762e-06, + "loss": 0.56, + "step": 2535 + }, + { + "epoch": 0.4164802003571942, + "grad_norm": 0.31815058752542835, + "learning_rate": 8.87837323534965e-06, + "loss": 0.5791, + "step": 2536 + }, + { + "epoch": 0.41664442756553693, + "grad_norm": 0.4371060424111749, + "learning_rate": 8.878264914714098e-06, + "loss": 0.5815, + "step": 2537 + }, + { + "epoch": 0.4168086547738797, + "grad_norm": 0.2983880649857614, + "learning_rate": 8.878156546526282e-06, + "loss": 0.5604, + "step": 2538 + }, + { + "epoch": 0.41697288198222243, + "grad_norm": 0.34073001572633294, + "learning_rate": 8.878048130787376e-06, + "loss": 0.5813, + "step": 2539 + }, + { + "epoch": 0.4171371091905651, + "grad_norm": 0.3624505324000262, + "learning_rate": 8.877939667498561e-06, + "loss": 0.551, + "step": 2540 + }, + { + "epoch": 0.4173013363989079, + "grad_norm": 0.3038346714696093, + "learning_rate": 8.877831156661015e-06, + "loss": 0.5392, + "step": 2541 + }, + { + "epoch": 0.4174655636072506, + "grad_norm": 0.3447650018068754, + "learning_rate": 8.877722598275915e-06, + "loss": 0.5765, + "step": 2542 + }, + { + "epoch": 0.41762979081559337, + "grad_norm": 0.30392547582003404, + "learning_rate": 8.87761399234444e-06, + "loss": 0.5616, + "step": 2543 + }, + { + "epoch": 0.4177940180239361, + "grad_norm": 0.3887953041088847, + "learning_rate": 8.87750533886777e-06, + "loss": 0.5483, + "step": 2544 + }, + { + "epoch": 0.41795824523227887, + "grad_norm": 0.3289178164106548, + "learning_rate": 8.877396637847085e-06, + "loss": 0.5337, + "step": 2545 + }, + { + "epoch": 0.4181224724406216, + "grad_norm": 0.332748619678136, + "learning_rate": 8.877287889283566e-06, + "loss": 0.5682, + "step": 2546 + }, + { + "epoch": 0.41828669964896437, + "grad_norm": 0.3044178943644956, + "learning_rate": 8.877179093178394e-06, + "loss": 0.5424, + "step": 2547 + }, + { + "epoch": 0.41845092685730706, + "grad_norm": 0.3173245691997307, + "learning_rate": 8.87707024953275e-06, + "loss": 0.5936, + "step": 2548 + }, + { + "epoch": 0.4186151540656498, + "grad_norm": 0.265613088884184, + "learning_rate": 8.876961358347819e-06, + "loss": 0.561, + "step": 2549 + }, + { + "epoch": 0.41877938127399256, + "grad_norm": 0.3146875818714378, + "learning_rate": 8.876852419624777e-06, + "loss": 0.5787, + "step": 2550 + }, + { + "epoch": 0.4189436084823353, + "grad_norm": 0.37599575888562486, + "learning_rate": 8.876743433364814e-06, + "loss": 0.5568, + "step": 2551 + }, + { + "epoch": 0.41910783569067805, + "grad_norm": 0.3598518156246321, + "learning_rate": 8.876634399569111e-06, + "loss": 0.5856, + "step": 2552 + }, + { + "epoch": 0.4192720628990208, + "grad_norm": 0.30311498096956535, + "learning_rate": 8.876525318238852e-06, + "loss": 0.5876, + "step": 2553 + }, + { + "epoch": 0.41943629010736355, + "grad_norm": 0.3171555277490769, + "learning_rate": 8.876416189375222e-06, + "loss": 0.5866, + "step": 2554 + }, + { + "epoch": 0.4196005173157063, + "grad_norm": 0.3690366608054597, + "learning_rate": 8.876307012979409e-06, + "loss": 0.5691, + "step": 2555 + }, + { + "epoch": 0.41976474452404905, + "grad_norm": 0.2956348303710773, + "learning_rate": 8.876197789052593e-06, + "loss": 0.5774, + "step": 2556 + }, + { + "epoch": 0.41992897173239174, + "grad_norm": 0.30313111772520873, + "learning_rate": 8.876088517595964e-06, + "loss": 0.5626, + "step": 2557 + }, + { + "epoch": 0.4200931989407345, + "grad_norm": 0.29748543393772187, + "learning_rate": 8.875979198610709e-06, + "loss": 0.585, + "step": 2558 + }, + { + "epoch": 0.42025742614907724, + "grad_norm": 0.35983239485183327, + "learning_rate": 8.875869832098014e-06, + "loss": 0.5698, + "step": 2559 + }, + { + "epoch": 0.42042165335742, + "grad_norm": 0.2952903149801761, + "learning_rate": 8.875760418059067e-06, + "loss": 0.5602, + "step": 2560 + }, + { + "epoch": 0.42058588056576274, + "grad_norm": 0.2748700755222291, + "learning_rate": 8.875650956495058e-06, + "loss": 0.5571, + "step": 2561 + }, + { + "epoch": 0.4207501077741055, + "grad_norm": 0.30142460059542353, + "learning_rate": 8.875541447407174e-06, + "loss": 0.5815, + "step": 2562 + }, + { + "epoch": 0.42091433498244823, + "grad_norm": 0.30208405047227427, + "learning_rate": 8.875431890796603e-06, + "loss": 0.5731, + "step": 2563 + }, + { + "epoch": 0.421078562190791, + "grad_norm": 0.3118637667702491, + "learning_rate": 8.875322286664538e-06, + "loss": 0.5474, + "step": 2564 + }, + { + "epoch": 0.4212427893991337, + "grad_norm": 0.317291042191685, + "learning_rate": 8.87521263501217e-06, + "loss": 0.5446, + "step": 2565 + }, + { + "epoch": 0.4214070166074764, + "grad_norm": 0.35441933886963456, + "learning_rate": 8.875102935840687e-06, + "loss": 0.5516, + "step": 2566 + }, + { + "epoch": 0.4215712438158192, + "grad_norm": 0.3204356969177786, + "learning_rate": 8.874993189151281e-06, + "loss": 0.5661, + "step": 2567 + }, + { + "epoch": 0.4217354710241619, + "grad_norm": 0.30987334724965443, + "learning_rate": 8.874883394945145e-06, + "loss": 0.5629, + "step": 2568 + }, + { + "epoch": 0.42189969823250467, + "grad_norm": 0.32477041715743676, + "learning_rate": 8.87477355322347e-06, + "loss": 0.5627, + "step": 2569 + }, + { + "epoch": 0.4220639254408474, + "grad_norm": 0.2954548449250752, + "learning_rate": 8.874663663987452e-06, + "loss": 0.5503, + "step": 2570 + }, + { + "epoch": 0.42222815264919017, + "grad_norm": 0.3245662283540012, + "learning_rate": 8.874553727238281e-06, + "loss": 0.5747, + "step": 2571 + }, + { + "epoch": 0.4223923798575329, + "grad_norm": 0.3987350454378436, + "learning_rate": 8.874443742977154e-06, + "loss": 0.5561, + "step": 2572 + }, + { + "epoch": 0.4225566070658756, + "grad_norm": 0.3345985015745256, + "learning_rate": 8.874333711205264e-06, + "loss": 0.5555, + "step": 2573 + }, + { + "epoch": 0.42272083427421836, + "grad_norm": 0.28819563482433996, + "learning_rate": 8.874223631923804e-06, + "loss": 0.5392, + "step": 2574 + }, + { + "epoch": 0.4228850614825611, + "grad_norm": 0.3202474861777573, + "learning_rate": 8.874113505133974e-06, + "loss": 0.565, + "step": 2575 + }, + { + "epoch": 0.42304928869090386, + "grad_norm": 0.2790382154947942, + "learning_rate": 8.874003330836966e-06, + "loss": 0.5766, + "step": 2576 + }, + { + "epoch": 0.4232135158992466, + "grad_norm": 0.44535350715320376, + "learning_rate": 8.87389310903398e-06, + "loss": 0.5744, + "step": 2577 + }, + { + "epoch": 0.42337774310758935, + "grad_norm": 0.3184028856113947, + "learning_rate": 8.87378283972621e-06, + "loss": 0.5816, + "step": 2578 + }, + { + "epoch": 0.4235419703159321, + "grad_norm": 0.2922792339457821, + "learning_rate": 8.873672522914856e-06, + "loss": 0.5509, + "step": 2579 + }, + { + "epoch": 0.42370619752427485, + "grad_norm": 0.454554563345644, + "learning_rate": 8.873562158601116e-06, + "loss": 0.5594, + "step": 2580 + }, + { + "epoch": 0.4238704247326176, + "grad_norm": 0.34907028522108435, + "learning_rate": 8.873451746786186e-06, + "loss": 0.5655, + "step": 2581 + }, + { + "epoch": 0.4240346519409603, + "grad_norm": 0.3298719872719342, + "learning_rate": 8.873341287471269e-06, + "loss": 0.5629, + "step": 2582 + }, + { + "epoch": 0.42419887914930304, + "grad_norm": 0.3389124908725632, + "learning_rate": 8.873230780657562e-06, + "loss": 0.5878, + "step": 2583 + }, + { + "epoch": 0.4243631063576458, + "grad_norm": 0.37188737857676285, + "learning_rate": 8.873120226346266e-06, + "loss": 0.5831, + "step": 2584 + }, + { + "epoch": 0.42452733356598854, + "grad_norm": 0.34890335602420697, + "learning_rate": 8.873009624538582e-06, + "loss": 0.5722, + "step": 2585 + }, + { + "epoch": 0.4246915607743313, + "grad_norm": 0.328354898484246, + "learning_rate": 8.872898975235711e-06, + "loss": 0.5569, + "step": 2586 + }, + { + "epoch": 0.42485578798267404, + "grad_norm": 0.312327677555695, + "learning_rate": 8.872788278438854e-06, + "loss": 0.5674, + "step": 2587 + }, + { + "epoch": 0.4250200151910168, + "grad_norm": 0.32296344711606056, + "learning_rate": 8.872677534149215e-06, + "loss": 0.559, + "step": 2588 + }, + { + "epoch": 0.42518424239935954, + "grad_norm": 0.36315755635387575, + "learning_rate": 8.872566742367995e-06, + "loss": 0.5895, + "step": 2589 + }, + { + "epoch": 0.42534846960770223, + "grad_norm": 0.33097917263767107, + "learning_rate": 8.8724559030964e-06, + "loss": 0.578, + "step": 2590 + }, + { + "epoch": 0.425512696816045, + "grad_norm": 0.315094597471828, + "learning_rate": 8.87234501633563e-06, + "loss": 0.5504, + "step": 2591 + }, + { + "epoch": 0.4256769240243877, + "grad_norm": 0.45635596872249246, + "learning_rate": 8.87223408208689e-06, + "loss": 0.5482, + "step": 2592 + }, + { + "epoch": 0.4258411512327305, + "grad_norm": 0.4193262239236399, + "learning_rate": 8.872123100351389e-06, + "loss": 0.5688, + "step": 2593 + }, + { + "epoch": 0.4260053784410732, + "grad_norm": 0.4273899125136232, + "learning_rate": 8.87201207113033e-06, + "loss": 0.5796, + "step": 2594 + }, + { + "epoch": 0.426169605649416, + "grad_norm": 0.3917537813124731, + "learning_rate": 8.871900994424917e-06, + "loss": 0.5666, + "step": 2595 + }, + { + "epoch": 0.4263338328577587, + "grad_norm": 0.3335513432900561, + "learning_rate": 8.871789870236358e-06, + "loss": 0.5536, + "step": 2596 + }, + { + "epoch": 0.42649806006610147, + "grad_norm": 0.3463487402113218, + "learning_rate": 8.87167869856586e-06, + "loss": 0.5663, + "step": 2597 + }, + { + "epoch": 0.4266622872744442, + "grad_norm": 0.3872490703111275, + "learning_rate": 8.87156747941463e-06, + "loss": 0.5594, + "step": 2598 + }, + { + "epoch": 0.4268265144827869, + "grad_norm": 0.31074493917689083, + "learning_rate": 8.871456212783874e-06, + "loss": 0.5922, + "step": 2599 + }, + { + "epoch": 0.42699074169112966, + "grad_norm": 0.31115722805904655, + "learning_rate": 8.871344898674806e-06, + "loss": 0.5681, + "step": 2600 + }, + { + "epoch": 0.4271549688994724, + "grad_norm": 0.30377485723430814, + "learning_rate": 8.87123353708863e-06, + "loss": 0.5751, + "step": 2601 + }, + { + "epoch": 0.42731919610781516, + "grad_norm": 0.3443330184625631, + "learning_rate": 8.871122128026559e-06, + "loss": 0.5477, + "step": 2602 + }, + { + "epoch": 0.4274834233161579, + "grad_norm": 0.3160677382382284, + "learning_rate": 8.871010671489798e-06, + "loss": 0.551, + "step": 2603 + }, + { + "epoch": 0.42764765052450066, + "grad_norm": 0.3089935106670285, + "learning_rate": 8.870899167479561e-06, + "loss": 0.5573, + "step": 2604 + }, + { + "epoch": 0.4278118777328434, + "grad_norm": 0.29792130232279923, + "learning_rate": 8.87078761599706e-06, + "loss": 0.5449, + "step": 2605 + }, + { + "epoch": 0.42797610494118615, + "grad_norm": 0.3416750043491599, + "learning_rate": 8.870676017043506e-06, + "loss": 0.5558, + "step": 2606 + }, + { + "epoch": 0.42814033214952885, + "grad_norm": 0.3137377292747842, + "learning_rate": 8.870564370620109e-06, + "loss": 0.5785, + "step": 2607 + }, + { + "epoch": 0.4283045593578716, + "grad_norm": 0.337439787190629, + "learning_rate": 8.870452676728082e-06, + "loss": 0.5636, + "step": 2608 + }, + { + "epoch": 0.42846878656621434, + "grad_norm": 0.3449765466085809, + "learning_rate": 8.870340935368641e-06, + "loss": 0.5613, + "step": 2609 + }, + { + "epoch": 0.4286330137745571, + "grad_norm": 0.3239998293778881, + "learning_rate": 8.870229146542996e-06, + "loss": 0.5427, + "step": 2610 + }, + { + "epoch": 0.42879724098289984, + "grad_norm": 0.3629879882971418, + "learning_rate": 8.870117310252364e-06, + "loss": 0.5485, + "step": 2611 + }, + { + "epoch": 0.4289614681912426, + "grad_norm": 0.322609355069675, + "learning_rate": 8.870005426497957e-06, + "loss": 0.5946, + "step": 2612 + }, + { + "epoch": 0.42912569539958534, + "grad_norm": 0.2760721542583468, + "learning_rate": 8.869893495280993e-06, + "loss": 0.545, + "step": 2613 + }, + { + "epoch": 0.4292899226079281, + "grad_norm": 0.377724350549196, + "learning_rate": 8.869781516602686e-06, + "loss": 0.5727, + "step": 2614 + }, + { + "epoch": 0.42945414981627084, + "grad_norm": 0.3233053561286611, + "learning_rate": 8.869669490464253e-06, + "loss": 0.5629, + "step": 2615 + }, + { + "epoch": 0.42961837702461353, + "grad_norm": 0.35088644192144613, + "learning_rate": 8.869557416866907e-06, + "loss": 0.561, + "step": 2616 + }, + { + "epoch": 0.4297826042329563, + "grad_norm": 0.3447050844031401, + "learning_rate": 8.86944529581187e-06, + "loss": 0.572, + "step": 2617 + }, + { + "epoch": 0.429946831441299, + "grad_norm": 0.34469094004572337, + "learning_rate": 8.86933312730036e-06, + "loss": 0.5506, + "step": 2618 + }, + { + "epoch": 0.4301110586496418, + "grad_norm": 0.327264522482107, + "learning_rate": 8.869220911333591e-06, + "loss": 0.5374, + "step": 2619 + }, + { + "epoch": 0.4302752858579845, + "grad_norm": 0.32108065910291667, + "learning_rate": 8.869108647912786e-06, + "loss": 0.5562, + "step": 2620 + }, + { + "epoch": 0.4304395130663273, + "grad_norm": 0.3075435330753905, + "learning_rate": 8.868996337039163e-06, + "loss": 0.5655, + "step": 2621 + }, + { + "epoch": 0.43060374027467, + "grad_norm": 0.34795979712051084, + "learning_rate": 8.868883978713939e-06, + "loss": 0.5661, + "step": 2622 + }, + { + "epoch": 0.43076796748301277, + "grad_norm": 0.36050340443618495, + "learning_rate": 8.868771572938337e-06, + "loss": 0.5624, + "step": 2623 + }, + { + "epoch": 0.43093219469135546, + "grad_norm": 0.3918416774697456, + "learning_rate": 8.868659119713579e-06, + "loss": 0.5749, + "step": 2624 + }, + { + "epoch": 0.4310964218996982, + "grad_norm": 0.3457358807465997, + "learning_rate": 8.868546619040884e-06, + "loss": 0.5643, + "step": 2625 + }, + { + "epoch": 0.43126064910804096, + "grad_norm": 0.39287093615546537, + "learning_rate": 8.868434070921473e-06, + "loss": 0.5588, + "step": 2626 + }, + { + "epoch": 0.4314248763163837, + "grad_norm": 0.33227836409954387, + "learning_rate": 8.868321475356572e-06, + "loss": 0.531, + "step": 2627 + }, + { + "epoch": 0.43158910352472646, + "grad_norm": 0.30996877615907675, + "learning_rate": 8.868208832347401e-06, + "loss": 0.54, + "step": 2628 + }, + { + "epoch": 0.4317533307330692, + "grad_norm": 0.29705507963891703, + "learning_rate": 8.868096141895186e-06, + "loss": 0.5647, + "step": 2629 + }, + { + "epoch": 0.43191755794141196, + "grad_norm": 0.3145590224011055, + "learning_rate": 8.867983404001147e-06, + "loss": 0.575, + "step": 2630 + }, + { + "epoch": 0.4320817851497547, + "grad_norm": 0.2984999228772706, + "learning_rate": 8.867870618666512e-06, + "loss": 0.564, + "step": 2631 + }, + { + "epoch": 0.43224601235809745, + "grad_norm": 0.3948551709555983, + "learning_rate": 8.867757785892506e-06, + "loss": 0.56, + "step": 2632 + }, + { + "epoch": 0.43241023956644015, + "grad_norm": 0.313192693394057, + "learning_rate": 8.86764490568035e-06, + "loss": 0.5537, + "step": 2633 + }, + { + "epoch": 0.4325744667747829, + "grad_norm": 0.4653518718560589, + "learning_rate": 8.867531978031276e-06, + "loss": 0.5714, + "step": 2634 + }, + { + "epoch": 0.43273869398312564, + "grad_norm": 0.3742502734817775, + "learning_rate": 8.867419002946505e-06, + "loss": 0.5631, + "step": 2635 + }, + { + "epoch": 0.4329029211914684, + "grad_norm": 0.3232307295194687, + "learning_rate": 8.867305980427268e-06, + "loss": 0.5529, + "step": 2636 + }, + { + "epoch": 0.43306714839981114, + "grad_norm": 0.2807991745055016, + "learning_rate": 8.867192910474792e-06, + "loss": 0.5406, + "step": 2637 + }, + { + "epoch": 0.4332313756081539, + "grad_norm": 0.297296622670886, + "learning_rate": 8.867079793090304e-06, + "loss": 0.5506, + "step": 2638 + }, + { + "epoch": 0.43339560281649664, + "grad_norm": 0.3542755078188722, + "learning_rate": 8.866966628275032e-06, + "loss": 0.5684, + "step": 2639 + }, + { + "epoch": 0.4335598300248394, + "grad_norm": 0.2744566073730852, + "learning_rate": 8.866853416030206e-06, + "loss": 0.5396, + "step": 2640 + }, + { + "epoch": 0.4337240572331821, + "grad_norm": 0.2959244532919851, + "learning_rate": 8.866740156357056e-06, + "loss": 0.5702, + "step": 2641 + }, + { + "epoch": 0.43388828444152483, + "grad_norm": 0.3268923575661857, + "learning_rate": 8.866626849256812e-06, + "loss": 0.5568, + "step": 2642 + }, + { + "epoch": 0.4340525116498676, + "grad_norm": 0.30432906161931816, + "learning_rate": 8.866513494730702e-06, + "loss": 0.54, + "step": 2643 + }, + { + "epoch": 0.4342167388582103, + "grad_norm": 0.33743946703434036, + "learning_rate": 8.866400092779963e-06, + "loss": 0.5553, + "step": 2644 + }, + { + "epoch": 0.4343809660665531, + "grad_norm": 0.2873723233387204, + "learning_rate": 8.866286643405819e-06, + "loss": 0.5603, + "step": 2645 + }, + { + "epoch": 0.4345451932748958, + "grad_norm": 0.4160177869919669, + "learning_rate": 8.866173146609509e-06, + "loss": 0.5581, + "step": 2646 + }, + { + "epoch": 0.4347094204832386, + "grad_norm": 0.3620215325482711, + "learning_rate": 8.866059602392262e-06, + "loss": 0.5538, + "step": 2647 + }, + { + "epoch": 0.4348736476915813, + "grad_norm": 0.34398345578712186, + "learning_rate": 8.865946010755313e-06, + "loss": 0.5617, + "step": 2648 + }, + { + "epoch": 0.43503787489992407, + "grad_norm": 0.4050944944383113, + "learning_rate": 8.865832371699894e-06, + "loss": 0.5628, + "step": 2649 + }, + { + "epoch": 0.43520210210826676, + "grad_norm": 0.34906966357449254, + "learning_rate": 8.86571868522724e-06, + "loss": 0.5485, + "step": 2650 + }, + { + "epoch": 0.4353663293166095, + "grad_norm": 0.2968598963085859, + "learning_rate": 8.865604951338585e-06, + "loss": 0.5592, + "step": 2651 + }, + { + "epoch": 0.43553055652495226, + "grad_norm": 0.2987516697302997, + "learning_rate": 8.865491170035166e-06, + "loss": 0.5615, + "step": 2652 + }, + { + "epoch": 0.435694783733295, + "grad_norm": 0.3500301965790752, + "learning_rate": 8.865377341318218e-06, + "loss": 0.5892, + "step": 2653 + }, + { + "epoch": 0.43585901094163776, + "grad_norm": 0.3929052238635463, + "learning_rate": 8.865263465188977e-06, + "loss": 0.5495, + "step": 2654 + }, + { + "epoch": 0.4360232381499805, + "grad_norm": 0.3113448223634506, + "learning_rate": 8.865149541648679e-06, + "loss": 0.5663, + "step": 2655 + }, + { + "epoch": 0.43618746535832326, + "grad_norm": 0.31364147135723414, + "learning_rate": 8.865035570698563e-06, + "loss": 0.5871, + "step": 2656 + }, + { + "epoch": 0.436351692566666, + "grad_norm": 0.31789727183776045, + "learning_rate": 8.864921552339866e-06, + "loss": 0.5898, + "step": 2657 + }, + { + "epoch": 0.4365159197750087, + "grad_norm": 0.2991212298735117, + "learning_rate": 8.864807486573827e-06, + "loss": 0.5535, + "step": 2658 + }, + { + "epoch": 0.43668014698335145, + "grad_norm": 0.31462832495510007, + "learning_rate": 8.864693373401684e-06, + "loss": 0.5674, + "step": 2659 + }, + { + "epoch": 0.4368443741916942, + "grad_norm": 0.3690385907683935, + "learning_rate": 8.864579212824676e-06, + "loss": 0.5641, + "step": 2660 + }, + { + "epoch": 0.43700860140003694, + "grad_norm": 0.30516447961861315, + "learning_rate": 8.864465004844045e-06, + "loss": 0.5619, + "step": 2661 + }, + { + "epoch": 0.4371728286083797, + "grad_norm": 0.32974499017869374, + "learning_rate": 8.864350749461027e-06, + "loss": 0.5534, + "step": 2662 + }, + { + "epoch": 0.43733705581672244, + "grad_norm": 0.5630108302746223, + "learning_rate": 8.864236446676871e-06, + "loss": 0.5881, + "step": 2663 + }, + { + "epoch": 0.4375012830250652, + "grad_norm": 0.33965305823725117, + "learning_rate": 8.864122096492808e-06, + "loss": 0.5615, + "step": 2664 + }, + { + "epoch": 0.43766551023340794, + "grad_norm": 0.2980421255968811, + "learning_rate": 8.86400769891009e-06, + "loss": 0.5415, + "step": 2665 + }, + { + "epoch": 0.4378297374417507, + "grad_norm": 0.33674308958283006, + "learning_rate": 8.863893253929951e-06, + "loss": 0.5721, + "step": 2666 + }, + { + "epoch": 0.4379939646500934, + "grad_norm": 0.37123811762089753, + "learning_rate": 8.86377876155364e-06, + "loss": 0.5859, + "step": 2667 + }, + { + "epoch": 0.43815819185843613, + "grad_norm": 0.3360935049247868, + "learning_rate": 8.863664221782397e-06, + "loss": 0.5466, + "step": 2668 + }, + { + "epoch": 0.4383224190667789, + "grad_norm": 0.3194927447043426, + "learning_rate": 8.863549634617467e-06, + "loss": 0.5725, + "step": 2669 + }, + { + "epoch": 0.4384866462751216, + "grad_norm": 0.31690358845503236, + "learning_rate": 8.863435000060097e-06, + "loss": 0.5396, + "step": 2670 + }, + { + "epoch": 0.4386508734834644, + "grad_norm": 0.3191798894096308, + "learning_rate": 8.863320318111528e-06, + "loss": 0.5577, + "step": 2671 + }, + { + "epoch": 0.4388151006918071, + "grad_norm": 0.4654210766977454, + "learning_rate": 8.863205588773007e-06, + "loss": 0.5724, + "step": 2672 + }, + { + "epoch": 0.4389793279001499, + "grad_norm": 0.3422961348923824, + "learning_rate": 8.863090812045783e-06, + "loss": 0.5847, + "step": 2673 + }, + { + "epoch": 0.4391435551084926, + "grad_norm": 0.3121440943186785, + "learning_rate": 8.862975987931097e-06, + "loss": 0.5774, + "step": 2674 + }, + { + "epoch": 0.4393077823168353, + "grad_norm": 0.3614058733108181, + "learning_rate": 8.8628611164302e-06, + "loss": 0.5538, + "step": 2675 + }, + { + "epoch": 0.43947200952517806, + "grad_norm": 0.3305418031431199, + "learning_rate": 8.862746197544341e-06, + "loss": 0.5522, + "step": 2676 + }, + { + "epoch": 0.4396362367335208, + "grad_norm": 0.31728827829254425, + "learning_rate": 8.862631231274764e-06, + "loss": 0.5637, + "step": 2677 + }, + { + "epoch": 0.43980046394186356, + "grad_norm": 0.2984532306242673, + "learning_rate": 8.862516217622721e-06, + "loss": 0.5741, + "step": 2678 + }, + { + "epoch": 0.4399646911502063, + "grad_norm": 0.30044448216886954, + "learning_rate": 8.862401156589457e-06, + "loss": 0.5649, + "step": 2679 + }, + { + "epoch": 0.44012891835854906, + "grad_norm": 0.33355290409354743, + "learning_rate": 8.862286048176227e-06, + "loss": 0.5435, + "step": 2680 + }, + { + "epoch": 0.4402931455668918, + "grad_norm": 0.305282881254181, + "learning_rate": 8.862170892384278e-06, + "loss": 0.5473, + "step": 2681 + }, + { + "epoch": 0.44045737277523456, + "grad_norm": 0.31605141038354306, + "learning_rate": 8.86205568921486e-06, + "loss": 0.5179, + "step": 2682 + }, + { + "epoch": 0.4406215999835773, + "grad_norm": 0.36881933440627185, + "learning_rate": 8.861940438669227e-06, + "loss": 0.591, + "step": 2683 + }, + { + "epoch": 0.44078582719192, + "grad_norm": 0.3375529244437988, + "learning_rate": 8.86182514074863e-06, + "loss": 0.5791, + "step": 2684 + }, + { + "epoch": 0.44095005440026275, + "grad_norm": 0.30314726667864034, + "learning_rate": 8.861709795454319e-06, + "loss": 0.5557, + "step": 2685 + }, + { + "epoch": 0.4411142816086055, + "grad_norm": 0.3364679818060467, + "learning_rate": 8.86159440278755e-06, + "loss": 0.5733, + "step": 2686 + }, + { + "epoch": 0.44127850881694825, + "grad_norm": 0.39220116051738163, + "learning_rate": 8.861478962749572e-06, + "loss": 0.5579, + "step": 2687 + }, + { + "epoch": 0.441442736025291, + "grad_norm": 0.34252772123554887, + "learning_rate": 8.861363475341642e-06, + "loss": 0.5544, + "step": 2688 + }, + { + "epoch": 0.44160696323363374, + "grad_norm": 0.33108150991015783, + "learning_rate": 8.861247940565015e-06, + "loss": 0.5761, + "step": 2689 + }, + { + "epoch": 0.4417711904419765, + "grad_norm": 0.32834599672177206, + "learning_rate": 8.861132358420943e-06, + "loss": 0.5662, + "step": 2690 + }, + { + "epoch": 0.44193541765031924, + "grad_norm": 0.32244811454221667, + "learning_rate": 8.861016728910683e-06, + "loss": 0.5671, + "step": 2691 + }, + { + "epoch": 0.44209964485866193, + "grad_norm": 0.3597249482708189, + "learning_rate": 8.860901052035492e-06, + "loss": 0.5613, + "step": 2692 + }, + { + "epoch": 0.4422638720670047, + "grad_norm": 0.37744052906600656, + "learning_rate": 8.860785327796625e-06, + "loss": 0.553, + "step": 2693 + }, + { + "epoch": 0.44242809927534743, + "grad_norm": 0.3052348493632045, + "learning_rate": 8.860669556195338e-06, + "loss": 0.5457, + "step": 2694 + }, + { + "epoch": 0.4425923264836902, + "grad_norm": 0.30573461825823206, + "learning_rate": 8.860553737232889e-06, + "loss": 0.5613, + "step": 2695 + }, + { + "epoch": 0.44275655369203293, + "grad_norm": 0.3007325457721176, + "learning_rate": 8.860437870910537e-06, + "loss": 0.5534, + "step": 2696 + }, + { + "epoch": 0.4429207809003757, + "grad_norm": 0.2869011044903269, + "learning_rate": 8.86032195722954e-06, + "loss": 0.5542, + "step": 2697 + }, + { + "epoch": 0.4430850081087184, + "grad_norm": 0.5107075695902422, + "learning_rate": 8.860205996191155e-06, + "loss": 0.5081, + "step": 2698 + }, + { + "epoch": 0.4432492353170612, + "grad_norm": 0.33253967123335504, + "learning_rate": 8.860089987796643e-06, + "loss": 0.5554, + "step": 2699 + }, + { + "epoch": 0.4434134625254039, + "grad_norm": 0.42343864420968474, + "learning_rate": 8.859973932047267e-06, + "loss": 0.5545, + "step": 2700 + }, + { + "epoch": 0.4435776897337466, + "grad_norm": 0.30685102611881854, + "learning_rate": 8.85985782894428e-06, + "loss": 0.5412, + "step": 2701 + }, + { + "epoch": 0.44374191694208936, + "grad_norm": 0.3169560527008105, + "learning_rate": 8.85974167848895e-06, + "loss": 0.5784, + "step": 2702 + }, + { + "epoch": 0.4439061441504321, + "grad_norm": 0.3084651380614442, + "learning_rate": 8.859625480682535e-06, + "loss": 0.5317, + "step": 2703 + }, + { + "epoch": 0.44407037135877486, + "grad_norm": 0.441293933174961, + "learning_rate": 8.8595092355263e-06, + "loss": 0.543, + "step": 2704 + }, + { + "epoch": 0.4442345985671176, + "grad_norm": 0.46942179938637835, + "learning_rate": 8.859392943021504e-06, + "loss": 0.5627, + "step": 2705 + }, + { + "epoch": 0.44439882577546036, + "grad_norm": 0.293267545918469, + "learning_rate": 8.859276603169412e-06, + "loss": 0.5479, + "step": 2706 + }, + { + "epoch": 0.4445630529838031, + "grad_norm": 0.45510996453392444, + "learning_rate": 8.859160215971286e-06, + "loss": 0.5778, + "step": 2707 + }, + { + "epoch": 0.44472728019214586, + "grad_norm": 0.32926696538846945, + "learning_rate": 8.859043781428393e-06, + "loss": 0.5417, + "step": 2708 + }, + { + "epoch": 0.44489150740048855, + "grad_norm": 0.33787931129775456, + "learning_rate": 8.858927299541995e-06, + "loss": 0.572, + "step": 2709 + }, + { + "epoch": 0.4450557346088313, + "grad_norm": 0.40369557531155686, + "learning_rate": 8.858810770313358e-06, + "loss": 0.5943, + "step": 2710 + }, + { + "epoch": 0.44521996181717405, + "grad_norm": 0.33877275852459676, + "learning_rate": 8.858694193743747e-06, + "loss": 0.5494, + "step": 2711 + }, + { + "epoch": 0.4453841890255168, + "grad_norm": 0.3597032289417445, + "learning_rate": 8.85857756983443e-06, + "loss": 0.5811, + "step": 2712 + }, + { + "epoch": 0.44554841623385955, + "grad_norm": 0.34694433353018983, + "learning_rate": 8.858460898586671e-06, + "loss": 0.5472, + "step": 2713 + }, + { + "epoch": 0.4457126434422023, + "grad_norm": 0.3595141962113896, + "learning_rate": 8.858344180001738e-06, + "loss": 0.5738, + "step": 2714 + }, + { + "epoch": 0.44587687065054504, + "grad_norm": 0.31463064675320646, + "learning_rate": 8.8582274140809e-06, + "loss": 0.5648, + "step": 2715 + }, + { + "epoch": 0.4460410978588878, + "grad_norm": 0.34440317093930706, + "learning_rate": 8.858110600825425e-06, + "loss": 0.5654, + "step": 2716 + }, + { + "epoch": 0.44620532506723054, + "grad_norm": 0.31860356038557297, + "learning_rate": 8.857993740236582e-06, + "loss": 0.5553, + "step": 2717 + }, + { + "epoch": 0.44636955227557323, + "grad_norm": 0.3105546506846929, + "learning_rate": 8.857876832315636e-06, + "loss": 0.5647, + "step": 2718 + }, + { + "epoch": 0.446533779483916, + "grad_norm": 0.33831142671969106, + "learning_rate": 8.857759877063863e-06, + "loss": 0.5479, + "step": 2719 + }, + { + "epoch": 0.44669800669225873, + "grad_norm": 0.3358614278897194, + "learning_rate": 8.857642874482528e-06, + "loss": 0.5792, + "step": 2720 + }, + { + "epoch": 0.4468622339006015, + "grad_norm": 1.239182763799466, + "learning_rate": 8.857525824572906e-06, + "loss": 0.5551, + "step": 2721 + }, + { + "epoch": 0.44702646110894423, + "grad_norm": 0.41414189926956013, + "learning_rate": 8.857408727336265e-06, + "loss": 0.5596, + "step": 2722 + }, + { + "epoch": 0.447190688317287, + "grad_norm": 0.3364069633037803, + "learning_rate": 8.857291582773878e-06, + "loss": 0.5507, + "step": 2723 + }, + { + "epoch": 0.4473549155256297, + "grad_norm": 0.32342124716147996, + "learning_rate": 8.857174390887019e-06, + "loss": 0.5608, + "step": 2724 + }, + { + "epoch": 0.4475191427339725, + "grad_norm": 0.34676827689048356, + "learning_rate": 8.85705715167696e-06, + "loss": 0.5582, + "step": 2725 + }, + { + "epoch": 0.44768336994231517, + "grad_norm": 0.35153982996698413, + "learning_rate": 8.856939865144971e-06, + "loss": 0.5515, + "step": 2726 + }, + { + "epoch": 0.4478475971506579, + "grad_norm": 0.2938845084834155, + "learning_rate": 8.856822531292329e-06, + "loss": 0.5429, + "step": 2727 + }, + { + "epoch": 0.44801182435900067, + "grad_norm": 0.2910178771667892, + "learning_rate": 8.856705150120308e-06, + "loss": 0.5645, + "step": 2728 + }, + { + "epoch": 0.4481760515673434, + "grad_norm": 0.2918827196955868, + "learning_rate": 8.856587721630182e-06, + "loss": 0.5451, + "step": 2729 + }, + { + "epoch": 0.44834027877568616, + "grad_norm": 0.3394189888352664, + "learning_rate": 8.856470245823227e-06, + "loss": 0.5605, + "step": 2730 + }, + { + "epoch": 0.4485045059840289, + "grad_norm": 0.33940159120607644, + "learning_rate": 8.85635272270072e-06, + "loss": 0.5491, + "step": 2731 + }, + { + "epoch": 0.44866873319237166, + "grad_norm": 0.36680822587198164, + "learning_rate": 8.856235152263938e-06, + "loss": 0.5681, + "step": 2732 + }, + { + "epoch": 0.4488329604007144, + "grad_norm": 0.3334810084593633, + "learning_rate": 8.856117534514154e-06, + "loss": 0.5606, + "step": 2733 + }, + { + "epoch": 0.44899718760905716, + "grad_norm": 0.29740348475239803, + "learning_rate": 8.855999869452647e-06, + "loss": 0.5647, + "step": 2734 + }, + { + "epoch": 0.44916141481739985, + "grad_norm": 0.32512754587660186, + "learning_rate": 8.855882157080697e-06, + "loss": 0.5652, + "step": 2735 + }, + { + "epoch": 0.4493256420257426, + "grad_norm": 0.3561551765070881, + "learning_rate": 8.85576439739958e-06, + "loss": 0.5773, + "step": 2736 + }, + { + "epoch": 0.44948986923408535, + "grad_norm": 0.3304245407015048, + "learning_rate": 8.855646590410578e-06, + "loss": 0.5666, + "step": 2737 + }, + { + "epoch": 0.4496540964424281, + "grad_norm": 0.3332153957003678, + "learning_rate": 8.855528736114969e-06, + "loss": 0.5686, + "step": 2738 + }, + { + "epoch": 0.44981832365077085, + "grad_norm": 0.3014454733597397, + "learning_rate": 8.85541083451403e-06, + "loss": 0.5547, + "step": 2739 + }, + { + "epoch": 0.4499825508591136, + "grad_norm": 0.3068041232696349, + "learning_rate": 8.855292885609045e-06, + "loss": 0.5625, + "step": 2740 + }, + { + "epoch": 0.45014677806745634, + "grad_norm": 0.3670493909898495, + "learning_rate": 8.855174889401295e-06, + "loss": 0.5724, + "step": 2741 + }, + { + "epoch": 0.4503110052757991, + "grad_norm": 0.3600172039749998, + "learning_rate": 8.85505684589206e-06, + "loss": 0.552, + "step": 2742 + }, + { + "epoch": 0.4504752324841418, + "grad_norm": 0.34510224190707856, + "learning_rate": 8.854938755082624e-06, + "loss": 0.5542, + "step": 2743 + }, + { + "epoch": 0.45063945969248453, + "grad_norm": 0.3354210595356994, + "learning_rate": 8.854820616974267e-06, + "loss": 0.5706, + "step": 2744 + }, + { + "epoch": 0.4508036869008273, + "grad_norm": 0.27484837623978225, + "learning_rate": 8.854702431568276e-06, + "loss": 0.5536, + "step": 2745 + }, + { + "epoch": 0.45096791410917003, + "grad_norm": 0.2772875379629504, + "learning_rate": 8.85458419886593e-06, + "loss": 0.5519, + "step": 2746 + }, + { + "epoch": 0.4511321413175128, + "grad_norm": 0.36745553176616597, + "learning_rate": 8.854465918868516e-06, + "loss": 0.5691, + "step": 2747 + }, + { + "epoch": 0.45129636852585553, + "grad_norm": 0.9592406829660379, + "learning_rate": 8.854347591577319e-06, + "loss": 0.5453, + "step": 2748 + }, + { + "epoch": 0.4514605957341983, + "grad_norm": 0.37135239233609174, + "learning_rate": 8.85422921699362e-06, + "loss": 0.5541, + "step": 2749 + }, + { + "epoch": 0.451624822942541, + "grad_norm": 0.3242742529723335, + "learning_rate": 8.85411079511871e-06, + "loss": 0.5644, + "step": 2750 + }, + { + "epoch": 0.4517890501508838, + "grad_norm": 0.323580754011984, + "learning_rate": 8.853992325953872e-06, + "loss": 0.5593, + "step": 2751 + }, + { + "epoch": 0.45195327735922647, + "grad_norm": 0.3877844317699021, + "learning_rate": 8.853873809500395e-06, + "loss": 0.5353, + "step": 2752 + }, + { + "epoch": 0.4521175045675692, + "grad_norm": 0.32987127792086013, + "learning_rate": 8.853755245759564e-06, + "loss": 0.547, + "step": 2753 + }, + { + "epoch": 0.45228173177591197, + "grad_norm": 0.3243082577170755, + "learning_rate": 8.853636634732668e-06, + "loss": 0.5534, + "step": 2754 + }, + { + "epoch": 0.4524459589842547, + "grad_norm": 0.31835431916955587, + "learning_rate": 8.853517976420993e-06, + "loss": 0.5437, + "step": 2755 + }, + { + "epoch": 0.45261018619259746, + "grad_norm": 0.34976967541051596, + "learning_rate": 8.85339927082583e-06, + "loss": 0.5577, + "step": 2756 + }, + { + "epoch": 0.4527744134009402, + "grad_norm": 0.2923139255600356, + "learning_rate": 8.853280517948468e-06, + "loss": 0.5569, + "step": 2757 + }, + { + "epoch": 0.45293864060928296, + "grad_norm": 0.30552005384252057, + "learning_rate": 8.853161717790197e-06, + "loss": 0.5416, + "step": 2758 + }, + { + "epoch": 0.4531028678176257, + "grad_norm": 0.3990802742968821, + "learning_rate": 8.853042870352308e-06, + "loss": 0.5465, + "step": 2759 + }, + { + "epoch": 0.4532670950259684, + "grad_norm": 0.3461464555447581, + "learning_rate": 8.852923975636089e-06, + "loss": 0.5565, + "step": 2760 + }, + { + "epoch": 0.45343132223431115, + "grad_norm": 0.4068062311659384, + "learning_rate": 8.852805033642834e-06, + "loss": 0.5293, + "step": 2761 + }, + { + "epoch": 0.4535955494426539, + "grad_norm": 0.4050717095926476, + "learning_rate": 8.852686044373831e-06, + "loss": 0.5512, + "step": 2762 + }, + { + "epoch": 0.45375977665099665, + "grad_norm": 0.3106886645730793, + "learning_rate": 8.852567007830378e-06, + "loss": 0.566, + "step": 2763 + }, + { + "epoch": 0.4539240038593394, + "grad_norm": 0.3465009114201763, + "learning_rate": 8.852447924013763e-06, + "loss": 0.5429, + "step": 2764 + }, + { + "epoch": 0.45408823106768215, + "grad_norm": 0.34100205206268885, + "learning_rate": 8.852328792925284e-06, + "loss": 0.5562, + "step": 2765 + }, + { + "epoch": 0.4542524582760249, + "grad_norm": 0.3373659034184963, + "learning_rate": 8.85220961456623e-06, + "loss": 0.5508, + "step": 2766 + }, + { + "epoch": 0.45441668548436764, + "grad_norm": 0.32662364480543243, + "learning_rate": 8.852090388937899e-06, + "loss": 0.5574, + "step": 2767 + }, + { + "epoch": 0.45458091269271034, + "grad_norm": 0.29597270835282974, + "learning_rate": 8.851971116041582e-06, + "loss": 0.5619, + "step": 2768 + }, + { + "epoch": 0.4547451399010531, + "grad_norm": 0.3269485670857866, + "learning_rate": 8.85185179587858e-06, + "loss": 0.5336, + "step": 2769 + }, + { + "epoch": 0.45490936710939583, + "grad_norm": 0.3079376099727638, + "learning_rate": 8.851732428450183e-06, + "loss": 0.549, + "step": 2770 + }, + { + "epoch": 0.4550735943177386, + "grad_norm": 0.3205362156914591, + "learning_rate": 8.851613013757693e-06, + "loss": 0.5441, + "step": 2771 + }, + { + "epoch": 0.45523782152608133, + "grad_norm": 0.32782562576015545, + "learning_rate": 8.851493551802403e-06, + "loss": 0.5498, + "step": 2772 + }, + { + "epoch": 0.4554020487344241, + "grad_norm": 0.36441285832394543, + "learning_rate": 8.851374042585612e-06, + "loss": 0.5473, + "step": 2773 + }, + { + "epoch": 0.45556627594276683, + "grad_norm": 0.3225326006599982, + "learning_rate": 8.851254486108616e-06, + "loss": 0.5283, + "step": 2774 + }, + { + "epoch": 0.4557305031511096, + "grad_norm": 0.29349747631366513, + "learning_rate": 8.851134882372716e-06, + "loss": 0.5582, + "step": 2775 + }, + { + "epoch": 0.4558947303594523, + "grad_norm": 0.3107685467178685, + "learning_rate": 8.851015231379211e-06, + "loss": 0.5504, + "step": 2776 + }, + { + "epoch": 0.456058957567795, + "grad_norm": 0.31413826669046374, + "learning_rate": 8.8508955331294e-06, + "loss": 0.5159, + "step": 2777 + }, + { + "epoch": 0.45622318477613777, + "grad_norm": 0.2936011119768247, + "learning_rate": 8.850775787624584e-06, + "loss": 0.539, + "step": 2778 + }, + { + "epoch": 0.4563874119844805, + "grad_norm": 0.3475480393438803, + "learning_rate": 8.85065599486606e-06, + "loss": 0.5642, + "step": 2779 + }, + { + "epoch": 0.45655163919282327, + "grad_norm": 0.3098780378515171, + "learning_rate": 8.850536154855132e-06, + "loss": 0.5429, + "step": 2780 + }, + { + "epoch": 0.456715866401166, + "grad_norm": 0.9929195452826118, + "learning_rate": 8.850416267593102e-06, + "loss": 0.5698, + "step": 2781 + }, + { + "epoch": 0.45688009360950876, + "grad_norm": 0.3244883281188489, + "learning_rate": 8.85029633308127e-06, + "loss": 0.5742, + "step": 2782 + }, + { + "epoch": 0.4570443208178515, + "grad_norm": 0.31113246989096616, + "learning_rate": 8.85017635132094e-06, + "loss": 0.5397, + "step": 2783 + }, + { + "epoch": 0.45720854802619426, + "grad_norm": 0.31519864743587955, + "learning_rate": 8.850056322313414e-06, + "loss": 0.5792, + "step": 2784 + }, + { + "epoch": 0.45737277523453695, + "grad_norm": 0.3145460315844349, + "learning_rate": 8.849936246059998e-06, + "loss": 0.5623, + "step": 2785 + }, + { + "epoch": 0.4575370024428797, + "grad_norm": 0.38866288927128867, + "learning_rate": 8.849816122561993e-06, + "loss": 0.5444, + "step": 2786 + }, + { + "epoch": 0.45770122965122245, + "grad_norm": 0.32096724495160234, + "learning_rate": 8.849695951820707e-06, + "loss": 0.5496, + "step": 2787 + }, + { + "epoch": 0.4578654568595652, + "grad_norm": 0.37573771878896706, + "learning_rate": 8.849575733837444e-06, + "loss": 0.5418, + "step": 2788 + }, + { + "epoch": 0.45802968406790795, + "grad_norm": 0.27925464921147647, + "learning_rate": 8.849455468613506e-06, + "loss": 0.5458, + "step": 2789 + }, + { + "epoch": 0.4581939112762507, + "grad_norm": 0.36654175597987393, + "learning_rate": 8.849335156150205e-06, + "loss": 0.5602, + "step": 2790 + }, + { + "epoch": 0.45835813848459345, + "grad_norm": 0.3155475116274339, + "learning_rate": 8.849214796448844e-06, + "loss": 0.5824, + "step": 2791 + }, + { + "epoch": 0.4585223656929362, + "grad_norm": 0.3428136888193705, + "learning_rate": 8.84909438951073e-06, + "loss": 0.569, + "step": 2792 + }, + { + "epoch": 0.45868659290127894, + "grad_norm": 0.30583378806886213, + "learning_rate": 8.848973935337174e-06, + "loss": 0.5545, + "step": 2793 + }, + { + "epoch": 0.45885082010962164, + "grad_norm": 0.33735566982609017, + "learning_rate": 8.848853433929482e-06, + "loss": 0.5572, + "step": 2794 + }, + { + "epoch": 0.4590150473179644, + "grad_norm": 0.31361528788938636, + "learning_rate": 8.848732885288963e-06, + "loss": 0.5697, + "step": 2795 + }, + { + "epoch": 0.45917927452630714, + "grad_norm": 0.34545031796156106, + "learning_rate": 8.848612289416926e-06, + "loss": 0.5642, + "step": 2796 + }, + { + "epoch": 0.4593435017346499, + "grad_norm": 0.2874345855470007, + "learning_rate": 8.84849164631468e-06, + "loss": 0.5849, + "step": 2797 + }, + { + "epoch": 0.45950772894299263, + "grad_norm": 0.28314439970383, + "learning_rate": 8.848370955983539e-06, + "loss": 0.5566, + "step": 2798 + }, + { + "epoch": 0.4596719561513354, + "grad_norm": 0.29925600402316954, + "learning_rate": 8.848250218424809e-06, + "loss": 0.5598, + "step": 2799 + }, + { + "epoch": 0.45983618335967813, + "grad_norm": 0.5002875320880814, + "learning_rate": 8.848129433639803e-06, + "loss": 0.5434, + "step": 2800 + }, + { + "epoch": 0.4600004105680209, + "grad_norm": 0.33733296405596075, + "learning_rate": 8.848008601629834e-06, + "loss": 0.5406, + "step": 2801 + }, + { + "epoch": 0.46016463777636357, + "grad_norm": 0.3162926750411412, + "learning_rate": 8.847887722396215e-06, + "loss": 0.5539, + "step": 2802 + }, + { + "epoch": 0.4603288649847063, + "grad_norm": 0.3281447732451233, + "learning_rate": 8.847766795940256e-06, + "loss": 0.5747, + "step": 2803 + }, + { + "epoch": 0.46049309219304907, + "grad_norm": 0.2805684954794172, + "learning_rate": 8.847645822263274e-06, + "loss": 0.5665, + "step": 2804 + }, + { + "epoch": 0.4606573194013918, + "grad_norm": 0.39677613691474567, + "learning_rate": 8.847524801366579e-06, + "loss": 0.5454, + "step": 2805 + }, + { + "epoch": 0.46082154660973457, + "grad_norm": 0.29768665118106963, + "learning_rate": 8.847403733251488e-06, + "loss": 0.5619, + "step": 2806 + }, + { + "epoch": 0.4609857738180773, + "grad_norm": 0.2719942035991359, + "learning_rate": 8.847282617919317e-06, + "loss": 0.5527, + "step": 2807 + }, + { + "epoch": 0.46115000102642006, + "grad_norm": 0.30476150813606573, + "learning_rate": 8.847161455371376e-06, + "loss": 0.5661, + "step": 2808 + }, + { + "epoch": 0.4613142282347628, + "grad_norm": 0.35212052120108744, + "learning_rate": 8.847040245608987e-06, + "loss": 0.54, + "step": 2809 + }, + { + "epoch": 0.46147845544310556, + "grad_norm": 0.3856979779368862, + "learning_rate": 8.846918988633464e-06, + "loss": 0.5946, + "step": 2810 + }, + { + "epoch": 0.46164268265144826, + "grad_norm": 0.28956054679877297, + "learning_rate": 8.846797684446123e-06, + "loss": 0.5461, + "step": 2811 + }, + { + "epoch": 0.461806909859791, + "grad_norm": 0.3947231541849336, + "learning_rate": 8.846676333048283e-06, + "loss": 0.5781, + "step": 2812 + }, + { + "epoch": 0.46197113706813375, + "grad_norm": 0.3223342533441565, + "learning_rate": 8.84655493444126e-06, + "loss": 0.5637, + "step": 2813 + }, + { + "epoch": 0.4621353642764765, + "grad_norm": 0.31670445757816623, + "learning_rate": 8.846433488626376e-06, + "loss": 0.5548, + "step": 2814 + }, + { + "epoch": 0.46229959148481925, + "grad_norm": 0.3614008756161484, + "learning_rate": 8.846311995604947e-06, + "loss": 0.5659, + "step": 2815 + }, + { + "epoch": 0.462463818693162, + "grad_norm": 0.3072418723805468, + "learning_rate": 8.846190455378293e-06, + "loss": 0.5395, + "step": 2816 + }, + { + "epoch": 0.46262804590150475, + "grad_norm": 0.3025970724441588, + "learning_rate": 8.846068867947736e-06, + "loss": 0.545, + "step": 2817 + }, + { + "epoch": 0.4627922731098475, + "grad_norm": 0.4316806453711984, + "learning_rate": 8.845947233314595e-06, + "loss": 0.5384, + "step": 2818 + }, + { + "epoch": 0.4629565003181902, + "grad_norm": 0.3162804536736171, + "learning_rate": 8.845825551480192e-06, + "loss": 0.5714, + "step": 2819 + }, + { + "epoch": 0.46312072752653294, + "grad_norm": 0.3311951157979156, + "learning_rate": 8.845703822445845e-06, + "loss": 0.5648, + "step": 2820 + }, + { + "epoch": 0.4632849547348757, + "grad_norm": 0.3077557161039132, + "learning_rate": 8.84558204621288e-06, + "loss": 0.5737, + "step": 2821 + }, + { + "epoch": 0.46344918194321844, + "grad_norm": 0.4303477814296107, + "learning_rate": 8.845460222782619e-06, + "loss": 0.5653, + "step": 2822 + }, + { + "epoch": 0.4636134091515612, + "grad_norm": 0.29648758662065006, + "learning_rate": 8.845338352156384e-06, + "loss": 0.561, + "step": 2823 + }, + { + "epoch": 0.46377763635990393, + "grad_norm": 0.3183803997527791, + "learning_rate": 8.8452164343355e-06, + "loss": 0.5502, + "step": 2824 + }, + { + "epoch": 0.4639418635682467, + "grad_norm": 0.3179317015924949, + "learning_rate": 8.845094469321291e-06, + "loss": 0.5111, + "step": 2825 + }, + { + "epoch": 0.46410609077658943, + "grad_norm": 0.33356525833540174, + "learning_rate": 8.84497245711508e-06, + "loss": 0.5532, + "step": 2826 + }, + { + "epoch": 0.4642703179849322, + "grad_norm": 0.35637651622204897, + "learning_rate": 8.844850397718193e-06, + "loss": 0.5457, + "step": 2827 + }, + { + "epoch": 0.4644345451932749, + "grad_norm": 0.3510786554960796, + "learning_rate": 8.844728291131956e-06, + "loss": 0.542, + "step": 2828 + }, + { + "epoch": 0.4645987724016176, + "grad_norm": 0.36430876563423503, + "learning_rate": 8.844606137357697e-06, + "loss": 0.5329, + "step": 2829 + }, + { + "epoch": 0.46476299960996037, + "grad_norm": 0.28596151687840704, + "learning_rate": 8.84448393639674e-06, + "loss": 0.5466, + "step": 2830 + }, + { + "epoch": 0.4649272268183031, + "grad_norm": 0.3332646439130557, + "learning_rate": 8.844361688250412e-06, + "loss": 0.5234, + "step": 2831 + }, + { + "epoch": 0.46509145402664587, + "grad_norm": 0.29528217400431656, + "learning_rate": 8.844239392920044e-06, + "loss": 0.5533, + "step": 2832 + }, + { + "epoch": 0.4652556812349886, + "grad_norm": 0.3045475455711421, + "learning_rate": 8.844117050406958e-06, + "loss": 0.5516, + "step": 2833 + }, + { + "epoch": 0.46541990844333136, + "grad_norm": 0.3673341114723952, + "learning_rate": 8.84399466071249e-06, + "loss": 0.5628, + "step": 2834 + }, + { + "epoch": 0.4655841356516741, + "grad_norm": 0.36352857411555345, + "learning_rate": 8.843872223837964e-06, + "loss": 0.5527, + "step": 2835 + }, + { + "epoch": 0.4657483628600168, + "grad_norm": 0.37350387535762136, + "learning_rate": 8.843749739784714e-06, + "loss": 0.563, + "step": 2836 + }, + { + "epoch": 0.46591259006835956, + "grad_norm": 0.31369805940776757, + "learning_rate": 8.843627208554067e-06, + "loss": 0.5599, + "step": 2837 + }, + { + "epoch": 0.4660768172767023, + "grad_norm": 0.30836598525731435, + "learning_rate": 8.843504630147356e-06, + "loss": 0.5575, + "step": 2838 + }, + { + "epoch": 0.46624104448504505, + "grad_norm": 0.3666029081716213, + "learning_rate": 8.843382004565909e-06, + "loss": 0.544, + "step": 2839 + }, + { + "epoch": 0.4664052716933878, + "grad_norm": 0.32683584986669917, + "learning_rate": 8.843259331811062e-06, + "loss": 0.5926, + "step": 2840 + }, + { + "epoch": 0.46656949890173055, + "grad_norm": 0.3935470331834398, + "learning_rate": 8.843136611884145e-06, + "loss": 0.5502, + "step": 2841 + }, + { + "epoch": 0.4667337261100733, + "grad_norm": 0.32710167529269035, + "learning_rate": 8.843013844786491e-06, + "loss": 0.5599, + "step": 2842 + }, + { + "epoch": 0.46689795331841605, + "grad_norm": 0.3596882455911912, + "learning_rate": 8.842891030519434e-06, + "loss": 0.551, + "step": 2843 + }, + { + "epoch": 0.4670621805267588, + "grad_norm": 0.29837305973530415, + "learning_rate": 8.842768169084309e-06, + "loss": 0.5616, + "step": 2844 + }, + { + "epoch": 0.4672264077351015, + "grad_norm": 0.36261886857746567, + "learning_rate": 8.842645260482446e-06, + "loss": 0.546, + "step": 2845 + }, + { + "epoch": 0.46739063494344424, + "grad_norm": 0.3239781103541487, + "learning_rate": 8.842522304715184e-06, + "loss": 0.5493, + "step": 2846 + }, + { + "epoch": 0.467554862151787, + "grad_norm": 0.3022618320478039, + "learning_rate": 8.842399301783859e-06, + "loss": 0.5563, + "step": 2847 + }, + { + "epoch": 0.46771908936012974, + "grad_norm": 0.3324394490245069, + "learning_rate": 8.842276251689804e-06, + "loss": 0.5467, + "step": 2848 + }, + { + "epoch": 0.4678833165684725, + "grad_norm": 0.34005289281067436, + "learning_rate": 8.842153154434357e-06, + "loss": 0.5507, + "step": 2849 + }, + { + "epoch": 0.46804754377681523, + "grad_norm": 0.30758197825127814, + "learning_rate": 8.842030010018855e-06, + "loss": 0.5617, + "step": 2850 + }, + { + "epoch": 0.468211770985158, + "grad_norm": 0.3485397690878994, + "learning_rate": 8.841906818444634e-06, + "loss": 0.5414, + "step": 2851 + }, + { + "epoch": 0.46837599819350073, + "grad_norm": 0.28465131392759013, + "learning_rate": 8.841783579713033e-06, + "loss": 0.5549, + "step": 2852 + }, + { + "epoch": 0.4685402254018434, + "grad_norm": 0.340827843779612, + "learning_rate": 8.841660293825392e-06, + "loss": 0.5542, + "step": 2853 + }, + { + "epoch": 0.4687044526101862, + "grad_norm": 0.3642631778373032, + "learning_rate": 8.841536960783047e-06, + "loss": 0.5422, + "step": 2854 + }, + { + "epoch": 0.4688686798185289, + "grad_norm": 0.3350444847989983, + "learning_rate": 8.84141358058734e-06, + "loss": 0.5582, + "step": 2855 + }, + { + "epoch": 0.46903290702687167, + "grad_norm": 0.32037747228626573, + "learning_rate": 8.84129015323961e-06, + "loss": 0.5511, + "step": 2856 + }, + { + "epoch": 0.4691971342352144, + "grad_norm": 0.32535957757017653, + "learning_rate": 8.841166678741197e-06, + "loss": 0.5542, + "step": 2857 + }, + { + "epoch": 0.46936136144355717, + "grad_norm": 0.3612115613975925, + "learning_rate": 8.841043157093444e-06, + "loss": 0.5645, + "step": 2858 + }, + { + "epoch": 0.4695255886518999, + "grad_norm": 0.3032820965069931, + "learning_rate": 8.840919588297691e-06, + "loss": 0.5561, + "step": 2859 + }, + { + "epoch": 0.46968981586024267, + "grad_norm": 0.34091583425847477, + "learning_rate": 8.84079597235528e-06, + "loss": 0.5539, + "step": 2860 + }, + { + "epoch": 0.4698540430685854, + "grad_norm": 0.3366327725164345, + "learning_rate": 8.840672309267553e-06, + "loss": 0.5512, + "step": 2861 + }, + { + "epoch": 0.4700182702769281, + "grad_norm": 0.33509050371301097, + "learning_rate": 8.840548599035857e-06, + "loss": 0.5806, + "step": 2862 + }, + { + "epoch": 0.47018249748527086, + "grad_norm": 0.3043078378641758, + "learning_rate": 8.84042484166153e-06, + "loss": 0.5549, + "step": 2863 + }, + { + "epoch": 0.4703467246936136, + "grad_norm": 0.2911581650021685, + "learning_rate": 8.840301037145919e-06, + "loss": 0.5385, + "step": 2864 + }, + { + "epoch": 0.47051095190195635, + "grad_norm": 0.33595882669664007, + "learning_rate": 8.840177185490369e-06, + "loss": 0.5284, + "step": 2865 + }, + { + "epoch": 0.4706751791102991, + "grad_norm": 0.3617199149406533, + "learning_rate": 8.840053286696224e-06, + "loss": 0.5446, + "step": 2866 + }, + { + "epoch": 0.47083940631864185, + "grad_norm": 0.30372519781604457, + "learning_rate": 8.839929340764832e-06, + "loss": 0.5728, + "step": 2867 + }, + { + "epoch": 0.4710036335269846, + "grad_norm": 0.31933667718002556, + "learning_rate": 8.839805347697536e-06, + "loss": 0.5518, + "step": 2868 + }, + { + "epoch": 0.47116786073532735, + "grad_norm": 0.3966424493600261, + "learning_rate": 8.839681307495685e-06, + "loss": 0.5743, + "step": 2869 + }, + { + "epoch": 0.47133208794367004, + "grad_norm": 0.3348971739237751, + "learning_rate": 8.839557220160626e-06, + "loss": 0.5524, + "step": 2870 + }, + { + "epoch": 0.4714963151520128, + "grad_norm": 0.3213892824965565, + "learning_rate": 8.839433085693704e-06, + "loss": 0.5628, + "step": 2871 + }, + { + "epoch": 0.47166054236035554, + "grad_norm": 0.4390491443368386, + "learning_rate": 8.839308904096272e-06, + "loss": 0.5376, + "step": 2872 + }, + { + "epoch": 0.4718247695686983, + "grad_norm": 0.32059743684089986, + "learning_rate": 8.839184675369672e-06, + "loss": 0.5614, + "step": 2873 + }, + { + "epoch": 0.47198899677704104, + "grad_norm": 0.3058018710793769, + "learning_rate": 8.83906039951526e-06, + "loss": 0.5359, + "step": 2874 + }, + { + "epoch": 0.4721532239853838, + "grad_norm": 0.2804240335544641, + "learning_rate": 8.838936076534381e-06, + "loss": 0.5442, + "step": 2875 + }, + { + "epoch": 0.47231745119372653, + "grad_norm": 0.3746199995295414, + "learning_rate": 8.83881170642839e-06, + "loss": 0.5763, + "step": 2876 + }, + { + "epoch": 0.4724816784020693, + "grad_norm": 0.43755114764082026, + "learning_rate": 8.838687289198634e-06, + "loss": 0.5677, + "step": 2877 + }, + { + "epoch": 0.47264590561041203, + "grad_norm": 0.30021384425688985, + "learning_rate": 8.838562824846464e-06, + "loss": 0.5354, + "step": 2878 + }, + { + "epoch": 0.4728101328187547, + "grad_norm": 0.2982429736947675, + "learning_rate": 8.838438313373234e-06, + "loss": 0.5523, + "step": 2879 + }, + { + "epoch": 0.4729743600270975, + "grad_norm": 0.3139337209816409, + "learning_rate": 8.838313754780297e-06, + "loss": 0.5442, + "step": 2880 + }, + { + "epoch": 0.4731385872354402, + "grad_norm": 0.38187862880767065, + "learning_rate": 8.838189149069004e-06, + "loss": 0.5596, + "step": 2881 + }, + { + "epoch": 0.47330281444378297, + "grad_norm": 0.35478374350718284, + "learning_rate": 8.838064496240706e-06, + "loss": 0.5492, + "step": 2882 + }, + { + "epoch": 0.4734670416521257, + "grad_norm": 0.3149790656321742, + "learning_rate": 8.837939796296762e-06, + "loss": 0.5453, + "step": 2883 + }, + { + "epoch": 0.47363126886046847, + "grad_norm": 0.349800653583477, + "learning_rate": 8.837815049238523e-06, + "loss": 0.5303, + "step": 2884 + }, + { + "epoch": 0.4737954960688112, + "grad_norm": 0.3199958228053064, + "learning_rate": 8.837690255067346e-06, + "loss": 0.5516, + "step": 2885 + }, + { + "epoch": 0.47395972327715397, + "grad_norm": 0.3614633246024179, + "learning_rate": 8.837565413784583e-06, + "loss": 0.5503, + "step": 2886 + }, + { + "epoch": 0.47412395048549666, + "grad_norm": 0.41022846573910837, + "learning_rate": 8.837440525391593e-06, + "loss": 0.5704, + "step": 2887 + }, + { + "epoch": 0.4742881776938394, + "grad_norm": 0.32146513999845583, + "learning_rate": 8.83731558988973e-06, + "loss": 0.5593, + "step": 2888 + }, + { + "epoch": 0.47445240490218216, + "grad_norm": 0.3522243215874715, + "learning_rate": 8.837190607280355e-06, + "loss": 0.5535, + "step": 2889 + }, + { + "epoch": 0.4746166321105249, + "grad_norm": 0.34027532771479646, + "learning_rate": 8.83706557756482e-06, + "loss": 0.5422, + "step": 2890 + }, + { + "epoch": 0.47478085931886765, + "grad_norm": 0.3531730299805744, + "learning_rate": 8.836940500744489e-06, + "loss": 0.5617, + "step": 2891 + }, + { + "epoch": 0.4749450865272104, + "grad_norm": 0.3674374072994261, + "learning_rate": 8.836815376820715e-06, + "loss": 0.5592, + "step": 2892 + }, + { + "epoch": 0.47510931373555315, + "grad_norm": 0.32970365360706744, + "learning_rate": 8.836690205794858e-06, + "loss": 0.5676, + "step": 2893 + }, + { + "epoch": 0.4752735409438959, + "grad_norm": 0.39642473826995, + "learning_rate": 8.836564987668281e-06, + "loss": 0.5787, + "step": 2894 + }, + { + "epoch": 0.47543776815223865, + "grad_norm": 0.2868884450572186, + "learning_rate": 8.836439722442341e-06, + "loss": 0.5578, + "step": 2895 + }, + { + "epoch": 0.47560199536058134, + "grad_norm": 0.3071135606555718, + "learning_rate": 8.8363144101184e-06, + "loss": 0.5446, + "step": 2896 + }, + { + "epoch": 0.4757662225689241, + "grad_norm": 0.4085356980863807, + "learning_rate": 8.836189050697817e-06, + "loss": 0.5494, + "step": 2897 + }, + { + "epoch": 0.47593044977726684, + "grad_norm": 0.29666396729431577, + "learning_rate": 8.836063644181954e-06, + "loss": 0.544, + "step": 2898 + }, + { + "epoch": 0.4760946769856096, + "grad_norm": 0.30255551130327596, + "learning_rate": 8.835938190572174e-06, + "loss": 0.5499, + "step": 2899 + }, + { + "epoch": 0.47625890419395234, + "grad_norm": 0.2806984477691293, + "learning_rate": 8.83581268986984e-06, + "loss": 0.5648, + "step": 2900 + }, + { + "epoch": 0.4764231314022951, + "grad_norm": 0.3263273596646186, + "learning_rate": 8.835687142076314e-06, + "loss": 0.5561, + "step": 2901 + }, + { + "epoch": 0.47658735861063783, + "grad_norm": 0.3790842531002415, + "learning_rate": 8.83556154719296e-06, + "loss": 0.5744, + "step": 2902 + }, + { + "epoch": 0.4767515858189806, + "grad_norm": 0.3417983950853606, + "learning_rate": 8.835435905221142e-06, + "loss": 0.5652, + "step": 2903 + }, + { + "epoch": 0.4769158130273233, + "grad_norm": 0.29218721475019915, + "learning_rate": 8.835310216162224e-06, + "loss": 0.5554, + "step": 2904 + }, + { + "epoch": 0.477080040235666, + "grad_norm": 0.31113339800938206, + "learning_rate": 8.835184480017572e-06, + "loss": 0.5521, + "step": 2905 + }, + { + "epoch": 0.4772442674440088, + "grad_norm": 0.2985823425378832, + "learning_rate": 8.835058696788552e-06, + "loss": 0.5459, + "step": 2906 + }, + { + "epoch": 0.4774084946523515, + "grad_norm": 0.35116196870091576, + "learning_rate": 8.834932866476531e-06, + "loss": 0.5569, + "step": 2907 + }, + { + "epoch": 0.47757272186069427, + "grad_norm": 0.285228916766138, + "learning_rate": 8.83480698908287e-06, + "loss": 0.5476, + "step": 2908 + }, + { + "epoch": 0.477736949069037, + "grad_norm": 0.32355675164680775, + "learning_rate": 8.834681064608944e-06, + "loss": 0.5616, + "step": 2909 + }, + { + "epoch": 0.47790117627737977, + "grad_norm": 0.29250415203008606, + "learning_rate": 8.834555093056114e-06, + "loss": 0.5452, + "step": 2910 + }, + { + "epoch": 0.4780654034857225, + "grad_norm": 0.4086216732635031, + "learning_rate": 8.834429074425752e-06, + "loss": 0.5887, + "step": 2911 + }, + { + "epoch": 0.47822963069406527, + "grad_norm": 0.30486039535028525, + "learning_rate": 8.834303008719226e-06, + "loss": 0.5586, + "step": 2912 + }, + { + "epoch": 0.47839385790240796, + "grad_norm": 0.392026031748003, + "learning_rate": 8.834176895937906e-06, + "loss": 0.5454, + "step": 2913 + }, + { + "epoch": 0.4785580851107507, + "grad_norm": 0.32606961737462525, + "learning_rate": 8.834050736083158e-06, + "loss": 0.5565, + "step": 2914 + }, + { + "epoch": 0.47872231231909346, + "grad_norm": 0.3567403007077362, + "learning_rate": 8.833924529156357e-06, + "loss": 0.5579, + "step": 2915 + }, + { + "epoch": 0.4788865395274362, + "grad_norm": 0.2962722445309099, + "learning_rate": 8.833798275158871e-06, + "loss": 0.5517, + "step": 2916 + }, + { + "epoch": 0.47905076673577895, + "grad_norm": 0.31457620394082064, + "learning_rate": 8.83367197409207e-06, + "loss": 0.5654, + "step": 2917 + }, + { + "epoch": 0.4792149939441217, + "grad_norm": 0.36662242627563446, + "learning_rate": 8.833545625957332e-06, + "loss": 0.552, + "step": 2918 + }, + { + "epoch": 0.47937922115246445, + "grad_norm": 0.28945439867043166, + "learning_rate": 8.833419230756021e-06, + "loss": 0.5611, + "step": 2919 + }, + { + "epoch": 0.4795434483608072, + "grad_norm": 0.3090300015262434, + "learning_rate": 8.833292788489517e-06, + "loss": 0.57, + "step": 2920 + }, + { + "epoch": 0.4797076755691499, + "grad_norm": 0.3367798755701906, + "learning_rate": 8.833166299159187e-06, + "loss": 0.5635, + "step": 2921 + }, + { + "epoch": 0.47987190277749264, + "grad_norm": 0.3211307845952408, + "learning_rate": 8.833039762766408e-06, + "loss": 0.539, + "step": 2922 + }, + { + "epoch": 0.4800361299858354, + "grad_norm": 0.3269164013348773, + "learning_rate": 8.832913179312555e-06, + "loss": 0.5481, + "step": 2923 + }, + { + "epoch": 0.48020035719417814, + "grad_norm": 0.3193091915753343, + "learning_rate": 8.832786548799002e-06, + "loss": 0.5685, + "step": 2924 + }, + { + "epoch": 0.4803645844025209, + "grad_norm": 0.29715881654797105, + "learning_rate": 8.832659871227124e-06, + "loss": 0.547, + "step": 2925 + }, + { + "epoch": 0.48052881161086364, + "grad_norm": 0.3092509709334806, + "learning_rate": 8.832533146598297e-06, + "loss": 0.5407, + "step": 2926 + }, + { + "epoch": 0.4806930388192064, + "grad_norm": 0.2997878748298698, + "learning_rate": 8.832406374913896e-06, + "loss": 0.5437, + "step": 2927 + }, + { + "epoch": 0.48085726602754914, + "grad_norm": 0.36833955082132547, + "learning_rate": 8.832279556175302e-06, + "loss": 0.5591, + "step": 2928 + }, + { + "epoch": 0.4810214932358919, + "grad_norm": 0.31188670229780074, + "learning_rate": 8.832152690383887e-06, + "loss": 0.5523, + "step": 2929 + }, + { + "epoch": 0.4811857204442346, + "grad_norm": 0.31498192500648686, + "learning_rate": 8.832025777541032e-06, + "loss": 0.5367, + "step": 2930 + }, + { + "epoch": 0.4813499476525773, + "grad_norm": 0.32468516735093983, + "learning_rate": 8.831898817648116e-06, + "loss": 0.5561, + "step": 2931 + }, + { + "epoch": 0.4815141748609201, + "grad_norm": 0.33711950838323035, + "learning_rate": 8.831771810706518e-06, + "loss": 0.5711, + "step": 2932 + }, + { + "epoch": 0.4816784020692628, + "grad_norm": 0.3467531673707492, + "learning_rate": 8.831644756717614e-06, + "loss": 0.576, + "step": 2933 + }, + { + "epoch": 0.48184262927760557, + "grad_norm": 0.3587735987166933, + "learning_rate": 8.831517655682787e-06, + "loss": 0.5603, + "step": 2934 + }, + { + "epoch": 0.4820068564859483, + "grad_norm": 0.3479101996126524, + "learning_rate": 8.831390507603416e-06, + "loss": 0.5607, + "step": 2935 + }, + { + "epoch": 0.48217108369429107, + "grad_norm": 0.35567025483515724, + "learning_rate": 8.831263312480883e-06, + "loss": 0.5428, + "step": 2936 + }, + { + "epoch": 0.4823353109026338, + "grad_norm": 0.34050479161407216, + "learning_rate": 8.831136070316568e-06, + "loss": 0.5335, + "step": 2937 + }, + { + "epoch": 0.4824995381109765, + "grad_norm": 0.34844624545525676, + "learning_rate": 8.831008781111855e-06, + "loss": 0.5609, + "step": 2938 + }, + { + "epoch": 0.48266376531931926, + "grad_norm": 0.3534685183853488, + "learning_rate": 8.830881444868126e-06, + "loss": 0.5601, + "step": 2939 + }, + { + "epoch": 0.482827992527662, + "grad_norm": 0.3576134623563205, + "learning_rate": 8.830754061586764e-06, + "loss": 0.5622, + "step": 2940 + }, + { + "epoch": 0.48299221973600476, + "grad_norm": 0.37002257796135896, + "learning_rate": 8.83062663126915e-06, + "loss": 0.556, + "step": 2941 + }, + { + "epoch": 0.4831564469443475, + "grad_norm": 0.37832760131778265, + "learning_rate": 8.830499153916671e-06, + "loss": 0.5463, + "step": 2942 + }, + { + "epoch": 0.48332067415269026, + "grad_norm": 0.3257646983345067, + "learning_rate": 8.83037162953071e-06, + "loss": 0.5626, + "step": 2943 + }, + { + "epoch": 0.483484901361033, + "grad_norm": 0.34700830090226, + "learning_rate": 8.830244058112655e-06, + "loss": 0.5396, + "step": 2944 + }, + { + "epoch": 0.48364912856937575, + "grad_norm": 0.31036138655536344, + "learning_rate": 8.830116439663887e-06, + "loss": 0.5197, + "step": 2945 + }, + { + "epoch": 0.4838133557777185, + "grad_norm": 0.3565916198778085, + "learning_rate": 8.829988774185794e-06, + "loss": 0.5376, + "step": 2946 + }, + { + "epoch": 0.4839775829860612, + "grad_norm": 0.3681633251091734, + "learning_rate": 8.829861061679763e-06, + "loss": 0.5404, + "step": 2947 + }, + { + "epoch": 0.48414181019440394, + "grad_norm": 0.4760323163689969, + "learning_rate": 8.829733302147182e-06, + "loss": 0.5564, + "step": 2948 + }, + { + "epoch": 0.4843060374027467, + "grad_norm": 0.3142530374093069, + "learning_rate": 8.829605495589436e-06, + "loss": 0.5424, + "step": 2949 + }, + { + "epoch": 0.48447026461108944, + "grad_norm": 0.395721637651938, + "learning_rate": 8.829477642007915e-06, + "loss": 0.5428, + "step": 2950 + }, + { + "epoch": 0.4846344918194322, + "grad_norm": 0.3317668352816898, + "learning_rate": 8.82934974140401e-06, + "loss": 0.5413, + "step": 2951 + }, + { + "epoch": 0.48479871902777494, + "grad_norm": 0.4043660326447801, + "learning_rate": 8.829221793779102e-06, + "loss": 0.5645, + "step": 2952 + }, + { + "epoch": 0.4849629462361177, + "grad_norm": 0.31928485059252737, + "learning_rate": 8.82909379913459e-06, + "loss": 0.5396, + "step": 2953 + }, + { + "epoch": 0.48512717344446044, + "grad_norm": 0.30272961197459414, + "learning_rate": 8.828965757471858e-06, + "loss": 0.5611, + "step": 2954 + }, + { + "epoch": 0.48529140065280313, + "grad_norm": 0.3219097780968554, + "learning_rate": 8.8288376687923e-06, + "loss": 0.5495, + "step": 2955 + }, + { + "epoch": 0.4854556278611459, + "grad_norm": 0.3068594785573637, + "learning_rate": 8.828709533097304e-06, + "loss": 0.527, + "step": 2956 + }, + { + "epoch": 0.4856198550694886, + "grad_norm": 0.3286151563554068, + "learning_rate": 8.828581350388267e-06, + "loss": 0.5453, + "step": 2957 + }, + { + "epoch": 0.4857840822778314, + "grad_norm": 0.32326057654047574, + "learning_rate": 8.828453120666574e-06, + "loss": 0.5433, + "step": 2958 + }, + { + "epoch": 0.4859483094861741, + "grad_norm": 0.341029084787711, + "learning_rate": 8.828324843933625e-06, + "loss": 0.5644, + "step": 2959 + }, + { + "epoch": 0.4861125366945169, + "grad_norm": 0.34016806957422147, + "learning_rate": 8.828196520190807e-06, + "loss": 0.5556, + "step": 2960 + }, + { + "epoch": 0.4862767639028596, + "grad_norm": 0.4028159993767315, + "learning_rate": 8.828068149439518e-06, + "loss": 0.5325, + "step": 2961 + }, + { + "epoch": 0.48644099111120237, + "grad_norm": 0.2984402049931922, + "learning_rate": 8.82793973168115e-06, + "loss": 0.5751, + "step": 2962 + }, + { + "epoch": 0.48660521831954506, + "grad_norm": 0.33616040754201904, + "learning_rate": 8.827811266917099e-06, + "loss": 0.5424, + "step": 2963 + }, + { + "epoch": 0.4867694455278878, + "grad_norm": 0.333707218309075, + "learning_rate": 8.827682755148757e-06, + "loss": 0.5424, + "step": 2964 + }, + { + "epoch": 0.48693367273623056, + "grad_norm": 0.3336325173722268, + "learning_rate": 8.827554196377525e-06, + "loss": 0.5617, + "step": 2965 + }, + { + "epoch": 0.4870978999445733, + "grad_norm": 0.3299202105690531, + "learning_rate": 8.827425590604796e-06, + "loss": 0.5422, + "step": 2966 + }, + { + "epoch": 0.48726212715291606, + "grad_norm": 0.3506142749597406, + "learning_rate": 8.827296937831969e-06, + "loss": 0.5609, + "step": 2967 + }, + { + "epoch": 0.4874263543612588, + "grad_norm": 0.44404974906674444, + "learning_rate": 8.82716823806044e-06, + "loss": 0.5782, + "step": 2968 + }, + { + "epoch": 0.48759058156960156, + "grad_norm": 0.30580865087991876, + "learning_rate": 8.827039491291604e-06, + "loss": 0.5581, + "step": 2969 + }, + { + "epoch": 0.4877548087779443, + "grad_norm": 0.3048111574103392, + "learning_rate": 8.826910697526862e-06, + "loss": 0.5343, + "step": 2970 + }, + { + "epoch": 0.48791903598628705, + "grad_norm": 0.3281501305555785, + "learning_rate": 8.826781856767614e-06, + "loss": 0.5627, + "step": 2971 + }, + { + "epoch": 0.48808326319462975, + "grad_norm": 0.3026997019538971, + "learning_rate": 8.826652969015258e-06, + "loss": 0.579, + "step": 2972 + }, + { + "epoch": 0.4882474904029725, + "grad_norm": 0.33950303423306283, + "learning_rate": 8.826524034271194e-06, + "loss": 0.5491, + "step": 2973 + }, + { + "epoch": 0.48841171761131524, + "grad_norm": 0.3662426620093517, + "learning_rate": 8.82639505253682e-06, + "loss": 0.5609, + "step": 2974 + }, + { + "epoch": 0.488575944819658, + "grad_norm": 0.3504254683156696, + "learning_rate": 8.826266023813543e-06, + "loss": 0.5358, + "step": 2975 + }, + { + "epoch": 0.48874017202800074, + "grad_norm": 0.362624051096286, + "learning_rate": 8.826136948102757e-06, + "loss": 0.5515, + "step": 2976 + }, + { + "epoch": 0.4889043992363435, + "grad_norm": 0.3303718575483711, + "learning_rate": 8.82600782540587e-06, + "loss": 0.5562, + "step": 2977 + }, + { + "epoch": 0.48906862644468624, + "grad_norm": 0.35238845644920747, + "learning_rate": 8.825878655724279e-06, + "loss": 0.5385, + "step": 2978 + }, + { + "epoch": 0.489232853653029, + "grad_norm": 0.31177874457750715, + "learning_rate": 8.825749439059393e-06, + "loss": 0.5396, + "step": 2979 + }, + { + "epoch": 0.4893970808613717, + "grad_norm": 0.34109682668123126, + "learning_rate": 8.825620175412609e-06, + "loss": 0.5505, + "step": 2980 + }, + { + "epoch": 0.48956130806971443, + "grad_norm": 0.3620326005023896, + "learning_rate": 8.825490864785336e-06, + "loss": 0.5778, + "step": 2981 + }, + { + "epoch": 0.4897255352780572, + "grad_norm": 0.5979291508659067, + "learning_rate": 8.825361507178977e-06, + "loss": 0.5518, + "step": 2982 + }, + { + "epoch": 0.4898897624863999, + "grad_norm": 0.29689712347868064, + "learning_rate": 8.825232102594935e-06, + "loss": 0.5817, + "step": 2983 + }, + { + "epoch": 0.4900539896947427, + "grad_norm": 0.34548738955931996, + "learning_rate": 8.825102651034617e-06, + "loss": 0.5585, + "step": 2984 + }, + { + "epoch": 0.4902182169030854, + "grad_norm": 0.3554958070227503, + "learning_rate": 8.82497315249943e-06, + "loss": 0.5488, + "step": 2985 + }, + { + "epoch": 0.4903824441114282, + "grad_norm": 0.2904946631395032, + "learning_rate": 8.82484360699078e-06, + "loss": 0.5386, + "step": 2986 + }, + { + "epoch": 0.4905466713197709, + "grad_norm": 0.3229198644119859, + "learning_rate": 8.824714014510071e-06, + "loss": 0.5558, + "step": 2987 + }, + { + "epoch": 0.49071089852811367, + "grad_norm": 0.36384501410130143, + "learning_rate": 8.824584375058713e-06, + "loss": 0.5466, + "step": 2988 + }, + { + "epoch": 0.49087512573645636, + "grad_norm": 0.3753164950066053, + "learning_rate": 8.824454688638116e-06, + "loss": 0.5401, + "step": 2989 + }, + { + "epoch": 0.4910393529447991, + "grad_norm": 0.36709523119655635, + "learning_rate": 8.824324955249685e-06, + "loss": 0.5366, + "step": 2990 + }, + { + "epoch": 0.49120358015314186, + "grad_norm": 0.36824250049715346, + "learning_rate": 8.82419517489483e-06, + "loss": 0.5468, + "step": 2991 + }, + { + "epoch": 0.4913678073614846, + "grad_norm": 0.41026285040761973, + "learning_rate": 8.824065347574962e-06, + "loss": 0.5796, + "step": 2992 + }, + { + "epoch": 0.49153203456982736, + "grad_norm": 0.36743089732858497, + "learning_rate": 8.82393547329149e-06, + "loss": 0.5691, + "step": 2993 + }, + { + "epoch": 0.4916962617781701, + "grad_norm": 0.3573127649092164, + "learning_rate": 8.823805552045824e-06, + "loss": 0.5782, + "step": 2994 + }, + { + "epoch": 0.49186048898651286, + "grad_norm": 0.4711373569935757, + "learning_rate": 8.823675583839375e-06, + "loss": 0.5448, + "step": 2995 + }, + { + "epoch": 0.4920247161948556, + "grad_norm": 0.3467146120478822, + "learning_rate": 8.823545568673556e-06, + "loss": 0.5785, + "step": 2996 + }, + { + "epoch": 0.4921889434031983, + "grad_norm": 0.35151193970461303, + "learning_rate": 8.823415506549779e-06, + "loss": 0.5703, + "step": 2997 + }, + { + "epoch": 0.49235317061154105, + "grad_norm": 0.3620038956032762, + "learning_rate": 8.823285397469455e-06, + "loss": 0.5713, + "step": 2998 + }, + { + "epoch": 0.4925173978198838, + "grad_norm": 0.3607731749175902, + "learning_rate": 8.823155241434e-06, + "loss": 0.5556, + "step": 2999 + }, + { + "epoch": 0.49268162502822654, + "grad_norm": 0.32757022768540983, + "learning_rate": 8.823025038444823e-06, + "loss": 0.5531, + "step": 3000 + }, + { + "epoch": 0.4928458522365693, + "grad_norm": 0.3781611992714109, + "learning_rate": 8.822894788503342e-06, + "loss": 0.5505, + "step": 3001 + }, + { + "epoch": 0.49301007944491204, + "grad_norm": 0.3676888297654473, + "learning_rate": 8.82276449161097e-06, + "loss": 0.5646, + "step": 3002 + }, + { + "epoch": 0.4931743066532548, + "grad_norm": 0.3467636715211463, + "learning_rate": 8.822634147769123e-06, + "loss": 0.538, + "step": 3003 + }, + { + "epoch": 0.49333853386159754, + "grad_norm": 0.29828305218985435, + "learning_rate": 8.822503756979217e-06, + "loss": 0.5407, + "step": 3004 + }, + { + "epoch": 0.4935027610699403, + "grad_norm": 0.3722421038012637, + "learning_rate": 8.822373319242666e-06, + "loss": 0.553, + "step": 3005 + }, + { + "epoch": 0.493666988278283, + "grad_norm": 0.3964608753926012, + "learning_rate": 8.822242834560888e-06, + "loss": 0.5506, + "step": 3006 + }, + { + "epoch": 0.49383121548662573, + "grad_norm": 0.3038259436819525, + "learning_rate": 8.822112302935302e-06, + "loss": 0.5528, + "step": 3007 + }, + { + "epoch": 0.4939954426949685, + "grad_norm": 0.31767582517383786, + "learning_rate": 8.821981724367322e-06, + "loss": 0.5335, + "step": 3008 + }, + { + "epoch": 0.4941596699033112, + "grad_norm": 0.2613418127902927, + "learning_rate": 8.82185109885837e-06, + "loss": 0.5279, + "step": 3009 + }, + { + "epoch": 0.494323897111654, + "grad_norm": 0.3858067232180448, + "learning_rate": 8.821720426409862e-06, + "loss": 0.5577, + "step": 3010 + }, + { + "epoch": 0.4944881243199967, + "grad_norm": 0.34249742513275216, + "learning_rate": 8.821589707023218e-06, + "loss": 0.5404, + "step": 3011 + }, + { + "epoch": 0.4946523515283395, + "grad_norm": 0.3078017073081959, + "learning_rate": 8.821458940699858e-06, + "loss": 0.5441, + "step": 3012 + }, + { + "epoch": 0.4948165787366822, + "grad_norm": 0.4729428717813342, + "learning_rate": 8.821328127441202e-06, + "loss": 0.5871, + "step": 3013 + }, + { + "epoch": 0.4949808059450249, + "grad_norm": 0.344819112206208, + "learning_rate": 8.821197267248673e-06, + "loss": 0.5371, + "step": 3014 + }, + { + "epoch": 0.49514503315336766, + "grad_norm": 0.5182067984676952, + "learning_rate": 8.821066360123687e-06, + "loss": 0.554, + "step": 3015 + }, + { + "epoch": 0.4953092603617104, + "grad_norm": 0.5446474279763543, + "learning_rate": 8.820935406067672e-06, + "loss": 0.5833, + "step": 3016 + }, + { + "epoch": 0.49547348757005316, + "grad_norm": 0.2829439671030965, + "learning_rate": 8.820804405082045e-06, + "loss": 0.5415, + "step": 3017 + }, + { + "epoch": 0.4956377147783959, + "grad_norm": 0.3880941527869299, + "learning_rate": 8.820673357168232e-06, + "loss": 0.5579, + "step": 3018 + }, + { + "epoch": 0.49580194198673866, + "grad_norm": 0.29511019081794293, + "learning_rate": 8.820542262327655e-06, + "loss": 0.54, + "step": 3019 + }, + { + "epoch": 0.4959661691950814, + "grad_norm": 0.337603608717014, + "learning_rate": 8.820411120561738e-06, + "loss": 0.5414, + "step": 3020 + }, + { + "epoch": 0.49613039640342416, + "grad_norm": 0.36921638040684235, + "learning_rate": 8.820279931871906e-06, + "loss": 0.5572, + "step": 3021 + }, + { + "epoch": 0.4962946236117669, + "grad_norm": 0.34506745881486306, + "learning_rate": 8.820148696259584e-06, + "loss": 0.5594, + "step": 3022 + }, + { + "epoch": 0.4964588508201096, + "grad_norm": 0.35208611833465026, + "learning_rate": 8.820017413726196e-06, + "loss": 0.5487, + "step": 3023 + }, + { + "epoch": 0.49662307802845235, + "grad_norm": 0.3274229204168676, + "learning_rate": 8.819886084273168e-06, + "loss": 0.5649, + "step": 3024 + }, + { + "epoch": 0.4967873052367951, + "grad_norm": 0.3208518695214698, + "learning_rate": 8.819754707901928e-06, + "loss": 0.5461, + "step": 3025 + }, + { + "epoch": 0.49695153244513784, + "grad_norm": 0.3198716314130458, + "learning_rate": 8.819623284613901e-06, + "loss": 0.5608, + "step": 3026 + }, + { + "epoch": 0.4971157596534806, + "grad_norm": 0.4017507102525848, + "learning_rate": 8.819491814410516e-06, + "loss": 0.5156, + "step": 3027 + }, + { + "epoch": 0.49727998686182334, + "grad_norm": 0.3032437885755787, + "learning_rate": 8.819360297293199e-06, + "loss": 0.5362, + "step": 3028 + }, + { + "epoch": 0.4974442140701661, + "grad_norm": 0.3414840714435411, + "learning_rate": 8.81922873326338e-06, + "loss": 0.5451, + "step": 3029 + }, + { + "epoch": 0.49760844127850884, + "grad_norm": 0.35225110432503826, + "learning_rate": 8.819097122322488e-06, + "loss": 0.5209, + "step": 3030 + }, + { + "epoch": 0.49777266848685153, + "grad_norm": 0.3239762292518163, + "learning_rate": 8.81896546447195e-06, + "loss": 0.572, + "step": 3031 + }, + { + "epoch": 0.4979368956951943, + "grad_norm": 0.4545041404536738, + "learning_rate": 8.8188337597132e-06, + "loss": 0.5595, + "step": 3032 + }, + { + "epoch": 0.49810112290353703, + "grad_norm": 0.35087259800795967, + "learning_rate": 8.818702008047666e-06, + "loss": 0.5503, + "step": 3033 + }, + { + "epoch": 0.4982653501118798, + "grad_norm": 0.3518407108267486, + "learning_rate": 8.818570209476777e-06, + "loss": 0.5565, + "step": 3034 + }, + { + "epoch": 0.4984295773202225, + "grad_norm": 0.377200938642081, + "learning_rate": 8.81843836400197e-06, + "loss": 0.5663, + "step": 3035 + }, + { + "epoch": 0.4985938045285653, + "grad_norm": 0.33495384047527427, + "learning_rate": 8.818306471624672e-06, + "loss": 0.558, + "step": 3036 + }, + { + "epoch": 0.498758031736908, + "grad_norm": 0.3370327966629263, + "learning_rate": 8.818174532346315e-06, + "loss": 0.5431, + "step": 3037 + }, + { + "epoch": 0.4989222589452508, + "grad_norm": 0.31485961232651577, + "learning_rate": 8.818042546168336e-06, + "loss": 0.5452, + "step": 3038 + }, + { + "epoch": 0.4990864861535935, + "grad_norm": 1.0283110791160432, + "learning_rate": 8.817910513092168e-06, + "loss": 0.555, + "step": 3039 + }, + { + "epoch": 0.4992507133619362, + "grad_norm": 0.3370249745346664, + "learning_rate": 8.81777843311924e-06, + "loss": 0.5565, + "step": 3040 + }, + { + "epoch": 0.49941494057027896, + "grad_norm": 0.3697422926403468, + "learning_rate": 8.817646306250992e-06, + "loss": 0.5376, + "step": 3041 + }, + { + "epoch": 0.4995791677786217, + "grad_norm": 0.33821750551255797, + "learning_rate": 8.817514132488858e-06, + "loss": 0.5741, + "step": 3042 + }, + { + "epoch": 0.49974339498696446, + "grad_norm": 0.333623396144671, + "learning_rate": 8.817381911834272e-06, + "loss": 0.537, + "step": 3043 + }, + { + "epoch": 0.4999076221953072, + "grad_norm": 0.5552665670609216, + "learning_rate": 8.817249644288669e-06, + "loss": 0.5366, + "step": 3044 + }, + { + "epoch": 0.5000718494036499, + "grad_norm": 0.3361543209358446, + "learning_rate": 8.817117329853489e-06, + "loss": 0.5571, + "step": 3045 + }, + { + "epoch": 0.5002360766119927, + "grad_norm": 0.3305629109517608, + "learning_rate": 8.816984968530167e-06, + "loss": 0.5147, + "step": 3046 + }, + { + "epoch": 0.5004003038203354, + "grad_norm": 0.3732536451895665, + "learning_rate": 8.816852560320142e-06, + "loss": 0.5593, + "step": 3047 + }, + { + "epoch": 0.5005645310286782, + "grad_norm": 0.3253389659113194, + "learning_rate": 8.816720105224851e-06, + "loss": 0.5649, + "step": 3048 + }, + { + "epoch": 0.5007287582370209, + "grad_norm": 0.5642123979492453, + "learning_rate": 8.81658760324573e-06, + "loss": 0.5608, + "step": 3049 + }, + { + "epoch": 0.5008929854453636, + "grad_norm": 0.34823482199959965, + "learning_rate": 8.816455054384224e-06, + "loss": 0.5371, + "step": 3050 + }, + { + "epoch": 0.5010572126537064, + "grad_norm": 0.2841807878282881, + "learning_rate": 8.816322458641767e-06, + "loss": 0.577, + "step": 3051 + }, + { + "epoch": 0.5012214398620491, + "grad_norm": 0.42949070216811547, + "learning_rate": 8.816189816019802e-06, + "loss": 0.5596, + "step": 3052 + }, + { + "epoch": 0.5013856670703919, + "grad_norm": 0.37820954212665636, + "learning_rate": 8.816057126519769e-06, + "loss": 0.5555, + "step": 3053 + }, + { + "epoch": 0.5015498942787346, + "grad_norm": 0.33884019153259126, + "learning_rate": 8.815924390143108e-06, + "loss": 0.5125, + "step": 3054 + }, + { + "epoch": 0.5017141214870774, + "grad_norm": 0.3024625122784817, + "learning_rate": 8.815791606891265e-06, + "loss": 0.5431, + "step": 3055 + }, + { + "epoch": 0.5018783486954201, + "grad_norm": 0.31699884443526544, + "learning_rate": 8.815658776765675e-06, + "loss": 0.5392, + "step": 3056 + }, + { + "epoch": 0.5020425759037629, + "grad_norm": 0.3264450614548681, + "learning_rate": 8.815525899767788e-06, + "loss": 0.5455, + "step": 3057 + }, + { + "epoch": 0.5022068031121056, + "grad_norm": 0.3043521417126159, + "learning_rate": 8.815392975899042e-06, + "loss": 0.546, + "step": 3058 + }, + { + "epoch": 0.5023710303204484, + "grad_norm": 0.2830419102096985, + "learning_rate": 8.815260005160884e-06, + "loss": 0.5405, + "step": 3059 + }, + { + "epoch": 0.5025352575287911, + "grad_norm": 0.3197247856576631, + "learning_rate": 8.815126987554755e-06, + "loss": 0.5653, + "step": 3060 + }, + { + "epoch": 0.5026994847371338, + "grad_norm": 0.3388722907872103, + "learning_rate": 8.814993923082102e-06, + "loss": 0.5546, + "step": 3061 + }, + { + "epoch": 0.5028637119454765, + "grad_norm": 0.29425536996130774, + "learning_rate": 8.81486081174437e-06, + "loss": 0.5499, + "step": 3062 + }, + { + "epoch": 0.5030279391538193, + "grad_norm": 0.34880282593568984, + "learning_rate": 8.814727653543005e-06, + "loss": 0.5848, + "step": 3063 + }, + { + "epoch": 0.503192166362162, + "grad_norm": 0.3310747337461077, + "learning_rate": 8.814594448479452e-06, + "loss": 0.551, + "step": 3064 + }, + { + "epoch": 0.5033563935705048, + "grad_norm": 0.3895534185008042, + "learning_rate": 8.814461196555156e-06, + "loss": 0.5314, + "step": 3065 + }, + { + "epoch": 0.5035206207788475, + "grad_norm": 0.3101567026796809, + "learning_rate": 8.81432789777157e-06, + "loss": 0.5445, + "step": 3066 + }, + { + "epoch": 0.5036848479871903, + "grad_norm": 0.28256966020552643, + "learning_rate": 8.814194552130136e-06, + "loss": 0.5586, + "step": 3067 + }, + { + "epoch": 0.503849075195533, + "grad_norm": 0.29522749873378323, + "learning_rate": 8.814061159632306e-06, + "loss": 0.5617, + "step": 3068 + }, + { + "epoch": 0.5040133024038758, + "grad_norm": 0.35241166914489735, + "learning_rate": 8.813927720279526e-06, + "loss": 0.5617, + "step": 3069 + }, + { + "epoch": 0.5041775296122185, + "grad_norm": 0.3227727829669039, + "learning_rate": 8.813794234073247e-06, + "loss": 0.5513, + "step": 3070 + }, + { + "epoch": 0.5043417568205613, + "grad_norm": 0.35211764287996394, + "learning_rate": 8.813660701014918e-06, + "loss": 0.553, + "step": 3071 + }, + { + "epoch": 0.504505984028904, + "grad_norm": 0.2820677190638171, + "learning_rate": 8.813527121105991e-06, + "loss": 0.5366, + "step": 3072 + }, + { + "epoch": 0.5046702112372468, + "grad_norm": 0.3628674686715253, + "learning_rate": 8.813393494347915e-06, + "loss": 0.5504, + "step": 3073 + }, + { + "epoch": 0.5048344384455895, + "grad_norm": 0.3344707001677626, + "learning_rate": 8.813259820742143e-06, + "loss": 0.5411, + "step": 3074 + }, + { + "epoch": 0.5049986656539323, + "grad_norm": 0.33865193193614024, + "learning_rate": 8.813126100290124e-06, + "loss": 0.543, + "step": 3075 + }, + { + "epoch": 0.505162892862275, + "grad_norm": 0.34283850872753413, + "learning_rate": 8.812992332993312e-06, + "loss": 0.5304, + "step": 3076 + }, + { + "epoch": 0.5053271200706178, + "grad_norm": 0.3125953167598692, + "learning_rate": 8.81285851885316e-06, + "loss": 0.5779, + "step": 3077 + }, + { + "epoch": 0.5054913472789604, + "grad_norm": 0.3253589557042883, + "learning_rate": 8.812724657871124e-06, + "loss": 0.544, + "step": 3078 + }, + { + "epoch": 0.5056555744873031, + "grad_norm": 0.32809582790623654, + "learning_rate": 8.812590750048651e-06, + "loss": 0.5484, + "step": 3079 + }, + { + "epoch": 0.5058198016956459, + "grad_norm": 0.3000554576014139, + "learning_rate": 8.8124567953872e-06, + "loss": 0.5455, + "step": 3080 + }, + { + "epoch": 0.5059840289039886, + "grad_norm": 0.4201987809012978, + "learning_rate": 8.812322793888229e-06, + "loss": 0.542, + "step": 3081 + }, + { + "epoch": 0.5061482561123314, + "grad_norm": 0.3110305012514798, + "learning_rate": 8.812188745553186e-06, + "loss": 0.5443, + "step": 3082 + }, + { + "epoch": 0.5063124833206741, + "grad_norm": 0.31745465529222217, + "learning_rate": 8.812054650383533e-06, + "loss": 0.5374, + "step": 3083 + }, + { + "epoch": 0.5064767105290169, + "grad_norm": 0.28706529458952756, + "learning_rate": 8.811920508380722e-06, + "loss": 0.5796, + "step": 3084 + }, + { + "epoch": 0.5066409377373596, + "grad_norm": 0.31694659994111607, + "learning_rate": 8.811786319546213e-06, + "loss": 0.568, + "step": 3085 + }, + { + "epoch": 0.5068051649457024, + "grad_norm": 0.3311771175932345, + "learning_rate": 8.81165208388146e-06, + "loss": 0.541, + "step": 3086 + }, + { + "epoch": 0.5069693921540451, + "grad_norm": 0.2894350217712809, + "learning_rate": 8.811517801387926e-06, + "loss": 0.5511, + "step": 3087 + }, + { + "epoch": 0.5071336193623879, + "grad_norm": 0.3198518620985994, + "learning_rate": 8.811383472067066e-06, + "loss": 0.5487, + "step": 3088 + }, + { + "epoch": 0.5072978465707306, + "grad_norm": 0.2919190229017156, + "learning_rate": 8.811249095920339e-06, + "loss": 0.5515, + "step": 3089 + }, + { + "epoch": 0.5074620737790734, + "grad_norm": 0.3323556812127161, + "learning_rate": 8.811114672949207e-06, + "loss": 0.5363, + "step": 3090 + }, + { + "epoch": 0.5076263009874161, + "grad_norm": 0.28621504004096965, + "learning_rate": 8.810980203155126e-06, + "loss": 0.5152, + "step": 3091 + }, + { + "epoch": 0.5077905281957589, + "grad_norm": 0.30244379133479443, + "learning_rate": 8.81084568653956e-06, + "loss": 0.5592, + "step": 3092 + }, + { + "epoch": 0.5079547554041016, + "grad_norm": 0.32967015947935796, + "learning_rate": 8.810711123103967e-06, + "loss": 0.5378, + "step": 3093 + }, + { + "epoch": 0.5081189826124444, + "grad_norm": 0.32233218351029747, + "learning_rate": 8.810576512849812e-06, + "loss": 0.5701, + "step": 3094 + }, + { + "epoch": 0.508283209820787, + "grad_norm": 0.3038805312926043, + "learning_rate": 8.810441855778554e-06, + "loss": 0.5369, + "step": 3095 + }, + { + "epoch": 0.5084474370291298, + "grad_norm": 0.3675474725686056, + "learning_rate": 8.810307151891658e-06, + "loss": 0.5435, + "step": 3096 + }, + { + "epoch": 0.5086116642374725, + "grad_norm": 0.3031891362387634, + "learning_rate": 8.810172401190583e-06, + "loss": 0.553, + "step": 3097 + }, + { + "epoch": 0.5087758914458153, + "grad_norm": 0.3125909011850417, + "learning_rate": 8.810037603676797e-06, + "loss": 0.5349, + "step": 3098 + }, + { + "epoch": 0.508940118654158, + "grad_norm": 0.5022551060954056, + "learning_rate": 8.809902759351761e-06, + "loss": 0.5645, + "step": 3099 + }, + { + "epoch": 0.5091043458625008, + "grad_norm": 0.3055638918113333, + "learning_rate": 8.809767868216941e-06, + "loss": 0.548, + "step": 3100 + }, + { + "epoch": 0.5092685730708435, + "grad_norm": 0.2874713939919956, + "learning_rate": 8.809632930273801e-06, + "loss": 0.561, + "step": 3101 + }, + { + "epoch": 0.5094328002791862, + "grad_norm": 0.4250919250625114, + "learning_rate": 8.809497945523808e-06, + "loss": 0.5645, + "step": 3102 + }, + { + "epoch": 0.509597027487529, + "grad_norm": 0.35550987220631075, + "learning_rate": 8.809362913968428e-06, + "loss": 0.5403, + "step": 3103 + }, + { + "epoch": 0.5097612546958717, + "grad_norm": 0.3443953804934167, + "learning_rate": 8.809227835609127e-06, + "loss": 0.5413, + "step": 3104 + }, + { + "epoch": 0.5099254819042145, + "grad_norm": 0.2839313401806877, + "learning_rate": 8.80909271044737e-06, + "loss": 0.5491, + "step": 3105 + }, + { + "epoch": 0.5100897091125572, + "grad_norm": 0.2967153264099807, + "learning_rate": 8.808957538484629e-06, + "loss": 0.5488, + "step": 3106 + }, + { + "epoch": 0.5102539363209, + "grad_norm": 0.28848535562090943, + "learning_rate": 8.808822319722367e-06, + "loss": 0.5588, + "step": 3107 + }, + { + "epoch": 0.5104181635292427, + "grad_norm": 0.2830657128227046, + "learning_rate": 8.808687054162057e-06, + "loss": 0.557, + "step": 3108 + }, + { + "epoch": 0.5105823907375855, + "grad_norm": 0.39693335600192026, + "learning_rate": 8.808551741805167e-06, + "loss": 0.5537, + "step": 3109 + }, + { + "epoch": 0.5107466179459282, + "grad_norm": 0.3161563688662857, + "learning_rate": 8.808416382653165e-06, + "loss": 0.5418, + "step": 3110 + }, + { + "epoch": 0.510910845154271, + "grad_norm": 0.299459534024022, + "learning_rate": 8.808280976707522e-06, + "loss": 0.5751, + "step": 3111 + }, + { + "epoch": 0.5110750723626136, + "grad_norm": 0.3302741452682689, + "learning_rate": 8.80814552396971e-06, + "loss": 0.5722, + "step": 3112 + }, + { + "epoch": 0.5112392995709564, + "grad_norm": 0.4430503083050463, + "learning_rate": 8.808010024441198e-06, + "loss": 0.5413, + "step": 3113 + }, + { + "epoch": 0.5114035267792991, + "grad_norm": 0.32648783038577595, + "learning_rate": 8.80787447812346e-06, + "loss": 0.5405, + "step": 3114 + }, + { + "epoch": 0.5115677539876419, + "grad_norm": 0.300303509573739, + "learning_rate": 8.807738885017965e-06, + "loss": 0.5455, + "step": 3115 + }, + { + "epoch": 0.5117319811959846, + "grad_norm": 0.3694340171302308, + "learning_rate": 8.807603245126187e-06, + "loss": 0.545, + "step": 3116 + }, + { + "epoch": 0.5118962084043274, + "grad_norm": 0.3788879023858456, + "learning_rate": 8.807467558449603e-06, + "loss": 0.5359, + "step": 3117 + }, + { + "epoch": 0.5120604356126701, + "grad_norm": 0.353354588272609, + "learning_rate": 8.80733182498968e-06, + "loss": 0.5488, + "step": 3118 + }, + { + "epoch": 0.5122246628210129, + "grad_norm": 0.3408319876373525, + "learning_rate": 8.807196044747897e-06, + "loss": 0.5485, + "step": 3119 + }, + { + "epoch": 0.5123888900293556, + "grad_norm": 0.39035712683504437, + "learning_rate": 8.807060217725726e-06, + "loss": 0.5464, + "step": 3120 + }, + { + "epoch": 0.5125531172376984, + "grad_norm": 0.3533902186234627, + "learning_rate": 8.806924343924644e-06, + "loss": 0.5289, + "step": 3121 + }, + { + "epoch": 0.5127173444460411, + "grad_norm": 0.5125086187864211, + "learning_rate": 8.806788423346127e-06, + "loss": 0.5434, + "step": 3122 + }, + { + "epoch": 0.5128815716543839, + "grad_norm": 0.7203539322855015, + "learning_rate": 8.806652455991651e-06, + "loss": 0.5588, + "step": 3123 + }, + { + "epoch": 0.5130457988627266, + "grad_norm": 0.36552585640486723, + "learning_rate": 8.80651644186269e-06, + "loss": 0.5523, + "step": 3124 + }, + { + "epoch": 0.5132100260710694, + "grad_norm": 0.31504873528001304, + "learning_rate": 8.806380380960725e-06, + "loss": 0.5414, + "step": 3125 + }, + { + "epoch": 0.5133742532794121, + "grad_norm": 0.3246787661246523, + "learning_rate": 8.806244273287233e-06, + "loss": 0.5418, + "step": 3126 + }, + { + "epoch": 0.5135384804877549, + "grad_norm": 0.3715899215366805, + "learning_rate": 8.806108118843688e-06, + "loss": 0.5657, + "step": 3127 + }, + { + "epoch": 0.5137027076960976, + "grad_norm": 0.4586182447833959, + "learning_rate": 8.805971917631575e-06, + "loss": 0.5338, + "step": 3128 + }, + { + "epoch": 0.5138669349044402, + "grad_norm": 0.2998730656450566, + "learning_rate": 8.80583566965237e-06, + "loss": 0.567, + "step": 3129 + }, + { + "epoch": 0.514031162112783, + "grad_norm": 0.3358753869110285, + "learning_rate": 8.805699374907553e-06, + "loss": 0.5329, + "step": 3130 + }, + { + "epoch": 0.5141953893211257, + "grad_norm": 0.3623082881810261, + "learning_rate": 8.805563033398604e-06, + "loss": 0.5481, + "step": 3131 + }, + { + "epoch": 0.5143596165294685, + "grad_norm": 0.41586552825851203, + "learning_rate": 8.805426645127005e-06, + "loss": 0.5347, + "step": 3132 + }, + { + "epoch": 0.5145238437378112, + "grad_norm": 0.3389036505262228, + "learning_rate": 8.805290210094238e-06, + "loss": 0.5432, + "step": 3133 + }, + { + "epoch": 0.514688070946154, + "grad_norm": 0.33220166906284515, + "learning_rate": 8.80515372830178e-06, + "loss": 0.5737, + "step": 3134 + }, + { + "epoch": 0.5148522981544967, + "grad_norm": 1.4240426803228405, + "learning_rate": 8.80501719975112e-06, + "loss": 0.563, + "step": 3135 + }, + { + "epoch": 0.5150165253628395, + "grad_norm": 0.34283390659735813, + "learning_rate": 8.804880624443737e-06, + "loss": 0.5487, + "step": 3136 + }, + { + "epoch": 0.5151807525711822, + "grad_norm": 0.3625409825436214, + "learning_rate": 8.804744002381114e-06, + "loss": 0.558, + "step": 3137 + }, + { + "epoch": 0.515344979779525, + "grad_norm": 0.39081632339485356, + "learning_rate": 8.804607333564737e-06, + "loss": 0.5447, + "step": 3138 + }, + { + "epoch": 0.5155092069878677, + "grad_norm": 0.3236707823666088, + "learning_rate": 8.804470617996088e-06, + "loss": 0.5647, + "step": 3139 + }, + { + "epoch": 0.5156734341962105, + "grad_norm": 0.42790371855431897, + "learning_rate": 8.804333855676653e-06, + "loss": 0.5436, + "step": 3140 + }, + { + "epoch": 0.5158376614045532, + "grad_norm": 0.3510945198863832, + "learning_rate": 8.804197046607918e-06, + "loss": 0.595, + "step": 3141 + }, + { + "epoch": 0.516001888612896, + "grad_norm": 0.34592814315247516, + "learning_rate": 8.80406019079137e-06, + "loss": 0.5549, + "step": 3142 + }, + { + "epoch": 0.5161661158212387, + "grad_norm": 0.3300648787481515, + "learning_rate": 8.803923288228492e-06, + "loss": 0.5512, + "step": 3143 + }, + { + "epoch": 0.5163303430295815, + "grad_norm": 0.3367035380808404, + "learning_rate": 8.803786338920773e-06, + "loss": 0.5349, + "step": 3144 + }, + { + "epoch": 0.5164945702379242, + "grad_norm": 0.40066400824553233, + "learning_rate": 8.803649342869698e-06, + "loss": 0.5556, + "step": 3145 + }, + { + "epoch": 0.5166587974462669, + "grad_norm": 0.311816919024996, + "learning_rate": 8.803512300076759e-06, + "loss": 0.5589, + "step": 3146 + }, + { + "epoch": 0.5168230246546096, + "grad_norm": 0.31924308074256, + "learning_rate": 8.803375210543442e-06, + "loss": 0.5502, + "step": 3147 + }, + { + "epoch": 0.5169872518629524, + "grad_norm": 0.3624853958484826, + "learning_rate": 8.803238074271237e-06, + "loss": 0.5562, + "step": 3148 + }, + { + "epoch": 0.5171514790712951, + "grad_norm": 0.3150090211544387, + "learning_rate": 8.803100891261632e-06, + "loss": 0.534, + "step": 3149 + }, + { + "epoch": 0.5173157062796379, + "grad_norm": 0.30714145840950313, + "learning_rate": 8.802963661516117e-06, + "loss": 0.5609, + "step": 3150 + }, + { + "epoch": 0.5174799334879806, + "grad_norm": 0.41808965641054635, + "learning_rate": 8.802826385036183e-06, + "loss": 0.5277, + "step": 3151 + }, + { + "epoch": 0.5176441606963234, + "grad_norm": 0.33805758218512644, + "learning_rate": 8.802689061823322e-06, + "loss": 0.5744, + "step": 3152 + }, + { + "epoch": 0.5178083879046661, + "grad_norm": 0.32757272518208763, + "learning_rate": 8.802551691879024e-06, + "loss": 0.5434, + "step": 3153 + }, + { + "epoch": 0.5179726151130088, + "grad_norm": 0.30549027755496716, + "learning_rate": 8.802414275204783e-06, + "loss": 0.5388, + "step": 3154 + }, + { + "epoch": 0.5181368423213516, + "grad_norm": 0.5121294354664127, + "learning_rate": 8.802276811802089e-06, + "loss": 0.5559, + "step": 3155 + }, + { + "epoch": 0.5183010695296943, + "grad_norm": 0.4360850175903607, + "learning_rate": 8.802139301672434e-06, + "loss": 0.5508, + "step": 3156 + }, + { + "epoch": 0.5184652967380371, + "grad_norm": 0.3109791520360334, + "learning_rate": 8.802001744817315e-06, + "loss": 0.5582, + "step": 3157 + }, + { + "epoch": 0.5186295239463798, + "grad_norm": 0.34541983264627407, + "learning_rate": 8.801864141238225e-06, + "loss": 0.5425, + "step": 3158 + }, + { + "epoch": 0.5187937511547226, + "grad_norm": 0.44077911149552174, + "learning_rate": 8.801726490936658e-06, + "loss": 0.5333, + "step": 3159 + }, + { + "epoch": 0.5189579783630653, + "grad_norm": 0.3175152055000214, + "learning_rate": 8.801588793914108e-06, + "loss": 0.5347, + "step": 3160 + }, + { + "epoch": 0.5191222055714081, + "grad_norm": 0.33110520939463356, + "learning_rate": 8.801451050172072e-06, + "loss": 0.5367, + "step": 3161 + }, + { + "epoch": 0.5192864327797508, + "grad_norm": 0.6786090232297229, + "learning_rate": 8.801313259712045e-06, + "loss": 0.5379, + "step": 3162 + }, + { + "epoch": 0.5194506599880935, + "grad_norm": 0.32345532898552576, + "learning_rate": 8.801175422535524e-06, + "loss": 0.5689, + "step": 3163 + }, + { + "epoch": 0.5196148871964362, + "grad_norm": 0.29288324836950214, + "learning_rate": 8.801037538644008e-06, + "loss": 0.552, + "step": 3164 + }, + { + "epoch": 0.519779114404779, + "grad_norm": 0.3060045843978916, + "learning_rate": 8.80089960803899e-06, + "loss": 0.549, + "step": 3165 + }, + { + "epoch": 0.5199433416131217, + "grad_norm": 0.3997067399866202, + "learning_rate": 8.800761630721973e-06, + "loss": 0.5322, + "step": 3166 + }, + { + "epoch": 0.5201075688214645, + "grad_norm": 0.2952695042762905, + "learning_rate": 8.800623606694453e-06, + "loss": 0.5631, + "step": 3167 + }, + { + "epoch": 0.5202717960298072, + "grad_norm": 0.33202602894268374, + "learning_rate": 8.800485535957928e-06, + "loss": 0.5355, + "step": 3168 + }, + { + "epoch": 0.52043602323815, + "grad_norm": 0.3053471478926584, + "learning_rate": 8.8003474185139e-06, + "loss": 0.5239, + "step": 3169 + }, + { + "epoch": 0.5206002504464927, + "grad_norm": 0.44546827302209424, + "learning_rate": 8.80020925436387e-06, + "loss": 0.5446, + "step": 3170 + }, + { + "epoch": 0.5207644776548355, + "grad_norm": 0.3232464754044946, + "learning_rate": 8.800071043509333e-06, + "loss": 0.5625, + "step": 3171 + }, + { + "epoch": 0.5209287048631782, + "grad_norm": 0.31924534819311456, + "learning_rate": 8.799932785951797e-06, + "loss": 0.5548, + "step": 3172 + }, + { + "epoch": 0.521092932071521, + "grad_norm": 0.8093812270735015, + "learning_rate": 8.799794481692757e-06, + "loss": 0.5512, + "step": 3173 + }, + { + "epoch": 0.5212571592798637, + "grad_norm": 0.353912459104798, + "learning_rate": 8.79965613073372e-06, + "loss": 0.5355, + "step": 3174 + }, + { + "epoch": 0.5214213864882065, + "grad_norm": 17.314905034002273, + "learning_rate": 8.799517733076186e-06, + "loss": 0.5549, + "step": 3175 + }, + { + "epoch": 0.5215856136965492, + "grad_norm": 0.4253045024687482, + "learning_rate": 8.799379288721663e-06, + "loss": 0.5465, + "step": 3176 + }, + { + "epoch": 0.521749840904892, + "grad_norm": 0.5363599916050024, + "learning_rate": 8.799240797671648e-06, + "loss": 0.5398, + "step": 3177 + }, + { + "epoch": 0.5219140681132347, + "grad_norm": 0.5429239713433921, + "learning_rate": 8.799102259927648e-06, + "loss": 0.5547, + "step": 3178 + }, + { + "epoch": 0.5220782953215775, + "grad_norm": 0.888409605205552, + "learning_rate": 8.798963675491168e-06, + "loss": 0.569, + "step": 3179 + }, + { + "epoch": 0.5222425225299201, + "grad_norm": 0.8292470249496997, + "learning_rate": 8.798825044363714e-06, + "loss": 0.5491, + "step": 3180 + }, + { + "epoch": 0.5224067497382628, + "grad_norm": 0.8524348696719072, + "learning_rate": 8.79868636654679e-06, + "loss": 0.5565, + "step": 3181 + }, + { + "epoch": 0.5225709769466056, + "grad_norm": 0.7638273678405479, + "learning_rate": 8.798547642041903e-06, + "loss": 0.5655, + "step": 3182 + }, + { + "epoch": 0.5227352041549483, + "grad_norm": 0.5794743123076325, + "learning_rate": 8.798408870850557e-06, + "loss": 0.5791, + "step": 3183 + }, + { + "epoch": 0.5228994313632911, + "grad_norm": 0.6652418445715474, + "learning_rate": 8.798270052974265e-06, + "loss": 0.5817, + "step": 3184 + }, + { + "epoch": 0.5230636585716338, + "grad_norm": 0.6033023612512277, + "learning_rate": 8.79813118841453e-06, + "loss": 0.5602, + "step": 3185 + }, + { + "epoch": 0.5232278857799766, + "grad_norm": 0.7713085937031505, + "learning_rate": 8.79799227717286e-06, + "loss": 0.5467, + "step": 3186 + }, + { + "epoch": 0.5233921129883193, + "grad_norm": 0.5954572400751407, + "learning_rate": 8.797853319250767e-06, + "loss": 0.5632, + "step": 3187 + }, + { + "epoch": 0.5235563401966621, + "grad_norm": 0.47584740248527363, + "learning_rate": 8.797714314649757e-06, + "loss": 0.5594, + "step": 3188 + }, + { + "epoch": 0.5237205674050048, + "grad_norm": 0.5639314104879384, + "learning_rate": 8.797575263371343e-06, + "loss": 0.5555, + "step": 3189 + }, + { + "epoch": 0.5238847946133476, + "grad_norm": 0.5285776579354926, + "learning_rate": 8.797436165417032e-06, + "loss": 0.5671, + "step": 3190 + }, + { + "epoch": 0.5240490218216903, + "grad_norm": 0.43872512757835813, + "learning_rate": 8.797297020788336e-06, + "loss": 0.5555, + "step": 3191 + }, + { + "epoch": 0.5242132490300331, + "grad_norm": 0.5753312286290805, + "learning_rate": 8.797157829486767e-06, + "loss": 0.5585, + "step": 3192 + }, + { + "epoch": 0.5243774762383758, + "grad_norm": 0.38476151861139246, + "learning_rate": 8.797018591513837e-06, + "loss": 0.5576, + "step": 3193 + }, + { + "epoch": 0.5245417034467186, + "grad_norm": 0.43061245503098006, + "learning_rate": 8.796879306871056e-06, + "loss": 0.58, + "step": 3194 + }, + { + "epoch": 0.5247059306550613, + "grad_norm": 0.3378612386322587, + "learning_rate": 8.79673997555994e-06, + "loss": 0.552, + "step": 3195 + }, + { + "epoch": 0.5248701578634041, + "grad_norm": 0.3335234350164391, + "learning_rate": 8.796600597581998e-06, + "loss": 0.5599, + "step": 3196 + }, + { + "epoch": 0.5250343850717467, + "grad_norm": 0.5837684658025116, + "learning_rate": 8.796461172938749e-06, + "loss": 0.564, + "step": 3197 + }, + { + "epoch": 0.5251986122800895, + "grad_norm": 0.34310523436981255, + "learning_rate": 8.796321701631702e-06, + "loss": 0.541, + "step": 3198 + }, + { + "epoch": 0.5253628394884322, + "grad_norm": 0.44627037677002795, + "learning_rate": 8.796182183662376e-06, + "loss": 0.5639, + "step": 3199 + }, + { + "epoch": 0.525527066696775, + "grad_norm": 0.3762255974042171, + "learning_rate": 8.796042619032283e-06, + "loss": 0.5529, + "step": 3200 + }, + { + "epoch": 0.5256912939051177, + "grad_norm": 0.3437555157560286, + "learning_rate": 8.795903007742941e-06, + "loss": 0.5632, + "step": 3201 + }, + { + "epoch": 0.5258555211134605, + "grad_norm": 0.3317101998553545, + "learning_rate": 8.795763349795866e-06, + "loss": 0.5524, + "step": 3202 + }, + { + "epoch": 0.5260197483218032, + "grad_norm": 0.3979361730906255, + "learning_rate": 8.795623645192574e-06, + "loss": 0.5464, + "step": 3203 + }, + { + "epoch": 0.526183975530146, + "grad_norm": 0.3619951010888624, + "learning_rate": 8.795483893934584e-06, + "loss": 0.5404, + "step": 3204 + }, + { + "epoch": 0.5263482027384887, + "grad_norm": 0.3117407096018107, + "learning_rate": 8.795344096023411e-06, + "loss": 0.5376, + "step": 3205 + }, + { + "epoch": 0.5265124299468315, + "grad_norm": 0.3293924524709304, + "learning_rate": 8.795204251460576e-06, + "loss": 0.5306, + "step": 3206 + }, + { + "epoch": 0.5266766571551742, + "grad_norm": 0.41805058863513644, + "learning_rate": 8.795064360247598e-06, + "loss": 0.5646, + "step": 3207 + }, + { + "epoch": 0.526840884363517, + "grad_norm": 0.3826750759835131, + "learning_rate": 8.794924422385995e-06, + "loss": 0.5386, + "step": 3208 + }, + { + "epoch": 0.5270051115718597, + "grad_norm": 0.3430815145020039, + "learning_rate": 8.794784437877286e-06, + "loss": 0.5521, + "step": 3209 + }, + { + "epoch": 0.5271693387802024, + "grad_norm": 0.5506840435783226, + "learning_rate": 8.794644406722993e-06, + "loss": 0.5466, + "step": 3210 + }, + { + "epoch": 0.5273335659885452, + "grad_norm": 0.2992384367198502, + "learning_rate": 8.794504328924636e-06, + "loss": 0.5506, + "step": 3211 + }, + { + "epoch": 0.5274977931968879, + "grad_norm": 0.3029752344219687, + "learning_rate": 8.794364204483736e-06, + "loss": 0.5484, + "step": 3212 + }, + { + "epoch": 0.5276620204052307, + "grad_norm": 0.2948964578526369, + "learning_rate": 8.794224033401818e-06, + "loss": 0.5462, + "step": 3213 + }, + { + "epoch": 0.5278262476135733, + "grad_norm": 0.366601371536449, + "learning_rate": 8.794083815680402e-06, + "loss": 0.5599, + "step": 3214 + }, + { + "epoch": 0.5279904748219161, + "grad_norm": 0.3498632389083059, + "learning_rate": 8.79394355132101e-06, + "loss": 0.5383, + "step": 3215 + }, + { + "epoch": 0.5281547020302588, + "grad_norm": 0.31505091855348366, + "learning_rate": 8.793803240325165e-06, + "loss": 0.5631, + "step": 3216 + }, + { + "epoch": 0.5283189292386016, + "grad_norm": 0.42965003549284014, + "learning_rate": 8.793662882694394e-06, + "loss": 0.5419, + "step": 3217 + }, + { + "epoch": 0.5284831564469443, + "grad_norm": 0.29650964340269054, + "learning_rate": 8.79352247843022e-06, + "loss": 0.5551, + "step": 3218 + }, + { + "epoch": 0.5286473836552871, + "grad_norm": 0.29571046480753704, + "learning_rate": 8.793382027534167e-06, + "loss": 0.5402, + "step": 3219 + }, + { + "epoch": 0.5288116108636298, + "grad_norm": 0.3534211821619501, + "learning_rate": 8.79324153000776e-06, + "loss": 0.56, + "step": 3220 + }, + { + "epoch": 0.5289758380719726, + "grad_norm": 0.3105684543180674, + "learning_rate": 8.793100985852527e-06, + "loss": 0.5344, + "step": 3221 + }, + { + "epoch": 0.5291400652803153, + "grad_norm": 0.32396286394780704, + "learning_rate": 8.792960395069993e-06, + "loss": 0.5344, + "step": 3222 + }, + { + "epoch": 0.5293042924886581, + "grad_norm": 0.40299790637446253, + "learning_rate": 8.792819757661686e-06, + "loss": 0.5337, + "step": 3223 + }, + { + "epoch": 0.5294685196970008, + "grad_norm": 0.2948307048801837, + "learning_rate": 8.792679073629132e-06, + "loss": 0.5372, + "step": 3224 + }, + { + "epoch": 0.5296327469053436, + "grad_norm": 0.33790169922370855, + "learning_rate": 8.792538342973862e-06, + "loss": 0.5311, + "step": 3225 + }, + { + "epoch": 0.5297969741136863, + "grad_norm": 0.30645198851218913, + "learning_rate": 8.792397565697399e-06, + "loss": 0.543, + "step": 3226 + }, + { + "epoch": 0.5299612013220291, + "grad_norm": 0.3628053695130395, + "learning_rate": 8.792256741801277e-06, + "loss": 0.5559, + "step": 3227 + }, + { + "epoch": 0.5301254285303718, + "grad_norm": 0.27857795489535925, + "learning_rate": 8.792115871287025e-06, + "loss": 0.53, + "step": 3228 + }, + { + "epoch": 0.5302896557387146, + "grad_norm": 0.3071228087006437, + "learning_rate": 8.79197495415617e-06, + "loss": 0.5541, + "step": 3229 + }, + { + "epoch": 0.5304538829470573, + "grad_norm": 0.3325204943756406, + "learning_rate": 8.791833990410246e-06, + "loss": 0.5422, + "step": 3230 + }, + { + "epoch": 0.5306181101554, + "grad_norm": 0.3669488938803181, + "learning_rate": 8.79169298005078e-06, + "loss": 0.5363, + "step": 3231 + }, + { + "epoch": 0.5307823373637427, + "grad_norm": 0.27794447125357075, + "learning_rate": 8.791551923079308e-06, + "loss": 0.5678, + "step": 3232 + }, + { + "epoch": 0.5309465645720854, + "grad_norm": 0.3167738475291457, + "learning_rate": 8.791410819497359e-06, + "loss": 0.54, + "step": 3233 + }, + { + "epoch": 0.5311107917804282, + "grad_norm": 0.39694196446332014, + "learning_rate": 8.791269669306465e-06, + "loss": 0.5468, + "step": 3234 + }, + { + "epoch": 0.5312750189887709, + "grad_norm": 0.32582945645680483, + "learning_rate": 8.791128472508163e-06, + "loss": 0.5498, + "step": 3235 + }, + { + "epoch": 0.5314392461971137, + "grad_norm": 0.30304838745817103, + "learning_rate": 8.790987229103981e-06, + "loss": 0.5399, + "step": 3236 + }, + { + "epoch": 0.5316034734054564, + "grad_norm": 0.3316043075793242, + "learning_rate": 8.790845939095456e-06, + "loss": 0.5433, + "step": 3237 + }, + { + "epoch": 0.5317677006137992, + "grad_norm": 0.3544344684280817, + "learning_rate": 8.790704602484125e-06, + "loss": 0.5629, + "step": 3238 + }, + { + "epoch": 0.5319319278221419, + "grad_norm": 0.2916093235123261, + "learning_rate": 8.790563219271518e-06, + "loss": 0.5545, + "step": 3239 + }, + { + "epoch": 0.5320961550304847, + "grad_norm": 0.45549622656485855, + "learning_rate": 8.790421789459175e-06, + "loss": 0.5399, + "step": 3240 + }, + { + "epoch": 0.5322603822388274, + "grad_norm": 0.33583247440668695, + "learning_rate": 8.79028031304863e-06, + "loss": 0.5452, + "step": 3241 + }, + { + "epoch": 0.5324246094471702, + "grad_norm": 0.30414857412185614, + "learning_rate": 8.79013879004142e-06, + "loss": 0.5533, + "step": 3242 + }, + { + "epoch": 0.5325888366555129, + "grad_norm": 0.3661648266690582, + "learning_rate": 8.78999722043908e-06, + "loss": 0.5316, + "step": 3243 + }, + { + "epoch": 0.5327530638638557, + "grad_norm": 0.2943779954560458, + "learning_rate": 8.78985560424315e-06, + "loss": 0.5439, + "step": 3244 + }, + { + "epoch": 0.5329172910721984, + "grad_norm": 0.6613520472875823, + "learning_rate": 8.789713941455168e-06, + "loss": 0.5534, + "step": 3245 + }, + { + "epoch": 0.5330815182805412, + "grad_norm": 0.37859318983449436, + "learning_rate": 8.789572232076671e-06, + "loss": 0.5552, + "step": 3246 + }, + { + "epoch": 0.5332457454888838, + "grad_norm": 0.32920543399704716, + "learning_rate": 8.789430476109201e-06, + "loss": 0.5392, + "step": 3247 + }, + { + "epoch": 0.5334099726972266, + "grad_norm": 0.36207175639388633, + "learning_rate": 8.789288673554296e-06, + "loss": 0.5584, + "step": 3248 + }, + { + "epoch": 0.5335741999055693, + "grad_norm": 0.3265540316957601, + "learning_rate": 8.789146824413494e-06, + "loss": 0.5497, + "step": 3249 + }, + { + "epoch": 0.5337384271139121, + "grad_norm": 0.7945446107991943, + "learning_rate": 8.789004928688339e-06, + "loss": 0.5217, + "step": 3250 + }, + { + "epoch": 0.5339026543222548, + "grad_norm": 0.3216038639309901, + "learning_rate": 8.78886298638037e-06, + "loss": 0.5409, + "step": 3251 + }, + { + "epoch": 0.5340668815305976, + "grad_norm": 0.29287927055837376, + "learning_rate": 8.78872099749113e-06, + "loss": 0.5481, + "step": 3252 + }, + { + "epoch": 0.5342311087389403, + "grad_norm": 0.29317563619901427, + "learning_rate": 8.78857896202216e-06, + "loss": 0.548, + "step": 3253 + }, + { + "epoch": 0.5343953359472831, + "grad_norm": 0.739804214926253, + "learning_rate": 8.788436879975003e-06, + "loss": 0.5477, + "step": 3254 + }, + { + "epoch": 0.5345595631556258, + "grad_norm": 0.3462570710581254, + "learning_rate": 8.788294751351201e-06, + "loss": 0.5446, + "step": 3255 + }, + { + "epoch": 0.5347237903639686, + "grad_norm": 0.35379963473107207, + "learning_rate": 8.7881525761523e-06, + "loss": 0.5492, + "step": 3256 + }, + { + "epoch": 0.5348880175723113, + "grad_norm": 0.6842229018746369, + "learning_rate": 8.788010354379844e-06, + "loss": 0.5425, + "step": 3257 + }, + { + "epoch": 0.535052244780654, + "grad_norm": 0.3078870199163517, + "learning_rate": 8.787868086035374e-06, + "loss": 0.5589, + "step": 3258 + }, + { + "epoch": 0.5352164719889968, + "grad_norm": 0.3013589290107262, + "learning_rate": 8.78772577112044e-06, + "loss": 0.5458, + "step": 3259 + }, + { + "epoch": 0.5353806991973395, + "grad_norm": 0.29817945458234185, + "learning_rate": 8.787583409636587e-06, + "loss": 0.5515, + "step": 3260 + }, + { + "epoch": 0.5355449264056823, + "grad_norm": 0.5011852342901882, + "learning_rate": 8.787441001585356e-06, + "loss": 0.5421, + "step": 3261 + }, + { + "epoch": 0.535709153614025, + "grad_norm": 0.3648740653731427, + "learning_rate": 8.787298546968301e-06, + "loss": 0.5653, + "step": 3262 + }, + { + "epoch": 0.5358733808223678, + "grad_norm": 0.29821976509943804, + "learning_rate": 8.787156045786963e-06, + "loss": 0.5378, + "step": 3263 + }, + { + "epoch": 0.5360376080307104, + "grad_norm": 0.3126612686731939, + "learning_rate": 8.787013498042896e-06, + "loss": 0.5413, + "step": 3264 + }, + { + "epoch": 0.5362018352390532, + "grad_norm": 0.4119496533178666, + "learning_rate": 8.786870903737641e-06, + "loss": 0.5681, + "step": 3265 + }, + { + "epoch": 0.5363660624473959, + "grad_norm": 0.3168742881680851, + "learning_rate": 8.78672826287275e-06, + "loss": 0.5539, + "step": 3266 + }, + { + "epoch": 0.5365302896557387, + "grad_norm": 0.31014627546022744, + "learning_rate": 8.786585575449775e-06, + "loss": 0.5537, + "step": 3267 + }, + { + "epoch": 0.5366945168640814, + "grad_norm": 0.4724267454615436, + "learning_rate": 8.786442841470261e-06, + "loss": 0.55, + "step": 3268 + }, + { + "epoch": 0.5368587440724242, + "grad_norm": 0.33371309506311503, + "learning_rate": 8.786300060935761e-06, + "loss": 0.5482, + "step": 3269 + }, + { + "epoch": 0.5370229712807669, + "grad_norm": 0.3795166104581432, + "learning_rate": 8.786157233847827e-06, + "loss": 0.5255, + "step": 3270 + }, + { + "epoch": 0.5371871984891097, + "grad_norm": 0.4632986698184538, + "learning_rate": 8.786014360208008e-06, + "loss": 0.5316, + "step": 3271 + }, + { + "epoch": 0.5373514256974524, + "grad_norm": 0.3120794912916995, + "learning_rate": 8.785871440017854e-06, + "loss": 0.5272, + "step": 3272 + }, + { + "epoch": 0.5375156529057952, + "grad_norm": 0.3266812219370469, + "learning_rate": 8.785728473278922e-06, + "loss": 0.5395, + "step": 3273 + }, + { + "epoch": 0.5376798801141379, + "grad_norm": 0.3667428118373131, + "learning_rate": 8.78558545999276e-06, + "loss": 0.5372, + "step": 3274 + }, + { + "epoch": 0.5378441073224807, + "grad_norm": 0.3770025621124436, + "learning_rate": 8.785442400160925e-06, + "loss": 0.5321, + "step": 3275 + }, + { + "epoch": 0.5380083345308234, + "grad_norm": 0.3108514386539361, + "learning_rate": 8.785299293784968e-06, + "loss": 0.581, + "step": 3276 + }, + { + "epoch": 0.5381725617391662, + "grad_norm": 0.32926200102815384, + "learning_rate": 8.785156140866444e-06, + "loss": 0.5551, + "step": 3277 + }, + { + "epoch": 0.5383367889475089, + "grad_norm": 0.29382084808673803, + "learning_rate": 8.785012941406911e-06, + "loss": 0.5505, + "step": 3278 + }, + { + "epoch": 0.5385010161558517, + "grad_norm": 0.296915713846914, + "learning_rate": 8.78486969540792e-06, + "loss": 0.5403, + "step": 3279 + }, + { + "epoch": 0.5386652433641944, + "grad_norm": 0.3364779005663321, + "learning_rate": 8.784726402871028e-06, + "loss": 0.5507, + "step": 3280 + }, + { + "epoch": 0.538829470572537, + "grad_norm": 0.31712966506772305, + "learning_rate": 8.78458306379779e-06, + "loss": 0.5481, + "step": 3281 + }, + { + "epoch": 0.5389936977808798, + "grad_norm": 0.4194875222865034, + "learning_rate": 8.784439678189769e-06, + "loss": 0.5419, + "step": 3282 + }, + { + "epoch": 0.5391579249892225, + "grad_norm": 0.36369565463198017, + "learning_rate": 8.784296246048515e-06, + "loss": 0.538, + "step": 3283 + }, + { + "epoch": 0.5393221521975653, + "grad_norm": 0.33327635860855287, + "learning_rate": 8.784152767375589e-06, + "loss": 0.5786, + "step": 3284 + }, + { + "epoch": 0.539486379405908, + "grad_norm": 0.31341028441266205, + "learning_rate": 8.784009242172548e-06, + "loss": 0.5383, + "step": 3285 + }, + { + "epoch": 0.5396506066142508, + "grad_norm": 0.363543182076021, + "learning_rate": 8.783865670440954e-06, + "loss": 0.524, + "step": 3286 + }, + { + "epoch": 0.5398148338225935, + "grad_norm": 0.3023715111443661, + "learning_rate": 8.783722052182361e-06, + "loss": 0.54, + "step": 3287 + }, + { + "epoch": 0.5399790610309363, + "grad_norm": 0.3172844189567488, + "learning_rate": 8.783578387398333e-06, + "loss": 0.5319, + "step": 3288 + }, + { + "epoch": 0.540143288239279, + "grad_norm": 0.315659171219246, + "learning_rate": 8.78343467609043e-06, + "loss": 0.5714, + "step": 3289 + }, + { + "epoch": 0.5403075154476218, + "grad_norm": 0.7465434186042995, + "learning_rate": 8.783290918260212e-06, + "loss": 0.5532, + "step": 3290 + }, + { + "epoch": 0.5404717426559645, + "grad_norm": 0.3801930075543929, + "learning_rate": 8.78314711390924e-06, + "loss": 0.5595, + "step": 3291 + }, + { + "epoch": 0.5406359698643073, + "grad_norm": 0.35849452095029644, + "learning_rate": 8.783003263039077e-06, + "loss": 0.558, + "step": 3292 + }, + { + "epoch": 0.54080019707265, + "grad_norm": 0.3286836188938242, + "learning_rate": 8.782859365651284e-06, + "loss": 0.549, + "step": 3293 + }, + { + "epoch": 0.5409644242809928, + "grad_norm": 0.3544088100080713, + "learning_rate": 8.782715421747424e-06, + "loss": 0.5263, + "step": 3294 + }, + { + "epoch": 0.5411286514893355, + "grad_norm": 0.3012499735294153, + "learning_rate": 8.782571431329062e-06, + "loss": 0.5493, + "step": 3295 + }, + { + "epoch": 0.5412928786976783, + "grad_norm": 0.2862519026155833, + "learning_rate": 8.78242739439776e-06, + "loss": 0.5529, + "step": 3296 + }, + { + "epoch": 0.541457105906021, + "grad_norm": 0.34212920789775564, + "learning_rate": 8.782283310955084e-06, + "loss": 0.5156, + "step": 3297 + }, + { + "epoch": 0.5416213331143637, + "grad_norm": 0.2874868127529915, + "learning_rate": 8.782139181002598e-06, + "loss": 0.5347, + "step": 3298 + }, + { + "epoch": 0.5417855603227064, + "grad_norm": 0.30620036288558916, + "learning_rate": 8.781995004541866e-06, + "loss": 0.5562, + "step": 3299 + }, + { + "epoch": 0.5419497875310492, + "grad_norm": 0.3328961656461178, + "learning_rate": 8.781850781574458e-06, + "loss": 0.5402, + "step": 3300 + }, + { + "epoch": 0.5421140147393919, + "grad_norm": 0.4321499881343302, + "learning_rate": 8.781706512101936e-06, + "loss": 0.5424, + "step": 3301 + }, + { + "epoch": 0.5422782419477347, + "grad_norm": 0.2959437492978648, + "learning_rate": 8.781562196125868e-06, + "loss": 0.5616, + "step": 3302 + }, + { + "epoch": 0.5424424691560774, + "grad_norm": 0.34091956215548647, + "learning_rate": 8.781417833647823e-06, + "loss": 0.533, + "step": 3303 + }, + { + "epoch": 0.5426066963644202, + "grad_norm": 0.2969866679071415, + "learning_rate": 8.781273424669368e-06, + "loss": 0.5295, + "step": 3304 + }, + { + "epoch": 0.5427709235727629, + "grad_norm": 0.33197289823642345, + "learning_rate": 8.78112896919207e-06, + "loss": 0.5319, + "step": 3305 + }, + { + "epoch": 0.5429351507811057, + "grad_norm": 0.30111409556440877, + "learning_rate": 8.780984467217503e-06, + "loss": 0.5497, + "step": 3306 + }, + { + "epoch": 0.5430993779894484, + "grad_norm": 0.3166579827962453, + "learning_rate": 8.78083991874723e-06, + "loss": 0.5573, + "step": 3307 + }, + { + "epoch": 0.5432636051977912, + "grad_norm": 0.39090836439548915, + "learning_rate": 8.780695323782823e-06, + "loss": 0.5588, + "step": 3308 + }, + { + "epoch": 0.5434278324061339, + "grad_norm": 0.41882139487374753, + "learning_rate": 8.780550682325853e-06, + "loss": 0.5407, + "step": 3309 + }, + { + "epoch": 0.5435920596144767, + "grad_norm": 0.351388516687854, + "learning_rate": 8.780405994377893e-06, + "loss": 0.561, + "step": 3310 + }, + { + "epoch": 0.5437562868228194, + "grad_norm": 0.5146337082689596, + "learning_rate": 8.780261259940511e-06, + "loss": 0.5512, + "step": 3311 + }, + { + "epoch": 0.5439205140311622, + "grad_norm": 0.3632997394671109, + "learning_rate": 8.780116479015283e-06, + "loss": 0.5423, + "step": 3312 + }, + { + "epoch": 0.5440847412395049, + "grad_norm": 0.31828930879474143, + "learning_rate": 8.779971651603776e-06, + "loss": 0.5395, + "step": 3313 + }, + { + "epoch": 0.5442489684478476, + "grad_norm": 0.3052865887699414, + "learning_rate": 8.779826777707568e-06, + "loss": 0.5373, + "step": 3314 + }, + { + "epoch": 0.5444131956561903, + "grad_norm": 0.28649306849827694, + "learning_rate": 8.77968185732823e-06, + "loss": 0.5406, + "step": 3315 + }, + { + "epoch": 0.544577422864533, + "grad_norm": 0.3262919874735506, + "learning_rate": 8.779536890467336e-06, + "loss": 0.5552, + "step": 3316 + }, + { + "epoch": 0.5447416500728758, + "grad_norm": 0.28064816936119535, + "learning_rate": 8.77939187712646e-06, + "loss": 0.5448, + "step": 3317 + }, + { + "epoch": 0.5449058772812185, + "grad_norm": 0.2905610645984247, + "learning_rate": 8.77924681730718e-06, + "loss": 0.5514, + "step": 3318 + }, + { + "epoch": 0.5450701044895613, + "grad_norm": 0.3582455870421614, + "learning_rate": 8.779101711011067e-06, + "loss": 0.5543, + "step": 3319 + }, + { + "epoch": 0.545234331697904, + "grad_norm": 0.2956549075881862, + "learning_rate": 8.7789565582397e-06, + "loss": 0.528, + "step": 3320 + }, + { + "epoch": 0.5453985589062468, + "grad_norm": 0.32122379476319896, + "learning_rate": 8.778811358994655e-06, + "loss": 0.5133, + "step": 3321 + }, + { + "epoch": 0.5455627861145895, + "grad_norm": 0.2971563406062923, + "learning_rate": 8.77866611327751e-06, + "loss": 0.5245, + "step": 3322 + }, + { + "epoch": 0.5457270133229323, + "grad_norm": 0.3102675621709384, + "learning_rate": 8.77852082108984e-06, + "loss": 0.538, + "step": 3323 + }, + { + "epoch": 0.545891240531275, + "grad_norm": 0.2870978359981991, + "learning_rate": 8.778375482433226e-06, + "loss": 0.5384, + "step": 3324 + }, + { + "epoch": 0.5460554677396178, + "grad_norm": 0.2900289483988109, + "learning_rate": 8.778230097309243e-06, + "loss": 0.542, + "step": 3325 + }, + { + "epoch": 0.5462196949479605, + "grad_norm": 0.32092240664718624, + "learning_rate": 8.778084665719473e-06, + "loss": 0.5591, + "step": 3326 + }, + { + "epoch": 0.5463839221563033, + "grad_norm": 0.3126772582717247, + "learning_rate": 8.777939187665495e-06, + "loss": 0.5469, + "step": 3327 + }, + { + "epoch": 0.546548149364646, + "grad_norm": 0.3857668440217911, + "learning_rate": 8.777793663148888e-06, + "loss": 0.5633, + "step": 3328 + }, + { + "epoch": 0.5467123765729888, + "grad_norm": 0.28195626253107636, + "learning_rate": 8.777648092171232e-06, + "loss": 0.5439, + "step": 3329 + }, + { + "epoch": 0.5468766037813315, + "grad_norm": 0.36064556386630914, + "learning_rate": 8.777502474734109e-06, + "loss": 0.5489, + "step": 3330 + }, + { + "epoch": 0.5470408309896743, + "grad_norm": 0.33236569642633484, + "learning_rate": 8.777356810839102e-06, + "loss": 0.5251, + "step": 3331 + }, + { + "epoch": 0.5472050581980169, + "grad_norm": 0.5230949434342357, + "learning_rate": 8.77721110048779e-06, + "loss": 0.5652, + "step": 3332 + }, + { + "epoch": 0.5473692854063597, + "grad_norm": 0.2952566421849118, + "learning_rate": 8.77706534368176e-06, + "loss": 0.5377, + "step": 3333 + }, + { + "epoch": 0.5475335126147024, + "grad_norm": 0.35344945699330593, + "learning_rate": 8.77691954042259e-06, + "loss": 0.5225, + "step": 3334 + }, + { + "epoch": 0.5476977398230451, + "grad_norm": 0.32436302971883846, + "learning_rate": 8.776773690711866e-06, + "loss": 0.5521, + "step": 3335 + }, + { + "epoch": 0.5478619670313879, + "grad_norm": 0.30968709714872467, + "learning_rate": 8.776627794551174e-06, + "loss": 0.5473, + "step": 3336 + }, + { + "epoch": 0.5480261942397306, + "grad_norm": 0.292355032883547, + "learning_rate": 8.776481851942094e-06, + "loss": 0.5296, + "step": 3337 + }, + { + "epoch": 0.5481904214480734, + "grad_norm": 0.30650548477899137, + "learning_rate": 8.776335862886216e-06, + "loss": 0.5486, + "step": 3338 + }, + { + "epoch": 0.5483546486564161, + "grad_norm": 0.30828921012684446, + "learning_rate": 8.776189827385121e-06, + "loss": 0.5424, + "step": 3339 + }, + { + "epoch": 0.5485188758647589, + "grad_norm": 0.30623149799210697, + "learning_rate": 8.776043745440398e-06, + "loss": 0.5563, + "step": 3340 + }, + { + "epoch": 0.5486831030731016, + "grad_norm": 0.320878900454465, + "learning_rate": 8.775897617053633e-06, + "loss": 0.5434, + "step": 3341 + }, + { + "epoch": 0.5488473302814444, + "grad_norm": 0.3160907340766199, + "learning_rate": 8.775751442226412e-06, + "loss": 0.5266, + "step": 3342 + }, + { + "epoch": 0.5490115574897871, + "grad_norm": 0.2940009688601457, + "learning_rate": 8.775605220960325e-06, + "loss": 0.5591, + "step": 3343 + }, + { + "epoch": 0.5491757846981299, + "grad_norm": 0.2756229954361331, + "learning_rate": 8.775458953256958e-06, + "loss": 0.5422, + "step": 3344 + }, + { + "epoch": 0.5493400119064726, + "grad_norm": 0.4017540228900945, + "learning_rate": 8.7753126391179e-06, + "loss": 0.5378, + "step": 3345 + }, + { + "epoch": 0.5495042391148154, + "grad_norm": 0.30553436345975726, + "learning_rate": 8.775166278544742e-06, + "loss": 0.5207, + "step": 3346 + }, + { + "epoch": 0.5496684663231581, + "grad_norm": 0.3855818478613099, + "learning_rate": 8.775019871539071e-06, + "loss": 0.5377, + "step": 3347 + }, + { + "epoch": 0.5498326935315009, + "grad_norm": 0.26781930874015525, + "learning_rate": 8.774873418102477e-06, + "loss": 0.5154, + "step": 3348 + }, + { + "epoch": 0.5499969207398435, + "grad_norm": 0.5037312157429292, + "learning_rate": 8.774726918236553e-06, + "loss": 0.5387, + "step": 3349 + }, + { + "epoch": 0.5501611479481863, + "grad_norm": 0.37168947952148945, + "learning_rate": 8.774580371942888e-06, + "loss": 0.5503, + "step": 3350 + }, + { + "epoch": 0.550325375156529, + "grad_norm": 0.33315012217900974, + "learning_rate": 8.774433779223076e-06, + "loss": 0.5407, + "step": 3351 + }, + { + "epoch": 0.5504896023648718, + "grad_norm": 0.3247936341233711, + "learning_rate": 8.774287140078708e-06, + "loss": 0.5522, + "step": 3352 + }, + { + "epoch": 0.5506538295732145, + "grad_norm": 0.4965710840359514, + "learning_rate": 8.774140454511375e-06, + "loss": 0.5416, + "step": 3353 + }, + { + "epoch": 0.5508180567815573, + "grad_norm": 0.33188955600441644, + "learning_rate": 8.773993722522672e-06, + "loss": 0.5555, + "step": 3354 + }, + { + "epoch": 0.5509822839899, + "grad_norm": 0.418317451310977, + "learning_rate": 8.773846944114192e-06, + "loss": 0.5425, + "step": 3355 + }, + { + "epoch": 0.5511465111982428, + "grad_norm": 0.2825039647828183, + "learning_rate": 8.773700119287528e-06, + "loss": 0.5707, + "step": 3356 + }, + { + "epoch": 0.5513107384065855, + "grad_norm": 0.3538687145304638, + "learning_rate": 8.773553248044278e-06, + "loss": 0.5685, + "step": 3357 + }, + { + "epoch": 0.5514749656149283, + "grad_norm": 0.42920655053335216, + "learning_rate": 8.773406330386034e-06, + "loss": 0.5295, + "step": 3358 + }, + { + "epoch": 0.551639192823271, + "grad_norm": 0.32412456816316443, + "learning_rate": 8.773259366314393e-06, + "loss": 0.5574, + "step": 3359 + }, + { + "epoch": 0.5518034200316138, + "grad_norm": 0.2797144018953076, + "learning_rate": 8.77311235583095e-06, + "loss": 0.5669, + "step": 3360 + }, + { + "epoch": 0.5519676472399565, + "grad_norm": 0.37403853631233064, + "learning_rate": 8.772965298937305e-06, + "loss": 0.5285, + "step": 3361 + }, + { + "epoch": 0.5521318744482993, + "grad_norm": 0.28490995583623363, + "learning_rate": 8.772818195635052e-06, + "loss": 0.5413, + "step": 3362 + }, + { + "epoch": 0.552296101656642, + "grad_norm": 0.32396743410008255, + "learning_rate": 8.772671045925788e-06, + "loss": 0.5585, + "step": 3363 + }, + { + "epoch": 0.5524603288649848, + "grad_norm": 0.3075785289042439, + "learning_rate": 8.772523849811114e-06, + "loss": 0.5535, + "step": 3364 + }, + { + "epoch": 0.5526245560733275, + "grad_norm": 0.4027775951653619, + "learning_rate": 8.772376607292627e-06, + "loss": 0.55, + "step": 3365 + }, + { + "epoch": 0.5527887832816701, + "grad_norm": 0.385492345624032, + "learning_rate": 8.772229318371927e-06, + "loss": 0.5506, + "step": 3366 + }, + { + "epoch": 0.5529530104900129, + "grad_norm": 0.33761364893706824, + "learning_rate": 8.772081983050611e-06, + "loss": 0.5321, + "step": 3367 + }, + { + "epoch": 0.5531172376983556, + "grad_norm": 0.3403673283677389, + "learning_rate": 8.771934601330285e-06, + "loss": 0.5451, + "step": 3368 + }, + { + "epoch": 0.5532814649066984, + "grad_norm": 0.35126406168503904, + "learning_rate": 8.771787173212545e-06, + "loss": 0.5395, + "step": 3369 + }, + { + "epoch": 0.5534456921150411, + "grad_norm": 0.36250462085917223, + "learning_rate": 8.771639698698993e-06, + "loss": 0.5538, + "step": 3370 + }, + { + "epoch": 0.5536099193233839, + "grad_norm": 0.28299198501272577, + "learning_rate": 8.77149217779123e-06, + "loss": 0.5542, + "step": 3371 + }, + { + "epoch": 0.5537741465317266, + "grad_norm": 0.3021177801777963, + "learning_rate": 8.77134461049086e-06, + "loss": 0.5455, + "step": 3372 + }, + { + "epoch": 0.5539383737400694, + "grad_norm": 0.33274453416677413, + "learning_rate": 8.771196996799486e-06, + "loss": 0.5501, + "step": 3373 + }, + { + "epoch": 0.5541026009484121, + "grad_norm": 0.31709314075267925, + "learning_rate": 8.77104933671871e-06, + "loss": 0.5319, + "step": 3374 + }, + { + "epoch": 0.5542668281567549, + "grad_norm": 0.38946982014827064, + "learning_rate": 8.770901630250137e-06, + "loss": 0.5438, + "step": 3375 + }, + { + "epoch": 0.5544310553650976, + "grad_norm": 0.3106343141375796, + "learning_rate": 8.77075387739537e-06, + "loss": 0.537, + "step": 3376 + }, + { + "epoch": 0.5545952825734404, + "grad_norm": 0.40028112123615217, + "learning_rate": 8.770606078156013e-06, + "loss": 0.5553, + "step": 3377 + }, + { + "epoch": 0.5547595097817831, + "grad_norm": 0.3187911975290353, + "learning_rate": 8.770458232533672e-06, + "loss": 0.5554, + "step": 3378 + }, + { + "epoch": 0.5549237369901259, + "grad_norm": 0.3378804598434121, + "learning_rate": 8.770310340529954e-06, + "loss": 0.5533, + "step": 3379 + }, + { + "epoch": 0.5550879641984686, + "grad_norm": 0.3401695168568818, + "learning_rate": 8.770162402146465e-06, + "loss": 0.5408, + "step": 3380 + }, + { + "epoch": 0.5552521914068114, + "grad_norm": 0.29459484154398957, + "learning_rate": 8.77001441738481e-06, + "loss": 0.5198, + "step": 3381 + }, + { + "epoch": 0.5554164186151541, + "grad_norm": 0.3459913273509143, + "learning_rate": 8.769866386246596e-06, + "loss": 0.5393, + "step": 3382 + }, + { + "epoch": 0.5555806458234968, + "grad_norm": 0.3211088301865988, + "learning_rate": 8.769718308733434e-06, + "loss": 0.5494, + "step": 3383 + }, + { + "epoch": 0.5557448730318395, + "grad_norm": 0.3254018054944106, + "learning_rate": 8.769570184846929e-06, + "loss": 0.5429, + "step": 3384 + }, + { + "epoch": 0.5559091002401823, + "grad_norm": 0.3114123751834169, + "learning_rate": 8.769422014588692e-06, + "loss": 0.5479, + "step": 3385 + }, + { + "epoch": 0.556073327448525, + "grad_norm": 0.3236795283927829, + "learning_rate": 8.769273797960331e-06, + "loss": 0.5375, + "step": 3386 + }, + { + "epoch": 0.5562375546568677, + "grad_norm": 0.4880689140211738, + "learning_rate": 8.769125534963457e-06, + "loss": 0.5387, + "step": 3387 + }, + { + "epoch": 0.5564017818652105, + "grad_norm": 0.31117332262719577, + "learning_rate": 8.768977225599679e-06, + "loss": 0.5455, + "step": 3388 + }, + { + "epoch": 0.5565660090735532, + "grad_norm": 0.2786661939353369, + "learning_rate": 8.768828869870609e-06, + "loss": 0.5511, + "step": 3389 + }, + { + "epoch": 0.556730236281896, + "grad_norm": 0.31635464932598195, + "learning_rate": 8.768680467777857e-06, + "loss": 0.5336, + "step": 3390 + }, + { + "epoch": 0.5568944634902387, + "grad_norm": 0.29765570525059454, + "learning_rate": 8.768532019323034e-06, + "loss": 0.5561, + "step": 3391 + }, + { + "epoch": 0.5570586906985815, + "grad_norm": 0.3243154243746134, + "learning_rate": 8.768383524507754e-06, + "loss": 0.5381, + "step": 3392 + }, + { + "epoch": 0.5572229179069242, + "grad_norm": 0.2893446275664096, + "learning_rate": 8.76823498333363e-06, + "loss": 0.5556, + "step": 3393 + }, + { + "epoch": 0.557387145115267, + "grad_norm": 0.3273044449065161, + "learning_rate": 8.768086395802274e-06, + "loss": 0.5476, + "step": 3394 + }, + { + "epoch": 0.5575513723236097, + "grad_norm": 0.3267822607622496, + "learning_rate": 8.767937761915302e-06, + "loss": 0.5467, + "step": 3395 + }, + { + "epoch": 0.5577155995319525, + "grad_norm": 0.3321830374515757, + "learning_rate": 8.767789081674324e-06, + "loss": 0.5442, + "step": 3396 + }, + { + "epoch": 0.5578798267402952, + "grad_norm": 0.361249384322939, + "learning_rate": 8.767640355080962e-06, + "loss": 0.5374, + "step": 3397 + }, + { + "epoch": 0.558044053948638, + "grad_norm": 0.3154351904554402, + "learning_rate": 8.767491582136823e-06, + "loss": 0.536, + "step": 3398 + }, + { + "epoch": 0.5582082811569807, + "grad_norm": 0.2980191259728029, + "learning_rate": 8.767342762843529e-06, + "loss": 0.5449, + "step": 3399 + }, + { + "epoch": 0.5583725083653234, + "grad_norm": 0.39564078503411465, + "learning_rate": 8.767193897202692e-06, + "loss": 0.5377, + "step": 3400 + }, + { + "epoch": 0.5585367355736661, + "grad_norm": 0.3517990839488776, + "learning_rate": 8.767044985215933e-06, + "loss": 0.5404, + "step": 3401 + }, + { + "epoch": 0.5587009627820089, + "grad_norm": 0.4155353751005951, + "learning_rate": 8.766896026884868e-06, + "loss": 0.5333, + "step": 3402 + }, + { + "epoch": 0.5588651899903516, + "grad_norm": 0.3017636592925083, + "learning_rate": 8.766747022211112e-06, + "loss": 0.5676, + "step": 3403 + }, + { + "epoch": 0.5590294171986944, + "grad_norm": 0.33995408666281135, + "learning_rate": 8.766597971196285e-06, + "loss": 0.5374, + "step": 3404 + }, + { + "epoch": 0.5591936444070371, + "grad_norm": 0.2795836141353966, + "learning_rate": 8.766448873842009e-06, + "loss": 0.5438, + "step": 3405 + }, + { + "epoch": 0.5593578716153799, + "grad_norm": 0.35753173927761156, + "learning_rate": 8.766299730149898e-06, + "loss": 0.5552, + "step": 3406 + }, + { + "epoch": 0.5595220988237226, + "grad_norm": 0.3248618335926127, + "learning_rate": 8.766150540121576e-06, + "loss": 0.532, + "step": 3407 + }, + { + "epoch": 0.5596863260320654, + "grad_norm": 0.36621704636882757, + "learning_rate": 8.766001303758661e-06, + "loss": 0.5654, + "step": 3408 + }, + { + "epoch": 0.5598505532404081, + "grad_norm": 0.3491811217207003, + "learning_rate": 8.765852021062774e-06, + "loss": 0.5283, + "step": 3409 + }, + { + "epoch": 0.5600147804487509, + "grad_norm": 0.3545037992141468, + "learning_rate": 8.765702692035539e-06, + "loss": 0.548, + "step": 3410 + }, + { + "epoch": 0.5601790076570936, + "grad_norm": 0.2763757716992341, + "learning_rate": 8.765553316678574e-06, + "loss": 0.5585, + "step": 3411 + }, + { + "epoch": 0.5603432348654364, + "grad_norm": 0.30894970057288423, + "learning_rate": 8.765403894993503e-06, + "loss": 0.5697, + "step": 3412 + }, + { + "epoch": 0.5605074620737791, + "grad_norm": 0.30395462426760694, + "learning_rate": 8.765254426981951e-06, + "loss": 0.5392, + "step": 3413 + }, + { + "epoch": 0.5606716892821219, + "grad_norm": 0.2938324142755099, + "learning_rate": 8.765104912645538e-06, + "loss": 0.5445, + "step": 3414 + }, + { + "epoch": 0.5608359164904646, + "grad_norm": 0.33410474841600196, + "learning_rate": 8.76495535198589e-06, + "loss": 0.533, + "step": 3415 + }, + { + "epoch": 0.5610001436988074, + "grad_norm": 0.3183614582303699, + "learning_rate": 8.764805745004632e-06, + "loss": 0.5414, + "step": 3416 + }, + { + "epoch": 0.56116437090715, + "grad_norm": 0.3107793870235241, + "learning_rate": 8.764656091703386e-06, + "loss": 0.5563, + "step": 3417 + }, + { + "epoch": 0.5613285981154927, + "grad_norm": 0.28256412488593424, + "learning_rate": 8.76450639208378e-06, + "loss": 0.5213, + "step": 3418 + }, + { + "epoch": 0.5614928253238355, + "grad_norm": 0.3291646446986307, + "learning_rate": 8.764356646147437e-06, + "loss": 0.5298, + "step": 3419 + }, + { + "epoch": 0.5616570525321782, + "grad_norm": 0.29015251711779344, + "learning_rate": 8.764206853895987e-06, + "loss": 0.5451, + "step": 3420 + }, + { + "epoch": 0.561821279740521, + "grad_norm": 0.3669840500783633, + "learning_rate": 8.764057015331054e-06, + "loss": 0.5599, + "step": 3421 + }, + { + "epoch": 0.5619855069488637, + "grad_norm": 0.39324670708032294, + "learning_rate": 8.763907130454267e-06, + "loss": 0.5375, + "step": 3422 + }, + { + "epoch": 0.5621497341572065, + "grad_norm": 0.31065568764508283, + "learning_rate": 8.763757199267253e-06, + "loss": 0.5408, + "step": 3423 + }, + { + "epoch": 0.5623139613655492, + "grad_norm": 0.3830704592320809, + "learning_rate": 8.763607221771643e-06, + "loss": 0.5215, + "step": 3424 + }, + { + "epoch": 0.562478188573892, + "grad_norm": 0.38348921195787256, + "learning_rate": 8.763457197969061e-06, + "loss": 0.5264, + "step": 3425 + }, + { + "epoch": 0.5626424157822347, + "grad_norm": 0.37148155697446655, + "learning_rate": 8.76330712786114e-06, + "loss": 0.5582, + "step": 3426 + }, + { + "epoch": 0.5628066429905775, + "grad_norm": 0.4273592131591784, + "learning_rate": 8.76315701144951e-06, + "loss": 0.5652, + "step": 3427 + }, + { + "epoch": 0.5629708701989202, + "grad_norm": 0.36466575092615616, + "learning_rate": 8.7630068487358e-06, + "loss": 0.5272, + "step": 3428 + }, + { + "epoch": 0.563135097407263, + "grad_norm": 0.3627460014212349, + "learning_rate": 8.762856639721642e-06, + "loss": 0.5487, + "step": 3429 + }, + { + "epoch": 0.5632993246156057, + "grad_norm": 0.8673504397637322, + "learning_rate": 8.762706384408666e-06, + "loss": 0.5456, + "step": 3430 + }, + { + "epoch": 0.5634635518239485, + "grad_norm": 0.32618874303521883, + "learning_rate": 8.762556082798505e-06, + "loss": 0.5374, + "step": 3431 + }, + { + "epoch": 0.5636277790322912, + "grad_norm": 0.3913592007145045, + "learning_rate": 8.76240573489279e-06, + "loss": 0.5377, + "step": 3432 + }, + { + "epoch": 0.563792006240634, + "grad_norm": 0.34399391895825815, + "learning_rate": 8.762255340693156e-06, + "loss": 0.5405, + "step": 3433 + }, + { + "epoch": 0.5639562334489766, + "grad_norm": 0.307384899591098, + "learning_rate": 8.762104900201235e-06, + "loss": 0.5346, + "step": 3434 + }, + { + "epoch": 0.5641204606573194, + "grad_norm": 0.2968959663698474, + "learning_rate": 8.761954413418663e-06, + "loss": 0.5568, + "step": 3435 + }, + { + "epoch": 0.5642846878656621, + "grad_norm": 0.3949150892211441, + "learning_rate": 8.761803880347073e-06, + "loss": 0.5486, + "step": 3436 + }, + { + "epoch": 0.5644489150740049, + "grad_norm": 0.33926890627687606, + "learning_rate": 8.761653300988097e-06, + "loss": 0.5197, + "step": 3437 + }, + { + "epoch": 0.5646131422823476, + "grad_norm": 0.3172017138331228, + "learning_rate": 8.761502675343375e-06, + "loss": 0.5471, + "step": 3438 + }, + { + "epoch": 0.5647773694906904, + "grad_norm": 0.418356801651984, + "learning_rate": 8.761352003414541e-06, + "loss": 0.5321, + "step": 3439 + }, + { + "epoch": 0.5649415966990331, + "grad_norm": 0.3079378893504073, + "learning_rate": 8.761201285203232e-06, + "loss": 0.5321, + "step": 3440 + }, + { + "epoch": 0.5651058239073758, + "grad_norm": 0.37210351753249876, + "learning_rate": 8.761050520711083e-06, + "loss": 0.5329, + "step": 3441 + }, + { + "epoch": 0.5652700511157186, + "grad_norm": 0.2805648721097993, + "learning_rate": 8.760899709939735e-06, + "loss": 0.5186, + "step": 3442 + }, + { + "epoch": 0.5654342783240613, + "grad_norm": 0.47212938527808185, + "learning_rate": 8.760748852890824e-06, + "loss": 0.5474, + "step": 3443 + }, + { + "epoch": 0.5655985055324041, + "grad_norm": 0.342366746899832, + "learning_rate": 8.760597949565988e-06, + "loss": 0.5356, + "step": 3444 + }, + { + "epoch": 0.5657627327407468, + "grad_norm": 0.30910313660004984, + "learning_rate": 8.760446999966866e-06, + "loss": 0.5389, + "step": 3445 + }, + { + "epoch": 0.5659269599490896, + "grad_norm": 0.3183907922282123, + "learning_rate": 8.760296004095098e-06, + "loss": 0.526, + "step": 3446 + }, + { + "epoch": 0.5660911871574323, + "grad_norm": 0.3900742021203354, + "learning_rate": 8.760144961952324e-06, + "loss": 0.538, + "step": 3447 + }, + { + "epoch": 0.5662554143657751, + "grad_norm": 0.4494570545768727, + "learning_rate": 8.759993873540184e-06, + "loss": 0.5409, + "step": 3448 + }, + { + "epoch": 0.5664196415741178, + "grad_norm": 0.3344372117969043, + "learning_rate": 8.75984273886032e-06, + "loss": 0.5416, + "step": 3449 + }, + { + "epoch": 0.5665838687824606, + "grad_norm": 0.28815799555494404, + "learning_rate": 8.759691557914372e-06, + "loss": 0.5321, + "step": 3450 + }, + { + "epoch": 0.5667480959908032, + "grad_norm": 0.3427716429030344, + "learning_rate": 8.759540330703983e-06, + "loss": 0.5218, + "step": 3451 + }, + { + "epoch": 0.566912323199146, + "grad_norm": 0.3535468139773774, + "learning_rate": 8.759389057230795e-06, + "loss": 0.5311, + "step": 3452 + }, + { + "epoch": 0.5670765504074887, + "grad_norm": 0.37238689749646064, + "learning_rate": 8.759237737496451e-06, + "loss": 0.5626, + "step": 3453 + }, + { + "epoch": 0.5672407776158315, + "grad_norm": 0.4913391693392319, + "learning_rate": 8.759086371502595e-06, + "loss": 0.5505, + "step": 3454 + }, + { + "epoch": 0.5674050048241742, + "grad_norm": 0.37181041963439404, + "learning_rate": 8.75893495925087e-06, + "loss": 0.5414, + "step": 3455 + }, + { + "epoch": 0.567569232032517, + "grad_norm": 0.31509365349628793, + "learning_rate": 8.758783500742922e-06, + "loss": 0.55, + "step": 3456 + }, + { + "epoch": 0.5677334592408597, + "grad_norm": 0.3368764395024524, + "learning_rate": 8.758631995980395e-06, + "loss": 0.5384, + "step": 3457 + }, + { + "epoch": 0.5678976864492025, + "grad_norm": 0.42083886653211133, + "learning_rate": 8.758480444964933e-06, + "loss": 0.55, + "step": 3458 + }, + { + "epoch": 0.5680619136575452, + "grad_norm": 0.30635925898925725, + "learning_rate": 8.758328847698185e-06, + "loss": 0.5211, + "step": 3459 + }, + { + "epoch": 0.568226140865888, + "grad_norm": 0.31708343966830854, + "learning_rate": 8.758177204181797e-06, + "loss": 0.5579, + "step": 3460 + }, + { + "epoch": 0.5683903680742307, + "grad_norm": 0.36409414468119855, + "learning_rate": 8.758025514417415e-06, + "loss": 0.5375, + "step": 3461 + }, + { + "epoch": 0.5685545952825735, + "grad_norm": 0.3112449064143783, + "learning_rate": 8.757873778406686e-06, + "loss": 0.5374, + "step": 3462 + }, + { + "epoch": 0.5687188224909162, + "grad_norm": 0.3608556254888521, + "learning_rate": 8.757721996151258e-06, + "loss": 0.5528, + "step": 3463 + }, + { + "epoch": 0.568883049699259, + "grad_norm": 0.3114835902741878, + "learning_rate": 8.757570167652781e-06, + "loss": 0.5478, + "step": 3464 + }, + { + "epoch": 0.5690472769076017, + "grad_norm": 0.4473767595398761, + "learning_rate": 8.757418292912902e-06, + "loss": 0.5407, + "step": 3465 + }, + { + "epoch": 0.5692115041159445, + "grad_norm": 0.30253085478675174, + "learning_rate": 8.757266371933272e-06, + "loss": 0.5603, + "step": 3466 + }, + { + "epoch": 0.5693757313242872, + "grad_norm": 0.3115092905114874, + "learning_rate": 8.75711440471554e-06, + "loss": 0.5284, + "step": 3467 + }, + { + "epoch": 0.5695399585326298, + "grad_norm": 0.30839473910587395, + "learning_rate": 8.756962391261358e-06, + "loss": 0.5443, + "step": 3468 + }, + { + "epoch": 0.5697041857409726, + "grad_norm": 0.28102093112639204, + "learning_rate": 8.756810331572375e-06, + "loss": 0.5476, + "step": 3469 + }, + { + "epoch": 0.5698684129493153, + "grad_norm": 0.3302176543184028, + "learning_rate": 8.756658225650245e-06, + "loss": 0.5431, + "step": 3470 + }, + { + "epoch": 0.5700326401576581, + "grad_norm": 0.3365665104771197, + "learning_rate": 8.75650607349662e-06, + "loss": 0.5262, + "step": 3471 + }, + { + "epoch": 0.5701968673660008, + "grad_norm": 0.3067227588817041, + "learning_rate": 8.75635387511315e-06, + "loss": 0.5384, + "step": 3472 + }, + { + "epoch": 0.5703610945743436, + "grad_norm": 0.33218329012978515, + "learning_rate": 8.756201630501487e-06, + "loss": 0.5332, + "step": 3473 + }, + { + "epoch": 0.5705253217826863, + "grad_norm": 0.3713379909457561, + "learning_rate": 8.756049339663288e-06, + "loss": 0.5466, + "step": 3474 + }, + { + "epoch": 0.5706895489910291, + "grad_norm": 0.34441687097450313, + "learning_rate": 8.755897002600207e-06, + "loss": 0.5541, + "step": 3475 + }, + { + "epoch": 0.5708537761993718, + "grad_norm": 0.7228301451209881, + "learning_rate": 8.755744619313896e-06, + "loss": 0.5376, + "step": 3476 + }, + { + "epoch": 0.5710180034077146, + "grad_norm": 0.2999297342940657, + "learning_rate": 8.755592189806012e-06, + "loss": 0.5443, + "step": 3477 + }, + { + "epoch": 0.5711822306160573, + "grad_norm": 0.39362953316784804, + "learning_rate": 8.75543971407821e-06, + "loss": 0.535, + "step": 3478 + }, + { + "epoch": 0.5713464578244001, + "grad_norm": 0.4111607829748149, + "learning_rate": 8.755287192132145e-06, + "loss": 0.5611, + "step": 3479 + }, + { + "epoch": 0.5715106850327428, + "grad_norm": 0.29266563450602223, + "learning_rate": 8.755134623969477e-06, + "loss": 0.5634, + "step": 3480 + }, + { + "epoch": 0.5716749122410856, + "grad_norm": 0.32732512271978664, + "learning_rate": 8.75498200959186e-06, + "loss": 0.543, + "step": 3481 + }, + { + "epoch": 0.5718391394494283, + "grad_norm": 0.46875550296652735, + "learning_rate": 8.754829349000948e-06, + "loss": 0.5457, + "step": 3482 + }, + { + "epoch": 0.5720033666577711, + "grad_norm": 0.33783116396267615, + "learning_rate": 8.754676642198407e-06, + "loss": 0.526, + "step": 3483 + }, + { + "epoch": 0.5721675938661138, + "grad_norm": 0.5346132082744581, + "learning_rate": 8.75452388918589e-06, + "loss": 0.5329, + "step": 3484 + }, + { + "epoch": 0.5723318210744565, + "grad_norm": 0.2764576251732501, + "learning_rate": 8.754371089965058e-06, + "loss": 0.5519, + "step": 3485 + }, + { + "epoch": 0.5724960482827992, + "grad_norm": 0.2688202687009742, + "learning_rate": 8.75421824453757e-06, + "loss": 0.5399, + "step": 3486 + }, + { + "epoch": 0.572660275491142, + "grad_norm": 0.3093609334958058, + "learning_rate": 8.754065352905087e-06, + "loss": 0.5277, + "step": 3487 + }, + { + "epoch": 0.5728245026994847, + "grad_norm": 0.39119778064404526, + "learning_rate": 8.753912415069269e-06, + "loss": 0.5498, + "step": 3488 + }, + { + "epoch": 0.5729887299078275, + "grad_norm": 0.2955026812716169, + "learning_rate": 8.753759431031775e-06, + "loss": 0.5499, + "step": 3489 + }, + { + "epoch": 0.5731529571161702, + "grad_norm": 0.3113087610249926, + "learning_rate": 8.753606400794271e-06, + "loss": 0.5452, + "step": 3490 + }, + { + "epoch": 0.573317184324513, + "grad_norm": 0.3611344907617437, + "learning_rate": 8.753453324358416e-06, + "loss": 0.5348, + "step": 3491 + }, + { + "epoch": 0.5734814115328557, + "grad_norm": 0.3344023185058532, + "learning_rate": 8.753300201725872e-06, + "loss": 0.5429, + "step": 3492 + }, + { + "epoch": 0.5736456387411984, + "grad_norm": 0.304956411481048, + "learning_rate": 8.753147032898303e-06, + "loss": 0.5482, + "step": 3493 + }, + { + "epoch": 0.5738098659495412, + "grad_norm": 0.30165741270268126, + "learning_rate": 8.752993817877373e-06, + "loss": 0.5328, + "step": 3494 + }, + { + "epoch": 0.573974093157884, + "grad_norm": 0.33741706948502814, + "learning_rate": 8.752840556664747e-06, + "loss": 0.5189, + "step": 3495 + }, + { + "epoch": 0.5741383203662267, + "grad_norm": 0.3701100738767769, + "learning_rate": 8.752687249262087e-06, + "loss": 0.5623, + "step": 3496 + }, + { + "epoch": 0.5743025475745694, + "grad_norm": 0.3169045858154691, + "learning_rate": 8.75253389567106e-06, + "loss": 0.5277, + "step": 3497 + }, + { + "epoch": 0.5744667747829122, + "grad_norm": 0.3854837052550634, + "learning_rate": 8.75238049589333e-06, + "loss": 0.5461, + "step": 3498 + }, + { + "epoch": 0.5746310019912549, + "grad_norm": 0.35234654736307097, + "learning_rate": 8.752227049930566e-06, + "loss": 0.5478, + "step": 3499 + }, + { + "epoch": 0.5747952291995977, + "grad_norm": 0.43861512127403823, + "learning_rate": 8.75207355778443e-06, + "loss": 0.5158, + "step": 3500 + }, + { + "epoch": 0.5749594564079404, + "grad_norm": 0.5687892409034156, + "learning_rate": 8.751920019456594e-06, + "loss": 0.5426, + "step": 3501 + }, + { + "epoch": 0.5751236836162831, + "grad_norm": 0.321497927550389, + "learning_rate": 8.75176643494872e-06, + "loss": 0.5448, + "step": 3502 + }, + { + "epoch": 0.5752879108246258, + "grad_norm": 0.3619816521146462, + "learning_rate": 8.751612804262483e-06, + "loss": 0.539, + "step": 3503 + }, + { + "epoch": 0.5754521380329686, + "grad_norm": 0.32956518716772837, + "learning_rate": 8.751459127399548e-06, + "loss": 0.5492, + "step": 3504 + }, + { + "epoch": 0.5756163652413113, + "grad_norm": 0.3437166953073345, + "learning_rate": 8.751305404361582e-06, + "loss": 0.5374, + "step": 3505 + }, + { + "epoch": 0.5757805924496541, + "grad_norm": 0.32115500488296933, + "learning_rate": 8.751151635150255e-06, + "loss": 0.5278, + "step": 3506 + }, + { + "epoch": 0.5759448196579968, + "grad_norm": 0.347119267249423, + "learning_rate": 8.750997819767241e-06, + "loss": 0.5529, + "step": 3507 + }, + { + "epoch": 0.5761090468663396, + "grad_norm": 0.28964439064190384, + "learning_rate": 8.75084395821421e-06, + "loss": 0.5135, + "step": 3508 + }, + { + "epoch": 0.5762732740746823, + "grad_norm": 0.3267292561169078, + "learning_rate": 8.750690050492828e-06, + "loss": 0.5232, + "step": 3509 + }, + { + "epoch": 0.5764375012830251, + "grad_norm": 0.31301380438742404, + "learning_rate": 8.750536096604772e-06, + "loss": 0.5444, + "step": 3510 + }, + { + "epoch": 0.5766017284913678, + "grad_norm": 0.3211593964134133, + "learning_rate": 8.750382096551711e-06, + "loss": 0.5541, + "step": 3511 + }, + { + "epoch": 0.5767659556997106, + "grad_norm": 0.2826221728840829, + "learning_rate": 8.750228050335319e-06, + "loss": 0.5424, + "step": 3512 + }, + { + "epoch": 0.5769301829080533, + "grad_norm": 0.436341101746728, + "learning_rate": 8.750073957957269e-06, + "loss": 0.5366, + "step": 3513 + }, + { + "epoch": 0.5770944101163961, + "grad_norm": 0.38049927921661986, + "learning_rate": 8.749919819419234e-06, + "loss": 0.5241, + "step": 3514 + }, + { + "epoch": 0.5772586373247388, + "grad_norm": 0.32937459946163317, + "learning_rate": 8.749765634722889e-06, + "loss": 0.5361, + "step": 3515 + }, + { + "epoch": 0.5774228645330816, + "grad_norm": 0.3247301226127869, + "learning_rate": 8.749611403869907e-06, + "loss": 0.5296, + "step": 3516 + }, + { + "epoch": 0.5775870917414243, + "grad_norm": 0.3040428174782979, + "learning_rate": 8.749457126861965e-06, + "loss": 0.5287, + "step": 3517 + }, + { + "epoch": 0.5777513189497671, + "grad_norm": 0.33533069093525125, + "learning_rate": 8.749302803700735e-06, + "loss": 0.5494, + "step": 3518 + }, + { + "epoch": 0.5779155461581097, + "grad_norm": 0.3993043789739712, + "learning_rate": 8.7491484343879e-06, + "loss": 0.5533, + "step": 3519 + }, + { + "epoch": 0.5780797733664524, + "grad_norm": 0.37167067761157563, + "learning_rate": 8.748994018925129e-06, + "loss": 0.5457, + "step": 3520 + }, + { + "epoch": 0.5782440005747952, + "grad_norm": 0.29226128233424004, + "learning_rate": 8.748839557314105e-06, + "loss": 0.5426, + "step": 3521 + }, + { + "epoch": 0.5784082277831379, + "grad_norm": 0.3363682700498467, + "learning_rate": 8.748685049556502e-06, + "loss": 0.5322, + "step": 3522 + }, + { + "epoch": 0.5785724549914807, + "grad_norm": 0.6507199888257422, + "learning_rate": 8.748530495653999e-06, + "loss": 0.553, + "step": 3523 + }, + { + "epoch": 0.5787366821998234, + "grad_norm": 0.33296678939243896, + "learning_rate": 8.748375895608275e-06, + "loss": 0.5268, + "step": 3524 + }, + { + "epoch": 0.5789009094081662, + "grad_norm": 0.29283720540204305, + "learning_rate": 8.748221249421009e-06, + "loss": 0.5463, + "step": 3525 + }, + { + "epoch": 0.5790651366165089, + "grad_norm": 0.3050167584945107, + "learning_rate": 8.74806655709388e-06, + "loss": 0.549, + "step": 3526 + }, + { + "epoch": 0.5792293638248517, + "grad_norm": 0.34419849382255013, + "learning_rate": 8.74791181862857e-06, + "loss": 0.5378, + "step": 3527 + }, + { + "epoch": 0.5793935910331944, + "grad_norm": 0.3661112951777841, + "learning_rate": 8.747757034026757e-06, + "loss": 0.5397, + "step": 3528 + }, + { + "epoch": 0.5795578182415372, + "grad_norm": 0.29887150740963486, + "learning_rate": 8.747602203290124e-06, + "loss": 0.5328, + "step": 3529 + }, + { + "epoch": 0.5797220454498799, + "grad_norm": 0.46824862966054376, + "learning_rate": 8.747447326420352e-06, + "loss": 0.5405, + "step": 3530 + }, + { + "epoch": 0.5798862726582227, + "grad_norm": 0.29399612928502583, + "learning_rate": 8.747292403419123e-06, + "loss": 0.5435, + "step": 3531 + }, + { + "epoch": 0.5800504998665654, + "grad_norm": 0.310842296672758, + "learning_rate": 8.74713743428812e-06, + "loss": 0.5184, + "step": 3532 + }, + { + "epoch": 0.5802147270749082, + "grad_norm": 0.4061885724828876, + "learning_rate": 8.746982419029025e-06, + "loss": 0.5321, + "step": 3533 + }, + { + "epoch": 0.5803789542832509, + "grad_norm": 0.4215721203763714, + "learning_rate": 8.746827357643524e-06, + "loss": 0.5578, + "step": 3534 + }, + { + "epoch": 0.5805431814915937, + "grad_norm": 0.320833189661272, + "learning_rate": 8.746672250133299e-06, + "loss": 0.5444, + "step": 3535 + }, + { + "epoch": 0.5807074086999363, + "grad_norm": 0.30776465075514364, + "learning_rate": 8.746517096500034e-06, + "loss": 0.554, + "step": 3536 + }, + { + "epoch": 0.5808716359082791, + "grad_norm": 0.3897926875015399, + "learning_rate": 8.746361896745416e-06, + "loss": 0.5526, + "step": 3537 + }, + { + "epoch": 0.5810358631166218, + "grad_norm": 0.31542044643027906, + "learning_rate": 8.74620665087113e-06, + "loss": 0.5212, + "step": 3538 + }, + { + "epoch": 0.5812000903249646, + "grad_norm": 0.33704459885406324, + "learning_rate": 8.746051358878863e-06, + "loss": 0.5216, + "step": 3539 + }, + { + "epoch": 0.5813643175333073, + "grad_norm": 0.3362121370815365, + "learning_rate": 8.745896020770298e-06, + "loss": 0.5601, + "step": 3540 + }, + { + "epoch": 0.58152854474165, + "grad_norm": 0.39125110963458015, + "learning_rate": 8.745740636547128e-06, + "loss": 0.5317, + "step": 3541 + }, + { + "epoch": 0.5816927719499928, + "grad_norm": 0.30491752320771665, + "learning_rate": 8.745585206211037e-06, + "loss": 0.5494, + "step": 3542 + }, + { + "epoch": 0.5818569991583356, + "grad_norm": 0.4110951329926245, + "learning_rate": 8.745429729763711e-06, + "loss": 0.5618, + "step": 3543 + }, + { + "epoch": 0.5820212263666783, + "grad_norm": 0.343279992216792, + "learning_rate": 8.745274207206844e-06, + "loss": 0.5209, + "step": 3544 + }, + { + "epoch": 0.582185453575021, + "grad_norm": 0.32229992763426724, + "learning_rate": 8.745118638542121e-06, + "loss": 0.5143, + "step": 3545 + }, + { + "epoch": 0.5823496807833638, + "grad_norm": 0.41394397944438854, + "learning_rate": 8.744963023771233e-06, + "loss": 0.5621, + "step": 3546 + }, + { + "epoch": 0.5825139079917065, + "grad_norm": 0.4164012584557086, + "learning_rate": 8.74480736289587e-06, + "loss": 0.554, + "step": 3547 + }, + { + "epoch": 0.5826781352000493, + "grad_norm": 0.3226663377624633, + "learning_rate": 8.744651655917724e-06, + "loss": 0.549, + "step": 3548 + }, + { + "epoch": 0.582842362408392, + "grad_norm": 0.37795580482924596, + "learning_rate": 8.744495902838483e-06, + "loss": 0.5176, + "step": 3549 + }, + { + "epoch": 0.5830065896167348, + "grad_norm": 0.3317217876574988, + "learning_rate": 8.744340103659841e-06, + "loss": 0.5235, + "step": 3550 + }, + { + "epoch": 0.5831708168250775, + "grad_norm": 0.32555740709280667, + "learning_rate": 8.74418425838349e-06, + "loss": 0.5547, + "step": 3551 + }, + { + "epoch": 0.5833350440334203, + "grad_norm": 0.4069952879795932, + "learning_rate": 8.744028367011122e-06, + "loss": 0.5065, + "step": 3552 + }, + { + "epoch": 0.5834992712417629, + "grad_norm": 0.32360525266398305, + "learning_rate": 8.74387242954443e-06, + "loss": 0.5353, + "step": 3553 + }, + { + "epoch": 0.5836634984501057, + "grad_norm": 0.3948501021982254, + "learning_rate": 8.74371644598511e-06, + "loss": 0.5251, + "step": 3554 + }, + { + "epoch": 0.5838277256584484, + "grad_norm": 0.3584701735245281, + "learning_rate": 8.743560416334852e-06, + "loss": 0.5521, + "step": 3555 + }, + { + "epoch": 0.5839919528667912, + "grad_norm": 0.31069610623115584, + "learning_rate": 8.743404340595352e-06, + "loss": 0.5252, + "step": 3556 + }, + { + "epoch": 0.5841561800751339, + "grad_norm": 0.3453463165092735, + "learning_rate": 8.74324821876831e-06, + "loss": 0.5388, + "step": 3557 + }, + { + "epoch": 0.5843204072834767, + "grad_norm": 0.3017781549374568, + "learning_rate": 8.743092050855413e-06, + "loss": 0.5183, + "step": 3558 + }, + { + "epoch": 0.5844846344918194, + "grad_norm": 0.40474828712089234, + "learning_rate": 8.742935836858363e-06, + "loss": 0.5571, + "step": 3559 + }, + { + "epoch": 0.5846488617001622, + "grad_norm": 0.2922900982199023, + "learning_rate": 8.742779576778857e-06, + "loss": 0.5438, + "step": 3560 + }, + { + "epoch": 0.5848130889085049, + "grad_norm": 0.3191290542154945, + "learning_rate": 8.742623270618588e-06, + "loss": 0.4971, + "step": 3561 + }, + { + "epoch": 0.5849773161168477, + "grad_norm": 0.30540507514538917, + "learning_rate": 8.74246691837926e-06, + "loss": 0.5271, + "step": 3562 + }, + { + "epoch": 0.5851415433251904, + "grad_norm": 0.31493701820865183, + "learning_rate": 8.742310520062563e-06, + "loss": 0.5414, + "step": 3563 + }, + { + "epoch": 0.5853057705335332, + "grad_norm": 0.31388160368167506, + "learning_rate": 8.742154075670202e-06, + "loss": 0.5328, + "step": 3564 + }, + { + "epoch": 0.5854699977418759, + "grad_norm": 0.31964141957864006, + "learning_rate": 8.741997585203874e-06, + "loss": 0.5393, + "step": 3565 + }, + { + "epoch": 0.5856342249502187, + "grad_norm": 0.32701350425663833, + "learning_rate": 8.741841048665279e-06, + "loss": 0.5322, + "step": 3566 + }, + { + "epoch": 0.5857984521585614, + "grad_norm": 0.3203015136579183, + "learning_rate": 8.741684466056116e-06, + "loss": 0.5055, + "step": 3567 + }, + { + "epoch": 0.5859626793669042, + "grad_norm": 0.3208442953143098, + "learning_rate": 8.741527837378086e-06, + "loss": 0.5459, + "step": 3568 + }, + { + "epoch": 0.5861269065752469, + "grad_norm": 0.37630098391133593, + "learning_rate": 8.74137116263289e-06, + "loss": 0.5428, + "step": 3569 + }, + { + "epoch": 0.5862911337835895, + "grad_norm": 0.299680115481499, + "learning_rate": 8.741214441822231e-06, + "loss": 0.541, + "step": 3570 + }, + { + "epoch": 0.5864553609919323, + "grad_norm": 0.3896245426246468, + "learning_rate": 8.741057674947812e-06, + "loss": 0.5301, + "step": 3571 + }, + { + "epoch": 0.586619588200275, + "grad_norm": 0.28228508118203716, + "learning_rate": 8.740900862011332e-06, + "loss": 0.5237, + "step": 3572 + }, + { + "epoch": 0.5867838154086178, + "grad_norm": 0.26998847124009234, + "learning_rate": 8.740744003014498e-06, + "loss": 0.5622, + "step": 3573 + }, + { + "epoch": 0.5869480426169605, + "grad_norm": 0.31146496471387847, + "learning_rate": 8.74058709795901e-06, + "loss": 0.5439, + "step": 3574 + }, + { + "epoch": 0.5871122698253033, + "grad_norm": 0.35384476669187864, + "learning_rate": 8.740430146846576e-06, + "loss": 0.5403, + "step": 3575 + }, + { + "epoch": 0.587276497033646, + "grad_norm": 0.3270939587667193, + "learning_rate": 8.740273149678897e-06, + "loss": 0.53, + "step": 3576 + }, + { + "epoch": 0.5874407242419888, + "grad_norm": 0.29676668095238884, + "learning_rate": 8.74011610645768e-06, + "loss": 0.5135, + "step": 3577 + }, + { + "epoch": 0.5876049514503315, + "grad_norm": 0.28217328906910305, + "learning_rate": 8.739959017184629e-06, + "loss": 0.5421, + "step": 3578 + }, + { + "epoch": 0.5877691786586743, + "grad_norm": 0.26291304157258816, + "learning_rate": 8.739801881861453e-06, + "loss": 0.5278, + "step": 3579 + }, + { + "epoch": 0.587933405867017, + "grad_norm": 0.3132802190163169, + "learning_rate": 8.739644700489856e-06, + "loss": 0.5206, + "step": 3580 + }, + { + "epoch": 0.5880976330753598, + "grad_norm": 0.3025007721387032, + "learning_rate": 8.73948747307155e-06, + "loss": 0.532, + "step": 3581 + }, + { + "epoch": 0.5882618602837025, + "grad_norm": 0.26167893751892235, + "learning_rate": 8.739330199608235e-06, + "loss": 0.5501, + "step": 3582 + }, + { + "epoch": 0.5884260874920453, + "grad_norm": 0.2802592450047528, + "learning_rate": 8.739172880101624e-06, + "loss": 0.5373, + "step": 3583 + }, + { + "epoch": 0.588590314700388, + "grad_norm": 0.3227658244340501, + "learning_rate": 8.739015514553425e-06, + "loss": 0.5564, + "step": 3584 + }, + { + "epoch": 0.5887545419087308, + "grad_norm": 0.31112797368771505, + "learning_rate": 8.738858102965348e-06, + "loss": 0.5329, + "step": 3585 + }, + { + "epoch": 0.5889187691170735, + "grad_norm": 0.36689048311245726, + "learning_rate": 8.7387006453391e-06, + "loss": 0.534, + "step": 3586 + }, + { + "epoch": 0.5890829963254162, + "grad_norm": 0.30855317467713334, + "learning_rate": 8.738543141676393e-06, + "loss": 0.5394, + "step": 3587 + }, + { + "epoch": 0.5892472235337589, + "grad_norm": 0.35880633381663496, + "learning_rate": 8.738385591978936e-06, + "loss": 0.5273, + "step": 3588 + }, + { + "epoch": 0.5894114507421017, + "grad_norm": 0.3353793135243243, + "learning_rate": 8.738227996248444e-06, + "loss": 0.5352, + "step": 3589 + }, + { + "epoch": 0.5895756779504444, + "grad_norm": 0.29390223003626537, + "learning_rate": 8.738070354486626e-06, + "loss": 0.553, + "step": 3590 + }, + { + "epoch": 0.5897399051587872, + "grad_norm": 0.27791226239243794, + "learning_rate": 8.737912666695192e-06, + "loss": 0.5389, + "step": 3591 + }, + { + "epoch": 0.5899041323671299, + "grad_norm": 0.27245500324821437, + "learning_rate": 8.73775493287586e-06, + "loss": 0.5266, + "step": 3592 + }, + { + "epoch": 0.5900683595754727, + "grad_norm": 0.8524051814576923, + "learning_rate": 8.737597153030338e-06, + "loss": 0.5444, + "step": 3593 + }, + { + "epoch": 0.5902325867838154, + "grad_norm": 0.3002304556760417, + "learning_rate": 8.73743932716034e-06, + "loss": 0.5337, + "step": 3594 + }, + { + "epoch": 0.5903968139921582, + "grad_norm": 0.30605452136769806, + "learning_rate": 8.737281455267585e-06, + "loss": 0.5386, + "step": 3595 + }, + { + "epoch": 0.5905610412005009, + "grad_norm": 0.34386930237760743, + "learning_rate": 8.737123537353783e-06, + "loss": 0.5194, + "step": 3596 + }, + { + "epoch": 0.5907252684088437, + "grad_norm": 0.3312859202395492, + "learning_rate": 8.73696557342065e-06, + "loss": 0.5592, + "step": 3597 + }, + { + "epoch": 0.5908894956171864, + "grad_norm": 0.3658728879871337, + "learning_rate": 8.736807563469905e-06, + "loss": 0.5572, + "step": 3598 + }, + { + "epoch": 0.5910537228255291, + "grad_norm": 0.2881831918233398, + "learning_rate": 8.736649507503257e-06, + "loss": 0.5371, + "step": 3599 + }, + { + "epoch": 0.5912179500338719, + "grad_norm": 0.36645575351131854, + "learning_rate": 8.736491405522431e-06, + "loss": 0.5454, + "step": 3600 + }, + { + "epoch": 0.5913821772422146, + "grad_norm": 0.3179909063540978, + "learning_rate": 8.73633325752914e-06, + "loss": 0.551, + "step": 3601 + }, + { + "epoch": 0.5915464044505574, + "grad_norm": 0.38250970177758387, + "learning_rate": 8.7361750635251e-06, + "loss": 0.5431, + "step": 3602 + }, + { + "epoch": 0.5917106316589001, + "grad_norm": 0.2894129002667864, + "learning_rate": 8.736016823512031e-06, + "loss": 0.5438, + "step": 3603 + }, + { + "epoch": 0.5918748588672428, + "grad_norm": 0.2894750736546933, + "learning_rate": 8.735858537491652e-06, + "loss": 0.5393, + "step": 3604 + }, + { + "epoch": 0.5920390860755855, + "grad_norm": 0.3680922889819331, + "learning_rate": 8.735700205465683e-06, + "loss": 0.5356, + "step": 3605 + }, + { + "epoch": 0.5922033132839283, + "grad_norm": 0.3025193777662114, + "learning_rate": 8.73554182743584e-06, + "loss": 0.5447, + "step": 3606 + }, + { + "epoch": 0.592367540492271, + "grad_norm": 0.3136810875159733, + "learning_rate": 8.735383403403849e-06, + "loss": 0.5303, + "step": 3607 + }, + { + "epoch": 0.5925317677006138, + "grad_norm": 0.38311420078716235, + "learning_rate": 8.735224933371423e-06, + "loss": 0.5266, + "step": 3608 + }, + { + "epoch": 0.5926959949089565, + "grad_norm": 0.28233496253123336, + "learning_rate": 8.73506641734029e-06, + "loss": 0.5449, + "step": 3609 + }, + { + "epoch": 0.5928602221172993, + "grad_norm": 0.3227273047099139, + "learning_rate": 8.734907855312168e-06, + "loss": 0.5411, + "step": 3610 + }, + { + "epoch": 0.593024449325642, + "grad_norm": 0.27668525367867175, + "learning_rate": 8.734749247288782e-06, + "loss": 0.5377, + "step": 3611 + }, + { + "epoch": 0.5931886765339848, + "grad_norm": 0.28813590933697675, + "learning_rate": 8.734590593271851e-06, + "loss": 0.5364, + "step": 3612 + }, + { + "epoch": 0.5933529037423275, + "grad_norm": 0.3242787415516203, + "learning_rate": 8.7344318932631e-06, + "loss": 0.5472, + "step": 3613 + }, + { + "epoch": 0.5935171309506703, + "grad_norm": 0.30987179115108054, + "learning_rate": 8.734273147264252e-06, + "loss": 0.5264, + "step": 3614 + }, + { + "epoch": 0.593681358159013, + "grad_norm": 0.28439528277004245, + "learning_rate": 8.734114355277033e-06, + "loss": 0.5309, + "step": 3615 + }, + { + "epoch": 0.5938455853673558, + "grad_norm": 0.309183288887301, + "learning_rate": 8.733955517303165e-06, + "loss": 0.5451, + "step": 3616 + }, + { + "epoch": 0.5940098125756985, + "grad_norm": 0.2947685618065334, + "learning_rate": 8.733796633344375e-06, + "loss": 0.5292, + "step": 3617 + }, + { + "epoch": 0.5941740397840413, + "grad_norm": 0.3046893076201151, + "learning_rate": 8.733637703402387e-06, + "loss": 0.5221, + "step": 3618 + }, + { + "epoch": 0.594338266992384, + "grad_norm": 0.32928639712865154, + "learning_rate": 8.733478727478931e-06, + "loss": 0.528, + "step": 3619 + }, + { + "epoch": 0.5945024942007266, + "grad_norm": 0.4061734180068937, + "learning_rate": 8.733319705575728e-06, + "loss": 0.5405, + "step": 3620 + }, + { + "epoch": 0.5946667214090694, + "grad_norm": 0.2870014695047263, + "learning_rate": 8.73316063769451e-06, + "loss": 0.5378, + "step": 3621 + }, + { + "epoch": 0.5948309486174121, + "grad_norm": 0.35349649731908983, + "learning_rate": 8.733001523837003e-06, + "loss": 0.5466, + "step": 3622 + }, + { + "epoch": 0.5949951758257549, + "grad_norm": 0.3189730804637844, + "learning_rate": 8.732842364004932e-06, + "loss": 0.5431, + "step": 3623 + }, + { + "epoch": 0.5951594030340976, + "grad_norm": 0.3457512242859419, + "learning_rate": 8.73268315820003e-06, + "loss": 0.553, + "step": 3624 + }, + { + "epoch": 0.5953236302424404, + "grad_norm": 0.40936712834610717, + "learning_rate": 8.732523906424025e-06, + "loss": 0.5194, + "step": 3625 + }, + { + "epoch": 0.5954878574507831, + "grad_norm": 0.332203955122217, + "learning_rate": 8.732364608678644e-06, + "loss": 0.5539, + "step": 3626 + }, + { + "epoch": 0.5956520846591259, + "grad_norm": 0.2911967123741656, + "learning_rate": 8.732205264965622e-06, + "loss": 0.5376, + "step": 3627 + }, + { + "epoch": 0.5958163118674686, + "grad_norm": 0.287762468996504, + "learning_rate": 8.732045875286685e-06, + "loss": 0.5344, + "step": 3628 + }, + { + "epoch": 0.5959805390758114, + "grad_norm": 0.34854621639857275, + "learning_rate": 8.731886439643566e-06, + "loss": 0.5423, + "step": 3629 + }, + { + "epoch": 0.5961447662841541, + "grad_norm": 0.3698751509326475, + "learning_rate": 8.731726958037998e-06, + "loss": 0.529, + "step": 3630 + }, + { + "epoch": 0.5963089934924969, + "grad_norm": 0.27934921610336444, + "learning_rate": 8.731567430471711e-06, + "loss": 0.5443, + "step": 3631 + }, + { + "epoch": 0.5964732207008396, + "grad_norm": 0.3193564893222012, + "learning_rate": 8.731407856946438e-06, + "loss": 0.5296, + "step": 3632 + }, + { + "epoch": 0.5966374479091824, + "grad_norm": 0.30388255767050054, + "learning_rate": 8.731248237463913e-06, + "loss": 0.5581, + "step": 3633 + }, + { + "epoch": 0.5968016751175251, + "grad_norm": 0.34394183718979676, + "learning_rate": 8.73108857202587e-06, + "loss": 0.5245, + "step": 3634 + }, + { + "epoch": 0.5969659023258679, + "grad_norm": 0.3368222367193206, + "learning_rate": 8.730928860634041e-06, + "loss": 0.5364, + "step": 3635 + }, + { + "epoch": 0.5971301295342106, + "grad_norm": 0.6386666188329989, + "learning_rate": 8.730769103290162e-06, + "loss": 0.5315, + "step": 3636 + }, + { + "epoch": 0.5972943567425533, + "grad_norm": 0.39752248221935366, + "learning_rate": 8.73060929999597e-06, + "loss": 0.5453, + "step": 3637 + }, + { + "epoch": 0.597458583950896, + "grad_norm": 0.32135827832074965, + "learning_rate": 8.730449450753197e-06, + "loss": 0.5373, + "step": 3638 + }, + { + "epoch": 0.5976228111592388, + "grad_norm": 0.2997430977323944, + "learning_rate": 8.73028955556358e-06, + "loss": 0.5314, + "step": 3639 + }, + { + "epoch": 0.5977870383675815, + "grad_norm": 0.3172118412359191, + "learning_rate": 8.730129614428858e-06, + "loss": 0.5356, + "step": 3640 + }, + { + "epoch": 0.5979512655759243, + "grad_norm": 0.2722040870504818, + "learning_rate": 8.729969627350766e-06, + "loss": 0.5503, + "step": 3641 + }, + { + "epoch": 0.598115492784267, + "grad_norm": 0.29389145308594455, + "learning_rate": 8.72980959433104e-06, + "loss": 0.5079, + "step": 3642 + }, + { + "epoch": 0.5982797199926098, + "grad_norm": 0.2885264804569022, + "learning_rate": 8.729649515371423e-06, + "loss": 0.556, + "step": 3643 + }, + { + "epoch": 0.5984439472009525, + "grad_norm": 0.3008793211955447, + "learning_rate": 8.729489390473649e-06, + "loss": 0.5529, + "step": 3644 + }, + { + "epoch": 0.5986081744092953, + "grad_norm": 0.31658137204584463, + "learning_rate": 8.729329219639462e-06, + "loss": 0.5335, + "step": 3645 + }, + { + "epoch": 0.598772401617638, + "grad_norm": 0.34838847739756235, + "learning_rate": 8.729169002870596e-06, + "loss": 0.5502, + "step": 3646 + }, + { + "epoch": 0.5989366288259808, + "grad_norm": 0.3756933229117239, + "learning_rate": 8.729008740168793e-06, + "loss": 0.5294, + "step": 3647 + }, + { + "epoch": 0.5991008560343235, + "grad_norm": 0.34519823363513086, + "learning_rate": 8.728848431535795e-06, + "loss": 0.5284, + "step": 3648 + }, + { + "epoch": 0.5992650832426663, + "grad_norm": 0.3167375361570294, + "learning_rate": 8.728688076973344e-06, + "loss": 0.5358, + "step": 3649 + }, + { + "epoch": 0.599429310451009, + "grad_norm": 0.3306748583479528, + "learning_rate": 8.728527676483178e-06, + "loss": 0.5293, + "step": 3650 + }, + { + "epoch": 0.5995935376593517, + "grad_norm": 0.3708640363221295, + "learning_rate": 8.728367230067043e-06, + "loss": 0.5427, + "step": 3651 + }, + { + "epoch": 0.5997577648676945, + "grad_norm": 0.3431078530708263, + "learning_rate": 8.728206737726678e-06, + "loss": 0.5445, + "step": 3652 + }, + { + "epoch": 0.5999219920760372, + "grad_norm": 0.32219217359231533, + "learning_rate": 8.728046199463829e-06, + "loss": 0.5324, + "step": 3653 + }, + { + "epoch": 0.6000862192843799, + "grad_norm": 0.32057168261411606, + "learning_rate": 8.727885615280237e-06, + "loss": 0.5547, + "step": 3654 + }, + { + "epoch": 0.6002504464927226, + "grad_norm": 0.2767525924131096, + "learning_rate": 8.72772498517765e-06, + "loss": 0.5452, + "step": 3655 + }, + { + "epoch": 0.6004146737010654, + "grad_norm": 0.326894122884887, + "learning_rate": 8.727564309157807e-06, + "loss": 0.5449, + "step": 3656 + }, + { + "epoch": 0.6005789009094081, + "grad_norm": 0.3048109582358382, + "learning_rate": 8.727403587222457e-06, + "loss": 0.541, + "step": 3657 + }, + { + "epoch": 0.6007431281177509, + "grad_norm": 0.3520073738268126, + "learning_rate": 8.727242819373347e-06, + "loss": 0.5617, + "step": 3658 + }, + { + "epoch": 0.6009073553260936, + "grad_norm": 0.3499362485696832, + "learning_rate": 8.72708200561222e-06, + "loss": 0.546, + "step": 3659 + }, + { + "epoch": 0.6010715825344364, + "grad_norm": 0.29907581210466505, + "learning_rate": 8.726921145940824e-06, + "loss": 0.5367, + "step": 3660 + }, + { + "epoch": 0.6012358097427791, + "grad_norm": 0.35970980950843484, + "learning_rate": 8.726760240360904e-06, + "loss": 0.5456, + "step": 3661 + }, + { + "epoch": 0.6014000369511219, + "grad_norm": 0.3035374882855592, + "learning_rate": 8.726599288874211e-06, + "loss": 0.4972, + "step": 3662 + }, + { + "epoch": 0.6015642641594646, + "grad_norm": 0.3077833132019388, + "learning_rate": 8.72643829148249e-06, + "loss": 0.5381, + "step": 3663 + }, + { + "epoch": 0.6017284913678074, + "grad_norm": 0.27870036599180303, + "learning_rate": 8.726277248187491e-06, + "loss": 0.5331, + "step": 3664 + }, + { + "epoch": 0.6018927185761501, + "grad_norm": 0.3980801030680251, + "learning_rate": 8.726116158990964e-06, + "loss": 0.5534, + "step": 3665 + }, + { + "epoch": 0.6020569457844929, + "grad_norm": 0.306821683865838, + "learning_rate": 8.725955023894657e-06, + "loss": 0.5404, + "step": 3666 + }, + { + "epoch": 0.6022211729928356, + "grad_norm": 0.30355098474208053, + "learning_rate": 8.725793842900319e-06, + "loss": 0.5446, + "step": 3667 + }, + { + "epoch": 0.6023854002011784, + "grad_norm": 0.32484342105558917, + "learning_rate": 8.725632616009704e-06, + "loss": 0.5136, + "step": 3668 + }, + { + "epoch": 0.6025496274095211, + "grad_norm": 0.35942539543304525, + "learning_rate": 8.725471343224562e-06, + "loss": 0.5522, + "step": 3669 + }, + { + "epoch": 0.6027138546178639, + "grad_norm": 0.35114276088233815, + "learning_rate": 8.725310024546642e-06, + "loss": 0.5439, + "step": 3670 + }, + { + "epoch": 0.6028780818262065, + "grad_norm": 0.33071150493599055, + "learning_rate": 8.7251486599777e-06, + "loss": 0.5394, + "step": 3671 + }, + { + "epoch": 0.6030423090345493, + "grad_norm": 0.2914014429336388, + "learning_rate": 8.724987249519485e-06, + "loss": 0.5283, + "step": 3672 + }, + { + "epoch": 0.603206536242892, + "grad_norm": 0.2987885928184587, + "learning_rate": 8.724825793173752e-06, + "loss": 0.5484, + "step": 3673 + }, + { + "epoch": 0.6033707634512347, + "grad_norm": 0.35620845704867954, + "learning_rate": 8.724664290942254e-06, + "loss": 0.5232, + "step": 3674 + }, + { + "epoch": 0.6035349906595775, + "grad_norm": 0.8502964437012199, + "learning_rate": 8.724502742826746e-06, + "loss": 0.5313, + "step": 3675 + }, + { + "epoch": 0.6036992178679202, + "grad_norm": 0.35180116837141656, + "learning_rate": 8.724341148828982e-06, + "loss": 0.5405, + "step": 3676 + }, + { + "epoch": 0.603863445076263, + "grad_norm": 0.3428233815348743, + "learning_rate": 8.724179508950717e-06, + "loss": 0.5556, + "step": 3677 + }, + { + "epoch": 0.6040276722846057, + "grad_norm": 0.35078504166460134, + "learning_rate": 8.724017823193706e-06, + "loss": 0.5383, + "step": 3678 + }, + { + "epoch": 0.6041918994929485, + "grad_norm": 0.7235745909791813, + "learning_rate": 8.723856091559704e-06, + "loss": 0.5448, + "step": 3679 + }, + { + "epoch": 0.6043561267012912, + "grad_norm": 0.6140340695491158, + "learning_rate": 8.723694314050472e-06, + "loss": 0.5273, + "step": 3680 + }, + { + "epoch": 0.604520353909634, + "grad_norm": 0.3498517666631241, + "learning_rate": 8.723532490667763e-06, + "loss": 0.5331, + "step": 3681 + }, + { + "epoch": 0.6046845811179767, + "grad_norm": 0.32938150991708076, + "learning_rate": 8.723370621413335e-06, + "loss": 0.5391, + "step": 3682 + }, + { + "epoch": 0.6048488083263195, + "grad_norm": 0.3697450699685494, + "learning_rate": 8.723208706288946e-06, + "loss": 0.5395, + "step": 3683 + }, + { + "epoch": 0.6050130355346622, + "grad_norm": 0.32905386542315357, + "learning_rate": 8.723046745296357e-06, + "loss": 0.5269, + "step": 3684 + }, + { + "epoch": 0.605177262743005, + "grad_norm": 0.30086869065685895, + "learning_rate": 8.722884738437327e-06, + "loss": 0.5128, + "step": 3685 + }, + { + "epoch": 0.6053414899513477, + "grad_norm": 0.3113050683890726, + "learning_rate": 8.722722685713612e-06, + "loss": 0.5231, + "step": 3686 + }, + { + "epoch": 0.6055057171596905, + "grad_norm": 0.36319048353923766, + "learning_rate": 8.722560587126975e-06, + "loss": 0.5269, + "step": 3687 + }, + { + "epoch": 0.6056699443680331, + "grad_norm": 0.37114379821384086, + "learning_rate": 8.722398442679174e-06, + "loss": 0.5407, + "step": 3688 + }, + { + "epoch": 0.6058341715763759, + "grad_norm": 0.3219171912615586, + "learning_rate": 8.722236252371974e-06, + "loss": 0.5359, + "step": 3689 + }, + { + "epoch": 0.6059983987847186, + "grad_norm": 0.34569270952073616, + "learning_rate": 8.722074016207131e-06, + "loss": 0.5413, + "step": 3690 + }, + { + "epoch": 0.6061626259930614, + "grad_norm": 0.34523412105687357, + "learning_rate": 8.721911734186412e-06, + "loss": 0.5198, + "step": 3691 + }, + { + "epoch": 0.6063268532014041, + "grad_norm": 0.35292293229945165, + "learning_rate": 8.721749406311578e-06, + "loss": 0.5664, + "step": 3692 + }, + { + "epoch": 0.6064910804097469, + "grad_norm": 0.39140784378450616, + "learning_rate": 8.721587032584391e-06, + "loss": 0.5174, + "step": 3693 + }, + { + "epoch": 0.6066553076180896, + "grad_norm": 0.29287997907577673, + "learning_rate": 8.721424613006616e-06, + "loss": 0.5242, + "step": 3694 + }, + { + "epoch": 0.6068195348264324, + "grad_norm": 0.32102245081740655, + "learning_rate": 8.721262147580016e-06, + "loss": 0.5078, + "step": 3695 + }, + { + "epoch": 0.6069837620347751, + "grad_norm": 0.329148098587207, + "learning_rate": 8.721099636306357e-06, + "loss": 0.5179, + "step": 3696 + }, + { + "epoch": 0.6071479892431179, + "grad_norm": 0.43195330263064485, + "learning_rate": 8.720937079187402e-06, + "loss": 0.5486, + "step": 3697 + }, + { + "epoch": 0.6073122164514606, + "grad_norm": 0.2719468750025907, + "learning_rate": 8.720774476224918e-06, + "loss": 0.519, + "step": 3698 + }, + { + "epoch": 0.6074764436598034, + "grad_norm": 0.335292603650919, + "learning_rate": 8.72061182742067e-06, + "loss": 0.5406, + "step": 3699 + }, + { + "epoch": 0.6076406708681461, + "grad_norm": 0.3269121070823391, + "learning_rate": 8.720449132776424e-06, + "loss": 0.523, + "step": 3700 + }, + { + "epoch": 0.6078048980764889, + "grad_norm": 0.35219972464970845, + "learning_rate": 8.72028639229395e-06, + "loss": 0.5228, + "step": 3701 + }, + { + "epoch": 0.6079691252848316, + "grad_norm": 0.3391824315436888, + "learning_rate": 8.720123605975012e-06, + "loss": 0.5416, + "step": 3702 + }, + { + "epoch": 0.6081333524931744, + "grad_norm": 0.2759373012282259, + "learning_rate": 8.71996077382138e-06, + "loss": 0.5291, + "step": 3703 + }, + { + "epoch": 0.6082975797015171, + "grad_norm": 0.29854298935572593, + "learning_rate": 8.719797895834823e-06, + "loss": 0.5513, + "step": 3704 + }, + { + "epoch": 0.6084618069098597, + "grad_norm": 0.29485850608627, + "learning_rate": 8.719634972017109e-06, + "loss": 0.5346, + "step": 3705 + }, + { + "epoch": 0.6086260341182025, + "grad_norm": 0.29127554479034234, + "learning_rate": 8.719472002370007e-06, + "loss": 0.5288, + "step": 3706 + }, + { + "epoch": 0.6087902613265452, + "grad_norm": 0.35738849044273974, + "learning_rate": 8.719308986895288e-06, + "loss": 0.5316, + "step": 3707 + }, + { + "epoch": 0.608954488534888, + "grad_norm": 0.32317977559171934, + "learning_rate": 8.719145925594722e-06, + "loss": 0.5285, + "step": 3708 + }, + { + "epoch": 0.6091187157432307, + "grad_norm": 0.323539384053774, + "learning_rate": 8.718982818470081e-06, + "loss": 0.5224, + "step": 3709 + }, + { + "epoch": 0.6092829429515735, + "grad_norm": 0.3306250256102783, + "learning_rate": 8.718819665523135e-06, + "loss": 0.5515, + "step": 3710 + }, + { + "epoch": 0.6094471701599162, + "grad_norm": 0.3057541843863994, + "learning_rate": 8.718656466755657e-06, + "loss": 0.5364, + "step": 3711 + }, + { + "epoch": 0.609611397368259, + "grad_norm": 0.32189324965811217, + "learning_rate": 8.718493222169417e-06, + "loss": 0.56, + "step": 3712 + }, + { + "epoch": 0.6097756245766017, + "grad_norm": 0.2930020470373635, + "learning_rate": 8.718329931766193e-06, + "loss": 0.5491, + "step": 3713 + }, + { + "epoch": 0.6099398517849445, + "grad_norm": 0.37017958751656427, + "learning_rate": 8.718166595547755e-06, + "loss": 0.5273, + "step": 3714 + }, + { + "epoch": 0.6101040789932872, + "grad_norm": 0.38151259250123476, + "learning_rate": 8.718003213515876e-06, + "loss": 0.5495, + "step": 3715 + }, + { + "epoch": 0.61026830620163, + "grad_norm": 0.2899120782134563, + "learning_rate": 8.717839785672334e-06, + "loss": 0.5387, + "step": 3716 + }, + { + "epoch": 0.6104325334099727, + "grad_norm": 0.3129274447525849, + "learning_rate": 8.7176763120189e-06, + "loss": 0.5236, + "step": 3717 + }, + { + "epoch": 0.6105967606183155, + "grad_norm": 0.3245883613657934, + "learning_rate": 8.717512792557355e-06, + "loss": 0.5265, + "step": 3718 + }, + { + "epoch": 0.6107609878266582, + "grad_norm": 0.3046206158653733, + "learning_rate": 8.71734922728947e-06, + "loss": 0.5493, + "step": 3719 + }, + { + "epoch": 0.610925215035001, + "grad_norm": 0.3414330525019781, + "learning_rate": 8.717185616217022e-06, + "loss": 0.5487, + "step": 3720 + }, + { + "epoch": 0.6110894422433437, + "grad_norm": 0.30157466649864123, + "learning_rate": 8.71702195934179e-06, + "loss": 0.5214, + "step": 3721 + }, + { + "epoch": 0.6112536694516864, + "grad_norm": 0.2939600687346926, + "learning_rate": 8.71685825666555e-06, + "loss": 0.534, + "step": 3722 + }, + { + "epoch": 0.6114178966600291, + "grad_norm": 0.3532357313126545, + "learning_rate": 8.716694508190081e-06, + "loss": 0.522, + "step": 3723 + }, + { + "epoch": 0.6115821238683719, + "grad_norm": 0.2888952148557769, + "learning_rate": 8.716530713917162e-06, + "loss": 0.5391, + "step": 3724 + }, + { + "epoch": 0.6117463510767146, + "grad_norm": 0.34438682455747893, + "learning_rate": 8.716366873848569e-06, + "loss": 0.5399, + "step": 3725 + }, + { + "epoch": 0.6119105782850573, + "grad_norm": 0.364981358716848, + "learning_rate": 8.716202987986084e-06, + "loss": 0.5346, + "step": 3726 + }, + { + "epoch": 0.6120748054934001, + "grad_norm": 0.4694648437859305, + "learning_rate": 8.716039056331487e-06, + "loss": 0.5249, + "step": 3727 + }, + { + "epoch": 0.6122390327017428, + "grad_norm": 0.3210375908279311, + "learning_rate": 8.715875078886557e-06, + "loss": 0.5237, + "step": 3728 + }, + { + "epoch": 0.6124032599100856, + "grad_norm": 0.34752632577085785, + "learning_rate": 8.715711055653077e-06, + "loss": 0.5405, + "step": 3729 + }, + { + "epoch": 0.6125674871184283, + "grad_norm": 0.40203728653194964, + "learning_rate": 8.715546986632826e-06, + "loss": 0.5461, + "step": 3730 + }, + { + "epoch": 0.6127317143267711, + "grad_norm": 0.3049524203199884, + "learning_rate": 8.715382871827587e-06, + "loss": 0.5383, + "step": 3731 + }, + { + "epoch": 0.6128959415351138, + "grad_norm": 0.43035534035034767, + "learning_rate": 8.715218711239143e-06, + "loss": 0.5377, + "step": 3732 + }, + { + "epoch": 0.6130601687434566, + "grad_norm": 0.304376981147912, + "learning_rate": 8.715054504869277e-06, + "loss": 0.5365, + "step": 3733 + }, + { + "epoch": 0.6132243959517993, + "grad_norm": 0.3352498186486189, + "learning_rate": 8.714890252719772e-06, + "loss": 0.5405, + "step": 3734 + }, + { + "epoch": 0.6133886231601421, + "grad_norm": 0.37635670121733, + "learning_rate": 8.714725954792413e-06, + "loss": 0.55, + "step": 3735 + }, + { + "epoch": 0.6135528503684848, + "grad_norm": 0.3193338992698321, + "learning_rate": 8.714561611088982e-06, + "loss": 0.5254, + "step": 3736 + }, + { + "epoch": 0.6137170775768276, + "grad_norm": 0.28862874206631534, + "learning_rate": 8.714397221611264e-06, + "loss": 0.5122, + "step": 3737 + }, + { + "epoch": 0.6138813047851703, + "grad_norm": 0.36269165943906717, + "learning_rate": 8.714232786361049e-06, + "loss": 0.551, + "step": 3738 + }, + { + "epoch": 0.614045531993513, + "grad_norm": 0.43221986090902803, + "learning_rate": 8.714068305340117e-06, + "loss": 0.5409, + "step": 3739 + }, + { + "epoch": 0.6142097592018557, + "grad_norm": 0.30739719342988214, + "learning_rate": 8.713903778550258e-06, + "loss": 0.5617, + "step": 3740 + }, + { + "epoch": 0.6143739864101985, + "grad_norm": 0.3447769130969769, + "learning_rate": 8.713739205993259e-06, + "loss": 0.5031, + "step": 3741 + }, + { + "epoch": 0.6145382136185412, + "grad_norm": 0.36911964009951104, + "learning_rate": 8.713574587670906e-06, + "loss": 0.5487, + "step": 3742 + }, + { + "epoch": 0.614702440826884, + "grad_norm": 0.2882172434812275, + "learning_rate": 8.713409923584986e-06, + "loss": 0.5187, + "step": 3743 + }, + { + "epoch": 0.6148666680352267, + "grad_norm": 0.4483918078459419, + "learning_rate": 8.71324521373729e-06, + "loss": 0.5568, + "step": 3744 + }, + { + "epoch": 0.6150308952435695, + "grad_norm": 0.3979105216794056, + "learning_rate": 8.713080458129606e-06, + "loss": 0.5288, + "step": 3745 + }, + { + "epoch": 0.6151951224519122, + "grad_norm": 0.32051012765081943, + "learning_rate": 8.712915656763723e-06, + "loss": 0.5374, + "step": 3746 + }, + { + "epoch": 0.615359349660255, + "grad_norm": 0.4134079464940972, + "learning_rate": 8.71275080964143e-06, + "loss": 0.5364, + "step": 3747 + }, + { + "epoch": 0.6155235768685977, + "grad_norm": 0.3025117236117526, + "learning_rate": 8.71258591676452e-06, + "loss": 0.5315, + "step": 3748 + }, + { + "epoch": 0.6156878040769405, + "grad_norm": 0.5045071173403394, + "learning_rate": 8.71242097813478e-06, + "loss": 0.517, + "step": 3749 + }, + { + "epoch": 0.6158520312852832, + "grad_norm": 0.36659414586697897, + "learning_rate": 8.712255993754007e-06, + "loss": 0.5323, + "step": 3750 + }, + { + "epoch": 0.616016258493626, + "grad_norm": 0.3413997188779241, + "learning_rate": 8.712090963623987e-06, + "loss": 0.5415, + "step": 3751 + }, + { + "epoch": 0.6161804857019687, + "grad_norm": 0.30485270030431405, + "learning_rate": 8.711925887746516e-06, + "loss": 0.5422, + "step": 3752 + }, + { + "epoch": 0.6163447129103115, + "grad_norm": 0.3781760980970153, + "learning_rate": 8.711760766123385e-06, + "loss": 0.5226, + "step": 3753 + }, + { + "epoch": 0.6165089401186542, + "grad_norm": 0.32080829914928155, + "learning_rate": 8.71159559875639e-06, + "loss": 0.5307, + "step": 3754 + }, + { + "epoch": 0.616673167326997, + "grad_norm": 0.2814514445391986, + "learning_rate": 8.711430385647321e-06, + "loss": 0.518, + "step": 3755 + }, + { + "epoch": 0.6168373945353396, + "grad_norm": 0.3194129509030034, + "learning_rate": 8.711265126797976e-06, + "loss": 0.5458, + "step": 3756 + }, + { + "epoch": 0.6170016217436823, + "grad_norm": 0.31273916859958956, + "learning_rate": 8.711099822210148e-06, + "loss": 0.529, + "step": 3757 + }, + { + "epoch": 0.6171658489520251, + "grad_norm": 0.4807127966621582, + "learning_rate": 8.710934471885632e-06, + "loss": 0.5691, + "step": 3758 + }, + { + "epoch": 0.6173300761603678, + "grad_norm": 0.35464260050427854, + "learning_rate": 8.710769075826227e-06, + "loss": 0.5455, + "step": 3759 + }, + { + "epoch": 0.6174943033687106, + "grad_norm": 0.33236503232750314, + "learning_rate": 8.710603634033725e-06, + "loss": 0.5364, + "step": 3760 + }, + { + "epoch": 0.6176585305770533, + "grad_norm": 0.3464239772480392, + "learning_rate": 8.710438146509925e-06, + "loss": 0.5267, + "step": 3761 + }, + { + "epoch": 0.6178227577853961, + "grad_norm": 0.3150829346496259, + "learning_rate": 8.710272613256623e-06, + "loss": 0.5238, + "step": 3762 + }, + { + "epoch": 0.6179869849937388, + "grad_norm": 0.33797103760143943, + "learning_rate": 8.710107034275621e-06, + "loss": 0.5101, + "step": 3763 + }, + { + "epoch": 0.6181512122020816, + "grad_norm": 0.3460563845624605, + "learning_rate": 8.709941409568712e-06, + "loss": 0.5402, + "step": 3764 + }, + { + "epoch": 0.6183154394104243, + "grad_norm": 0.31892852721514725, + "learning_rate": 8.709775739137698e-06, + "loss": 0.5443, + "step": 3765 + }, + { + "epoch": 0.6184796666187671, + "grad_norm": 0.33910830608309395, + "learning_rate": 8.709610022984379e-06, + "loss": 0.5288, + "step": 3766 + }, + { + "epoch": 0.6186438938271098, + "grad_norm": 0.5006130609576214, + "learning_rate": 8.709444261110551e-06, + "loss": 0.5698, + "step": 3767 + }, + { + "epoch": 0.6188081210354526, + "grad_norm": 0.34041891842889027, + "learning_rate": 8.70927845351802e-06, + "loss": 0.5306, + "step": 3768 + }, + { + "epoch": 0.6189723482437953, + "grad_norm": 0.32959334265653095, + "learning_rate": 8.70911260020858e-06, + "loss": 0.5027, + "step": 3769 + }, + { + "epoch": 0.6191365754521381, + "grad_norm": 0.33632145365800753, + "learning_rate": 8.708946701184038e-06, + "loss": 0.5622, + "step": 3770 + }, + { + "epoch": 0.6193008026604808, + "grad_norm": 0.3284090276692645, + "learning_rate": 8.708780756446193e-06, + "loss": 0.5258, + "step": 3771 + }, + { + "epoch": 0.6194650298688236, + "grad_norm": 0.27420040378984273, + "learning_rate": 8.70861476599685e-06, + "loss": 0.5217, + "step": 3772 + }, + { + "epoch": 0.6196292570771662, + "grad_norm": 0.37164897196678104, + "learning_rate": 8.708448729837807e-06, + "loss": 0.5001, + "step": 3773 + }, + { + "epoch": 0.619793484285509, + "grad_norm": 0.3198369016389374, + "learning_rate": 8.708282647970872e-06, + "loss": 0.5501, + "step": 3774 + }, + { + "epoch": 0.6199577114938517, + "grad_norm": 0.4734567044434302, + "learning_rate": 8.708116520397847e-06, + "loss": 0.5429, + "step": 3775 + }, + { + "epoch": 0.6201219387021945, + "grad_norm": 0.3675363210937101, + "learning_rate": 8.707950347120536e-06, + "loss": 0.5343, + "step": 3776 + }, + { + "epoch": 0.6202861659105372, + "grad_norm": 0.3130171423294823, + "learning_rate": 8.707784128140745e-06, + "loss": 0.5626, + "step": 3777 + }, + { + "epoch": 0.62045039311888, + "grad_norm": 0.33656999876962657, + "learning_rate": 8.707617863460276e-06, + "loss": 0.5397, + "step": 3778 + }, + { + "epoch": 0.6206146203272227, + "grad_norm": 0.378701611013499, + "learning_rate": 8.70745155308094e-06, + "loss": 0.5236, + "step": 3779 + }, + { + "epoch": 0.6207788475355654, + "grad_norm": 0.40263268075287956, + "learning_rate": 8.70728519700454e-06, + "loss": 0.5522, + "step": 3780 + }, + { + "epoch": 0.6209430747439082, + "grad_norm": 0.31609267720649964, + "learning_rate": 8.707118795232882e-06, + "loss": 0.5341, + "step": 3781 + }, + { + "epoch": 0.621107301952251, + "grad_norm": 0.33076045899366674, + "learning_rate": 8.706952347767776e-06, + "loss": 0.5141, + "step": 3782 + }, + { + "epoch": 0.6212715291605937, + "grad_norm": 0.3119390105211355, + "learning_rate": 8.706785854611027e-06, + "loss": 0.5395, + "step": 3783 + }, + { + "epoch": 0.6214357563689364, + "grad_norm": 0.32407867611523294, + "learning_rate": 8.706619315764446e-06, + "loss": 0.5663, + "step": 3784 + }, + { + "epoch": 0.6215999835772792, + "grad_norm": 0.3366621208808585, + "learning_rate": 8.70645273122984e-06, + "loss": 0.5303, + "step": 3785 + }, + { + "epoch": 0.6217642107856219, + "grad_norm": 0.334100898833225, + "learning_rate": 8.706286101009021e-06, + "loss": 0.5346, + "step": 3786 + }, + { + "epoch": 0.6219284379939647, + "grad_norm": 0.2938793811806174, + "learning_rate": 8.706119425103793e-06, + "loss": 0.5509, + "step": 3787 + }, + { + "epoch": 0.6220926652023074, + "grad_norm": 0.30688080144004615, + "learning_rate": 8.705952703515972e-06, + "loss": 0.5409, + "step": 3788 + }, + { + "epoch": 0.6222568924106502, + "grad_norm": 0.30865811818944405, + "learning_rate": 8.705785936247364e-06, + "loss": 0.5298, + "step": 3789 + }, + { + "epoch": 0.6224211196189928, + "grad_norm": 0.34419297569497387, + "learning_rate": 8.705619123299786e-06, + "loss": 0.547, + "step": 3790 + }, + { + "epoch": 0.6225853468273356, + "grad_norm": 0.36703178884640025, + "learning_rate": 8.705452264675045e-06, + "loss": 0.5413, + "step": 3791 + }, + { + "epoch": 0.6227495740356783, + "grad_norm": 0.3923267640978763, + "learning_rate": 8.705285360374955e-06, + "loss": 0.5323, + "step": 3792 + }, + { + "epoch": 0.6229138012440211, + "grad_norm": 0.33842940645185976, + "learning_rate": 8.705118410401329e-06, + "loss": 0.5302, + "step": 3793 + }, + { + "epoch": 0.6230780284523638, + "grad_norm": 0.8608072591706694, + "learning_rate": 8.704951414755978e-06, + "loss": 0.5415, + "step": 3794 + }, + { + "epoch": 0.6232422556607066, + "grad_norm": 0.2936882961967509, + "learning_rate": 8.704784373440719e-06, + "loss": 0.5353, + "step": 3795 + }, + { + "epoch": 0.6234064828690493, + "grad_norm": 0.289643466614854, + "learning_rate": 8.704617286457365e-06, + "loss": 0.5305, + "step": 3796 + }, + { + "epoch": 0.6235707100773921, + "grad_norm": 0.45597156884259565, + "learning_rate": 8.70445015380773e-06, + "loss": 0.5335, + "step": 3797 + }, + { + "epoch": 0.6237349372857348, + "grad_norm": 0.37801086646828497, + "learning_rate": 8.70428297549363e-06, + "loss": 0.5075, + "step": 3798 + }, + { + "epoch": 0.6238991644940776, + "grad_norm": 0.2812051698019951, + "learning_rate": 8.70411575151688e-06, + "loss": 0.5319, + "step": 3799 + }, + { + "epoch": 0.6240633917024203, + "grad_norm": 0.3145164176523809, + "learning_rate": 8.703948481879296e-06, + "loss": 0.5154, + "step": 3800 + }, + { + "epoch": 0.6242276189107631, + "grad_norm": 0.3293466939126531, + "learning_rate": 8.703781166582696e-06, + "loss": 0.5329, + "step": 3801 + }, + { + "epoch": 0.6243918461191058, + "grad_norm": 0.3241613646790027, + "learning_rate": 8.703613805628897e-06, + "loss": 0.5366, + "step": 3802 + }, + { + "epoch": 0.6245560733274486, + "grad_norm": 0.30109707531054103, + "learning_rate": 8.703446399019716e-06, + "loss": 0.5267, + "step": 3803 + }, + { + "epoch": 0.6247203005357913, + "grad_norm": 0.3159230311428853, + "learning_rate": 8.703278946756972e-06, + "loss": 0.5408, + "step": 3804 + }, + { + "epoch": 0.624884527744134, + "grad_norm": 0.32223834189608774, + "learning_rate": 8.703111448842482e-06, + "loss": 0.5587, + "step": 3805 + }, + { + "epoch": 0.6250487549524768, + "grad_norm": 90.67267353364257, + "learning_rate": 8.702943905278067e-06, + "loss": 0.528, + "step": 3806 + }, + { + "epoch": 0.6252129821608194, + "grad_norm": 0.43270448510011983, + "learning_rate": 8.702776316065547e-06, + "loss": 0.5133, + "step": 3807 + }, + { + "epoch": 0.6253772093691622, + "grad_norm": 0.6509586015003804, + "learning_rate": 8.70260868120674e-06, + "loss": 0.5407, + "step": 3808 + }, + { + "epoch": 0.6255414365775049, + "grad_norm": 1.4183420052563982, + "learning_rate": 8.702441000703468e-06, + "loss": 0.563, + "step": 3809 + }, + { + "epoch": 0.6257056637858477, + "grad_norm": 1.8921901405577501, + "learning_rate": 8.702273274557552e-06, + "loss": 0.5751, + "step": 3810 + }, + { + "epoch": 0.6258698909941904, + "grad_norm": 1.4172223283468877, + "learning_rate": 8.702105502770813e-06, + "loss": 0.545, + "step": 3811 + }, + { + "epoch": 0.6260341182025332, + "grad_norm": 1.3874445788703256, + "learning_rate": 8.701937685345076e-06, + "loss": 0.573, + "step": 3812 + }, + { + "epoch": 0.6261983454108759, + "grad_norm": 1.4763769040707089, + "learning_rate": 8.70176982228216e-06, + "loss": 0.5626, + "step": 3813 + }, + { + "epoch": 0.6263625726192187, + "grad_norm": 1.2332840709413573, + "learning_rate": 8.701601913583891e-06, + "loss": 0.5505, + "step": 3814 + }, + { + "epoch": 0.6265267998275614, + "grad_norm": 1.082496847586964, + "learning_rate": 8.70143395925209e-06, + "loss": 0.5308, + "step": 3815 + }, + { + "epoch": 0.6266910270359042, + "grad_norm": 1.3361911741976398, + "learning_rate": 8.701265959288584e-06, + "loss": 0.5584, + "step": 3816 + }, + { + "epoch": 0.6268552542442469, + "grad_norm": 0.9853921793041766, + "learning_rate": 8.701097913695196e-06, + "loss": 0.5561, + "step": 3817 + }, + { + "epoch": 0.6270194814525897, + "grad_norm": 0.6673756700635366, + "learning_rate": 8.70092982247375e-06, + "loss": 0.5216, + "step": 3818 + }, + { + "epoch": 0.6271837086609324, + "grad_norm": 0.7139398117801092, + "learning_rate": 8.700761685626074e-06, + "loss": 0.5466, + "step": 3819 + }, + { + "epoch": 0.6273479358692752, + "grad_norm": 0.6201194455263372, + "learning_rate": 8.700593503153993e-06, + "loss": 0.5595, + "step": 3820 + }, + { + "epoch": 0.6275121630776179, + "grad_norm": 0.6338582846012263, + "learning_rate": 8.700425275059334e-06, + "loss": 0.5416, + "step": 3821 + }, + { + "epoch": 0.6276763902859607, + "grad_norm": 0.620727987673737, + "learning_rate": 8.700257001343924e-06, + "loss": 0.5448, + "step": 3822 + }, + { + "epoch": 0.6278406174943034, + "grad_norm": 0.6572165151879532, + "learning_rate": 8.70008868200959e-06, + "loss": 0.5585, + "step": 3823 + }, + { + "epoch": 0.6280048447026461, + "grad_norm": 0.7069587357491588, + "learning_rate": 8.69992031705816e-06, + "loss": 0.5505, + "step": 3824 + }, + { + "epoch": 0.6281690719109888, + "grad_norm": 0.5887814051268679, + "learning_rate": 8.699751906491464e-06, + "loss": 0.5453, + "step": 3825 + }, + { + "epoch": 0.6283332991193316, + "grad_norm": 0.5949769393475107, + "learning_rate": 8.69958345031133e-06, + "loss": 0.5485, + "step": 3826 + }, + { + "epoch": 0.6284975263276743, + "grad_norm": 0.45559918621982776, + "learning_rate": 8.699414948519588e-06, + "loss": 0.5507, + "step": 3827 + }, + { + "epoch": 0.628661753536017, + "grad_norm": 0.5066535100919715, + "learning_rate": 8.699246401118067e-06, + "loss": 0.5456, + "step": 3828 + }, + { + "epoch": 0.6288259807443598, + "grad_norm": 0.603987704362724, + "learning_rate": 8.699077808108598e-06, + "loss": 0.5367, + "step": 3829 + }, + { + "epoch": 0.6289902079527026, + "grad_norm": 0.5753829717531779, + "learning_rate": 8.698909169493014e-06, + "loss": 0.5354, + "step": 3830 + }, + { + "epoch": 0.6291544351610453, + "grad_norm": 0.48583557088688634, + "learning_rate": 8.698740485273147e-06, + "loss": 0.5613, + "step": 3831 + }, + { + "epoch": 0.629318662369388, + "grad_norm": 0.4923545359065946, + "learning_rate": 8.698571755450826e-06, + "loss": 0.5264, + "step": 3832 + }, + { + "epoch": 0.6294828895777308, + "grad_norm": 0.5021171246250056, + "learning_rate": 8.698402980027884e-06, + "loss": 0.5536, + "step": 3833 + }, + { + "epoch": 0.6296471167860735, + "grad_norm": 0.6308522128061763, + "learning_rate": 8.698234159006155e-06, + "loss": 0.5574, + "step": 3834 + }, + { + "epoch": 0.6298113439944163, + "grad_norm": 0.5317056736086443, + "learning_rate": 8.698065292387474e-06, + "loss": 0.5362, + "step": 3835 + }, + { + "epoch": 0.629975571202759, + "grad_norm": 0.41775683407473374, + "learning_rate": 8.697896380173673e-06, + "loss": 0.5521, + "step": 3836 + }, + { + "epoch": 0.6301397984111018, + "grad_norm": 0.35903545755316413, + "learning_rate": 8.697727422366586e-06, + "loss": 0.56, + "step": 3837 + }, + { + "epoch": 0.6303040256194445, + "grad_norm": 0.38789311643013547, + "learning_rate": 8.69755841896805e-06, + "loss": 0.525, + "step": 3838 + }, + { + "epoch": 0.6304682528277873, + "grad_norm": 0.5183715290306259, + "learning_rate": 8.697389369979901e-06, + "loss": 0.5372, + "step": 3839 + }, + { + "epoch": 0.63063248003613, + "grad_norm": 0.4307370872715388, + "learning_rate": 8.697220275403972e-06, + "loss": 0.5309, + "step": 3840 + }, + { + "epoch": 0.6307967072444727, + "grad_norm": 0.3726029619618114, + "learning_rate": 8.697051135242103e-06, + "loss": 0.5386, + "step": 3841 + }, + { + "epoch": 0.6309609344528154, + "grad_norm": 0.4780072696531049, + "learning_rate": 8.696881949496127e-06, + "loss": 0.5471, + "step": 3842 + }, + { + "epoch": 0.6311251616611582, + "grad_norm": 0.34995870047191846, + "learning_rate": 8.696712718167884e-06, + "loss": 0.5402, + "step": 3843 + }, + { + "epoch": 0.6312893888695009, + "grad_norm": 0.411103673149213, + "learning_rate": 8.696543441259215e-06, + "loss": 0.5433, + "step": 3844 + }, + { + "epoch": 0.6314536160778437, + "grad_norm": 0.3587501781903258, + "learning_rate": 8.696374118771952e-06, + "loss": 0.5486, + "step": 3845 + }, + { + "epoch": 0.6316178432861864, + "grad_norm": 0.33772335432480305, + "learning_rate": 8.69620475070794e-06, + "loss": 0.5315, + "step": 3846 + }, + { + "epoch": 0.6317820704945292, + "grad_norm": 0.3390924628374558, + "learning_rate": 8.696035337069013e-06, + "loss": 0.5622, + "step": 3847 + }, + { + "epoch": 0.6319462977028719, + "grad_norm": 0.4403930593852699, + "learning_rate": 8.695865877857015e-06, + "loss": 0.5013, + "step": 3848 + }, + { + "epoch": 0.6321105249112147, + "grad_norm": 0.41806220799017596, + "learning_rate": 8.695696373073787e-06, + "loss": 0.5464, + "step": 3849 + }, + { + "epoch": 0.6322747521195574, + "grad_norm": 0.4085362195583205, + "learning_rate": 8.695526822721167e-06, + "loss": 0.5217, + "step": 3850 + }, + { + "epoch": 0.6324389793279002, + "grad_norm": 0.3689830215891882, + "learning_rate": 8.695357226800999e-06, + "loss": 0.5503, + "step": 3851 + }, + { + "epoch": 0.6326032065362429, + "grad_norm": 0.3731211509083733, + "learning_rate": 8.695187585315122e-06, + "loss": 0.5305, + "step": 3852 + }, + { + "epoch": 0.6327674337445857, + "grad_norm": 0.3781825465409617, + "learning_rate": 8.695017898265381e-06, + "loss": 0.536, + "step": 3853 + }, + { + "epoch": 0.6329316609529284, + "grad_norm": 0.6074947040600762, + "learning_rate": 8.694848165653618e-06, + "loss": 0.5354, + "step": 3854 + }, + { + "epoch": 0.6330958881612712, + "grad_norm": 0.33740040810301675, + "learning_rate": 8.694678387481678e-06, + "loss": 0.5216, + "step": 3855 + }, + { + "epoch": 0.6332601153696139, + "grad_norm": 0.3720129842158853, + "learning_rate": 8.694508563751404e-06, + "loss": 0.5304, + "step": 3856 + }, + { + "epoch": 0.6334243425779567, + "grad_norm": 0.34325179252543697, + "learning_rate": 8.694338694464639e-06, + "loss": 0.5401, + "step": 3857 + }, + { + "epoch": 0.6335885697862993, + "grad_norm": 0.3983565060019772, + "learning_rate": 8.694168779623229e-06, + "loss": 0.5368, + "step": 3858 + }, + { + "epoch": 0.633752796994642, + "grad_norm": 0.4136266171320773, + "learning_rate": 8.69399881922902e-06, + "loss": 0.5129, + "step": 3859 + }, + { + "epoch": 0.6339170242029848, + "grad_norm": 0.5371242412763911, + "learning_rate": 8.693828813283856e-06, + "loss": 0.5441, + "step": 3860 + }, + { + "epoch": 0.6340812514113275, + "grad_norm": 0.3961247480341962, + "learning_rate": 8.693658761789587e-06, + "loss": 0.5676, + "step": 3861 + }, + { + "epoch": 0.6342454786196703, + "grad_norm": 0.39617220623101806, + "learning_rate": 8.693488664748058e-06, + "loss": 0.5468, + "step": 3862 + }, + { + "epoch": 0.634409705828013, + "grad_norm": 0.40097095211622075, + "learning_rate": 8.693318522161114e-06, + "loss": 0.5169, + "step": 3863 + }, + { + "epoch": 0.6345739330363558, + "grad_norm": 0.38894572895815444, + "learning_rate": 8.693148334030607e-06, + "loss": 0.5135, + "step": 3864 + }, + { + "epoch": 0.6347381602446985, + "grad_norm": 0.3763156946705871, + "learning_rate": 8.692978100358384e-06, + "loss": 0.5529, + "step": 3865 + }, + { + "epoch": 0.6349023874530413, + "grad_norm": 0.4886376067594294, + "learning_rate": 8.692807821146292e-06, + "loss": 0.5406, + "step": 3866 + }, + { + "epoch": 0.635066614661384, + "grad_norm": 0.35352227694749655, + "learning_rate": 8.692637496396181e-06, + "loss": 0.535, + "step": 3867 + }, + { + "epoch": 0.6352308418697268, + "grad_norm": 0.4983938112065874, + "learning_rate": 8.692467126109904e-06, + "loss": 0.5408, + "step": 3868 + }, + { + "epoch": 0.6353950690780695, + "grad_norm": 0.31829193416012663, + "learning_rate": 8.692296710289309e-06, + "loss": 0.534, + "step": 3869 + }, + { + "epoch": 0.6355592962864123, + "grad_norm": 0.3525883829667812, + "learning_rate": 8.692126248936246e-06, + "loss": 0.5417, + "step": 3870 + }, + { + "epoch": 0.635723523494755, + "grad_norm": 0.36401514044093797, + "learning_rate": 8.691955742052567e-06, + "loss": 0.5371, + "step": 3871 + }, + { + "epoch": 0.6358877507030978, + "grad_norm": 0.3969898970872966, + "learning_rate": 8.691785189640127e-06, + "loss": 0.5472, + "step": 3872 + }, + { + "epoch": 0.6360519779114405, + "grad_norm": 0.33217592523461514, + "learning_rate": 8.691614591700774e-06, + "loss": 0.5308, + "step": 3873 + }, + { + "epoch": 0.6362162051197833, + "grad_norm": 0.42246583922748293, + "learning_rate": 8.691443948236361e-06, + "loss": 0.5424, + "step": 3874 + }, + { + "epoch": 0.6363804323281259, + "grad_norm": 0.3403711310191984, + "learning_rate": 8.691273259248745e-06, + "loss": 0.5514, + "step": 3875 + }, + { + "epoch": 0.6365446595364687, + "grad_norm": 0.3248716492797271, + "learning_rate": 8.691102524739778e-06, + "loss": 0.5387, + "step": 3876 + }, + { + "epoch": 0.6367088867448114, + "grad_norm": 0.4398054414005426, + "learning_rate": 8.690931744711313e-06, + "loss": 0.543, + "step": 3877 + }, + { + "epoch": 0.6368731139531542, + "grad_norm": 0.42677973526179014, + "learning_rate": 8.690760919165206e-06, + "loss": 0.5536, + "step": 3878 + }, + { + "epoch": 0.6370373411614969, + "grad_norm": 0.40237646554369005, + "learning_rate": 8.690590048103313e-06, + "loss": 0.5328, + "step": 3879 + }, + { + "epoch": 0.6372015683698397, + "grad_norm": 0.2960670627284331, + "learning_rate": 8.690419131527489e-06, + "loss": 0.569, + "step": 3880 + }, + { + "epoch": 0.6373657955781824, + "grad_norm": 0.4324403747553011, + "learning_rate": 8.69024816943959e-06, + "loss": 0.541, + "step": 3881 + }, + { + "epoch": 0.6375300227865252, + "grad_norm": 0.3444517645451572, + "learning_rate": 8.690077161841473e-06, + "loss": 0.5449, + "step": 3882 + }, + { + "epoch": 0.6376942499948679, + "grad_norm": 0.34750063901480577, + "learning_rate": 8.689906108734994e-06, + "loss": 0.5387, + "step": 3883 + }, + { + "epoch": 0.6378584772032106, + "grad_norm": 0.30817723250833423, + "learning_rate": 8.689735010122015e-06, + "loss": 0.5284, + "step": 3884 + }, + { + "epoch": 0.6380227044115534, + "grad_norm": 0.3530145511907926, + "learning_rate": 8.68956386600439e-06, + "loss": 0.5128, + "step": 3885 + }, + { + "epoch": 0.6381869316198961, + "grad_norm": 0.40882371198815415, + "learning_rate": 8.68939267638398e-06, + "loss": 0.5395, + "step": 3886 + }, + { + "epoch": 0.6383511588282389, + "grad_norm": 0.36352754615313715, + "learning_rate": 8.689221441262645e-06, + "loss": 0.5224, + "step": 3887 + }, + { + "epoch": 0.6385153860365816, + "grad_norm": 0.2907383919549112, + "learning_rate": 8.689050160642242e-06, + "loss": 0.5336, + "step": 3888 + }, + { + "epoch": 0.6386796132449244, + "grad_norm": 0.34749893951758054, + "learning_rate": 8.688878834524634e-06, + "loss": 0.5389, + "step": 3889 + }, + { + "epoch": 0.6388438404532671, + "grad_norm": 0.40513911865191865, + "learning_rate": 8.688707462911679e-06, + "loss": 0.5158, + "step": 3890 + }, + { + "epoch": 0.6390080676616099, + "grad_norm": 0.3030479591332535, + "learning_rate": 8.688536045805241e-06, + "loss": 0.5539, + "step": 3891 + }, + { + "epoch": 0.6391722948699525, + "grad_norm": 0.33893944587583974, + "learning_rate": 8.688364583207181e-06, + "loss": 0.5337, + "step": 3892 + }, + { + "epoch": 0.6393365220782953, + "grad_norm": 0.3315789532550877, + "learning_rate": 8.68819307511936e-06, + "loss": 0.5402, + "step": 3893 + }, + { + "epoch": 0.639500749286638, + "grad_norm": 0.4070342888416627, + "learning_rate": 8.68802152154364e-06, + "loss": 0.5243, + "step": 3894 + }, + { + "epoch": 0.6396649764949808, + "grad_norm": 0.30937112791655846, + "learning_rate": 8.687849922481888e-06, + "loss": 0.5215, + "step": 3895 + }, + { + "epoch": 0.6398292037033235, + "grad_norm": 0.38064616880789337, + "learning_rate": 8.687678277935965e-06, + "loss": 0.5338, + "step": 3896 + }, + { + "epoch": 0.6399934309116663, + "grad_norm": 0.47604423697526016, + "learning_rate": 8.687506587907736e-06, + "loss": 0.5311, + "step": 3897 + }, + { + "epoch": 0.640157658120009, + "grad_norm": 0.410893019484912, + "learning_rate": 8.687334852399064e-06, + "loss": 0.5234, + "step": 3898 + }, + { + "epoch": 0.6403218853283518, + "grad_norm": 0.41715319002167833, + "learning_rate": 8.687163071411817e-06, + "loss": 0.5223, + "step": 3899 + }, + { + "epoch": 0.6404861125366945, + "grad_norm": 0.3342019449167628, + "learning_rate": 8.686991244947861e-06, + "loss": 0.5272, + "step": 3900 + }, + { + "epoch": 0.6406503397450373, + "grad_norm": 0.30869298822600383, + "learning_rate": 8.68681937300906e-06, + "loss": 0.5458, + "step": 3901 + }, + { + "epoch": 0.64081456695338, + "grad_norm": 0.4454509392864555, + "learning_rate": 8.68664745559728e-06, + "loss": 0.5255, + "step": 3902 + }, + { + "epoch": 0.6409787941617228, + "grad_norm": 0.33893845187399224, + "learning_rate": 8.686475492714389e-06, + "loss": 0.5355, + "step": 3903 + }, + { + "epoch": 0.6411430213700655, + "grad_norm": 0.36988459118997447, + "learning_rate": 8.686303484362257e-06, + "loss": 0.5415, + "step": 3904 + }, + { + "epoch": 0.6413072485784083, + "grad_norm": 0.3705192814061367, + "learning_rate": 8.686131430542749e-06, + "loss": 0.5456, + "step": 3905 + }, + { + "epoch": 0.641471475786751, + "grad_norm": 0.35173078787427914, + "learning_rate": 8.685959331257736e-06, + "loss": 0.5217, + "step": 3906 + }, + { + "epoch": 0.6416357029950938, + "grad_norm": 0.2957226125413411, + "learning_rate": 8.685787186509084e-06, + "loss": 0.533, + "step": 3907 + }, + { + "epoch": 0.6417999302034365, + "grad_norm": 0.7762910142753189, + "learning_rate": 8.685614996298667e-06, + "loss": 0.536, + "step": 3908 + }, + { + "epoch": 0.6419641574117791, + "grad_norm": 0.32793594523929287, + "learning_rate": 8.685442760628354e-06, + "loss": 0.5203, + "step": 3909 + }, + { + "epoch": 0.6421283846201219, + "grad_norm": 0.29157387769874193, + "learning_rate": 8.685270479500013e-06, + "loss": 0.5282, + "step": 3910 + }, + { + "epoch": 0.6422926118284646, + "grad_norm": 0.35421199782762086, + "learning_rate": 8.685098152915517e-06, + "loss": 0.5606, + "step": 3911 + }, + { + "epoch": 0.6424568390368074, + "grad_norm": 0.3948593371824962, + "learning_rate": 8.684925780876737e-06, + "loss": 0.5283, + "step": 3912 + }, + { + "epoch": 0.6426210662451501, + "grad_norm": 0.3382510623900333, + "learning_rate": 8.684753363385547e-06, + "loss": 0.5291, + "step": 3913 + }, + { + "epoch": 0.6427852934534929, + "grad_norm": 0.34658199700209985, + "learning_rate": 8.684580900443818e-06, + "loss": 0.5205, + "step": 3914 + }, + { + "epoch": 0.6429495206618356, + "grad_norm": 0.347988158080561, + "learning_rate": 8.684408392053423e-06, + "loss": 0.5284, + "step": 3915 + }, + { + "epoch": 0.6431137478701784, + "grad_norm": 0.3289779514425289, + "learning_rate": 8.684235838216237e-06, + "loss": 0.5491, + "step": 3916 + }, + { + "epoch": 0.6432779750785211, + "grad_norm": 0.417822963644472, + "learning_rate": 8.684063238934131e-06, + "loss": 0.5272, + "step": 3917 + }, + { + "epoch": 0.6434422022868639, + "grad_norm": 0.3819259777641144, + "learning_rate": 8.683890594208982e-06, + "loss": 0.5391, + "step": 3918 + }, + { + "epoch": 0.6436064294952066, + "grad_norm": 0.3245771331101448, + "learning_rate": 8.683717904042665e-06, + "loss": 0.5288, + "step": 3919 + }, + { + "epoch": 0.6437706567035494, + "grad_norm": 0.3232520982859475, + "learning_rate": 8.683545168437057e-06, + "loss": 0.546, + "step": 3920 + }, + { + "epoch": 0.6439348839118921, + "grad_norm": 0.3237295073427335, + "learning_rate": 8.683372387394031e-06, + "loss": 0.5479, + "step": 3921 + }, + { + "epoch": 0.6440991111202349, + "grad_norm": 0.3807149827338743, + "learning_rate": 8.683199560915464e-06, + "loss": 0.5288, + "step": 3922 + }, + { + "epoch": 0.6442633383285776, + "grad_norm": 0.3063169142380427, + "learning_rate": 8.683026689003236e-06, + "loss": 0.5211, + "step": 3923 + }, + { + "epoch": 0.6444275655369204, + "grad_norm": 0.43993293159188923, + "learning_rate": 8.682853771659222e-06, + "loss": 0.5373, + "step": 3924 + }, + { + "epoch": 0.6445917927452631, + "grad_norm": 0.3216930168200892, + "learning_rate": 8.6826808088853e-06, + "loss": 0.5395, + "step": 3925 + }, + { + "epoch": 0.6447560199536058, + "grad_norm": 0.45364275257594294, + "learning_rate": 8.68250780068335e-06, + "loss": 0.5586, + "step": 3926 + }, + { + "epoch": 0.6449202471619485, + "grad_norm": 0.34204207855157476, + "learning_rate": 8.682334747055251e-06, + "loss": 0.5261, + "step": 3927 + }, + { + "epoch": 0.6450844743702913, + "grad_norm": 0.47713121914216133, + "learning_rate": 8.682161648002881e-06, + "loss": 0.5333, + "step": 3928 + }, + { + "epoch": 0.645248701578634, + "grad_norm": 0.4187590920449625, + "learning_rate": 8.681988503528119e-06, + "loss": 0.5417, + "step": 3929 + }, + { + "epoch": 0.6454129287869768, + "grad_norm": 0.47497527294790776, + "learning_rate": 8.68181531363285e-06, + "loss": 0.5462, + "step": 3930 + }, + { + "epoch": 0.6455771559953195, + "grad_norm": 0.3535004979395706, + "learning_rate": 8.681642078318952e-06, + "loss": 0.546, + "step": 3931 + }, + { + "epoch": 0.6457413832036623, + "grad_norm": 0.359940360159505, + "learning_rate": 8.681468797588304e-06, + "loss": 0.5273, + "step": 3932 + }, + { + "epoch": 0.645905610412005, + "grad_norm": 0.3685906434372928, + "learning_rate": 8.681295471442793e-06, + "loss": 0.5349, + "step": 3933 + }, + { + "epoch": 0.6460698376203478, + "grad_norm": 0.4558508535400147, + "learning_rate": 8.6811220998843e-06, + "loss": 0.5183, + "step": 3934 + }, + { + "epoch": 0.6462340648286905, + "grad_norm": 0.44492975761566206, + "learning_rate": 8.680948682914706e-06, + "loss": 0.5257, + "step": 3935 + }, + { + "epoch": 0.6463982920370333, + "grad_norm": 0.3440786536061186, + "learning_rate": 8.680775220535897e-06, + "loss": 0.5169, + "step": 3936 + }, + { + "epoch": 0.646562519245376, + "grad_norm": 0.41451230027792896, + "learning_rate": 8.680601712749755e-06, + "loss": 0.536, + "step": 3937 + }, + { + "epoch": 0.6467267464537187, + "grad_norm": 0.34734955497252606, + "learning_rate": 8.680428159558167e-06, + "loss": 0.563, + "step": 3938 + }, + { + "epoch": 0.6468909736620615, + "grad_norm": 0.4474330702145904, + "learning_rate": 8.680254560963014e-06, + "loss": 0.5298, + "step": 3939 + }, + { + "epoch": 0.6470552008704042, + "grad_norm": 0.4773063602198883, + "learning_rate": 8.680080916966183e-06, + "loss": 0.5524, + "step": 3940 + }, + { + "epoch": 0.647219428078747, + "grad_norm": 0.3754853566580427, + "learning_rate": 8.679907227569562e-06, + "loss": 0.5538, + "step": 3941 + }, + { + "epoch": 0.6473836552870897, + "grad_norm": 0.3432933716810742, + "learning_rate": 8.679733492775035e-06, + "loss": 0.5241, + "step": 3942 + }, + { + "epoch": 0.6475478824954324, + "grad_norm": 0.4450073864802975, + "learning_rate": 8.679559712584492e-06, + "loss": 0.5283, + "step": 3943 + }, + { + "epoch": 0.6477121097037751, + "grad_norm": 0.3940289611879266, + "learning_rate": 8.679385886999818e-06, + "loss": 0.543, + "step": 3944 + }, + { + "epoch": 0.6478763369121179, + "grad_norm": 0.31865120983857287, + "learning_rate": 8.6792120160229e-06, + "loss": 0.5276, + "step": 3945 + }, + { + "epoch": 0.6480405641204606, + "grad_norm": 0.37514316444909035, + "learning_rate": 8.679038099655629e-06, + "loss": 0.5283, + "step": 3946 + }, + { + "epoch": 0.6482047913288034, + "grad_norm": 0.38267915121977825, + "learning_rate": 8.678864137899892e-06, + "loss": 0.5355, + "step": 3947 + }, + { + "epoch": 0.6483690185371461, + "grad_norm": 0.3566722105246468, + "learning_rate": 8.678690130757579e-06, + "loss": 0.5222, + "step": 3948 + }, + { + "epoch": 0.6485332457454889, + "grad_norm": 0.36829644006372725, + "learning_rate": 8.67851607823058e-06, + "loss": 0.5476, + "step": 3949 + }, + { + "epoch": 0.6486974729538316, + "grad_norm": 0.4250247596289241, + "learning_rate": 8.678341980320785e-06, + "loss": 0.5248, + "step": 3950 + }, + { + "epoch": 0.6488617001621744, + "grad_norm": 0.35853628855086594, + "learning_rate": 8.678167837030085e-06, + "loss": 0.5204, + "step": 3951 + }, + { + "epoch": 0.6490259273705171, + "grad_norm": 0.3992651417520165, + "learning_rate": 8.677993648360371e-06, + "loss": 0.5544, + "step": 3952 + }, + { + "epoch": 0.6491901545788599, + "grad_norm": 0.3157472097393151, + "learning_rate": 8.677819414313537e-06, + "loss": 0.52, + "step": 3953 + }, + { + "epoch": 0.6493543817872026, + "grad_norm": 0.35866185315676313, + "learning_rate": 8.677645134891472e-06, + "loss": 0.5252, + "step": 3954 + }, + { + "epoch": 0.6495186089955454, + "grad_norm": 0.3720242063110086, + "learning_rate": 8.677470810096072e-06, + "loss": 0.5543, + "step": 3955 + }, + { + "epoch": 0.6496828362038881, + "grad_norm": 0.33452108930498453, + "learning_rate": 8.677296439929228e-06, + "loss": 0.5281, + "step": 3956 + }, + { + "epoch": 0.6498470634122309, + "grad_norm": 0.36791969880247277, + "learning_rate": 8.677122024392837e-06, + "loss": 0.543, + "step": 3957 + }, + { + "epoch": 0.6500112906205736, + "grad_norm": 0.4421337612938926, + "learning_rate": 8.676947563488789e-06, + "loss": 0.5399, + "step": 3958 + }, + { + "epoch": 0.6501755178289164, + "grad_norm": 0.317527166513708, + "learning_rate": 8.67677305721898e-06, + "loss": 0.5314, + "step": 3959 + }, + { + "epoch": 0.650339745037259, + "grad_norm": 1.3753223596720887, + "learning_rate": 8.676598505585308e-06, + "loss": 0.5365, + "step": 3960 + }, + { + "epoch": 0.6505039722456017, + "grad_norm": 0.35311319928316154, + "learning_rate": 8.676423908589667e-06, + "loss": 0.5168, + "step": 3961 + }, + { + "epoch": 0.6506681994539445, + "grad_norm": 0.36668233413529516, + "learning_rate": 8.676249266233952e-06, + "loss": 0.5094, + "step": 3962 + }, + { + "epoch": 0.6508324266622872, + "grad_norm": 0.34926006262255677, + "learning_rate": 8.676074578520061e-06, + "loss": 0.5266, + "step": 3963 + }, + { + "epoch": 0.65099665387063, + "grad_norm": 0.3230850253272737, + "learning_rate": 8.675899845449892e-06, + "loss": 0.5222, + "step": 3964 + }, + { + "epoch": 0.6511608810789727, + "grad_norm": 0.3178205063481531, + "learning_rate": 8.675725067025343e-06, + "loss": 0.5334, + "step": 3965 + }, + { + "epoch": 0.6513251082873155, + "grad_norm": 0.3604238765156544, + "learning_rate": 8.67555024324831e-06, + "loss": 0.5571, + "step": 3966 + }, + { + "epoch": 0.6514893354956582, + "grad_norm": 0.4148764805152981, + "learning_rate": 8.675375374120695e-06, + "loss": 0.5082, + "step": 3967 + }, + { + "epoch": 0.651653562704001, + "grad_norm": 0.45696787695365004, + "learning_rate": 8.675200459644393e-06, + "loss": 0.5309, + "step": 3968 + }, + { + "epoch": 0.6518177899123437, + "grad_norm": 0.32008682200636956, + "learning_rate": 8.675025499821309e-06, + "loss": 0.5424, + "step": 3969 + }, + { + "epoch": 0.6519820171206865, + "grad_norm": 0.36765540872038655, + "learning_rate": 8.674850494653338e-06, + "loss": 0.5279, + "step": 3970 + }, + { + "epoch": 0.6521462443290292, + "grad_norm": 0.4053643104635402, + "learning_rate": 8.674675444142385e-06, + "loss": 0.5384, + "step": 3971 + }, + { + "epoch": 0.652310471537372, + "grad_norm": 0.39808760881236877, + "learning_rate": 8.674500348290349e-06, + "loss": 0.5518, + "step": 3972 + }, + { + "epoch": 0.6524746987457147, + "grad_norm": 0.44611158326975175, + "learning_rate": 8.674325207099131e-06, + "loss": 0.5523, + "step": 3973 + }, + { + "epoch": 0.6526389259540575, + "grad_norm": 0.3183684377023937, + "learning_rate": 8.674150020570635e-06, + "loss": 0.5302, + "step": 3974 + }, + { + "epoch": 0.6528031531624002, + "grad_norm": 0.34196266717367213, + "learning_rate": 8.673974788706762e-06, + "loss": 0.5159, + "step": 3975 + }, + { + "epoch": 0.652967380370743, + "grad_norm": 0.2900554832151464, + "learning_rate": 8.673799511509418e-06, + "loss": 0.5334, + "step": 3976 + }, + { + "epoch": 0.6531316075790856, + "grad_norm": 0.3386359848323148, + "learning_rate": 8.673624188980503e-06, + "loss": 0.5537, + "step": 3977 + }, + { + "epoch": 0.6532958347874284, + "grad_norm": 0.39546656599547636, + "learning_rate": 8.673448821121923e-06, + "loss": 0.5237, + "step": 3978 + }, + { + "epoch": 0.6534600619957711, + "grad_norm": 0.3317443593450704, + "learning_rate": 8.673273407935584e-06, + "loss": 0.5053, + "step": 3979 + }, + { + "epoch": 0.6536242892041139, + "grad_norm": 0.418188518330638, + "learning_rate": 8.67309794942339e-06, + "loss": 0.5492, + "step": 3980 + }, + { + "epoch": 0.6537885164124566, + "grad_norm": 0.369270618448298, + "learning_rate": 8.672922445587245e-06, + "loss": 0.5104, + "step": 3981 + }, + { + "epoch": 0.6539527436207994, + "grad_norm": 0.38664898700693945, + "learning_rate": 8.672746896429058e-06, + "loss": 0.5316, + "step": 3982 + }, + { + "epoch": 0.6541169708291421, + "grad_norm": 0.33549576162982075, + "learning_rate": 8.672571301950733e-06, + "loss": 0.5425, + "step": 3983 + }, + { + "epoch": 0.6542811980374849, + "grad_norm": 0.40001762129435364, + "learning_rate": 8.67239566215418e-06, + "loss": 0.5307, + "step": 3984 + }, + { + "epoch": 0.6544454252458276, + "grad_norm": 0.3913057314551714, + "learning_rate": 8.672219977041304e-06, + "loss": 0.5247, + "step": 3985 + }, + { + "epoch": 0.6546096524541704, + "grad_norm": 0.31883678483466016, + "learning_rate": 8.672044246614013e-06, + "loss": 0.5077, + "step": 3986 + }, + { + "epoch": 0.6547738796625131, + "grad_norm": 0.29753011492717113, + "learning_rate": 8.67186847087422e-06, + "loss": 0.5299, + "step": 3987 + }, + { + "epoch": 0.6549381068708559, + "grad_norm": 0.30891222511550015, + "learning_rate": 8.671692649823828e-06, + "loss": 0.5183, + "step": 3988 + }, + { + "epoch": 0.6551023340791986, + "grad_norm": 0.36057053254078036, + "learning_rate": 8.671516783464751e-06, + "loss": 0.5522, + "step": 3989 + }, + { + "epoch": 0.6552665612875413, + "grad_norm": 0.30740628581649376, + "learning_rate": 8.671340871798895e-06, + "loss": 0.5421, + "step": 3990 + }, + { + "epoch": 0.6554307884958841, + "grad_norm": 0.3390539225771164, + "learning_rate": 8.671164914828174e-06, + "loss": 0.5367, + "step": 3991 + }, + { + "epoch": 0.6555950157042268, + "grad_norm": 0.3027722746444447, + "learning_rate": 8.670988912554501e-06, + "loss": 0.5447, + "step": 3992 + }, + { + "epoch": 0.6557592429125696, + "grad_norm": 0.8625813728280358, + "learning_rate": 8.670812864979783e-06, + "loss": 0.5196, + "step": 3993 + }, + { + "epoch": 0.6559234701209122, + "grad_norm": 0.4270267910110388, + "learning_rate": 8.670636772105932e-06, + "loss": 0.5183, + "step": 3994 + }, + { + "epoch": 0.656087697329255, + "grad_norm": 0.30519084206610886, + "learning_rate": 8.670460633934864e-06, + "loss": 0.5346, + "step": 3995 + }, + { + "epoch": 0.6562519245375977, + "grad_norm": 0.3523099685252621, + "learning_rate": 8.67028445046849e-06, + "loss": 0.5239, + "step": 3996 + }, + { + "epoch": 0.6564161517459405, + "grad_norm": 0.31211800772194953, + "learning_rate": 8.670108221708725e-06, + "loss": 0.5328, + "step": 3997 + }, + { + "epoch": 0.6565803789542832, + "grad_norm": 0.4759892153046304, + "learning_rate": 8.669931947657481e-06, + "loss": 0.5338, + "step": 3998 + }, + { + "epoch": 0.656744606162626, + "grad_norm": 0.4637487119659366, + "learning_rate": 8.669755628316673e-06, + "loss": 0.5278, + "step": 3999 + }, + { + "epoch": 0.6569088333709687, + "grad_norm": 0.32994096949419766, + "learning_rate": 8.669579263688216e-06, + "loss": 0.5261, + "step": 4000 + }, + { + "epoch": 0.6570730605793115, + "grad_norm": 0.3356383750274373, + "learning_rate": 8.669402853774026e-06, + "loss": 0.5187, + "step": 4001 + }, + { + "epoch": 0.6572372877876542, + "grad_norm": 0.6717913507630792, + "learning_rate": 8.66922639857602e-06, + "loss": 0.5238, + "step": 4002 + }, + { + "epoch": 0.657401514995997, + "grad_norm": 0.3221690604002636, + "learning_rate": 8.669049898096114e-06, + "loss": 0.5559, + "step": 4003 + }, + { + "epoch": 0.6575657422043397, + "grad_norm": 0.3081158006782644, + "learning_rate": 8.668873352336221e-06, + "loss": 0.5057, + "step": 4004 + }, + { + "epoch": 0.6577299694126825, + "grad_norm": 0.32070325867119254, + "learning_rate": 8.668696761298266e-06, + "loss": 0.5211, + "step": 4005 + }, + { + "epoch": 0.6578941966210252, + "grad_norm": 0.31266643294309454, + "learning_rate": 8.66852012498416e-06, + "loss": 0.5343, + "step": 4006 + }, + { + "epoch": 0.658058423829368, + "grad_norm": 0.2863747219818306, + "learning_rate": 8.668343443395824e-06, + "loss": 0.5046, + "step": 4007 + }, + { + "epoch": 0.6582226510377107, + "grad_norm": 0.659439189835058, + "learning_rate": 8.668166716535179e-06, + "loss": 0.5289, + "step": 4008 + }, + { + "epoch": 0.6583868782460535, + "grad_norm": 0.34529192696537236, + "learning_rate": 8.66798994440414e-06, + "loss": 0.5471, + "step": 4009 + }, + { + "epoch": 0.6585511054543961, + "grad_norm": 0.2888681021457298, + "learning_rate": 8.667813127004631e-06, + "loss": 0.5263, + "step": 4010 + }, + { + "epoch": 0.6587153326627389, + "grad_norm": 0.32980455433855144, + "learning_rate": 8.667636264338571e-06, + "loss": 0.5192, + "step": 4011 + }, + { + "epoch": 0.6588795598710816, + "grad_norm": 0.3485846589730208, + "learning_rate": 8.66745935640788e-06, + "loss": 0.5181, + "step": 4012 + }, + { + "epoch": 0.6590437870794243, + "grad_norm": 0.31578509817687833, + "learning_rate": 8.667282403214481e-06, + "loss": 0.5237, + "step": 4013 + }, + { + "epoch": 0.6592080142877671, + "grad_norm": 0.3229530274985868, + "learning_rate": 8.667105404760295e-06, + "loss": 0.5371, + "step": 4014 + }, + { + "epoch": 0.6593722414961098, + "grad_norm": 0.30354742074792096, + "learning_rate": 8.666928361047245e-06, + "loss": 0.5191, + "step": 4015 + }, + { + "epoch": 0.6595364687044526, + "grad_norm": 0.3509968909891949, + "learning_rate": 8.666751272077251e-06, + "loss": 0.5442, + "step": 4016 + }, + { + "epoch": 0.6597006959127953, + "grad_norm": 0.3153934088828136, + "learning_rate": 8.66657413785224e-06, + "loss": 0.5269, + "step": 4017 + }, + { + "epoch": 0.6598649231211381, + "grad_norm": 0.3170905924730358, + "learning_rate": 8.666396958374135e-06, + "loss": 0.5015, + "step": 4018 + }, + { + "epoch": 0.6600291503294808, + "grad_norm": 0.3199674654985588, + "learning_rate": 8.66621973364486e-06, + "loss": 0.5137, + "step": 4019 + }, + { + "epoch": 0.6601933775378236, + "grad_norm": 0.3528630549071056, + "learning_rate": 8.666042463666338e-06, + "loss": 0.5343, + "step": 4020 + }, + { + "epoch": 0.6603576047461663, + "grad_norm": 0.4338982051253656, + "learning_rate": 8.665865148440497e-06, + "loss": 0.5273, + "step": 4021 + }, + { + "epoch": 0.6605218319545091, + "grad_norm": 0.4618783895698222, + "learning_rate": 8.665687787969262e-06, + "loss": 0.5149, + "step": 4022 + }, + { + "epoch": 0.6606860591628518, + "grad_norm": 0.3659376755669406, + "learning_rate": 8.66551038225456e-06, + "loss": 0.5292, + "step": 4023 + }, + { + "epoch": 0.6608502863711946, + "grad_norm": 0.4078323654587953, + "learning_rate": 8.665332931298317e-06, + "loss": 0.5056, + "step": 4024 + }, + { + "epoch": 0.6610145135795373, + "grad_norm": 0.3036588935862725, + "learning_rate": 8.66515543510246e-06, + "loss": 0.5361, + "step": 4025 + }, + { + "epoch": 0.6611787407878801, + "grad_norm": 0.33300491685531275, + "learning_rate": 8.664977893668914e-06, + "loss": 0.5145, + "step": 4026 + }, + { + "epoch": 0.6613429679962227, + "grad_norm": 0.3164346698371754, + "learning_rate": 8.664800306999613e-06, + "loss": 0.5422, + "step": 4027 + }, + { + "epoch": 0.6615071952045655, + "grad_norm": 0.5384011020648434, + "learning_rate": 8.664622675096482e-06, + "loss": 0.5228, + "step": 4028 + }, + { + "epoch": 0.6616714224129082, + "grad_norm": 0.36002664128745665, + "learning_rate": 8.664444997961454e-06, + "loss": 0.5283, + "step": 4029 + }, + { + "epoch": 0.661835649621251, + "grad_norm": 0.34992110750106104, + "learning_rate": 8.664267275596453e-06, + "loss": 0.5273, + "step": 4030 + }, + { + "epoch": 0.6619998768295937, + "grad_norm": 0.33045355618722133, + "learning_rate": 8.664089508003413e-06, + "loss": 0.5309, + "step": 4031 + }, + { + "epoch": 0.6621641040379365, + "grad_norm": 0.3008997087087797, + "learning_rate": 8.663911695184265e-06, + "loss": 0.5214, + "step": 4032 + }, + { + "epoch": 0.6623283312462792, + "grad_norm": 0.2974714182775004, + "learning_rate": 8.663733837140939e-06, + "loss": 0.5514, + "step": 4033 + }, + { + "epoch": 0.662492558454622, + "grad_norm": 0.32714662729303373, + "learning_rate": 8.663555933875366e-06, + "loss": 0.5474, + "step": 4034 + }, + { + "epoch": 0.6626567856629647, + "grad_norm": 0.3179914587930655, + "learning_rate": 8.663377985389478e-06, + "loss": 0.5253, + "step": 4035 + }, + { + "epoch": 0.6628210128713075, + "grad_norm": 0.34607268067750174, + "learning_rate": 8.663199991685212e-06, + "loss": 0.5385, + "step": 4036 + }, + { + "epoch": 0.6629852400796502, + "grad_norm": 0.38888766148880033, + "learning_rate": 8.663021952764496e-06, + "loss": 0.5179, + "step": 4037 + }, + { + "epoch": 0.663149467287993, + "grad_norm": 0.34136523068815844, + "learning_rate": 8.662843868629267e-06, + "loss": 0.5359, + "step": 4038 + }, + { + "epoch": 0.6633136944963357, + "grad_norm": 0.36005173204058966, + "learning_rate": 8.662665739281458e-06, + "loss": 0.5454, + "step": 4039 + }, + { + "epoch": 0.6634779217046785, + "grad_norm": 0.3518704195072764, + "learning_rate": 8.662487564723002e-06, + "loss": 0.5415, + "step": 4040 + }, + { + "epoch": 0.6636421489130212, + "grad_norm": 0.3015583584370122, + "learning_rate": 8.662309344955838e-06, + "loss": 0.5281, + "step": 4041 + }, + { + "epoch": 0.663806376121364, + "grad_norm": 0.2901950538717286, + "learning_rate": 8.662131079981897e-06, + "loss": 0.5156, + "step": 4042 + }, + { + "epoch": 0.6639706033297067, + "grad_norm": 0.3280991639172377, + "learning_rate": 8.661952769803119e-06, + "loss": 0.5402, + "step": 4043 + }, + { + "epoch": 0.6641348305380493, + "grad_norm": 0.310899012651114, + "learning_rate": 8.661774414421438e-06, + "loss": 0.5298, + "step": 4044 + }, + { + "epoch": 0.6642990577463921, + "grad_norm": 0.34032542764121404, + "learning_rate": 8.661596013838793e-06, + "loss": 0.5329, + "step": 4045 + }, + { + "epoch": 0.6644632849547348, + "grad_norm": 0.3048173603991897, + "learning_rate": 8.66141756805712e-06, + "loss": 0.5272, + "step": 4046 + }, + { + "epoch": 0.6646275121630776, + "grad_norm": 0.3861615964125312, + "learning_rate": 8.661239077078358e-06, + "loss": 0.5392, + "step": 4047 + }, + { + "epoch": 0.6647917393714203, + "grad_norm": 0.37328731816957406, + "learning_rate": 8.661060540904447e-06, + "loss": 0.5333, + "step": 4048 + }, + { + "epoch": 0.6649559665797631, + "grad_norm": 0.33058996782192235, + "learning_rate": 8.660881959537324e-06, + "loss": 0.533, + "step": 4049 + }, + { + "epoch": 0.6651201937881058, + "grad_norm": 0.3452711619202778, + "learning_rate": 8.66070333297893e-06, + "loss": 0.5183, + "step": 4050 + }, + { + "epoch": 0.6652844209964486, + "grad_norm": 0.3659269032483274, + "learning_rate": 8.660524661231202e-06, + "loss": 0.5348, + "step": 4051 + }, + { + "epoch": 0.6654486482047913, + "grad_norm": 0.31385515871837055, + "learning_rate": 8.660345944296083e-06, + "loss": 0.5471, + "step": 4052 + }, + { + "epoch": 0.6656128754131341, + "grad_norm": 0.3140491300896386, + "learning_rate": 8.660167182175515e-06, + "loss": 0.5315, + "step": 4053 + }, + { + "epoch": 0.6657771026214768, + "grad_norm": 0.30202244601958633, + "learning_rate": 8.659988374871436e-06, + "loss": 0.5386, + "step": 4054 + }, + { + "epoch": 0.6659413298298196, + "grad_norm": 0.3905089298298893, + "learning_rate": 8.659809522385794e-06, + "loss": 0.5208, + "step": 4055 + }, + { + "epoch": 0.6661055570381623, + "grad_norm": 0.30906324457562473, + "learning_rate": 8.659630624720525e-06, + "loss": 0.5518, + "step": 4056 + }, + { + "epoch": 0.6662697842465051, + "grad_norm": 0.4354062962749167, + "learning_rate": 8.659451681877577e-06, + "loss": 0.5515, + "step": 4057 + }, + { + "epoch": 0.6664340114548478, + "grad_norm": 0.29273028085860114, + "learning_rate": 8.65927269385889e-06, + "loss": 0.514, + "step": 4058 + }, + { + "epoch": 0.6665982386631906, + "grad_norm": 0.42766559142493105, + "learning_rate": 8.659093660666411e-06, + "loss": 0.5504, + "step": 4059 + }, + { + "epoch": 0.6667624658715333, + "grad_norm": 0.4006447442117922, + "learning_rate": 8.658914582302082e-06, + "loss": 0.5121, + "step": 4060 + }, + { + "epoch": 0.666926693079876, + "grad_norm": 0.3034528462862328, + "learning_rate": 8.658735458767848e-06, + "loss": 0.5228, + "step": 4061 + }, + { + "epoch": 0.6670909202882187, + "grad_norm": 0.4233032717301708, + "learning_rate": 8.658556290065655e-06, + "loss": 0.5336, + "step": 4062 + }, + { + "epoch": 0.6672551474965615, + "grad_norm": 0.45100471286674626, + "learning_rate": 8.65837707619745e-06, + "loss": 0.5082, + "step": 4063 + }, + { + "epoch": 0.6674193747049042, + "grad_norm": 1.0766468981490969, + "learning_rate": 8.658197817165181e-06, + "loss": 0.5224, + "step": 4064 + }, + { + "epoch": 0.667583601913247, + "grad_norm": 0.33427942116827647, + "learning_rate": 8.658018512970788e-06, + "loss": 0.5121, + "step": 4065 + }, + { + "epoch": 0.6677478291215897, + "grad_norm": 0.3459647160115218, + "learning_rate": 8.657839163616227e-06, + "loss": 0.5368, + "step": 4066 + }, + { + "epoch": 0.6679120563299324, + "grad_norm": 0.36718583835824414, + "learning_rate": 8.657659769103439e-06, + "loss": 0.5273, + "step": 4067 + }, + { + "epoch": 0.6680762835382752, + "grad_norm": 0.3013228326013261, + "learning_rate": 8.657480329434378e-06, + "loss": 0.561, + "step": 4068 + }, + { + "epoch": 0.6682405107466179, + "grad_norm": 0.31179364960946604, + "learning_rate": 8.657300844610988e-06, + "loss": 0.5286, + "step": 4069 + }, + { + "epoch": 0.6684047379549607, + "grad_norm": 0.37872253313407145, + "learning_rate": 8.657121314635221e-06, + "loss": 0.5431, + "step": 4070 + }, + { + "epoch": 0.6685689651633034, + "grad_norm": 0.33270200758353, + "learning_rate": 8.656941739509027e-06, + "loss": 0.5331, + "step": 4071 + }, + { + "epoch": 0.6687331923716462, + "grad_norm": 0.4963399150092201, + "learning_rate": 8.656762119234356e-06, + "loss": 0.5292, + "step": 4072 + }, + { + "epoch": 0.6688974195799889, + "grad_norm": 0.29276600299262023, + "learning_rate": 8.656582453813157e-06, + "loss": 0.5015, + "step": 4073 + }, + { + "epoch": 0.6690616467883317, + "grad_norm": 0.3227354735508351, + "learning_rate": 8.656402743247385e-06, + "loss": 0.5265, + "step": 4074 + }, + { + "epoch": 0.6692258739966744, + "grad_norm": 0.3580768588600473, + "learning_rate": 8.65622298753899e-06, + "loss": 0.5258, + "step": 4075 + }, + { + "epoch": 0.6693901012050172, + "grad_norm": 0.3428832595500862, + "learning_rate": 8.656043186689923e-06, + "loss": 0.5283, + "step": 4076 + }, + { + "epoch": 0.6695543284133599, + "grad_norm": 0.43434915808781277, + "learning_rate": 8.655863340702139e-06, + "loss": 0.5317, + "step": 4077 + }, + { + "epoch": 0.6697185556217026, + "grad_norm": 0.3118494196364416, + "learning_rate": 8.65568344957759e-06, + "loss": 0.5465, + "step": 4078 + }, + { + "epoch": 0.6698827828300453, + "grad_norm": 0.281140918903688, + "learning_rate": 8.65550351331823e-06, + "loss": 0.5173, + "step": 4079 + }, + { + "epoch": 0.6700470100383881, + "grad_norm": 0.31825315533295956, + "learning_rate": 8.655323531926013e-06, + "loss": 0.5469, + "step": 4080 + }, + { + "epoch": 0.6702112372467308, + "grad_norm": 0.45548632455563126, + "learning_rate": 8.655143505402893e-06, + "loss": 0.5542, + "step": 4081 + }, + { + "epoch": 0.6703754644550736, + "grad_norm": 0.3272018942287102, + "learning_rate": 8.654963433750829e-06, + "loss": 0.5121, + "step": 4082 + }, + { + "epoch": 0.6705396916634163, + "grad_norm": 0.29951992766048624, + "learning_rate": 8.654783316971773e-06, + "loss": 0.5263, + "step": 4083 + }, + { + "epoch": 0.6707039188717591, + "grad_norm": 0.3251815306475271, + "learning_rate": 8.654603155067682e-06, + "loss": 0.5348, + "step": 4084 + }, + { + "epoch": 0.6708681460801018, + "grad_norm": 0.3957221610064905, + "learning_rate": 8.654422948040515e-06, + "loss": 0.5044, + "step": 4085 + }, + { + "epoch": 0.6710323732884446, + "grad_norm": 0.30994219622654656, + "learning_rate": 8.654242695892224e-06, + "loss": 0.5088, + "step": 4086 + }, + { + "epoch": 0.6711966004967873, + "grad_norm": 0.3344417178331218, + "learning_rate": 8.654062398624772e-06, + "loss": 0.517, + "step": 4087 + }, + { + "epoch": 0.6713608277051301, + "grad_norm": 0.3001680261537345, + "learning_rate": 8.653882056240116e-06, + "loss": 0.5182, + "step": 4088 + }, + { + "epoch": 0.6715250549134728, + "grad_norm": 0.3372489161830878, + "learning_rate": 8.653701668740214e-06, + "loss": 0.524, + "step": 4089 + }, + { + "epoch": 0.6716892821218156, + "grad_norm": 0.32761295082448727, + "learning_rate": 8.653521236127023e-06, + "loss": 0.5537, + "step": 4090 + }, + { + "epoch": 0.6718535093301583, + "grad_norm": 0.45024104103317986, + "learning_rate": 8.653340758402508e-06, + "loss": 0.5146, + "step": 4091 + }, + { + "epoch": 0.672017736538501, + "grad_norm": 0.3375009611373258, + "learning_rate": 8.653160235568622e-06, + "loss": 0.515, + "step": 4092 + }, + { + "epoch": 0.6721819637468438, + "grad_norm": 0.30487466834246935, + "learning_rate": 8.652979667627333e-06, + "loss": 0.507, + "step": 4093 + }, + { + "epoch": 0.6723461909551866, + "grad_norm": 0.3390407856209974, + "learning_rate": 8.6527990545806e-06, + "loss": 0.5235, + "step": 4094 + }, + { + "epoch": 0.6725104181635292, + "grad_norm": 0.44647654142082505, + "learning_rate": 8.65261839643038e-06, + "loss": 0.5302, + "step": 4095 + }, + { + "epoch": 0.6726746453718719, + "grad_norm": 0.4409501511793635, + "learning_rate": 8.65243769317864e-06, + "loss": 0.5076, + "step": 4096 + }, + { + "epoch": 0.6728388725802147, + "grad_norm": 0.3545266642369537, + "learning_rate": 8.652256944827341e-06, + "loss": 0.5139, + "step": 4097 + }, + { + "epoch": 0.6730030997885574, + "grad_norm": 0.3919734598018783, + "learning_rate": 8.652076151378446e-06, + "loss": 0.5267, + "step": 4098 + }, + { + "epoch": 0.6731673269969002, + "grad_norm": 0.38540276913674354, + "learning_rate": 8.65189531283392e-06, + "loss": 0.5229, + "step": 4099 + }, + { + "epoch": 0.6733315542052429, + "grad_norm": 0.2910524183414479, + "learning_rate": 8.651714429195725e-06, + "loss": 0.542, + "step": 4100 + }, + { + "epoch": 0.6734957814135857, + "grad_norm": 0.43159118996051093, + "learning_rate": 8.651533500465828e-06, + "loss": 0.5326, + "step": 4101 + }, + { + "epoch": 0.6736600086219284, + "grad_norm": 0.33738359200446183, + "learning_rate": 8.651352526646191e-06, + "loss": 0.5403, + "step": 4102 + }, + { + "epoch": 0.6738242358302712, + "grad_norm": 0.3315680073734393, + "learning_rate": 8.651171507738783e-06, + "loss": 0.516, + "step": 4103 + }, + { + "epoch": 0.6739884630386139, + "grad_norm": 0.3589770475880881, + "learning_rate": 8.650990443745567e-06, + "loss": 0.5414, + "step": 4104 + }, + { + "epoch": 0.6741526902469567, + "grad_norm": 0.35588571798248414, + "learning_rate": 8.65080933466851e-06, + "loss": 0.5215, + "step": 4105 + }, + { + "epoch": 0.6743169174552994, + "grad_norm": 0.3033052472495628, + "learning_rate": 8.65062818050958e-06, + "loss": 0.5435, + "step": 4106 + }, + { + "epoch": 0.6744811446636422, + "grad_norm": 0.6316488676677522, + "learning_rate": 8.650446981270744e-06, + "loss": 0.5224, + "step": 4107 + }, + { + "epoch": 0.6746453718719849, + "grad_norm": 0.3855301938414688, + "learning_rate": 8.650265736953972e-06, + "loss": 0.54, + "step": 4108 + }, + { + "epoch": 0.6748095990803277, + "grad_norm": 0.32885699707547555, + "learning_rate": 8.65008444756123e-06, + "loss": 0.5318, + "step": 4109 + }, + { + "epoch": 0.6749738262886704, + "grad_norm": 0.35338886666359093, + "learning_rate": 8.649903113094487e-06, + "loss": 0.528, + "step": 4110 + }, + { + "epoch": 0.6751380534970132, + "grad_norm": 0.36097457735813854, + "learning_rate": 8.649721733555715e-06, + "loss": 0.5244, + "step": 4111 + }, + { + "epoch": 0.6753022807053558, + "grad_norm": 0.31060975134752294, + "learning_rate": 8.64954030894688e-06, + "loss": 0.5215, + "step": 4112 + }, + { + "epoch": 0.6754665079136986, + "grad_norm": 0.3011457205831512, + "learning_rate": 8.649358839269955e-06, + "loss": 0.508, + "step": 4113 + }, + { + "epoch": 0.6756307351220413, + "grad_norm": 0.3418984691313553, + "learning_rate": 8.649177324526913e-06, + "loss": 0.5235, + "step": 4114 + }, + { + "epoch": 0.675794962330384, + "grad_norm": 0.43582773294360444, + "learning_rate": 8.64899576471972e-06, + "loss": 0.5541, + "step": 4115 + }, + { + "epoch": 0.6759591895387268, + "grad_norm": 0.35149410264601055, + "learning_rate": 8.648814159850354e-06, + "loss": 0.5347, + "step": 4116 + }, + { + "epoch": 0.6761234167470695, + "grad_norm": 0.368562017515984, + "learning_rate": 8.648632509920781e-06, + "loss": 0.542, + "step": 4117 + }, + { + "epoch": 0.6762876439554123, + "grad_norm": 0.323205499569439, + "learning_rate": 8.64845081493298e-06, + "loss": 0.5271, + "step": 4118 + }, + { + "epoch": 0.676451871163755, + "grad_norm": 0.412816416007039, + "learning_rate": 8.64826907488892e-06, + "loss": 0.5231, + "step": 4119 + }, + { + "epoch": 0.6766160983720978, + "grad_norm": 0.2996965923663482, + "learning_rate": 8.648087289790578e-06, + "loss": 0.5389, + "step": 4120 + }, + { + "epoch": 0.6767803255804405, + "grad_norm": 0.3222559504790764, + "learning_rate": 8.647905459639926e-06, + "loss": 0.5224, + "step": 4121 + }, + { + "epoch": 0.6769445527887833, + "grad_norm": 0.30650977456731987, + "learning_rate": 8.647723584438939e-06, + "loss": 0.5262, + "step": 4122 + }, + { + "epoch": 0.677108779997126, + "grad_norm": 0.5112994720395476, + "learning_rate": 8.647541664189593e-06, + "loss": 0.5178, + "step": 4123 + }, + { + "epoch": 0.6772730072054688, + "grad_norm": 0.3522817205412864, + "learning_rate": 8.647359698893867e-06, + "loss": 0.5199, + "step": 4124 + }, + { + "epoch": 0.6774372344138115, + "grad_norm": 0.3024109450912323, + "learning_rate": 8.647177688553731e-06, + "loss": 0.5135, + "step": 4125 + }, + { + "epoch": 0.6776014616221543, + "grad_norm": 0.3174797317269863, + "learning_rate": 8.646995633171165e-06, + "loss": 0.5123, + "step": 4126 + }, + { + "epoch": 0.677765688830497, + "grad_norm": 0.3490783488508878, + "learning_rate": 8.646813532748147e-06, + "loss": 0.5354, + "step": 4127 + }, + { + "epoch": 0.6779299160388398, + "grad_norm": 0.44894277917191494, + "learning_rate": 8.646631387286655e-06, + "loss": 0.5048, + "step": 4128 + }, + { + "epoch": 0.6780941432471824, + "grad_norm": 1.0038385015849884, + "learning_rate": 8.646449196788664e-06, + "loss": 0.5346, + "step": 4129 + }, + { + "epoch": 0.6782583704555252, + "grad_norm": 0.28209868548502737, + "learning_rate": 8.646266961256158e-06, + "loss": 0.5092, + "step": 4130 + }, + { + "epoch": 0.6784225976638679, + "grad_norm": 0.32056520106324327, + "learning_rate": 8.646084680691112e-06, + "loss": 0.521, + "step": 4131 + }, + { + "epoch": 0.6785868248722107, + "grad_norm": 0.30892792961083615, + "learning_rate": 8.645902355095507e-06, + "loss": 0.5233, + "step": 4132 + }, + { + "epoch": 0.6787510520805534, + "grad_norm": 0.3004522844978485, + "learning_rate": 8.645719984471325e-06, + "loss": 0.5519, + "step": 4133 + }, + { + "epoch": 0.6789152792888962, + "grad_norm": 0.37531256935974994, + "learning_rate": 8.645537568820544e-06, + "loss": 0.522, + "step": 4134 + }, + { + "epoch": 0.6790795064972389, + "grad_norm": 0.4463922435591353, + "learning_rate": 8.645355108145146e-06, + "loss": 0.5076, + "step": 4135 + }, + { + "epoch": 0.6792437337055817, + "grad_norm": 0.2871378370372213, + "learning_rate": 8.645172602447113e-06, + "loss": 0.5084, + "step": 4136 + }, + { + "epoch": 0.6794079609139244, + "grad_norm": 0.39296031580485435, + "learning_rate": 8.644990051728428e-06, + "loss": 0.5329, + "step": 4137 + }, + { + "epoch": 0.6795721881222672, + "grad_norm": 0.3206991930440626, + "learning_rate": 8.644807455991071e-06, + "loss": 0.5254, + "step": 4138 + }, + { + "epoch": 0.6797364153306099, + "grad_norm": 0.312091021839642, + "learning_rate": 8.644624815237029e-06, + "loss": 0.5262, + "step": 4139 + }, + { + "epoch": 0.6799006425389527, + "grad_norm": 0.2924739456006572, + "learning_rate": 8.644442129468284e-06, + "loss": 0.5002, + "step": 4140 + }, + { + "epoch": 0.6800648697472954, + "grad_norm": 0.3458651635031552, + "learning_rate": 8.64425939868682e-06, + "loss": 0.5326, + "step": 4141 + }, + { + "epoch": 0.6802290969556382, + "grad_norm": 0.6053301737235616, + "learning_rate": 8.644076622894621e-06, + "loss": 0.5496, + "step": 4142 + }, + { + "epoch": 0.6803933241639809, + "grad_norm": 0.3242956814925467, + "learning_rate": 8.643893802093671e-06, + "loss": 0.5138, + "step": 4143 + }, + { + "epoch": 0.6805575513723237, + "grad_norm": 0.2991928853840647, + "learning_rate": 8.64371093628596e-06, + "loss": 0.5154, + "step": 4144 + }, + { + "epoch": 0.6807217785806664, + "grad_norm": 0.2870604350100677, + "learning_rate": 8.64352802547347e-06, + "loss": 0.5152, + "step": 4145 + }, + { + "epoch": 0.680886005789009, + "grad_norm": 0.2885442702491027, + "learning_rate": 8.64334506965819e-06, + "loss": 0.5375, + "step": 4146 + }, + { + "epoch": 0.6810502329973518, + "grad_norm": 0.36928363961336547, + "learning_rate": 8.643162068842105e-06, + "loss": 0.5292, + "step": 4147 + }, + { + "epoch": 0.6812144602056945, + "grad_norm": 0.3343679239764359, + "learning_rate": 8.642979023027203e-06, + "loss": 0.5281, + "step": 4148 + }, + { + "epoch": 0.6813786874140373, + "grad_norm": 0.3116320481253324, + "learning_rate": 8.642795932215472e-06, + "loss": 0.5394, + "step": 4149 + }, + { + "epoch": 0.68154291462238, + "grad_norm": 0.2847083185415691, + "learning_rate": 8.642612796408904e-06, + "loss": 0.5315, + "step": 4150 + }, + { + "epoch": 0.6817071418307228, + "grad_norm": 0.3789055568649402, + "learning_rate": 8.642429615609483e-06, + "loss": 0.5175, + "step": 4151 + }, + { + "epoch": 0.6818713690390655, + "grad_norm": 0.35801662699820525, + "learning_rate": 8.642246389819202e-06, + "loss": 0.5237, + "step": 4152 + }, + { + "epoch": 0.6820355962474083, + "grad_norm": 0.2999952779090304, + "learning_rate": 8.642063119040049e-06, + "loss": 0.5155, + "step": 4153 + }, + { + "epoch": 0.682199823455751, + "grad_norm": 0.4117495657601018, + "learning_rate": 8.641879803274016e-06, + "loss": 0.5248, + "step": 4154 + }, + { + "epoch": 0.6823640506640938, + "grad_norm": 0.38786034920878637, + "learning_rate": 8.641696442523093e-06, + "loss": 0.5205, + "step": 4155 + }, + { + "epoch": 0.6825282778724365, + "grad_norm": 0.3532378510900068, + "learning_rate": 8.641513036789273e-06, + "loss": 0.5291, + "step": 4156 + }, + { + "epoch": 0.6826925050807793, + "grad_norm": 0.2924513736687471, + "learning_rate": 8.641329586074545e-06, + "loss": 0.517, + "step": 4157 + }, + { + "epoch": 0.682856732289122, + "grad_norm": 0.3274374660126311, + "learning_rate": 8.641146090380903e-06, + "loss": 0.5282, + "step": 4158 + }, + { + "epoch": 0.6830209594974648, + "grad_norm": 0.35013113932458, + "learning_rate": 8.64096254971034e-06, + "loss": 0.5332, + "step": 4159 + }, + { + "epoch": 0.6831851867058075, + "grad_norm": 0.3457641571260403, + "learning_rate": 8.640778964064852e-06, + "loss": 0.5289, + "step": 4160 + }, + { + "epoch": 0.6833494139141503, + "grad_norm": 0.27620805752207256, + "learning_rate": 8.640595333446427e-06, + "loss": 0.5458, + "step": 4161 + }, + { + "epoch": 0.683513641122493, + "grad_norm": 0.33925143437931815, + "learning_rate": 8.640411657857066e-06, + "loss": 0.5537, + "step": 4162 + }, + { + "epoch": 0.6836778683308357, + "grad_norm": 0.35668433810410605, + "learning_rate": 8.64022793729876e-06, + "loss": 0.5283, + "step": 4163 + }, + { + "epoch": 0.6838420955391784, + "grad_norm": 0.28380844419386303, + "learning_rate": 8.640044171773503e-06, + "loss": 0.5207, + "step": 4164 + }, + { + "epoch": 0.6840063227475212, + "grad_norm": 0.36428431243227716, + "learning_rate": 8.639860361283295e-06, + "loss": 0.5371, + "step": 4165 + }, + { + "epoch": 0.6841705499558639, + "grad_norm": 0.3638178524463825, + "learning_rate": 8.63967650583013e-06, + "loss": 0.5396, + "step": 4166 + }, + { + "epoch": 0.6843347771642067, + "grad_norm": 0.34929302756951347, + "learning_rate": 8.639492605416005e-06, + "loss": 0.5368, + "step": 4167 + }, + { + "epoch": 0.6844990043725494, + "grad_norm": 0.3769355480143036, + "learning_rate": 8.639308660042918e-06, + "loss": 0.5357, + "step": 4168 + }, + { + "epoch": 0.6846632315808922, + "grad_norm": 0.2722486789490862, + "learning_rate": 8.639124669712867e-06, + "loss": 0.5201, + "step": 4169 + }, + { + "epoch": 0.6848274587892349, + "grad_norm": 0.30186430431604144, + "learning_rate": 8.63894063442785e-06, + "loss": 0.5302, + "step": 4170 + }, + { + "epoch": 0.6849916859975776, + "grad_norm": 0.2866787935647283, + "learning_rate": 8.638756554189863e-06, + "loss": 0.501, + "step": 4171 + }, + { + "epoch": 0.6851559132059204, + "grad_norm": 0.38690681773281876, + "learning_rate": 8.63857242900091e-06, + "loss": 0.526, + "step": 4172 + }, + { + "epoch": 0.6853201404142631, + "grad_norm": 0.3231383434901005, + "learning_rate": 8.638388258862987e-06, + "loss": 0.5325, + "step": 4173 + }, + { + "epoch": 0.6854843676226059, + "grad_norm": 0.43681580478235355, + "learning_rate": 8.638204043778097e-06, + "loss": 0.5387, + "step": 4174 + }, + { + "epoch": 0.6856485948309486, + "grad_norm": 0.3668910158632165, + "learning_rate": 8.63801978374824e-06, + "loss": 0.5356, + "step": 4175 + }, + { + "epoch": 0.6858128220392914, + "grad_norm": 0.3455507180236111, + "learning_rate": 8.637835478775417e-06, + "loss": 0.5044, + "step": 4176 + }, + { + "epoch": 0.6859770492476341, + "grad_norm": 0.3556586108295262, + "learning_rate": 8.637651128861629e-06, + "loss": 0.5325, + "step": 4177 + }, + { + "epoch": 0.6861412764559769, + "grad_norm": 0.29839684395296556, + "learning_rate": 8.637466734008879e-06, + "loss": 0.5468, + "step": 4178 + }, + { + "epoch": 0.6863055036643196, + "grad_norm": 0.3371453476125466, + "learning_rate": 8.637282294219168e-06, + "loss": 0.508, + "step": 4179 + }, + { + "epoch": 0.6864697308726623, + "grad_norm": 0.48442277629443187, + "learning_rate": 8.637097809494504e-06, + "loss": 0.5469, + "step": 4180 + }, + { + "epoch": 0.686633958081005, + "grad_norm": 0.3501409566376713, + "learning_rate": 8.636913279836884e-06, + "loss": 0.5394, + "step": 4181 + }, + { + "epoch": 0.6867981852893478, + "grad_norm": 0.3153330952861787, + "learning_rate": 8.636728705248319e-06, + "loss": 0.5287, + "step": 4182 + }, + { + "epoch": 0.6869624124976905, + "grad_norm": 0.30647381190495343, + "learning_rate": 8.636544085730808e-06, + "loss": 0.509, + "step": 4183 + }, + { + "epoch": 0.6871266397060333, + "grad_norm": 0.32085086991778444, + "learning_rate": 8.636359421286358e-06, + "loss": 0.5338, + "step": 4184 + }, + { + "epoch": 0.687290866914376, + "grad_norm": 0.29831140561366876, + "learning_rate": 8.636174711916977e-06, + "loss": 0.5275, + "step": 4185 + }, + { + "epoch": 0.6874550941227188, + "grad_norm": 0.3066929012205568, + "learning_rate": 8.635989957624669e-06, + "loss": 0.5218, + "step": 4186 + }, + { + "epoch": 0.6876193213310615, + "grad_norm": 0.33259903521685663, + "learning_rate": 8.635805158411438e-06, + "loss": 0.5006, + "step": 4187 + }, + { + "epoch": 0.6877835485394043, + "grad_norm": 0.2893515986506582, + "learning_rate": 8.635620314279297e-06, + "loss": 0.5177, + "step": 4188 + }, + { + "epoch": 0.687947775747747, + "grad_norm": 0.33727758187688006, + "learning_rate": 8.63543542523025e-06, + "loss": 0.5136, + "step": 4189 + }, + { + "epoch": 0.6881120029560898, + "grad_norm": 0.31013060657131736, + "learning_rate": 8.635250491266304e-06, + "loss": 0.5238, + "step": 4190 + }, + { + "epoch": 0.6882762301644325, + "grad_norm": 0.38573616889290285, + "learning_rate": 8.63506551238947e-06, + "loss": 0.5505, + "step": 4191 + }, + { + "epoch": 0.6884404573727753, + "grad_norm": 0.313359998521622, + "learning_rate": 8.634880488601756e-06, + "loss": 0.5277, + "step": 4192 + }, + { + "epoch": 0.688604684581118, + "grad_norm": 0.349773895723136, + "learning_rate": 8.634695419905173e-06, + "loss": 0.5349, + "step": 4193 + }, + { + "epoch": 0.6887689117894608, + "grad_norm": 0.44196567700319817, + "learning_rate": 8.634510306301728e-06, + "loss": 0.5137, + "step": 4194 + }, + { + "epoch": 0.6889331389978035, + "grad_norm": 0.4226507437950207, + "learning_rate": 8.634325147793434e-06, + "loss": 0.5528, + "step": 4195 + }, + { + "epoch": 0.6890973662061463, + "grad_norm": 0.293215751832506, + "learning_rate": 8.6341399443823e-06, + "loss": 0.5337, + "step": 4196 + }, + { + "epoch": 0.6892615934144889, + "grad_norm": 0.28090267043332035, + "learning_rate": 8.63395469607034e-06, + "loss": 0.5268, + "step": 4197 + }, + { + "epoch": 0.6894258206228316, + "grad_norm": 0.35600281186827526, + "learning_rate": 8.633769402859566e-06, + "loss": 0.5323, + "step": 4198 + }, + { + "epoch": 0.6895900478311744, + "grad_norm": 0.34343124065854136, + "learning_rate": 8.633584064751989e-06, + "loss": 0.5365, + "step": 4199 + }, + { + "epoch": 0.6897542750395171, + "grad_norm": 0.32470237327045537, + "learning_rate": 8.63339868174962e-06, + "loss": 0.5258, + "step": 4200 + }, + { + "epoch": 0.6899185022478599, + "grad_norm": 0.3868201193984014, + "learning_rate": 8.633213253854476e-06, + "loss": 0.5275, + "step": 4201 + }, + { + "epoch": 0.6900827294562026, + "grad_norm": 0.4445413871679448, + "learning_rate": 8.63302778106857e-06, + "loss": 0.5337, + "step": 4202 + }, + { + "epoch": 0.6902469566645454, + "grad_norm": 0.48265095727302576, + "learning_rate": 8.632842263393915e-06, + "loss": 0.523, + "step": 4203 + }, + { + "epoch": 0.6904111838728881, + "grad_norm": 0.47827943743664814, + "learning_rate": 8.632656700832527e-06, + "loss": 0.5416, + "step": 4204 + }, + { + "epoch": 0.6905754110812309, + "grad_norm": 0.31881673938269417, + "learning_rate": 8.63247109338642e-06, + "loss": 0.5456, + "step": 4205 + }, + { + "epoch": 0.6907396382895736, + "grad_norm": 0.36208368158027626, + "learning_rate": 8.632285441057614e-06, + "loss": 0.5353, + "step": 4206 + }, + { + "epoch": 0.6909038654979164, + "grad_norm": 0.3044211762704708, + "learning_rate": 8.632099743848121e-06, + "loss": 0.5347, + "step": 4207 + }, + { + "epoch": 0.6910680927062591, + "grad_norm": 0.43847455160193893, + "learning_rate": 8.631914001759958e-06, + "loss": 0.5257, + "step": 4208 + }, + { + "epoch": 0.6912323199146019, + "grad_norm": 0.3791569545248908, + "learning_rate": 8.631728214795145e-06, + "loss": 0.5228, + "step": 4209 + }, + { + "epoch": 0.6913965471229446, + "grad_norm": 0.31693174954761305, + "learning_rate": 8.6315423829557e-06, + "loss": 0.5511, + "step": 4210 + }, + { + "epoch": 0.6915607743312874, + "grad_norm": 0.30018018314139316, + "learning_rate": 8.631356506243637e-06, + "loss": 0.5272, + "step": 4211 + }, + { + "epoch": 0.6917250015396301, + "grad_norm": 0.35174861259968854, + "learning_rate": 8.63117058466098e-06, + "loss": 0.5475, + "step": 4212 + }, + { + "epoch": 0.6918892287479729, + "grad_norm": 0.307674247694834, + "learning_rate": 8.630984618209743e-06, + "loss": 0.4938, + "step": 4213 + }, + { + "epoch": 0.6920534559563155, + "grad_norm": 0.36394072159220425, + "learning_rate": 8.630798606891951e-06, + "loss": 0.5274, + "step": 4214 + }, + { + "epoch": 0.6922176831646583, + "grad_norm": 0.33159368131380446, + "learning_rate": 8.630612550709622e-06, + "loss": 0.5469, + "step": 4215 + }, + { + "epoch": 0.692381910373001, + "grad_norm": 0.3251253917776796, + "learning_rate": 8.630426449664776e-06, + "loss": 0.547, + "step": 4216 + }, + { + "epoch": 0.6925461375813438, + "grad_norm": 0.3349215700548793, + "learning_rate": 8.630240303759436e-06, + "loss": 0.5146, + "step": 4217 + }, + { + "epoch": 0.6927103647896865, + "grad_norm": 0.320126842055859, + "learning_rate": 8.630054112995621e-06, + "loss": 0.5317, + "step": 4218 + }, + { + "epoch": 0.6928745919980293, + "grad_norm": 0.3873589155863503, + "learning_rate": 8.629867877375356e-06, + "loss": 0.5117, + "step": 4219 + }, + { + "epoch": 0.693038819206372, + "grad_norm": 0.3407313212235653, + "learning_rate": 8.629681596900663e-06, + "loss": 0.5329, + "step": 4220 + }, + { + "epoch": 0.6932030464147148, + "grad_norm": 0.295578297140357, + "learning_rate": 8.629495271573565e-06, + "loss": 0.5033, + "step": 4221 + }, + { + "epoch": 0.6933672736230575, + "grad_norm": 0.42228340319703683, + "learning_rate": 8.629308901396083e-06, + "loss": 0.4983, + "step": 4222 + }, + { + "epoch": 0.6935315008314002, + "grad_norm": 0.4463823743156875, + "learning_rate": 8.629122486370245e-06, + "loss": 0.513, + "step": 4223 + }, + { + "epoch": 0.693695728039743, + "grad_norm": 0.31355091465461943, + "learning_rate": 8.628936026498075e-06, + "loss": 0.5326, + "step": 4224 + }, + { + "epoch": 0.6938599552480857, + "grad_norm": 0.33679477663537927, + "learning_rate": 8.628749521781598e-06, + "loss": 0.5448, + "step": 4225 + }, + { + "epoch": 0.6940241824564285, + "grad_norm": 0.35080468703776435, + "learning_rate": 8.628562972222838e-06, + "loss": 0.4956, + "step": 4226 + }, + { + "epoch": 0.6941884096647712, + "grad_norm": 0.29616783212095005, + "learning_rate": 8.628376377823823e-06, + "loss": 0.5487, + "step": 4227 + }, + { + "epoch": 0.694352636873114, + "grad_norm": 0.3512870033644539, + "learning_rate": 8.628189738586577e-06, + "loss": 0.5192, + "step": 4228 + }, + { + "epoch": 0.6945168640814567, + "grad_norm": 0.7499849953119065, + "learning_rate": 8.628003054513129e-06, + "loss": 0.5412, + "step": 4229 + }, + { + "epoch": 0.6946810912897995, + "grad_norm": 0.34197986132326763, + "learning_rate": 8.627816325605509e-06, + "loss": 0.541, + "step": 4230 + }, + { + "epoch": 0.6948453184981421, + "grad_norm": 0.35850041319838344, + "learning_rate": 8.627629551865741e-06, + "loss": 0.5424, + "step": 4231 + }, + { + "epoch": 0.6950095457064849, + "grad_norm": 0.38240384165529273, + "learning_rate": 8.627442733295855e-06, + "loss": 0.5417, + "step": 4232 + }, + { + "epoch": 0.6951737729148276, + "grad_norm": 0.3236064766935071, + "learning_rate": 8.62725586989788e-06, + "loss": 0.5352, + "step": 4233 + }, + { + "epoch": 0.6953380001231704, + "grad_norm": 0.3556556217013173, + "learning_rate": 8.627068961673844e-06, + "loss": 0.5187, + "step": 4234 + }, + { + "epoch": 0.6955022273315131, + "grad_norm": 0.3423798038428814, + "learning_rate": 8.62688200862578e-06, + "loss": 0.5366, + "step": 4235 + }, + { + "epoch": 0.6956664545398559, + "grad_norm": 0.28350851553753786, + "learning_rate": 8.626695010755719e-06, + "loss": 0.522, + "step": 4236 + }, + { + "epoch": 0.6958306817481986, + "grad_norm": 0.3257472077657691, + "learning_rate": 8.626507968065687e-06, + "loss": 0.5215, + "step": 4237 + }, + { + "epoch": 0.6959949089565414, + "grad_norm": 0.31021167794491405, + "learning_rate": 8.62632088055772e-06, + "loss": 0.5088, + "step": 4238 + }, + { + "epoch": 0.6961591361648841, + "grad_norm": 0.3166485566164785, + "learning_rate": 8.626133748233847e-06, + "loss": 0.5213, + "step": 4239 + }, + { + "epoch": 0.6963233633732269, + "grad_norm": 0.29145841027820457, + "learning_rate": 8.625946571096106e-06, + "loss": 0.4946, + "step": 4240 + }, + { + "epoch": 0.6964875905815696, + "grad_norm": 0.319921850998031, + "learning_rate": 8.625759349146521e-06, + "loss": 0.5386, + "step": 4241 + }, + { + "epoch": 0.6966518177899124, + "grad_norm": 0.308972089997359, + "learning_rate": 8.625572082387132e-06, + "loss": 0.5413, + "step": 4242 + }, + { + "epoch": 0.6968160449982551, + "grad_norm": 0.2835317133915667, + "learning_rate": 8.625384770819972e-06, + "loss": 0.534, + "step": 4243 + }, + { + "epoch": 0.6969802722065979, + "grad_norm": 0.4233385444079988, + "learning_rate": 8.625197414447073e-06, + "loss": 0.5229, + "step": 4244 + }, + { + "epoch": 0.6971444994149406, + "grad_norm": 0.4137599078113359, + "learning_rate": 8.625010013270474e-06, + "loss": 0.4999, + "step": 4245 + }, + { + "epoch": 0.6973087266232834, + "grad_norm": 0.333057995130589, + "learning_rate": 8.624822567292205e-06, + "loss": 0.5021, + "step": 4246 + }, + { + "epoch": 0.6974729538316261, + "grad_norm": 0.3373896893862306, + "learning_rate": 8.624635076514307e-06, + "loss": 0.544, + "step": 4247 + }, + { + "epoch": 0.6976371810399687, + "grad_norm": 0.3598055058817781, + "learning_rate": 8.624447540938813e-06, + "loss": 0.5192, + "step": 4248 + }, + { + "epoch": 0.6978014082483115, + "grad_norm": 0.33727018756814525, + "learning_rate": 8.62425996056776e-06, + "loss": 0.5226, + "step": 4249 + }, + { + "epoch": 0.6979656354566542, + "grad_norm": 0.30647169364146576, + "learning_rate": 8.624072335403188e-06, + "loss": 0.5102, + "step": 4250 + }, + { + "epoch": 0.698129862664997, + "grad_norm": 0.2931637111302099, + "learning_rate": 8.62388466544713e-06, + "loss": 0.5347, + "step": 4251 + }, + { + "epoch": 0.6982940898733397, + "grad_norm": 0.41161972319441537, + "learning_rate": 8.623696950701629e-06, + "loss": 0.498, + "step": 4252 + }, + { + "epoch": 0.6984583170816825, + "grad_norm": 0.34023286853888174, + "learning_rate": 8.623509191168722e-06, + "loss": 0.5267, + "step": 4253 + }, + { + "epoch": 0.6986225442900252, + "grad_norm": 0.4900155873432433, + "learning_rate": 8.623321386850449e-06, + "loss": 0.5373, + "step": 4254 + }, + { + "epoch": 0.698786771498368, + "grad_norm": 0.39734723306453845, + "learning_rate": 8.623133537748847e-06, + "loss": 0.5201, + "step": 4255 + }, + { + "epoch": 0.6989509987067107, + "grad_norm": 0.3420535819762841, + "learning_rate": 8.622945643865959e-06, + "loss": 0.535, + "step": 4256 + }, + { + "epoch": 0.6991152259150535, + "grad_norm": 0.34538252628053706, + "learning_rate": 8.622757705203825e-06, + "loss": 0.5332, + "step": 4257 + }, + { + "epoch": 0.6992794531233962, + "grad_norm": 0.30859870827439856, + "learning_rate": 8.622569721764487e-06, + "loss": 0.5099, + "step": 4258 + }, + { + "epoch": 0.699443680331739, + "grad_norm": 0.3291857838753656, + "learning_rate": 8.622381693549985e-06, + "loss": 0.5341, + "step": 4259 + }, + { + "epoch": 0.6996079075400817, + "grad_norm": 0.36292305907656036, + "learning_rate": 8.62219362056236e-06, + "loss": 0.5301, + "step": 4260 + }, + { + "epoch": 0.6997721347484245, + "grad_norm": 0.3350119610981238, + "learning_rate": 8.622005502803659e-06, + "loss": 0.5331, + "step": 4261 + }, + { + "epoch": 0.6999363619567672, + "grad_norm": 0.35928706753215134, + "learning_rate": 8.621817340275921e-06, + "loss": 0.5262, + "step": 4262 + }, + { + "epoch": 0.70010058916511, + "grad_norm": 0.29995114712883486, + "learning_rate": 8.621629132981194e-06, + "loss": 0.5107, + "step": 4263 + }, + { + "epoch": 0.7002648163734527, + "grad_norm": 0.32459709306920986, + "learning_rate": 8.621440880921519e-06, + "loss": 0.5436, + "step": 4264 + }, + { + "epoch": 0.7004290435817954, + "grad_norm": 0.27867869217003954, + "learning_rate": 8.621252584098938e-06, + "loss": 0.5332, + "step": 4265 + }, + { + "epoch": 0.7005932707901381, + "grad_norm": 0.34422747615332705, + "learning_rate": 8.621064242515503e-06, + "loss": 0.5191, + "step": 4266 + }, + { + "epoch": 0.7007574979984809, + "grad_norm": 0.29331677428471403, + "learning_rate": 8.620875856173253e-06, + "loss": 0.5079, + "step": 4267 + }, + { + "epoch": 0.7009217252068236, + "grad_norm": 0.3121609634949846, + "learning_rate": 8.620687425074238e-06, + "loss": 0.5136, + "step": 4268 + }, + { + "epoch": 0.7010859524151664, + "grad_norm": 0.3838693596047789, + "learning_rate": 8.620498949220502e-06, + "loss": 0.5175, + "step": 4269 + }, + { + "epoch": 0.7012501796235091, + "grad_norm": 0.3399582939068176, + "learning_rate": 8.620310428614094e-06, + "loss": 0.5303, + "step": 4270 + }, + { + "epoch": 0.7014144068318519, + "grad_norm": 0.3882307518798506, + "learning_rate": 8.620121863257062e-06, + "loss": 0.5358, + "step": 4271 + }, + { + "epoch": 0.7015786340401946, + "grad_norm": 0.327844178142628, + "learning_rate": 8.619933253151452e-06, + "loss": 0.5179, + "step": 4272 + }, + { + "epoch": 0.7017428612485374, + "grad_norm": 0.3059441881333392, + "learning_rate": 8.619744598299315e-06, + "loss": 0.5003, + "step": 4273 + }, + { + "epoch": 0.7019070884568801, + "grad_norm": 0.33289088821210483, + "learning_rate": 8.619555898702695e-06, + "loss": 0.5203, + "step": 4274 + }, + { + "epoch": 0.7020713156652229, + "grad_norm": 0.40764636675339916, + "learning_rate": 8.619367154363647e-06, + "loss": 0.5287, + "step": 4275 + }, + { + "epoch": 0.7022355428735656, + "grad_norm": 0.32231801015454054, + "learning_rate": 8.61917836528422e-06, + "loss": 0.5228, + "step": 4276 + }, + { + "epoch": 0.7023997700819083, + "grad_norm": 0.36811692912587884, + "learning_rate": 8.618989531466462e-06, + "loss": 0.5104, + "step": 4277 + }, + { + "epoch": 0.7025639972902511, + "grad_norm": 0.3432330040499813, + "learning_rate": 8.618800652912425e-06, + "loss": 0.5105, + "step": 4278 + }, + { + "epoch": 0.7027282244985938, + "grad_norm": 0.3163500424266403, + "learning_rate": 8.618611729624161e-06, + "loss": 0.536, + "step": 4279 + }, + { + "epoch": 0.7028924517069366, + "grad_norm": 0.3334439933008529, + "learning_rate": 8.61842276160372e-06, + "loss": 0.5218, + "step": 4280 + }, + { + "epoch": 0.7030566789152793, + "grad_norm": 0.3651496955106728, + "learning_rate": 8.618233748853159e-06, + "loss": 0.5291, + "step": 4281 + }, + { + "epoch": 0.703220906123622, + "grad_norm": 0.3909489697860476, + "learning_rate": 8.618044691374524e-06, + "loss": 0.5138, + "step": 4282 + }, + { + "epoch": 0.7033851333319647, + "grad_norm": 0.3432299333346993, + "learning_rate": 8.617855589169873e-06, + "loss": 0.5296, + "step": 4283 + }, + { + "epoch": 0.7035493605403075, + "grad_norm": 0.32106670987419533, + "learning_rate": 8.617666442241261e-06, + "loss": 0.5538, + "step": 4284 + }, + { + "epoch": 0.7037135877486502, + "grad_norm": 0.278987267758817, + "learning_rate": 8.617477250590737e-06, + "loss": 0.5351, + "step": 4285 + }, + { + "epoch": 0.703877814956993, + "grad_norm": 0.30910724383387306, + "learning_rate": 8.617288014220362e-06, + "loss": 0.5326, + "step": 4286 + }, + { + "epoch": 0.7040420421653357, + "grad_norm": 0.323362918561536, + "learning_rate": 8.617098733132187e-06, + "loss": 0.5348, + "step": 4287 + }, + { + "epoch": 0.7042062693736785, + "grad_norm": 0.3078079738940741, + "learning_rate": 8.616909407328268e-06, + "loss": 0.5458, + "step": 4288 + }, + { + "epoch": 0.7043704965820212, + "grad_norm": 0.2844380907240079, + "learning_rate": 8.616720036810664e-06, + "loss": 0.5177, + "step": 4289 + }, + { + "epoch": 0.704534723790364, + "grad_norm": 0.32375350229388816, + "learning_rate": 8.61653062158143e-06, + "loss": 0.5277, + "step": 4290 + }, + { + "epoch": 0.7046989509987067, + "grad_norm": 0.3375882355708296, + "learning_rate": 8.616341161642622e-06, + "loss": 0.5172, + "step": 4291 + }, + { + "epoch": 0.7048631782070495, + "grad_norm": 0.2865337715238847, + "learning_rate": 8.616151656996301e-06, + "loss": 0.5367, + "step": 4292 + }, + { + "epoch": 0.7050274054153922, + "grad_norm": 0.3144427358498577, + "learning_rate": 8.615962107644523e-06, + "loss": 0.4955, + "step": 4293 + }, + { + "epoch": 0.705191632623735, + "grad_norm": 0.3656931892003649, + "learning_rate": 8.615772513589345e-06, + "loss": 0.5221, + "step": 4294 + }, + { + "epoch": 0.7053558598320777, + "grad_norm": 0.2958582370426801, + "learning_rate": 8.61558287483283e-06, + "loss": 0.5537, + "step": 4295 + }, + { + "epoch": 0.7055200870404205, + "grad_norm": 0.3768598902611741, + "learning_rate": 8.615393191377035e-06, + "loss": 0.5235, + "step": 4296 + }, + { + "epoch": 0.7056843142487632, + "grad_norm": 0.3347727814313395, + "learning_rate": 8.615203463224021e-06, + "loss": 0.5174, + "step": 4297 + }, + { + "epoch": 0.705848541457106, + "grad_norm": 0.38916588034781613, + "learning_rate": 8.615013690375849e-06, + "loss": 0.539, + "step": 4298 + }, + { + "epoch": 0.7060127686654486, + "grad_norm": 0.27779637428046317, + "learning_rate": 8.61482387283458e-06, + "loss": 0.5066, + "step": 4299 + }, + { + "epoch": 0.7061769958737913, + "grad_norm": 0.5400534178669388, + "learning_rate": 8.614634010602274e-06, + "loss": 0.508, + "step": 4300 + }, + { + "epoch": 0.7063412230821341, + "grad_norm": 0.37039212228978896, + "learning_rate": 8.614444103680995e-06, + "loss": 0.5402, + "step": 4301 + }, + { + "epoch": 0.7065054502904768, + "grad_norm": 0.279075184659078, + "learning_rate": 8.614254152072805e-06, + "loss": 0.5422, + "step": 4302 + }, + { + "epoch": 0.7066696774988196, + "grad_norm": 0.32756207172905416, + "learning_rate": 8.614064155779767e-06, + "loss": 0.5286, + "step": 4303 + }, + { + "epoch": 0.7068339047071623, + "grad_norm": 0.34922911866740997, + "learning_rate": 8.613874114803945e-06, + "loss": 0.5264, + "step": 4304 + }, + { + "epoch": 0.7069981319155051, + "grad_norm": 0.31257689692201895, + "learning_rate": 8.613684029147401e-06, + "loss": 0.5178, + "step": 4305 + }, + { + "epoch": 0.7071623591238478, + "grad_norm": 0.30307569125011224, + "learning_rate": 8.613493898812202e-06, + "loss": 0.5124, + "step": 4306 + }, + { + "epoch": 0.7073265863321906, + "grad_norm": 0.36069449916053775, + "learning_rate": 8.613303723800413e-06, + "loss": 0.5282, + "step": 4307 + }, + { + "epoch": 0.7074908135405333, + "grad_norm": 0.3092916348181574, + "learning_rate": 8.6131135041141e-06, + "loss": 0.5305, + "step": 4308 + }, + { + "epoch": 0.7076550407488761, + "grad_norm": 0.40891948696883457, + "learning_rate": 8.612923239755325e-06, + "loss": 0.5214, + "step": 4309 + }, + { + "epoch": 0.7078192679572188, + "grad_norm": 0.31616164432034843, + "learning_rate": 8.612732930726157e-06, + "loss": 0.5194, + "step": 4310 + }, + { + "epoch": 0.7079834951655616, + "grad_norm": 0.3000505637839565, + "learning_rate": 8.612542577028663e-06, + "loss": 0.524, + "step": 4311 + }, + { + "epoch": 0.7081477223739043, + "grad_norm": 0.3129782682534076, + "learning_rate": 8.612352178664911e-06, + "loss": 0.5238, + "step": 4312 + }, + { + "epoch": 0.7083119495822471, + "grad_norm": 0.3206955929245123, + "learning_rate": 8.612161735636968e-06, + "loss": 0.5261, + "step": 4313 + }, + { + "epoch": 0.7084761767905898, + "grad_norm": 0.37716743758695925, + "learning_rate": 8.611971247946904e-06, + "loss": 0.4876, + "step": 4314 + }, + { + "epoch": 0.7086404039989326, + "grad_norm": 0.40831633759201236, + "learning_rate": 8.611780715596786e-06, + "loss": 0.5295, + "step": 4315 + }, + { + "epoch": 0.7088046312072752, + "grad_norm": 0.304318562490442, + "learning_rate": 8.611590138588685e-06, + "loss": 0.5323, + "step": 4316 + }, + { + "epoch": 0.708968858415618, + "grad_norm": 0.3975476450385827, + "learning_rate": 8.611399516924666e-06, + "loss": 0.5175, + "step": 4317 + }, + { + "epoch": 0.7091330856239607, + "grad_norm": 0.30548294363985984, + "learning_rate": 8.611208850606806e-06, + "loss": 0.5389, + "step": 4318 + }, + { + "epoch": 0.7092973128323035, + "grad_norm": 0.3777837973806069, + "learning_rate": 8.611018139637175e-06, + "loss": 0.5311, + "step": 4319 + }, + { + "epoch": 0.7094615400406462, + "grad_norm": 0.2973347910181853, + "learning_rate": 8.61082738401784e-06, + "loss": 0.5402, + "step": 4320 + }, + { + "epoch": 0.709625767248989, + "grad_norm": 0.36469127314551625, + "learning_rate": 8.610636583750874e-06, + "loss": 0.5209, + "step": 4321 + }, + { + "epoch": 0.7097899944573317, + "grad_norm": 0.28293959836732613, + "learning_rate": 8.610445738838352e-06, + "loss": 0.5153, + "step": 4322 + }, + { + "epoch": 0.7099542216656745, + "grad_norm": 0.42306390379784664, + "learning_rate": 8.610254849282345e-06, + "loss": 0.5531, + "step": 4323 + }, + { + "epoch": 0.7101184488740172, + "grad_norm": 0.28628622993663794, + "learning_rate": 8.610063915084926e-06, + "loss": 0.5414, + "step": 4324 + }, + { + "epoch": 0.71028267608236, + "grad_norm": 0.281324125785271, + "learning_rate": 8.609872936248168e-06, + "loss": 0.5226, + "step": 4325 + }, + { + "epoch": 0.7104469032907027, + "grad_norm": 0.37477344881709673, + "learning_rate": 8.609681912774149e-06, + "loss": 0.542, + "step": 4326 + }, + { + "epoch": 0.7106111304990455, + "grad_norm": 0.3206181721344887, + "learning_rate": 8.609490844664938e-06, + "loss": 0.5223, + "step": 4327 + }, + { + "epoch": 0.7107753577073882, + "grad_norm": 0.3339242114148264, + "learning_rate": 8.609299731922616e-06, + "loss": 0.5106, + "step": 4328 + }, + { + "epoch": 0.710939584915731, + "grad_norm": 0.2983358499194098, + "learning_rate": 8.609108574549254e-06, + "loss": 0.5397, + "step": 4329 + }, + { + "epoch": 0.7111038121240737, + "grad_norm": 0.33380596620035463, + "learning_rate": 8.608917372546931e-06, + "loss": 0.5411, + "step": 4330 + }, + { + "epoch": 0.7112680393324164, + "grad_norm": 0.2698264997063511, + "learning_rate": 8.608726125917721e-06, + "loss": 0.5073, + "step": 4331 + }, + { + "epoch": 0.7114322665407592, + "grad_norm": 0.3947051740162191, + "learning_rate": 8.608534834663705e-06, + "loss": 0.5249, + "step": 4332 + }, + { + "epoch": 0.7115964937491018, + "grad_norm": 0.2906445128721743, + "learning_rate": 8.608343498786958e-06, + "loss": 0.5403, + "step": 4333 + }, + { + "epoch": 0.7117607209574446, + "grad_norm": 0.30410226624743664, + "learning_rate": 8.60815211828956e-06, + "loss": 0.5335, + "step": 4334 + }, + { + "epoch": 0.7119249481657873, + "grad_norm": 0.28918039186839667, + "learning_rate": 8.607960693173585e-06, + "loss": 0.5053, + "step": 4335 + }, + { + "epoch": 0.7120891753741301, + "grad_norm": 0.39784143895681123, + "learning_rate": 8.607769223441118e-06, + "loss": 0.5263, + "step": 4336 + }, + { + "epoch": 0.7122534025824728, + "grad_norm": 0.3816380356849168, + "learning_rate": 8.607577709094234e-06, + "loss": 0.536, + "step": 4337 + }, + { + "epoch": 0.7124176297908156, + "grad_norm": 0.48876799069675336, + "learning_rate": 8.607386150135016e-06, + "loss": 0.52, + "step": 4338 + }, + { + "epoch": 0.7125818569991583, + "grad_norm": 0.30419187380004425, + "learning_rate": 8.607194546565541e-06, + "loss": 0.5172, + "step": 4339 + }, + { + "epoch": 0.7127460842075011, + "grad_norm": 0.3046540779190067, + "learning_rate": 8.607002898387894e-06, + "loss": 0.5255, + "step": 4340 + }, + { + "epoch": 0.7129103114158438, + "grad_norm": 0.3697888338261436, + "learning_rate": 8.606811205604155e-06, + "loss": 0.5025, + "step": 4341 + }, + { + "epoch": 0.7130745386241866, + "grad_norm": 0.32176447793422863, + "learning_rate": 8.606619468216403e-06, + "loss": 0.4727, + "step": 4342 + }, + { + "epoch": 0.7132387658325293, + "grad_norm": 0.372657494023084, + "learning_rate": 8.606427686226727e-06, + "loss": 0.5283, + "step": 4343 + }, + { + "epoch": 0.7134029930408721, + "grad_norm": 0.3190889106654982, + "learning_rate": 8.606235859637206e-06, + "loss": 0.5326, + "step": 4344 + }, + { + "epoch": 0.7135672202492148, + "grad_norm": 0.2928457979647746, + "learning_rate": 8.606043988449922e-06, + "loss": 0.5128, + "step": 4345 + }, + { + "epoch": 0.7137314474575576, + "grad_norm": 0.2706617064341812, + "learning_rate": 8.60585207266696e-06, + "loss": 0.5224, + "step": 4346 + }, + { + "epoch": 0.7138956746659003, + "grad_norm": 0.3205314934694761, + "learning_rate": 8.605660112290406e-06, + "loss": 0.5028, + "step": 4347 + }, + { + "epoch": 0.7140599018742431, + "grad_norm": 0.33291126764659723, + "learning_rate": 8.605468107322342e-06, + "loss": 0.5144, + "step": 4348 + }, + { + "epoch": 0.7142241290825858, + "grad_norm": 0.319633960734903, + "learning_rate": 8.605276057764857e-06, + "loss": 0.5165, + "step": 4349 + }, + { + "epoch": 0.7143883562909285, + "grad_norm": 0.4054703641916312, + "learning_rate": 8.605083963620034e-06, + "loss": 0.5242, + "step": 4350 + }, + { + "epoch": 0.7145525834992712, + "grad_norm": 0.3103957865806984, + "learning_rate": 8.604891824889961e-06, + "loss": 0.5218, + "step": 4351 + }, + { + "epoch": 0.714716810707614, + "grad_norm": 0.3180642837392691, + "learning_rate": 8.604699641576723e-06, + "loss": 0.528, + "step": 4352 + }, + { + "epoch": 0.7148810379159567, + "grad_norm": 0.3133787583026555, + "learning_rate": 8.604507413682409e-06, + "loss": 0.507, + "step": 4353 + }, + { + "epoch": 0.7150452651242994, + "grad_norm": 0.33877895632971905, + "learning_rate": 8.604315141209108e-06, + "loss": 0.5238, + "step": 4354 + }, + { + "epoch": 0.7152094923326422, + "grad_norm": 0.38840289489043733, + "learning_rate": 8.604122824158905e-06, + "loss": 0.5504, + "step": 4355 + }, + { + "epoch": 0.7153737195409849, + "grad_norm": 0.3955477841512037, + "learning_rate": 8.603930462533889e-06, + "loss": 0.4993, + "step": 4356 + }, + { + "epoch": 0.7155379467493277, + "grad_norm": 0.27883260117502756, + "learning_rate": 8.603738056336152e-06, + "loss": 0.5121, + "step": 4357 + }, + { + "epoch": 0.7157021739576704, + "grad_norm": 0.2850443117354454, + "learning_rate": 8.603545605567782e-06, + "loss": 0.525, + "step": 4358 + }, + { + "epoch": 0.7158664011660132, + "grad_norm": 0.43399050909543463, + "learning_rate": 8.603353110230868e-06, + "loss": 0.5265, + "step": 4359 + }, + { + "epoch": 0.7160306283743559, + "grad_norm": 0.31443791711799157, + "learning_rate": 8.603160570327504e-06, + "loss": 0.535, + "step": 4360 + }, + { + "epoch": 0.7161948555826987, + "grad_norm": 0.2939906689274626, + "learning_rate": 8.602967985859779e-06, + "loss": 0.5262, + "step": 4361 + }, + { + "epoch": 0.7163590827910414, + "grad_norm": 0.3551573997835369, + "learning_rate": 8.602775356829783e-06, + "loss": 0.5221, + "step": 4362 + }, + { + "epoch": 0.7165233099993842, + "grad_norm": 0.37242204799922163, + "learning_rate": 8.602582683239612e-06, + "loss": 0.5108, + "step": 4363 + }, + { + "epoch": 0.7166875372077269, + "grad_norm": 0.3301229591082884, + "learning_rate": 8.602389965091357e-06, + "loss": 0.5033, + "step": 4364 + }, + { + "epoch": 0.7168517644160697, + "grad_norm": 0.5596674893832613, + "learning_rate": 8.602197202387109e-06, + "loss": 0.5257, + "step": 4365 + }, + { + "epoch": 0.7170159916244124, + "grad_norm": 0.32003899380645795, + "learning_rate": 8.602004395128963e-06, + "loss": 0.5096, + "step": 4366 + }, + { + "epoch": 0.7171802188327551, + "grad_norm": 0.32101527283833253, + "learning_rate": 8.601811543319016e-06, + "loss": 0.5154, + "step": 4367 + }, + { + "epoch": 0.7173444460410978, + "grad_norm": 0.3582105517657474, + "learning_rate": 8.601618646959359e-06, + "loss": 0.5368, + "step": 4368 + }, + { + "epoch": 0.7175086732494406, + "grad_norm": 0.3834938893158892, + "learning_rate": 8.601425706052086e-06, + "loss": 0.5245, + "step": 4369 + }, + { + "epoch": 0.7176729004577833, + "grad_norm": 0.31214498157447496, + "learning_rate": 8.601232720599298e-06, + "loss": 0.5392, + "step": 4370 + }, + { + "epoch": 0.7178371276661261, + "grad_norm": 0.2968901286066325, + "learning_rate": 8.601039690603085e-06, + "loss": 0.5512, + "step": 4371 + }, + { + "epoch": 0.7180013548744688, + "grad_norm": 0.3226767453514702, + "learning_rate": 8.600846616065546e-06, + "loss": 0.5344, + "step": 4372 + }, + { + "epoch": 0.7181655820828116, + "grad_norm": 0.28203986767629385, + "learning_rate": 8.600653496988781e-06, + "loss": 0.5252, + "step": 4373 + }, + { + "epoch": 0.7183298092911543, + "grad_norm": 0.3263527602745717, + "learning_rate": 8.60046033337488e-06, + "loss": 0.5381, + "step": 4374 + }, + { + "epoch": 0.7184940364994971, + "grad_norm": 0.3830517652455408, + "learning_rate": 8.600267125225948e-06, + "loss": 0.5363, + "step": 4375 + }, + { + "epoch": 0.7186582637078398, + "grad_norm": 0.3124628134298113, + "learning_rate": 8.60007387254408e-06, + "loss": 0.5039, + "step": 4376 + }, + { + "epoch": 0.7188224909161826, + "grad_norm": 0.4344770371599259, + "learning_rate": 8.599880575331379e-06, + "loss": 0.5167, + "step": 4377 + }, + { + "epoch": 0.7189867181245253, + "grad_norm": 0.44022478012369026, + "learning_rate": 8.599687233589938e-06, + "loss": 0.5214, + "step": 4378 + }, + { + "epoch": 0.719150945332868, + "grad_norm": 0.3688156748206383, + "learning_rate": 8.599493847321862e-06, + "loss": 0.5206, + "step": 4379 + }, + { + "epoch": 0.7193151725412108, + "grad_norm": 0.32935829629851043, + "learning_rate": 8.599300416529249e-06, + "loss": 0.5141, + "step": 4380 + }, + { + "epoch": 0.7194793997495535, + "grad_norm": 0.29232395104565306, + "learning_rate": 8.599106941214199e-06, + "loss": 0.5259, + "step": 4381 + }, + { + "epoch": 0.7196436269578963, + "grad_norm": 0.3230601984030344, + "learning_rate": 8.598913421378815e-06, + "loss": 0.5157, + "step": 4382 + }, + { + "epoch": 0.7198078541662389, + "grad_norm": 0.3366778464764349, + "learning_rate": 8.5987198570252e-06, + "loss": 0.5163, + "step": 4383 + }, + { + "epoch": 0.7199720813745817, + "grad_norm": 0.2822882327510804, + "learning_rate": 8.598526248155453e-06, + "loss": 0.5302, + "step": 4384 + }, + { + "epoch": 0.7201363085829244, + "grad_norm": 0.34913151065251047, + "learning_rate": 8.598332594771678e-06, + "loss": 0.5239, + "step": 4385 + }, + { + "epoch": 0.7203005357912672, + "grad_norm": 0.30615999757517137, + "learning_rate": 8.598138896875982e-06, + "loss": 0.5333, + "step": 4386 + }, + { + "epoch": 0.7204647629996099, + "grad_norm": 0.3338081110150123, + "learning_rate": 8.597945154470462e-06, + "loss": 0.55, + "step": 4387 + }, + { + "epoch": 0.7206289902079527, + "grad_norm": 0.27003702556268694, + "learning_rate": 8.597751367557229e-06, + "loss": 0.5179, + "step": 4388 + }, + { + "epoch": 0.7207932174162954, + "grad_norm": 0.3166293407081746, + "learning_rate": 8.59755753613838e-06, + "loss": 0.5193, + "step": 4389 + }, + { + "epoch": 0.7209574446246382, + "grad_norm": 0.2970707814051773, + "learning_rate": 8.597363660216028e-06, + "loss": 0.5089, + "step": 4390 + }, + { + "epoch": 0.7211216718329809, + "grad_norm": 0.4042859510774983, + "learning_rate": 8.597169739792276e-06, + "loss": 0.5184, + "step": 4391 + }, + { + "epoch": 0.7212858990413237, + "grad_norm": 0.36079226884819154, + "learning_rate": 8.596975774869229e-06, + "loss": 0.5115, + "step": 4392 + }, + { + "epoch": 0.7214501262496664, + "grad_norm": 0.2960898042956307, + "learning_rate": 8.596781765448994e-06, + "loss": 0.5214, + "step": 4393 + }, + { + "epoch": 0.7216143534580092, + "grad_norm": 0.3067528269205476, + "learning_rate": 8.596587711533677e-06, + "loss": 0.5404, + "step": 4394 + }, + { + "epoch": 0.7217785806663519, + "grad_norm": 0.5567244165497772, + "learning_rate": 8.596393613125387e-06, + "loss": 0.5322, + "step": 4395 + }, + { + "epoch": 0.7219428078746947, + "grad_norm": 0.31549950117169867, + "learning_rate": 8.596199470226234e-06, + "loss": 0.5202, + "step": 4396 + }, + { + "epoch": 0.7221070350830374, + "grad_norm": 0.3558221536624943, + "learning_rate": 8.596005282838324e-06, + "loss": 0.5098, + "step": 4397 + }, + { + "epoch": 0.7222712622913802, + "grad_norm": 0.3104817339740221, + "learning_rate": 8.595811050963767e-06, + "loss": 0.5341, + "step": 4398 + }, + { + "epoch": 0.7224354894997229, + "grad_norm": 0.32786752772407324, + "learning_rate": 8.59561677460467e-06, + "loss": 0.5089, + "step": 4399 + }, + { + "epoch": 0.7225997167080656, + "grad_norm": 0.4047878987733635, + "learning_rate": 8.595422453763149e-06, + "loss": 0.5458, + "step": 4400 + }, + { + "epoch": 0.7227639439164083, + "grad_norm": 0.2858382446467845, + "learning_rate": 8.595228088441307e-06, + "loss": 0.5354, + "step": 4401 + }, + { + "epoch": 0.722928171124751, + "grad_norm": 0.32023432751026115, + "learning_rate": 8.595033678641261e-06, + "loss": 0.5254, + "step": 4402 + }, + { + "epoch": 0.7230923983330938, + "grad_norm": 0.3045500747617058, + "learning_rate": 8.594839224365119e-06, + "loss": 0.521, + "step": 4403 + }, + { + "epoch": 0.7232566255414365, + "grad_norm": 0.29098460712999963, + "learning_rate": 8.594644725614995e-06, + "loss": 0.5173, + "step": 4404 + }, + { + "epoch": 0.7234208527497793, + "grad_norm": 0.30702617159676615, + "learning_rate": 8.594450182393002e-06, + "loss": 0.5294, + "step": 4405 + }, + { + "epoch": 0.723585079958122, + "grad_norm": 0.3657684663544443, + "learning_rate": 8.59425559470125e-06, + "loss": 0.5244, + "step": 4406 + }, + { + "epoch": 0.7237493071664648, + "grad_norm": 0.33389501184795556, + "learning_rate": 8.594060962541855e-06, + "loss": 0.5379, + "step": 4407 + }, + { + "epoch": 0.7239135343748075, + "grad_norm": 0.3401852810246631, + "learning_rate": 8.593866285916928e-06, + "loss": 0.523, + "step": 4408 + }, + { + "epoch": 0.7240777615831503, + "grad_norm": 0.31282359408606597, + "learning_rate": 8.593671564828587e-06, + "loss": 0.5356, + "step": 4409 + }, + { + "epoch": 0.724241988791493, + "grad_norm": 0.2828881931783121, + "learning_rate": 8.593476799278945e-06, + "loss": 0.5176, + "step": 4410 + }, + { + "epoch": 0.7244062159998358, + "grad_norm": 0.28740377845431986, + "learning_rate": 8.593281989270117e-06, + "loss": 0.5182, + "step": 4411 + }, + { + "epoch": 0.7245704432081785, + "grad_norm": 0.32739805868398925, + "learning_rate": 8.59308713480422e-06, + "loss": 0.5234, + "step": 4412 + }, + { + "epoch": 0.7247346704165213, + "grad_norm": 0.31856973622465923, + "learning_rate": 8.59289223588337e-06, + "loss": 0.5102, + "step": 4413 + }, + { + "epoch": 0.724898897624864, + "grad_norm": 0.30433947412820167, + "learning_rate": 8.592697292509682e-06, + "loss": 0.5323, + "step": 4414 + }, + { + "epoch": 0.7250631248332068, + "grad_norm": 0.3093281266377262, + "learning_rate": 8.592502304685274e-06, + "loss": 0.4877, + "step": 4415 + }, + { + "epoch": 0.7252273520415495, + "grad_norm": 0.4192937983246652, + "learning_rate": 8.592307272412267e-06, + "loss": 0.5315, + "step": 4416 + }, + { + "epoch": 0.7253915792498922, + "grad_norm": 0.43350275798950666, + "learning_rate": 8.592112195692776e-06, + "loss": 0.5063, + "step": 4417 + }, + { + "epoch": 0.7255558064582349, + "grad_norm": 0.4678623945111824, + "learning_rate": 8.591917074528921e-06, + "loss": 0.5424, + "step": 4418 + }, + { + "epoch": 0.7257200336665777, + "grad_norm": 0.287898271867044, + "learning_rate": 8.59172190892282e-06, + "loss": 0.5038, + "step": 4419 + }, + { + "epoch": 0.7258842608749204, + "grad_norm": 0.29822897607011906, + "learning_rate": 8.591526698876592e-06, + "loss": 0.5165, + "step": 4420 + }, + { + "epoch": 0.7260484880832632, + "grad_norm": 0.3109623634429174, + "learning_rate": 8.591331444392361e-06, + "loss": 0.5296, + "step": 4421 + }, + { + "epoch": 0.7262127152916059, + "grad_norm": 0.3741682736438768, + "learning_rate": 8.591136145472244e-06, + "loss": 0.5166, + "step": 4422 + }, + { + "epoch": 0.7263769424999487, + "grad_norm": 0.3281684045665565, + "learning_rate": 8.590940802118363e-06, + "loss": 0.4932, + "step": 4423 + }, + { + "epoch": 0.7265411697082914, + "grad_norm": 0.2967006165698348, + "learning_rate": 8.59074541433284e-06, + "loss": 0.5341, + "step": 4424 + }, + { + "epoch": 0.7267053969166342, + "grad_norm": 0.41241388629963116, + "learning_rate": 8.590549982117798e-06, + "loss": 0.5022, + "step": 4425 + }, + { + "epoch": 0.7268696241249769, + "grad_norm": 0.3291196432580282, + "learning_rate": 8.590354505475357e-06, + "loss": 0.5139, + "step": 4426 + }, + { + "epoch": 0.7270338513333197, + "grad_norm": 0.32679147888924837, + "learning_rate": 8.590158984407644e-06, + "loss": 0.5137, + "step": 4427 + }, + { + "epoch": 0.7271980785416624, + "grad_norm": 0.28783281690844026, + "learning_rate": 8.589963418916778e-06, + "loss": 0.5296, + "step": 4428 + }, + { + "epoch": 0.7273623057500052, + "grad_norm": 0.31121651109770526, + "learning_rate": 8.589767809004886e-06, + "loss": 0.5407, + "step": 4429 + }, + { + "epoch": 0.7275265329583479, + "grad_norm": 0.3231129949957135, + "learning_rate": 8.589572154674093e-06, + "loss": 0.5187, + "step": 4430 + }, + { + "epoch": 0.7276907601666907, + "grad_norm": 0.35344471569362507, + "learning_rate": 8.589376455926521e-06, + "loss": 0.5165, + "step": 4431 + }, + { + "epoch": 0.7278549873750334, + "grad_norm": 0.30854070183351756, + "learning_rate": 8.589180712764298e-06, + "loss": 0.5286, + "step": 4432 + }, + { + "epoch": 0.7280192145833762, + "grad_norm": 0.3771827531386895, + "learning_rate": 8.58898492518955e-06, + "loss": 0.5381, + "step": 4433 + }, + { + "epoch": 0.7281834417917188, + "grad_norm": 0.3842022509196105, + "learning_rate": 8.588789093204402e-06, + "loss": 0.5166, + "step": 4434 + }, + { + "epoch": 0.7283476690000615, + "grad_norm": 0.3276679515164658, + "learning_rate": 8.58859321681098e-06, + "loss": 0.5236, + "step": 4435 + }, + { + "epoch": 0.7285118962084043, + "grad_norm": 0.32004257785016976, + "learning_rate": 8.588397296011416e-06, + "loss": 0.5196, + "step": 4436 + }, + { + "epoch": 0.728676123416747, + "grad_norm": 0.33663249793293937, + "learning_rate": 8.588201330807833e-06, + "loss": 0.5278, + "step": 4437 + }, + { + "epoch": 0.7288403506250898, + "grad_norm": 0.33869601572570573, + "learning_rate": 8.588005321202361e-06, + "loss": 0.5294, + "step": 4438 + }, + { + "epoch": 0.7290045778334325, + "grad_norm": 0.29466669461762524, + "learning_rate": 8.587809267197132e-06, + "loss": 0.5347, + "step": 4439 + }, + { + "epoch": 0.7291688050417753, + "grad_norm": 0.32128430605037284, + "learning_rate": 8.587613168794269e-06, + "loss": 0.5244, + "step": 4440 + }, + { + "epoch": 0.729333032250118, + "grad_norm": 0.32825756800992645, + "learning_rate": 8.587417025995909e-06, + "loss": 0.5339, + "step": 4441 + }, + { + "epoch": 0.7294972594584608, + "grad_norm": 0.3301350950702061, + "learning_rate": 8.587220838804176e-06, + "loss": 0.5109, + "step": 4442 + }, + { + "epoch": 0.7296614866668035, + "grad_norm": 0.28104238952110344, + "learning_rate": 8.587024607221203e-06, + "loss": 0.5046, + "step": 4443 + }, + { + "epoch": 0.7298257138751463, + "grad_norm": 0.37224012508447873, + "learning_rate": 8.586828331249123e-06, + "loss": 0.5169, + "step": 4444 + }, + { + "epoch": 0.729989941083489, + "grad_norm": 0.39243426907646256, + "learning_rate": 8.586632010890065e-06, + "loss": 0.4953, + "step": 4445 + }, + { + "epoch": 0.7301541682918318, + "grad_norm": 0.3699144774546227, + "learning_rate": 8.586435646146164e-06, + "loss": 0.5161, + "step": 4446 + }, + { + "epoch": 0.7303183955001745, + "grad_norm": 0.403502931249784, + "learning_rate": 8.586239237019552e-06, + "loss": 0.5144, + "step": 4447 + }, + { + "epoch": 0.7304826227085173, + "grad_norm": 0.33530858772743866, + "learning_rate": 8.586042783512361e-06, + "loss": 0.5065, + "step": 4448 + }, + { + "epoch": 0.73064684991686, + "grad_norm": 0.36473692233901733, + "learning_rate": 8.585846285626724e-06, + "loss": 0.5289, + "step": 4449 + }, + { + "epoch": 0.7308110771252028, + "grad_norm": 0.39727298026229163, + "learning_rate": 8.585649743364778e-06, + "loss": 0.5261, + "step": 4450 + }, + { + "epoch": 0.7309753043335454, + "grad_norm": 0.34627397363700524, + "learning_rate": 8.585453156728655e-06, + "loss": 0.4968, + "step": 4451 + }, + { + "epoch": 0.7311395315418882, + "grad_norm": 0.3024856634002474, + "learning_rate": 8.58525652572049e-06, + "loss": 0.5308, + "step": 4452 + }, + { + "epoch": 0.7313037587502309, + "grad_norm": 0.3052356152052224, + "learning_rate": 8.585059850342422e-06, + "loss": 0.4987, + "step": 4453 + }, + { + "epoch": 0.7314679859585737, + "grad_norm": 0.3868676594902889, + "learning_rate": 8.584863130596584e-06, + "loss": 0.5319, + "step": 4454 + }, + { + "epoch": 0.7316322131669164, + "grad_norm": 0.3114269144053163, + "learning_rate": 8.584666366485115e-06, + "loss": 0.5191, + "step": 4455 + }, + { + "epoch": 0.7317964403752591, + "grad_norm": 0.32173171845094, + "learning_rate": 8.584469558010148e-06, + "loss": 0.5146, + "step": 4456 + }, + { + "epoch": 0.7319606675836019, + "grad_norm": 0.32749605605918813, + "learning_rate": 8.584272705173824e-06, + "loss": 0.5223, + "step": 4457 + }, + { + "epoch": 0.7321248947919446, + "grad_norm": 0.35873045536107073, + "learning_rate": 8.58407580797828e-06, + "loss": 0.5307, + "step": 4458 + }, + { + "epoch": 0.7322891220002874, + "grad_norm": 0.33696054538783504, + "learning_rate": 8.583878866425656e-06, + "loss": 0.5112, + "step": 4459 + }, + { + "epoch": 0.7324533492086301, + "grad_norm": 0.322150007734754, + "learning_rate": 8.583681880518088e-06, + "loss": 0.5408, + "step": 4460 + }, + { + "epoch": 0.7326175764169729, + "grad_norm": 0.3577905586167644, + "learning_rate": 8.583484850257717e-06, + "loss": 0.5247, + "step": 4461 + }, + { + "epoch": 0.7327818036253156, + "grad_norm": 0.3890314954445434, + "learning_rate": 8.583287775646683e-06, + "loss": 0.5403, + "step": 4462 + }, + { + "epoch": 0.7329460308336584, + "grad_norm": 0.322614607637845, + "learning_rate": 8.583090656687126e-06, + "loss": 0.5116, + "step": 4463 + }, + { + "epoch": 0.7331102580420011, + "grad_norm": 0.37697488192045586, + "learning_rate": 8.582893493381187e-06, + "loss": 0.5333, + "step": 4464 + }, + { + "epoch": 0.7332744852503439, + "grad_norm": 0.37182964027775867, + "learning_rate": 8.58269628573101e-06, + "loss": 0.5285, + "step": 4465 + }, + { + "epoch": 0.7334387124586866, + "grad_norm": 0.31262978823905135, + "learning_rate": 8.582499033738732e-06, + "loss": 0.5342, + "step": 4466 + }, + { + "epoch": 0.7336029396670294, + "grad_norm": 0.29094147937733944, + "learning_rate": 8.582301737406498e-06, + "loss": 0.522, + "step": 4467 + }, + { + "epoch": 0.733767166875372, + "grad_norm": 0.3595441468442823, + "learning_rate": 8.582104396736453e-06, + "loss": 0.5367, + "step": 4468 + }, + { + "epoch": 0.7339313940837148, + "grad_norm": 0.32354047052265966, + "learning_rate": 8.581907011730735e-06, + "loss": 0.5146, + "step": 4469 + }, + { + "epoch": 0.7340956212920575, + "grad_norm": 0.3434044743236067, + "learning_rate": 8.581709582391492e-06, + "loss": 0.5308, + "step": 4470 + }, + { + "epoch": 0.7342598485004003, + "grad_norm": 0.2966580125366594, + "learning_rate": 8.581512108720868e-06, + "loss": 0.5352, + "step": 4471 + }, + { + "epoch": 0.734424075708743, + "grad_norm": 0.3034642435029345, + "learning_rate": 8.581314590721006e-06, + "loss": 0.529, + "step": 4472 + }, + { + "epoch": 0.7345883029170858, + "grad_norm": 0.3172501587579415, + "learning_rate": 8.581117028394052e-06, + "loss": 0.4962, + "step": 4473 + }, + { + "epoch": 0.7347525301254285, + "grad_norm": 0.3057272334680113, + "learning_rate": 8.580919421742153e-06, + "loss": 0.5362, + "step": 4474 + }, + { + "epoch": 0.7349167573337713, + "grad_norm": 0.27472219505829215, + "learning_rate": 8.580721770767452e-06, + "loss": 0.5451, + "step": 4475 + }, + { + "epoch": 0.735080984542114, + "grad_norm": 0.3101713531189586, + "learning_rate": 8.580524075472099e-06, + "loss": 0.5333, + "step": 4476 + }, + { + "epoch": 0.7352452117504568, + "grad_norm": 0.3294719891527552, + "learning_rate": 8.58032633585824e-06, + "loss": 0.5193, + "step": 4477 + }, + { + "epoch": 0.7354094389587995, + "grad_norm": 2.171350755473058, + "learning_rate": 8.580128551928021e-06, + "loss": 0.4927, + "step": 4478 + }, + { + "epoch": 0.7355736661671423, + "grad_norm": 0.3730561619746024, + "learning_rate": 8.579930723683592e-06, + "loss": 0.4969, + "step": 4479 + }, + { + "epoch": 0.735737893375485, + "grad_norm": 0.5268860411236578, + "learning_rate": 8.579732851127102e-06, + "loss": 0.5256, + "step": 4480 + }, + { + "epoch": 0.7359021205838278, + "grad_norm": 0.3387582635476446, + "learning_rate": 8.5795349342607e-06, + "loss": 0.5631, + "step": 4481 + }, + { + "epoch": 0.7360663477921705, + "grad_norm": 0.28861855998881153, + "learning_rate": 8.579336973086535e-06, + "loss": 0.5199, + "step": 4482 + }, + { + "epoch": 0.7362305750005133, + "grad_norm": 0.2942221387932248, + "learning_rate": 8.579138967606755e-06, + "loss": 0.5036, + "step": 4483 + }, + { + "epoch": 0.736394802208856, + "grad_norm": 1.0358662167933195, + "learning_rate": 8.578940917823514e-06, + "loss": 0.5111, + "step": 4484 + }, + { + "epoch": 0.7365590294171986, + "grad_norm": 0.37438050679631024, + "learning_rate": 8.578742823738961e-06, + "loss": 0.5173, + "step": 4485 + }, + { + "epoch": 0.7367232566255414, + "grad_norm": 0.3277582159694442, + "learning_rate": 8.578544685355248e-06, + "loss": 0.528, + "step": 4486 + }, + { + "epoch": 0.7368874838338841, + "grad_norm": 0.30091880038912744, + "learning_rate": 8.578346502674526e-06, + "loss": 0.5231, + "step": 4487 + }, + { + "epoch": 0.7370517110422269, + "grad_norm": 0.3074065546946728, + "learning_rate": 8.578148275698951e-06, + "loss": 0.5222, + "step": 4488 + }, + { + "epoch": 0.7372159382505696, + "grad_norm": 0.354491887670448, + "learning_rate": 8.577950004430672e-06, + "loss": 0.5327, + "step": 4489 + }, + { + "epoch": 0.7373801654589124, + "grad_norm": 0.29587883852898034, + "learning_rate": 8.577751688871842e-06, + "loss": 0.5161, + "step": 4490 + }, + { + "epoch": 0.7375443926672551, + "grad_norm": 0.3460943354301366, + "learning_rate": 8.577553329024618e-06, + "loss": 0.5146, + "step": 4491 + }, + { + "epoch": 0.7377086198755979, + "grad_norm": 0.2967950619579812, + "learning_rate": 8.577354924891155e-06, + "loss": 0.53, + "step": 4492 + }, + { + "epoch": 0.7378728470839406, + "grad_norm": 0.342330627577297, + "learning_rate": 8.577156476473603e-06, + "loss": 0.5101, + "step": 4493 + }, + { + "epoch": 0.7380370742922834, + "grad_norm": 0.2936759776996891, + "learning_rate": 8.576957983774123e-06, + "loss": 0.5217, + "step": 4494 + }, + { + "epoch": 0.7382013015006261, + "grad_norm": 0.3769017994359706, + "learning_rate": 8.576759446794865e-06, + "loss": 0.5267, + "step": 4495 + }, + { + "epoch": 0.7383655287089689, + "grad_norm": 0.3243258117086145, + "learning_rate": 8.57656086553799e-06, + "loss": 0.5333, + "step": 4496 + }, + { + "epoch": 0.7385297559173116, + "grad_norm": 0.288283887436931, + "learning_rate": 8.576362240005653e-06, + "loss": 0.5028, + "step": 4497 + }, + { + "epoch": 0.7386939831256544, + "grad_norm": 0.47367175940020345, + "learning_rate": 8.576163570200013e-06, + "loss": 0.5243, + "step": 4498 + }, + { + "epoch": 0.7388582103339971, + "grad_norm": 0.33129672486938455, + "learning_rate": 8.575964856123224e-06, + "loss": 0.5116, + "step": 4499 + }, + { + "epoch": 0.7390224375423399, + "grad_norm": 0.30974731627218893, + "learning_rate": 8.575766097777447e-06, + "loss": 0.5142, + "step": 4500 + }, + { + "epoch": 0.7391866647506826, + "grad_norm": 0.32922854453451555, + "learning_rate": 8.575567295164842e-06, + "loss": 0.5181, + "step": 4501 + }, + { + "epoch": 0.7393508919590253, + "grad_norm": 0.3561359117611993, + "learning_rate": 8.575368448287564e-06, + "loss": 0.5343, + "step": 4502 + }, + { + "epoch": 0.739515119167368, + "grad_norm": 0.30225794184268934, + "learning_rate": 8.575169557147775e-06, + "loss": 0.5407, + "step": 4503 + }, + { + "epoch": 0.7396793463757108, + "grad_norm": 0.3190486485860462, + "learning_rate": 8.574970621747636e-06, + "loss": 0.5242, + "step": 4504 + }, + { + "epoch": 0.7398435735840535, + "grad_norm": 1.4215271237322333, + "learning_rate": 8.57477164208931e-06, + "loss": 0.4993, + "step": 4505 + }, + { + "epoch": 0.7400078007923963, + "grad_norm": 0.303708837722683, + "learning_rate": 8.574572618174951e-06, + "loss": 0.5197, + "step": 4506 + }, + { + "epoch": 0.740172028000739, + "grad_norm": 0.3312313092554724, + "learning_rate": 8.574373550006724e-06, + "loss": 0.5142, + "step": 4507 + }, + { + "epoch": 0.7403362552090818, + "grad_norm": 0.3313425846200784, + "learning_rate": 8.574174437586794e-06, + "loss": 0.5084, + "step": 4508 + }, + { + "epoch": 0.7405004824174245, + "grad_norm": 0.34204198699638977, + "learning_rate": 8.573975280917321e-06, + "loss": 0.5148, + "step": 4509 + }, + { + "epoch": 0.7406647096257672, + "grad_norm": 0.2988610461822118, + "learning_rate": 8.573776080000466e-06, + "loss": 0.5205, + "step": 4510 + }, + { + "epoch": 0.74082893683411, + "grad_norm": 0.28717571034744444, + "learning_rate": 8.573576834838397e-06, + "loss": 0.528, + "step": 4511 + }, + { + "epoch": 0.7409931640424527, + "grad_norm": 0.317583520589486, + "learning_rate": 8.573377545433275e-06, + "loss": 0.5248, + "step": 4512 + }, + { + "epoch": 0.7411573912507955, + "grad_norm": 0.32758612071497484, + "learning_rate": 8.573178211787266e-06, + "loss": 0.5128, + "step": 4513 + }, + { + "epoch": 0.7413216184591382, + "grad_norm": 0.33237284591262994, + "learning_rate": 8.572978833902531e-06, + "loss": 0.5349, + "step": 4514 + }, + { + "epoch": 0.741485845667481, + "grad_norm": 0.42198291061829296, + "learning_rate": 8.572779411781242e-06, + "loss": 0.5221, + "step": 4515 + }, + { + "epoch": 0.7416500728758237, + "grad_norm": 0.31109245246405137, + "learning_rate": 8.57257994542556e-06, + "loss": 0.5072, + "step": 4516 + }, + { + "epoch": 0.7418143000841665, + "grad_norm": 0.2980861107722376, + "learning_rate": 8.572380434837653e-06, + "loss": 0.5141, + "step": 4517 + }, + { + "epoch": 0.7419785272925092, + "grad_norm": 0.30924696499077564, + "learning_rate": 8.572180880019688e-06, + "loss": 0.5352, + "step": 4518 + }, + { + "epoch": 0.7421427545008519, + "grad_norm": 0.339786312302817, + "learning_rate": 8.571981280973832e-06, + "loss": 0.5196, + "step": 4519 + }, + { + "epoch": 0.7423069817091946, + "grad_norm": 0.274055301165695, + "learning_rate": 8.571781637702254e-06, + "loss": 0.5105, + "step": 4520 + }, + { + "epoch": 0.7424712089175374, + "grad_norm": 0.34291339425641154, + "learning_rate": 8.571581950207121e-06, + "loss": 0.5071, + "step": 4521 + }, + { + "epoch": 0.7426354361258801, + "grad_norm": 0.2971810938064332, + "learning_rate": 8.571382218490602e-06, + "loss": 0.5295, + "step": 4522 + }, + { + "epoch": 0.7427996633342229, + "grad_norm": 0.27086010154061957, + "learning_rate": 8.571182442554865e-06, + "loss": 0.4966, + "step": 4523 + }, + { + "epoch": 0.7429638905425656, + "grad_norm": 0.3187861637022362, + "learning_rate": 8.570982622402082e-06, + "loss": 0.5025, + "step": 4524 + }, + { + "epoch": 0.7431281177509084, + "grad_norm": 0.304476818924289, + "learning_rate": 8.570782758034423e-06, + "loss": 0.5017, + "step": 4525 + }, + { + "epoch": 0.7432923449592511, + "grad_norm": 0.3673418689445969, + "learning_rate": 8.570582849454057e-06, + "loss": 0.521, + "step": 4526 + }, + { + "epoch": 0.7434565721675939, + "grad_norm": 0.31022619613586055, + "learning_rate": 8.570382896663158e-06, + "loss": 0.5205, + "step": 4527 + }, + { + "epoch": 0.7436207993759366, + "grad_norm": 0.29831412465170415, + "learning_rate": 8.570182899663896e-06, + "loss": 0.538, + "step": 4528 + }, + { + "epoch": 0.7437850265842794, + "grad_norm": 0.26560917646771326, + "learning_rate": 8.569982858458441e-06, + "loss": 0.4942, + "step": 4529 + }, + { + "epoch": 0.7439492537926221, + "grad_norm": 0.40343919917248544, + "learning_rate": 8.56978277304897e-06, + "loss": 0.5388, + "step": 4530 + }, + { + "epoch": 0.7441134810009649, + "grad_norm": 0.3538356979890078, + "learning_rate": 8.569582643437653e-06, + "loss": 0.5234, + "step": 4531 + }, + { + "epoch": 0.7442777082093076, + "grad_norm": 0.3502814342255026, + "learning_rate": 8.569382469626664e-06, + "loss": 0.5098, + "step": 4532 + }, + { + "epoch": 0.7444419354176504, + "grad_norm": 0.33687403724482856, + "learning_rate": 8.56918225161818e-06, + "loss": 0.5133, + "step": 4533 + }, + { + "epoch": 0.7446061626259931, + "grad_norm": 0.28226515273483066, + "learning_rate": 8.56898198941437e-06, + "loss": 0.51, + "step": 4534 + }, + { + "epoch": 0.7447703898343359, + "grad_norm": 0.30469490349467954, + "learning_rate": 8.568781683017414e-06, + "loss": 0.5227, + "step": 4535 + }, + { + "epoch": 0.7449346170426785, + "grad_norm": 0.35862422640377734, + "learning_rate": 8.568581332429486e-06, + "loss": 0.5168, + "step": 4536 + }, + { + "epoch": 0.7450988442510212, + "grad_norm": 0.2818516886467351, + "learning_rate": 8.568380937652761e-06, + "loss": 0.5075, + "step": 4537 + }, + { + "epoch": 0.745263071459364, + "grad_norm": 0.2918812274214278, + "learning_rate": 8.568180498689417e-06, + "loss": 0.5286, + "step": 4538 + }, + { + "epoch": 0.7454272986677067, + "grad_norm": 0.3570737420094653, + "learning_rate": 8.56798001554163e-06, + "loss": 0.5322, + "step": 4539 + }, + { + "epoch": 0.7455915258760495, + "grad_norm": 0.2910121417077147, + "learning_rate": 8.567779488211577e-06, + "loss": 0.5371, + "step": 4540 + }, + { + "epoch": 0.7457557530843922, + "grad_norm": 0.3164570259557555, + "learning_rate": 8.567578916701437e-06, + "loss": 0.5149, + "step": 4541 + }, + { + "epoch": 0.745919980292735, + "grad_norm": 0.31432087465115854, + "learning_rate": 8.567378301013388e-06, + "loss": 0.5174, + "step": 4542 + }, + { + "epoch": 0.7460842075010777, + "grad_norm": 0.3332922788821445, + "learning_rate": 8.56717764114961e-06, + "loss": 0.4993, + "step": 4543 + }, + { + "epoch": 0.7462484347094205, + "grad_norm": 0.2942101993605978, + "learning_rate": 8.56697693711228e-06, + "loss": 0.5338, + "step": 4544 + }, + { + "epoch": 0.7464126619177632, + "grad_norm": 0.3123029158168016, + "learning_rate": 8.566776188903579e-06, + "loss": 0.53, + "step": 4545 + }, + { + "epoch": 0.746576889126106, + "grad_norm": 0.33334606207740836, + "learning_rate": 8.566575396525688e-06, + "loss": 0.5401, + "step": 4546 + }, + { + "epoch": 0.7467411163344487, + "grad_norm": 0.3499456470338676, + "learning_rate": 8.566374559980787e-06, + "loss": 0.5053, + "step": 4547 + }, + { + "epoch": 0.7469053435427915, + "grad_norm": 0.5894023063619104, + "learning_rate": 8.566173679271057e-06, + "loss": 0.5044, + "step": 4548 + }, + { + "epoch": 0.7470695707511342, + "grad_norm": 0.3020765404041951, + "learning_rate": 8.565972754398682e-06, + "loss": 0.5099, + "step": 4549 + }, + { + "epoch": 0.747233797959477, + "grad_norm": 0.3171528202979881, + "learning_rate": 8.565771785365841e-06, + "loss": 0.5044, + "step": 4550 + }, + { + "epoch": 0.7473980251678197, + "grad_norm": 0.36820579002695975, + "learning_rate": 8.565570772174718e-06, + "loss": 0.5264, + "step": 4551 + }, + { + "epoch": 0.7475622523761625, + "grad_norm": 0.3652549638401894, + "learning_rate": 8.565369714827497e-06, + "loss": 0.517, + "step": 4552 + }, + { + "epoch": 0.7477264795845051, + "grad_norm": 0.28535511401718994, + "learning_rate": 8.565168613326362e-06, + "loss": 0.4884, + "step": 4553 + }, + { + "epoch": 0.7478907067928479, + "grad_norm": 0.33634032346122994, + "learning_rate": 8.564967467673494e-06, + "loss": 0.5246, + "step": 4554 + }, + { + "epoch": 0.7480549340011906, + "grad_norm": 0.28810968260779946, + "learning_rate": 8.564766277871081e-06, + "loss": 0.507, + "step": 4555 + }, + { + "epoch": 0.7482191612095334, + "grad_norm": 0.2992743113260256, + "learning_rate": 8.564565043921308e-06, + "loss": 0.5188, + "step": 4556 + }, + { + "epoch": 0.7483833884178761, + "grad_norm": 0.3558728320611078, + "learning_rate": 8.564363765826358e-06, + "loss": 0.5265, + "step": 4557 + }, + { + "epoch": 0.7485476156262189, + "grad_norm": 0.292448000228156, + "learning_rate": 8.564162443588421e-06, + "loss": 0.5252, + "step": 4558 + }, + { + "epoch": 0.7487118428345616, + "grad_norm": 0.3439831596581984, + "learning_rate": 8.56396107720968e-06, + "loss": 0.5187, + "step": 4559 + }, + { + "epoch": 0.7488760700429044, + "grad_norm": 0.40810472098558376, + "learning_rate": 8.563759666692323e-06, + "loss": 0.524, + "step": 4560 + }, + { + "epoch": 0.7490402972512471, + "grad_norm": 0.2845210850491724, + "learning_rate": 8.563558212038538e-06, + "loss": 0.5223, + "step": 4561 + }, + { + "epoch": 0.7492045244595898, + "grad_norm": 0.361454778137837, + "learning_rate": 8.563356713250513e-06, + "loss": 0.5032, + "step": 4562 + }, + { + "epoch": 0.7493687516679326, + "grad_norm": 0.3123393644294071, + "learning_rate": 8.563155170330436e-06, + "loss": 0.5003, + "step": 4563 + }, + { + "epoch": 0.7495329788762753, + "grad_norm": 0.2893188997077662, + "learning_rate": 8.562953583280497e-06, + "loss": 0.5123, + "step": 4564 + }, + { + "epoch": 0.7496972060846181, + "grad_norm": 0.325060447014177, + "learning_rate": 8.562751952102883e-06, + "loss": 0.5326, + "step": 4565 + }, + { + "epoch": 0.7498614332929608, + "grad_norm": 0.38334438203554844, + "learning_rate": 8.562550276799788e-06, + "loss": 0.5499, + "step": 4566 + }, + { + "epoch": 0.7500256605013036, + "grad_norm": 0.2895758675873873, + "learning_rate": 8.5623485573734e-06, + "loss": 0.5258, + "step": 4567 + }, + { + "epoch": 0.7501898877096463, + "grad_norm": 0.34262490892337305, + "learning_rate": 8.562146793825907e-06, + "loss": 0.5252, + "step": 4568 + }, + { + "epoch": 0.7503541149179891, + "grad_norm": 0.3077035016491017, + "learning_rate": 8.561944986159505e-06, + "loss": 0.5358, + "step": 4569 + }, + { + "epoch": 0.7505183421263317, + "grad_norm": 0.3032102839545749, + "learning_rate": 8.561743134376384e-06, + "loss": 0.5255, + "step": 4570 + }, + { + "epoch": 0.7506825693346745, + "grad_norm": 0.33693998727707863, + "learning_rate": 8.561541238478735e-06, + "loss": 0.5141, + "step": 4571 + }, + { + "epoch": 0.7508467965430172, + "grad_norm": 0.2780956966036318, + "learning_rate": 8.561339298468753e-06, + "loss": 0.5382, + "step": 4572 + }, + { + "epoch": 0.75101102375136, + "grad_norm": 0.33119142853483213, + "learning_rate": 8.56113731434863e-06, + "loss": 0.5032, + "step": 4573 + }, + { + "epoch": 0.7511752509597027, + "grad_norm": 0.2969192531504866, + "learning_rate": 8.560935286120562e-06, + "loss": 0.5012, + "step": 4574 + }, + { + "epoch": 0.7513394781680455, + "grad_norm": 0.30932229204461903, + "learning_rate": 8.560733213786741e-06, + "loss": 0.5228, + "step": 4575 + }, + { + "epoch": 0.7515037053763882, + "grad_norm": 0.6375810468219024, + "learning_rate": 8.56053109734936e-06, + "loss": 0.5347, + "step": 4576 + }, + { + "epoch": 0.751667932584731, + "grad_norm": 0.3602747428237164, + "learning_rate": 8.56032893681062e-06, + "loss": 0.5155, + "step": 4577 + }, + { + "epoch": 0.7518321597930737, + "grad_norm": 0.3344243202645859, + "learning_rate": 8.560126732172709e-06, + "loss": 0.5525, + "step": 4578 + }, + { + "epoch": 0.7519963870014165, + "grad_norm": 0.6821768214109283, + "learning_rate": 8.559924483437827e-06, + "loss": 0.5184, + "step": 4579 + }, + { + "epoch": 0.7521606142097592, + "grad_norm": 0.35181079249212044, + "learning_rate": 8.559722190608174e-06, + "loss": 0.5069, + "step": 4580 + }, + { + "epoch": 0.752324841418102, + "grad_norm": 0.32076296100754514, + "learning_rate": 8.55951985368594e-06, + "loss": 0.5245, + "step": 4581 + }, + { + "epoch": 0.7524890686264447, + "grad_norm": 0.34037129519099774, + "learning_rate": 8.55931747267333e-06, + "loss": 0.5204, + "step": 4582 + }, + { + "epoch": 0.7526532958347875, + "grad_norm": 0.35839103085610946, + "learning_rate": 8.559115047572537e-06, + "loss": 0.5028, + "step": 4583 + }, + { + "epoch": 0.7528175230431302, + "grad_norm": 0.32243397643030064, + "learning_rate": 8.55891257838576e-06, + "loss": 0.5202, + "step": 4584 + }, + { + "epoch": 0.752981750251473, + "grad_norm": 0.3207564345064311, + "learning_rate": 8.5587100651152e-06, + "loss": 0.51, + "step": 4585 + }, + { + "epoch": 0.7531459774598157, + "grad_norm": 0.28960495569847716, + "learning_rate": 8.558507507763055e-06, + "loss": 0.5065, + "step": 4586 + }, + { + "epoch": 0.7533102046681583, + "grad_norm": 0.32541735259990995, + "learning_rate": 8.558304906331525e-06, + "loss": 0.5082, + "step": 4587 + }, + { + "epoch": 0.7534744318765011, + "grad_norm": 0.4295942413931405, + "learning_rate": 8.558102260822812e-06, + "loss": 0.5032, + "step": 4588 + }, + { + "epoch": 0.7536386590848438, + "grad_norm": 0.3270938372241511, + "learning_rate": 8.557899571239115e-06, + "loss": 0.5189, + "step": 4589 + }, + { + "epoch": 0.7538028862931866, + "grad_norm": 0.4618734327758789, + "learning_rate": 8.557696837582636e-06, + "loss": 0.5126, + "step": 4590 + }, + { + "epoch": 0.7539671135015293, + "grad_norm": 0.2856325652395653, + "learning_rate": 8.557494059855579e-06, + "loss": 0.495, + "step": 4591 + }, + { + "epoch": 0.7541313407098721, + "grad_norm": 0.2922332368335058, + "learning_rate": 8.557291238060142e-06, + "loss": 0.5173, + "step": 4592 + }, + { + "epoch": 0.7542955679182148, + "grad_norm": 0.26960221791122096, + "learning_rate": 8.557088372198532e-06, + "loss": 0.5192, + "step": 4593 + }, + { + "epoch": 0.7544597951265576, + "grad_norm": 0.29754034652447925, + "learning_rate": 8.55688546227295e-06, + "loss": 0.5044, + "step": 4594 + }, + { + "epoch": 0.7546240223349003, + "grad_norm": 0.302344635843857, + "learning_rate": 8.556682508285601e-06, + "loss": 0.5439, + "step": 4595 + }, + { + "epoch": 0.7547882495432431, + "grad_norm": 0.3497725787114854, + "learning_rate": 8.556479510238688e-06, + "loss": 0.5342, + "step": 4596 + }, + { + "epoch": 0.7549524767515858, + "grad_norm": 0.29555475207440246, + "learning_rate": 8.556276468134418e-06, + "loss": 0.5399, + "step": 4597 + }, + { + "epoch": 0.7551167039599286, + "grad_norm": 0.2604097571549104, + "learning_rate": 8.556073381974992e-06, + "loss": 0.509, + "step": 4598 + }, + { + "epoch": 0.7552809311682713, + "grad_norm": 0.33188600308422034, + "learning_rate": 8.555870251762619e-06, + "loss": 0.5198, + "step": 4599 + }, + { + "epoch": 0.7554451583766141, + "grad_norm": 0.3258458849112466, + "learning_rate": 8.555667077499506e-06, + "loss": 0.5124, + "step": 4600 + }, + { + "epoch": 0.7556093855849568, + "grad_norm": 0.34142640335866914, + "learning_rate": 8.555463859187858e-06, + "loss": 0.5383, + "step": 4601 + }, + { + "epoch": 0.7557736127932996, + "grad_norm": 0.32818633170502004, + "learning_rate": 8.555260596829882e-06, + "loss": 0.533, + "step": 4602 + }, + { + "epoch": 0.7559378400016423, + "grad_norm": 0.3873086927551184, + "learning_rate": 8.555057290427787e-06, + "loss": 0.5217, + "step": 4603 + }, + { + "epoch": 0.756102067209985, + "grad_norm": 0.33476337551175733, + "learning_rate": 8.55485393998378e-06, + "loss": 0.5138, + "step": 4604 + }, + { + "epoch": 0.7562662944183277, + "grad_norm": 0.3318686735908207, + "learning_rate": 8.554650545500068e-06, + "loss": 0.5022, + "step": 4605 + }, + { + "epoch": 0.7564305216266705, + "grad_norm": 0.326403174472294, + "learning_rate": 8.554447106978865e-06, + "loss": 0.5214, + "step": 4606 + }, + { + "epoch": 0.7565947488350132, + "grad_norm": 0.39247588112022863, + "learning_rate": 8.554243624422373e-06, + "loss": 0.5156, + "step": 4607 + }, + { + "epoch": 0.756758976043356, + "grad_norm": 0.326663427806945, + "learning_rate": 8.55404009783281e-06, + "loss": 0.5398, + "step": 4608 + }, + { + "epoch": 0.7569232032516987, + "grad_norm": 0.3459223648779705, + "learning_rate": 8.553836527212381e-06, + "loss": 0.4895, + "step": 4609 + }, + { + "epoch": 0.7570874304600415, + "grad_norm": 0.2941823147267791, + "learning_rate": 8.5536329125633e-06, + "loss": 0.518, + "step": 4610 + }, + { + "epoch": 0.7572516576683842, + "grad_norm": 0.30618428789154545, + "learning_rate": 8.553429253887778e-06, + "loss": 0.5023, + "step": 4611 + }, + { + "epoch": 0.757415884876727, + "grad_norm": 0.31806576498609396, + "learning_rate": 8.553225551188025e-06, + "loss": 0.5371, + "step": 4612 + }, + { + "epoch": 0.7575801120850697, + "grad_norm": 0.35278860041835625, + "learning_rate": 8.553021804466254e-06, + "loss": 0.5085, + "step": 4613 + }, + { + "epoch": 0.7577443392934124, + "grad_norm": 0.3214722436543048, + "learning_rate": 8.55281801372468e-06, + "loss": 0.5145, + "step": 4614 + }, + { + "epoch": 0.7579085665017552, + "grad_norm": 0.32654098045991403, + "learning_rate": 8.552614178965514e-06, + "loss": 0.5254, + "step": 4615 + }, + { + "epoch": 0.758072793710098, + "grad_norm": 0.3281384004825425, + "learning_rate": 8.552410300190972e-06, + "loss": 0.5087, + "step": 4616 + }, + { + "epoch": 0.7582370209184407, + "grad_norm": 0.3279057768090118, + "learning_rate": 8.552206377403265e-06, + "loss": 0.5073, + "step": 4617 + }, + { + "epoch": 0.7584012481267834, + "grad_norm": 0.33926049968773087, + "learning_rate": 8.552002410604613e-06, + "loss": 0.525, + "step": 4618 + }, + { + "epoch": 0.7585654753351262, + "grad_norm": 0.2795274039873359, + "learning_rate": 8.551798399797226e-06, + "loss": 0.5183, + "step": 4619 + }, + { + "epoch": 0.7587297025434689, + "grad_norm": 0.3206834642081961, + "learning_rate": 8.551594344983322e-06, + "loss": 0.5422, + "step": 4620 + }, + { + "epoch": 0.7588939297518116, + "grad_norm": 0.3058743058502909, + "learning_rate": 8.551390246165118e-06, + "loss": 0.4945, + "step": 4621 + }, + { + "epoch": 0.7590581569601543, + "grad_norm": 0.299835948991702, + "learning_rate": 8.551186103344828e-06, + "loss": 0.5012, + "step": 4622 + }, + { + "epoch": 0.7592223841684971, + "grad_norm": 0.2883728461560533, + "learning_rate": 8.550981916524673e-06, + "loss": 0.5257, + "step": 4623 + }, + { + "epoch": 0.7593866113768398, + "grad_norm": 0.2793782805803841, + "learning_rate": 8.550777685706869e-06, + "loss": 0.5489, + "step": 4624 + }, + { + "epoch": 0.7595508385851826, + "grad_norm": 0.2638335021783196, + "learning_rate": 8.550573410893633e-06, + "loss": 0.5093, + "step": 4625 + }, + { + "epoch": 0.7597150657935253, + "grad_norm": 0.3190328077240095, + "learning_rate": 8.550369092087185e-06, + "loss": 0.525, + "step": 4626 + }, + { + "epoch": 0.7598792930018681, + "grad_norm": 0.3627572414267673, + "learning_rate": 8.550164729289743e-06, + "loss": 0.5023, + "step": 4627 + }, + { + "epoch": 0.7600435202102108, + "grad_norm": 0.3706630695321476, + "learning_rate": 8.549960322503529e-06, + "loss": 0.5316, + "step": 4628 + }, + { + "epoch": 0.7602077474185536, + "grad_norm": 0.33057504555303346, + "learning_rate": 8.54975587173076e-06, + "loss": 0.5226, + "step": 4629 + }, + { + "epoch": 0.7603719746268963, + "grad_norm": 0.30309061540235327, + "learning_rate": 8.549551376973658e-06, + "loss": 0.5269, + "step": 4630 + }, + { + "epoch": 0.7605362018352391, + "grad_norm": 0.31184472191583695, + "learning_rate": 8.549346838234442e-06, + "loss": 0.5076, + "step": 4631 + }, + { + "epoch": 0.7607004290435818, + "grad_norm": 0.29095482111017995, + "learning_rate": 8.549142255515338e-06, + "loss": 0.4765, + "step": 4632 + }, + { + "epoch": 0.7608646562519246, + "grad_norm": 0.2922099969230343, + "learning_rate": 8.548937628818564e-06, + "loss": 0.5138, + "step": 4633 + }, + { + "epoch": 0.7610288834602673, + "grad_norm": 0.3400598354912872, + "learning_rate": 8.548732958146344e-06, + "loss": 0.5167, + "step": 4634 + }, + { + "epoch": 0.7611931106686101, + "grad_norm": 0.3381885179424978, + "learning_rate": 8.5485282435009e-06, + "loss": 0.5004, + "step": 4635 + }, + { + "epoch": 0.7613573378769528, + "grad_norm": 0.315306221932674, + "learning_rate": 8.548323484884457e-06, + "loss": 0.5548, + "step": 4636 + }, + { + "epoch": 0.7615215650852956, + "grad_norm": 0.3346000660174875, + "learning_rate": 8.548118682299237e-06, + "loss": 0.5271, + "step": 4637 + }, + { + "epoch": 0.7616857922936382, + "grad_norm": 0.4439712033523865, + "learning_rate": 8.547913835747465e-06, + "loss": 0.5002, + "step": 4638 + }, + { + "epoch": 0.761850019501981, + "grad_norm": 0.31153812213147325, + "learning_rate": 8.547708945231369e-06, + "loss": 0.52, + "step": 4639 + }, + { + "epoch": 0.7620142467103237, + "grad_norm": 0.340893810594224, + "learning_rate": 8.54750401075317e-06, + "loss": 0.518, + "step": 4640 + }, + { + "epoch": 0.7621784739186664, + "grad_norm": 0.2955501959425951, + "learning_rate": 8.547299032315092e-06, + "loss": 0.5409, + "step": 4641 + }, + { + "epoch": 0.7623427011270092, + "grad_norm": 0.28838767397812765, + "learning_rate": 8.547094009919367e-06, + "loss": 0.4977, + "step": 4642 + }, + { + "epoch": 0.7625069283353519, + "grad_norm": 0.36043013999166806, + "learning_rate": 8.546888943568222e-06, + "loss": 0.5148, + "step": 4643 + }, + { + "epoch": 0.7626711555436947, + "grad_norm": 0.2841530706289285, + "learning_rate": 8.546683833263877e-06, + "loss": 0.5261, + "step": 4644 + }, + { + "epoch": 0.7628353827520374, + "grad_norm": 0.3517544924162603, + "learning_rate": 8.546478679008567e-06, + "loss": 0.5239, + "step": 4645 + }, + { + "epoch": 0.7629996099603802, + "grad_norm": 0.3226642978400009, + "learning_rate": 8.546273480804516e-06, + "loss": 0.5214, + "step": 4646 + }, + { + "epoch": 0.7631638371687229, + "grad_norm": 0.3350012172177672, + "learning_rate": 8.546068238653956e-06, + "loss": 0.5265, + "step": 4647 + }, + { + "epoch": 0.7633280643770657, + "grad_norm": 0.3365204860158755, + "learning_rate": 8.54586295255911e-06, + "loss": 0.5224, + "step": 4648 + }, + { + "epoch": 0.7634922915854084, + "grad_norm": 0.30134315958920094, + "learning_rate": 8.545657622522215e-06, + "loss": 0.543, + "step": 4649 + }, + { + "epoch": 0.7636565187937512, + "grad_norm": 0.33019521783542843, + "learning_rate": 8.545452248545498e-06, + "loss": 0.5183, + "step": 4650 + }, + { + "epoch": 0.7638207460020939, + "grad_norm": 0.37232931232505206, + "learning_rate": 8.545246830631188e-06, + "loss": 0.5175, + "step": 4651 + }, + { + "epoch": 0.7639849732104367, + "grad_norm": 0.32383868334851396, + "learning_rate": 8.545041368781517e-06, + "loss": 0.5364, + "step": 4652 + }, + { + "epoch": 0.7641492004187794, + "grad_norm": 0.44489749436817166, + "learning_rate": 8.544835862998718e-06, + "loss": 0.5201, + "step": 4653 + }, + { + "epoch": 0.7643134276271222, + "grad_norm": 0.3077556378016318, + "learning_rate": 8.544630313285022e-06, + "loss": 0.5124, + "step": 4654 + }, + { + "epoch": 0.7644776548354648, + "grad_norm": 0.4603259411059813, + "learning_rate": 8.544424719642661e-06, + "loss": 0.5183, + "step": 4655 + }, + { + "epoch": 0.7646418820438076, + "grad_norm": 0.2832805233868871, + "learning_rate": 8.54421908207387e-06, + "loss": 0.53, + "step": 4656 + }, + { + "epoch": 0.7648061092521503, + "grad_norm": 0.29170501882490907, + "learning_rate": 8.54401340058088e-06, + "loss": 0.5308, + "step": 4657 + }, + { + "epoch": 0.7649703364604931, + "grad_norm": 0.2950625968091386, + "learning_rate": 8.543807675165924e-06, + "loss": 0.5321, + "step": 4658 + }, + { + "epoch": 0.7651345636688358, + "grad_norm": 0.3632685280310422, + "learning_rate": 8.543601905831239e-06, + "loss": 0.5116, + "step": 4659 + }, + { + "epoch": 0.7652987908771786, + "grad_norm": 0.3393323236915578, + "learning_rate": 8.54339609257906e-06, + "loss": 0.5123, + "step": 4660 + }, + { + "epoch": 0.7654630180855213, + "grad_norm": 0.4729462883437287, + "learning_rate": 8.543190235411619e-06, + "loss": 0.519, + "step": 4661 + }, + { + "epoch": 0.7656272452938641, + "grad_norm": 0.3293233461617647, + "learning_rate": 8.542984334331155e-06, + "loss": 0.5236, + "step": 4662 + }, + { + "epoch": 0.7657914725022068, + "grad_norm": 0.36363391812587026, + "learning_rate": 8.542778389339906e-06, + "loss": 0.5094, + "step": 4663 + }, + { + "epoch": 0.7659556997105496, + "grad_norm": 0.30508096976913734, + "learning_rate": 8.542572400440103e-06, + "loss": 0.5097, + "step": 4664 + }, + { + "epoch": 0.7661199269188923, + "grad_norm": 0.31527353025695465, + "learning_rate": 8.542366367633988e-06, + "loss": 0.5233, + "step": 4665 + }, + { + "epoch": 0.766284154127235, + "grad_norm": 0.3002322251825842, + "learning_rate": 8.542160290923796e-06, + "loss": 0.4995, + "step": 4666 + }, + { + "epoch": 0.7664483813355778, + "grad_norm": 0.35868121348697857, + "learning_rate": 8.541954170311768e-06, + "loss": 0.4835, + "step": 4667 + }, + { + "epoch": 0.7666126085439205, + "grad_norm": 0.2816569076089125, + "learning_rate": 8.541748005800139e-06, + "loss": 0.5175, + "step": 4668 + }, + { + "epoch": 0.7667768357522633, + "grad_norm": 0.2794544666938337, + "learning_rate": 8.54154179739115e-06, + "loss": 0.5097, + "step": 4669 + }, + { + "epoch": 0.766941062960606, + "grad_norm": 0.29808345975734646, + "learning_rate": 8.541335545087043e-06, + "loss": 0.4996, + "step": 4670 + }, + { + "epoch": 0.7671052901689488, + "grad_norm": 0.28395721446353817, + "learning_rate": 8.541129248890053e-06, + "loss": 0.5332, + "step": 4671 + }, + { + "epoch": 0.7672695173772914, + "grad_norm": 0.3514233085142723, + "learning_rate": 8.540922908802425e-06, + "loss": 0.4993, + "step": 4672 + }, + { + "epoch": 0.7674337445856342, + "grad_norm": 0.41429693067925427, + "learning_rate": 8.540716524826398e-06, + "loss": 0.5151, + "step": 4673 + }, + { + "epoch": 0.7675979717939769, + "grad_norm": 0.3520524597657039, + "learning_rate": 8.540510096964215e-06, + "loss": 0.5223, + "step": 4674 + }, + { + "epoch": 0.7677621990023197, + "grad_norm": 0.3328068751348965, + "learning_rate": 8.540303625218115e-06, + "loss": 0.5111, + "step": 4675 + }, + { + "epoch": 0.7679264262106624, + "grad_norm": 0.31064697105471334, + "learning_rate": 8.540097109590346e-06, + "loss": 0.5042, + "step": 4676 + }, + { + "epoch": 0.7680906534190052, + "grad_norm": 0.39821289781714075, + "learning_rate": 8.539890550083144e-06, + "loss": 0.5076, + "step": 4677 + }, + { + "epoch": 0.7682548806273479, + "grad_norm": 0.3141655423218781, + "learning_rate": 8.539683946698758e-06, + "loss": 0.5134, + "step": 4678 + }, + { + "epoch": 0.7684191078356907, + "grad_norm": 0.46016224300055375, + "learning_rate": 8.539477299439429e-06, + "loss": 0.5196, + "step": 4679 + }, + { + "epoch": 0.7685833350440334, + "grad_norm": 0.4735631016426876, + "learning_rate": 8.539270608307402e-06, + "loss": 0.5306, + "step": 4680 + }, + { + "epoch": 0.7687475622523762, + "grad_norm": 0.2952547172783303, + "learning_rate": 8.539063873304922e-06, + "loss": 0.4949, + "step": 4681 + }, + { + "epoch": 0.7689117894607189, + "grad_norm": 0.3079803366854746, + "learning_rate": 8.538857094434234e-06, + "loss": 0.5281, + "step": 4682 + }, + { + "epoch": 0.7690760166690617, + "grad_norm": 0.3159192970003273, + "learning_rate": 8.538650271697586e-06, + "loss": 0.49, + "step": 4683 + }, + { + "epoch": 0.7692402438774044, + "grad_norm": 0.31282817662971113, + "learning_rate": 8.538443405097223e-06, + "loss": 0.5307, + "step": 4684 + }, + { + "epoch": 0.7694044710857472, + "grad_norm": 0.32242372926036034, + "learning_rate": 8.538236494635389e-06, + "loss": 0.5396, + "step": 4685 + }, + { + "epoch": 0.7695686982940899, + "grad_norm": 0.48535860275289644, + "learning_rate": 8.538029540314334e-06, + "loss": 0.5167, + "step": 4686 + }, + { + "epoch": 0.7697329255024327, + "grad_norm": 0.30109361527460454, + "learning_rate": 8.537822542136306e-06, + "loss": 0.5317, + "step": 4687 + }, + { + "epoch": 0.7698971527107754, + "grad_norm": 0.4440781157107436, + "learning_rate": 8.537615500103553e-06, + "loss": 0.5044, + "step": 4688 + }, + { + "epoch": 0.770061379919118, + "grad_norm": 0.3322199954679528, + "learning_rate": 8.537408414218323e-06, + "loss": 0.5235, + "step": 4689 + }, + { + "epoch": 0.7702256071274608, + "grad_norm": 0.2736769475032565, + "learning_rate": 8.537201284482864e-06, + "loss": 0.5071, + "step": 4690 + }, + { + "epoch": 0.7703898343358035, + "grad_norm": 0.3251526213165742, + "learning_rate": 8.536994110899428e-06, + "loss": 0.4937, + "step": 4691 + }, + { + "epoch": 0.7705540615441463, + "grad_norm": 0.3412682423457841, + "learning_rate": 8.536786893470264e-06, + "loss": 0.5148, + "step": 4692 + }, + { + "epoch": 0.770718288752489, + "grad_norm": 0.3158500123302273, + "learning_rate": 8.536579632197622e-06, + "loss": 0.4904, + "step": 4693 + }, + { + "epoch": 0.7708825159608318, + "grad_norm": 0.38296860314735637, + "learning_rate": 8.536372327083755e-06, + "loss": 0.5299, + "step": 4694 + }, + { + "epoch": 0.7710467431691745, + "grad_norm": 0.31222738840079106, + "learning_rate": 8.536164978130913e-06, + "loss": 0.535, + "step": 4695 + }, + { + "epoch": 0.7712109703775173, + "grad_norm": 0.843025114686952, + "learning_rate": 8.535957585341349e-06, + "loss": 0.5245, + "step": 4696 + }, + { + "epoch": 0.77137519758586, + "grad_norm": 0.32381269833214565, + "learning_rate": 8.535750148717312e-06, + "loss": 0.5388, + "step": 4697 + }, + { + "epoch": 0.7715394247942028, + "grad_norm": 0.41423456394537234, + "learning_rate": 8.53554266826106e-06, + "loss": 0.5049, + "step": 4698 + }, + { + "epoch": 0.7717036520025455, + "grad_norm": 0.2918748934972015, + "learning_rate": 8.535335143974844e-06, + "loss": 0.5151, + "step": 4699 + }, + { + "epoch": 0.7718678792108883, + "grad_norm": 0.2785197474123093, + "learning_rate": 8.535127575860917e-06, + "loss": 0.523, + "step": 4700 + }, + { + "epoch": 0.772032106419231, + "grad_norm": 0.4162412664492259, + "learning_rate": 8.534919963921536e-06, + "loss": 0.5231, + "step": 4701 + }, + { + "epoch": 0.7721963336275738, + "grad_norm": 0.2769221462291233, + "learning_rate": 8.534712308158954e-06, + "loss": 0.5286, + "step": 4702 + }, + { + "epoch": 0.7723605608359165, + "grad_norm": 0.30009896167616434, + "learning_rate": 8.534504608575426e-06, + "loss": 0.5155, + "step": 4703 + }, + { + "epoch": 0.7725247880442593, + "grad_norm": 0.28533441325104786, + "learning_rate": 8.53429686517321e-06, + "loss": 0.4966, + "step": 4704 + }, + { + "epoch": 0.772689015252602, + "grad_norm": 1.2238805912739166, + "learning_rate": 8.534089077954558e-06, + "loss": 0.5149, + "step": 4705 + }, + { + "epoch": 0.7728532424609447, + "grad_norm": 0.29952221829630815, + "learning_rate": 8.533881246921732e-06, + "loss": 0.5181, + "step": 4706 + }, + { + "epoch": 0.7730174696692874, + "grad_norm": 0.320317196375448, + "learning_rate": 8.533673372076987e-06, + "loss": 0.492, + "step": 4707 + }, + { + "epoch": 0.7731816968776302, + "grad_norm": 0.3663326715952758, + "learning_rate": 8.53346545342258e-06, + "loss": 0.5393, + "step": 4708 + }, + { + "epoch": 0.7733459240859729, + "grad_norm": 0.3036228367230328, + "learning_rate": 8.533257490960768e-06, + "loss": 0.5426, + "step": 4709 + }, + { + "epoch": 0.7735101512943157, + "grad_norm": 0.2865700530075475, + "learning_rate": 8.533049484693813e-06, + "loss": 0.5117, + "step": 4710 + }, + { + "epoch": 0.7736743785026584, + "grad_norm": 0.31601286020177577, + "learning_rate": 8.532841434623974e-06, + "loss": 0.533, + "step": 4711 + }, + { + "epoch": 0.7738386057110012, + "grad_norm": 0.3364380677143077, + "learning_rate": 8.532633340753507e-06, + "loss": 0.53, + "step": 4712 + }, + { + "epoch": 0.7740028329193439, + "grad_norm": 0.35339859314853217, + "learning_rate": 8.532425203084675e-06, + "loss": 0.5104, + "step": 4713 + }, + { + "epoch": 0.7741670601276867, + "grad_norm": 0.5951590155877503, + "learning_rate": 8.532217021619738e-06, + "loss": 0.5445, + "step": 4714 + }, + { + "epoch": 0.7743312873360294, + "grad_norm": 0.29684524677217816, + "learning_rate": 8.532008796360957e-06, + "loss": 0.515, + "step": 4715 + }, + { + "epoch": 0.7744955145443722, + "grad_norm": 0.3610512910167456, + "learning_rate": 8.531800527310594e-06, + "loss": 0.4889, + "step": 4716 + }, + { + "epoch": 0.7746597417527149, + "grad_norm": 0.2866312562125442, + "learning_rate": 8.53159221447091e-06, + "loss": 0.5028, + "step": 4717 + }, + { + "epoch": 0.7748239689610577, + "grad_norm": 0.2883691284842144, + "learning_rate": 8.531383857844169e-06, + "loss": 0.5257, + "step": 4718 + }, + { + "epoch": 0.7749881961694004, + "grad_norm": 0.32116600915112437, + "learning_rate": 8.53117545743263e-06, + "loss": 0.511, + "step": 4719 + }, + { + "epoch": 0.7751524233777431, + "grad_norm": 0.36579551244838965, + "learning_rate": 8.530967013238562e-06, + "loss": 0.5279, + "step": 4720 + }, + { + "epoch": 0.7753166505860859, + "grad_norm": 0.335274958914863, + "learning_rate": 8.530758525264226e-06, + "loss": 0.5219, + "step": 4721 + }, + { + "epoch": 0.7754808777944286, + "grad_norm": 0.35908456979911946, + "learning_rate": 8.530549993511886e-06, + "loss": 0.5122, + "step": 4722 + }, + { + "epoch": 0.7756451050027713, + "grad_norm": 0.3303814149214648, + "learning_rate": 8.53034141798381e-06, + "loss": 0.5188, + "step": 4723 + }, + { + "epoch": 0.775809332211114, + "grad_norm": 0.552579852685583, + "learning_rate": 8.530132798682258e-06, + "loss": 0.5499, + "step": 4724 + }, + { + "epoch": 0.7759735594194568, + "grad_norm": 0.6038296709572685, + "learning_rate": 8.529924135609499e-06, + "loss": 0.5315, + "step": 4725 + }, + { + "epoch": 0.7761377866277995, + "grad_norm": 0.3259739996544789, + "learning_rate": 8.5297154287678e-06, + "loss": 0.4987, + "step": 4726 + }, + { + "epoch": 0.7763020138361423, + "grad_norm": 0.2899897486092834, + "learning_rate": 8.529506678159426e-06, + "loss": 0.5278, + "step": 4727 + }, + { + "epoch": 0.776466241044485, + "grad_norm": 0.2904572716926567, + "learning_rate": 8.529297883786645e-06, + "loss": 0.501, + "step": 4728 + }, + { + "epoch": 0.7766304682528278, + "grad_norm": 0.31236889089317593, + "learning_rate": 8.529089045651726e-06, + "loss": 0.4995, + "step": 4729 + }, + { + "epoch": 0.7767946954611705, + "grad_norm": 0.38956907875588986, + "learning_rate": 8.528880163756935e-06, + "loss": 0.5115, + "step": 4730 + }, + { + "epoch": 0.7769589226695133, + "grad_norm": 0.34404235566924035, + "learning_rate": 8.52867123810454e-06, + "loss": 0.5264, + "step": 4731 + }, + { + "epoch": 0.777123149877856, + "grad_norm": 0.3443712963310854, + "learning_rate": 8.528462268696812e-06, + "loss": 0.512, + "step": 4732 + }, + { + "epoch": 0.7772873770861988, + "grad_norm": 0.28851683888501506, + "learning_rate": 8.528253255536022e-06, + "loss": 0.504, + "step": 4733 + }, + { + "epoch": 0.7774516042945415, + "grad_norm": 0.3181508096343241, + "learning_rate": 8.528044198624438e-06, + "loss": 0.5244, + "step": 4734 + }, + { + "epoch": 0.7776158315028843, + "grad_norm": 0.3867330975953575, + "learning_rate": 8.527835097964331e-06, + "loss": 0.5271, + "step": 4735 + }, + { + "epoch": 0.777780058711227, + "grad_norm": 0.3066079602945166, + "learning_rate": 8.527625953557972e-06, + "loss": 0.4989, + "step": 4736 + }, + { + "epoch": 0.7779442859195698, + "grad_norm": 0.3851892733779904, + "learning_rate": 8.527416765407633e-06, + "loss": 0.5198, + "step": 4737 + }, + { + "epoch": 0.7781085131279125, + "grad_norm": 0.31990920459193956, + "learning_rate": 8.527207533515583e-06, + "loss": 0.5015, + "step": 4738 + }, + { + "epoch": 0.7782727403362553, + "grad_norm": 0.2720719422478482, + "learning_rate": 8.5269982578841e-06, + "loss": 0.4928, + "step": 4739 + }, + { + "epoch": 0.7784369675445979, + "grad_norm": 0.7093090861871565, + "learning_rate": 8.526788938515451e-06, + "loss": 0.511, + "step": 4740 + }, + { + "epoch": 0.7786011947529407, + "grad_norm": 0.28949201274533776, + "learning_rate": 8.526579575411914e-06, + "loss": 0.5134, + "step": 4741 + }, + { + "epoch": 0.7787654219612834, + "grad_norm": 0.41192958031795, + "learning_rate": 8.526370168575762e-06, + "loss": 0.5001, + "step": 4742 + }, + { + "epoch": 0.7789296491696261, + "grad_norm": 0.3955354645563975, + "learning_rate": 8.526160718009267e-06, + "loss": 0.4977, + "step": 4743 + }, + { + "epoch": 0.7790938763779689, + "grad_norm": 0.27548066782513797, + "learning_rate": 8.525951223714705e-06, + "loss": 0.5406, + "step": 4744 + }, + { + "epoch": 0.7792581035863116, + "grad_norm": 0.34792018231763794, + "learning_rate": 8.525741685694353e-06, + "loss": 0.5035, + "step": 4745 + }, + { + "epoch": 0.7794223307946544, + "grad_norm": 0.2558824257167838, + "learning_rate": 8.525532103950485e-06, + "loss": 0.5181, + "step": 4746 + }, + { + "epoch": 0.7795865580029971, + "grad_norm": 0.37147364997022847, + "learning_rate": 8.52532247848538e-06, + "loss": 0.5063, + "step": 4747 + }, + { + "epoch": 0.7797507852113399, + "grad_norm": 0.26862453917752677, + "learning_rate": 8.525112809301308e-06, + "loss": 0.5384, + "step": 4748 + }, + { + "epoch": 0.7799150124196826, + "grad_norm": 0.31769694408212135, + "learning_rate": 8.524903096400554e-06, + "loss": 0.5187, + "step": 4749 + }, + { + "epoch": 0.7800792396280254, + "grad_norm": 0.298163868790542, + "learning_rate": 8.524693339785392e-06, + "loss": 0.507, + "step": 4750 + }, + { + "epoch": 0.7802434668363681, + "grad_norm": 0.2933975935484392, + "learning_rate": 8.524483539458099e-06, + "loss": 0.5265, + "step": 4751 + }, + { + "epoch": 0.7804076940447109, + "grad_norm": 0.3179903130943498, + "learning_rate": 8.524273695420957e-06, + "loss": 0.5126, + "step": 4752 + }, + { + "epoch": 0.7805719212530536, + "grad_norm": 0.26716291057570807, + "learning_rate": 8.524063807676241e-06, + "loss": 0.5204, + "step": 4753 + }, + { + "epoch": 0.7807361484613964, + "grad_norm": 0.31106935340054614, + "learning_rate": 8.523853876226236e-06, + "loss": 0.5321, + "step": 4754 + }, + { + "epoch": 0.7809003756697391, + "grad_norm": 0.31817775903788903, + "learning_rate": 8.523643901073217e-06, + "loss": 0.5051, + "step": 4755 + }, + { + "epoch": 0.7810646028780819, + "grad_norm": 0.3039687088510633, + "learning_rate": 8.523433882219467e-06, + "loss": 0.4994, + "step": 4756 + }, + { + "epoch": 0.7812288300864245, + "grad_norm": 0.2839010503466107, + "learning_rate": 8.523223819667267e-06, + "loss": 0.5216, + "step": 4757 + }, + { + "epoch": 0.7813930572947673, + "grad_norm": 0.4120389386069269, + "learning_rate": 8.523013713418897e-06, + "loss": 0.481, + "step": 4758 + }, + { + "epoch": 0.78155728450311, + "grad_norm": 0.3062612421267718, + "learning_rate": 8.522803563476641e-06, + "loss": 0.4983, + "step": 4759 + }, + { + "epoch": 0.7817215117114528, + "grad_norm": 0.28296796391139384, + "learning_rate": 8.52259336984278e-06, + "loss": 0.5013, + "step": 4760 + }, + { + "epoch": 0.7818857389197955, + "grad_norm": 0.3046360962097441, + "learning_rate": 8.522383132519597e-06, + "loss": 0.53, + "step": 4761 + }, + { + "epoch": 0.7820499661281383, + "grad_norm": 0.3142566255762791, + "learning_rate": 8.522172851509375e-06, + "loss": 0.5181, + "step": 4762 + }, + { + "epoch": 0.782214193336481, + "grad_norm": 0.33770779393011047, + "learning_rate": 8.5219625268144e-06, + "loss": 0.5003, + "step": 4763 + }, + { + "epoch": 0.7823784205448238, + "grad_norm": 0.35718367865642603, + "learning_rate": 8.521752158436954e-06, + "loss": 0.4931, + "step": 4764 + }, + { + "epoch": 0.7825426477531665, + "grad_norm": 0.3941435431475404, + "learning_rate": 8.521541746379323e-06, + "loss": 0.5216, + "step": 4765 + }, + { + "epoch": 0.7827068749615093, + "grad_norm": 0.2865143701939515, + "learning_rate": 8.521331290643791e-06, + "loss": 0.5063, + "step": 4766 + }, + { + "epoch": 0.782871102169852, + "grad_norm": 0.28411967137007504, + "learning_rate": 8.521120791232646e-06, + "loss": 0.5222, + "step": 4767 + }, + { + "epoch": 0.7830353293781948, + "grad_norm": 0.3053542974959439, + "learning_rate": 8.520910248148174e-06, + "loss": 0.5365, + "step": 4768 + }, + { + "epoch": 0.7831995565865375, + "grad_norm": 0.3912844265585773, + "learning_rate": 8.52069966139266e-06, + "loss": 0.5257, + "step": 4769 + }, + { + "epoch": 0.7833637837948803, + "grad_norm": 0.34825980844701676, + "learning_rate": 8.52048903096839e-06, + "loss": 0.5452, + "step": 4770 + }, + { + "epoch": 0.783528011003223, + "grad_norm": 0.2853114021603045, + "learning_rate": 8.520278356877654e-06, + "loss": 0.5073, + "step": 4771 + }, + { + "epoch": 0.7836922382115658, + "grad_norm": 0.3703869120538064, + "learning_rate": 8.52006763912274e-06, + "loss": 0.4969, + "step": 4772 + }, + { + "epoch": 0.7838564654199084, + "grad_norm": 0.27611234565761733, + "learning_rate": 8.519856877705937e-06, + "loss": 0.5344, + "step": 4773 + }, + { + "epoch": 0.7840206926282511, + "grad_norm": 0.28744438714816317, + "learning_rate": 8.519646072629533e-06, + "loss": 0.4983, + "step": 4774 + }, + { + "epoch": 0.7841849198365939, + "grad_norm": 0.3552014651411184, + "learning_rate": 8.519435223895817e-06, + "loss": 0.5018, + "step": 4775 + }, + { + "epoch": 0.7843491470449366, + "grad_norm": 0.2757339930350272, + "learning_rate": 8.519224331507081e-06, + "loss": 0.5222, + "step": 4776 + }, + { + "epoch": 0.7845133742532794, + "grad_norm": 0.3417843165815299, + "learning_rate": 8.519013395465614e-06, + "loss": 0.5093, + "step": 4777 + }, + { + "epoch": 0.7846776014616221, + "grad_norm": 0.3300629702593418, + "learning_rate": 8.518802415773707e-06, + "loss": 0.5405, + "step": 4778 + }, + { + "epoch": 0.7848418286699649, + "grad_norm": 0.3143676247665393, + "learning_rate": 8.518591392433653e-06, + "loss": 0.5189, + "step": 4779 + }, + { + "epoch": 0.7850060558783076, + "grad_norm": 0.33319228445074417, + "learning_rate": 8.518380325447741e-06, + "loss": 0.5412, + "step": 4780 + }, + { + "epoch": 0.7851702830866504, + "grad_norm": 0.5097647567354313, + "learning_rate": 8.518169214818265e-06, + "loss": 0.4978, + "step": 4781 + }, + { + "epoch": 0.7853345102949931, + "grad_norm": 0.31219081932027304, + "learning_rate": 8.51795806054752e-06, + "loss": 0.5069, + "step": 4782 + }, + { + "epoch": 0.7854987375033359, + "grad_norm": 0.3588349702401945, + "learning_rate": 8.517746862637797e-06, + "loss": 0.5079, + "step": 4783 + }, + { + "epoch": 0.7856629647116786, + "grad_norm": 0.3102582288830573, + "learning_rate": 8.517535621091388e-06, + "loss": 0.5205, + "step": 4784 + }, + { + "epoch": 0.7858271919200214, + "grad_norm": 0.2918198592895266, + "learning_rate": 8.517324335910591e-06, + "loss": 0.5156, + "step": 4785 + }, + { + "epoch": 0.7859914191283641, + "grad_norm": 0.33430126386274583, + "learning_rate": 8.5171130070977e-06, + "loss": 0.5051, + "step": 4786 + }, + { + "epoch": 0.7861556463367069, + "grad_norm": 0.3254535703108297, + "learning_rate": 8.516901634655008e-06, + "loss": 0.5253, + "step": 4787 + }, + { + "epoch": 0.7863198735450496, + "grad_norm": 0.3484996970943442, + "learning_rate": 8.516690218584811e-06, + "loss": 0.5093, + "step": 4788 + }, + { + "epoch": 0.7864841007533924, + "grad_norm": 0.263519147067759, + "learning_rate": 8.51647875888941e-06, + "loss": 0.5106, + "step": 4789 + }, + { + "epoch": 0.786648327961735, + "grad_norm": 0.45546638774831777, + "learning_rate": 8.516267255571094e-06, + "loss": 0.5176, + "step": 4790 + }, + { + "epoch": 0.7868125551700778, + "grad_norm": 0.3449889074977041, + "learning_rate": 8.516055708632166e-06, + "loss": 0.5301, + "step": 4791 + }, + { + "epoch": 0.7869767823784205, + "grad_norm": 0.3698943802374998, + "learning_rate": 8.515844118074923e-06, + "loss": 0.5229, + "step": 4792 + }, + { + "epoch": 0.7871410095867633, + "grad_norm": 0.33462868726553185, + "learning_rate": 8.51563248390166e-06, + "loss": 0.5161, + "step": 4793 + }, + { + "epoch": 0.787305236795106, + "grad_norm": 0.3123568050312096, + "learning_rate": 8.515420806114677e-06, + "loss": 0.5109, + "step": 4794 + }, + { + "epoch": 0.7874694640034487, + "grad_norm": 0.34422885124330654, + "learning_rate": 8.515209084716275e-06, + "loss": 0.4919, + "step": 4795 + }, + { + "epoch": 0.7876336912117915, + "grad_norm": 0.3889682489845154, + "learning_rate": 8.514997319708751e-06, + "loss": 0.4851, + "step": 4796 + }, + { + "epoch": 0.7877979184201342, + "grad_norm": 0.36985116050350514, + "learning_rate": 8.514785511094408e-06, + "loss": 0.5292, + "step": 4797 + }, + { + "epoch": 0.787962145628477, + "grad_norm": 0.32305288714276936, + "learning_rate": 8.514573658875541e-06, + "loss": 0.4837, + "step": 4798 + }, + { + "epoch": 0.7881263728368197, + "grad_norm": 0.41138982582893036, + "learning_rate": 8.514361763054456e-06, + "loss": 0.5034, + "step": 4799 + }, + { + "epoch": 0.7882906000451625, + "grad_norm": 0.35851481494533577, + "learning_rate": 8.514149823633453e-06, + "loss": 0.53, + "step": 4800 + }, + { + "epoch": 0.7884548272535052, + "grad_norm": 0.3239257397621457, + "learning_rate": 8.513937840614832e-06, + "loss": 0.5206, + "step": 4801 + }, + { + "epoch": 0.788619054461848, + "grad_norm": 0.49352899282790785, + "learning_rate": 8.513725814000898e-06, + "loss": 0.5201, + "step": 4802 + }, + { + "epoch": 0.7887832816701907, + "grad_norm": 0.29476675219482074, + "learning_rate": 8.513513743793954e-06, + "loss": 0.4968, + "step": 4803 + }, + { + "epoch": 0.7889475088785335, + "grad_norm": 0.3763905524883607, + "learning_rate": 8.513301629996299e-06, + "loss": 0.4996, + "step": 4804 + }, + { + "epoch": 0.7891117360868762, + "grad_norm": 0.5222654404707635, + "learning_rate": 8.513089472610242e-06, + "loss": 0.5233, + "step": 4805 + }, + { + "epoch": 0.789275963295219, + "grad_norm": 0.33072041061279667, + "learning_rate": 8.512877271638084e-06, + "loss": 0.4858, + "step": 4806 + }, + { + "epoch": 0.7894401905035616, + "grad_norm": 0.28791095300849756, + "learning_rate": 8.51266502708213e-06, + "loss": 0.5171, + "step": 4807 + }, + { + "epoch": 0.7896044177119044, + "grad_norm": 0.29139790932119547, + "learning_rate": 8.512452738944686e-06, + "loss": 0.5295, + "step": 4808 + }, + { + "epoch": 0.7897686449202471, + "grad_norm": 0.2964080244353324, + "learning_rate": 8.51224040722806e-06, + "loss": 0.4971, + "step": 4809 + }, + { + "epoch": 0.7899328721285899, + "grad_norm": 0.32197530233101723, + "learning_rate": 8.512028031934554e-06, + "loss": 0.4932, + "step": 4810 + }, + { + "epoch": 0.7900970993369326, + "grad_norm": 0.2843080931858888, + "learning_rate": 8.511815613066475e-06, + "loss": 0.5005, + "step": 4811 + }, + { + "epoch": 0.7902613265452754, + "grad_norm": 0.535477832250208, + "learning_rate": 8.511603150626132e-06, + "loss": 0.5186, + "step": 4812 + }, + { + "epoch": 0.7904255537536181, + "grad_norm": 0.29692623047749966, + "learning_rate": 8.511390644615833e-06, + "loss": 0.5124, + "step": 4813 + }, + { + "epoch": 0.7905897809619609, + "grad_norm": 0.8690043401486638, + "learning_rate": 8.511178095037885e-06, + "loss": 0.5288, + "step": 4814 + }, + { + "epoch": 0.7907540081703036, + "grad_norm": 0.3356252094797372, + "learning_rate": 8.510965501894595e-06, + "loss": 0.5202, + "step": 4815 + }, + { + "epoch": 0.7909182353786464, + "grad_norm": 0.2574155639032546, + "learning_rate": 8.510752865188275e-06, + "loss": 0.4958, + "step": 4816 + }, + { + "epoch": 0.7910824625869891, + "grad_norm": 0.26080904931135435, + "learning_rate": 8.510540184921233e-06, + "loss": 0.4926, + "step": 4817 + }, + { + "epoch": 0.7912466897953319, + "grad_norm": 0.3428360730324517, + "learning_rate": 8.510327461095777e-06, + "loss": 0.5051, + "step": 4818 + }, + { + "epoch": 0.7914109170036746, + "grad_norm": 0.4252710145129881, + "learning_rate": 8.510114693714219e-06, + "loss": 0.5369, + "step": 4819 + }, + { + "epoch": 0.7915751442120174, + "grad_norm": 0.29726020666126923, + "learning_rate": 8.509901882778872e-06, + "loss": 0.4963, + "step": 4820 + }, + { + "epoch": 0.7917393714203601, + "grad_norm": 0.34172269458973087, + "learning_rate": 8.509689028292045e-06, + "loss": 0.539, + "step": 4821 + }, + { + "epoch": 0.7919035986287029, + "grad_norm": 0.30840633192280165, + "learning_rate": 8.509476130256049e-06, + "loss": 0.4956, + "step": 4822 + }, + { + "epoch": 0.7920678258370456, + "grad_norm": 0.27131424112280017, + "learning_rate": 8.509263188673198e-06, + "loss": 0.5096, + "step": 4823 + }, + { + "epoch": 0.7922320530453882, + "grad_norm": 0.3743847578318213, + "learning_rate": 8.509050203545803e-06, + "loss": 0.5065, + "step": 4824 + }, + { + "epoch": 0.792396280253731, + "grad_norm": 0.3795053796958781, + "learning_rate": 8.50883717487618e-06, + "loss": 0.5179, + "step": 4825 + }, + { + "epoch": 0.7925605074620737, + "grad_norm": 0.30026452426018335, + "learning_rate": 8.508624102666642e-06, + "loss": 0.5137, + "step": 4826 + }, + { + "epoch": 0.7927247346704165, + "grad_norm": 0.31326093789198417, + "learning_rate": 8.508410986919499e-06, + "loss": 0.5146, + "step": 4827 + }, + { + "epoch": 0.7928889618787592, + "grad_norm": 0.3620854669484693, + "learning_rate": 8.508197827637073e-06, + "loss": 0.5227, + "step": 4828 + }, + { + "epoch": 0.793053189087102, + "grad_norm": 0.3014049980906719, + "learning_rate": 8.507984624821672e-06, + "loss": 0.5094, + "step": 4829 + }, + { + "epoch": 0.7932174162954447, + "grad_norm": 0.3847631333967564, + "learning_rate": 8.507771378475614e-06, + "loss": 0.5218, + "step": 4830 + }, + { + "epoch": 0.7933816435037875, + "grad_norm": 0.2873710411856193, + "learning_rate": 8.507558088601218e-06, + "loss": 0.4793, + "step": 4831 + }, + { + "epoch": 0.7935458707121302, + "grad_norm": 0.2906589330262389, + "learning_rate": 8.507344755200797e-06, + "loss": 0.5164, + "step": 4832 + }, + { + "epoch": 0.793710097920473, + "grad_norm": 0.2843661884302468, + "learning_rate": 8.507131378276671e-06, + "loss": 0.5149, + "step": 4833 + }, + { + "epoch": 0.7938743251288157, + "grad_norm": 0.3119387145742007, + "learning_rate": 8.506917957831153e-06, + "loss": 0.4946, + "step": 4834 + }, + { + "epoch": 0.7940385523371585, + "grad_norm": 1.3155836455625542, + "learning_rate": 8.506704493866567e-06, + "loss": 0.5234, + "step": 4835 + }, + { + "epoch": 0.7942027795455012, + "grad_norm": 0.33312679287940616, + "learning_rate": 8.506490986385225e-06, + "loss": 0.503, + "step": 4836 + }, + { + "epoch": 0.794367006753844, + "grad_norm": 0.2994563085360433, + "learning_rate": 8.506277435389452e-06, + "loss": 0.5368, + "step": 4837 + }, + { + "epoch": 0.7945312339621867, + "grad_norm": 0.37734204973357055, + "learning_rate": 8.506063840881562e-06, + "loss": 0.5047, + "step": 4838 + }, + { + "epoch": 0.7946954611705295, + "grad_norm": 0.3317028490234112, + "learning_rate": 8.505850202863878e-06, + "loss": 0.5231, + "step": 4839 + }, + { + "epoch": 0.7948596883788722, + "grad_norm": 0.3180236812745663, + "learning_rate": 8.50563652133872e-06, + "loss": 0.498, + "step": 4840 + }, + { + "epoch": 0.7950239155872149, + "grad_norm": 0.31286873557820605, + "learning_rate": 8.505422796308408e-06, + "loss": 0.5154, + "step": 4841 + }, + { + "epoch": 0.7951881427955576, + "grad_norm": 0.3322648618516919, + "learning_rate": 8.505209027775263e-06, + "loss": 0.5144, + "step": 4842 + }, + { + "epoch": 0.7953523700039004, + "grad_norm": 0.3246666933401929, + "learning_rate": 8.504995215741608e-06, + "loss": 0.509, + "step": 4843 + }, + { + "epoch": 0.7955165972122431, + "grad_norm": 0.29849518101027295, + "learning_rate": 8.504781360209765e-06, + "loss": 0.5007, + "step": 4844 + }, + { + "epoch": 0.7956808244205859, + "grad_norm": 0.5686947232248678, + "learning_rate": 8.504567461182056e-06, + "loss": 0.5184, + "step": 4845 + }, + { + "epoch": 0.7958450516289286, + "grad_norm": 0.3001488106983265, + "learning_rate": 8.504353518660804e-06, + "loss": 0.5114, + "step": 4846 + }, + { + "epoch": 0.7960092788372714, + "grad_norm": 0.2899684109515469, + "learning_rate": 8.504139532648333e-06, + "loss": 0.5319, + "step": 4847 + }, + { + "epoch": 0.7961735060456141, + "grad_norm": 0.34811202789673956, + "learning_rate": 8.503925503146968e-06, + "loss": 0.5048, + "step": 4848 + }, + { + "epoch": 0.7963377332539568, + "grad_norm": 0.28213312593019935, + "learning_rate": 8.503711430159031e-06, + "loss": 0.5225, + "step": 4849 + }, + { + "epoch": 0.7965019604622996, + "grad_norm": 0.31306875930476336, + "learning_rate": 8.50349731368685e-06, + "loss": 0.5086, + "step": 4850 + }, + { + "epoch": 0.7966661876706423, + "grad_norm": 0.2767900885059024, + "learning_rate": 8.50328315373275e-06, + "loss": 0.5246, + "step": 4851 + }, + { + "epoch": 0.7968304148789851, + "grad_norm": 0.3163293048450247, + "learning_rate": 8.503068950299054e-06, + "loss": 0.5234, + "step": 4852 + }, + { + "epoch": 0.7969946420873278, + "grad_norm": 0.337058184039474, + "learning_rate": 8.502854703388094e-06, + "loss": 0.5123, + "step": 4853 + }, + { + "epoch": 0.7971588692956706, + "grad_norm": 0.5143054872690204, + "learning_rate": 8.502640413002191e-06, + "loss": 0.5047, + "step": 4854 + }, + { + "epoch": 0.7973230965040133, + "grad_norm": 0.4630950984648018, + "learning_rate": 8.502426079143675e-06, + "loss": 0.5085, + "step": 4855 + }, + { + "epoch": 0.7974873237123561, + "grad_norm": 0.329732576829339, + "learning_rate": 8.502211701814876e-06, + "loss": 0.5182, + "step": 4856 + }, + { + "epoch": 0.7976515509206988, + "grad_norm": 0.3156590174293186, + "learning_rate": 8.501997281018118e-06, + "loss": 0.5087, + "step": 4857 + }, + { + "epoch": 0.7978157781290415, + "grad_norm": 0.3275346397983897, + "learning_rate": 8.501782816755732e-06, + "loss": 0.5282, + "step": 4858 + }, + { + "epoch": 0.7979800053373842, + "grad_norm": 0.3255908453673845, + "learning_rate": 8.50156830903005e-06, + "loss": 0.534, + "step": 4859 + }, + { + "epoch": 0.798144232545727, + "grad_norm": 0.41658664353399205, + "learning_rate": 8.501353757843398e-06, + "loss": 0.5289, + "step": 4860 + }, + { + "epoch": 0.7983084597540697, + "grad_norm": 0.33432186201094094, + "learning_rate": 8.501139163198106e-06, + "loss": 0.4954, + "step": 4861 + }, + { + "epoch": 0.7984726869624125, + "grad_norm": 0.3844040400667963, + "learning_rate": 8.500924525096508e-06, + "loss": 0.5358, + "step": 4862 + }, + { + "epoch": 0.7986369141707552, + "grad_norm": 0.28735140226035877, + "learning_rate": 8.500709843540931e-06, + "loss": 0.5387, + "step": 4863 + }, + { + "epoch": 0.798801141379098, + "grad_norm": 0.4156195104655946, + "learning_rate": 8.500495118533712e-06, + "loss": 0.5146, + "step": 4864 + }, + { + "epoch": 0.7989653685874407, + "grad_norm": 0.4438432023377831, + "learning_rate": 8.500280350077176e-06, + "loss": 0.5063, + "step": 4865 + }, + { + "epoch": 0.7991295957957835, + "grad_norm": 0.313254971038212, + "learning_rate": 8.500065538173663e-06, + "loss": 0.5168, + "step": 4866 + }, + { + "epoch": 0.7992938230041262, + "grad_norm": 0.31318752813737577, + "learning_rate": 8.499850682825502e-06, + "loss": 0.5017, + "step": 4867 + }, + { + "epoch": 0.799458050212469, + "grad_norm": 0.29010297933150886, + "learning_rate": 8.499635784035026e-06, + "loss": 0.5076, + "step": 4868 + }, + { + "epoch": 0.7996222774208117, + "grad_norm": 0.34097058759666493, + "learning_rate": 8.499420841804572e-06, + "loss": 0.5075, + "step": 4869 + }, + { + "epoch": 0.7997865046291545, + "grad_norm": 0.34264820615195757, + "learning_rate": 8.499205856136473e-06, + "loss": 0.5353, + "step": 4870 + }, + { + "epoch": 0.7999507318374972, + "grad_norm": 0.31214009809327814, + "learning_rate": 8.498990827033062e-06, + "loss": 0.5176, + "step": 4871 + }, + { + "epoch": 0.80011495904584, + "grad_norm": 0.2919647666804401, + "learning_rate": 8.498775754496677e-06, + "loss": 0.5113, + "step": 4872 + }, + { + "epoch": 0.8002791862541827, + "grad_norm": 0.29216191333093305, + "learning_rate": 8.498560638529654e-06, + "loss": 0.4954, + "step": 4873 + }, + { + "epoch": 0.8004434134625255, + "grad_norm": 0.3648942224661276, + "learning_rate": 8.498345479134327e-06, + "loss": 0.5205, + "step": 4874 + }, + { + "epoch": 0.8006076406708681, + "grad_norm": 0.30774809099478256, + "learning_rate": 8.498130276313035e-06, + "loss": 0.5478, + "step": 4875 + }, + { + "epoch": 0.8007718678792108, + "grad_norm": 0.3454301234007663, + "learning_rate": 8.497915030068113e-06, + "loss": 0.4924, + "step": 4876 + }, + { + "epoch": 0.8009360950875536, + "grad_norm": 0.3290857865234024, + "learning_rate": 8.497699740401901e-06, + "loss": 0.5202, + "step": 4877 + }, + { + "epoch": 0.8011003222958963, + "grad_norm": 0.3546434922240655, + "learning_rate": 8.497484407316737e-06, + "loss": 0.4984, + "step": 4878 + }, + { + "epoch": 0.8012645495042391, + "grad_norm": 0.33323543843021725, + "learning_rate": 8.49726903081496e-06, + "loss": 0.5027, + "step": 4879 + }, + { + "epoch": 0.8014287767125818, + "grad_norm": 0.26386927946032285, + "learning_rate": 8.497053610898908e-06, + "loss": 0.5337, + "step": 4880 + }, + { + "epoch": 0.8015930039209246, + "grad_norm": 0.40317599119391767, + "learning_rate": 8.49683814757092e-06, + "loss": 0.5328, + "step": 4881 + }, + { + "epoch": 0.8017572311292673, + "grad_norm": 0.29720970041618555, + "learning_rate": 8.49662264083334e-06, + "loss": 0.5172, + "step": 4882 + }, + { + "epoch": 0.8019214583376101, + "grad_norm": 0.30483337934954274, + "learning_rate": 8.496407090688505e-06, + "loss": 0.5347, + "step": 4883 + }, + { + "epoch": 0.8020856855459528, + "grad_norm": 0.3390354026182259, + "learning_rate": 8.496191497138757e-06, + "loss": 0.5032, + "step": 4884 + }, + { + "epoch": 0.8022499127542956, + "grad_norm": 0.46548930989663456, + "learning_rate": 8.495975860186437e-06, + "loss": 0.5125, + "step": 4885 + }, + { + "epoch": 0.8024141399626383, + "grad_norm": 0.3909156993177809, + "learning_rate": 8.495760179833888e-06, + "loss": 0.5321, + "step": 4886 + }, + { + "epoch": 0.8025783671709811, + "grad_norm": 0.30815593000960556, + "learning_rate": 8.495544456083453e-06, + "loss": 0.5152, + "step": 4887 + }, + { + "epoch": 0.8027425943793238, + "grad_norm": 0.43576154208490675, + "learning_rate": 8.495328688937473e-06, + "loss": 0.5118, + "step": 4888 + }, + { + "epoch": 0.8029068215876666, + "grad_norm": 0.3362982778923814, + "learning_rate": 8.495112878398292e-06, + "loss": 0.5199, + "step": 4889 + }, + { + "epoch": 0.8030710487960093, + "grad_norm": 0.35648097356314234, + "learning_rate": 8.494897024468255e-06, + "loss": 0.5203, + "step": 4890 + }, + { + "epoch": 0.8032352760043521, + "grad_norm": 0.40099724052233265, + "learning_rate": 8.494681127149706e-06, + "loss": 0.5142, + "step": 4891 + }, + { + "epoch": 0.8033995032126947, + "grad_norm": 0.38363888188790995, + "learning_rate": 8.49446518644499e-06, + "loss": 0.5151, + "step": 4892 + }, + { + "epoch": 0.8035637304210375, + "grad_norm": 0.32707620079419586, + "learning_rate": 8.494249202356452e-06, + "loss": 0.5143, + "step": 4893 + }, + { + "epoch": 0.8037279576293802, + "grad_norm": 0.28527511913999304, + "learning_rate": 8.494033174886438e-06, + "loss": 0.5169, + "step": 4894 + }, + { + "epoch": 0.803892184837723, + "grad_norm": 0.2832345525798682, + "learning_rate": 8.493817104037294e-06, + "loss": 0.5078, + "step": 4895 + }, + { + "epoch": 0.8040564120460657, + "grad_norm": 1.4074803625802637, + "learning_rate": 8.493600989811366e-06, + "loss": 0.5151, + "step": 4896 + }, + { + "epoch": 0.8042206392544085, + "grad_norm": 0.31620078421037934, + "learning_rate": 8.493384832211003e-06, + "loss": 0.5338, + "step": 4897 + }, + { + "epoch": 0.8043848664627512, + "grad_norm": 0.36798937850075064, + "learning_rate": 8.49316863123855e-06, + "loss": 0.5047, + "step": 4898 + }, + { + "epoch": 0.804549093671094, + "grad_norm": 0.3155938826874778, + "learning_rate": 8.49295238689636e-06, + "loss": 0.5316, + "step": 4899 + }, + { + "epoch": 0.8047133208794367, + "grad_norm": 0.3090401650499103, + "learning_rate": 8.492736099186776e-06, + "loss": 0.4977, + "step": 4900 + }, + { + "epoch": 0.8048775480877794, + "grad_norm": 0.4837767806205548, + "learning_rate": 8.492519768112152e-06, + "loss": 0.5188, + "step": 4901 + }, + { + "epoch": 0.8050417752961222, + "grad_norm": 0.2831082962206799, + "learning_rate": 8.492303393674834e-06, + "loss": 0.5166, + "step": 4902 + }, + { + "epoch": 0.805206002504465, + "grad_norm": 0.3364271799232852, + "learning_rate": 8.492086975877173e-06, + "loss": 0.5032, + "step": 4903 + }, + { + "epoch": 0.8053702297128077, + "grad_norm": 0.3039287965588309, + "learning_rate": 8.49187051472152e-06, + "loss": 0.4997, + "step": 4904 + }, + { + "epoch": 0.8055344569211504, + "grad_norm": 0.34438901266088634, + "learning_rate": 8.491654010210224e-06, + "loss": 0.5224, + "step": 4905 + }, + { + "epoch": 0.8056986841294932, + "grad_norm": 0.410477956559512, + "learning_rate": 8.49143746234564e-06, + "loss": 0.521, + "step": 4906 + }, + { + "epoch": 0.8058629113378359, + "grad_norm": 0.33178147394701, + "learning_rate": 8.491220871130119e-06, + "loss": 0.5141, + "step": 4907 + }, + { + "epoch": 0.8060271385461787, + "grad_norm": 0.31226511617932345, + "learning_rate": 8.49100423656601e-06, + "loss": 0.5098, + "step": 4908 + }, + { + "epoch": 0.8061913657545213, + "grad_norm": 0.3314940198037555, + "learning_rate": 8.49078755865567e-06, + "loss": 0.4966, + "step": 4909 + }, + { + "epoch": 0.8063555929628641, + "grad_norm": 0.4213880727332896, + "learning_rate": 8.490570837401452e-06, + "loss": 0.5326, + "step": 4910 + }, + { + "epoch": 0.8065198201712068, + "grad_norm": 0.25770417899130643, + "learning_rate": 8.490354072805707e-06, + "loss": 0.5258, + "step": 4911 + }, + { + "epoch": 0.8066840473795496, + "grad_norm": 0.29938701046613064, + "learning_rate": 8.49013726487079e-06, + "loss": 0.5288, + "step": 4912 + }, + { + "epoch": 0.8068482745878923, + "grad_norm": 0.3284235847108112, + "learning_rate": 8.489920413599059e-06, + "loss": 0.5331, + "step": 4913 + }, + { + "epoch": 0.8070125017962351, + "grad_norm": 0.3092253263939411, + "learning_rate": 8.489703518992865e-06, + "loss": 0.5127, + "step": 4914 + }, + { + "epoch": 0.8071767290045778, + "grad_norm": 0.31053867213043934, + "learning_rate": 8.489486581054565e-06, + "loss": 0.5181, + "step": 4915 + }, + { + "epoch": 0.8073409562129206, + "grad_norm": 0.2781424039531393, + "learning_rate": 8.489269599786516e-06, + "loss": 0.4907, + "step": 4916 + }, + { + "epoch": 0.8075051834212633, + "grad_norm": 0.29524804072816085, + "learning_rate": 8.489052575191074e-06, + "loss": 0.5155, + "step": 4917 + }, + { + "epoch": 0.8076694106296061, + "grad_norm": 0.4577844332755319, + "learning_rate": 8.488835507270597e-06, + "loss": 0.5027, + "step": 4918 + }, + { + "epoch": 0.8078336378379488, + "grad_norm": 0.30584168281729424, + "learning_rate": 8.488618396027442e-06, + "loss": 0.521, + "step": 4919 + }, + { + "epoch": 0.8079978650462916, + "grad_norm": 0.3880806237530359, + "learning_rate": 8.488401241463966e-06, + "loss": 0.5352, + "step": 4920 + }, + { + "epoch": 0.8081620922546343, + "grad_norm": 0.2955963334280968, + "learning_rate": 8.48818404358253e-06, + "loss": 0.5111, + "step": 4921 + }, + { + "epoch": 0.8083263194629771, + "grad_norm": 0.3345446561322464, + "learning_rate": 8.48796680238549e-06, + "loss": 0.522, + "step": 4922 + }, + { + "epoch": 0.8084905466713198, + "grad_norm": 0.6481791245070249, + "learning_rate": 8.487749517875208e-06, + "loss": 0.5065, + "step": 4923 + }, + { + "epoch": 0.8086547738796626, + "grad_norm": 0.30909560781645284, + "learning_rate": 8.487532190054043e-06, + "loss": 0.5039, + "step": 4924 + }, + { + "epoch": 0.8088190010880053, + "grad_norm": 0.34142963673856785, + "learning_rate": 8.487314818924353e-06, + "loss": 0.5092, + "step": 4925 + }, + { + "epoch": 0.808983228296348, + "grad_norm": 0.4889539171715669, + "learning_rate": 8.487097404488502e-06, + "loss": 0.502, + "step": 4926 + }, + { + "epoch": 0.8091474555046907, + "grad_norm": 0.5818910886026137, + "learning_rate": 8.486879946748852e-06, + "loss": 0.4925, + "step": 4927 + }, + { + "epoch": 0.8093116827130334, + "grad_norm": 0.2932001298911658, + "learning_rate": 8.486662445707762e-06, + "loss": 0.5166, + "step": 4928 + }, + { + "epoch": 0.8094759099213762, + "grad_norm": 0.29941350737145134, + "learning_rate": 8.486444901367594e-06, + "loss": 0.5076, + "step": 4929 + }, + { + "epoch": 0.8096401371297189, + "grad_norm": 0.36587344576528646, + "learning_rate": 8.486227313730716e-06, + "loss": 0.4944, + "step": 4930 + }, + { + "epoch": 0.8098043643380617, + "grad_norm": 0.3008403149194141, + "learning_rate": 8.486009682799484e-06, + "loss": 0.5398, + "step": 4931 + }, + { + "epoch": 0.8099685915464044, + "grad_norm": 0.32000631815715636, + "learning_rate": 8.485792008576269e-06, + "loss": 0.5143, + "step": 4932 + }, + { + "epoch": 0.8101328187547472, + "grad_norm": 0.4554555718255998, + "learning_rate": 8.485574291063427e-06, + "loss": 0.5106, + "step": 4933 + }, + { + "epoch": 0.8102970459630899, + "grad_norm": 0.3396815075083648, + "learning_rate": 8.48535653026333e-06, + "loss": 0.5248, + "step": 4934 + }, + { + "epoch": 0.8104612731714327, + "grad_norm": 0.29132372652667043, + "learning_rate": 8.485138726178337e-06, + "loss": 0.5149, + "step": 4935 + }, + { + "epoch": 0.8106255003797754, + "grad_norm": 0.33433360634314313, + "learning_rate": 8.484920878810818e-06, + "loss": 0.5275, + "step": 4936 + }, + { + "epoch": 0.8107897275881182, + "grad_norm": 0.3804802811332157, + "learning_rate": 8.484702988163138e-06, + "loss": 0.5372, + "step": 4937 + }, + { + "epoch": 0.8109539547964609, + "grad_norm": 0.29590088600146536, + "learning_rate": 8.484485054237663e-06, + "loss": 0.5194, + "step": 4938 + }, + { + "epoch": 0.8111181820048037, + "grad_norm": 0.30898279514885435, + "learning_rate": 8.484267077036761e-06, + "loss": 0.5067, + "step": 4939 + }, + { + "epoch": 0.8112824092131464, + "grad_norm": 0.3169244925099661, + "learning_rate": 8.484049056562796e-06, + "loss": 0.5271, + "step": 4940 + }, + { + "epoch": 0.8114466364214892, + "grad_norm": 0.2927706493730989, + "learning_rate": 8.483830992818141e-06, + "loss": 0.5497, + "step": 4941 + }, + { + "epoch": 0.8116108636298319, + "grad_norm": 0.3785507710524582, + "learning_rate": 8.483612885805161e-06, + "loss": 0.495, + "step": 4942 + }, + { + "epoch": 0.8117750908381746, + "grad_norm": 0.2886097711844155, + "learning_rate": 8.483394735526226e-06, + "loss": 0.5161, + "step": 4943 + }, + { + "epoch": 0.8119393180465173, + "grad_norm": 0.3721913462493364, + "learning_rate": 8.483176541983706e-06, + "loss": 0.5196, + "step": 4944 + }, + { + "epoch": 0.8121035452548601, + "grad_norm": 0.30708790688915927, + "learning_rate": 8.482958305179967e-06, + "loss": 0.516, + "step": 4945 + }, + { + "epoch": 0.8122677724632028, + "grad_norm": 0.3761360983371278, + "learning_rate": 8.482740025117385e-06, + "loss": 0.5023, + "step": 4946 + }, + { + "epoch": 0.8124319996715456, + "grad_norm": 0.3213489583168228, + "learning_rate": 8.482521701798326e-06, + "loss": 0.5272, + "step": 4947 + }, + { + "epoch": 0.8125962268798883, + "grad_norm": 0.3341586201507787, + "learning_rate": 8.482303335225164e-06, + "loss": 0.5461, + "step": 4948 + }, + { + "epoch": 0.812760454088231, + "grad_norm": 0.29858786103982876, + "learning_rate": 8.48208492540027e-06, + "loss": 0.5105, + "step": 4949 + }, + { + "epoch": 0.8129246812965738, + "grad_norm": 0.32960934334045244, + "learning_rate": 8.481866472326015e-06, + "loss": 0.4996, + "step": 4950 + }, + { + "epoch": 0.8130889085049166, + "grad_norm": 0.2762505138441524, + "learning_rate": 8.481647976004773e-06, + "loss": 0.4963, + "step": 4951 + }, + { + "epoch": 0.8132531357132593, + "grad_norm": 0.3157606414855119, + "learning_rate": 8.481429436438916e-06, + "loss": 0.5259, + "step": 4952 + }, + { + "epoch": 0.813417362921602, + "grad_norm": 0.31727752076093524, + "learning_rate": 8.481210853630819e-06, + "loss": 0.4996, + "step": 4953 + }, + { + "epoch": 0.8135815901299448, + "grad_norm": 0.3480649578971305, + "learning_rate": 8.480992227582854e-06, + "loss": 0.5198, + "step": 4954 + }, + { + "epoch": 0.8137458173382875, + "grad_norm": 0.6437859463406534, + "learning_rate": 8.480773558297396e-06, + "loss": 0.5263, + "step": 4955 + }, + { + "epoch": 0.8139100445466303, + "grad_norm": 0.5095893397019333, + "learning_rate": 8.480554845776823e-06, + "loss": 0.5211, + "step": 4956 + }, + { + "epoch": 0.814074271754973, + "grad_norm": 0.3655560668561918, + "learning_rate": 8.480336090023506e-06, + "loss": 0.4943, + "step": 4957 + }, + { + "epoch": 0.8142384989633158, + "grad_norm": 0.29143515911184514, + "learning_rate": 8.480117291039825e-06, + "loss": 0.5029, + "step": 4958 + }, + { + "epoch": 0.8144027261716585, + "grad_norm": 0.3366487975682201, + "learning_rate": 8.479898448828154e-06, + "loss": 0.5425, + "step": 4959 + }, + { + "epoch": 0.8145669533800012, + "grad_norm": 0.3642127674595192, + "learning_rate": 8.479679563390868e-06, + "loss": 0.5197, + "step": 4960 + }, + { + "epoch": 0.8147311805883439, + "grad_norm": 0.2796960538183455, + "learning_rate": 8.479460634730347e-06, + "loss": 0.5093, + "step": 4961 + }, + { + "epoch": 0.8148954077966867, + "grad_norm": 0.3228600225851728, + "learning_rate": 8.47924166284897e-06, + "loss": 0.5218, + "step": 4962 + }, + { + "epoch": 0.8150596350050294, + "grad_norm": 0.4189371272712448, + "learning_rate": 8.479022647749112e-06, + "loss": 0.536, + "step": 4963 + }, + { + "epoch": 0.8152238622133722, + "grad_norm": 0.3709891099591292, + "learning_rate": 8.478803589433154e-06, + "loss": 0.4873, + "step": 4964 + }, + { + "epoch": 0.8153880894217149, + "grad_norm": 0.2950453878548158, + "learning_rate": 8.478584487903475e-06, + "loss": 0.5154, + "step": 4965 + }, + { + "epoch": 0.8155523166300577, + "grad_norm": 0.35668420853045724, + "learning_rate": 8.478365343162452e-06, + "loss": 0.5237, + "step": 4966 + }, + { + "epoch": 0.8157165438384004, + "grad_norm": 0.5139188172905095, + "learning_rate": 8.478146155212469e-06, + "loss": 0.5194, + "step": 4967 + }, + { + "epoch": 0.8158807710467432, + "grad_norm": 0.3158036394353838, + "learning_rate": 8.477926924055905e-06, + "loss": 0.5001, + "step": 4968 + }, + { + "epoch": 0.8160449982550859, + "grad_norm": 0.35621905036856805, + "learning_rate": 8.477707649695139e-06, + "loss": 0.5315, + "step": 4969 + }, + { + "epoch": 0.8162092254634287, + "grad_norm": 0.4811902163500481, + "learning_rate": 8.477488332132554e-06, + "loss": 0.4975, + "step": 4970 + }, + { + "epoch": 0.8163734526717714, + "grad_norm": 0.2924221106772756, + "learning_rate": 8.477268971370535e-06, + "loss": 0.5045, + "step": 4971 + }, + { + "epoch": 0.8165376798801142, + "grad_norm": 0.3111977388086835, + "learning_rate": 8.47704956741146e-06, + "loss": 0.5046, + "step": 4972 + }, + { + "epoch": 0.8167019070884569, + "grad_norm": 0.5014288256969606, + "learning_rate": 8.476830120257715e-06, + "loss": 0.522, + "step": 4973 + }, + { + "epoch": 0.8168661342967997, + "grad_norm": 0.30421993520314305, + "learning_rate": 8.47661062991168e-06, + "loss": 0.5172, + "step": 4974 + }, + { + "epoch": 0.8170303615051424, + "grad_norm": 0.3200743075995799, + "learning_rate": 8.476391096375744e-06, + "loss": 0.4989, + "step": 4975 + }, + { + "epoch": 0.8171945887134852, + "grad_norm": 0.3458207012656067, + "learning_rate": 8.476171519652288e-06, + "loss": 0.5056, + "step": 4976 + }, + { + "epoch": 0.8173588159218278, + "grad_norm": 0.2890057786017369, + "learning_rate": 8.475951899743695e-06, + "loss": 0.5013, + "step": 4977 + }, + { + "epoch": 0.8175230431301705, + "grad_norm": 0.3296551060524736, + "learning_rate": 8.475732236652356e-06, + "loss": 0.51, + "step": 4978 + }, + { + "epoch": 0.8176872703385133, + "grad_norm": 0.3430824710511646, + "learning_rate": 8.47551253038065e-06, + "loss": 0.4904, + "step": 4979 + }, + { + "epoch": 0.817851497546856, + "grad_norm": 0.2904594269875365, + "learning_rate": 8.475292780930968e-06, + "loss": 0.5092, + "step": 4980 + }, + { + "epoch": 0.8180157247551988, + "grad_norm": 0.30751543008238613, + "learning_rate": 8.475072988305696e-06, + "loss": 0.523, + "step": 4981 + }, + { + "epoch": 0.8181799519635415, + "grad_norm": 0.47299931672604006, + "learning_rate": 8.474853152507219e-06, + "loss": 0.4972, + "step": 4982 + }, + { + "epoch": 0.8183441791718843, + "grad_norm": 0.3891091435955476, + "learning_rate": 8.474633273537927e-06, + "loss": 0.5199, + "step": 4983 + }, + { + "epoch": 0.818508406380227, + "grad_norm": 0.3165875924441726, + "learning_rate": 8.474413351400206e-06, + "loss": 0.5225, + "step": 4984 + }, + { + "epoch": 0.8186726335885698, + "grad_norm": 0.3692974632226017, + "learning_rate": 8.474193386096447e-06, + "loss": 0.5208, + "step": 4985 + }, + { + "epoch": 0.8188368607969125, + "grad_norm": 0.7791784938729102, + "learning_rate": 8.473973377629036e-06, + "loss": 0.5032, + "step": 4986 + }, + { + "epoch": 0.8190010880052553, + "grad_norm": 0.4044083364461391, + "learning_rate": 8.473753326000367e-06, + "loss": 0.5233, + "step": 4987 + }, + { + "epoch": 0.819165315213598, + "grad_norm": 0.4033049001473696, + "learning_rate": 8.473533231212827e-06, + "loss": 0.5099, + "step": 4988 + }, + { + "epoch": 0.8193295424219408, + "grad_norm": 0.3596693458396118, + "learning_rate": 8.473313093268805e-06, + "loss": 0.5225, + "step": 4989 + }, + { + "epoch": 0.8194937696302835, + "grad_norm": 0.41076914940960707, + "learning_rate": 8.473092912170692e-06, + "loss": 0.5124, + "step": 4990 + }, + { + "epoch": 0.8196579968386263, + "grad_norm": 0.3401356269581785, + "learning_rate": 8.472872687920884e-06, + "loss": 0.506, + "step": 4991 + }, + { + "epoch": 0.819822224046969, + "grad_norm": 0.47125293670893087, + "learning_rate": 8.472652420521768e-06, + "loss": 0.5247, + "step": 4992 + }, + { + "epoch": 0.8199864512553118, + "grad_norm": 0.28918564354635806, + "learning_rate": 8.472432109975739e-06, + "loss": 0.5196, + "step": 4993 + }, + { + "epoch": 0.8201506784636544, + "grad_norm": 0.36219635279432766, + "learning_rate": 8.47221175628519e-06, + "loss": 0.5192, + "step": 4994 + }, + { + "epoch": 0.8203149056719972, + "grad_norm": 0.34955506029391137, + "learning_rate": 8.47199135945251e-06, + "loss": 0.51, + "step": 4995 + }, + { + "epoch": 0.8204791328803399, + "grad_norm": 0.45748560279916733, + "learning_rate": 8.471770919480099e-06, + "loss": 0.4931, + "step": 4996 + }, + { + "epoch": 0.8206433600886827, + "grad_norm": 0.9575608714826659, + "learning_rate": 8.471550436370348e-06, + "loss": 0.5599, + "step": 4997 + }, + { + "epoch": 0.8208075872970254, + "grad_norm": 0.2853618278425937, + "learning_rate": 8.47132991012565e-06, + "loss": 0.5248, + "step": 4998 + }, + { + "epoch": 0.8209718145053682, + "grad_norm": 0.3093195551711788, + "learning_rate": 8.471109340748404e-06, + "loss": 0.5099, + "step": 4999 + }, + { + "epoch": 0.8211360417137109, + "grad_norm": 0.319549610457436, + "learning_rate": 8.470888728241e-06, + "loss": 0.5118, + "step": 5000 + }, + { + "epoch": 0.8213002689220537, + "grad_norm": 0.27962509644036526, + "learning_rate": 8.47066807260584e-06, + "loss": 0.5468, + "step": 5001 + }, + { + "epoch": 0.8214644961303964, + "grad_norm": 0.2948514500194613, + "learning_rate": 8.470447373845318e-06, + "loss": 0.5016, + "step": 5002 + }, + { + "epoch": 0.8216287233387392, + "grad_norm": 0.34033167071439674, + "learning_rate": 8.470226631961833e-06, + "loss": 0.5065, + "step": 5003 + }, + { + "epoch": 0.8217929505470819, + "grad_norm": 0.2879423318987142, + "learning_rate": 8.470005846957777e-06, + "loss": 0.5033, + "step": 5004 + }, + { + "epoch": 0.8219571777554247, + "grad_norm": 0.29707502090044696, + "learning_rate": 8.469785018835555e-06, + "loss": 0.4962, + "step": 5005 + }, + { + "epoch": 0.8221214049637674, + "grad_norm": 0.32249529594575727, + "learning_rate": 8.46956414759756e-06, + "loss": 0.5189, + "step": 5006 + }, + { + "epoch": 0.8222856321721101, + "grad_norm": 0.44102538660174917, + "learning_rate": 8.469343233246193e-06, + "loss": 0.5341, + "step": 5007 + }, + { + "epoch": 0.8224498593804529, + "grad_norm": 0.36244736256653914, + "learning_rate": 8.469122275783853e-06, + "loss": 0.5041, + "step": 5008 + }, + { + "epoch": 0.8226140865887956, + "grad_norm": 0.43767800046863825, + "learning_rate": 8.46890127521294e-06, + "loss": 0.5344, + "step": 5009 + }, + { + "epoch": 0.8227783137971384, + "grad_norm": 0.7840909278204317, + "learning_rate": 8.468680231535856e-06, + "loss": 0.5084, + "step": 5010 + }, + { + "epoch": 0.822942541005481, + "grad_norm": 0.39755023280098944, + "learning_rate": 8.468459144754998e-06, + "loss": 0.5206, + "step": 5011 + }, + { + "epoch": 0.8231067682138238, + "grad_norm": 0.3169144920240181, + "learning_rate": 8.468238014872769e-06, + "loss": 0.5044, + "step": 5012 + }, + { + "epoch": 0.8232709954221665, + "grad_norm": 0.32759349000574406, + "learning_rate": 8.468016841891572e-06, + "loss": 0.5082, + "step": 5013 + }, + { + "epoch": 0.8234352226305093, + "grad_norm": 0.3040957401155583, + "learning_rate": 8.467795625813808e-06, + "loss": 0.5149, + "step": 5014 + }, + { + "epoch": 0.823599449838852, + "grad_norm": 0.5909867028738016, + "learning_rate": 8.46757436664188e-06, + "loss": 0.5066, + "step": 5015 + }, + { + "epoch": 0.8237636770471948, + "grad_norm": 0.3331879110597187, + "learning_rate": 8.46735306437819e-06, + "loss": 0.5252, + "step": 5016 + }, + { + "epoch": 0.8239279042555375, + "grad_norm": 0.28786272922468276, + "learning_rate": 8.467131719025143e-06, + "loss": 0.5219, + "step": 5017 + }, + { + "epoch": 0.8240921314638803, + "grad_norm": 0.37116912026207094, + "learning_rate": 8.466910330585142e-06, + "loss": 0.5163, + "step": 5018 + }, + { + "epoch": 0.824256358672223, + "grad_norm": 0.41794447544277624, + "learning_rate": 8.466688899060593e-06, + "loss": 0.5218, + "step": 5019 + }, + { + "epoch": 0.8244205858805658, + "grad_norm": 0.3212900078391455, + "learning_rate": 8.466467424453898e-06, + "loss": 0.5098, + "step": 5020 + }, + { + "epoch": 0.8245848130889085, + "grad_norm": 0.3470167865008464, + "learning_rate": 8.466245906767464e-06, + "loss": 0.4971, + "step": 5021 + }, + { + "epoch": 0.8247490402972513, + "grad_norm": 0.3205638094446894, + "learning_rate": 8.4660243460037e-06, + "loss": 0.5236, + "step": 5022 + }, + { + "epoch": 0.824913267505594, + "grad_norm": 0.31070966884605533, + "learning_rate": 8.465802742165007e-06, + "loss": 0.4918, + "step": 5023 + }, + { + "epoch": 0.8250774947139368, + "grad_norm": 0.3138870801849995, + "learning_rate": 8.465581095253795e-06, + "loss": 0.5128, + "step": 5024 + }, + { + "epoch": 0.8252417219222795, + "grad_norm": 0.45582958216423314, + "learning_rate": 8.465359405272471e-06, + "loss": 0.5094, + "step": 5025 + }, + { + "epoch": 0.8254059491306223, + "grad_norm": 0.32263746700503765, + "learning_rate": 8.465137672223444e-06, + "loss": 0.5176, + "step": 5026 + }, + { + "epoch": 0.825570176338965, + "grad_norm": 0.36571660406047724, + "learning_rate": 8.464915896109118e-06, + "loss": 0.5176, + "step": 5027 + }, + { + "epoch": 0.8257344035473076, + "grad_norm": 0.37715049533934497, + "learning_rate": 8.464694076931907e-06, + "loss": 0.5048, + "step": 5028 + }, + { + "epoch": 0.8258986307556504, + "grad_norm": 0.3525169971936548, + "learning_rate": 8.464472214694216e-06, + "loss": 0.4956, + "step": 5029 + }, + { + "epoch": 0.8260628579639931, + "grad_norm": 0.34935023750783545, + "learning_rate": 8.464250309398457e-06, + "loss": 0.5043, + "step": 5030 + }, + { + "epoch": 0.8262270851723359, + "grad_norm": 0.3003527457744416, + "learning_rate": 8.464028361047037e-06, + "loss": 0.5098, + "step": 5031 + }, + { + "epoch": 0.8263913123806786, + "grad_norm": 0.3211925859594905, + "learning_rate": 8.463806369642373e-06, + "loss": 0.5151, + "step": 5032 + }, + { + "epoch": 0.8265555395890214, + "grad_norm": 0.2881298613603624, + "learning_rate": 8.46358433518687e-06, + "loss": 0.522, + "step": 5033 + }, + { + "epoch": 0.8267197667973641, + "grad_norm": 0.3088604066956432, + "learning_rate": 8.46336225768294e-06, + "loss": 0.5042, + "step": 5034 + }, + { + "epoch": 0.8268839940057069, + "grad_norm": 0.3207947009742772, + "learning_rate": 8.463140137132997e-06, + "loss": 0.4924, + "step": 5035 + }, + { + "epoch": 0.8270482212140496, + "grad_norm": 0.30583600515085474, + "learning_rate": 8.462917973539454e-06, + "loss": 0.5144, + "step": 5036 + }, + { + "epoch": 0.8272124484223924, + "grad_norm": 0.3273592428080974, + "learning_rate": 8.462695766904724e-06, + "loss": 0.5046, + "step": 5037 + }, + { + "epoch": 0.8273766756307351, + "grad_norm": 0.24219539916900257, + "learning_rate": 8.462473517231217e-06, + "loss": 0.4942, + "step": 5038 + }, + { + "epoch": 0.8275409028390779, + "grad_norm": 0.30598944645398374, + "learning_rate": 8.462251224521349e-06, + "loss": 0.5089, + "step": 5039 + }, + { + "epoch": 0.8277051300474206, + "grad_norm": 0.2890067799934394, + "learning_rate": 8.462028888777536e-06, + "loss": 0.5153, + "step": 5040 + }, + { + "epoch": 0.8278693572557634, + "grad_norm": 0.31820577059903815, + "learning_rate": 8.461806510002189e-06, + "loss": 0.4943, + "step": 5041 + }, + { + "epoch": 0.8280335844641061, + "grad_norm": 0.3160940376849428, + "learning_rate": 8.461584088197726e-06, + "loss": 0.5148, + "step": 5042 + }, + { + "epoch": 0.8281978116724489, + "grad_norm": 0.30044612758537664, + "learning_rate": 8.461361623366564e-06, + "loss": 0.498, + "step": 5043 + }, + { + "epoch": 0.8283620388807916, + "grad_norm": 0.26764067211180775, + "learning_rate": 8.461139115511116e-06, + "loss": 0.4952, + "step": 5044 + }, + { + "epoch": 0.8285262660891343, + "grad_norm": 0.31280182664520434, + "learning_rate": 8.4609165646338e-06, + "loss": 0.5031, + "step": 5045 + }, + { + "epoch": 0.828690493297477, + "grad_norm": 1.0443570125444503, + "learning_rate": 8.460693970737033e-06, + "loss": 0.5144, + "step": 5046 + }, + { + "epoch": 0.8288547205058198, + "grad_norm": 0.31610380504111363, + "learning_rate": 8.460471333823232e-06, + "loss": 0.5046, + "step": 5047 + }, + { + "epoch": 0.8290189477141625, + "grad_norm": 0.2824804649298298, + "learning_rate": 8.460248653894818e-06, + "loss": 0.5478, + "step": 5048 + }, + { + "epoch": 0.8291831749225053, + "grad_norm": 0.345837308750286, + "learning_rate": 8.460025930954206e-06, + "loss": 0.511, + "step": 5049 + }, + { + "epoch": 0.829347402130848, + "grad_norm": 0.3016406355661234, + "learning_rate": 8.459803165003815e-06, + "loss": 0.5182, + "step": 5050 + }, + { + "epoch": 0.8295116293391908, + "grad_norm": 0.3562213213518067, + "learning_rate": 8.459580356046067e-06, + "loss": 0.5082, + "step": 5051 + }, + { + "epoch": 0.8296758565475335, + "grad_norm": 0.3025481909002121, + "learning_rate": 8.459357504083381e-06, + "loss": 0.5105, + "step": 5052 + }, + { + "epoch": 0.8298400837558763, + "grad_norm": 0.33732616817813776, + "learning_rate": 8.459134609118175e-06, + "loss": 0.5031, + "step": 5053 + }, + { + "epoch": 0.830004310964219, + "grad_norm": 0.330670296404074, + "learning_rate": 8.458911671152874e-06, + "loss": 0.491, + "step": 5054 + }, + { + "epoch": 0.8301685381725618, + "grad_norm": 0.31696394533350386, + "learning_rate": 8.458688690189897e-06, + "loss": 0.5151, + "step": 5055 + }, + { + "epoch": 0.8303327653809045, + "grad_norm": 0.35470601501695254, + "learning_rate": 8.458465666231665e-06, + "loss": 0.5185, + "step": 5056 + }, + { + "epoch": 0.8304969925892473, + "grad_norm": 0.2745766999857397, + "learning_rate": 8.4582425992806e-06, + "loss": 0.5128, + "step": 5057 + }, + { + "epoch": 0.83066121979759, + "grad_norm": 0.2910543164432107, + "learning_rate": 8.458019489339129e-06, + "loss": 0.5187, + "step": 5058 + }, + { + "epoch": 0.8308254470059327, + "grad_norm": 0.3428610977610601, + "learning_rate": 8.457796336409672e-06, + "loss": 0.5007, + "step": 5059 + }, + { + "epoch": 0.8309896742142755, + "grad_norm": 0.27773717516347307, + "learning_rate": 8.45757314049465e-06, + "loss": 0.4994, + "step": 5060 + }, + { + "epoch": 0.8311539014226182, + "grad_norm": 0.28802939482408896, + "learning_rate": 8.457349901596492e-06, + "loss": 0.5205, + "step": 5061 + }, + { + "epoch": 0.8313181286309609, + "grad_norm": 0.26938147503957355, + "learning_rate": 8.45712661971762e-06, + "loss": 0.4928, + "step": 5062 + }, + { + "epoch": 0.8314823558393036, + "grad_norm": 0.27567436454998057, + "learning_rate": 8.456903294860462e-06, + "loss": 0.506, + "step": 5063 + }, + { + "epoch": 0.8316465830476464, + "grad_norm": 0.2871225248215651, + "learning_rate": 8.456679927027438e-06, + "loss": 0.5058, + "step": 5064 + }, + { + "epoch": 0.8318108102559891, + "grad_norm": 0.3209137982397029, + "learning_rate": 8.45645651622098e-06, + "loss": 0.5263, + "step": 5065 + }, + { + "epoch": 0.8319750374643319, + "grad_norm": 0.2785837470184095, + "learning_rate": 8.456233062443508e-06, + "loss": 0.51, + "step": 5066 + }, + { + "epoch": 0.8321392646726746, + "grad_norm": 0.29409807614294214, + "learning_rate": 8.456009565697455e-06, + "loss": 0.5561, + "step": 5067 + }, + { + "epoch": 0.8323034918810174, + "grad_norm": 0.28712716258345833, + "learning_rate": 8.455786025985244e-06, + "loss": 0.5115, + "step": 5068 + }, + { + "epoch": 0.8324677190893601, + "grad_norm": 0.284167141367049, + "learning_rate": 8.455562443309308e-06, + "loss": 0.5049, + "step": 5069 + }, + { + "epoch": 0.8326319462977029, + "grad_norm": 0.29551746610441004, + "learning_rate": 8.455338817672069e-06, + "loss": 0.4867, + "step": 5070 + }, + { + "epoch": 0.8327961735060456, + "grad_norm": 0.27646057535916885, + "learning_rate": 8.455115149075961e-06, + "loss": 0.5164, + "step": 5071 + }, + { + "epoch": 0.8329604007143884, + "grad_norm": 0.3010201736022869, + "learning_rate": 8.45489143752341e-06, + "loss": 0.5152, + "step": 5072 + }, + { + "epoch": 0.8331246279227311, + "grad_norm": 0.2622201228558617, + "learning_rate": 8.454667683016847e-06, + "loss": 0.4946, + "step": 5073 + }, + { + "epoch": 0.8332888551310739, + "grad_norm": 0.3100844241068556, + "learning_rate": 8.454443885558702e-06, + "loss": 0.5018, + "step": 5074 + }, + { + "epoch": 0.8334530823394166, + "grad_norm": 0.2609866015590737, + "learning_rate": 8.454220045151407e-06, + "loss": 0.5066, + "step": 5075 + }, + { + "epoch": 0.8336173095477594, + "grad_norm": 0.443640168900632, + "learning_rate": 8.45399616179739e-06, + "loss": 0.4883, + "step": 5076 + }, + { + "epoch": 0.8337815367561021, + "grad_norm": 0.3412324134114467, + "learning_rate": 8.453772235499085e-06, + "loss": 0.5173, + "step": 5077 + }, + { + "epoch": 0.8339457639644449, + "grad_norm": 0.31007454373042503, + "learning_rate": 8.453548266258924e-06, + "loss": 0.5246, + "step": 5078 + }, + { + "epoch": 0.8341099911727875, + "grad_norm": 0.3047252056284528, + "learning_rate": 8.45332425407934e-06, + "loss": 0.521, + "step": 5079 + }, + { + "epoch": 0.8342742183811303, + "grad_norm": 0.3046193883490871, + "learning_rate": 8.453100198962764e-06, + "loss": 0.5287, + "step": 5080 + }, + { + "epoch": 0.834438445589473, + "grad_norm": 0.2786460052072658, + "learning_rate": 8.45287610091163e-06, + "loss": 0.5062, + "step": 5081 + }, + { + "epoch": 0.8346026727978157, + "grad_norm": 0.3078651792888881, + "learning_rate": 8.452651959928374e-06, + "loss": 0.5048, + "step": 5082 + }, + { + "epoch": 0.8347669000061585, + "grad_norm": 0.2877663514705134, + "learning_rate": 8.452427776015428e-06, + "loss": 0.5403, + "step": 5083 + }, + { + "epoch": 0.8349311272145012, + "grad_norm": 0.275191748660642, + "learning_rate": 8.452203549175226e-06, + "loss": 0.5112, + "step": 5084 + }, + { + "epoch": 0.835095354422844, + "grad_norm": 0.29317499292240806, + "learning_rate": 8.451979279410207e-06, + "loss": 0.5207, + "step": 5085 + }, + { + "epoch": 0.8352595816311867, + "grad_norm": 0.28347051510346993, + "learning_rate": 8.451754966722804e-06, + "loss": 0.4999, + "step": 5086 + }, + { + "epoch": 0.8354238088395295, + "grad_norm": 0.3021476025483518, + "learning_rate": 8.451530611115456e-06, + "loss": 0.5157, + "step": 5087 + }, + { + "epoch": 0.8355880360478722, + "grad_norm": 0.32453355242197585, + "learning_rate": 8.451306212590595e-06, + "loss": 0.4964, + "step": 5088 + }, + { + "epoch": 0.835752263256215, + "grad_norm": 0.2786831375180388, + "learning_rate": 8.451081771150663e-06, + "loss": 0.5112, + "step": 5089 + }, + { + "epoch": 0.8359164904645577, + "grad_norm": 0.3315162275707279, + "learning_rate": 8.450857286798095e-06, + "loss": 0.5169, + "step": 5090 + }, + { + "epoch": 0.8360807176729005, + "grad_norm": 0.43424131015762185, + "learning_rate": 8.450632759535329e-06, + "loss": 0.4969, + "step": 5091 + }, + { + "epoch": 0.8362449448812432, + "grad_norm": 0.34337420336609564, + "learning_rate": 8.450408189364805e-06, + "loss": 0.522, + "step": 5092 + }, + { + "epoch": 0.836409172089586, + "grad_norm": 0.27471053663336875, + "learning_rate": 8.450183576288962e-06, + "loss": 0.5141, + "step": 5093 + }, + { + "epoch": 0.8365733992979287, + "grad_norm": 0.416085438847666, + "learning_rate": 8.449958920310237e-06, + "loss": 0.5061, + "step": 5094 + }, + { + "epoch": 0.8367376265062715, + "grad_norm": 0.32310592092724405, + "learning_rate": 8.449734221431073e-06, + "loss": 0.5235, + "step": 5095 + }, + { + "epoch": 0.8369018537146141, + "grad_norm": 0.3186821997518407, + "learning_rate": 8.449509479653911e-06, + "loss": 0.4872, + "step": 5096 + }, + { + "epoch": 0.8370660809229569, + "grad_norm": 0.2843584333397972, + "learning_rate": 8.449284694981187e-06, + "loss": 0.5301, + "step": 5097 + }, + { + "epoch": 0.8372303081312996, + "grad_norm": 0.3032449654190625, + "learning_rate": 8.449059867415348e-06, + "loss": 0.5181, + "step": 5098 + }, + { + "epoch": 0.8373945353396424, + "grad_norm": 0.33532208528088053, + "learning_rate": 8.448834996958833e-06, + "loss": 0.5166, + "step": 5099 + }, + { + "epoch": 0.8375587625479851, + "grad_norm": 0.3208069728155115, + "learning_rate": 8.448610083614085e-06, + "loss": 0.5185, + "step": 5100 + }, + { + "epoch": 0.8377229897563279, + "grad_norm": 0.2805736230950408, + "learning_rate": 8.448385127383546e-06, + "loss": 0.5269, + "step": 5101 + }, + { + "epoch": 0.8378872169646706, + "grad_norm": 0.28498221869287943, + "learning_rate": 8.448160128269659e-06, + "loss": 0.5032, + "step": 5102 + }, + { + "epoch": 0.8380514441730134, + "grad_norm": 0.3092573990050545, + "learning_rate": 8.44793508627487e-06, + "loss": 0.5213, + "step": 5103 + }, + { + "epoch": 0.8382156713813561, + "grad_norm": 0.2936327617561994, + "learning_rate": 8.447710001401622e-06, + "loss": 0.4974, + "step": 5104 + }, + { + "epoch": 0.8383798985896989, + "grad_norm": 0.3591294236741196, + "learning_rate": 8.447484873652358e-06, + "loss": 0.5023, + "step": 5105 + }, + { + "epoch": 0.8385441257980416, + "grad_norm": 0.3441918054279239, + "learning_rate": 8.447259703029525e-06, + "loss": 0.5147, + "step": 5106 + }, + { + "epoch": 0.8387083530063844, + "grad_norm": 0.3430452362801982, + "learning_rate": 8.447034489535569e-06, + "loss": 0.5152, + "step": 5107 + }, + { + "epoch": 0.8388725802147271, + "grad_norm": 0.31263270317206515, + "learning_rate": 8.446809233172934e-06, + "loss": 0.524, + "step": 5108 + }, + { + "epoch": 0.8390368074230699, + "grad_norm": 0.28639111012314905, + "learning_rate": 8.446583933944067e-06, + "loss": 0.519, + "step": 5109 + }, + { + "epoch": 0.8392010346314126, + "grad_norm": 0.3446313615810392, + "learning_rate": 8.446358591851417e-06, + "loss": 0.5128, + "step": 5110 + }, + { + "epoch": 0.8393652618397553, + "grad_norm": 0.5348002296317965, + "learning_rate": 8.446133206897429e-06, + "loss": 0.4922, + "step": 5111 + }, + { + "epoch": 0.8395294890480981, + "grad_norm": 0.5051111534121978, + "learning_rate": 8.445907779084553e-06, + "loss": 0.5418, + "step": 5112 + }, + { + "epoch": 0.8396937162564407, + "grad_norm": 0.30406474946147793, + "learning_rate": 8.445682308415235e-06, + "loss": 0.4946, + "step": 5113 + }, + { + "epoch": 0.8398579434647835, + "grad_norm": 0.3380086818662066, + "learning_rate": 8.445456794891925e-06, + "loss": 0.5255, + "step": 5114 + }, + { + "epoch": 0.8400221706731262, + "grad_norm": 0.3438745728098006, + "learning_rate": 8.445231238517073e-06, + "loss": 0.5423, + "step": 5115 + }, + { + "epoch": 0.840186397881469, + "grad_norm": 0.40381447460269726, + "learning_rate": 8.44500563929313e-06, + "loss": 0.5137, + "step": 5116 + }, + { + "epoch": 0.8403506250898117, + "grad_norm": 0.40787796480711397, + "learning_rate": 8.444779997222541e-06, + "loss": 0.5064, + "step": 5117 + }, + { + "epoch": 0.8405148522981545, + "grad_norm": 0.36288902748934776, + "learning_rate": 8.444554312307763e-06, + "loss": 0.5082, + "step": 5118 + }, + { + "epoch": 0.8406790795064972, + "grad_norm": 0.30047433656216793, + "learning_rate": 8.444328584551243e-06, + "loss": 0.512, + "step": 5119 + }, + { + "epoch": 0.84084330671484, + "grad_norm": 0.3750624604659782, + "learning_rate": 8.444102813955435e-06, + "loss": 0.5186, + "step": 5120 + }, + { + "epoch": 0.8410075339231827, + "grad_norm": 0.34600270284100215, + "learning_rate": 8.443877000522788e-06, + "loss": 0.4973, + "step": 5121 + }, + { + "epoch": 0.8411717611315255, + "grad_norm": 0.3139983714728081, + "learning_rate": 8.443651144255756e-06, + "loss": 0.5256, + "step": 5122 + }, + { + "epoch": 0.8413359883398682, + "grad_norm": 0.3346488739033981, + "learning_rate": 8.443425245156795e-06, + "loss": 0.5163, + "step": 5123 + }, + { + "epoch": 0.841500215548211, + "grad_norm": 0.2948288518207391, + "learning_rate": 8.443199303228355e-06, + "loss": 0.5234, + "step": 5124 + }, + { + "epoch": 0.8416644427565537, + "grad_norm": 0.33203493477375534, + "learning_rate": 8.44297331847289e-06, + "loss": 0.5105, + "step": 5125 + }, + { + "epoch": 0.8418286699648965, + "grad_norm": 0.30379991266832673, + "learning_rate": 8.442747290892856e-06, + "loss": 0.5144, + "step": 5126 + }, + { + "epoch": 0.8419928971732392, + "grad_norm": 0.28029415277526454, + "learning_rate": 8.44252122049071e-06, + "loss": 0.508, + "step": 5127 + }, + { + "epoch": 0.842157124381582, + "grad_norm": 0.49847696844628037, + "learning_rate": 8.4422951072689e-06, + "loss": 0.5056, + "step": 5128 + }, + { + "epoch": 0.8423213515899247, + "grad_norm": 0.5120725889382312, + "learning_rate": 8.44206895122989e-06, + "loss": 0.5298, + "step": 5129 + }, + { + "epoch": 0.8424855787982674, + "grad_norm": 0.2882998585510167, + "learning_rate": 8.44184275237613e-06, + "loss": 0.5023, + "step": 5130 + }, + { + "epoch": 0.8426498060066101, + "grad_norm": 0.3236394026994081, + "learning_rate": 8.441616510710082e-06, + "loss": 0.5098, + "step": 5131 + }, + { + "epoch": 0.8428140332149529, + "grad_norm": 0.3631167452727695, + "learning_rate": 8.441390226234199e-06, + "loss": 0.5206, + "step": 5132 + }, + { + "epoch": 0.8429782604232956, + "grad_norm": 0.3183503409285738, + "learning_rate": 8.441163898950941e-06, + "loss": 0.5229, + "step": 5133 + }, + { + "epoch": 0.8431424876316383, + "grad_norm": 0.30255751019464133, + "learning_rate": 8.440937528862766e-06, + "loss": 0.5029, + "step": 5134 + }, + { + "epoch": 0.8433067148399811, + "grad_norm": 0.3335196583075735, + "learning_rate": 8.440711115972131e-06, + "loss": 0.504, + "step": 5135 + }, + { + "epoch": 0.8434709420483238, + "grad_norm": 0.32001255656113226, + "learning_rate": 8.440484660281496e-06, + "loss": 0.495, + "step": 5136 + }, + { + "epoch": 0.8436351692566666, + "grad_norm": 0.36895012575981045, + "learning_rate": 8.440258161793321e-06, + "loss": 0.5073, + "step": 5137 + }, + { + "epoch": 0.8437993964650093, + "grad_norm": 0.2755984017532756, + "learning_rate": 8.440031620510068e-06, + "loss": 0.5289, + "step": 5138 + }, + { + "epoch": 0.8439636236733521, + "grad_norm": 0.3473947843063845, + "learning_rate": 8.439805036434191e-06, + "loss": 0.5169, + "step": 5139 + }, + { + "epoch": 0.8441278508816948, + "grad_norm": 0.30833567846849247, + "learning_rate": 8.439578409568158e-06, + "loss": 0.5049, + "step": 5140 + }, + { + "epoch": 0.8442920780900376, + "grad_norm": 0.2868463061560813, + "learning_rate": 8.439351739914427e-06, + "loss": 0.5024, + "step": 5141 + }, + { + "epoch": 0.8444563052983803, + "grad_norm": 0.27930508177210783, + "learning_rate": 8.439125027475459e-06, + "loss": 0.5055, + "step": 5142 + }, + { + "epoch": 0.8446205325067231, + "grad_norm": 0.38485774036475545, + "learning_rate": 8.438898272253719e-06, + "loss": 0.511, + "step": 5143 + }, + { + "epoch": 0.8447847597150658, + "grad_norm": 0.30246859023528366, + "learning_rate": 8.438671474251667e-06, + "loss": 0.5309, + "step": 5144 + }, + { + "epoch": 0.8449489869234086, + "grad_norm": 0.324011948208211, + "learning_rate": 8.43844463347177e-06, + "loss": 0.506, + "step": 5145 + }, + { + "epoch": 0.8451132141317512, + "grad_norm": 0.3072522424422487, + "learning_rate": 8.438217749916488e-06, + "loss": 0.5089, + "step": 5146 + }, + { + "epoch": 0.845277441340094, + "grad_norm": 0.44271916061839944, + "learning_rate": 8.437990823588285e-06, + "loss": 0.5399, + "step": 5147 + }, + { + "epoch": 0.8454416685484367, + "grad_norm": 0.4356296662687189, + "learning_rate": 8.43776385448963e-06, + "loss": 0.5228, + "step": 5148 + }, + { + "epoch": 0.8456058957567795, + "grad_norm": 0.2668006225000898, + "learning_rate": 8.437536842622982e-06, + "loss": 0.5014, + "step": 5149 + }, + { + "epoch": 0.8457701229651222, + "grad_norm": 0.26564245328819, + "learning_rate": 8.437309787990813e-06, + "loss": 0.5096, + "step": 5150 + }, + { + "epoch": 0.845934350173465, + "grad_norm": 0.2983680257581951, + "learning_rate": 8.437082690595584e-06, + "loss": 0.5136, + "step": 5151 + }, + { + "epoch": 0.8460985773818077, + "grad_norm": 0.40578399435243784, + "learning_rate": 8.436855550439765e-06, + "loss": 0.4978, + "step": 5152 + }, + { + "epoch": 0.8462628045901505, + "grad_norm": 0.44301373104727876, + "learning_rate": 8.43662836752582e-06, + "loss": 0.4988, + "step": 5153 + }, + { + "epoch": 0.8464270317984932, + "grad_norm": 0.39247545992276484, + "learning_rate": 8.436401141856218e-06, + "loss": 0.5338, + "step": 5154 + }, + { + "epoch": 0.846591259006836, + "grad_norm": 0.2681976914938843, + "learning_rate": 8.436173873433428e-06, + "loss": 0.5075, + "step": 5155 + }, + { + "epoch": 0.8467554862151787, + "grad_norm": 0.3673054253397242, + "learning_rate": 8.435946562259917e-06, + "loss": 0.4936, + "step": 5156 + }, + { + "epoch": 0.8469197134235215, + "grad_norm": 0.3534936525961469, + "learning_rate": 8.435719208338153e-06, + "loss": 0.5091, + "step": 5157 + }, + { + "epoch": 0.8470839406318642, + "grad_norm": 0.3079026562799358, + "learning_rate": 8.435491811670605e-06, + "loss": 0.5126, + "step": 5158 + }, + { + "epoch": 0.847248167840207, + "grad_norm": 0.37457392062869027, + "learning_rate": 8.435264372259745e-06, + "loss": 0.511, + "step": 5159 + }, + { + "epoch": 0.8474123950485497, + "grad_norm": 0.31250952877477833, + "learning_rate": 8.435036890108042e-06, + "loss": 0.5055, + "step": 5160 + }, + { + "epoch": 0.8475766222568925, + "grad_norm": 0.34748176886976945, + "learning_rate": 8.434809365217968e-06, + "loss": 0.5216, + "step": 5161 + }, + { + "epoch": 0.8477408494652352, + "grad_norm": 0.4337562987176165, + "learning_rate": 8.434581797591992e-06, + "loss": 0.5325, + "step": 5162 + }, + { + "epoch": 0.8479050766735778, + "grad_norm": 0.30289333346415104, + "learning_rate": 8.434354187232587e-06, + "loss": 0.5138, + "step": 5163 + }, + { + "epoch": 0.8480693038819206, + "grad_norm": 0.28790463141112965, + "learning_rate": 8.434126534142223e-06, + "loss": 0.5058, + "step": 5164 + }, + { + "epoch": 0.8482335310902633, + "grad_norm": 0.28706340640753114, + "learning_rate": 8.433898838323375e-06, + "loss": 0.5226, + "step": 5165 + }, + { + "epoch": 0.8483977582986061, + "grad_norm": 0.32648676816606786, + "learning_rate": 8.433671099778517e-06, + "loss": 0.5049, + "step": 5166 + }, + { + "epoch": 0.8485619855069488, + "grad_norm": 0.3414586375904832, + "learning_rate": 8.43344331851012e-06, + "loss": 0.4967, + "step": 5167 + }, + { + "epoch": 0.8487262127152916, + "grad_norm": 0.3822562701157267, + "learning_rate": 8.433215494520657e-06, + "loss": 0.4928, + "step": 5168 + }, + { + "epoch": 0.8488904399236343, + "grad_norm": 0.30473631955247477, + "learning_rate": 8.432987627812606e-06, + "loss": 0.5051, + "step": 5169 + }, + { + "epoch": 0.8490546671319771, + "grad_norm": 0.4222486533363501, + "learning_rate": 8.432759718388437e-06, + "loss": 0.5026, + "step": 5170 + }, + { + "epoch": 0.8492188943403198, + "grad_norm": 0.435952572835449, + "learning_rate": 8.43253176625063e-06, + "loss": 0.5014, + "step": 5171 + }, + { + "epoch": 0.8493831215486626, + "grad_norm": 0.35308923943760095, + "learning_rate": 8.432303771401659e-06, + "loss": 0.5239, + "step": 5172 + }, + { + "epoch": 0.8495473487570053, + "grad_norm": 0.3010694457237855, + "learning_rate": 8.432075733844e-06, + "loss": 0.5149, + "step": 5173 + }, + { + "epoch": 0.8497115759653481, + "grad_norm": 0.32228750187138333, + "learning_rate": 8.43184765358013e-06, + "loss": 0.5011, + "step": 5174 + }, + { + "epoch": 0.8498758031736908, + "grad_norm": 0.34908233140622946, + "learning_rate": 8.431619530612525e-06, + "loss": 0.5016, + "step": 5175 + }, + { + "epoch": 0.8500400303820336, + "grad_norm": 0.3444780549516735, + "learning_rate": 8.431391364943665e-06, + "loss": 0.4882, + "step": 5176 + }, + { + "epoch": 0.8502042575903763, + "grad_norm": 0.36189762611863124, + "learning_rate": 8.431163156576028e-06, + "loss": 0.5351, + "step": 5177 + }, + { + "epoch": 0.8503684847987191, + "grad_norm": 0.31406121921663654, + "learning_rate": 8.430934905512087e-06, + "loss": 0.5083, + "step": 5178 + }, + { + "epoch": 0.8505327120070618, + "grad_norm": 0.33779577449560827, + "learning_rate": 8.43070661175433e-06, + "loss": 0.5108, + "step": 5179 + }, + { + "epoch": 0.8506969392154045, + "grad_norm": 0.31857681875078253, + "learning_rate": 8.430478275305228e-06, + "loss": 0.5219, + "step": 5180 + }, + { + "epoch": 0.8508611664237472, + "grad_norm": 0.33539172162870345, + "learning_rate": 8.430249896167269e-06, + "loss": 0.5095, + "step": 5181 + }, + { + "epoch": 0.85102539363209, + "grad_norm": 0.3775500012400237, + "learning_rate": 8.430021474342928e-06, + "loss": 0.5084, + "step": 5182 + }, + { + "epoch": 0.8511896208404327, + "grad_norm": 0.3059924527973876, + "learning_rate": 8.429793009834685e-06, + "loss": 0.51, + "step": 5183 + }, + { + "epoch": 0.8513538480487755, + "grad_norm": 0.3871570428014681, + "learning_rate": 8.429564502645026e-06, + "loss": 0.527, + "step": 5184 + }, + { + "epoch": 0.8515180752571182, + "grad_norm": 0.3048202893164031, + "learning_rate": 8.429335952776428e-06, + "loss": 0.5057, + "step": 5185 + }, + { + "epoch": 0.851682302465461, + "grad_norm": 0.3039632803794553, + "learning_rate": 8.429107360231377e-06, + "loss": 0.4986, + "step": 5186 + }, + { + "epoch": 0.8518465296738037, + "grad_norm": 0.5060909240230959, + "learning_rate": 8.428878725012354e-06, + "loss": 0.5102, + "step": 5187 + }, + { + "epoch": 0.8520107568821464, + "grad_norm": 0.3212393034431408, + "learning_rate": 8.428650047121843e-06, + "loss": 0.5046, + "step": 5188 + }, + { + "epoch": 0.8521749840904892, + "grad_norm": 0.27915733547122695, + "learning_rate": 8.428421326562328e-06, + "loss": 0.5198, + "step": 5189 + }, + { + "epoch": 0.852339211298832, + "grad_norm": 0.28708430427289955, + "learning_rate": 8.42819256333629e-06, + "loss": 0.5145, + "step": 5190 + }, + { + "epoch": 0.8525034385071747, + "grad_norm": 0.30120292472904436, + "learning_rate": 8.427963757446218e-06, + "loss": 0.5093, + "step": 5191 + }, + { + "epoch": 0.8526676657155174, + "grad_norm": 0.35635732289785643, + "learning_rate": 8.427734908894594e-06, + "loss": 0.5047, + "step": 5192 + }, + { + "epoch": 0.8528318929238602, + "grad_norm": 0.28154755868785863, + "learning_rate": 8.427506017683905e-06, + "loss": 0.4954, + "step": 5193 + }, + { + "epoch": 0.8529961201322029, + "grad_norm": 0.31810663307518133, + "learning_rate": 8.427277083816636e-06, + "loss": 0.5163, + "step": 5194 + }, + { + "epoch": 0.8531603473405457, + "grad_norm": 0.4674936251067023, + "learning_rate": 8.427048107295275e-06, + "loss": 0.5103, + "step": 5195 + }, + { + "epoch": 0.8533245745488884, + "grad_norm": 0.3264717854724029, + "learning_rate": 8.426819088122307e-06, + "loss": 0.5157, + "step": 5196 + }, + { + "epoch": 0.8534888017572311, + "grad_norm": 0.34619937788013283, + "learning_rate": 8.42659002630022e-06, + "loss": 0.5148, + "step": 5197 + }, + { + "epoch": 0.8536530289655738, + "grad_norm": 0.30500362594914693, + "learning_rate": 8.426360921831503e-06, + "loss": 0.4945, + "step": 5198 + }, + { + "epoch": 0.8538172561739166, + "grad_norm": 0.35639512512192423, + "learning_rate": 8.426131774718641e-06, + "loss": 0.5236, + "step": 5199 + }, + { + "epoch": 0.8539814833822593, + "grad_norm": 0.32749143917043566, + "learning_rate": 8.425902584964129e-06, + "loss": 0.516, + "step": 5200 + }, + { + "epoch": 0.8541457105906021, + "grad_norm": 0.355371873683865, + "learning_rate": 8.425673352570448e-06, + "loss": 0.5243, + "step": 5201 + }, + { + "epoch": 0.8543099377989448, + "grad_norm": 0.3431264317700648, + "learning_rate": 8.425444077540094e-06, + "loss": 0.5081, + "step": 5202 + }, + { + "epoch": 0.8544741650072876, + "grad_norm": 0.46083194627277885, + "learning_rate": 8.425214759875558e-06, + "loss": 0.5187, + "step": 5203 + }, + { + "epoch": 0.8546383922156303, + "grad_norm": 0.3628692906065567, + "learning_rate": 8.424985399579323e-06, + "loss": 0.4859, + "step": 5204 + }, + { + "epoch": 0.8548026194239731, + "grad_norm": 0.3008814972088375, + "learning_rate": 8.424755996653889e-06, + "loss": 0.4949, + "step": 5205 + }, + { + "epoch": 0.8549668466323158, + "grad_norm": 0.33859496251143534, + "learning_rate": 8.424526551101741e-06, + "loss": 0.5117, + "step": 5206 + }, + { + "epoch": 0.8551310738406586, + "grad_norm": 0.3178485647764328, + "learning_rate": 8.424297062925375e-06, + "loss": 0.5097, + "step": 5207 + }, + { + "epoch": 0.8552953010490013, + "grad_norm": 0.310539171285264, + "learning_rate": 8.42406753212728e-06, + "loss": 0.5085, + "step": 5208 + }, + { + "epoch": 0.8554595282573441, + "grad_norm": 0.3222299566904057, + "learning_rate": 8.423837958709952e-06, + "loss": 0.5099, + "step": 5209 + }, + { + "epoch": 0.8556237554656868, + "grad_norm": 0.3062954314262607, + "learning_rate": 8.423608342675883e-06, + "loss": 0.4782, + "step": 5210 + }, + { + "epoch": 0.8557879826740296, + "grad_norm": 0.3855658097772442, + "learning_rate": 8.423378684027568e-06, + "loss": 0.525, + "step": 5211 + }, + { + "epoch": 0.8559522098823723, + "grad_norm": 0.4804967081262836, + "learning_rate": 8.4231489827675e-06, + "loss": 0.5052, + "step": 5212 + }, + { + "epoch": 0.856116437090715, + "grad_norm": 0.3271596419138035, + "learning_rate": 8.422919238898173e-06, + "loss": 0.4708, + "step": 5213 + }, + { + "epoch": 0.8562806642990577, + "grad_norm": 0.31481541934023216, + "learning_rate": 8.422689452422084e-06, + "loss": 0.5051, + "step": 5214 + }, + { + "epoch": 0.8564448915074004, + "grad_norm": 0.3099269647615455, + "learning_rate": 8.42245962334173e-06, + "loss": 0.4819, + "step": 5215 + }, + { + "epoch": 0.8566091187157432, + "grad_norm": 0.3438652387948608, + "learning_rate": 8.422229751659602e-06, + "loss": 0.4947, + "step": 5216 + }, + { + "epoch": 0.8567733459240859, + "grad_norm": 0.3459932973707239, + "learning_rate": 8.421999837378202e-06, + "loss": 0.53, + "step": 5217 + }, + { + "epoch": 0.8569375731324287, + "grad_norm": 0.27299285950949304, + "learning_rate": 8.421769880500025e-06, + "loss": 0.5281, + "step": 5218 + }, + { + "epoch": 0.8571018003407714, + "grad_norm": 0.27523955025295577, + "learning_rate": 8.421539881027568e-06, + "loss": 0.4841, + "step": 5219 + }, + { + "epoch": 0.8572660275491142, + "grad_norm": 0.34338702265752363, + "learning_rate": 8.42130983896333e-06, + "loss": 0.5255, + "step": 5220 + }, + { + "epoch": 0.8574302547574569, + "grad_norm": 0.3695909557488081, + "learning_rate": 8.421079754309808e-06, + "loss": 0.5017, + "step": 5221 + }, + { + "epoch": 0.8575944819657997, + "grad_norm": 0.26262217495334517, + "learning_rate": 8.420849627069504e-06, + "loss": 0.5295, + "step": 5222 + }, + { + "epoch": 0.8577587091741424, + "grad_norm": 0.3373161131890121, + "learning_rate": 8.420619457244915e-06, + "loss": 0.5172, + "step": 5223 + }, + { + "epoch": 0.8579229363824852, + "grad_norm": 0.4176117431308755, + "learning_rate": 8.42038924483854e-06, + "loss": 0.5118, + "step": 5224 + }, + { + "epoch": 0.8580871635908279, + "grad_norm": 0.4681792027400935, + "learning_rate": 8.420158989852881e-06, + "loss": 0.5068, + "step": 5225 + }, + { + "epoch": 0.8582513907991707, + "grad_norm": 0.2916379992376806, + "learning_rate": 8.41992869229044e-06, + "loss": 0.4982, + "step": 5226 + }, + { + "epoch": 0.8584156180075134, + "grad_norm": 0.27631425561265743, + "learning_rate": 8.419698352153715e-06, + "loss": 0.492, + "step": 5227 + }, + { + "epoch": 0.8585798452158562, + "grad_norm": 0.2795686993363181, + "learning_rate": 8.41946796944521e-06, + "loss": 0.5012, + "step": 5228 + }, + { + "epoch": 0.8587440724241989, + "grad_norm": 0.3963207725944165, + "learning_rate": 8.419237544167427e-06, + "loss": 0.4981, + "step": 5229 + }, + { + "epoch": 0.8589082996325417, + "grad_norm": 0.3328663936121557, + "learning_rate": 8.419007076322869e-06, + "loss": 0.5219, + "step": 5230 + }, + { + "epoch": 0.8590725268408843, + "grad_norm": 0.3493391789677639, + "learning_rate": 8.418776565914036e-06, + "loss": 0.5091, + "step": 5231 + }, + { + "epoch": 0.8592367540492271, + "grad_norm": 0.29098994266551437, + "learning_rate": 8.418546012943436e-06, + "loss": 0.5031, + "step": 5232 + }, + { + "epoch": 0.8594009812575698, + "grad_norm": 0.37668531359547847, + "learning_rate": 8.41831541741357e-06, + "loss": 0.5135, + "step": 5233 + }, + { + "epoch": 0.8595652084659126, + "grad_norm": 0.29737232046529355, + "learning_rate": 8.418084779326944e-06, + "loss": 0.5127, + "step": 5234 + }, + { + "epoch": 0.8597294356742553, + "grad_norm": 0.3572696957219688, + "learning_rate": 8.417854098686062e-06, + "loss": 0.5112, + "step": 5235 + }, + { + "epoch": 0.859893662882598, + "grad_norm": 0.29116505908127643, + "learning_rate": 8.41762337549343e-06, + "loss": 0.5032, + "step": 5236 + }, + { + "epoch": 0.8600578900909408, + "grad_norm": 0.3227422175765217, + "learning_rate": 8.417392609751553e-06, + "loss": 0.5192, + "step": 5237 + }, + { + "epoch": 0.8602221172992836, + "grad_norm": 0.3052186718364593, + "learning_rate": 8.417161801462939e-06, + "loss": 0.5033, + "step": 5238 + }, + { + "epoch": 0.8603863445076263, + "grad_norm": 0.2792044493744665, + "learning_rate": 8.416930950630094e-06, + "loss": 0.5113, + "step": 5239 + }, + { + "epoch": 0.860550571715969, + "grad_norm": 0.28137907424947856, + "learning_rate": 8.416700057255524e-06, + "loss": 0.485, + "step": 5240 + }, + { + "epoch": 0.8607147989243118, + "grad_norm": 0.3848989139038607, + "learning_rate": 8.416469121341739e-06, + "loss": 0.5146, + "step": 5241 + }, + { + "epoch": 0.8608790261326545, + "grad_norm": 0.3647020656817828, + "learning_rate": 8.416238142891246e-06, + "loss": 0.5123, + "step": 5242 + }, + { + "epoch": 0.8610432533409973, + "grad_norm": 0.30122311372081473, + "learning_rate": 8.416007121906553e-06, + "loss": 0.5191, + "step": 5243 + }, + { + "epoch": 0.86120748054934, + "grad_norm": 0.3073748720310311, + "learning_rate": 8.41577605839017e-06, + "loss": 0.5236, + "step": 5244 + }, + { + "epoch": 0.8613717077576828, + "grad_norm": 0.2905245125656662, + "learning_rate": 8.415544952344607e-06, + "loss": 0.4822, + "step": 5245 + }, + { + "epoch": 0.8615359349660255, + "grad_norm": 0.30468902373544227, + "learning_rate": 8.415313803772374e-06, + "loss": 0.4926, + "step": 5246 + }, + { + "epoch": 0.8617001621743683, + "grad_norm": 0.31458419828369694, + "learning_rate": 8.415082612675979e-06, + "loss": 0.5063, + "step": 5247 + }, + { + "epoch": 0.8618643893827109, + "grad_norm": 0.30256781957016937, + "learning_rate": 8.414851379057936e-06, + "loss": 0.5143, + "step": 5248 + }, + { + "epoch": 0.8620286165910537, + "grad_norm": 0.3512098795701238, + "learning_rate": 8.414620102920755e-06, + "loss": 0.5257, + "step": 5249 + }, + { + "epoch": 0.8621928437993964, + "grad_norm": 0.33016065512395715, + "learning_rate": 8.414388784266948e-06, + "loss": 0.5003, + "step": 5250 + }, + { + "epoch": 0.8623570710077392, + "grad_norm": 0.349765377162814, + "learning_rate": 8.41415742309903e-06, + "loss": 0.5098, + "step": 5251 + }, + { + "epoch": 0.8625212982160819, + "grad_norm": 0.3232602041674713, + "learning_rate": 8.413926019419508e-06, + "loss": 0.5323, + "step": 5252 + }, + { + "epoch": 0.8626855254244247, + "grad_norm": 0.33022614357718966, + "learning_rate": 8.4136945732309e-06, + "loss": 0.504, + "step": 5253 + }, + { + "epoch": 0.8628497526327674, + "grad_norm": 0.40419642120733285, + "learning_rate": 8.413463084535718e-06, + "loss": 0.5134, + "step": 5254 + }, + { + "epoch": 0.8630139798411102, + "grad_norm": 0.2947040903999055, + "learning_rate": 8.413231553336478e-06, + "loss": 0.52, + "step": 5255 + }, + { + "epoch": 0.8631782070494529, + "grad_norm": 0.3128924063031786, + "learning_rate": 8.412999979635692e-06, + "loss": 0.5085, + "step": 5256 + }, + { + "epoch": 0.8633424342577957, + "grad_norm": 0.34440429823573077, + "learning_rate": 8.412768363435875e-06, + "loss": 0.4934, + "step": 5257 + }, + { + "epoch": 0.8635066614661384, + "grad_norm": 0.3223141456692689, + "learning_rate": 8.412536704739547e-06, + "loss": 0.506, + "step": 5258 + }, + { + "epoch": 0.8636708886744812, + "grad_norm": 0.3620705564585329, + "learning_rate": 8.41230500354922e-06, + "loss": 0.5231, + "step": 5259 + }, + { + "epoch": 0.8638351158828239, + "grad_norm": 0.40002625330809766, + "learning_rate": 8.41207325986741e-06, + "loss": 0.5201, + "step": 5260 + }, + { + "epoch": 0.8639993430911667, + "grad_norm": 0.3658458873894559, + "learning_rate": 8.411841473696637e-06, + "loss": 0.5202, + "step": 5261 + }, + { + "epoch": 0.8641635702995094, + "grad_norm": 0.4622292620162486, + "learning_rate": 8.411609645039415e-06, + "loss": 0.4848, + "step": 5262 + }, + { + "epoch": 0.8643277975078522, + "grad_norm": 0.34047389699532343, + "learning_rate": 8.411377773898267e-06, + "loss": 0.5357, + "step": 5263 + }, + { + "epoch": 0.8644920247161949, + "grad_norm": 0.33676241125712014, + "learning_rate": 8.411145860275706e-06, + "loss": 0.4958, + "step": 5264 + }, + { + "epoch": 0.8646562519245375, + "grad_norm": 0.31135847431054403, + "learning_rate": 8.410913904174252e-06, + "loss": 0.5062, + "step": 5265 + }, + { + "epoch": 0.8648204791328803, + "grad_norm": 0.39107972410690034, + "learning_rate": 8.410681905596426e-06, + "loss": 0.5082, + "step": 5266 + }, + { + "epoch": 0.864984706341223, + "grad_norm": 0.3025022980375748, + "learning_rate": 8.410449864544748e-06, + "loss": 0.5117, + "step": 5267 + }, + { + "epoch": 0.8651489335495658, + "grad_norm": 0.27122718992549955, + "learning_rate": 8.410217781021736e-06, + "loss": 0.5079, + "step": 5268 + }, + { + "epoch": 0.8653131607579085, + "grad_norm": 0.33819644861136894, + "learning_rate": 8.409985655029912e-06, + "loss": 0.5044, + "step": 5269 + }, + { + "epoch": 0.8654773879662513, + "grad_norm": 0.3125457178886905, + "learning_rate": 8.409753486571795e-06, + "loss": 0.4664, + "step": 5270 + }, + { + "epoch": 0.865641615174594, + "grad_norm": 0.3283184967907039, + "learning_rate": 8.409521275649912e-06, + "loss": 0.5255, + "step": 5271 + }, + { + "epoch": 0.8658058423829368, + "grad_norm": 0.35440138371082225, + "learning_rate": 8.40928902226678e-06, + "loss": 0.5087, + "step": 5272 + }, + { + "epoch": 0.8659700695912795, + "grad_norm": 1.3917466044548144, + "learning_rate": 8.409056726424922e-06, + "loss": 0.4998, + "step": 5273 + }, + { + "epoch": 0.8661342967996223, + "grad_norm": 0.2904222163965924, + "learning_rate": 8.408824388126863e-06, + "loss": 0.4879, + "step": 5274 + }, + { + "epoch": 0.866298524007965, + "grad_norm": 0.3338819728127975, + "learning_rate": 8.408592007375125e-06, + "loss": 0.5197, + "step": 5275 + }, + { + "epoch": 0.8664627512163078, + "grad_norm": 0.3069337778776626, + "learning_rate": 8.408359584172234e-06, + "loss": 0.4761, + "step": 5276 + }, + { + "epoch": 0.8666269784246505, + "grad_norm": 0.3198664615484879, + "learning_rate": 8.40812711852071e-06, + "loss": 0.5159, + "step": 5277 + }, + { + "epoch": 0.8667912056329933, + "grad_norm": 0.2928415185108605, + "learning_rate": 8.407894610423082e-06, + "loss": 0.4881, + "step": 5278 + }, + { + "epoch": 0.866955432841336, + "grad_norm": 0.2942661306167567, + "learning_rate": 8.407662059881872e-06, + "loss": 0.5326, + "step": 5279 + }, + { + "epoch": 0.8671196600496788, + "grad_norm": 0.33386555344083646, + "learning_rate": 8.407429466899608e-06, + "loss": 0.5147, + "step": 5280 + }, + { + "epoch": 0.8672838872580215, + "grad_norm": 0.35711923571285464, + "learning_rate": 8.407196831478817e-06, + "loss": 0.5014, + "step": 5281 + }, + { + "epoch": 0.8674481144663642, + "grad_norm": 0.3719107496609669, + "learning_rate": 8.406964153622023e-06, + "loss": 0.5145, + "step": 5282 + }, + { + "epoch": 0.8676123416747069, + "grad_norm": 0.336945720581192, + "learning_rate": 8.406731433331756e-06, + "loss": 0.5096, + "step": 5283 + }, + { + "epoch": 0.8677765688830497, + "grad_norm": 0.3087115536368896, + "learning_rate": 8.40649867061054e-06, + "loss": 0.5227, + "step": 5284 + }, + { + "epoch": 0.8679407960913924, + "grad_norm": 0.47804182837248027, + "learning_rate": 8.406265865460905e-06, + "loss": 0.5085, + "step": 5285 + }, + { + "epoch": 0.8681050232997352, + "grad_norm": 0.3378849373597841, + "learning_rate": 8.406033017885381e-06, + "loss": 0.4883, + "step": 5286 + }, + { + "epoch": 0.8682692505080779, + "grad_norm": 0.4126831812504061, + "learning_rate": 8.405800127886493e-06, + "loss": 0.5181, + "step": 5287 + }, + { + "epoch": 0.8684334777164207, + "grad_norm": 0.27899762274396317, + "learning_rate": 8.405567195466775e-06, + "loss": 0.5227, + "step": 5288 + }, + { + "epoch": 0.8685977049247634, + "grad_norm": 0.33191781981524426, + "learning_rate": 8.405334220628754e-06, + "loss": 0.4971, + "step": 5289 + }, + { + "epoch": 0.8687619321331062, + "grad_norm": 0.29662629916763467, + "learning_rate": 8.405101203374962e-06, + "loss": 0.5116, + "step": 5290 + }, + { + "epoch": 0.8689261593414489, + "grad_norm": 0.32905380917085053, + "learning_rate": 8.404868143707927e-06, + "loss": 0.5111, + "step": 5291 + }, + { + "epoch": 0.8690903865497916, + "grad_norm": 0.3228454402336175, + "learning_rate": 8.404635041630184e-06, + "loss": 0.533, + "step": 5292 + }, + { + "epoch": 0.8692546137581344, + "grad_norm": 0.3159360456674922, + "learning_rate": 8.404401897144262e-06, + "loss": 0.5142, + "step": 5293 + }, + { + "epoch": 0.8694188409664771, + "grad_norm": 0.31946115452138973, + "learning_rate": 8.404168710252692e-06, + "loss": 0.5095, + "step": 5294 + }, + { + "epoch": 0.8695830681748199, + "grad_norm": 0.7962735316090845, + "learning_rate": 8.403935480958011e-06, + "loss": 0.5373, + "step": 5295 + }, + { + "epoch": 0.8697472953831626, + "grad_norm": 1.2910079032447723, + "learning_rate": 8.40370220926275e-06, + "loss": 0.4944, + "step": 5296 + }, + { + "epoch": 0.8699115225915054, + "grad_norm": 0.3078680638961182, + "learning_rate": 8.40346889516944e-06, + "loss": 0.4875, + "step": 5297 + }, + { + "epoch": 0.8700757497998481, + "grad_norm": 0.35891347383172, + "learning_rate": 8.40323553868062e-06, + "loss": 0.5185, + "step": 5298 + }, + { + "epoch": 0.8702399770081908, + "grad_norm": 0.5653737358384439, + "learning_rate": 8.40300213979882e-06, + "loss": 0.51, + "step": 5299 + }, + { + "epoch": 0.8704042042165335, + "grad_norm": 0.32168689984896887, + "learning_rate": 8.402768698526577e-06, + "loss": 0.5194, + "step": 5300 + }, + { + "epoch": 0.8705684314248763, + "grad_norm": 0.3381898859178391, + "learning_rate": 8.402535214866426e-06, + "loss": 0.5038, + "step": 5301 + }, + { + "epoch": 0.870732658633219, + "grad_norm": 0.34810336533021025, + "learning_rate": 8.402301688820903e-06, + "loss": 0.5198, + "step": 5302 + }, + { + "epoch": 0.8708968858415618, + "grad_norm": 0.3556606225523487, + "learning_rate": 8.402068120392545e-06, + "loss": 0.51, + "step": 5303 + }, + { + "epoch": 0.8710611130499045, + "grad_norm": 0.2772688917845348, + "learning_rate": 8.401834509583889e-06, + "loss": 0.5163, + "step": 5304 + }, + { + "epoch": 0.8712253402582473, + "grad_norm": 0.3070635849318607, + "learning_rate": 8.40160085639747e-06, + "loss": 0.5107, + "step": 5305 + }, + { + "epoch": 0.87138956746659, + "grad_norm": 0.35390726184331905, + "learning_rate": 8.401367160835826e-06, + "loss": 0.5043, + "step": 5306 + }, + { + "epoch": 0.8715537946749328, + "grad_norm": 0.33187000412444934, + "learning_rate": 8.401133422901497e-06, + "loss": 0.5116, + "step": 5307 + }, + { + "epoch": 0.8717180218832755, + "grad_norm": 0.40659711524377584, + "learning_rate": 8.400899642597022e-06, + "loss": 0.5036, + "step": 5308 + }, + { + "epoch": 0.8718822490916183, + "grad_norm": 0.3851914077124121, + "learning_rate": 8.400665819924938e-06, + "loss": 0.5056, + "step": 5309 + }, + { + "epoch": 0.872046476299961, + "grad_norm": 0.32665644167454533, + "learning_rate": 8.400431954887785e-06, + "loss": 0.513, + "step": 5310 + }, + { + "epoch": 0.8722107035083038, + "grad_norm": 0.41519115858739336, + "learning_rate": 8.400198047488105e-06, + "loss": 0.5219, + "step": 5311 + }, + { + "epoch": 0.8723749307166465, + "grad_norm": 0.3552816056869143, + "learning_rate": 8.399964097728436e-06, + "loss": 0.516, + "step": 5312 + }, + { + "epoch": 0.8725391579249893, + "grad_norm": 0.2861691798039096, + "learning_rate": 8.39973010561132e-06, + "loss": 0.4861, + "step": 5313 + }, + { + "epoch": 0.872703385133332, + "grad_norm": 0.32524768948820126, + "learning_rate": 8.399496071139298e-06, + "loss": 0.4899, + "step": 5314 + }, + { + "epoch": 0.8728676123416748, + "grad_norm": 0.34057732140952096, + "learning_rate": 8.39926199431491e-06, + "loss": 0.508, + "step": 5315 + }, + { + "epoch": 0.8730318395500174, + "grad_norm": 0.4133044624728016, + "learning_rate": 8.399027875140703e-06, + "loss": 0.4852, + "step": 5316 + }, + { + "epoch": 0.8731960667583601, + "grad_norm": 0.3126250586763464, + "learning_rate": 8.398793713619218e-06, + "loss": 0.4831, + "step": 5317 + }, + { + "epoch": 0.8733602939667029, + "grad_norm": 0.35913370774447195, + "learning_rate": 8.398559509752995e-06, + "loss": 0.5068, + "step": 5318 + }, + { + "epoch": 0.8735245211750456, + "grad_norm": 0.3425673312783436, + "learning_rate": 8.398325263544582e-06, + "loss": 0.5124, + "step": 5319 + }, + { + "epoch": 0.8736887483833884, + "grad_norm": 0.31258714176169194, + "learning_rate": 8.39809097499652e-06, + "loss": 0.5134, + "step": 5320 + }, + { + "epoch": 0.8738529755917311, + "grad_norm": 0.30534161830143747, + "learning_rate": 8.397856644111356e-06, + "loss": 0.5052, + "step": 5321 + }, + { + "epoch": 0.8740172028000739, + "grad_norm": 0.3471453883343114, + "learning_rate": 8.397622270891632e-06, + "loss": 0.475, + "step": 5322 + }, + { + "epoch": 0.8741814300084166, + "grad_norm": 0.30377985143302766, + "learning_rate": 8.397387855339896e-06, + "loss": 0.4997, + "step": 5323 + }, + { + "epoch": 0.8743456572167594, + "grad_norm": 0.32085464444851813, + "learning_rate": 8.397153397458694e-06, + "loss": 0.5079, + "step": 5324 + }, + { + "epoch": 0.8745098844251021, + "grad_norm": 0.3914308128149654, + "learning_rate": 8.396918897250571e-06, + "loss": 0.5051, + "step": 5325 + }, + { + "epoch": 0.8746741116334449, + "grad_norm": 0.3968388154681836, + "learning_rate": 8.396684354718076e-06, + "loss": 0.4986, + "step": 5326 + }, + { + "epoch": 0.8748383388417876, + "grad_norm": 0.2997877334792993, + "learning_rate": 8.396449769863754e-06, + "loss": 0.5121, + "step": 5327 + }, + { + "epoch": 0.8750025660501304, + "grad_norm": 0.4294864886076223, + "learning_rate": 8.396215142690154e-06, + "loss": 0.5012, + "step": 5328 + }, + { + "epoch": 0.8751667932584731, + "grad_norm": 0.31195660001293835, + "learning_rate": 8.395980473199826e-06, + "loss": 0.4977, + "step": 5329 + }, + { + "epoch": 0.8753310204668159, + "grad_norm": 0.3503078869614596, + "learning_rate": 8.395745761395314e-06, + "loss": 0.515, + "step": 5330 + }, + { + "epoch": 0.8754952476751586, + "grad_norm": 0.35642303207734, + "learning_rate": 8.395511007279172e-06, + "loss": 0.4936, + "step": 5331 + }, + { + "epoch": 0.8756594748835014, + "grad_norm": 0.37437379291643125, + "learning_rate": 8.395276210853946e-06, + "loss": 0.5064, + "step": 5332 + }, + { + "epoch": 0.875823702091844, + "grad_norm": 0.32548744308565203, + "learning_rate": 8.39504137212219e-06, + "loss": 0.5251, + "step": 5333 + }, + { + "epoch": 0.8759879293001868, + "grad_norm": 0.3368536214240922, + "learning_rate": 8.394806491086453e-06, + "loss": 0.4841, + "step": 5334 + }, + { + "epoch": 0.8761521565085295, + "grad_norm": 0.32379612932847374, + "learning_rate": 8.394571567749283e-06, + "loss": 0.519, + "step": 5335 + }, + { + "epoch": 0.8763163837168723, + "grad_norm": 0.254907543345491, + "learning_rate": 8.394336602113235e-06, + "loss": 0.4934, + "step": 5336 + }, + { + "epoch": 0.876480610925215, + "grad_norm": 0.292578050848311, + "learning_rate": 8.39410159418086e-06, + "loss": 0.5038, + "step": 5337 + }, + { + "epoch": 0.8766448381335578, + "grad_norm": 0.33068895234319823, + "learning_rate": 8.393866543954713e-06, + "loss": 0.4894, + "step": 5338 + }, + { + "epoch": 0.8768090653419005, + "grad_norm": 0.3022738707416738, + "learning_rate": 8.393631451437341e-06, + "loss": 0.495, + "step": 5339 + }, + { + "epoch": 0.8769732925502433, + "grad_norm": 0.3309924116679411, + "learning_rate": 8.393396316631301e-06, + "loss": 0.4907, + "step": 5340 + }, + { + "epoch": 0.877137519758586, + "grad_norm": 0.3072778428978678, + "learning_rate": 8.393161139539147e-06, + "loss": 0.5083, + "step": 5341 + }, + { + "epoch": 0.8773017469669288, + "grad_norm": 0.4056601024135022, + "learning_rate": 8.392925920163433e-06, + "loss": 0.494, + "step": 5342 + }, + { + "epoch": 0.8774659741752715, + "grad_norm": 0.3292865631147971, + "learning_rate": 8.392690658506713e-06, + "loss": 0.477, + "step": 5343 + }, + { + "epoch": 0.8776302013836143, + "grad_norm": 0.3173888394810557, + "learning_rate": 8.392455354571542e-06, + "loss": 0.5042, + "step": 5344 + }, + { + "epoch": 0.877794428591957, + "grad_norm": 0.41650203746899644, + "learning_rate": 8.392220008360478e-06, + "loss": 0.5128, + "step": 5345 + }, + { + "epoch": 0.8779586558002997, + "grad_norm": 0.3128633850645757, + "learning_rate": 8.391984619876073e-06, + "loss": 0.5117, + "step": 5346 + }, + { + "epoch": 0.8781228830086425, + "grad_norm": 0.3739198066157396, + "learning_rate": 8.391749189120889e-06, + "loss": 0.4983, + "step": 5347 + }, + { + "epoch": 0.8782871102169852, + "grad_norm": 0.3270565163589913, + "learning_rate": 8.391513716097476e-06, + "loss": 0.5094, + "step": 5348 + }, + { + "epoch": 0.878451337425328, + "grad_norm": 0.36481461717522407, + "learning_rate": 8.391278200808398e-06, + "loss": 0.4884, + "step": 5349 + }, + { + "epoch": 0.8786155646336706, + "grad_norm": 0.3380444447356424, + "learning_rate": 8.39104264325621e-06, + "loss": 0.4973, + "step": 5350 + }, + { + "epoch": 0.8787797918420134, + "grad_norm": 0.3226705104713618, + "learning_rate": 8.390807043443468e-06, + "loss": 0.5139, + "step": 5351 + }, + { + "epoch": 0.8789440190503561, + "grad_norm": 0.3562909994144863, + "learning_rate": 8.390571401372737e-06, + "loss": 0.5125, + "step": 5352 + }, + { + "epoch": 0.8791082462586989, + "grad_norm": 0.3465108100828412, + "learning_rate": 8.39033571704657e-06, + "loss": 0.5038, + "step": 5353 + }, + { + "epoch": 0.8792724734670416, + "grad_norm": 0.46935542963470755, + "learning_rate": 8.390099990467531e-06, + "loss": 0.5063, + "step": 5354 + }, + { + "epoch": 0.8794367006753844, + "grad_norm": 0.34137317758615526, + "learning_rate": 8.389864221638179e-06, + "loss": 0.514, + "step": 5355 + }, + { + "epoch": 0.8796009278837271, + "grad_norm": 0.29825938926351886, + "learning_rate": 8.389628410561074e-06, + "loss": 0.5368, + "step": 5356 + }, + { + "epoch": 0.8797651550920699, + "grad_norm": 0.32769587710392506, + "learning_rate": 8.389392557238777e-06, + "loss": 0.5395, + "step": 5357 + }, + { + "epoch": 0.8799293823004126, + "grad_norm": 0.3103698388540546, + "learning_rate": 8.389156661673851e-06, + "loss": 0.4951, + "step": 5358 + }, + { + "epoch": 0.8800936095087554, + "grad_norm": 0.31391936442953045, + "learning_rate": 8.388920723868858e-06, + "loss": 0.5139, + "step": 5359 + }, + { + "epoch": 0.8802578367170981, + "grad_norm": 0.30702069935676596, + "learning_rate": 8.388684743826358e-06, + "loss": 0.4914, + "step": 5360 + }, + { + "epoch": 0.8804220639254409, + "grad_norm": 0.287494636924883, + "learning_rate": 8.388448721548916e-06, + "loss": 0.5036, + "step": 5361 + }, + { + "epoch": 0.8805862911337836, + "grad_norm": 0.3079894511670955, + "learning_rate": 8.388212657039097e-06, + "loss": 0.4936, + "step": 5362 + }, + { + "epoch": 0.8807505183421264, + "grad_norm": 0.3360880269155661, + "learning_rate": 8.387976550299462e-06, + "loss": 0.5035, + "step": 5363 + }, + { + "epoch": 0.8809147455504691, + "grad_norm": 0.5102037605942567, + "learning_rate": 8.387740401332574e-06, + "loss": 0.5013, + "step": 5364 + }, + { + "epoch": 0.8810789727588119, + "grad_norm": 0.3313606109020479, + "learning_rate": 8.387504210141003e-06, + "loss": 0.4879, + "step": 5365 + }, + { + "epoch": 0.8812431999671546, + "grad_norm": 0.3999436482907233, + "learning_rate": 8.387267976727312e-06, + "loss": 0.5208, + "step": 5366 + }, + { + "epoch": 0.8814074271754972, + "grad_norm": 0.3326458033025108, + "learning_rate": 8.387031701094066e-06, + "loss": 0.5074, + "step": 5367 + }, + { + "epoch": 0.88157165438384, + "grad_norm": 0.34094614319822397, + "learning_rate": 8.386795383243828e-06, + "loss": 0.5159, + "step": 5368 + }, + { + "epoch": 0.8817358815921827, + "grad_norm": 0.3706442446081152, + "learning_rate": 8.386559023179172e-06, + "loss": 0.4928, + "step": 5369 + }, + { + "epoch": 0.8819001088005255, + "grad_norm": 0.35070846954591484, + "learning_rate": 8.38632262090266e-06, + "loss": 0.5259, + "step": 5370 + }, + { + "epoch": 0.8820643360088682, + "grad_norm": 0.3703793978322178, + "learning_rate": 8.386086176416859e-06, + "loss": 0.521, + "step": 5371 + }, + { + "epoch": 0.882228563217211, + "grad_norm": 0.3957430663808368, + "learning_rate": 8.38584968972434e-06, + "loss": 0.5108, + "step": 5372 + }, + { + "epoch": 0.8823927904255537, + "grad_norm": 0.3077741513865519, + "learning_rate": 8.385613160827672e-06, + "loss": 0.5007, + "step": 5373 + }, + { + "epoch": 0.8825570176338965, + "grad_norm": 0.32405883564522314, + "learning_rate": 8.385376589729419e-06, + "loss": 0.4945, + "step": 5374 + }, + { + "epoch": 0.8827212448422392, + "grad_norm": 0.32194303637237365, + "learning_rate": 8.385139976432155e-06, + "loss": 0.5216, + "step": 5375 + }, + { + "epoch": 0.882885472050582, + "grad_norm": 0.34449583253230837, + "learning_rate": 8.384903320938449e-06, + "loss": 0.5024, + "step": 5376 + }, + { + "epoch": 0.8830496992589247, + "grad_norm": 0.3035329723939267, + "learning_rate": 8.38466662325087e-06, + "loss": 0.5021, + "step": 5377 + }, + { + "epoch": 0.8832139264672675, + "grad_norm": 0.34319045438113116, + "learning_rate": 8.384429883371989e-06, + "loss": 0.5162, + "step": 5378 + }, + { + "epoch": 0.8833781536756102, + "grad_norm": 0.36034984090698774, + "learning_rate": 8.384193101304377e-06, + "loss": 0.5028, + "step": 5379 + }, + { + "epoch": 0.883542380883953, + "grad_norm": 0.33601207342080214, + "learning_rate": 8.383956277050609e-06, + "loss": 0.4913, + "step": 5380 + }, + { + "epoch": 0.8837066080922957, + "grad_norm": 0.394729287186722, + "learning_rate": 8.383719410613254e-06, + "loss": 0.5089, + "step": 5381 + }, + { + "epoch": 0.8838708353006385, + "grad_norm": 0.33148220436620357, + "learning_rate": 8.383482501994884e-06, + "loss": 0.4929, + "step": 5382 + }, + { + "epoch": 0.8840350625089812, + "grad_norm": 0.4144935425174578, + "learning_rate": 8.383245551198074e-06, + "loss": 0.5209, + "step": 5383 + }, + { + "epoch": 0.8841992897173239, + "grad_norm": 0.29606041865345, + "learning_rate": 8.383008558225395e-06, + "loss": 0.4981, + "step": 5384 + }, + { + "epoch": 0.8843635169256666, + "grad_norm": 0.31134244297032126, + "learning_rate": 8.382771523079424e-06, + "loss": 0.4992, + "step": 5385 + }, + { + "epoch": 0.8845277441340094, + "grad_norm": 0.3370958117903781, + "learning_rate": 8.382534445762735e-06, + "loss": 0.4937, + "step": 5386 + }, + { + "epoch": 0.8846919713423521, + "grad_norm": 0.3001698439254947, + "learning_rate": 8.3822973262779e-06, + "loss": 0.5329, + "step": 5387 + }, + { + "epoch": 0.8848561985506949, + "grad_norm": 0.30084477565026285, + "learning_rate": 8.382060164627499e-06, + "loss": 0.5059, + "step": 5388 + }, + { + "epoch": 0.8850204257590376, + "grad_norm": 0.3346426570065145, + "learning_rate": 8.381822960814102e-06, + "loss": 0.5013, + "step": 5389 + }, + { + "epoch": 0.8851846529673804, + "grad_norm": 0.29458072214294884, + "learning_rate": 8.381585714840291e-06, + "loss": 0.505, + "step": 5390 + }, + { + "epoch": 0.8853488801757231, + "grad_norm": 0.3378818988362336, + "learning_rate": 8.38134842670864e-06, + "loss": 0.5187, + "step": 5391 + }, + { + "epoch": 0.8855131073840659, + "grad_norm": 0.35910179479581733, + "learning_rate": 8.381111096421725e-06, + "loss": 0.5095, + "step": 5392 + }, + { + "epoch": 0.8856773345924086, + "grad_norm": 0.31216769263690824, + "learning_rate": 8.380873723982126e-06, + "loss": 0.5111, + "step": 5393 + }, + { + "epoch": 0.8858415618007514, + "grad_norm": 0.4289924367030972, + "learning_rate": 8.380636309392419e-06, + "loss": 0.4956, + "step": 5394 + }, + { + "epoch": 0.8860057890090941, + "grad_norm": 0.4956517822724945, + "learning_rate": 8.380398852655184e-06, + "loss": 0.4938, + "step": 5395 + }, + { + "epoch": 0.8861700162174369, + "grad_norm": 0.28831824779869475, + "learning_rate": 8.380161353773e-06, + "loss": 0.5125, + "step": 5396 + }, + { + "epoch": 0.8863342434257796, + "grad_norm": 0.3097895844490996, + "learning_rate": 8.379923812748447e-06, + "loss": 0.513, + "step": 5397 + }, + { + "epoch": 0.8864984706341223, + "grad_norm": 0.3296849530356679, + "learning_rate": 8.379686229584103e-06, + "loss": 0.4992, + "step": 5398 + }, + { + "epoch": 0.8866626978424651, + "grad_norm": 0.40568229779401993, + "learning_rate": 8.37944860428255e-06, + "loss": 0.5006, + "step": 5399 + }, + { + "epoch": 0.8868269250508078, + "grad_norm": 0.34325966725882706, + "learning_rate": 8.379210936846368e-06, + "loss": 0.4764, + "step": 5400 + }, + { + "epoch": 0.8869911522591505, + "grad_norm": 0.4758915504276449, + "learning_rate": 8.378973227278139e-06, + "loss": 0.5055, + "step": 5401 + }, + { + "epoch": 0.8871553794674932, + "grad_norm": 0.297381855982727, + "learning_rate": 8.378735475580444e-06, + "loss": 0.498, + "step": 5402 + }, + { + "epoch": 0.887319606675836, + "grad_norm": 0.4933418017370094, + "learning_rate": 8.378497681755865e-06, + "loss": 0.4995, + "step": 5403 + }, + { + "epoch": 0.8874838338841787, + "grad_norm": 0.3983723166968519, + "learning_rate": 8.378259845806986e-06, + "loss": 0.5365, + "step": 5404 + }, + { + "epoch": 0.8876480610925215, + "grad_norm": 0.3143111854125081, + "learning_rate": 8.378021967736388e-06, + "loss": 0.5197, + "step": 5405 + }, + { + "epoch": 0.8878122883008642, + "grad_norm": 0.3801638714521817, + "learning_rate": 8.377784047546657e-06, + "loss": 0.4944, + "step": 5406 + }, + { + "epoch": 0.887976515509207, + "grad_norm": 0.34407817181065525, + "learning_rate": 8.377546085240376e-06, + "loss": 0.4994, + "step": 5407 + }, + { + "epoch": 0.8881407427175497, + "grad_norm": 0.3133043465281556, + "learning_rate": 8.37730808082013e-06, + "loss": 0.4759, + "step": 5408 + }, + { + "epoch": 0.8883049699258925, + "grad_norm": 0.2965789041034004, + "learning_rate": 8.377070034288505e-06, + "loss": 0.5046, + "step": 5409 + }, + { + "epoch": 0.8884691971342352, + "grad_norm": 0.326618198478491, + "learning_rate": 8.376831945648081e-06, + "loss": 0.5124, + "step": 5410 + }, + { + "epoch": 0.888633424342578, + "grad_norm": 0.3053859757388855, + "learning_rate": 8.37659381490145e-06, + "loss": 0.4893, + "step": 5411 + }, + { + "epoch": 0.8887976515509207, + "grad_norm": 0.2954348959691266, + "learning_rate": 8.376355642051196e-06, + "loss": 0.5098, + "step": 5412 + }, + { + "epoch": 0.8889618787592635, + "grad_norm": 0.3135215141770848, + "learning_rate": 8.376117427099907e-06, + "loss": 0.4979, + "step": 5413 + }, + { + "epoch": 0.8891261059676062, + "grad_norm": 0.3271102608994896, + "learning_rate": 8.375879170050167e-06, + "loss": 0.5178, + "step": 5414 + }, + { + "epoch": 0.889290333175949, + "grad_norm": 0.38229922813402745, + "learning_rate": 8.375640870904568e-06, + "loss": 0.5232, + "step": 5415 + }, + { + "epoch": 0.8894545603842917, + "grad_norm": 0.3797870401260684, + "learning_rate": 8.375402529665694e-06, + "loss": 0.4986, + "step": 5416 + }, + { + "epoch": 0.8896187875926345, + "grad_norm": 0.31200316783816984, + "learning_rate": 8.375164146336137e-06, + "loss": 0.5241, + "step": 5417 + }, + { + "epoch": 0.8897830148009771, + "grad_norm": 0.3064704900631936, + "learning_rate": 8.374925720918485e-06, + "loss": 0.4955, + "step": 5418 + }, + { + "epoch": 0.8899472420093198, + "grad_norm": 0.3058769827785828, + "learning_rate": 8.374687253415326e-06, + "loss": 0.4883, + "step": 5419 + }, + { + "epoch": 0.8901114692176626, + "grad_norm": 0.2843411368389142, + "learning_rate": 8.374448743829252e-06, + "loss": 0.5217, + "step": 5420 + }, + { + "epoch": 0.8902756964260053, + "grad_norm": 0.3728611557769457, + "learning_rate": 8.37421019216285e-06, + "loss": 0.4938, + "step": 5421 + }, + { + "epoch": 0.8904399236343481, + "grad_norm": 0.3011377477606396, + "learning_rate": 8.373971598418717e-06, + "loss": 0.5077, + "step": 5422 + }, + { + "epoch": 0.8906041508426908, + "grad_norm": 0.36206134781906424, + "learning_rate": 8.373732962599441e-06, + "loss": 0.494, + "step": 5423 + }, + { + "epoch": 0.8907683780510336, + "grad_norm": 0.3908084739505783, + "learning_rate": 8.373494284707613e-06, + "loss": 0.5201, + "step": 5424 + }, + { + "epoch": 0.8909326052593763, + "grad_norm": 0.2749425323407986, + "learning_rate": 8.373255564745824e-06, + "loss": 0.4962, + "step": 5425 + }, + { + "epoch": 0.8910968324677191, + "grad_norm": 0.3303930882629094, + "learning_rate": 8.373016802716673e-06, + "loss": 0.5098, + "step": 5426 + }, + { + "epoch": 0.8912610596760618, + "grad_norm": 0.2998241441933591, + "learning_rate": 8.372777998622745e-06, + "loss": 0.4869, + "step": 5427 + }, + { + "epoch": 0.8914252868844046, + "grad_norm": 0.29451950284439987, + "learning_rate": 8.37253915246664e-06, + "loss": 0.5264, + "step": 5428 + }, + { + "epoch": 0.8915895140927473, + "grad_norm": 0.3063418491635172, + "learning_rate": 8.37230026425095e-06, + "loss": 0.5038, + "step": 5429 + }, + { + "epoch": 0.8917537413010901, + "grad_norm": 0.2996944732030016, + "learning_rate": 8.372061333978266e-06, + "loss": 0.5014, + "step": 5430 + }, + { + "epoch": 0.8919179685094328, + "grad_norm": 0.3424116018767127, + "learning_rate": 8.37182236165119e-06, + "loss": 0.5089, + "step": 5431 + }, + { + "epoch": 0.8920821957177756, + "grad_norm": 0.29079573719915974, + "learning_rate": 8.371583347272314e-06, + "loss": 0.5219, + "step": 5432 + }, + { + "epoch": 0.8922464229261183, + "grad_norm": 0.3304927425490326, + "learning_rate": 8.37134429084423e-06, + "loss": 0.5338, + "step": 5433 + }, + { + "epoch": 0.8924106501344611, + "grad_norm": 0.4902664749685111, + "learning_rate": 8.371105192369541e-06, + "loss": 0.5078, + "step": 5434 + }, + { + "epoch": 0.8925748773428037, + "grad_norm": 0.39179044375461103, + "learning_rate": 8.37086605185084e-06, + "loss": 0.514, + "step": 5435 + }, + { + "epoch": 0.8927391045511465, + "grad_norm": 0.3111137817750461, + "learning_rate": 8.370626869290725e-06, + "loss": 0.4722, + "step": 5436 + }, + { + "epoch": 0.8929033317594892, + "grad_norm": 0.30758120367970715, + "learning_rate": 8.370387644691796e-06, + "loss": 0.495, + "step": 5437 + }, + { + "epoch": 0.893067558967832, + "grad_norm": 0.3982671596775974, + "learning_rate": 8.370148378056647e-06, + "loss": 0.5242, + "step": 5438 + }, + { + "epoch": 0.8932317861761747, + "grad_norm": 0.27220767951031977, + "learning_rate": 8.369909069387879e-06, + "loss": 0.5019, + "step": 5439 + }, + { + "epoch": 0.8933960133845175, + "grad_norm": 0.3798596024471178, + "learning_rate": 8.369669718688093e-06, + "loss": 0.5175, + "step": 5440 + }, + { + "epoch": 0.8935602405928602, + "grad_norm": 0.46454027331217806, + "learning_rate": 8.369430325959884e-06, + "loss": 0.5056, + "step": 5441 + }, + { + "epoch": 0.893724467801203, + "grad_norm": 0.2694505235037501, + "learning_rate": 8.369190891205858e-06, + "loss": 0.5006, + "step": 5442 + }, + { + "epoch": 0.8938886950095457, + "grad_norm": 0.2990622026346461, + "learning_rate": 8.36895141442861e-06, + "loss": 0.5154, + "step": 5443 + }, + { + "epoch": 0.8940529222178885, + "grad_norm": 0.30742433459438284, + "learning_rate": 8.368711895630743e-06, + "loss": 0.5201, + "step": 5444 + }, + { + "epoch": 0.8942171494262312, + "grad_norm": 0.2877974512432656, + "learning_rate": 8.36847233481486e-06, + "loss": 0.4951, + "step": 5445 + }, + { + "epoch": 0.894381376634574, + "grad_norm": 0.3799216470015632, + "learning_rate": 8.368232731983559e-06, + "loss": 0.524, + "step": 5446 + }, + { + "epoch": 0.8945456038429167, + "grad_norm": 0.28985046386333174, + "learning_rate": 8.367993087139446e-06, + "loss": 0.4789, + "step": 5447 + }, + { + "epoch": 0.8947098310512595, + "grad_norm": 0.2770590893279945, + "learning_rate": 8.367753400285122e-06, + "loss": 0.5151, + "step": 5448 + }, + { + "epoch": 0.8948740582596022, + "grad_norm": 0.3125926454125177, + "learning_rate": 8.367513671423191e-06, + "loss": 0.5164, + "step": 5449 + }, + { + "epoch": 0.895038285467945, + "grad_norm": 0.2789030307629401, + "learning_rate": 8.367273900556256e-06, + "loss": 0.4886, + "step": 5450 + }, + { + "epoch": 0.8952025126762877, + "grad_norm": 0.3147777575769946, + "learning_rate": 8.367034087686924e-06, + "loss": 0.5288, + "step": 5451 + }, + { + "epoch": 0.8953667398846303, + "grad_norm": 0.3087500424646868, + "learning_rate": 8.366794232817795e-06, + "loss": 0.493, + "step": 5452 + }, + { + "epoch": 0.8955309670929731, + "grad_norm": 0.2880004076961026, + "learning_rate": 8.366554335951474e-06, + "loss": 0.4967, + "step": 5453 + }, + { + "epoch": 0.8956951943013158, + "grad_norm": 0.26914699822687915, + "learning_rate": 8.366314397090572e-06, + "loss": 0.4957, + "step": 5454 + }, + { + "epoch": 0.8958594215096586, + "grad_norm": 0.853277733033621, + "learning_rate": 8.36607441623769e-06, + "loss": 0.512, + "step": 5455 + }, + { + "epoch": 0.8960236487180013, + "grad_norm": 0.28789407556256835, + "learning_rate": 8.365834393395438e-06, + "loss": 0.5066, + "step": 5456 + }, + { + "epoch": 0.8961878759263441, + "grad_norm": 0.2684557559113861, + "learning_rate": 8.36559432856642e-06, + "loss": 0.5182, + "step": 5457 + }, + { + "epoch": 0.8963521031346868, + "grad_norm": 0.3644698809817585, + "learning_rate": 8.365354221753245e-06, + "loss": 0.5137, + "step": 5458 + }, + { + "epoch": 0.8965163303430296, + "grad_norm": 0.27994902706665303, + "learning_rate": 8.36511407295852e-06, + "loss": 0.4943, + "step": 5459 + }, + { + "epoch": 0.8966805575513723, + "grad_norm": 0.35579239769018195, + "learning_rate": 8.364873882184851e-06, + "loss": 0.4925, + "step": 5460 + }, + { + "epoch": 0.8968447847597151, + "grad_norm": 0.35041238641861094, + "learning_rate": 8.364633649434853e-06, + "loss": 0.5022, + "step": 5461 + }, + { + "epoch": 0.8970090119680578, + "grad_norm": 0.3580120860072093, + "learning_rate": 8.364393374711128e-06, + "loss": 0.5104, + "step": 5462 + }, + { + "epoch": 0.8971732391764006, + "grad_norm": 0.27954324217496795, + "learning_rate": 8.364153058016292e-06, + "loss": 0.5193, + "step": 5463 + }, + { + "epoch": 0.8973374663847433, + "grad_norm": 0.3101665856935753, + "learning_rate": 8.363912699352949e-06, + "loss": 0.5095, + "step": 5464 + }, + { + "epoch": 0.8975016935930861, + "grad_norm": 0.28512021161180856, + "learning_rate": 8.363672298723714e-06, + "loss": 0.4987, + "step": 5465 + }, + { + "epoch": 0.8976659208014288, + "grad_norm": 0.35211494571368435, + "learning_rate": 8.363431856131196e-06, + "loss": 0.4955, + "step": 5466 + }, + { + "epoch": 0.8978301480097716, + "grad_norm": 0.32744803108370507, + "learning_rate": 8.363191371578006e-06, + "loss": 0.5022, + "step": 5467 + }, + { + "epoch": 0.8979943752181143, + "grad_norm": 0.3249957296286912, + "learning_rate": 8.36295084506676e-06, + "loss": 0.4945, + "step": 5468 + }, + { + "epoch": 0.898158602426457, + "grad_norm": 0.28285293262085576, + "learning_rate": 8.362710276600065e-06, + "loss": 0.5094, + "step": 5469 + }, + { + "epoch": 0.8983228296347997, + "grad_norm": 0.27983567395929104, + "learning_rate": 8.362469666180536e-06, + "loss": 0.4874, + "step": 5470 + }, + { + "epoch": 0.8984870568431425, + "grad_norm": 0.397188968187713, + "learning_rate": 8.362229013810786e-06, + "loss": 0.5007, + "step": 5471 + }, + { + "epoch": 0.8986512840514852, + "grad_norm": 0.29935214259494414, + "learning_rate": 8.361988319493429e-06, + "loss": 0.4832, + "step": 5472 + }, + { + "epoch": 0.898815511259828, + "grad_norm": 0.35287779460582697, + "learning_rate": 8.36174758323108e-06, + "loss": 0.5194, + "step": 5473 + }, + { + "epoch": 0.8989797384681707, + "grad_norm": 0.3208882613202515, + "learning_rate": 8.361506805026352e-06, + "loss": 0.4964, + "step": 5474 + }, + { + "epoch": 0.8991439656765134, + "grad_norm": 0.3071412681372098, + "learning_rate": 8.361265984881862e-06, + "loss": 0.4833, + "step": 5475 + }, + { + "epoch": 0.8993081928848562, + "grad_norm": 0.2924027689988717, + "learning_rate": 8.361025122800223e-06, + "loss": 0.5363, + "step": 5476 + }, + { + "epoch": 0.8994724200931989, + "grad_norm": 0.2815585011141773, + "learning_rate": 8.360784218784054e-06, + "loss": 0.502, + "step": 5477 + }, + { + "epoch": 0.8996366473015417, + "grad_norm": 0.361696972613681, + "learning_rate": 8.360543272835968e-06, + "loss": 0.5054, + "step": 5478 + }, + { + "epoch": 0.8998008745098844, + "grad_norm": 0.2999832471190714, + "learning_rate": 8.360302284958586e-06, + "loss": 0.4908, + "step": 5479 + }, + { + "epoch": 0.8999651017182272, + "grad_norm": 0.4261624509261548, + "learning_rate": 8.360061255154521e-06, + "loss": 0.4898, + "step": 5480 + }, + { + "epoch": 0.9001293289265699, + "grad_norm": 0.2855915803003546, + "learning_rate": 8.359820183426395e-06, + "loss": 0.5164, + "step": 5481 + }, + { + "epoch": 0.9002935561349127, + "grad_norm": 0.29396206548228376, + "learning_rate": 8.359579069776822e-06, + "loss": 0.5047, + "step": 5482 + }, + { + "epoch": 0.9004577833432554, + "grad_norm": 0.3239709685752758, + "learning_rate": 8.359337914208424e-06, + "loss": 0.4881, + "step": 5483 + }, + { + "epoch": 0.9006220105515982, + "grad_norm": 0.3189529432048531, + "learning_rate": 8.35909671672382e-06, + "loss": 0.5118, + "step": 5484 + }, + { + "epoch": 0.9007862377599409, + "grad_norm": 0.3015118477260375, + "learning_rate": 8.358855477325628e-06, + "loss": 0.5252, + "step": 5485 + }, + { + "epoch": 0.9009504649682836, + "grad_norm": 0.3220454001782521, + "learning_rate": 8.35861419601647e-06, + "loss": 0.5245, + "step": 5486 + }, + { + "epoch": 0.9011146921766263, + "grad_norm": 0.29201083259315386, + "learning_rate": 8.358372872798964e-06, + "loss": 0.5238, + "step": 5487 + }, + { + "epoch": 0.9012789193849691, + "grad_norm": 0.3027181699860984, + "learning_rate": 8.358131507675735e-06, + "loss": 0.4992, + "step": 5488 + }, + { + "epoch": 0.9014431465933118, + "grad_norm": 0.3017338774888951, + "learning_rate": 8.357890100649397e-06, + "loss": 0.5002, + "step": 5489 + }, + { + "epoch": 0.9016073738016546, + "grad_norm": 0.2942002751508648, + "learning_rate": 8.357648651722582e-06, + "loss": 0.503, + "step": 5490 + }, + { + "epoch": 0.9017716010099973, + "grad_norm": 1.0776946000844136, + "learning_rate": 8.357407160897904e-06, + "loss": 0.5231, + "step": 5491 + }, + { + "epoch": 0.9019358282183401, + "grad_norm": 0.2969948419282714, + "learning_rate": 8.357165628177992e-06, + "loss": 0.5096, + "step": 5492 + }, + { + "epoch": 0.9021000554266828, + "grad_norm": 0.3085209220722856, + "learning_rate": 8.356924053565463e-06, + "loss": 0.4987, + "step": 5493 + }, + { + "epoch": 0.9022642826350256, + "grad_norm": 0.3732207561485276, + "learning_rate": 8.356682437062946e-06, + "loss": 0.5137, + "step": 5494 + }, + { + "epoch": 0.9024285098433683, + "grad_norm": 0.3006585094430494, + "learning_rate": 8.356440778673063e-06, + "loss": 0.4939, + "step": 5495 + }, + { + "epoch": 0.9025927370517111, + "grad_norm": 0.33128076755712377, + "learning_rate": 8.356199078398437e-06, + "loss": 0.4933, + "step": 5496 + }, + { + "epoch": 0.9027569642600538, + "grad_norm": 0.2931680063313428, + "learning_rate": 8.355957336241697e-06, + "loss": 0.4963, + "step": 5497 + }, + { + "epoch": 0.9029211914683966, + "grad_norm": 0.43709410205234245, + "learning_rate": 8.355715552205467e-06, + "loss": 0.5172, + "step": 5498 + }, + { + "epoch": 0.9030854186767393, + "grad_norm": 0.5518303131475227, + "learning_rate": 8.355473726292373e-06, + "loss": 0.5146, + "step": 5499 + }, + { + "epoch": 0.903249645885082, + "grad_norm": 0.35951046327730357, + "learning_rate": 8.35523185850504e-06, + "loss": 0.5053, + "step": 5500 + }, + { + "epoch": 0.9034138730934248, + "grad_norm": 0.31330180644732103, + "learning_rate": 8.354989948846096e-06, + "loss": 0.5169, + "step": 5501 + }, + { + "epoch": 0.9035781003017676, + "grad_norm": 0.2881362333682155, + "learning_rate": 8.354747997318168e-06, + "loss": 0.49, + "step": 5502 + }, + { + "epoch": 0.9037423275101102, + "grad_norm": 0.360101900220015, + "learning_rate": 8.354506003923884e-06, + "loss": 0.5072, + "step": 5503 + }, + { + "epoch": 0.9039065547184529, + "grad_norm": 0.31153369530916664, + "learning_rate": 8.354263968665873e-06, + "loss": 0.4925, + "step": 5504 + }, + { + "epoch": 0.9040707819267957, + "grad_norm": 0.37296961938079903, + "learning_rate": 8.354021891546764e-06, + "loss": 0.5135, + "step": 5505 + }, + { + "epoch": 0.9042350091351384, + "grad_norm": 0.9154141358636969, + "learning_rate": 8.353779772569184e-06, + "loss": 0.5015, + "step": 5506 + }, + { + "epoch": 0.9043992363434812, + "grad_norm": 0.296837819666743, + "learning_rate": 8.353537611735765e-06, + "loss": 0.5172, + "step": 5507 + }, + { + "epoch": 0.9045634635518239, + "grad_norm": 0.5177522705968512, + "learning_rate": 8.353295409049137e-06, + "loss": 0.5139, + "step": 5508 + }, + { + "epoch": 0.9047276907601667, + "grad_norm": 0.5204967918982784, + "learning_rate": 8.353053164511928e-06, + "loss": 0.5154, + "step": 5509 + }, + { + "epoch": 0.9048919179685094, + "grad_norm": 0.27139361175624743, + "learning_rate": 8.352810878126771e-06, + "loss": 0.4869, + "step": 5510 + }, + { + "epoch": 0.9050561451768522, + "grad_norm": 0.349204517447571, + "learning_rate": 8.352568549896298e-06, + "loss": 0.4964, + "step": 5511 + }, + { + "epoch": 0.9052203723851949, + "grad_norm": 0.3148885616058754, + "learning_rate": 8.352326179823139e-06, + "loss": 0.4921, + "step": 5512 + }, + { + "epoch": 0.9053845995935377, + "grad_norm": 0.5633992988398933, + "learning_rate": 8.352083767909929e-06, + "loss": 0.5095, + "step": 5513 + }, + { + "epoch": 0.9055488268018804, + "grad_norm": 0.3212531165331761, + "learning_rate": 8.351841314159298e-06, + "loss": 0.5034, + "step": 5514 + }, + { + "epoch": 0.9057130540102232, + "grad_norm": 0.45538311186140923, + "learning_rate": 8.351598818573881e-06, + "loss": 0.4843, + "step": 5515 + }, + { + "epoch": 0.9058772812185659, + "grad_norm": 0.2766324918567951, + "learning_rate": 8.351356281156313e-06, + "loss": 0.5163, + "step": 5516 + }, + { + "epoch": 0.9060415084269087, + "grad_norm": 0.36399910518378154, + "learning_rate": 8.351113701909225e-06, + "loss": 0.5107, + "step": 5517 + }, + { + "epoch": 0.9062057356352514, + "grad_norm": 0.2757312212678587, + "learning_rate": 8.350871080835253e-06, + "loss": 0.4934, + "step": 5518 + }, + { + "epoch": 0.9063699628435941, + "grad_norm": 0.2858011061130112, + "learning_rate": 8.350628417937031e-06, + "loss": 0.5012, + "step": 5519 + }, + { + "epoch": 0.9065341900519368, + "grad_norm": 0.4340611250040619, + "learning_rate": 8.350385713217198e-06, + "loss": 0.5228, + "step": 5520 + }, + { + "epoch": 0.9066984172602796, + "grad_norm": 0.3087231221133075, + "learning_rate": 8.350142966678389e-06, + "loss": 0.4983, + "step": 5521 + }, + { + "epoch": 0.9068626444686223, + "grad_norm": 0.3000003148489532, + "learning_rate": 8.349900178323235e-06, + "loss": 0.493, + "step": 5522 + }, + { + "epoch": 0.907026871676965, + "grad_norm": 0.32176413494489486, + "learning_rate": 8.349657348154382e-06, + "loss": 0.5153, + "step": 5523 + }, + { + "epoch": 0.9071910988853078, + "grad_norm": 0.33979379386785885, + "learning_rate": 8.34941447617446e-06, + "loss": 0.5057, + "step": 5524 + }, + { + "epoch": 0.9073553260936505, + "grad_norm": 0.2904116241815765, + "learning_rate": 8.349171562386111e-06, + "loss": 0.5207, + "step": 5525 + }, + { + "epoch": 0.9075195533019933, + "grad_norm": 0.3719685326435547, + "learning_rate": 8.348928606791971e-06, + "loss": 0.483, + "step": 5526 + }, + { + "epoch": 0.907683780510336, + "grad_norm": 0.42582862430775315, + "learning_rate": 8.348685609394678e-06, + "loss": 0.4974, + "step": 5527 + }, + { + "epoch": 0.9078480077186788, + "grad_norm": 0.37620185257746835, + "learning_rate": 8.348442570196875e-06, + "loss": 0.5158, + "step": 5528 + }, + { + "epoch": 0.9080122349270215, + "grad_norm": 0.34756089762014936, + "learning_rate": 8.348199489201198e-06, + "loss": 0.4865, + "step": 5529 + }, + { + "epoch": 0.9081764621353643, + "grad_norm": 0.3427940825007974, + "learning_rate": 8.34795636641029e-06, + "loss": 0.5037, + "step": 5530 + }, + { + "epoch": 0.908340689343707, + "grad_norm": 0.34160888168112385, + "learning_rate": 8.347713201826788e-06, + "loss": 0.5134, + "step": 5531 + }, + { + "epoch": 0.9085049165520498, + "grad_norm": 0.30878978550613373, + "learning_rate": 8.347469995453336e-06, + "loss": 0.5287, + "step": 5532 + }, + { + "epoch": 0.9086691437603925, + "grad_norm": 0.3362932220872595, + "learning_rate": 8.347226747292575e-06, + "loss": 0.5171, + "step": 5533 + }, + { + "epoch": 0.9088333709687353, + "grad_norm": 0.6424167138285161, + "learning_rate": 8.346983457347146e-06, + "loss": 0.513, + "step": 5534 + }, + { + "epoch": 0.908997598177078, + "grad_norm": 0.32394177404105495, + "learning_rate": 8.346740125619689e-06, + "loss": 0.4576, + "step": 5535 + }, + { + "epoch": 0.9091618253854207, + "grad_norm": 0.34000863262833253, + "learning_rate": 8.346496752112854e-06, + "loss": 0.5224, + "step": 5536 + }, + { + "epoch": 0.9093260525937634, + "grad_norm": 0.3247810713170286, + "learning_rate": 8.346253336829277e-06, + "loss": 0.4893, + "step": 5537 + }, + { + "epoch": 0.9094902798021062, + "grad_norm": 0.2939508899368689, + "learning_rate": 8.346009879771605e-06, + "loss": 0.51, + "step": 5538 + }, + { + "epoch": 0.9096545070104489, + "grad_norm": 0.32148698533969494, + "learning_rate": 8.345766380942483e-06, + "loss": 0.5037, + "step": 5539 + }, + { + "epoch": 0.9098187342187917, + "grad_norm": 0.28369865166116354, + "learning_rate": 8.345522840344553e-06, + "loss": 0.4952, + "step": 5540 + }, + { + "epoch": 0.9099829614271344, + "grad_norm": 0.5346924787068149, + "learning_rate": 8.345279257980461e-06, + "loss": 0.5126, + "step": 5541 + }, + { + "epoch": 0.9101471886354772, + "grad_norm": 0.32168574288406493, + "learning_rate": 8.345035633852855e-06, + "loss": 0.5086, + "step": 5542 + }, + { + "epoch": 0.9103114158438199, + "grad_norm": 0.3307362293505546, + "learning_rate": 8.344791967964377e-06, + "loss": 0.5107, + "step": 5543 + }, + { + "epoch": 0.9104756430521627, + "grad_norm": 0.3365560145974131, + "learning_rate": 8.344548260317678e-06, + "loss": 0.488, + "step": 5544 + }, + { + "epoch": 0.9106398702605054, + "grad_norm": 0.3480169970867232, + "learning_rate": 8.3443045109154e-06, + "loss": 0.4893, + "step": 5545 + }, + { + "epoch": 0.9108040974688482, + "grad_norm": 0.299993591213752, + "learning_rate": 8.344060719760193e-06, + "loss": 0.4908, + "step": 5546 + }, + { + "epoch": 0.9109683246771909, + "grad_norm": 0.2999257482434843, + "learning_rate": 8.343816886854707e-06, + "loss": 0.4876, + "step": 5547 + }, + { + "epoch": 0.9111325518855337, + "grad_norm": 0.3924513462074901, + "learning_rate": 8.343573012201584e-06, + "loss": 0.5141, + "step": 5548 + }, + { + "epoch": 0.9112967790938764, + "grad_norm": 0.3399115729754476, + "learning_rate": 8.34332909580348e-06, + "loss": 0.5082, + "step": 5549 + }, + { + "epoch": 0.9114610063022192, + "grad_norm": 0.3194858626705532, + "learning_rate": 8.343085137663037e-06, + "loss": 0.4766, + "step": 5550 + }, + { + "epoch": 0.9116252335105619, + "grad_norm": 0.47154597792158837, + "learning_rate": 8.342841137782912e-06, + "loss": 0.5031, + "step": 5551 + }, + { + "epoch": 0.9117894607189047, + "grad_norm": 0.4575805099261349, + "learning_rate": 8.342597096165748e-06, + "loss": 0.51, + "step": 5552 + }, + { + "epoch": 0.9119536879272473, + "grad_norm": 0.4071492592305457, + "learning_rate": 8.342353012814202e-06, + "loss": 0.4831, + "step": 5553 + }, + { + "epoch": 0.91211791513559, + "grad_norm": 0.3173333243154672, + "learning_rate": 8.34210888773092e-06, + "loss": 0.4953, + "step": 5554 + }, + { + "epoch": 0.9122821423439328, + "grad_norm": 0.3570551254398815, + "learning_rate": 8.341864720918558e-06, + "loss": 0.5008, + "step": 5555 + }, + { + "epoch": 0.9124463695522755, + "grad_norm": 0.3179412588931241, + "learning_rate": 8.341620512379762e-06, + "loss": 0.5091, + "step": 5556 + }, + { + "epoch": 0.9126105967606183, + "grad_norm": 0.3702640163571449, + "learning_rate": 8.341376262117189e-06, + "loss": 0.4887, + "step": 5557 + }, + { + "epoch": 0.912774823968961, + "grad_norm": 0.33515234301086044, + "learning_rate": 8.341131970133491e-06, + "loss": 0.5023, + "step": 5558 + }, + { + "epoch": 0.9129390511773038, + "grad_norm": 0.29906964223061433, + "learning_rate": 8.34088763643132e-06, + "loss": 0.4972, + "step": 5559 + }, + { + "epoch": 0.9131032783856465, + "grad_norm": 0.3263066136291013, + "learning_rate": 8.340643261013328e-06, + "loss": 0.4997, + "step": 5560 + }, + { + "epoch": 0.9132675055939893, + "grad_norm": 0.27989015681060403, + "learning_rate": 8.340398843882175e-06, + "loss": 0.5007, + "step": 5561 + }, + { + "epoch": 0.913431732802332, + "grad_norm": 0.4458218715249904, + "learning_rate": 8.340154385040511e-06, + "loss": 0.4915, + "step": 5562 + }, + { + "epoch": 0.9135959600106748, + "grad_norm": 0.30774662912355444, + "learning_rate": 8.339909884490991e-06, + "loss": 0.4783, + "step": 5563 + }, + { + "epoch": 0.9137601872190175, + "grad_norm": 0.2993000222057823, + "learning_rate": 8.339665342236275e-06, + "loss": 0.5053, + "step": 5564 + }, + { + "epoch": 0.9139244144273603, + "grad_norm": 0.3966672210293373, + "learning_rate": 8.339420758279012e-06, + "loss": 0.5183, + "step": 5565 + }, + { + "epoch": 0.914088641635703, + "grad_norm": 0.30596909940584427, + "learning_rate": 8.339176132621864e-06, + "loss": 0.494, + "step": 5566 + }, + { + "epoch": 0.9142528688440458, + "grad_norm": 0.35348430779350265, + "learning_rate": 8.338931465267485e-06, + "loss": 0.5163, + "step": 5567 + }, + { + "epoch": 0.9144170960523885, + "grad_norm": 0.27033879796104393, + "learning_rate": 8.338686756218534e-06, + "loss": 0.5094, + "step": 5568 + }, + { + "epoch": 0.9145813232607313, + "grad_norm": 0.3913309376275306, + "learning_rate": 8.338442005477667e-06, + "loss": 0.5038, + "step": 5569 + }, + { + "epoch": 0.9147455504690739, + "grad_norm": 0.28689614433794935, + "learning_rate": 8.338197213047544e-06, + "loss": 0.5067, + "step": 5570 + }, + { + "epoch": 0.9149097776774167, + "grad_norm": 0.4050157925870621, + "learning_rate": 8.337952378930823e-06, + "loss": 0.4983, + "step": 5571 + }, + { + "epoch": 0.9150740048857594, + "grad_norm": 0.2865568953025675, + "learning_rate": 8.337707503130163e-06, + "loss": 0.4963, + "step": 5572 + }, + { + "epoch": 0.9152382320941022, + "grad_norm": 0.29733174921046923, + "learning_rate": 8.337462585648224e-06, + "loss": 0.5153, + "step": 5573 + }, + { + "epoch": 0.9154024593024449, + "grad_norm": 0.3180051252198621, + "learning_rate": 8.337217626487665e-06, + "loss": 0.5151, + "step": 5574 + }, + { + "epoch": 0.9155666865107877, + "grad_norm": 0.40773067814807856, + "learning_rate": 8.336972625651149e-06, + "loss": 0.508, + "step": 5575 + }, + { + "epoch": 0.9157309137191304, + "grad_norm": 0.3232392270623946, + "learning_rate": 8.336727583141335e-06, + "loss": 0.5015, + "step": 5576 + }, + { + "epoch": 0.9158951409274732, + "grad_norm": 0.5050168357579791, + "learning_rate": 8.336482498960883e-06, + "loss": 0.5041, + "step": 5577 + }, + { + "epoch": 0.9160593681358159, + "grad_norm": 0.36104906234395595, + "learning_rate": 8.336237373112456e-06, + "loss": 0.4821, + "step": 5578 + }, + { + "epoch": 0.9162235953441586, + "grad_norm": 0.3076149792819011, + "learning_rate": 8.33599220559872e-06, + "loss": 0.5098, + "step": 5579 + }, + { + "epoch": 0.9163878225525014, + "grad_norm": 0.6724914596205441, + "learning_rate": 8.335746996422332e-06, + "loss": 0.504, + "step": 5580 + }, + { + "epoch": 0.9165520497608441, + "grad_norm": 0.42408137374830124, + "learning_rate": 8.335501745585959e-06, + "loss": 0.504, + "step": 5581 + }, + { + "epoch": 0.9167162769691869, + "grad_norm": 0.41731179466139035, + "learning_rate": 8.335256453092263e-06, + "loss": 0.5103, + "step": 5582 + }, + { + "epoch": 0.9168805041775296, + "grad_norm": 0.47722812707169454, + "learning_rate": 8.335011118943908e-06, + "loss": 0.4966, + "step": 5583 + }, + { + "epoch": 0.9170447313858724, + "grad_norm": 0.4352879061716675, + "learning_rate": 8.33476574314356e-06, + "loss": 0.4979, + "step": 5584 + }, + { + "epoch": 0.9172089585942151, + "grad_norm": 0.341870163144751, + "learning_rate": 8.334520325693881e-06, + "loss": 0.5084, + "step": 5585 + }, + { + "epoch": 0.9173731858025579, + "grad_norm": 0.5324312522979684, + "learning_rate": 8.334274866597541e-06, + "loss": 0.4798, + "step": 5586 + }, + { + "epoch": 0.9175374130109005, + "grad_norm": 0.39344851183289753, + "learning_rate": 8.334029365857202e-06, + "loss": 0.4974, + "step": 5587 + }, + { + "epoch": 0.9177016402192433, + "grad_norm": 0.46426361762009455, + "learning_rate": 8.333783823475533e-06, + "loss": 0.5035, + "step": 5588 + }, + { + "epoch": 0.917865867427586, + "grad_norm": 0.4075244707525329, + "learning_rate": 8.333538239455199e-06, + "loss": 0.48, + "step": 5589 + }, + { + "epoch": 0.9180300946359288, + "grad_norm": 0.36870409810649557, + "learning_rate": 8.333292613798868e-06, + "loss": 0.505, + "step": 5590 + }, + { + "epoch": 0.9181943218442715, + "grad_norm": 0.34786470966404165, + "learning_rate": 8.333046946509209e-06, + "loss": 0.4902, + "step": 5591 + }, + { + "epoch": 0.9183585490526143, + "grad_norm": 0.2921529860594467, + "learning_rate": 8.332801237588886e-06, + "loss": 0.4824, + "step": 5592 + }, + { + "epoch": 0.918522776260957, + "grad_norm": 0.6192703487224838, + "learning_rate": 8.332555487040574e-06, + "loss": 0.4948, + "step": 5593 + }, + { + "epoch": 0.9186870034692998, + "grad_norm": 0.3317052036236938, + "learning_rate": 8.332309694866937e-06, + "loss": 0.5057, + "step": 5594 + }, + { + "epoch": 0.9188512306776425, + "grad_norm": 0.38647318368047534, + "learning_rate": 8.332063861070646e-06, + "loss": 0.5057, + "step": 5595 + }, + { + "epoch": 0.9190154578859853, + "grad_norm": 0.3435710297493405, + "learning_rate": 8.331817985654374e-06, + "loss": 0.4992, + "step": 5596 + }, + { + "epoch": 0.919179685094328, + "grad_norm": 0.34725817180505014, + "learning_rate": 8.331572068620785e-06, + "loss": 0.4929, + "step": 5597 + }, + { + "epoch": 0.9193439123026708, + "grad_norm": 0.3235167989235117, + "learning_rate": 8.331326109972556e-06, + "loss": 0.5095, + "step": 5598 + }, + { + "epoch": 0.9195081395110135, + "grad_norm": 0.39005161313201875, + "learning_rate": 8.331080109712355e-06, + "loss": 0.4961, + "step": 5599 + }, + { + "epoch": 0.9196723667193563, + "grad_norm": 0.3454792276254286, + "learning_rate": 8.330834067842853e-06, + "loss": 0.5285, + "step": 5600 + }, + { + "epoch": 0.919836593927699, + "grad_norm": 1.0221632544154688, + "learning_rate": 8.330587984366726e-06, + "loss": 0.4986, + "step": 5601 + }, + { + "epoch": 0.9200008211360418, + "grad_norm": 0.2994832026081828, + "learning_rate": 8.330341859286645e-06, + "loss": 0.4923, + "step": 5602 + }, + { + "epoch": 0.9201650483443845, + "grad_norm": 0.4052390833737057, + "learning_rate": 8.330095692605283e-06, + "loss": 0.5122, + "step": 5603 + }, + { + "epoch": 0.9203292755527271, + "grad_norm": 0.28648054829708214, + "learning_rate": 8.329849484325313e-06, + "loss": 0.5106, + "step": 5604 + }, + { + "epoch": 0.9204935027610699, + "grad_norm": 0.3063542399597326, + "learning_rate": 8.32960323444941e-06, + "loss": 0.5021, + "step": 5605 + }, + { + "epoch": 0.9206577299694126, + "grad_norm": 0.29985373636301893, + "learning_rate": 8.329356942980245e-06, + "loss": 0.5086, + "step": 5606 + }, + { + "epoch": 0.9208219571777554, + "grad_norm": 0.35166974021523667, + "learning_rate": 8.329110609920499e-06, + "loss": 0.5017, + "step": 5607 + }, + { + "epoch": 0.9209861843860981, + "grad_norm": 0.3991541658144142, + "learning_rate": 8.328864235272845e-06, + "loss": 0.5246, + "step": 5608 + }, + { + "epoch": 0.9211504115944409, + "grad_norm": 0.3259044403152944, + "learning_rate": 8.328617819039955e-06, + "loss": 0.4865, + "step": 5609 + }, + { + "epoch": 0.9213146388027836, + "grad_norm": 0.30778544588909174, + "learning_rate": 8.328371361224512e-06, + "loss": 0.4978, + "step": 5610 + }, + { + "epoch": 0.9214788660111264, + "grad_norm": 0.5084722341526301, + "learning_rate": 8.328124861829188e-06, + "loss": 0.5107, + "step": 5611 + }, + { + "epoch": 0.9216430932194691, + "grad_norm": 0.3117471003397797, + "learning_rate": 8.327878320856662e-06, + "loss": 0.4869, + "step": 5612 + }, + { + "epoch": 0.9218073204278119, + "grad_norm": 0.34543706062762425, + "learning_rate": 8.32763173830961e-06, + "loss": 0.515, + "step": 5613 + }, + { + "epoch": 0.9219715476361546, + "grad_norm": 0.334776273461246, + "learning_rate": 8.327385114190714e-06, + "loss": 0.4968, + "step": 5614 + }, + { + "epoch": 0.9221357748444974, + "grad_norm": 0.32328953533067045, + "learning_rate": 8.327138448502649e-06, + "loss": 0.4855, + "step": 5615 + }, + { + "epoch": 0.9223000020528401, + "grad_norm": 0.3302809998928379, + "learning_rate": 8.326891741248094e-06, + "loss": 0.5031, + "step": 5616 + }, + { + "epoch": 0.9224642292611829, + "grad_norm": 0.3127553399557638, + "learning_rate": 8.32664499242973e-06, + "loss": 0.4988, + "step": 5617 + }, + { + "epoch": 0.9226284564695256, + "grad_norm": 0.4586672526902129, + "learning_rate": 8.326398202050236e-06, + "loss": 0.4693, + "step": 5618 + }, + { + "epoch": 0.9227926836778684, + "grad_norm": 0.3313144730744136, + "learning_rate": 8.326151370112294e-06, + "loss": 0.5186, + "step": 5619 + }, + { + "epoch": 0.9229569108862111, + "grad_norm": 0.3316385292266702, + "learning_rate": 8.325904496618583e-06, + "loss": 0.5033, + "step": 5620 + }, + { + "epoch": 0.9231211380945538, + "grad_norm": 0.29228573535246216, + "learning_rate": 8.325657581571784e-06, + "loss": 0.5004, + "step": 5621 + }, + { + "epoch": 0.9232853653028965, + "grad_norm": 0.4226834910779713, + "learning_rate": 8.325410624974582e-06, + "loss": 0.493, + "step": 5622 + }, + { + "epoch": 0.9234495925112393, + "grad_norm": 0.3458030476801539, + "learning_rate": 8.325163626829656e-06, + "loss": 0.5041, + "step": 5623 + }, + { + "epoch": 0.923613819719582, + "grad_norm": 0.4242642019903402, + "learning_rate": 8.324916587139689e-06, + "loss": 0.5033, + "step": 5624 + }, + { + "epoch": 0.9237780469279248, + "grad_norm": 0.3100029512474492, + "learning_rate": 8.324669505907365e-06, + "loss": 0.5062, + "step": 5625 + }, + { + "epoch": 0.9239422741362675, + "grad_norm": 0.3185639380577177, + "learning_rate": 8.324422383135368e-06, + "loss": 0.4924, + "step": 5626 + }, + { + "epoch": 0.9241065013446103, + "grad_norm": 0.4050609577398203, + "learning_rate": 8.32417521882638e-06, + "loss": 0.4899, + "step": 5627 + }, + { + "epoch": 0.924270728552953, + "grad_norm": 0.31533095420755064, + "learning_rate": 8.323928012983087e-06, + "loss": 0.4909, + "step": 5628 + }, + { + "epoch": 0.9244349557612958, + "grad_norm": 0.2842521067234476, + "learning_rate": 8.323680765608173e-06, + "loss": 0.4873, + "step": 5629 + }, + { + "epoch": 0.9245991829696385, + "grad_norm": 0.3258802618679284, + "learning_rate": 8.323433476704325e-06, + "loss": 0.5076, + "step": 5630 + }, + { + "epoch": 0.9247634101779812, + "grad_norm": 0.299409442711691, + "learning_rate": 8.323186146274228e-06, + "loss": 0.4988, + "step": 5631 + }, + { + "epoch": 0.924927637386324, + "grad_norm": 0.3156360159559121, + "learning_rate": 8.322938774320568e-06, + "loss": 0.4989, + "step": 5632 + }, + { + "epoch": 0.9250918645946667, + "grad_norm": 0.2938555458815616, + "learning_rate": 8.32269136084603e-06, + "loss": 0.5171, + "step": 5633 + }, + { + "epoch": 0.9252560918030095, + "grad_norm": 0.40501414098775324, + "learning_rate": 8.322443905853303e-06, + "loss": 0.526, + "step": 5634 + }, + { + "epoch": 0.9254203190113522, + "grad_norm": 0.3931817021892204, + "learning_rate": 8.322196409345074e-06, + "loss": 0.4976, + "step": 5635 + }, + { + "epoch": 0.925584546219695, + "grad_norm": 0.4093911838085258, + "learning_rate": 8.321948871324033e-06, + "loss": 0.5009, + "step": 5636 + }, + { + "epoch": 0.9257487734280377, + "grad_norm": 0.3601362068180492, + "learning_rate": 8.321701291792867e-06, + "loss": 0.5002, + "step": 5637 + }, + { + "epoch": 0.9259130006363804, + "grad_norm": 0.8262370739905388, + "learning_rate": 8.321453670754264e-06, + "loss": 0.4923, + "step": 5638 + }, + { + "epoch": 0.9260772278447231, + "grad_norm": 1.323739662827969, + "learning_rate": 8.321206008210914e-06, + "loss": 0.5188, + "step": 5639 + }, + { + "epoch": 0.9262414550530659, + "grad_norm": 0.2963276769726195, + "learning_rate": 8.320958304165509e-06, + "loss": 0.5017, + "step": 5640 + }, + { + "epoch": 0.9264056822614086, + "grad_norm": 0.35472151961006554, + "learning_rate": 8.320710558620736e-06, + "loss": 0.5008, + "step": 5641 + }, + { + "epoch": 0.9265699094697514, + "grad_norm": 0.47096437311419886, + "learning_rate": 8.320462771579287e-06, + "loss": 0.5187, + "step": 5642 + }, + { + "epoch": 0.9267341366780941, + "grad_norm": 0.3116469536361895, + "learning_rate": 8.320214943043856e-06, + "loss": 0.4931, + "step": 5643 + }, + { + "epoch": 0.9268983638864369, + "grad_norm": 0.34571874140096176, + "learning_rate": 8.31996707301713e-06, + "loss": 0.5038, + "step": 5644 + }, + { + "epoch": 0.9270625910947796, + "grad_norm": 0.3000670197452908, + "learning_rate": 8.319719161501803e-06, + "loss": 0.5081, + "step": 5645 + }, + { + "epoch": 0.9272268183031224, + "grad_norm": 0.5315823597866184, + "learning_rate": 8.319471208500568e-06, + "loss": 0.5268, + "step": 5646 + }, + { + "epoch": 0.9273910455114651, + "grad_norm": 0.3194174964969078, + "learning_rate": 8.319223214016118e-06, + "loss": 0.4853, + "step": 5647 + }, + { + "epoch": 0.9275552727198079, + "grad_norm": 0.2909221729139333, + "learning_rate": 8.318975178051146e-06, + "loss": 0.4761, + "step": 5648 + }, + { + "epoch": 0.9277194999281506, + "grad_norm": 0.3266591840424489, + "learning_rate": 8.318727100608347e-06, + "loss": 0.4968, + "step": 5649 + }, + { + "epoch": 0.9278837271364934, + "grad_norm": 0.3951634535641033, + "learning_rate": 8.318478981690415e-06, + "loss": 0.4913, + "step": 5650 + }, + { + "epoch": 0.9280479543448361, + "grad_norm": 0.5327103196781549, + "learning_rate": 8.318230821300044e-06, + "loss": 0.5045, + "step": 5651 + }, + { + "epoch": 0.9282121815531789, + "grad_norm": 0.3387983201320291, + "learning_rate": 8.317982619439927e-06, + "loss": 0.496, + "step": 5652 + }, + { + "epoch": 0.9283764087615216, + "grad_norm": 0.3534333758409748, + "learning_rate": 8.317734376112766e-06, + "loss": 0.4819, + "step": 5653 + }, + { + "epoch": 0.9285406359698644, + "grad_norm": 0.45450659603234983, + "learning_rate": 8.317486091321253e-06, + "loss": 0.4933, + "step": 5654 + }, + { + "epoch": 0.928704863178207, + "grad_norm": 0.49711559052900334, + "learning_rate": 8.317237765068083e-06, + "loss": 0.5089, + "step": 5655 + }, + { + "epoch": 0.9288690903865497, + "grad_norm": 0.33609533448723666, + "learning_rate": 8.316989397355956e-06, + "loss": 0.5134, + "step": 5656 + }, + { + "epoch": 0.9290333175948925, + "grad_norm": 0.4638457318413463, + "learning_rate": 8.31674098818757e-06, + "loss": 0.4927, + "step": 5657 + }, + { + "epoch": 0.9291975448032352, + "grad_norm": 0.3490282675574865, + "learning_rate": 8.31649253756562e-06, + "loss": 0.4798, + "step": 5658 + }, + { + "epoch": 0.929361772011578, + "grad_norm": 0.37016474674914635, + "learning_rate": 8.316244045492809e-06, + "loss": 0.5155, + "step": 5659 + }, + { + "epoch": 0.9295259992199207, + "grad_norm": 0.37075033799182244, + "learning_rate": 8.31599551197183e-06, + "loss": 0.5202, + "step": 5660 + }, + { + "epoch": 0.9296902264282635, + "grad_norm": 0.3424732929894227, + "learning_rate": 8.315746937005386e-06, + "loss": 0.52, + "step": 5661 + }, + { + "epoch": 0.9298544536366062, + "grad_norm": 0.304676215982256, + "learning_rate": 8.315498320596177e-06, + "loss": 0.4924, + "step": 5662 + }, + { + "epoch": 0.930018680844949, + "grad_norm": 0.3090516157537145, + "learning_rate": 8.315249662746901e-06, + "loss": 0.5067, + "step": 5663 + }, + { + "epoch": 0.9301829080532917, + "grad_norm": 0.35434684092220853, + "learning_rate": 8.315000963460261e-06, + "loss": 0.5053, + "step": 5664 + }, + { + "epoch": 0.9303471352616345, + "grad_norm": 0.3235687017825308, + "learning_rate": 8.314752222738956e-06, + "loss": 0.5004, + "step": 5665 + }, + { + "epoch": 0.9305113624699772, + "grad_norm": 0.323592417972458, + "learning_rate": 8.31450344058569e-06, + "loss": 0.5178, + "step": 5666 + }, + { + "epoch": 0.93067558967832, + "grad_norm": 0.3269011607386471, + "learning_rate": 8.314254617003163e-06, + "loss": 0.5111, + "step": 5667 + }, + { + "epoch": 0.9308398168866627, + "grad_norm": 0.44163641425139216, + "learning_rate": 8.31400575199408e-06, + "loss": 0.5087, + "step": 5668 + }, + { + "epoch": 0.9310040440950055, + "grad_norm": 0.544952712160666, + "learning_rate": 8.313756845561139e-06, + "loss": 0.5166, + "step": 5669 + }, + { + "epoch": 0.9311682713033482, + "grad_norm": 0.31794568803873646, + "learning_rate": 8.313507897707047e-06, + "loss": 0.5212, + "step": 5670 + }, + { + "epoch": 0.931332498511691, + "grad_norm": 0.39736597639619714, + "learning_rate": 8.313258908434507e-06, + "loss": 0.509, + "step": 5671 + }, + { + "epoch": 0.9314967257200336, + "grad_norm": 0.30840079244447743, + "learning_rate": 8.313009877746226e-06, + "loss": 0.4957, + "step": 5672 + }, + { + "epoch": 0.9316609529283764, + "grad_norm": 0.34739304809928856, + "learning_rate": 8.312760805644905e-06, + "loss": 0.4971, + "step": 5673 + }, + { + "epoch": 0.9318251801367191, + "grad_norm": 0.335988944225995, + "learning_rate": 8.312511692133251e-06, + "loss": 0.5009, + "step": 5674 + }, + { + "epoch": 0.9319894073450619, + "grad_norm": 0.3893509807380632, + "learning_rate": 8.312262537213966e-06, + "loss": 0.5073, + "step": 5675 + }, + { + "epoch": 0.9321536345534046, + "grad_norm": 0.33293820028332555, + "learning_rate": 8.312013340889763e-06, + "loss": 0.4899, + "step": 5676 + }, + { + "epoch": 0.9323178617617474, + "grad_norm": 0.4913326187606124, + "learning_rate": 8.311764103163342e-06, + "loss": 0.5175, + "step": 5677 + }, + { + "epoch": 0.9324820889700901, + "grad_norm": 0.34344484845465234, + "learning_rate": 8.311514824037414e-06, + "loss": 0.4871, + "step": 5678 + }, + { + "epoch": 0.9326463161784329, + "grad_norm": 0.31321528574452057, + "learning_rate": 8.311265503514684e-06, + "loss": 0.5137, + "step": 5679 + }, + { + "epoch": 0.9328105433867756, + "grad_norm": 0.28952544103066263, + "learning_rate": 8.311016141597862e-06, + "loss": 0.4889, + "step": 5680 + }, + { + "epoch": 0.9329747705951184, + "grad_norm": 0.3679457217525709, + "learning_rate": 8.310766738289653e-06, + "loss": 0.49, + "step": 5681 + }, + { + "epoch": 0.9331389978034611, + "grad_norm": 2.1742606439642436, + "learning_rate": 8.31051729359277e-06, + "loss": 0.5173, + "step": 5682 + }, + { + "epoch": 0.9333032250118038, + "grad_norm": 0.3115240268193473, + "learning_rate": 8.310267807509918e-06, + "loss": 0.5, + "step": 5683 + }, + { + "epoch": 0.9334674522201466, + "grad_norm": 0.4170080800009593, + "learning_rate": 8.310018280043811e-06, + "loss": 0.53, + "step": 5684 + }, + { + "epoch": 0.9336316794284893, + "grad_norm": 0.33607485508133855, + "learning_rate": 8.309768711197156e-06, + "loss": 0.5004, + "step": 5685 + }, + { + "epoch": 0.9337959066368321, + "grad_norm": 0.34227008895253036, + "learning_rate": 8.309519100972664e-06, + "loss": 0.514, + "step": 5686 + }, + { + "epoch": 0.9339601338451748, + "grad_norm": 0.32230259816889123, + "learning_rate": 8.30926944937305e-06, + "loss": 0.4983, + "step": 5687 + }, + { + "epoch": 0.9341243610535176, + "grad_norm": 0.31003054600860314, + "learning_rate": 8.309019756401016e-06, + "loss": 0.4976, + "step": 5688 + }, + { + "epoch": 0.9342885882618602, + "grad_norm": 0.33146582662481516, + "learning_rate": 8.308770022059285e-06, + "loss": 0.5141, + "step": 5689 + }, + { + "epoch": 0.934452815470203, + "grad_norm": 0.29520921099226544, + "learning_rate": 8.30852024635056e-06, + "loss": 0.5035, + "step": 5690 + }, + { + "epoch": 0.9346170426785457, + "grad_norm": 0.2898500067975092, + "learning_rate": 8.308270429277562e-06, + "loss": 0.491, + "step": 5691 + }, + { + "epoch": 0.9347812698868885, + "grad_norm": 0.514172034927915, + "learning_rate": 8.308020570842998e-06, + "loss": 0.5117, + "step": 5692 + }, + { + "epoch": 0.9349454970952312, + "grad_norm": 0.32118782445662386, + "learning_rate": 8.307770671049586e-06, + "loss": 0.5002, + "step": 5693 + }, + { + "epoch": 0.935109724303574, + "grad_norm": 0.28434844389192415, + "learning_rate": 8.307520729900037e-06, + "loss": 0.5111, + "step": 5694 + }, + { + "epoch": 0.9352739515119167, + "grad_norm": 0.3150859794798156, + "learning_rate": 8.307270747397067e-06, + "loss": 0.5157, + "step": 5695 + }, + { + "epoch": 0.9354381787202595, + "grad_norm": 0.337073001654949, + "learning_rate": 8.30702072354339e-06, + "loss": 0.4861, + "step": 5696 + }, + { + "epoch": 0.9356024059286022, + "grad_norm": 0.3450124523657821, + "learning_rate": 8.306770658341723e-06, + "loss": 0.4864, + "step": 5697 + }, + { + "epoch": 0.935766633136945, + "grad_norm": 0.3124292867346078, + "learning_rate": 8.306520551794781e-06, + "loss": 0.5167, + "step": 5698 + }, + { + "epoch": 0.9359308603452877, + "grad_norm": 0.35958452699114296, + "learning_rate": 8.30627040390528e-06, + "loss": 0.4719, + "step": 5699 + }, + { + "epoch": 0.9360950875536305, + "grad_norm": 0.39144058349399646, + "learning_rate": 8.306020214675938e-06, + "loss": 0.4831, + "step": 5700 + }, + { + "epoch": 0.9362593147619732, + "grad_norm": 0.32711266635221053, + "learning_rate": 8.305769984109473e-06, + "loss": 0.5025, + "step": 5701 + }, + { + "epoch": 0.936423541970316, + "grad_norm": 0.32745055724297306, + "learning_rate": 8.3055197122086e-06, + "loss": 0.5015, + "step": 5702 + }, + { + "epoch": 0.9365877691786587, + "grad_norm": 0.2867180288316144, + "learning_rate": 8.30526939897604e-06, + "loss": 0.5111, + "step": 5703 + }, + { + "epoch": 0.9367519963870015, + "grad_norm": 0.295602851080418, + "learning_rate": 8.30501904441451e-06, + "loss": 0.4967, + "step": 5704 + }, + { + "epoch": 0.9369162235953442, + "grad_norm": 0.4263974761885831, + "learning_rate": 8.30476864852673e-06, + "loss": 0.4912, + "step": 5705 + }, + { + "epoch": 0.9370804508036868, + "grad_norm": 0.33887320596973486, + "learning_rate": 8.304518211315417e-06, + "loss": 0.4882, + "step": 5706 + }, + { + "epoch": 0.9372446780120296, + "grad_norm": 0.3675882652000894, + "learning_rate": 8.304267732783296e-06, + "loss": 0.4911, + "step": 5707 + }, + { + "epoch": 0.9374089052203723, + "grad_norm": 0.44201220610549874, + "learning_rate": 8.304017212933082e-06, + "loss": 0.5259, + "step": 5708 + }, + { + "epoch": 0.9375731324287151, + "grad_norm": 0.4404115118052987, + "learning_rate": 8.303766651767501e-06, + "loss": 0.5017, + "step": 5709 + }, + { + "epoch": 0.9377373596370578, + "grad_norm": 0.3166832436259222, + "learning_rate": 8.30351604928927e-06, + "loss": 0.498, + "step": 5710 + }, + { + "epoch": 0.9379015868454006, + "grad_norm": 0.3220410348280446, + "learning_rate": 8.303265405501113e-06, + "loss": 0.4816, + "step": 5711 + }, + { + "epoch": 0.9380658140537433, + "grad_norm": 0.49047811853744966, + "learning_rate": 8.303014720405753e-06, + "loss": 0.513, + "step": 5712 + }, + { + "epoch": 0.9382300412620861, + "grad_norm": 0.33189355575641755, + "learning_rate": 8.302763994005908e-06, + "loss": 0.5082, + "step": 5713 + }, + { + "epoch": 0.9383942684704288, + "grad_norm": 0.3758239947437896, + "learning_rate": 8.30251322630431e-06, + "loss": 0.5046, + "step": 5714 + }, + { + "epoch": 0.9385584956787716, + "grad_norm": 0.33422695693160437, + "learning_rate": 8.302262417303673e-06, + "loss": 0.4978, + "step": 5715 + }, + { + "epoch": 0.9387227228871143, + "grad_norm": 0.3216246403940893, + "learning_rate": 8.302011567006726e-06, + "loss": 0.4972, + "step": 5716 + }, + { + "epoch": 0.9388869500954571, + "grad_norm": 0.3375123906537756, + "learning_rate": 8.301760675416193e-06, + "loss": 0.5055, + "step": 5717 + }, + { + "epoch": 0.9390511773037998, + "grad_norm": 0.3245098375695823, + "learning_rate": 8.301509742534797e-06, + "loss": 0.4923, + "step": 5718 + }, + { + "epoch": 0.9392154045121426, + "grad_norm": 0.300498447635204, + "learning_rate": 8.301258768365269e-06, + "loss": 0.5084, + "step": 5719 + }, + { + "epoch": 0.9393796317204853, + "grad_norm": 0.2916667872950895, + "learning_rate": 8.301007752910327e-06, + "loss": 0.4728, + "step": 5720 + }, + { + "epoch": 0.9395438589288281, + "grad_norm": 0.29636711926991816, + "learning_rate": 8.300756696172703e-06, + "loss": 0.5363, + "step": 5721 + }, + { + "epoch": 0.9397080861371708, + "grad_norm": 0.36656193950960636, + "learning_rate": 8.300505598155121e-06, + "loss": 0.5059, + "step": 5722 + }, + { + "epoch": 0.9398723133455135, + "grad_norm": 0.3305934017377256, + "learning_rate": 8.30025445886031e-06, + "loss": 0.4864, + "step": 5723 + }, + { + "epoch": 0.9400365405538562, + "grad_norm": 0.2750640832912153, + "learning_rate": 8.300003278290996e-06, + "loss": 0.4961, + "step": 5724 + }, + { + "epoch": 0.940200767762199, + "grad_norm": 0.2876805407058977, + "learning_rate": 8.299752056449908e-06, + "loss": 0.4967, + "step": 5725 + }, + { + "epoch": 0.9403649949705417, + "grad_norm": 0.35466903617984374, + "learning_rate": 8.299500793339775e-06, + "loss": 0.4968, + "step": 5726 + }, + { + "epoch": 0.9405292221788845, + "grad_norm": 0.340814806284325, + "learning_rate": 8.299249488963322e-06, + "loss": 0.4908, + "step": 5727 + }, + { + "epoch": 0.9406934493872272, + "grad_norm": 0.3322242158557963, + "learning_rate": 8.298998143323286e-06, + "loss": 0.5069, + "step": 5728 + }, + { + "epoch": 0.94085767659557, + "grad_norm": 0.2999082292928264, + "learning_rate": 8.298746756422389e-06, + "loss": 0.4959, + "step": 5729 + }, + { + "epoch": 0.9410219038039127, + "grad_norm": 0.31878824304806774, + "learning_rate": 8.298495328263367e-06, + "loss": 0.5125, + "step": 5730 + }, + { + "epoch": 0.9411861310122555, + "grad_norm": 0.36476084540179676, + "learning_rate": 8.298243858848947e-06, + "loss": 0.4991, + "step": 5731 + }, + { + "epoch": 0.9413503582205982, + "grad_norm": 0.2634703515079286, + "learning_rate": 8.297992348181862e-06, + "loss": 0.5131, + "step": 5732 + }, + { + "epoch": 0.941514585428941, + "grad_norm": 0.2739454213919614, + "learning_rate": 8.297740796264845e-06, + "loss": 0.5084, + "step": 5733 + }, + { + "epoch": 0.9416788126372837, + "grad_norm": 0.32154129766877954, + "learning_rate": 8.297489203100623e-06, + "loss": 0.49, + "step": 5734 + }, + { + "epoch": 0.9418430398456265, + "grad_norm": 0.2974470814025031, + "learning_rate": 8.297237568691936e-06, + "loss": 0.4825, + "step": 5735 + }, + { + "epoch": 0.9420072670539692, + "grad_norm": 0.3876497388425138, + "learning_rate": 8.29698589304151e-06, + "loss": 0.523, + "step": 5736 + }, + { + "epoch": 0.942171494262312, + "grad_norm": 0.28791684142287477, + "learning_rate": 8.296734176152083e-06, + "loss": 0.4948, + "step": 5737 + }, + { + "epoch": 0.9423357214706547, + "grad_norm": 0.26299740923860576, + "learning_rate": 8.296482418026387e-06, + "loss": 0.5154, + "step": 5738 + }, + { + "epoch": 0.9424999486789974, + "grad_norm": 0.3163157616250974, + "learning_rate": 8.296230618667156e-06, + "loss": 0.4981, + "step": 5739 + }, + { + "epoch": 0.9426641758873401, + "grad_norm": 0.3183861930352154, + "learning_rate": 8.295978778077128e-06, + "loss": 0.4857, + "step": 5740 + }, + { + "epoch": 0.9428284030956828, + "grad_norm": 0.32525277310277495, + "learning_rate": 8.295726896259033e-06, + "loss": 0.5191, + "step": 5741 + }, + { + "epoch": 0.9429926303040256, + "grad_norm": 0.3938453035546823, + "learning_rate": 8.29547497321561e-06, + "loss": 0.49, + "step": 5742 + }, + { + "epoch": 0.9431568575123683, + "grad_norm": 0.2898850112140784, + "learning_rate": 8.295223008949595e-06, + "loss": 0.4781, + "step": 5743 + }, + { + "epoch": 0.9433210847207111, + "grad_norm": 0.3301700391413957, + "learning_rate": 8.294971003463724e-06, + "loss": 0.4957, + "step": 5744 + }, + { + "epoch": 0.9434853119290538, + "grad_norm": 0.2804840088969759, + "learning_rate": 8.294718956760736e-06, + "loss": 0.486, + "step": 5745 + }, + { + "epoch": 0.9436495391373966, + "grad_norm": 0.30848614862924606, + "learning_rate": 8.294466868843363e-06, + "loss": 0.4949, + "step": 5746 + }, + { + "epoch": 0.9438137663457393, + "grad_norm": 0.2778952083425503, + "learning_rate": 8.294214739714348e-06, + "loss": 0.4947, + "step": 5747 + }, + { + "epoch": 0.9439779935540821, + "grad_norm": 0.2912022792909345, + "learning_rate": 8.293962569376428e-06, + "loss": 0.5038, + "step": 5748 + }, + { + "epoch": 0.9441422207624248, + "grad_norm": 0.31372201934122734, + "learning_rate": 8.293710357832344e-06, + "loss": 0.5014, + "step": 5749 + }, + { + "epoch": 0.9443064479707676, + "grad_norm": 0.4827497389908659, + "learning_rate": 8.29345810508483e-06, + "loss": 0.4936, + "step": 5750 + }, + { + "epoch": 0.9444706751791103, + "grad_norm": 0.2978087052404848, + "learning_rate": 8.29320581113663e-06, + "loss": 0.5146, + "step": 5751 + }, + { + "epoch": 0.9446349023874531, + "grad_norm": 0.29384689410191, + "learning_rate": 8.292953475990481e-06, + "loss": 0.518, + "step": 5752 + }, + { + "epoch": 0.9447991295957958, + "grad_norm": 0.3121394187485199, + "learning_rate": 8.292701099649129e-06, + "loss": 0.4906, + "step": 5753 + }, + { + "epoch": 0.9449633568041386, + "grad_norm": 0.2965066015713293, + "learning_rate": 8.292448682115309e-06, + "loss": 0.5148, + "step": 5754 + }, + { + "epoch": 0.9451275840124813, + "grad_norm": 0.3857468121891203, + "learning_rate": 8.292196223391766e-06, + "loss": 0.4877, + "step": 5755 + }, + { + "epoch": 0.9452918112208241, + "grad_norm": 1.0082520752566388, + "learning_rate": 8.29194372348124e-06, + "loss": 0.4954, + "step": 5756 + }, + { + "epoch": 0.9454560384291667, + "grad_norm": 0.2987206706456987, + "learning_rate": 8.291691182386476e-06, + "loss": 0.5066, + "step": 5757 + }, + { + "epoch": 0.9456202656375094, + "grad_norm": 0.43792593819559156, + "learning_rate": 8.291438600110214e-06, + "loss": 0.5064, + "step": 5758 + }, + { + "epoch": 0.9457844928458522, + "grad_norm": 0.3036885285849899, + "learning_rate": 8.291185976655199e-06, + "loss": 0.4887, + "step": 5759 + }, + { + "epoch": 0.945948720054195, + "grad_norm": 0.39977706995255674, + "learning_rate": 8.290933312024174e-06, + "loss": 0.5063, + "step": 5760 + }, + { + "epoch": 0.9461129472625377, + "grad_norm": 0.30397770713157646, + "learning_rate": 8.290680606219883e-06, + "loss": 0.5182, + "step": 5761 + }, + { + "epoch": 0.9462771744708804, + "grad_norm": 0.29067054573184586, + "learning_rate": 8.290427859245072e-06, + "loss": 0.4953, + "step": 5762 + }, + { + "epoch": 0.9464414016792232, + "grad_norm": 0.2807311725585339, + "learning_rate": 8.290175071102486e-06, + "loss": 0.5128, + "step": 5763 + }, + { + "epoch": 0.9466056288875659, + "grad_norm": 0.35856042580615943, + "learning_rate": 8.289922241794869e-06, + "loss": 0.5088, + "step": 5764 + }, + { + "epoch": 0.9467698560959087, + "grad_norm": 0.36928919153283474, + "learning_rate": 8.289669371324966e-06, + "loss": 0.5089, + "step": 5765 + }, + { + "epoch": 0.9469340833042514, + "grad_norm": 0.6615780615800806, + "learning_rate": 8.289416459695527e-06, + "loss": 0.5067, + "step": 5766 + }, + { + "epoch": 0.9470983105125942, + "grad_norm": 0.28510177778702833, + "learning_rate": 8.289163506909297e-06, + "loss": 0.482, + "step": 5767 + }, + { + "epoch": 0.9472625377209369, + "grad_norm": 0.30217945823341685, + "learning_rate": 8.288910512969021e-06, + "loss": 0.5077, + "step": 5768 + }, + { + "epoch": 0.9474267649292797, + "grad_norm": 0.278858026357913, + "learning_rate": 8.288657477877452e-06, + "loss": 0.4868, + "step": 5769 + }, + { + "epoch": 0.9475909921376224, + "grad_norm": 0.3424671698610464, + "learning_rate": 8.288404401637332e-06, + "loss": 0.5011, + "step": 5770 + }, + { + "epoch": 0.9477552193459652, + "grad_norm": 0.29888762243279215, + "learning_rate": 8.288151284251417e-06, + "loss": 0.5014, + "step": 5771 + }, + { + "epoch": 0.9479194465543079, + "grad_norm": 0.3767666704556441, + "learning_rate": 8.28789812572245e-06, + "loss": 0.4968, + "step": 5772 + }, + { + "epoch": 0.9480836737626507, + "grad_norm": 0.3007862052726042, + "learning_rate": 8.287644926053182e-06, + "loss": 0.5111, + "step": 5773 + }, + { + "epoch": 0.9482479009709933, + "grad_norm": 0.36206526827161356, + "learning_rate": 8.287391685246363e-06, + "loss": 0.5185, + "step": 5774 + }, + { + "epoch": 0.9484121281793361, + "grad_norm": 0.29609444550203556, + "learning_rate": 8.287138403304746e-06, + "loss": 0.4892, + "step": 5775 + }, + { + "epoch": 0.9485763553876788, + "grad_norm": 0.2949042867632768, + "learning_rate": 8.286885080231079e-06, + "loss": 0.5162, + "step": 5776 + }, + { + "epoch": 0.9487405825960216, + "grad_norm": 0.3347524409023741, + "learning_rate": 8.286631716028112e-06, + "loss": 0.4835, + "step": 5777 + }, + { + "epoch": 0.9489048098043643, + "grad_norm": 0.2867731768433254, + "learning_rate": 8.286378310698603e-06, + "loss": 0.5115, + "step": 5778 + }, + { + "epoch": 0.9490690370127071, + "grad_norm": 0.3303083901828599, + "learning_rate": 8.286124864245298e-06, + "loss": 0.4904, + "step": 5779 + }, + { + "epoch": 0.9492332642210498, + "grad_norm": 0.30811639311477274, + "learning_rate": 8.285871376670953e-06, + "loss": 0.4935, + "step": 5780 + }, + { + "epoch": 0.9493974914293926, + "grad_norm": 0.3030673745127514, + "learning_rate": 8.285617847978318e-06, + "loss": 0.5014, + "step": 5781 + }, + { + "epoch": 0.9495617186377353, + "grad_norm": 0.3174059059837985, + "learning_rate": 8.285364278170152e-06, + "loss": 0.5008, + "step": 5782 + }, + { + "epoch": 0.9497259458460781, + "grad_norm": 0.32230661304949565, + "learning_rate": 8.285110667249202e-06, + "loss": 0.4981, + "step": 5783 + }, + { + "epoch": 0.9498901730544208, + "grad_norm": 0.2954900836401079, + "learning_rate": 8.284857015218228e-06, + "loss": 0.5152, + "step": 5784 + }, + { + "epoch": 0.9500544002627636, + "grad_norm": 0.38341331561161734, + "learning_rate": 8.284603322079982e-06, + "loss": 0.516, + "step": 5785 + }, + { + "epoch": 0.9502186274711063, + "grad_norm": 0.288180037663649, + "learning_rate": 8.284349587837222e-06, + "loss": 0.4971, + "step": 5786 + }, + { + "epoch": 0.950382854679449, + "grad_norm": 0.2639278006502859, + "learning_rate": 8.284095812492701e-06, + "loss": 0.4973, + "step": 5787 + }, + { + "epoch": 0.9505470818877918, + "grad_norm": 0.36465823278362897, + "learning_rate": 8.283841996049176e-06, + "loss": 0.5043, + "step": 5788 + }, + { + "epoch": 0.9507113090961345, + "grad_norm": 0.40869908171798774, + "learning_rate": 8.283588138509406e-06, + "loss": 0.497, + "step": 5789 + }, + { + "epoch": 0.9508755363044773, + "grad_norm": 0.2948081143710643, + "learning_rate": 8.283334239876145e-06, + "loss": 0.5012, + "step": 5790 + }, + { + "epoch": 0.9510397635128199, + "grad_norm": 0.30131869054135385, + "learning_rate": 8.283080300152151e-06, + "loss": 0.5096, + "step": 5791 + }, + { + "epoch": 0.9512039907211627, + "grad_norm": 0.2893470713777381, + "learning_rate": 8.282826319340185e-06, + "loss": 0.4803, + "step": 5792 + }, + { + "epoch": 0.9513682179295054, + "grad_norm": 0.2924402635990607, + "learning_rate": 8.282572297443002e-06, + "loss": 0.5045, + "step": 5793 + }, + { + "epoch": 0.9515324451378482, + "grad_norm": 0.30826834323579505, + "learning_rate": 8.282318234463361e-06, + "loss": 0.4942, + "step": 5794 + }, + { + "epoch": 0.9516966723461909, + "grad_norm": 0.2575049688426361, + "learning_rate": 8.282064130404025e-06, + "loss": 0.5073, + "step": 5795 + }, + { + "epoch": 0.9518608995545337, + "grad_norm": 0.2616141640965354, + "learning_rate": 8.281809985267752e-06, + "loss": 0.5015, + "step": 5796 + }, + { + "epoch": 0.9520251267628764, + "grad_norm": 0.3188316003808188, + "learning_rate": 8.2815557990573e-06, + "loss": 0.519, + "step": 5797 + }, + { + "epoch": 0.9521893539712192, + "grad_norm": 0.2517430104882772, + "learning_rate": 8.281301571775431e-06, + "loss": 0.4858, + "step": 5798 + }, + { + "epoch": 0.9523535811795619, + "grad_norm": 0.31257213248047166, + "learning_rate": 8.28104730342491e-06, + "loss": 0.4955, + "step": 5799 + }, + { + "epoch": 0.9525178083879047, + "grad_norm": 0.3406386793193978, + "learning_rate": 8.280792994008492e-06, + "loss": 0.4923, + "step": 5800 + }, + { + "epoch": 0.9526820355962474, + "grad_norm": 0.2913983953202928, + "learning_rate": 8.280538643528944e-06, + "loss": 0.5039, + "step": 5801 + }, + { + "epoch": 0.9528462628045902, + "grad_norm": 0.28772973551798076, + "learning_rate": 8.280284251989024e-06, + "loss": 0.506, + "step": 5802 + }, + { + "epoch": 0.9530104900129329, + "grad_norm": 0.30481361757707226, + "learning_rate": 8.280029819391499e-06, + "loss": 0.5027, + "step": 5803 + }, + { + "epoch": 0.9531747172212757, + "grad_norm": 0.31008301473548017, + "learning_rate": 8.279775345739133e-06, + "loss": 0.4902, + "step": 5804 + }, + { + "epoch": 0.9533389444296184, + "grad_norm": 0.31681938852800007, + "learning_rate": 8.279520831034688e-06, + "loss": 0.4983, + "step": 5805 + }, + { + "epoch": 0.9535031716379612, + "grad_norm": 0.36114831329488967, + "learning_rate": 8.279266275280926e-06, + "loss": 0.5168, + "step": 5806 + }, + { + "epoch": 0.9536673988463039, + "grad_norm": 0.35047545093351945, + "learning_rate": 8.279011678480614e-06, + "loss": 0.5074, + "step": 5807 + }, + { + "epoch": 0.9538316260546466, + "grad_norm": 0.3312605893409433, + "learning_rate": 8.27875704063652e-06, + "loss": 0.495, + "step": 5808 + }, + { + "epoch": 0.9539958532629893, + "grad_norm": 0.5613178656403057, + "learning_rate": 8.278502361751403e-06, + "loss": 0.5045, + "step": 5809 + }, + { + "epoch": 0.954160080471332, + "grad_norm": 0.28894635191610946, + "learning_rate": 8.278247641828035e-06, + "loss": 0.4889, + "step": 5810 + }, + { + "epoch": 0.9543243076796748, + "grad_norm": 0.33802860483632036, + "learning_rate": 8.27799288086918e-06, + "loss": 0.4807, + "step": 5811 + }, + { + "epoch": 0.9544885348880175, + "grad_norm": 0.3051254564753109, + "learning_rate": 8.277738078877606e-06, + "loss": 0.4955, + "step": 5812 + }, + { + "epoch": 0.9546527620963603, + "grad_norm": 0.31460451693320945, + "learning_rate": 8.277483235856079e-06, + "loss": 0.4888, + "step": 5813 + }, + { + "epoch": 0.954816989304703, + "grad_norm": 0.3011782131412546, + "learning_rate": 8.277228351807367e-06, + "loss": 0.4981, + "step": 5814 + }, + { + "epoch": 0.9549812165130458, + "grad_norm": 0.3790567372061715, + "learning_rate": 8.276973426734238e-06, + "loss": 0.5033, + "step": 5815 + }, + { + "epoch": 0.9551454437213885, + "grad_norm": 0.29233332389067523, + "learning_rate": 8.276718460639464e-06, + "loss": 0.5025, + "step": 5816 + }, + { + "epoch": 0.9553096709297313, + "grad_norm": 0.28425808137423847, + "learning_rate": 8.276463453525809e-06, + "loss": 0.4782, + "step": 5817 + }, + { + "epoch": 0.955473898138074, + "grad_norm": 0.3215019093899259, + "learning_rate": 8.276208405396048e-06, + "loss": 0.5106, + "step": 5818 + }, + { + "epoch": 0.9556381253464168, + "grad_norm": 0.38444114269586827, + "learning_rate": 8.275953316252946e-06, + "loss": 0.495, + "step": 5819 + }, + { + "epoch": 0.9558023525547595, + "grad_norm": 0.2708150920738918, + "learning_rate": 8.275698186099278e-06, + "loss": 0.4782, + "step": 5820 + }, + { + "epoch": 0.9559665797631023, + "grad_norm": 0.574284020674942, + "learning_rate": 8.27544301493781e-06, + "loss": 0.5015, + "step": 5821 + }, + { + "epoch": 0.956130806971445, + "grad_norm": 0.3558660647373787, + "learning_rate": 8.27518780277132e-06, + "loss": 0.4873, + "step": 5822 + }, + { + "epoch": 0.9562950341797878, + "grad_norm": 0.2853300694877287, + "learning_rate": 8.274932549602575e-06, + "loss": 0.5086, + "step": 5823 + }, + { + "epoch": 0.9564592613881305, + "grad_norm": 0.291071201624435, + "learning_rate": 8.274677255434348e-06, + "loss": 0.5011, + "step": 5824 + }, + { + "epoch": 0.9566234885964732, + "grad_norm": 0.3247243607708094, + "learning_rate": 8.274421920269412e-06, + "loss": 0.5057, + "step": 5825 + }, + { + "epoch": 0.9567877158048159, + "grad_norm": 0.26119747354567163, + "learning_rate": 8.274166544110541e-06, + "loss": 0.4932, + "step": 5826 + }, + { + "epoch": 0.9569519430131587, + "grad_norm": 0.3785889300892869, + "learning_rate": 8.273911126960507e-06, + "loss": 0.4927, + "step": 5827 + }, + { + "epoch": 0.9571161702215014, + "grad_norm": 0.38237933514300165, + "learning_rate": 8.273655668822086e-06, + "loss": 0.4997, + "step": 5828 + }, + { + "epoch": 0.9572803974298442, + "grad_norm": 0.3519938306470608, + "learning_rate": 8.273400169698051e-06, + "loss": 0.4969, + "step": 5829 + }, + { + "epoch": 0.9574446246381869, + "grad_norm": 0.2765721380041462, + "learning_rate": 8.27314462959118e-06, + "loss": 0.4796, + "step": 5830 + }, + { + "epoch": 0.9576088518465297, + "grad_norm": 0.3239066959022419, + "learning_rate": 8.272889048504244e-06, + "loss": 0.4818, + "step": 5831 + }, + { + "epoch": 0.9577730790548724, + "grad_norm": 0.28322233184270795, + "learning_rate": 8.272633426440021e-06, + "loss": 0.4928, + "step": 5832 + }, + { + "epoch": 0.9579373062632152, + "grad_norm": 0.2633683826750771, + "learning_rate": 8.272377763401287e-06, + "loss": 0.4911, + "step": 5833 + }, + { + "epoch": 0.9581015334715579, + "grad_norm": 0.29772403363255917, + "learning_rate": 8.27212205939082e-06, + "loss": 0.4798, + "step": 5834 + }, + { + "epoch": 0.9582657606799007, + "grad_norm": 0.3413359910040477, + "learning_rate": 8.271866314411395e-06, + "loss": 0.5078, + "step": 5835 + }, + { + "epoch": 0.9584299878882434, + "grad_norm": 0.34028950114887185, + "learning_rate": 8.271610528465792e-06, + "loss": 0.5045, + "step": 5836 + }, + { + "epoch": 0.9585942150965862, + "grad_norm": 0.27175870163911775, + "learning_rate": 8.271354701556786e-06, + "loss": 0.5135, + "step": 5837 + }, + { + "epoch": 0.9587584423049289, + "grad_norm": 0.3075850133116066, + "learning_rate": 8.27109883368716e-06, + "loss": 0.4803, + "step": 5838 + }, + { + "epoch": 0.9589226695132717, + "grad_norm": 0.3351562770791254, + "learning_rate": 8.270842924859688e-06, + "loss": 0.4922, + "step": 5839 + }, + { + "epoch": 0.9590868967216144, + "grad_norm": 0.2860224701466842, + "learning_rate": 8.270586975077154e-06, + "loss": 0.5001, + "step": 5840 + }, + { + "epoch": 0.9592511239299572, + "grad_norm": 0.2810690644684735, + "learning_rate": 8.270330984342334e-06, + "loss": 0.517, + "step": 5841 + }, + { + "epoch": 0.9594153511382998, + "grad_norm": 0.3647064774101022, + "learning_rate": 8.27007495265801e-06, + "loss": 0.5175, + "step": 5842 + }, + { + "epoch": 0.9595795783466425, + "grad_norm": 0.29633901351977443, + "learning_rate": 8.269818880026963e-06, + "loss": 0.5043, + "step": 5843 + }, + { + "epoch": 0.9597438055549853, + "grad_norm": 0.31863430795181835, + "learning_rate": 8.269562766451974e-06, + "loss": 0.5044, + "step": 5844 + }, + { + "epoch": 0.959908032763328, + "grad_norm": 0.288021139963858, + "learning_rate": 8.269306611935826e-06, + "loss": 0.486, + "step": 5845 + }, + { + "epoch": 0.9600722599716708, + "grad_norm": 0.40002756696662767, + "learning_rate": 8.269050416481298e-06, + "loss": 0.4917, + "step": 5846 + }, + { + "epoch": 0.9602364871800135, + "grad_norm": 0.2847656625043193, + "learning_rate": 8.268794180091175e-06, + "loss": 0.5, + "step": 5847 + }, + { + "epoch": 0.9604007143883563, + "grad_norm": 0.2876921231037368, + "learning_rate": 8.268537902768239e-06, + "loss": 0.4946, + "step": 5848 + }, + { + "epoch": 0.960564941596699, + "grad_norm": 0.3600375948303123, + "learning_rate": 8.268281584515273e-06, + "loss": 0.5072, + "step": 5849 + }, + { + "epoch": 0.9607291688050418, + "grad_norm": 0.27348127589456944, + "learning_rate": 8.268025225335063e-06, + "loss": 0.5142, + "step": 5850 + }, + { + "epoch": 0.9608933960133845, + "grad_norm": 0.29609853935174174, + "learning_rate": 8.267768825230392e-06, + "loss": 0.4903, + "step": 5851 + }, + { + "epoch": 0.9610576232217273, + "grad_norm": 0.3485982841735851, + "learning_rate": 8.267512384204043e-06, + "loss": 0.5078, + "step": 5852 + }, + { + "epoch": 0.96122185043007, + "grad_norm": 0.31985530611185276, + "learning_rate": 8.267255902258804e-06, + "loss": 0.4809, + "step": 5853 + }, + { + "epoch": 0.9613860776384128, + "grad_norm": 0.2827682502321915, + "learning_rate": 8.266999379397458e-06, + "loss": 0.5171, + "step": 5854 + }, + { + "epoch": 0.9615503048467555, + "grad_norm": 0.3558347178567936, + "learning_rate": 8.266742815622794e-06, + "loss": 0.5205, + "step": 5855 + }, + { + "epoch": 0.9617145320550983, + "grad_norm": 0.31569356701732815, + "learning_rate": 8.266486210937595e-06, + "loss": 0.488, + "step": 5856 + }, + { + "epoch": 0.961878759263441, + "grad_norm": 0.280486740752955, + "learning_rate": 8.266229565344651e-06, + "loss": 0.5223, + "step": 5857 + }, + { + "epoch": 0.9620429864717838, + "grad_norm": 0.30616030690303003, + "learning_rate": 8.265972878846751e-06, + "loss": 0.495, + "step": 5858 + }, + { + "epoch": 0.9622072136801264, + "grad_norm": 0.2971057720358319, + "learning_rate": 8.265716151446677e-06, + "loss": 0.4886, + "step": 5859 + }, + { + "epoch": 0.9623714408884692, + "grad_norm": 0.4860466387593598, + "learning_rate": 8.26545938314722e-06, + "loss": 0.4974, + "step": 5860 + }, + { + "epoch": 0.9625356680968119, + "grad_norm": 0.31491956302389884, + "learning_rate": 8.265202573951172e-06, + "loss": 0.4915, + "step": 5861 + }, + { + "epoch": 0.9626998953051547, + "grad_norm": 0.5128107426900643, + "learning_rate": 8.26494572386132e-06, + "loss": 0.5052, + "step": 5862 + }, + { + "epoch": 0.9628641225134974, + "grad_norm": 0.28645208841438113, + "learning_rate": 8.264688832880453e-06, + "loss": 0.4772, + "step": 5863 + }, + { + "epoch": 0.9630283497218401, + "grad_norm": 0.3334911284628625, + "learning_rate": 8.264431901011358e-06, + "loss": 0.5136, + "step": 5864 + }, + { + "epoch": 0.9631925769301829, + "grad_norm": 0.272860391882876, + "learning_rate": 8.264174928256832e-06, + "loss": 0.5046, + "step": 5865 + }, + { + "epoch": 0.9633568041385256, + "grad_norm": 0.3606448704975928, + "learning_rate": 8.263917914619662e-06, + "loss": 0.4904, + "step": 5866 + }, + { + "epoch": 0.9635210313468684, + "grad_norm": 0.3258598539123226, + "learning_rate": 8.263660860102641e-06, + "loss": 0.4854, + "step": 5867 + }, + { + "epoch": 0.9636852585552111, + "grad_norm": 0.28589834527541785, + "learning_rate": 8.26340376470856e-06, + "loss": 0.5102, + "step": 5868 + }, + { + "epoch": 0.9638494857635539, + "grad_norm": 0.2781441103589388, + "learning_rate": 8.26314662844021e-06, + "loss": 0.5007, + "step": 5869 + }, + { + "epoch": 0.9640137129718966, + "grad_norm": 0.2598080441340957, + "learning_rate": 8.262889451300386e-06, + "loss": 0.4703, + "step": 5870 + }, + { + "epoch": 0.9641779401802394, + "grad_norm": 0.3118870916093809, + "learning_rate": 8.26263223329188e-06, + "loss": 0.4806, + "step": 5871 + }, + { + "epoch": 0.9643421673885821, + "grad_norm": 0.28664634469887024, + "learning_rate": 8.262374974417486e-06, + "loss": 0.5147, + "step": 5872 + }, + { + "epoch": 0.9645063945969249, + "grad_norm": 0.29088471375612124, + "learning_rate": 8.26211767468e-06, + "loss": 0.4965, + "step": 5873 + }, + { + "epoch": 0.9646706218052676, + "grad_norm": 0.2934428904263493, + "learning_rate": 8.261860334082212e-06, + "loss": 0.4904, + "step": 5874 + }, + { + "epoch": 0.9648348490136104, + "grad_norm": 0.3262898836978391, + "learning_rate": 8.26160295262692e-06, + "loss": 0.4977, + "step": 5875 + }, + { + "epoch": 0.964999076221953, + "grad_norm": 0.28309291470955317, + "learning_rate": 8.26134553031692e-06, + "loss": 0.4942, + "step": 5876 + }, + { + "epoch": 0.9651633034302958, + "grad_norm": 0.2727890246165504, + "learning_rate": 8.261088067155008e-06, + "loss": 0.4947, + "step": 5877 + }, + { + "epoch": 0.9653275306386385, + "grad_norm": 0.28695720943171166, + "learning_rate": 8.260830563143976e-06, + "loss": 0.5002, + "step": 5878 + }, + { + "epoch": 0.9654917578469813, + "grad_norm": 0.5454995618657891, + "learning_rate": 8.260573018286626e-06, + "loss": 0.4919, + "step": 5879 + }, + { + "epoch": 0.965655985055324, + "grad_norm": 0.3329320093126202, + "learning_rate": 8.260315432585754e-06, + "loss": 0.5237, + "step": 5880 + }, + { + "epoch": 0.9658202122636668, + "grad_norm": 0.3253192711895212, + "learning_rate": 8.260057806044155e-06, + "loss": 0.5048, + "step": 5881 + }, + { + "epoch": 0.9659844394720095, + "grad_norm": 0.30670573862884587, + "learning_rate": 8.259800138664628e-06, + "loss": 0.4797, + "step": 5882 + }, + { + "epoch": 0.9661486666803523, + "grad_norm": 0.30624801488520037, + "learning_rate": 8.259542430449975e-06, + "loss": 0.4847, + "step": 5883 + }, + { + "epoch": 0.966312893888695, + "grad_norm": 0.29449002020199405, + "learning_rate": 8.259284681402992e-06, + "loss": 0.5065, + "step": 5884 + }, + { + "epoch": 0.9664771210970378, + "grad_norm": 0.30474092018275056, + "learning_rate": 8.259026891526478e-06, + "loss": 0.4815, + "step": 5885 + }, + { + "epoch": 0.9666413483053805, + "grad_norm": 0.284568010057758, + "learning_rate": 8.258769060823232e-06, + "loss": 0.486, + "step": 5886 + }, + { + "epoch": 0.9668055755137233, + "grad_norm": 0.32753086529797554, + "learning_rate": 8.258511189296057e-06, + "loss": 0.483, + "step": 5887 + }, + { + "epoch": 0.966969802722066, + "grad_norm": 0.2446509694431007, + "learning_rate": 8.258253276947752e-06, + "loss": 0.4698, + "step": 5888 + }, + { + "epoch": 0.9671340299304088, + "grad_norm": 0.28462449616187135, + "learning_rate": 8.257995323781122e-06, + "loss": 0.4873, + "step": 5889 + }, + { + "epoch": 0.9672982571387515, + "grad_norm": 0.26788096167237824, + "learning_rate": 8.257737329798961e-06, + "loss": 0.5108, + "step": 5890 + }, + { + "epoch": 0.9674624843470943, + "grad_norm": 0.27341395472584296, + "learning_rate": 8.257479295004079e-06, + "loss": 0.4932, + "step": 5891 + }, + { + "epoch": 0.967626711555437, + "grad_norm": 0.30169532415899364, + "learning_rate": 8.257221219399272e-06, + "loss": 0.4656, + "step": 5892 + }, + { + "epoch": 0.9677909387637796, + "grad_norm": 2.138078420857035, + "learning_rate": 8.256963102987349e-06, + "loss": 0.5071, + "step": 5893 + }, + { + "epoch": 0.9679551659721224, + "grad_norm": 0.29300939357485084, + "learning_rate": 8.256704945771108e-06, + "loss": 0.4886, + "step": 5894 + }, + { + "epoch": 0.9681193931804651, + "grad_norm": 0.2819584830142459, + "learning_rate": 8.256446747753356e-06, + "loss": 0.4812, + "step": 5895 + }, + { + "epoch": 0.9682836203888079, + "grad_norm": 0.30045867899014606, + "learning_rate": 8.256188508936896e-06, + "loss": 0.4915, + "step": 5896 + }, + { + "epoch": 0.9684478475971506, + "grad_norm": 0.7846435366223166, + "learning_rate": 8.255930229324535e-06, + "loss": 0.5115, + "step": 5897 + }, + { + "epoch": 0.9686120748054934, + "grad_norm": 0.330147198674231, + "learning_rate": 8.255671908919075e-06, + "loss": 0.4656, + "step": 5898 + }, + { + "epoch": 0.9687763020138361, + "grad_norm": 0.2768259882158433, + "learning_rate": 8.255413547723323e-06, + "loss": 0.5064, + "step": 5899 + }, + { + "epoch": 0.9689405292221789, + "grad_norm": 0.3766778545533828, + "learning_rate": 8.255155145740084e-06, + "loss": 0.5089, + "step": 5900 + }, + { + "epoch": 0.9691047564305216, + "grad_norm": 0.4340928286077416, + "learning_rate": 8.254896702972167e-06, + "loss": 0.5171, + "step": 5901 + }, + { + "epoch": 0.9692689836388644, + "grad_norm": 0.32064135591269965, + "learning_rate": 8.254638219422378e-06, + "loss": 0.4786, + "step": 5902 + }, + { + "epoch": 0.9694332108472071, + "grad_norm": 0.35896251714917476, + "learning_rate": 8.254379695093523e-06, + "loss": 0.4763, + "step": 5903 + }, + { + "epoch": 0.9695974380555499, + "grad_norm": 0.28803051965097765, + "learning_rate": 8.25412112998841e-06, + "loss": 0.498, + "step": 5904 + }, + { + "epoch": 0.9697616652638926, + "grad_norm": 0.2843031109294082, + "learning_rate": 8.253862524109849e-06, + "loss": 0.5165, + "step": 5905 + }, + { + "epoch": 0.9699258924722354, + "grad_norm": 0.2703477212769139, + "learning_rate": 8.253603877460647e-06, + "loss": 0.5026, + "step": 5906 + }, + { + "epoch": 0.9700901196805781, + "grad_norm": 0.37540094540547747, + "learning_rate": 8.253345190043613e-06, + "loss": 0.4745, + "step": 5907 + }, + { + "epoch": 0.9702543468889209, + "grad_norm": 0.34686637153638844, + "learning_rate": 8.253086461861561e-06, + "loss": 0.5061, + "step": 5908 + }, + { + "epoch": 0.9704185740972635, + "grad_norm": 0.28355130217507657, + "learning_rate": 8.252827692917295e-06, + "loss": 0.4877, + "step": 5909 + }, + { + "epoch": 0.9705828013056063, + "grad_norm": 0.3208759472372884, + "learning_rate": 8.252568883213628e-06, + "loss": 0.5004, + "step": 5910 + }, + { + "epoch": 0.970747028513949, + "grad_norm": 0.27097365741813967, + "learning_rate": 8.25231003275337e-06, + "loss": 0.5167, + "step": 5911 + }, + { + "epoch": 0.9709112557222918, + "grad_norm": 0.26847747863284094, + "learning_rate": 8.252051141539335e-06, + "loss": 0.5085, + "step": 5912 + }, + { + "epoch": 0.9710754829306345, + "grad_norm": 0.2744476278316739, + "learning_rate": 8.251792209574333e-06, + "loss": 0.4946, + "step": 5913 + }, + { + "epoch": 0.9712397101389773, + "grad_norm": 0.2949039780636973, + "learning_rate": 8.251533236861175e-06, + "loss": 0.5, + "step": 5914 + }, + { + "epoch": 0.97140393734732, + "grad_norm": 0.2546736861392502, + "learning_rate": 8.251274223402676e-06, + "loss": 0.4998, + "step": 5915 + }, + { + "epoch": 0.9715681645556627, + "grad_norm": 0.33011871520716857, + "learning_rate": 8.251015169201649e-06, + "loss": 0.4979, + "step": 5916 + }, + { + "epoch": 0.9717323917640055, + "grad_norm": 0.36791096342909235, + "learning_rate": 8.250756074260903e-06, + "loss": 0.5164, + "step": 5917 + }, + { + "epoch": 0.9718966189723482, + "grad_norm": 0.2940151689618452, + "learning_rate": 8.25049693858326e-06, + "loss": 0.4945, + "step": 5918 + }, + { + "epoch": 0.972060846180691, + "grad_norm": 0.30790865804305934, + "learning_rate": 8.250237762171527e-06, + "loss": 0.4901, + "step": 5919 + }, + { + "epoch": 0.9722250733890337, + "grad_norm": 0.30633815270914644, + "learning_rate": 8.249978545028526e-06, + "loss": 0.4894, + "step": 5920 + }, + { + "epoch": 0.9723893005973765, + "grad_norm": 0.29364052405576624, + "learning_rate": 8.249719287157066e-06, + "loss": 0.5001, + "step": 5921 + }, + { + "epoch": 0.9725535278057192, + "grad_norm": 0.368512399902772, + "learning_rate": 8.249459988559965e-06, + "loss": 0.4937, + "step": 5922 + }, + { + "epoch": 0.972717755014062, + "grad_norm": 0.2963704816857771, + "learning_rate": 8.249200649240041e-06, + "loss": 0.4981, + "step": 5923 + }, + { + "epoch": 0.9728819822224047, + "grad_norm": 0.3420546608598498, + "learning_rate": 8.248941269200109e-06, + "loss": 0.4944, + "step": 5924 + }, + { + "epoch": 0.9730462094307475, + "grad_norm": 0.26229608477987965, + "learning_rate": 8.248681848442985e-06, + "loss": 0.5222, + "step": 5925 + }, + { + "epoch": 0.9732104366390901, + "grad_norm": 0.3123298058628242, + "learning_rate": 8.248422386971489e-06, + "loss": 0.4972, + "step": 5926 + }, + { + "epoch": 0.9733746638474329, + "grad_norm": 0.2899224102116319, + "learning_rate": 8.248162884788437e-06, + "loss": 0.5155, + "step": 5927 + }, + { + "epoch": 0.9735388910557756, + "grad_norm": 0.2642905460836601, + "learning_rate": 8.24790334189665e-06, + "loss": 0.5054, + "step": 5928 + }, + { + "epoch": 0.9737031182641184, + "grad_norm": 0.2811233637741677, + "learning_rate": 8.247643758298943e-06, + "loss": 0.5033, + "step": 5929 + }, + { + "epoch": 0.9738673454724611, + "grad_norm": 0.3921209165058693, + "learning_rate": 8.24738413399814e-06, + "loss": 0.4756, + "step": 5930 + }, + { + "epoch": 0.9740315726808039, + "grad_norm": 0.4300011495238529, + "learning_rate": 8.247124468997057e-06, + "loss": 0.4726, + "step": 5931 + }, + { + "epoch": 0.9741957998891466, + "grad_norm": 0.32118620284015836, + "learning_rate": 8.246864763298516e-06, + "loss": 0.5117, + "step": 5932 + }, + { + "epoch": 0.9743600270974894, + "grad_norm": 0.2675433845486991, + "learning_rate": 8.246605016905338e-06, + "loss": 0.4906, + "step": 5933 + }, + { + "epoch": 0.9745242543058321, + "grad_norm": 0.3604906603250053, + "learning_rate": 8.246345229820341e-06, + "loss": 0.4758, + "step": 5934 + }, + { + "epoch": 0.9746884815141749, + "grad_norm": 0.2552549150791266, + "learning_rate": 8.246085402046351e-06, + "loss": 0.4811, + "step": 5935 + }, + { + "epoch": 0.9748527087225176, + "grad_norm": 0.27213223431139516, + "learning_rate": 8.245825533586188e-06, + "loss": 0.5028, + "step": 5936 + }, + { + "epoch": 0.9750169359308604, + "grad_norm": 0.2923501485405211, + "learning_rate": 8.245565624442674e-06, + "loss": 0.4761, + "step": 5937 + }, + { + "epoch": 0.9751811631392031, + "grad_norm": 0.25637703091485053, + "learning_rate": 8.245305674618631e-06, + "loss": 0.4957, + "step": 5938 + }, + { + "epoch": 0.9753453903475459, + "grad_norm": 0.29585731786389724, + "learning_rate": 8.245045684116885e-06, + "loss": 0.5024, + "step": 5939 + }, + { + "epoch": 0.9755096175558886, + "grad_norm": 0.3097492958132811, + "learning_rate": 8.244785652940257e-06, + "loss": 0.4911, + "step": 5940 + }, + { + "epoch": 0.9756738447642314, + "grad_norm": 0.2796684436959236, + "learning_rate": 8.244525581091574e-06, + "loss": 0.4873, + "step": 5941 + }, + { + "epoch": 0.9758380719725741, + "grad_norm": 0.4901344389329459, + "learning_rate": 8.244265468573657e-06, + "loss": 0.4952, + "step": 5942 + }, + { + "epoch": 0.9760022991809167, + "grad_norm": 0.29449038922514387, + "learning_rate": 8.244005315389335e-06, + "loss": 0.5011, + "step": 5943 + }, + { + "epoch": 0.9761665263892595, + "grad_norm": 0.2500820874976813, + "learning_rate": 8.24374512154143e-06, + "loss": 0.4833, + "step": 5944 + }, + { + "epoch": 0.9763307535976022, + "grad_norm": 0.2831071561547265, + "learning_rate": 8.24348488703277e-06, + "loss": 0.5094, + "step": 5945 + }, + { + "epoch": 0.976494980805945, + "grad_norm": 0.322064842547227, + "learning_rate": 8.243224611866182e-06, + "loss": 0.5126, + "step": 5946 + }, + { + "epoch": 0.9766592080142877, + "grad_norm": 0.2683354525422334, + "learning_rate": 8.242964296044494e-06, + "loss": 0.4987, + "step": 5947 + }, + { + "epoch": 0.9768234352226305, + "grad_norm": 0.2992209909395012, + "learning_rate": 8.242703939570527e-06, + "loss": 0.5231, + "step": 5948 + }, + { + "epoch": 0.9769876624309732, + "grad_norm": 0.3050729492504926, + "learning_rate": 8.242443542447115e-06, + "loss": 0.5136, + "step": 5949 + }, + { + "epoch": 0.977151889639316, + "grad_norm": 0.29155035443597166, + "learning_rate": 8.242183104677083e-06, + "loss": 0.4942, + "step": 5950 + }, + { + "epoch": 0.9773161168476587, + "grad_norm": 0.25878300704709084, + "learning_rate": 8.24192262626326e-06, + "loss": 0.4816, + "step": 5951 + }, + { + "epoch": 0.9774803440560015, + "grad_norm": 0.3087053142553389, + "learning_rate": 8.241662107208478e-06, + "loss": 0.5206, + "step": 5952 + }, + { + "epoch": 0.9776445712643442, + "grad_norm": 0.4975144518325369, + "learning_rate": 8.241401547515563e-06, + "loss": 0.4931, + "step": 5953 + }, + { + "epoch": 0.977808798472687, + "grad_norm": 0.3519450335069974, + "learning_rate": 8.241140947187347e-06, + "loss": 0.5137, + "step": 5954 + }, + { + "epoch": 0.9779730256810297, + "grad_norm": 0.27425219345136576, + "learning_rate": 8.240880306226659e-06, + "loss": 0.4938, + "step": 5955 + }, + { + "epoch": 0.9781372528893725, + "grad_norm": 0.313446231366959, + "learning_rate": 8.24061962463633e-06, + "loss": 0.4833, + "step": 5956 + }, + { + "epoch": 0.9783014800977152, + "grad_norm": 0.31047102932394477, + "learning_rate": 8.240358902419192e-06, + "loss": 0.5167, + "step": 5957 + }, + { + "epoch": 0.978465707306058, + "grad_norm": 0.3358474346649635, + "learning_rate": 8.240098139578076e-06, + "loss": 0.4916, + "step": 5958 + }, + { + "epoch": 0.9786299345144007, + "grad_norm": 0.2916965129858983, + "learning_rate": 8.239837336115814e-06, + "loss": 0.5071, + "step": 5959 + }, + { + "epoch": 0.9787941617227434, + "grad_norm": 0.3787787228362414, + "learning_rate": 8.23957649203524e-06, + "loss": 0.5196, + "step": 5960 + }, + { + "epoch": 0.9789583889310861, + "grad_norm": 0.3064291953677091, + "learning_rate": 8.239315607339186e-06, + "loss": 0.4962, + "step": 5961 + }, + { + "epoch": 0.9791226161394289, + "grad_norm": 0.2839286416723755, + "learning_rate": 8.239054682030485e-06, + "loss": 0.4926, + "step": 5962 + }, + { + "epoch": 0.9792868433477716, + "grad_norm": 0.28085264039852126, + "learning_rate": 8.238793716111971e-06, + "loss": 0.5022, + "step": 5963 + }, + { + "epoch": 0.9794510705561144, + "grad_norm": 0.33066819531167013, + "learning_rate": 8.23853270958648e-06, + "loss": 0.4936, + "step": 5964 + }, + { + "epoch": 0.9796152977644571, + "grad_norm": 0.26488196400063624, + "learning_rate": 8.238271662456844e-06, + "loss": 0.4916, + "step": 5965 + }, + { + "epoch": 0.9797795249727999, + "grad_norm": 0.3473924170842142, + "learning_rate": 8.2380105747259e-06, + "loss": 0.5109, + "step": 5966 + }, + { + "epoch": 0.9799437521811426, + "grad_norm": 0.3409154975608138, + "learning_rate": 8.237749446396485e-06, + "loss": 0.4985, + "step": 5967 + }, + { + "epoch": 0.9801079793894854, + "grad_norm": 0.3418439073219856, + "learning_rate": 8.237488277471433e-06, + "loss": 0.4969, + "step": 5968 + }, + { + "epoch": 0.9802722065978281, + "grad_norm": 0.2569144853172575, + "learning_rate": 8.237227067953581e-06, + "loss": 0.4764, + "step": 5969 + }, + { + "epoch": 0.9804364338061708, + "grad_norm": 0.31028003969795925, + "learning_rate": 8.236965817845766e-06, + "loss": 0.4886, + "step": 5970 + }, + { + "epoch": 0.9806006610145136, + "grad_norm": 0.254519739291749, + "learning_rate": 8.236704527150826e-06, + "loss": 0.4989, + "step": 5971 + }, + { + "epoch": 0.9807648882228563, + "grad_norm": 0.4203026362129426, + "learning_rate": 8.236443195871597e-06, + "loss": 0.4991, + "step": 5972 + }, + { + "epoch": 0.9809291154311991, + "grad_norm": 0.26955631129226254, + "learning_rate": 8.23618182401092e-06, + "loss": 0.5171, + "step": 5973 + }, + { + "epoch": 0.9810933426395418, + "grad_norm": 0.2675493381061797, + "learning_rate": 8.235920411571632e-06, + "loss": 0.5036, + "step": 5974 + }, + { + "epoch": 0.9812575698478846, + "grad_norm": 0.33819000674277316, + "learning_rate": 8.235658958556573e-06, + "loss": 0.4782, + "step": 5975 + }, + { + "epoch": 0.9814217970562273, + "grad_norm": 0.2732732202700863, + "learning_rate": 8.235397464968581e-06, + "loss": 0.517, + "step": 5976 + }, + { + "epoch": 0.98158602426457, + "grad_norm": 0.2927136697546449, + "learning_rate": 8.235135930810499e-06, + "loss": 0.4975, + "step": 5977 + }, + { + "epoch": 0.9817502514729127, + "grad_norm": 0.29528840625993147, + "learning_rate": 8.234874356085165e-06, + "loss": 0.5094, + "step": 5978 + }, + { + "epoch": 0.9819144786812555, + "grad_norm": 0.28678695381696284, + "learning_rate": 8.234612740795422e-06, + "loss": 0.5042, + "step": 5979 + }, + { + "epoch": 0.9820787058895982, + "grad_norm": 0.2947556284863859, + "learning_rate": 8.23435108494411e-06, + "loss": 0.4788, + "step": 5980 + }, + { + "epoch": 0.982242933097941, + "grad_norm": 0.32944895594123025, + "learning_rate": 8.23408938853407e-06, + "loss": 0.486, + "step": 5981 + }, + { + "epoch": 0.9824071603062837, + "grad_norm": 0.2849669802030251, + "learning_rate": 8.233827651568146e-06, + "loss": 0.5017, + "step": 5982 + }, + { + "epoch": 0.9825713875146265, + "grad_norm": 0.34083531204598366, + "learning_rate": 8.23356587404918e-06, + "loss": 0.4901, + "step": 5983 + }, + { + "epoch": 0.9827356147229692, + "grad_norm": 0.44565048901596793, + "learning_rate": 8.233304055980015e-06, + "loss": 0.5058, + "step": 5984 + }, + { + "epoch": 0.982899841931312, + "grad_norm": 0.3147569511729347, + "learning_rate": 8.233042197363495e-06, + "loss": 0.5062, + "step": 5985 + }, + { + "epoch": 0.9830640691396547, + "grad_norm": 0.30713689506371955, + "learning_rate": 8.232780298202464e-06, + "loss": 0.4924, + "step": 5986 + }, + { + "epoch": 0.9832282963479975, + "grad_norm": 0.35891472000084595, + "learning_rate": 8.232518358499768e-06, + "loss": 0.4979, + "step": 5987 + }, + { + "epoch": 0.9833925235563402, + "grad_norm": 0.2610523577041328, + "learning_rate": 8.232256378258248e-06, + "loss": 0.4785, + "step": 5988 + }, + { + "epoch": 0.983556750764683, + "grad_norm": 0.29778668994780577, + "learning_rate": 8.231994357480754e-06, + "loss": 0.5078, + "step": 5989 + }, + { + "epoch": 0.9837209779730257, + "grad_norm": 0.37794507997054505, + "learning_rate": 8.231732296170127e-06, + "loss": 0.4875, + "step": 5990 + }, + { + "epoch": 0.9838852051813685, + "grad_norm": 0.3080732467400163, + "learning_rate": 8.231470194329218e-06, + "loss": 0.5156, + "step": 5991 + }, + { + "epoch": 0.9840494323897112, + "grad_norm": 0.31909404424037935, + "learning_rate": 8.23120805196087e-06, + "loss": 0.4996, + "step": 5992 + }, + { + "epoch": 0.984213659598054, + "grad_norm": 0.3035152405076065, + "learning_rate": 8.230945869067931e-06, + "loss": 0.4752, + "step": 5993 + }, + { + "epoch": 0.9843778868063966, + "grad_norm": 0.3463376211275316, + "learning_rate": 8.23068364565325e-06, + "loss": 0.4903, + "step": 5994 + }, + { + "epoch": 0.9845421140147393, + "grad_norm": 0.30248850655697274, + "learning_rate": 8.230421381719674e-06, + "loss": 0.5164, + "step": 5995 + }, + { + "epoch": 0.9847063412230821, + "grad_norm": 0.27197297877835336, + "learning_rate": 8.230159077270053e-06, + "loss": 0.5114, + "step": 5996 + }, + { + "epoch": 0.9848705684314248, + "grad_norm": 0.39503979748818113, + "learning_rate": 8.229896732307233e-06, + "loss": 0.4937, + "step": 5997 + }, + { + "epoch": 0.9850347956397676, + "grad_norm": 0.27231037109643863, + "learning_rate": 8.229634346834064e-06, + "loss": 0.4854, + "step": 5998 + }, + { + "epoch": 0.9851990228481103, + "grad_norm": 0.2743727883763561, + "learning_rate": 8.229371920853399e-06, + "loss": 0.4894, + "step": 5999 + }, + { + "epoch": 0.9853632500564531, + "grad_norm": 0.3098154188220738, + "learning_rate": 8.229109454368082e-06, + "loss": 0.4931, + "step": 6000 + }, + { + "epoch": 0.9855274772647958, + "grad_norm": 0.31105934974202104, + "learning_rate": 8.22884694738097e-06, + "loss": 0.5277, + "step": 6001 + }, + { + "epoch": 0.9856917044731386, + "grad_norm": 0.30333660429551784, + "learning_rate": 8.22858439989491e-06, + "loss": 0.4962, + "step": 6002 + }, + { + "epoch": 0.9858559316814813, + "grad_norm": 0.28194793432475335, + "learning_rate": 8.228321811912757e-06, + "loss": 0.4693, + "step": 6003 + }, + { + "epoch": 0.9860201588898241, + "grad_norm": 0.27521067135499894, + "learning_rate": 8.22805918343736e-06, + "loss": 0.4978, + "step": 6004 + }, + { + "epoch": 0.9861843860981668, + "grad_norm": 0.3516121931850756, + "learning_rate": 8.227796514471571e-06, + "loss": 0.4923, + "step": 6005 + }, + { + "epoch": 0.9863486133065096, + "grad_norm": 0.26710061007145536, + "learning_rate": 8.227533805018245e-06, + "loss": 0.5155, + "step": 6006 + }, + { + "epoch": 0.9865128405148523, + "grad_norm": 0.29224389368396897, + "learning_rate": 8.227271055080236e-06, + "loss": 0.4866, + "step": 6007 + }, + { + "epoch": 0.9866770677231951, + "grad_norm": 0.3534119652282733, + "learning_rate": 8.227008264660396e-06, + "loss": 0.5215, + "step": 6008 + }, + { + "epoch": 0.9868412949315378, + "grad_norm": 0.2726025456910224, + "learning_rate": 8.226745433761578e-06, + "loss": 0.5074, + "step": 6009 + }, + { + "epoch": 0.9870055221398806, + "grad_norm": 0.3227584179564337, + "learning_rate": 8.226482562386638e-06, + "loss": 0.476, + "step": 6010 + }, + { + "epoch": 0.9871697493482232, + "grad_norm": 0.2584528878550159, + "learning_rate": 8.226219650538432e-06, + "loss": 0.5026, + "step": 6011 + }, + { + "epoch": 0.987333976556566, + "grad_norm": 0.2983429827813741, + "learning_rate": 8.225956698219814e-06, + "loss": 0.5065, + "step": 6012 + }, + { + "epoch": 0.9874982037649087, + "grad_norm": 0.26958664034702273, + "learning_rate": 8.225693705433639e-06, + "loss": 0.5034, + "step": 6013 + }, + { + "epoch": 0.9876624309732515, + "grad_norm": 0.3463746350662079, + "learning_rate": 8.225430672182768e-06, + "loss": 0.4991, + "step": 6014 + }, + { + "epoch": 0.9878266581815942, + "grad_norm": 0.48975988570215695, + "learning_rate": 8.225167598470052e-06, + "loss": 0.4868, + "step": 6015 + }, + { + "epoch": 0.987990885389937, + "grad_norm": 0.28846554880418807, + "learning_rate": 8.22490448429835e-06, + "loss": 0.4941, + "step": 6016 + }, + { + "epoch": 0.9881551125982797, + "grad_norm": 0.25948125490643514, + "learning_rate": 8.224641329670522e-06, + "loss": 0.4822, + "step": 6017 + }, + { + "epoch": 0.9883193398066225, + "grad_norm": 0.4795604048369903, + "learning_rate": 8.224378134589426e-06, + "loss": 0.4985, + "step": 6018 + }, + { + "epoch": 0.9884835670149652, + "grad_norm": 0.27508472004944673, + "learning_rate": 8.224114899057917e-06, + "loss": 0.4824, + "step": 6019 + }, + { + "epoch": 0.988647794223308, + "grad_norm": 0.2941581038633356, + "learning_rate": 8.223851623078856e-06, + "loss": 0.4916, + "step": 6020 + }, + { + "epoch": 0.9888120214316507, + "grad_norm": 0.2857124408871979, + "learning_rate": 8.223588306655105e-06, + "loss": 0.4978, + "step": 6021 + }, + { + "epoch": 0.9889762486399934, + "grad_norm": 0.28161753210610235, + "learning_rate": 8.22332494978952e-06, + "loss": 0.4917, + "step": 6022 + }, + { + "epoch": 0.9891404758483362, + "grad_norm": 0.3404370427089407, + "learning_rate": 8.223061552484962e-06, + "loss": 0.5161, + "step": 6023 + }, + { + "epoch": 0.989304703056679, + "grad_norm": 0.29723498104859414, + "learning_rate": 8.222798114744294e-06, + "loss": 0.4953, + "step": 6024 + }, + { + "epoch": 0.9894689302650217, + "grad_norm": 0.25821568853288496, + "learning_rate": 8.222534636570375e-06, + "loss": 0.4889, + "step": 6025 + }, + { + "epoch": 0.9896331574733644, + "grad_norm": 0.34018544169293335, + "learning_rate": 8.222271117966067e-06, + "loss": 0.4987, + "step": 6026 + }, + { + "epoch": 0.9897973846817072, + "grad_norm": 0.4406773775755572, + "learning_rate": 8.222007558934234e-06, + "loss": 0.5073, + "step": 6027 + }, + { + "epoch": 0.9899616118900498, + "grad_norm": 0.5805674365221384, + "learning_rate": 8.221743959477735e-06, + "loss": 0.4827, + "step": 6028 + }, + { + "epoch": 0.9901258390983926, + "grad_norm": 0.2996142802822292, + "learning_rate": 8.221480319599435e-06, + "loss": 0.5045, + "step": 6029 + }, + { + "epoch": 0.9902900663067353, + "grad_norm": 0.29344698377790773, + "learning_rate": 8.221216639302199e-06, + "loss": 0.4996, + "step": 6030 + }, + { + "epoch": 0.9904542935150781, + "grad_norm": 0.32978471469877835, + "learning_rate": 8.220952918588888e-06, + "loss": 0.4832, + "step": 6031 + }, + { + "epoch": 0.9906185207234208, + "grad_norm": 0.348936338155547, + "learning_rate": 8.220689157462367e-06, + "loss": 0.4999, + "step": 6032 + }, + { + "epoch": 0.9907827479317636, + "grad_norm": 0.2929668547648481, + "learning_rate": 8.220425355925503e-06, + "loss": 0.4946, + "step": 6033 + }, + { + "epoch": 0.9909469751401063, + "grad_norm": 0.28969490861086555, + "learning_rate": 8.220161513981157e-06, + "loss": 0.508, + "step": 6034 + }, + { + "epoch": 0.9911112023484491, + "grad_norm": 0.30362512496995825, + "learning_rate": 8.219897631632197e-06, + "loss": 0.5036, + "step": 6035 + }, + { + "epoch": 0.9912754295567918, + "grad_norm": 0.6155640604324444, + "learning_rate": 8.21963370888149e-06, + "loss": 0.4919, + "step": 6036 + }, + { + "epoch": 0.9914396567651346, + "grad_norm": 0.37171675217371414, + "learning_rate": 8.219369745731901e-06, + "loss": 0.5144, + "step": 6037 + }, + { + "epoch": 0.9916038839734773, + "grad_norm": 0.285698244962252, + "learning_rate": 8.219105742186297e-06, + "loss": 0.5015, + "step": 6038 + }, + { + "epoch": 0.9917681111818201, + "grad_norm": 0.35864388593535235, + "learning_rate": 8.218841698247545e-06, + "loss": 0.512, + "step": 6039 + }, + { + "epoch": 0.9919323383901628, + "grad_norm": 0.277329370636313, + "learning_rate": 8.218577613918514e-06, + "loss": 0.4877, + "step": 6040 + }, + { + "epoch": 0.9920965655985056, + "grad_norm": 0.27621896049336875, + "learning_rate": 8.21831348920207e-06, + "loss": 0.4903, + "step": 6041 + }, + { + "epoch": 0.9922607928068483, + "grad_norm": 0.3112911663410362, + "learning_rate": 8.218049324101086e-06, + "loss": 0.4834, + "step": 6042 + }, + { + "epoch": 0.9924250200151911, + "grad_norm": 0.3014841204227243, + "learning_rate": 8.217785118618426e-06, + "loss": 0.4922, + "step": 6043 + }, + { + "epoch": 0.9925892472235338, + "grad_norm": 0.2767958028017139, + "learning_rate": 8.217520872756962e-06, + "loss": 0.4973, + "step": 6044 + }, + { + "epoch": 0.9927534744318764, + "grad_norm": 0.2918501643305302, + "learning_rate": 8.217256586519567e-06, + "loss": 0.5088, + "step": 6045 + }, + { + "epoch": 0.9929177016402192, + "grad_norm": 0.32321579020808244, + "learning_rate": 8.216992259909105e-06, + "loss": 0.4937, + "step": 6046 + }, + { + "epoch": 0.993081928848562, + "grad_norm": 0.29135334956770775, + "learning_rate": 8.21672789292845e-06, + "loss": 0.4867, + "step": 6047 + }, + { + "epoch": 0.9932461560569047, + "grad_norm": 0.2755437262202141, + "learning_rate": 8.216463485580474e-06, + "loss": 0.5026, + "step": 6048 + }, + { + "epoch": 0.9934103832652474, + "grad_norm": 0.28258408765853643, + "learning_rate": 8.216199037868048e-06, + "loss": 0.4922, + "step": 6049 + }, + { + "epoch": 0.9935746104735902, + "grad_norm": 0.2856610733524225, + "learning_rate": 8.215934549794043e-06, + "loss": 0.4934, + "step": 6050 + }, + { + "epoch": 0.9937388376819329, + "grad_norm": 0.26939264608863184, + "learning_rate": 8.215670021361335e-06, + "loss": 0.5089, + "step": 6051 + }, + { + "epoch": 0.9939030648902757, + "grad_norm": 0.26228522266134563, + "learning_rate": 8.215405452572793e-06, + "loss": 0.5082, + "step": 6052 + }, + { + "epoch": 0.9940672920986184, + "grad_norm": 0.2865666730968439, + "learning_rate": 8.215140843431293e-06, + "loss": 0.4986, + "step": 6053 + }, + { + "epoch": 0.9942315193069612, + "grad_norm": 0.29112888158896566, + "learning_rate": 8.21487619393971e-06, + "loss": 0.4927, + "step": 6054 + }, + { + "epoch": 0.9943957465153039, + "grad_norm": 0.32408251237526065, + "learning_rate": 8.214611504100914e-06, + "loss": 0.4819, + "step": 6055 + }, + { + "epoch": 0.9945599737236467, + "grad_norm": 0.2980623923723772, + "learning_rate": 8.214346773917784e-06, + "loss": 0.4951, + "step": 6056 + }, + { + "epoch": 0.9947242009319894, + "grad_norm": 0.3298708840715456, + "learning_rate": 8.214082003393193e-06, + "loss": 0.4933, + "step": 6057 + }, + { + "epoch": 0.9948884281403322, + "grad_norm": 0.2886454071627483, + "learning_rate": 8.213817192530015e-06, + "loss": 0.5084, + "step": 6058 + }, + { + "epoch": 0.9950526553486749, + "grad_norm": 0.26521275064187566, + "learning_rate": 8.213552341331133e-06, + "loss": 0.5027, + "step": 6059 + }, + { + "epoch": 0.9952168825570177, + "grad_norm": 0.35831906260849744, + "learning_rate": 8.213287449799416e-06, + "loss": 0.5002, + "step": 6060 + }, + { + "epoch": 0.9953811097653604, + "grad_norm": 0.49904049416821655, + "learning_rate": 8.213022517937744e-06, + "loss": 0.4946, + "step": 6061 + }, + { + "epoch": 0.9955453369737031, + "grad_norm": 0.273090783481902, + "learning_rate": 8.212757545748994e-06, + "loss": 0.5072, + "step": 6062 + }, + { + "epoch": 0.9957095641820458, + "grad_norm": 0.3750463965518032, + "learning_rate": 8.212492533236046e-06, + "loss": 0.493, + "step": 6063 + }, + { + "epoch": 0.9958737913903886, + "grad_norm": 0.2788278001894012, + "learning_rate": 8.212227480401774e-06, + "loss": 0.5033, + "step": 6064 + }, + { + "epoch": 0.9960380185987313, + "grad_norm": 0.25147500943496537, + "learning_rate": 8.211962387249062e-06, + "loss": 0.5144, + "step": 6065 + }, + { + "epoch": 0.9962022458070741, + "grad_norm": 0.32790265255873924, + "learning_rate": 8.211697253780785e-06, + "loss": 0.5175, + "step": 6066 + }, + { + "epoch": 0.9963664730154168, + "grad_norm": 0.3162485512645619, + "learning_rate": 8.211432079999824e-06, + "loss": 0.4795, + "step": 6067 + }, + { + "epoch": 0.9965307002237596, + "grad_norm": 0.31059518886608145, + "learning_rate": 8.211166865909058e-06, + "loss": 0.4959, + "step": 6068 + }, + { + "epoch": 0.9966949274321023, + "grad_norm": 0.34734654159685363, + "learning_rate": 8.210901611511371e-06, + "loss": 0.4801, + "step": 6069 + }, + { + "epoch": 0.996859154640445, + "grad_norm": 0.2947683331373121, + "learning_rate": 8.21063631680964e-06, + "loss": 0.5083, + "step": 6070 + }, + { + "epoch": 0.9970233818487878, + "grad_norm": 0.315138018398247, + "learning_rate": 8.210370981806748e-06, + "loss": 0.5118, + "step": 6071 + }, + { + "epoch": 0.9971876090571306, + "grad_norm": 0.33194756505125095, + "learning_rate": 8.210105606505577e-06, + "loss": 0.4917, + "step": 6072 + }, + { + "epoch": 0.9973518362654733, + "grad_norm": 0.3193908762263436, + "learning_rate": 8.209840190909009e-06, + "loss": 0.4971, + "step": 6073 + }, + { + "epoch": 0.997516063473816, + "grad_norm": 0.2793014691048366, + "learning_rate": 8.209574735019925e-06, + "loss": 0.4966, + "step": 6074 + }, + { + "epoch": 0.9976802906821588, + "grad_norm": 0.2966439416928268, + "learning_rate": 8.20930923884121e-06, + "loss": 0.5136, + "step": 6075 + }, + { + "epoch": 0.9978445178905015, + "grad_norm": 0.33803139976143903, + "learning_rate": 8.209043702375749e-06, + "loss": 0.5076, + "step": 6076 + }, + { + "epoch": 0.9980087450988443, + "grad_norm": 0.3729413860774098, + "learning_rate": 8.208778125626423e-06, + "loss": 0.4941, + "step": 6077 + }, + { + "epoch": 0.998172972307187, + "grad_norm": 0.29849360362127925, + "learning_rate": 8.208512508596118e-06, + "loss": 0.5089, + "step": 6078 + }, + { + "epoch": 0.9983371995155297, + "grad_norm": 0.3708493783333557, + "learning_rate": 8.208246851287717e-06, + "loss": 0.5017, + "step": 6079 + }, + { + "epoch": 0.9985014267238724, + "grad_norm": 0.2648040375203547, + "learning_rate": 8.207981153704108e-06, + "loss": 0.5087, + "step": 6080 + }, + { + "epoch": 0.9986656539322152, + "grad_norm": 0.330039886934558, + "learning_rate": 8.207715415848176e-06, + "loss": 0.4689, + "step": 6081 + }, + { + "epoch": 0.9988298811405579, + "grad_norm": 0.30651609282300624, + "learning_rate": 8.207449637722806e-06, + "loss": 0.4954, + "step": 6082 + }, + { + "epoch": 0.9989941083489007, + "grad_norm": 0.3019876534538926, + "learning_rate": 8.207183819330884e-06, + "loss": 0.5002, + "step": 6083 + }, + { + "epoch": 0.9991583355572434, + "grad_norm": 0.2769342143142423, + "learning_rate": 8.206917960675301e-06, + "loss": 0.5214, + "step": 6084 + }, + { + "epoch": 0.9993225627655862, + "grad_norm": 0.3556437500448538, + "learning_rate": 8.20665206175894e-06, + "loss": 0.4916, + "step": 6085 + }, + { + "epoch": 0.9994867899739289, + "grad_norm": 0.32734441118412566, + "learning_rate": 8.206386122584692e-06, + "loss": 0.476, + "step": 6086 + }, + { + "epoch": 0.9996510171822717, + "grad_norm": 0.33157470358823943, + "learning_rate": 8.206120143155443e-06, + "loss": 0.5219, + "step": 6087 + }, + { + "epoch": 0.9998152443906144, + "grad_norm": 0.3207924796762015, + "learning_rate": 8.205854123474083e-06, + "loss": 0.4969, + "step": 6088 + }, + { + "epoch": 0.9999794715989572, + "grad_norm": 0.268248980958867, + "learning_rate": 8.205588063543502e-06, + "loss": 0.4925, + "step": 6089 + }, + { + "epoch": 1.0001436988072998, + "grad_norm": 0.2701398571042544, + "learning_rate": 8.205321963366588e-06, + "loss": 0.4835, + "step": 6090 + }, + { + "epoch": 1.0003079260156427, + "grad_norm": 0.29488775351168806, + "learning_rate": 8.205055822946233e-06, + "loss": 0.4957, + "step": 6091 + }, + { + "epoch": 1.0004721532239853, + "grad_norm": 0.3105970052078029, + "learning_rate": 8.204789642285324e-06, + "loss": 0.4934, + "step": 6092 + }, + { + "epoch": 1.0006363804323282, + "grad_norm": 0.3507102653544947, + "learning_rate": 8.204523421386757e-06, + "loss": 0.4722, + "step": 6093 + }, + { + "epoch": 1.0008006076406708, + "grad_norm": 0.30553103864347814, + "learning_rate": 8.20425716025342e-06, + "loss": 0.4892, + "step": 6094 + }, + { + "epoch": 1.0009648348490137, + "grad_norm": 0.3301092842380056, + "learning_rate": 8.203990858888206e-06, + "loss": 0.4748, + "step": 6095 + }, + { + "epoch": 1.0011290620573563, + "grad_norm": 0.3285339943190183, + "learning_rate": 8.203724517294007e-06, + "loss": 0.5096, + "step": 6096 + }, + { + "epoch": 1.0012932892656992, + "grad_norm": 0.4022453312728853, + "learning_rate": 8.203458135473716e-06, + "loss": 0.482, + "step": 6097 + }, + { + "epoch": 1.0014575164740418, + "grad_norm": 0.3226651889997907, + "learning_rate": 8.203191713430225e-06, + "loss": 0.4779, + "step": 6098 + }, + { + "epoch": 1.0016217436823847, + "grad_norm": 0.32939182814617934, + "learning_rate": 8.20292525116643e-06, + "loss": 0.4972, + "step": 6099 + }, + { + "epoch": 1.0017859708907273, + "grad_norm": 0.3599122148730225, + "learning_rate": 8.202658748685223e-06, + "loss": 0.4785, + "step": 6100 + }, + { + "epoch": 1.0019501980990702, + "grad_norm": 0.26857771384618684, + "learning_rate": 8.202392205989498e-06, + "loss": 0.4919, + "step": 6101 + }, + { + "epoch": 1.0021144253074128, + "grad_norm": 0.25228675296021535, + "learning_rate": 8.202125623082151e-06, + "loss": 0.4978, + "step": 6102 + }, + { + "epoch": 1.0022786525157557, + "grad_norm": 0.2786770686176153, + "learning_rate": 8.20185899996608e-06, + "loss": 0.5098, + "step": 6103 + }, + { + "epoch": 1.0024428797240983, + "grad_norm": 0.27270211292454544, + "learning_rate": 8.201592336644176e-06, + "loss": 0.5052, + "step": 6104 + }, + { + "epoch": 1.002607106932441, + "grad_norm": 0.287144611347033, + "learning_rate": 8.201325633119337e-06, + "loss": 0.5038, + "step": 6105 + }, + { + "epoch": 1.0027713341407838, + "grad_norm": 0.336224498870284, + "learning_rate": 8.201058889394461e-06, + "loss": 0.4697, + "step": 6106 + }, + { + "epoch": 1.0029355613491264, + "grad_norm": 0.39484648633479735, + "learning_rate": 8.200792105472442e-06, + "loss": 0.5035, + "step": 6107 + }, + { + "epoch": 1.0030997885574693, + "grad_norm": 0.3712949110335043, + "learning_rate": 8.200525281356183e-06, + "loss": 0.4759, + "step": 6108 + }, + { + "epoch": 1.003264015765812, + "grad_norm": 0.270083642715779, + "learning_rate": 8.200258417048578e-06, + "loss": 0.5062, + "step": 6109 + }, + { + "epoch": 1.0034282429741548, + "grad_norm": 0.35149759365651945, + "learning_rate": 8.199991512552525e-06, + "loss": 0.485, + "step": 6110 + }, + { + "epoch": 1.0035924701824974, + "grad_norm": 0.27952436185145424, + "learning_rate": 8.199724567870923e-06, + "loss": 0.4849, + "step": 6111 + }, + { + "epoch": 1.0037566973908403, + "grad_norm": 0.31188337577290126, + "learning_rate": 8.199457583006671e-06, + "loss": 0.5232, + "step": 6112 + }, + { + "epoch": 1.003920924599183, + "grad_norm": 0.36877171718415647, + "learning_rate": 8.199190557962673e-06, + "loss": 0.4813, + "step": 6113 + }, + { + "epoch": 1.0040851518075258, + "grad_norm": 0.29489431860375687, + "learning_rate": 8.198923492741825e-06, + "loss": 0.5065, + "step": 6114 + }, + { + "epoch": 1.0042493790158684, + "grad_norm": 0.2871256782320963, + "learning_rate": 8.198656387347028e-06, + "loss": 0.4967, + "step": 6115 + }, + { + "epoch": 1.0044136062242113, + "grad_norm": 0.3253782587546533, + "learning_rate": 8.198389241781185e-06, + "loss": 0.5157, + "step": 6116 + }, + { + "epoch": 1.004577833432554, + "grad_norm": 0.2866600487798448, + "learning_rate": 8.198122056047195e-06, + "loss": 0.5116, + "step": 6117 + }, + { + "epoch": 1.0047420606408968, + "grad_norm": 0.3204737775521936, + "learning_rate": 8.197854830147961e-06, + "loss": 0.4931, + "step": 6118 + }, + { + "epoch": 1.0049062878492394, + "grad_norm": 0.305333919369849, + "learning_rate": 8.197587564086386e-06, + "loss": 0.495, + "step": 6119 + }, + { + "epoch": 1.0050705150575823, + "grad_norm": 0.3240184222494894, + "learning_rate": 8.19732025786537e-06, + "loss": 0.4901, + "step": 6120 + }, + { + "epoch": 1.005234742265925, + "grad_norm": 0.2586365842115682, + "learning_rate": 8.197052911487823e-06, + "loss": 0.5028, + "step": 6121 + }, + { + "epoch": 1.0053989694742675, + "grad_norm": 0.7875093702083333, + "learning_rate": 8.19678552495664e-06, + "loss": 0.4758, + "step": 6122 + }, + { + "epoch": 1.0055631966826104, + "grad_norm": 0.2959337138717507, + "learning_rate": 8.19651809827473e-06, + "loss": 0.5094, + "step": 6123 + }, + { + "epoch": 1.005727423890953, + "grad_norm": 0.36942449603120836, + "learning_rate": 8.196250631444996e-06, + "loss": 0.497, + "step": 6124 + }, + { + "epoch": 1.005891651099296, + "grad_norm": 0.2877734449312031, + "learning_rate": 8.195983124470346e-06, + "loss": 0.4854, + "step": 6125 + }, + { + "epoch": 1.0060558783076385, + "grad_norm": 0.285412931493051, + "learning_rate": 8.19571557735368e-06, + "loss": 0.4869, + "step": 6126 + }, + { + "epoch": 1.0062201055159814, + "grad_norm": 0.29427375436401393, + "learning_rate": 8.195447990097908e-06, + "loss": 0.4873, + "step": 6127 + }, + { + "epoch": 1.006384332724324, + "grad_norm": 0.3929275658433873, + "learning_rate": 8.195180362705935e-06, + "loss": 0.4779, + "step": 6128 + }, + { + "epoch": 1.006548559932667, + "grad_norm": 0.34591623515733444, + "learning_rate": 8.194912695180668e-06, + "loss": 0.494, + "step": 6129 + }, + { + "epoch": 1.0067127871410095, + "grad_norm": 0.30622477678448884, + "learning_rate": 8.194644987525013e-06, + "loss": 0.4757, + "step": 6130 + }, + { + "epoch": 1.0068770143493524, + "grad_norm": 0.29008846351927825, + "learning_rate": 8.194377239741879e-06, + "loss": 0.4972, + "step": 6131 + }, + { + "epoch": 1.007041241557695, + "grad_norm": 0.2901644160895777, + "learning_rate": 8.194109451834172e-06, + "loss": 0.4962, + "step": 6132 + }, + { + "epoch": 1.007205468766038, + "grad_norm": 0.39246471775157815, + "learning_rate": 8.193841623804803e-06, + "loss": 0.5022, + "step": 6133 + }, + { + "epoch": 1.0073696959743805, + "grad_norm": 0.33816828096835855, + "learning_rate": 8.193573755656681e-06, + "loss": 0.4755, + "step": 6134 + }, + { + "epoch": 1.0075339231827234, + "grad_norm": 0.2882989726136812, + "learning_rate": 8.193305847392713e-06, + "loss": 0.4939, + "step": 6135 + }, + { + "epoch": 1.007698150391066, + "grad_norm": 0.8477706290899284, + "learning_rate": 8.193037899015809e-06, + "loss": 0.4873, + "step": 6136 + }, + { + "epoch": 1.0078623775994089, + "grad_norm": 0.4657233345201176, + "learning_rate": 8.19276991052888e-06, + "loss": 0.489, + "step": 6137 + }, + { + "epoch": 1.0080266048077515, + "grad_norm": 0.3198771301938268, + "learning_rate": 8.192501881934838e-06, + "loss": 0.5226, + "step": 6138 + }, + { + "epoch": 1.0081908320160942, + "grad_norm": 0.37111830598748036, + "learning_rate": 8.192233813236591e-06, + "loss": 0.4778, + "step": 6139 + }, + { + "epoch": 1.008355059224437, + "grad_norm": 0.29241114461127055, + "learning_rate": 8.191965704437053e-06, + "loss": 0.4817, + "step": 6140 + }, + { + "epoch": 1.0085192864327797, + "grad_norm": 0.3005976492011157, + "learning_rate": 8.191697555539135e-06, + "loss": 0.4732, + "step": 6141 + }, + { + "epoch": 1.0086835136411225, + "grad_norm": 0.2967960749197657, + "learning_rate": 8.19142936654575e-06, + "loss": 0.4941, + "step": 6142 + }, + { + "epoch": 1.0088477408494652, + "grad_norm": 0.28884899020507654, + "learning_rate": 8.191161137459809e-06, + "loss": 0.4748, + "step": 6143 + }, + { + "epoch": 1.009011968057808, + "grad_norm": 0.525761885715074, + "learning_rate": 8.190892868284228e-06, + "loss": 0.4712, + "step": 6144 + }, + { + "epoch": 1.0091761952661507, + "grad_norm": 0.31060904710486237, + "learning_rate": 8.190624559021916e-06, + "loss": 0.4727, + "step": 6145 + }, + { + "epoch": 1.0093404224744935, + "grad_norm": 0.24722628302120625, + "learning_rate": 8.190356209675793e-06, + "loss": 0.4859, + "step": 6146 + }, + { + "epoch": 1.0095046496828362, + "grad_norm": 0.3700887463419303, + "learning_rate": 8.19008782024877e-06, + "loss": 0.4817, + "step": 6147 + }, + { + "epoch": 1.009668876891179, + "grad_norm": 0.28728444716481255, + "learning_rate": 8.189819390743762e-06, + "loss": 0.4808, + "step": 6148 + }, + { + "epoch": 1.0098331040995217, + "grad_norm": 0.29078167293740487, + "learning_rate": 8.189550921163685e-06, + "loss": 0.4983, + "step": 6149 + }, + { + "epoch": 1.0099973313078645, + "grad_norm": 0.3636550521570456, + "learning_rate": 8.189282411511457e-06, + "loss": 0.471, + "step": 6150 + }, + { + "epoch": 1.0101615585162071, + "grad_norm": 1.2961617563754146, + "learning_rate": 8.18901386178999e-06, + "loss": 0.499, + "step": 6151 + }, + { + "epoch": 1.01032578572455, + "grad_norm": 0.31558961032524924, + "learning_rate": 8.188745272002206e-06, + "loss": 0.5054, + "step": 6152 + }, + { + "epoch": 1.0104900129328926, + "grad_norm": 0.3187404488115447, + "learning_rate": 8.188476642151016e-06, + "loss": 0.4903, + "step": 6153 + }, + { + "epoch": 1.0106542401412355, + "grad_norm": 0.4396331603761133, + "learning_rate": 8.188207972239343e-06, + "loss": 0.5123, + "step": 6154 + }, + { + "epoch": 1.0108184673495781, + "grad_norm": 0.3098010717106893, + "learning_rate": 8.187939262270101e-06, + "loss": 0.5034, + "step": 6155 + }, + { + "epoch": 1.0109826945579208, + "grad_norm": 0.5654184895396511, + "learning_rate": 8.18767051224621e-06, + "loss": 0.494, + "step": 6156 + }, + { + "epoch": 1.0111469217662636, + "grad_norm": 0.3515168501435642, + "learning_rate": 8.18740172217059e-06, + "loss": 0.4978, + "step": 6157 + }, + { + "epoch": 1.0113111489746063, + "grad_norm": 0.3442198924300222, + "learning_rate": 8.18713289204616e-06, + "loss": 0.4948, + "step": 6158 + }, + { + "epoch": 1.0114753761829491, + "grad_norm": 0.2753283941014209, + "learning_rate": 8.186864021875836e-06, + "loss": 0.5203, + "step": 6159 + }, + { + "epoch": 1.0116396033912918, + "grad_norm": 0.27913090907463, + "learning_rate": 8.186595111662544e-06, + "loss": 0.4882, + "step": 6160 + }, + { + "epoch": 1.0118038305996346, + "grad_norm": 0.2762978360605873, + "learning_rate": 8.186326161409202e-06, + "loss": 0.4912, + "step": 6161 + }, + { + "epoch": 1.0119680578079773, + "grad_norm": 0.286562286465793, + "learning_rate": 8.186057171118731e-06, + "loss": 0.4877, + "step": 6162 + }, + { + "epoch": 1.0121322850163201, + "grad_norm": 0.33537335818622976, + "learning_rate": 8.185788140794053e-06, + "loss": 0.4776, + "step": 6163 + }, + { + "epoch": 1.0122965122246628, + "grad_norm": 0.39338628713320906, + "learning_rate": 8.18551907043809e-06, + "loss": 0.5107, + "step": 6164 + }, + { + "epoch": 1.0124607394330056, + "grad_norm": 0.2926564624467248, + "learning_rate": 8.18524996005376e-06, + "loss": 0.5054, + "step": 6165 + }, + { + "epoch": 1.0126249666413483, + "grad_norm": 0.3033449050165952, + "learning_rate": 8.184980809643992e-06, + "loss": 0.5057, + "step": 6166 + }, + { + "epoch": 1.0127891938496911, + "grad_norm": 0.34123205757993946, + "learning_rate": 8.184711619211708e-06, + "loss": 0.4784, + "step": 6167 + }, + { + "epoch": 1.0129534210580338, + "grad_norm": 0.4648784639014738, + "learning_rate": 8.18444238875983e-06, + "loss": 0.5068, + "step": 6168 + }, + { + "epoch": 1.0131176482663766, + "grad_norm": 0.3461201369794482, + "learning_rate": 8.184173118291282e-06, + "loss": 0.4855, + "step": 6169 + }, + { + "epoch": 1.0132818754747193, + "grad_norm": 0.3150133852802901, + "learning_rate": 8.183903807808989e-06, + "loss": 0.4996, + "step": 6170 + }, + { + "epoch": 1.0134461026830621, + "grad_norm": 0.401520522556826, + "learning_rate": 8.183634457315875e-06, + "loss": 0.4812, + "step": 6171 + }, + { + "epoch": 1.0136103298914048, + "grad_norm": 0.3923158095446052, + "learning_rate": 8.18336506681487e-06, + "loss": 0.4672, + "step": 6172 + }, + { + "epoch": 1.0137745570997474, + "grad_norm": 0.3516625662622814, + "learning_rate": 8.183095636308895e-06, + "loss": 0.5008, + "step": 6173 + }, + { + "epoch": 1.0139387843080903, + "grad_norm": 0.3250018573957792, + "learning_rate": 8.182826165800877e-06, + "loss": 0.5109, + "step": 6174 + }, + { + "epoch": 1.014103011516433, + "grad_norm": 0.28774650547836594, + "learning_rate": 8.182556655293743e-06, + "loss": 0.477, + "step": 6175 + }, + { + "epoch": 1.0142672387247758, + "grad_norm": 0.3476177081030143, + "learning_rate": 8.182287104790421e-06, + "loss": 0.4912, + "step": 6176 + }, + { + "epoch": 1.0144314659331184, + "grad_norm": 0.41693100296127283, + "learning_rate": 8.182017514293839e-06, + "loss": 0.5074, + "step": 6177 + }, + { + "epoch": 1.0145956931414613, + "grad_norm": 0.3266041736928415, + "learning_rate": 8.181747883806924e-06, + "loss": 0.4978, + "step": 6178 + }, + { + "epoch": 1.014759920349804, + "grad_norm": 0.3506095610631528, + "learning_rate": 8.181478213332604e-06, + "loss": 0.4827, + "step": 6179 + }, + { + "epoch": 1.0149241475581467, + "grad_norm": 0.2756591884041051, + "learning_rate": 8.181208502873809e-06, + "loss": 0.4806, + "step": 6180 + }, + { + "epoch": 1.0150883747664894, + "grad_norm": 0.2498743413864132, + "learning_rate": 8.180938752433467e-06, + "loss": 0.4937, + "step": 6181 + }, + { + "epoch": 1.0152526019748322, + "grad_norm": 0.32540974513603127, + "learning_rate": 8.180668962014509e-06, + "loss": 0.5095, + "step": 6182 + }, + { + "epoch": 1.0154168291831749, + "grad_norm": 0.29822184379944644, + "learning_rate": 8.180399131619865e-06, + "loss": 0.4799, + "step": 6183 + }, + { + "epoch": 1.0155810563915177, + "grad_norm": 0.29922201319644454, + "learning_rate": 8.180129261252465e-06, + "loss": 0.4823, + "step": 6184 + }, + { + "epoch": 1.0157452835998604, + "grad_norm": 0.3408181411078289, + "learning_rate": 8.17985935091524e-06, + "loss": 0.508, + "step": 6185 + }, + { + "epoch": 1.0159095108082032, + "grad_norm": 0.3017704206443717, + "learning_rate": 8.179589400611124e-06, + "loss": 0.4762, + "step": 6186 + }, + { + "epoch": 1.0160737380165459, + "grad_norm": 0.28087313141512144, + "learning_rate": 8.179319410343046e-06, + "loss": 0.4864, + "step": 6187 + }, + { + "epoch": 1.0162379652248887, + "grad_norm": 0.343968512634284, + "learning_rate": 8.17904938011394e-06, + "loss": 0.518, + "step": 6188 + }, + { + "epoch": 1.0164021924332314, + "grad_norm": 0.35943975526034677, + "learning_rate": 8.178779309926736e-06, + "loss": 0.4914, + "step": 6189 + }, + { + "epoch": 1.016566419641574, + "grad_norm": 0.30449402360044453, + "learning_rate": 8.17850919978437e-06, + "loss": 0.5151, + "step": 6190 + }, + { + "epoch": 1.0167306468499169, + "grad_norm": 0.4024272798430009, + "learning_rate": 8.178239049689776e-06, + "loss": 0.5001, + "step": 6191 + }, + { + "epoch": 1.0168948740582595, + "grad_norm": 0.42775826191503635, + "learning_rate": 8.177968859645886e-06, + "loss": 0.4927, + "step": 6192 + }, + { + "epoch": 1.0170591012666024, + "grad_norm": 0.3054641612310703, + "learning_rate": 8.177698629655635e-06, + "loss": 0.4729, + "step": 6193 + }, + { + "epoch": 1.017223328474945, + "grad_norm": 0.3662068760964567, + "learning_rate": 8.177428359721959e-06, + "loss": 0.4779, + "step": 6194 + }, + { + "epoch": 1.0173875556832879, + "grad_norm": 0.30360510720936423, + "learning_rate": 8.177158049847793e-06, + "loss": 0.4873, + "step": 6195 + }, + { + "epoch": 1.0175517828916305, + "grad_norm": 0.3055845582263593, + "learning_rate": 8.176887700036074e-06, + "loss": 0.5044, + "step": 6196 + }, + { + "epoch": 1.0177160100999734, + "grad_norm": 0.30314495024705285, + "learning_rate": 8.176617310289734e-06, + "loss": 0.5077, + "step": 6197 + }, + { + "epoch": 1.017880237308316, + "grad_norm": 0.27451553538847134, + "learning_rate": 8.176346880611716e-06, + "loss": 0.4883, + "step": 6198 + }, + { + "epoch": 1.0180444645166589, + "grad_norm": 0.29349998728599647, + "learning_rate": 8.176076411004954e-06, + "loss": 0.4968, + "step": 6199 + }, + { + "epoch": 1.0182086917250015, + "grad_norm": 0.690790330399613, + "learning_rate": 8.175805901472382e-06, + "loss": 0.5199, + "step": 6200 + }, + { + "epoch": 1.0183729189333444, + "grad_norm": 0.39758600380845943, + "learning_rate": 8.175535352016944e-06, + "loss": 0.5017, + "step": 6201 + }, + { + "epoch": 1.018537146141687, + "grad_norm": 0.30757472221998394, + "learning_rate": 8.175264762641575e-06, + "loss": 0.4902, + "step": 6202 + }, + { + "epoch": 1.0187013733500299, + "grad_norm": 0.2635507175010998, + "learning_rate": 8.174994133349214e-06, + "loss": 0.4966, + "step": 6203 + }, + { + "epoch": 1.0188656005583725, + "grad_norm": 0.345398957493199, + "learning_rate": 8.174723464142802e-06, + "loss": 0.4957, + "step": 6204 + }, + { + "epoch": 1.0190298277667154, + "grad_norm": 0.27142952139291604, + "learning_rate": 8.174452755025279e-06, + "loss": 0.4826, + "step": 6205 + }, + { + "epoch": 1.019194054975058, + "grad_norm": 0.2915572983361061, + "learning_rate": 8.174182005999583e-06, + "loss": 0.4927, + "step": 6206 + }, + { + "epoch": 1.0193582821834006, + "grad_norm": 0.27599364042760827, + "learning_rate": 8.173911217068654e-06, + "loss": 0.5096, + "step": 6207 + }, + { + "epoch": 1.0195225093917435, + "grad_norm": 0.29414455181960725, + "learning_rate": 8.173640388235436e-06, + "loss": 0.5038, + "step": 6208 + }, + { + "epoch": 1.0196867366000861, + "grad_norm": 0.342711202223622, + "learning_rate": 8.173369519502868e-06, + "loss": 0.4912, + "step": 6209 + }, + { + "epoch": 1.019850963808429, + "grad_norm": 0.2670830751046088, + "learning_rate": 8.173098610873893e-06, + "loss": 0.4981, + "step": 6210 + }, + { + "epoch": 1.0200151910167716, + "grad_norm": 0.2999878589401095, + "learning_rate": 8.172827662351455e-06, + "loss": 0.4759, + "step": 6211 + }, + { + "epoch": 1.0201794182251145, + "grad_norm": 0.31873822598064333, + "learning_rate": 8.172556673938493e-06, + "loss": 0.4731, + "step": 6212 + }, + { + "epoch": 1.0203436454334571, + "grad_norm": 0.2742035415040891, + "learning_rate": 8.172285645637952e-06, + "loss": 0.4674, + "step": 6213 + }, + { + "epoch": 1.0205078726418, + "grad_norm": 0.31574870554300666, + "learning_rate": 8.172014577452778e-06, + "loss": 0.4775, + "step": 6214 + }, + { + "epoch": 1.0206720998501426, + "grad_norm": 0.31740606222826007, + "learning_rate": 8.17174346938591e-06, + "loss": 0.5123, + "step": 6215 + }, + { + "epoch": 1.0208363270584855, + "grad_norm": 0.27353324703910803, + "learning_rate": 8.171472321440297e-06, + "loss": 0.4837, + "step": 6216 + }, + { + "epoch": 1.0210005542668281, + "grad_norm": 0.2944184923332211, + "learning_rate": 8.171201133618883e-06, + "loss": 0.4803, + "step": 6217 + }, + { + "epoch": 1.021164781475171, + "grad_norm": 0.3013537117744429, + "learning_rate": 8.170929905924613e-06, + "loss": 0.4995, + "step": 6218 + }, + { + "epoch": 1.0213290086835136, + "grad_norm": 0.44433352396382897, + "learning_rate": 8.17065863836043e-06, + "loss": 0.4857, + "step": 6219 + }, + { + "epoch": 1.0214932358918565, + "grad_norm": 0.363216757817997, + "learning_rate": 8.170387330929286e-06, + "loss": 0.4831, + "step": 6220 + }, + { + "epoch": 1.0216574631001991, + "grad_norm": 0.2788261710336551, + "learning_rate": 8.170115983634123e-06, + "loss": 0.484, + "step": 6221 + }, + { + "epoch": 1.021821690308542, + "grad_norm": 0.31049045914006534, + "learning_rate": 8.169844596477889e-06, + "loss": 0.4982, + "step": 6222 + }, + { + "epoch": 1.0219859175168846, + "grad_norm": 0.31160712706194205, + "learning_rate": 8.169573169463534e-06, + "loss": 0.5045, + "step": 6223 + }, + { + "epoch": 1.0221501447252272, + "grad_norm": 0.26787311763186605, + "learning_rate": 8.169301702594002e-06, + "loss": 0.4912, + "step": 6224 + }, + { + "epoch": 1.02231437193357, + "grad_norm": 0.28728606908000814, + "learning_rate": 8.169030195872242e-06, + "loss": 0.5129, + "step": 6225 + }, + { + "epoch": 1.0224785991419127, + "grad_norm": 0.4256925386572884, + "learning_rate": 8.168758649301207e-06, + "loss": 0.4969, + "step": 6226 + }, + { + "epoch": 1.0226428263502556, + "grad_norm": 0.2743511180790035, + "learning_rate": 8.168487062883842e-06, + "loss": 0.4909, + "step": 6227 + }, + { + "epoch": 1.0228070535585982, + "grad_norm": 0.3205816659724719, + "learning_rate": 8.168215436623099e-06, + "loss": 0.5116, + "step": 6228 + }, + { + "epoch": 1.022971280766941, + "grad_norm": 0.2797160146405526, + "learning_rate": 8.167943770521928e-06, + "loss": 0.4889, + "step": 6229 + }, + { + "epoch": 1.0231355079752837, + "grad_norm": 0.3298401016695871, + "learning_rate": 8.167672064583277e-06, + "loss": 0.4948, + "step": 6230 + }, + { + "epoch": 1.0232997351836266, + "grad_norm": 0.2820165824015003, + "learning_rate": 8.1674003188101e-06, + "loss": 0.4816, + "step": 6231 + }, + { + "epoch": 1.0234639623919692, + "grad_norm": 0.7516619960945273, + "learning_rate": 8.167128533205348e-06, + "loss": 0.4978, + "step": 6232 + }, + { + "epoch": 1.023628189600312, + "grad_norm": 0.3242252762838541, + "learning_rate": 8.16685670777197e-06, + "loss": 0.4701, + "step": 6233 + }, + { + "epoch": 1.0237924168086547, + "grad_norm": 0.28078186626292395, + "learning_rate": 8.166584842512922e-06, + "loss": 0.5035, + "step": 6234 + }, + { + "epoch": 1.0239566440169976, + "grad_norm": 0.2892632624945348, + "learning_rate": 8.166312937431154e-06, + "loss": 0.482, + "step": 6235 + }, + { + "epoch": 1.0241208712253402, + "grad_norm": 0.30068090471091746, + "learning_rate": 8.166040992529623e-06, + "loss": 0.4935, + "step": 6236 + }, + { + "epoch": 1.024285098433683, + "grad_norm": 0.30986046486983354, + "learning_rate": 8.165769007811278e-06, + "loss": 0.503, + "step": 6237 + }, + { + "epoch": 1.0244493256420257, + "grad_norm": 0.2652243823608064, + "learning_rate": 8.165496983279075e-06, + "loss": 0.4862, + "step": 6238 + }, + { + "epoch": 1.0246135528503686, + "grad_norm": 0.38129905728032715, + "learning_rate": 8.165224918935968e-06, + "loss": 0.5033, + "step": 6239 + }, + { + "epoch": 1.0247777800587112, + "grad_norm": 0.3330210906119507, + "learning_rate": 8.164952814784913e-06, + "loss": 0.4925, + "step": 6240 + }, + { + "epoch": 1.0249420072670539, + "grad_norm": 0.6285421145833283, + "learning_rate": 8.164680670828864e-06, + "loss": 0.4926, + "step": 6241 + }, + { + "epoch": 1.0251062344753967, + "grad_norm": 0.28295504725253984, + "learning_rate": 8.164408487070778e-06, + "loss": 0.4815, + "step": 6242 + }, + { + "epoch": 1.0252704616837394, + "grad_norm": 0.34828429823959156, + "learning_rate": 8.164136263513609e-06, + "loss": 0.4895, + "step": 6243 + }, + { + "epoch": 1.0254346888920822, + "grad_norm": 0.33668507831640215, + "learning_rate": 8.163864000160318e-06, + "loss": 0.5016, + "step": 6244 + }, + { + "epoch": 1.0255989161004249, + "grad_norm": 0.3371227103810928, + "learning_rate": 8.163591697013857e-06, + "loss": 0.4818, + "step": 6245 + }, + { + "epoch": 1.0257631433087677, + "grad_norm": 0.37006258498862343, + "learning_rate": 8.163319354077188e-06, + "loss": 0.4906, + "step": 6246 + }, + { + "epoch": 1.0259273705171104, + "grad_norm": 0.3321561343583107, + "learning_rate": 8.163046971353263e-06, + "loss": 0.4972, + "step": 6247 + }, + { + "epoch": 1.0260915977254532, + "grad_norm": 0.33758474607503053, + "learning_rate": 8.162774548845047e-06, + "loss": 0.4886, + "step": 6248 + }, + { + "epoch": 1.0262558249337959, + "grad_norm": 0.5719844368152448, + "learning_rate": 8.162502086555494e-06, + "loss": 0.4776, + "step": 6249 + }, + { + "epoch": 1.0264200521421387, + "grad_norm": 0.3230093959766538, + "learning_rate": 8.162229584487566e-06, + "loss": 0.4895, + "step": 6250 + }, + { + "epoch": 1.0265842793504814, + "grad_norm": 0.27381092738906676, + "learning_rate": 8.16195704264422e-06, + "loss": 0.4846, + "step": 6251 + }, + { + "epoch": 1.0267485065588242, + "grad_norm": 0.3576355296217209, + "learning_rate": 8.16168446102842e-06, + "loss": 0.5213, + "step": 6252 + }, + { + "epoch": 1.0269127337671669, + "grad_norm": 0.5889585564078031, + "learning_rate": 8.161411839643121e-06, + "loss": 0.5044, + "step": 6253 + }, + { + "epoch": 1.0270769609755097, + "grad_norm": 0.2979480569514252, + "learning_rate": 8.161139178491291e-06, + "loss": 0.4715, + "step": 6254 + }, + { + "epoch": 1.0272411881838523, + "grad_norm": 0.2771883322556505, + "learning_rate": 8.160866477575885e-06, + "loss": 0.4982, + "step": 6255 + }, + { + "epoch": 1.0274054153921952, + "grad_norm": 0.27340407742016054, + "learning_rate": 8.160593736899869e-06, + "loss": 0.494, + "step": 6256 + }, + { + "epoch": 1.0275696426005378, + "grad_norm": 0.3568661465565456, + "learning_rate": 8.1603209564662e-06, + "loss": 0.5152, + "step": 6257 + }, + { + "epoch": 1.0277338698088805, + "grad_norm": 0.27665099872106325, + "learning_rate": 8.160048136277846e-06, + "loss": 0.5109, + "step": 6258 + }, + { + "epoch": 1.0278980970172233, + "grad_norm": 0.31119032990878753, + "learning_rate": 8.15977527633777e-06, + "loss": 0.5023, + "step": 6259 + }, + { + "epoch": 1.028062324225566, + "grad_norm": 0.316524222391938, + "learning_rate": 8.159502376648932e-06, + "loss": 0.4826, + "step": 6260 + }, + { + "epoch": 1.0282265514339088, + "grad_norm": 0.3633517365497554, + "learning_rate": 8.159229437214298e-06, + "loss": 0.4991, + "step": 6261 + }, + { + "epoch": 1.0283907786422515, + "grad_norm": 0.2734262319354803, + "learning_rate": 8.158956458036833e-06, + "loss": 0.4845, + "step": 6262 + }, + { + "epoch": 1.0285550058505943, + "grad_norm": 0.3625579511734066, + "learning_rate": 8.158683439119499e-06, + "loss": 0.4943, + "step": 6263 + }, + { + "epoch": 1.028719233058937, + "grad_norm": 0.4547336209639623, + "learning_rate": 8.158410380465264e-06, + "loss": 0.4927, + "step": 6264 + }, + { + "epoch": 1.0288834602672798, + "grad_norm": 0.2978449984905939, + "learning_rate": 8.158137282077095e-06, + "loss": 0.5134, + "step": 6265 + }, + { + "epoch": 1.0290476874756225, + "grad_norm": 0.3792401828910724, + "learning_rate": 8.157864143957952e-06, + "loss": 0.4817, + "step": 6266 + }, + { + "epoch": 1.0292119146839653, + "grad_norm": 0.2943182894634262, + "learning_rate": 8.157590966110808e-06, + "loss": 0.4708, + "step": 6267 + }, + { + "epoch": 1.029376141892308, + "grad_norm": 0.29181766258028285, + "learning_rate": 8.157317748538628e-06, + "loss": 0.499, + "step": 6268 + }, + { + "epoch": 1.0295403691006508, + "grad_norm": 0.35172680386374233, + "learning_rate": 8.157044491244378e-06, + "loss": 0.488, + "step": 6269 + }, + { + "epoch": 1.0297045963089935, + "grad_norm": 0.4369680201670247, + "learning_rate": 8.156771194231026e-06, + "loss": 0.4881, + "step": 6270 + }, + { + "epoch": 1.0298688235173363, + "grad_norm": 0.32279653310986156, + "learning_rate": 8.156497857501543e-06, + "loss": 0.511, + "step": 6271 + }, + { + "epoch": 1.030033050725679, + "grad_norm": 0.29216682293506124, + "learning_rate": 8.156224481058893e-06, + "loss": 0.5069, + "step": 6272 + }, + { + "epoch": 1.0301972779340218, + "grad_norm": 0.3398346029618168, + "learning_rate": 8.155951064906052e-06, + "loss": 0.5047, + "step": 6273 + }, + { + "epoch": 1.0303615051423645, + "grad_norm": 0.2996315228865986, + "learning_rate": 8.155677609045982e-06, + "loss": 0.4937, + "step": 6274 + }, + { + "epoch": 1.030525732350707, + "grad_norm": 0.36267652125254585, + "learning_rate": 8.155404113481658e-06, + "loss": 0.5147, + "step": 6275 + }, + { + "epoch": 1.03068995955905, + "grad_norm": 0.3672582502381371, + "learning_rate": 8.155130578216048e-06, + "loss": 0.5099, + "step": 6276 + }, + { + "epoch": 1.0308541867673926, + "grad_norm": 0.30163867575416514, + "learning_rate": 8.154857003252125e-06, + "loss": 0.5172, + "step": 6277 + }, + { + "epoch": 1.0310184139757355, + "grad_norm": 0.3176725462606026, + "learning_rate": 8.154583388592858e-06, + "loss": 0.5058, + "step": 6278 + }, + { + "epoch": 1.031182641184078, + "grad_norm": 0.28021080159548745, + "learning_rate": 8.154309734241219e-06, + "loss": 0.4831, + "step": 6279 + }, + { + "epoch": 1.031346868392421, + "grad_norm": 0.3046852140004648, + "learning_rate": 8.154036040200182e-06, + "loss": 0.4731, + "step": 6280 + }, + { + "epoch": 1.0315110956007636, + "grad_norm": 0.34338511193132853, + "learning_rate": 8.153762306472718e-06, + "loss": 0.4932, + "step": 6281 + }, + { + "epoch": 1.0316753228091065, + "grad_norm": 0.33288344314950746, + "learning_rate": 8.153488533061803e-06, + "loss": 0.4729, + "step": 6282 + }, + { + "epoch": 1.031839550017449, + "grad_norm": 0.28550101428400565, + "learning_rate": 8.153214719970404e-06, + "loss": 0.5082, + "step": 6283 + }, + { + "epoch": 1.032003777225792, + "grad_norm": 0.3258304073804269, + "learning_rate": 8.152940867201502e-06, + "loss": 0.4795, + "step": 6284 + }, + { + "epoch": 1.0321680044341346, + "grad_norm": 0.2961807049535856, + "learning_rate": 8.152666974758068e-06, + "loss": 0.4791, + "step": 6285 + }, + { + "epoch": 1.0323322316424774, + "grad_norm": 0.3435258210556305, + "learning_rate": 8.152393042643075e-06, + "loss": 0.5016, + "step": 6286 + }, + { + "epoch": 1.03249645885082, + "grad_norm": 0.3171183540701704, + "learning_rate": 8.1521190708595e-06, + "loss": 0.5012, + "step": 6287 + }, + { + "epoch": 1.032660686059163, + "grad_norm": 0.3163255037337862, + "learning_rate": 8.15184505941032e-06, + "loss": 0.4734, + "step": 6288 + }, + { + "epoch": 1.0328249132675056, + "grad_norm": 0.28360770056732126, + "learning_rate": 8.15157100829851e-06, + "loss": 0.5327, + "step": 6289 + }, + { + "epoch": 1.0329891404758484, + "grad_norm": 0.3295009314736953, + "learning_rate": 8.151296917527048e-06, + "loss": 0.4601, + "step": 6290 + }, + { + "epoch": 1.033153367684191, + "grad_norm": 0.3109452542130047, + "learning_rate": 8.151022787098904e-06, + "loss": 0.4886, + "step": 6291 + }, + { + "epoch": 1.0333175948925337, + "grad_norm": 0.3422775777883866, + "learning_rate": 8.150748617017064e-06, + "loss": 0.4804, + "step": 6292 + }, + { + "epoch": 1.0334818221008766, + "grad_norm": 0.32723287713230975, + "learning_rate": 8.150474407284502e-06, + "loss": 0.5031, + "step": 6293 + }, + { + "epoch": 1.0336460493092192, + "grad_norm": 0.3649000496586239, + "learning_rate": 8.150200157904194e-06, + "loss": 0.4876, + "step": 6294 + }, + { + "epoch": 1.033810276517562, + "grad_norm": 0.2602417938122742, + "learning_rate": 8.149925868879123e-06, + "loss": 0.4804, + "step": 6295 + }, + { + "epoch": 1.0339745037259047, + "grad_norm": 0.2759579393389584, + "learning_rate": 8.149651540212267e-06, + "loss": 0.4981, + "step": 6296 + }, + { + "epoch": 1.0341387309342476, + "grad_norm": 0.2666833973790393, + "learning_rate": 8.149377171906601e-06, + "loss": 0.4779, + "step": 6297 + }, + { + "epoch": 1.0343029581425902, + "grad_norm": 0.42156453734766997, + "learning_rate": 8.149102763965112e-06, + "loss": 0.5211, + "step": 6298 + }, + { + "epoch": 1.034467185350933, + "grad_norm": 0.2823997950925321, + "learning_rate": 8.148828316390776e-06, + "loss": 0.496, + "step": 6299 + }, + { + "epoch": 1.0346314125592757, + "grad_norm": 0.34982207290752043, + "learning_rate": 8.148553829186573e-06, + "loss": 0.4689, + "step": 6300 + }, + { + "epoch": 1.0347956397676186, + "grad_norm": 0.391208131768673, + "learning_rate": 8.148279302355487e-06, + "loss": 0.4973, + "step": 6301 + }, + { + "epoch": 1.0349598669759612, + "grad_norm": 0.28742279855847286, + "learning_rate": 8.148004735900498e-06, + "loss": 0.511, + "step": 6302 + }, + { + "epoch": 1.035124094184304, + "grad_norm": 0.30432930318759355, + "learning_rate": 8.147730129824588e-06, + "loss": 0.4845, + "step": 6303 + }, + { + "epoch": 1.0352883213926467, + "grad_norm": 0.2696241666956859, + "learning_rate": 8.14745548413074e-06, + "loss": 0.4746, + "step": 6304 + }, + { + "epoch": 1.0354525486009896, + "grad_norm": 0.3517664508179872, + "learning_rate": 8.147180798821937e-06, + "loss": 0.5045, + "step": 6305 + }, + { + "epoch": 1.0356167758093322, + "grad_norm": 0.4605530445297964, + "learning_rate": 8.146906073901163e-06, + "loss": 0.4981, + "step": 6306 + }, + { + "epoch": 1.035781003017675, + "grad_norm": 0.6338064223676042, + "learning_rate": 8.1466313093714e-06, + "loss": 0.4754, + "step": 6307 + }, + { + "epoch": 1.0359452302260177, + "grad_norm": 0.30453246058597266, + "learning_rate": 8.146356505235634e-06, + "loss": 0.484, + "step": 6308 + }, + { + "epoch": 1.0361094574343603, + "grad_norm": 0.3257172020653211, + "learning_rate": 8.146081661496848e-06, + "loss": 0.4794, + "step": 6309 + }, + { + "epoch": 1.0362736846427032, + "grad_norm": 0.34488299780916387, + "learning_rate": 8.145806778158027e-06, + "loss": 0.4687, + "step": 6310 + }, + { + "epoch": 1.0364379118510458, + "grad_norm": 0.3213369359566095, + "learning_rate": 8.14553185522216e-06, + "loss": 0.4761, + "step": 6311 + }, + { + "epoch": 1.0366021390593887, + "grad_norm": 0.32874328293494665, + "learning_rate": 8.145256892692229e-06, + "loss": 0.5035, + "step": 6312 + }, + { + "epoch": 1.0367663662677313, + "grad_norm": 0.2970117333301102, + "learning_rate": 8.14498189057122e-06, + "loss": 0.4938, + "step": 6313 + }, + { + "epoch": 1.0369305934760742, + "grad_norm": 0.38096806888632473, + "learning_rate": 8.144706848862123e-06, + "loss": 0.4961, + "step": 6314 + }, + { + "epoch": 1.0370948206844168, + "grad_norm": 0.28450513054266885, + "learning_rate": 8.144431767567925e-06, + "loss": 0.4956, + "step": 6315 + }, + { + "epoch": 1.0372590478927597, + "grad_norm": 0.33008924689826424, + "learning_rate": 8.14415664669161e-06, + "loss": 0.4889, + "step": 6316 + }, + { + "epoch": 1.0374232751011023, + "grad_norm": 0.31032871022062886, + "learning_rate": 8.14388148623617e-06, + "loss": 0.4774, + "step": 6317 + }, + { + "epoch": 1.0375875023094452, + "grad_norm": 0.3226996695818067, + "learning_rate": 8.143606286204592e-06, + "loss": 0.4722, + "step": 6318 + }, + { + "epoch": 1.0377517295177878, + "grad_norm": 0.28015504286106274, + "learning_rate": 8.143331046599863e-06, + "loss": 0.4974, + "step": 6319 + }, + { + "epoch": 1.0379159567261307, + "grad_norm": 0.3216228193671261, + "learning_rate": 8.143055767424978e-06, + "loss": 0.5035, + "step": 6320 + }, + { + "epoch": 1.0380801839344733, + "grad_norm": 0.32400022353714, + "learning_rate": 8.14278044868292e-06, + "loss": 0.4915, + "step": 6321 + }, + { + "epoch": 1.0382444111428162, + "grad_norm": 0.3889307559896388, + "learning_rate": 8.142505090376683e-06, + "loss": 0.5154, + "step": 6322 + }, + { + "epoch": 1.0384086383511588, + "grad_norm": 0.31872095687613694, + "learning_rate": 8.142229692509258e-06, + "loss": 0.4746, + "step": 6323 + }, + { + "epoch": 1.0385728655595017, + "grad_norm": 0.3825206261235205, + "learning_rate": 8.141954255083633e-06, + "loss": 0.5269, + "step": 6324 + }, + { + "epoch": 1.0387370927678443, + "grad_norm": 0.3205615672991652, + "learning_rate": 8.141678778102804e-06, + "loss": 0.4983, + "step": 6325 + }, + { + "epoch": 1.038901319976187, + "grad_norm": 0.4040202032633061, + "learning_rate": 8.141403261569759e-06, + "loss": 0.4822, + "step": 6326 + }, + { + "epoch": 1.0390655471845298, + "grad_norm": 0.37001803297905866, + "learning_rate": 8.141127705487492e-06, + "loss": 0.4972, + "step": 6327 + }, + { + "epoch": 1.0392297743928725, + "grad_norm": 0.33923188659094977, + "learning_rate": 8.140852109858997e-06, + "loss": 0.511, + "step": 6328 + }, + { + "epoch": 1.0393940016012153, + "grad_norm": 0.29837615842553383, + "learning_rate": 8.140576474687264e-06, + "loss": 0.4919, + "step": 6329 + }, + { + "epoch": 1.039558228809558, + "grad_norm": 0.2711498146698936, + "learning_rate": 8.140300799975289e-06, + "loss": 0.4858, + "step": 6330 + }, + { + "epoch": 1.0397224560179008, + "grad_norm": 0.323007121971415, + "learning_rate": 8.140025085726067e-06, + "loss": 0.5001, + "step": 6331 + }, + { + "epoch": 1.0398866832262434, + "grad_norm": 0.4009112749843193, + "learning_rate": 8.139749331942591e-06, + "loss": 0.4872, + "step": 6332 + }, + { + "epoch": 1.0400509104345863, + "grad_norm": 0.26720307728953957, + "learning_rate": 8.139473538627855e-06, + "loss": 0.4931, + "step": 6333 + }, + { + "epoch": 1.040215137642929, + "grad_norm": 0.29601146133600026, + "learning_rate": 8.139197705784857e-06, + "loss": 0.4857, + "step": 6334 + }, + { + "epoch": 1.0403793648512718, + "grad_norm": 0.3176755739657561, + "learning_rate": 8.13892183341659e-06, + "loss": 0.4981, + "step": 6335 + }, + { + "epoch": 1.0405435920596144, + "grad_norm": 0.31923052268403973, + "learning_rate": 8.138645921526053e-06, + "loss": 0.4985, + "step": 6336 + }, + { + "epoch": 1.0407078192679573, + "grad_norm": 0.27262402169127714, + "learning_rate": 8.138369970116242e-06, + "loss": 0.4889, + "step": 6337 + }, + { + "epoch": 1.0408720464763, + "grad_norm": 0.2981051983207571, + "learning_rate": 8.13809397919015e-06, + "loss": 0.4818, + "step": 6338 + }, + { + "epoch": 1.0410362736846428, + "grad_norm": 0.26663296072728127, + "learning_rate": 8.137817948750781e-06, + "loss": 0.4983, + "step": 6339 + }, + { + "epoch": 1.0412005008929854, + "grad_norm": 0.3322672618709718, + "learning_rate": 8.13754187880113e-06, + "loss": 0.4972, + "step": 6340 + }, + { + "epoch": 1.0413647281013283, + "grad_norm": 0.3164200711690184, + "learning_rate": 8.137265769344193e-06, + "loss": 0.4834, + "step": 6341 + }, + { + "epoch": 1.041528955309671, + "grad_norm": 0.29570874323950064, + "learning_rate": 8.136989620382973e-06, + "loss": 0.4938, + "step": 6342 + }, + { + "epoch": 1.0416931825180136, + "grad_norm": 0.3395825507914, + "learning_rate": 8.136713431920469e-06, + "loss": 0.4713, + "step": 6343 + }, + { + "epoch": 1.0418574097263564, + "grad_norm": 0.3596792089972651, + "learning_rate": 8.136437203959677e-06, + "loss": 0.4718, + "step": 6344 + }, + { + "epoch": 1.042021636934699, + "grad_norm": 0.2913957363979316, + "learning_rate": 8.1361609365036e-06, + "loss": 0.49, + "step": 6345 + }, + { + "epoch": 1.042185864143042, + "grad_norm": 0.30153632831779337, + "learning_rate": 8.135884629555236e-06, + "loss": 0.4872, + "step": 6346 + }, + { + "epoch": 1.0423500913513846, + "grad_norm": 0.3027949712850462, + "learning_rate": 8.135608283117589e-06, + "loss": 0.5091, + "step": 6347 + }, + { + "epoch": 1.0425143185597274, + "grad_norm": 0.3667060182197534, + "learning_rate": 8.135331897193659e-06, + "loss": 0.5129, + "step": 6348 + }, + { + "epoch": 1.04267854576807, + "grad_norm": 0.34954670908517377, + "learning_rate": 8.135055471786448e-06, + "loss": 0.4907, + "step": 6349 + }, + { + "epoch": 1.042842772976413, + "grad_norm": 0.5746944717000683, + "learning_rate": 8.134779006898958e-06, + "loss": 0.5251, + "step": 6350 + }, + { + "epoch": 1.0430070001847556, + "grad_norm": 0.43380942060730043, + "learning_rate": 8.134502502534192e-06, + "loss": 0.4859, + "step": 6351 + }, + { + "epoch": 1.0431712273930984, + "grad_norm": 0.3356139791441698, + "learning_rate": 8.134225958695153e-06, + "loss": 0.4771, + "step": 6352 + }, + { + "epoch": 1.043335454601441, + "grad_norm": 0.3385240491576104, + "learning_rate": 8.133949375384844e-06, + "loss": 0.4863, + "step": 6353 + }, + { + "epoch": 1.043499681809784, + "grad_norm": 0.3291980594155157, + "learning_rate": 8.13367275260627e-06, + "loss": 0.5003, + "step": 6354 + }, + { + "epoch": 1.0436639090181266, + "grad_norm": 0.3383124118965275, + "learning_rate": 8.133396090362435e-06, + "loss": 0.4945, + "step": 6355 + }, + { + "epoch": 1.0438281362264694, + "grad_norm": 0.5204865829726072, + "learning_rate": 8.133119388656344e-06, + "loss": 0.4808, + "step": 6356 + }, + { + "epoch": 1.043992363434812, + "grad_norm": 0.2809584481919922, + "learning_rate": 8.132842647491002e-06, + "loss": 0.4964, + "step": 6357 + }, + { + "epoch": 1.044156590643155, + "grad_norm": 0.3785537153231929, + "learning_rate": 8.132565866869414e-06, + "loss": 0.4898, + "step": 6358 + }, + { + "epoch": 1.0443208178514976, + "grad_norm": 0.2990071029129028, + "learning_rate": 8.132289046794584e-06, + "loss": 0.4954, + "step": 6359 + }, + { + "epoch": 1.0444850450598402, + "grad_norm": 0.38818656992221234, + "learning_rate": 8.132012187269526e-06, + "loss": 0.5099, + "step": 6360 + }, + { + "epoch": 1.044649272268183, + "grad_norm": 0.3580751562409733, + "learning_rate": 8.13173528829724e-06, + "loss": 0.4925, + "step": 6361 + }, + { + "epoch": 1.0448134994765257, + "grad_norm": 0.4102291661116885, + "learning_rate": 8.131458349880735e-06, + "loss": 0.4965, + "step": 6362 + }, + { + "epoch": 1.0449777266848685, + "grad_norm": 0.3294150987730053, + "learning_rate": 8.13118137202302e-06, + "loss": 0.5011, + "step": 6363 + }, + { + "epoch": 1.0451419538932112, + "grad_norm": 0.31692405906232507, + "learning_rate": 8.130904354727103e-06, + "loss": 0.4875, + "step": 6364 + }, + { + "epoch": 1.045306181101554, + "grad_norm": 0.2950992571838892, + "learning_rate": 8.130627297995991e-06, + "loss": 0.5003, + "step": 6365 + }, + { + "epoch": 1.0454704083098967, + "grad_norm": 0.45984806915384435, + "learning_rate": 8.130350201832697e-06, + "loss": 0.4856, + "step": 6366 + }, + { + "epoch": 1.0456346355182395, + "grad_norm": 0.4105295951420729, + "learning_rate": 8.130073066240226e-06, + "loss": 0.5028, + "step": 6367 + }, + { + "epoch": 1.0457988627265822, + "grad_norm": 0.36942653174057527, + "learning_rate": 8.129795891221592e-06, + "loss": 0.4869, + "step": 6368 + }, + { + "epoch": 1.045963089934925, + "grad_norm": 0.34164681035447897, + "learning_rate": 8.1295186767798e-06, + "loss": 0.501, + "step": 6369 + }, + { + "epoch": 1.0461273171432677, + "grad_norm": 0.6678599138574988, + "learning_rate": 8.12924142291787e-06, + "loss": 0.4662, + "step": 6370 + }, + { + "epoch": 1.0462915443516105, + "grad_norm": 0.5031081494107003, + "learning_rate": 8.128964129638802e-06, + "loss": 0.4945, + "step": 6371 + }, + { + "epoch": 1.0464557715599532, + "grad_norm": 0.2899303698213951, + "learning_rate": 8.128686796945615e-06, + "loss": 0.4913, + "step": 6372 + }, + { + "epoch": 1.046619998768296, + "grad_norm": 0.3061299471747233, + "learning_rate": 8.128409424841319e-06, + "loss": 0.5124, + "step": 6373 + }, + { + "epoch": 1.0467842259766387, + "grad_norm": 0.3186445589777642, + "learning_rate": 8.128132013328928e-06, + "loss": 0.5102, + "step": 6374 + }, + { + "epoch": 1.0469484531849815, + "grad_norm": 0.30979226756641537, + "learning_rate": 8.127854562411452e-06, + "loss": 0.4907, + "step": 6375 + }, + { + "epoch": 1.0471126803933242, + "grad_norm": 0.35666387304376157, + "learning_rate": 8.127577072091906e-06, + "loss": 0.4862, + "step": 6376 + }, + { + "epoch": 1.0472769076016668, + "grad_norm": 0.33999127955278113, + "learning_rate": 8.127299542373306e-06, + "loss": 0.5005, + "step": 6377 + }, + { + "epoch": 1.0474411348100097, + "grad_norm": 0.32517006313544056, + "learning_rate": 8.127021973258664e-06, + "loss": 0.5225, + "step": 6378 + }, + { + "epoch": 1.0476053620183523, + "grad_norm": 0.30386677497274767, + "learning_rate": 8.126744364750991e-06, + "loss": 0.5187, + "step": 6379 + }, + { + "epoch": 1.0477695892266952, + "grad_norm": 0.3028445243411251, + "learning_rate": 8.12646671685331e-06, + "loss": 0.4911, + "step": 6380 + }, + { + "epoch": 1.0479338164350378, + "grad_norm": 0.3348978408227218, + "learning_rate": 8.126189029568631e-06, + "loss": 0.5013, + "step": 6381 + }, + { + "epoch": 1.0480980436433807, + "grad_norm": 0.31603848477950536, + "learning_rate": 8.125911302899973e-06, + "loss": 0.4846, + "step": 6382 + }, + { + "epoch": 1.0482622708517233, + "grad_norm": 0.2986142070302935, + "learning_rate": 8.125633536850349e-06, + "loss": 0.4859, + "step": 6383 + }, + { + "epoch": 1.0484264980600662, + "grad_norm": 0.267336828835043, + "learning_rate": 8.125355731422778e-06, + "loss": 0.5089, + "step": 6384 + }, + { + "epoch": 1.0485907252684088, + "grad_norm": 0.26584039989022573, + "learning_rate": 8.125077886620277e-06, + "loss": 0.4855, + "step": 6385 + }, + { + "epoch": 1.0487549524767517, + "grad_norm": 0.2942213330070643, + "learning_rate": 8.124800002445864e-06, + "loss": 0.4914, + "step": 6386 + }, + { + "epoch": 1.0489191796850943, + "grad_norm": 0.34842284141271185, + "learning_rate": 8.124522078902556e-06, + "loss": 0.4798, + "step": 6387 + }, + { + "epoch": 1.0490834068934372, + "grad_norm": 0.28931993160219377, + "learning_rate": 8.124244115993372e-06, + "loss": 0.4917, + "step": 6388 + }, + { + "epoch": 1.0492476341017798, + "grad_norm": 0.2914300210726281, + "learning_rate": 8.123966113721331e-06, + "loss": 0.4918, + "step": 6389 + }, + { + "epoch": 1.0494118613101227, + "grad_norm": 0.3163077994522434, + "learning_rate": 8.123688072089455e-06, + "loss": 0.5009, + "step": 6390 + }, + { + "epoch": 1.0495760885184653, + "grad_norm": 0.2867811327270028, + "learning_rate": 8.123409991100758e-06, + "loss": 0.4716, + "step": 6391 + }, + { + "epoch": 1.0497403157268081, + "grad_norm": 0.30351396153490035, + "learning_rate": 8.123131870758266e-06, + "loss": 0.5001, + "step": 6392 + }, + { + "epoch": 1.0499045429351508, + "grad_norm": 0.26456114320785307, + "learning_rate": 8.122853711064997e-06, + "loss": 0.5115, + "step": 6393 + }, + { + "epoch": 1.0500687701434934, + "grad_norm": 0.3490191334110551, + "learning_rate": 8.12257551202397e-06, + "loss": 0.5033, + "step": 6394 + }, + { + "epoch": 1.0502329973518363, + "grad_norm": 0.32342546862186944, + "learning_rate": 8.122297273638212e-06, + "loss": 0.4858, + "step": 6395 + }, + { + "epoch": 1.050397224560179, + "grad_norm": 0.32261052856924055, + "learning_rate": 8.122018995910738e-06, + "loss": 0.4902, + "step": 6396 + }, + { + "epoch": 1.0505614517685218, + "grad_norm": 0.32048637806924013, + "learning_rate": 8.121740678844576e-06, + "loss": 0.4695, + "step": 6397 + }, + { + "epoch": 1.0507256789768644, + "grad_norm": 0.3235467650978821, + "learning_rate": 8.121462322442749e-06, + "loss": 0.5074, + "step": 6398 + }, + { + "epoch": 1.0508899061852073, + "grad_norm": 0.3222995364524542, + "learning_rate": 8.121183926708274e-06, + "loss": 0.5017, + "step": 6399 + }, + { + "epoch": 1.05105413339355, + "grad_norm": 0.3099918502023668, + "learning_rate": 8.120905491644181e-06, + "loss": 0.4903, + "step": 6400 + }, + { + "epoch": 1.0512183606018928, + "grad_norm": 0.2991011827463223, + "learning_rate": 8.120627017253492e-06, + "loss": 0.4863, + "step": 6401 + }, + { + "epoch": 1.0513825878102354, + "grad_norm": 0.3032089180759681, + "learning_rate": 8.12034850353923e-06, + "loss": 0.5018, + "step": 6402 + }, + { + "epoch": 1.0515468150185783, + "grad_norm": 0.30778202378615055, + "learning_rate": 8.120069950504426e-06, + "loss": 0.4971, + "step": 6403 + }, + { + "epoch": 1.051711042226921, + "grad_norm": 0.33858123870495077, + "learning_rate": 8.119791358152097e-06, + "loss": 0.5004, + "step": 6404 + }, + { + "epoch": 1.0518752694352638, + "grad_norm": 0.31106722488997146, + "learning_rate": 8.119512726485272e-06, + "loss": 0.4889, + "step": 6405 + }, + { + "epoch": 1.0520394966436064, + "grad_norm": 0.28103243070829426, + "learning_rate": 8.119234055506979e-06, + "loss": 0.4964, + "step": 6406 + }, + { + "epoch": 1.0522037238519493, + "grad_norm": 0.30513334246236656, + "learning_rate": 8.118955345220243e-06, + "loss": 0.5038, + "step": 6407 + }, + { + "epoch": 1.052367951060292, + "grad_norm": 0.2703130936598874, + "learning_rate": 8.11867659562809e-06, + "loss": 0.4753, + "step": 6408 + }, + { + "epoch": 1.0525321782686348, + "grad_norm": 0.34695632434730056, + "learning_rate": 8.118397806733549e-06, + "loss": 0.4753, + "step": 6409 + }, + { + "epoch": 1.0526964054769774, + "grad_norm": 0.35036872667227215, + "learning_rate": 8.11811897853965e-06, + "loss": 0.4835, + "step": 6410 + }, + { + "epoch": 1.05286063268532, + "grad_norm": 0.2752474487062583, + "learning_rate": 8.117840111049418e-06, + "loss": 0.4795, + "step": 6411 + }, + { + "epoch": 1.053024859893663, + "grad_norm": 0.3954558426656484, + "learning_rate": 8.117561204265881e-06, + "loss": 0.4736, + "step": 6412 + }, + { + "epoch": 1.0531890871020055, + "grad_norm": 0.3024301299880532, + "learning_rate": 8.117282258192073e-06, + "loss": 0.4901, + "step": 6413 + }, + { + "epoch": 1.0533533143103484, + "grad_norm": 0.36585136371914173, + "learning_rate": 8.117003272831018e-06, + "loss": 0.4861, + "step": 6414 + }, + { + "epoch": 1.053517541518691, + "grad_norm": 0.30042123148773786, + "learning_rate": 8.116724248185751e-06, + "loss": 0.5021, + "step": 6415 + }, + { + "epoch": 1.053681768727034, + "grad_norm": 0.3273045503653264, + "learning_rate": 8.1164451842593e-06, + "loss": 0.5005, + "step": 6416 + }, + { + "epoch": 1.0538459959353765, + "grad_norm": 0.3148844502941428, + "learning_rate": 8.116166081054698e-06, + "loss": 0.4799, + "step": 6417 + }, + { + "epoch": 1.0540102231437194, + "grad_norm": 0.30624673877227687, + "learning_rate": 8.11588693857497e-06, + "loss": 0.5072, + "step": 6418 + }, + { + "epoch": 1.054174450352062, + "grad_norm": 0.3166765249654235, + "learning_rate": 8.115607756823156e-06, + "loss": 0.4966, + "step": 6419 + }, + { + "epoch": 1.054338677560405, + "grad_norm": 0.2659242128969254, + "learning_rate": 8.115328535802283e-06, + "loss": 0.488, + "step": 6420 + }, + { + "epoch": 1.0545029047687475, + "grad_norm": 0.25926394594099467, + "learning_rate": 8.115049275515386e-06, + "loss": 0.4734, + "step": 6421 + }, + { + "epoch": 1.0546671319770904, + "grad_norm": 0.36846139712139503, + "learning_rate": 8.114769975965496e-06, + "loss": 0.52, + "step": 6422 + }, + { + "epoch": 1.054831359185433, + "grad_norm": 0.28318609317997545, + "learning_rate": 8.114490637155648e-06, + "loss": 0.4973, + "step": 6423 + }, + { + "epoch": 1.0549955863937759, + "grad_norm": 0.3365623874313242, + "learning_rate": 8.114211259088875e-06, + "loss": 0.4806, + "step": 6424 + }, + { + "epoch": 1.0551598136021185, + "grad_norm": 0.37523450137919945, + "learning_rate": 8.113931841768212e-06, + "loss": 0.4829, + "step": 6425 + }, + { + "epoch": 1.0553240408104614, + "grad_norm": 0.29403851701146794, + "learning_rate": 8.113652385196695e-06, + "loss": 0.4704, + "step": 6426 + }, + { + "epoch": 1.055488268018804, + "grad_norm": 0.2945693218797296, + "learning_rate": 8.113372889377357e-06, + "loss": 0.4648, + "step": 6427 + }, + { + "epoch": 1.0556524952271467, + "grad_norm": 0.3418559064297287, + "learning_rate": 8.113093354313234e-06, + "loss": 0.498, + "step": 6428 + }, + { + "epoch": 1.0558167224354895, + "grad_norm": 0.3366328880212634, + "learning_rate": 8.112813780007362e-06, + "loss": 0.4891, + "step": 6429 + }, + { + "epoch": 1.0559809496438322, + "grad_norm": 0.3358091705998052, + "learning_rate": 8.112534166462778e-06, + "loss": 0.4675, + "step": 6430 + }, + { + "epoch": 1.056145176852175, + "grad_norm": 0.31678677491645746, + "learning_rate": 8.112254513682519e-06, + "loss": 0.4944, + "step": 6431 + }, + { + "epoch": 1.0563094040605177, + "grad_norm": 0.2648603428506825, + "learning_rate": 8.111974821669623e-06, + "loss": 0.4974, + "step": 6432 + }, + { + "epoch": 1.0564736312688605, + "grad_norm": 0.2982763427295238, + "learning_rate": 8.111695090427125e-06, + "loss": 0.4994, + "step": 6433 + }, + { + "epoch": 1.0566378584772032, + "grad_norm": 0.3244793940093539, + "learning_rate": 8.111415319958066e-06, + "loss": 0.505, + "step": 6434 + }, + { + "epoch": 1.056802085685546, + "grad_norm": 0.33260643099417697, + "learning_rate": 8.111135510265483e-06, + "loss": 0.5006, + "step": 6435 + }, + { + "epoch": 1.0569663128938886, + "grad_norm": 0.2925676158677974, + "learning_rate": 8.110855661352416e-06, + "loss": 0.4819, + "step": 6436 + }, + { + "epoch": 1.0571305401022315, + "grad_norm": 0.3493362290759631, + "learning_rate": 8.110575773221903e-06, + "loss": 0.4902, + "step": 6437 + }, + { + "epoch": 1.0572947673105741, + "grad_norm": 0.3925359125596493, + "learning_rate": 8.110295845876985e-06, + "loss": 0.4894, + "step": 6438 + }, + { + "epoch": 1.057458994518917, + "grad_norm": 0.2901191149400785, + "learning_rate": 8.110015879320703e-06, + "loss": 0.4952, + "step": 6439 + }, + { + "epoch": 1.0576232217272596, + "grad_norm": 0.31226298482160286, + "learning_rate": 8.109735873556097e-06, + "loss": 0.4765, + "step": 6440 + }, + { + "epoch": 1.0577874489356025, + "grad_norm": 0.2608839794267408, + "learning_rate": 8.109455828586206e-06, + "loss": 0.4841, + "step": 6441 + }, + { + "epoch": 1.0579516761439451, + "grad_norm": 0.41179817747126496, + "learning_rate": 8.109175744414074e-06, + "loss": 0.4995, + "step": 6442 + }, + { + "epoch": 1.058115903352288, + "grad_norm": 0.27599730187609833, + "learning_rate": 8.108895621042743e-06, + "loss": 0.4825, + "step": 6443 + }, + { + "epoch": 1.0582801305606306, + "grad_norm": 0.38717551812691675, + "learning_rate": 8.108615458475256e-06, + "loss": 0.4693, + "step": 6444 + }, + { + "epoch": 1.0584443577689733, + "grad_norm": 0.309503198292017, + "learning_rate": 8.108335256714653e-06, + "loss": 0.5025, + "step": 6445 + }, + { + "epoch": 1.0586085849773161, + "grad_norm": 0.3527154897099522, + "learning_rate": 8.108055015763979e-06, + "loss": 0.4978, + "step": 6446 + }, + { + "epoch": 1.0587728121856588, + "grad_norm": 0.34243211145802427, + "learning_rate": 8.10777473562628e-06, + "loss": 0.4918, + "step": 6447 + }, + { + "epoch": 1.0589370393940016, + "grad_norm": 0.5315935088433437, + "learning_rate": 8.107494416304595e-06, + "loss": 0.5002, + "step": 6448 + }, + { + "epoch": 1.0591012666023443, + "grad_norm": 0.34448608785796825, + "learning_rate": 8.107214057801971e-06, + "loss": 0.4802, + "step": 6449 + }, + { + "epoch": 1.0592654938106871, + "grad_norm": 0.2679657284532223, + "learning_rate": 8.106933660121455e-06, + "loss": 0.4862, + "step": 6450 + }, + { + "epoch": 1.0594297210190298, + "grad_norm": 0.3044665667323601, + "learning_rate": 8.10665322326609e-06, + "loss": 0.4964, + "step": 6451 + }, + { + "epoch": 1.0595939482273726, + "grad_norm": 0.44302506304396316, + "learning_rate": 8.106372747238923e-06, + "loss": 0.5033, + "step": 6452 + }, + { + "epoch": 1.0597581754357153, + "grad_norm": 0.3007104836080406, + "learning_rate": 8.106092232043002e-06, + "loss": 0.5041, + "step": 6453 + }, + { + "epoch": 1.0599224026440581, + "grad_norm": 0.30669849052499437, + "learning_rate": 8.105811677681367e-06, + "loss": 0.4813, + "step": 6454 + }, + { + "epoch": 1.0600866298524008, + "grad_norm": 0.33585983656673835, + "learning_rate": 8.105531084157072e-06, + "loss": 0.5003, + "step": 6455 + }, + { + "epoch": 1.0602508570607436, + "grad_norm": 0.35016795972792997, + "learning_rate": 8.105250451473162e-06, + "loss": 0.4867, + "step": 6456 + }, + { + "epoch": 1.0604150842690863, + "grad_norm": 0.3019447943301932, + "learning_rate": 8.104969779632685e-06, + "loss": 0.4829, + "step": 6457 + }, + { + "epoch": 1.0605793114774291, + "grad_norm": 0.44307691839299485, + "learning_rate": 8.10468906863869e-06, + "loss": 0.4817, + "step": 6458 + }, + { + "epoch": 1.0607435386857718, + "grad_norm": 0.7749990416144014, + "learning_rate": 8.104408318494224e-06, + "loss": 0.4748, + "step": 6459 + }, + { + "epoch": 1.0609077658941146, + "grad_norm": 0.3369575113116707, + "learning_rate": 8.104127529202338e-06, + "loss": 0.4745, + "step": 6460 + }, + { + "epoch": 1.0610719931024573, + "grad_norm": 0.30228965862062723, + "learning_rate": 8.103846700766081e-06, + "loss": 0.5156, + "step": 6461 + }, + { + "epoch": 1.0612362203108, + "grad_norm": 0.28323225167550203, + "learning_rate": 8.103565833188503e-06, + "loss": 0.4891, + "step": 6462 + }, + { + "epoch": 1.0614004475191428, + "grad_norm": 0.35102381961251805, + "learning_rate": 8.103284926472654e-06, + "loss": 0.5034, + "step": 6463 + }, + { + "epoch": 1.0615646747274854, + "grad_norm": 0.31904575779002, + "learning_rate": 8.103003980621585e-06, + "loss": 0.4735, + "step": 6464 + }, + { + "epoch": 1.0617289019358283, + "grad_norm": 0.37108455696765513, + "learning_rate": 8.10272299563835e-06, + "loss": 0.4903, + "step": 6465 + }, + { + "epoch": 1.0618931291441709, + "grad_norm": 0.3043493567023591, + "learning_rate": 8.102441971525999e-06, + "loss": 0.4817, + "step": 6466 + }, + { + "epoch": 1.0620573563525137, + "grad_norm": 0.2969183688186469, + "learning_rate": 8.10216090828758e-06, + "loss": 0.4812, + "step": 6467 + }, + { + "epoch": 1.0622215835608564, + "grad_norm": 0.2940431258639217, + "learning_rate": 8.101879805926152e-06, + "loss": 0.468, + "step": 6468 + }, + { + "epoch": 1.0623858107691992, + "grad_norm": 0.3572806332667413, + "learning_rate": 8.101598664444765e-06, + "loss": 0.4983, + "step": 6469 + }, + { + "epoch": 1.0625500379775419, + "grad_norm": 0.32983325411802555, + "learning_rate": 8.101317483846475e-06, + "loss": 0.4785, + "step": 6470 + }, + { + "epoch": 1.0627142651858847, + "grad_norm": 0.2903993681330892, + "learning_rate": 8.101036264134332e-06, + "loss": 0.4691, + "step": 6471 + }, + { + "epoch": 1.0628784923942274, + "grad_norm": 0.2829052100129413, + "learning_rate": 8.100755005311392e-06, + "loss": 0.5018, + "step": 6472 + }, + { + "epoch": 1.0630427196025702, + "grad_norm": 0.3091537259100867, + "learning_rate": 8.10047370738071e-06, + "loss": 0.4959, + "step": 6473 + }, + { + "epoch": 1.0632069468109129, + "grad_norm": 0.3072357245938547, + "learning_rate": 8.100192370345343e-06, + "loss": 0.5023, + "step": 6474 + }, + { + "epoch": 1.0633711740192557, + "grad_norm": 0.29934615692093325, + "learning_rate": 8.099910994208345e-06, + "loss": 0.4843, + "step": 6475 + }, + { + "epoch": 1.0635354012275984, + "grad_norm": 0.29597394203379296, + "learning_rate": 8.09962957897277e-06, + "loss": 0.4746, + "step": 6476 + }, + { + "epoch": 1.0636996284359412, + "grad_norm": 0.28468282474338635, + "learning_rate": 8.099348124641676e-06, + "loss": 0.475, + "step": 6477 + }, + { + "epoch": 1.0638638556442839, + "grad_norm": 0.33136689956285587, + "learning_rate": 8.09906663121812e-06, + "loss": 0.4736, + "step": 6478 + }, + { + "epoch": 1.0640280828526265, + "grad_norm": 0.31234683664303364, + "learning_rate": 8.09878509870516e-06, + "loss": 0.4958, + "step": 6479 + }, + { + "epoch": 1.0641923100609694, + "grad_norm": 0.4152643175893618, + "learning_rate": 8.098503527105852e-06, + "loss": 0.4807, + "step": 6480 + }, + { + "epoch": 1.064356537269312, + "grad_norm": 0.35436166148095855, + "learning_rate": 8.098221916423257e-06, + "loss": 0.4767, + "step": 6481 + }, + { + "epoch": 1.0645207644776549, + "grad_norm": 0.2786918130841321, + "learning_rate": 8.097940266660431e-06, + "loss": 0.4951, + "step": 6482 + }, + { + "epoch": 1.0646849916859975, + "grad_norm": 0.2984696445751109, + "learning_rate": 8.097658577820436e-06, + "loss": 0.5136, + "step": 6483 + }, + { + "epoch": 1.0648492188943404, + "grad_norm": 0.2860516275936091, + "learning_rate": 8.097376849906326e-06, + "loss": 0.5039, + "step": 6484 + }, + { + "epoch": 1.065013446102683, + "grad_norm": 0.35011048090627467, + "learning_rate": 8.097095082921165e-06, + "loss": 0.4975, + "step": 6485 + }, + { + "epoch": 1.0651776733110259, + "grad_norm": 0.3063508579204193, + "learning_rate": 8.096813276868014e-06, + "loss": 0.4957, + "step": 6486 + }, + { + "epoch": 1.0653419005193685, + "grad_norm": 0.4053799879807205, + "learning_rate": 8.09653143174993e-06, + "loss": 0.5002, + "step": 6487 + }, + { + "epoch": 1.0655061277277114, + "grad_norm": 0.32329174991449955, + "learning_rate": 8.096249547569976e-06, + "loss": 0.4846, + "step": 6488 + }, + { + "epoch": 1.065670354936054, + "grad_norm": 0.24985790032644659, + "learning_rate": 8.095967624331216e-06, + "loss": 0.4756, + "step": 6489 + }, + { + "epoch": 1.0658345821443969, + "grad_norm": 0.2642583003677523, + "learning_rate": 8.095685662036706e-06, + "loss": 0.4837, + "step": 6490 + }, + { + "epoch": 1.0659988093527395, + "grad_norm": 0.286132146880121, + "learning_rate": 8.095403660689514e-06, + "loss": 0.4827, + "step": 6491 + }, + { + "epoch": 1.0661630365610824, + "grad_norm": 0.2745315651876201, + "learning_rate": 8.0951216202927e-06, + "loss": 0.482, + "step": 6492 + }, + { + "epoch": 1.066327263769425, + "grad_norm": 0.34287529332732586, + "learning_rate": 8.094839540849332e-06, + "loss": 0.4928, + "step": 6493 + }, + { + "epoch": 1.0664914909777679, + "grad_norm": 0.3333024161421935, + "learning_rate": 8.094557422362467e-06, + "loss": 0.5016, + "step": 6494 + }, + { + "epoch": 1.0666557181861105, + "grad_norm": 0.35575934337835535, + "learning_rate": 8.094275264835171e-06, + "loss": 0.4946, + "step": 6495 + }, + { + "epoch": 1.0668199453944531, + "grad_norm": 0.3056346750452761, + "learning_rate": 8.09399306827051e-06, + "loss": 0.4858, + "step": 6496 + }, + { + "epoch": 1.066984172602796, + "grad_norm": 0.31099343207150504, + "learning_rate": 8.093710832671548e-06, + "loss": 0.5108, + "step": 6497 + }, + { + "epoch": 1.0671483998111386, + "grad_norm": 0.322914818516821, + "learning_rate": 8.09342855804135e-06, + "loss": 0.4811, + "step": 6498 + }, + { + "epoch": 1.0673126270194815, + "grad_norm": 0.300902999390282, + "learning_rate": 8.093146244382983e-06, + "loss": 0.4938, + "step": 6499 + }, + { + "epoch": 1.0674768542278241, + "grad_norm": 0.31601742707388225, + "learning_rate": 8.092863891699512e-06, + "loss": 0.478, + "step": 6500 + }, + { + "epoch": 1.067641081436167, + "grad_norm": 0.3046928653259927, + "learning_rate": 8.092581499994007e-06, + "loss": 0.4958, + "step": 6501 + }, + { + "epoch": 1.0678053086445096, + "grad_norm": 0.3645549423936772, + "learning_rate": 8.09229906926953e-06, + "loss": 0.4993, + "step": 6502 + }, + { + "epoch": 1.0679695358528525, + "grad_norm": 0.2768289330042299, + "learning_rate": 8.092016599529151e-06, + "loss": 0.4973, + "step": 6503 + }, + { + "epoch": 1.0681337630611951, + "grad_norm": 0.36660395418493563, + "learning_rate": 8.091734090775939e-06, + "loss": 0.4891, + "step": 6504 + }, + { + "epoch": 1.068297990269538, + "grad_norm": 0.3270702435861176, + "learning_rate": 8.09145154301296e-06, + "loss": 0.4957, + "step": 6505 + }, + { + "epoch": 1.0684622174778806, + "grad_norm": 0.3498736633281296, + "learning_rate": 8.091168956243282e-06, + "loss": 0.4966, + "step": 6506 + }, + { + "epoch": 1.0686264446862235, + "grad_norm": 0.2997356142625861, + "learning_rate": 8.090886330469978e-06, + "loss": 0.5231, + "step": 6507 + }, + { + "epoch": 1.0687906718945661, + "grad_norm": 0.3697001884215529, + "learning_rate": 8.090603665696114e-06, + "loss": 0.4855, + "step": 6508 + }, + { + "epoch": 1.068954899102909, + "grad_norm": 0.5614008437965243, + "learning_rate": 8.090320961924763e-06, + "loss": 0.4989, + "step": 6509 + }, + { + "epoch": 1.0691191263112516, + "grad_norm": 0.3112432296277617, + "learning_rate": 8.090038219158993e-06, + "loss": 0.4942, + "step": 6510 + }, + { + "epoch": 1.0692833535195945, + "grad_norm": 0.3863364983192195, + "learning_rate": 8.089755437401877e-06, + "loss": 0.4669, + "step": 6511 + }, + { + "epoch": 1.069447580727937, + "grad_norm": 0.2590499894062906, + "learning_rate": 8.089472616656484e-06, + "loss": 0.5017, + "step": 6512 + }, + { + "epoch": 1.0696118079362797, + "grad_norm": 0.3294195425723214, + "learning_rate": 8.089189756925888e-06, + "loss": 0.4761, + "step": 6513 + }, + { + "epoch": 1.0697760351446226, + "grad_norm": 0.3477239348659754, + "learning_rate": 8.088906858213158e-06, + "loss": 0.5022, + "step": 6514 + }, + { + "epoch": 1.0699402623529652, + "grad_norm": 0.2957800807082142, + "learning_rate": 8.08862392052137e-06, + "loss": 0.4896, + "step": 6515 + }, + { + "epoch": 1.070104489561308, + "grad_norm": 0.277456686832057, + "learning_rate": 8.088340943853595e-06, + "loss": 0.4879, + "step": 6516 + }, + { + "epoch": 1.0702687167696507, + "grad_norm": 0.2896224449064534, + "learning_rate": 8.088057928212907e-06, + "loss": 0.4825, + "step": 6517 + }, + { + "epoch": 1.0704329439779936, + "grad_norm": 0.366817404524916, + "learning_rate": 8.08777487360238e-06, + "loss": 0.5051, + "step": 6518 + }, + { + "epoch": 1.0705971711863362, + "grad_norm": 0.3243539755109171, + "learning_rate": 8.087491780025088e-06, + "loss": 0.4889, + "step": 6519 + }, + { + "epoch": 1.070761398394679, + "grad_norm": 0.28142740942527517, + "learning_rate": 8.087208647484104e-06, + "loss": 0.4679, + "step": 6520 + }, + { + "epoch": 1.0709256256030217, + "grad_norm": 0.29184571231550355, + "learning_rate": 8.086925475982506e-06, + "loss": 0.4633, + "step": 6521 + }, + { + "epoch": 1.0710898528113646, + "grad_norm": 0.3149114520591637, + "learning_rate": 8.08664226552337e-06, + "loss": 0.4862, + "step": 6522 + }, + { + "epoch": 1.0712540800197072, + "grad_norm": 0.3523906143591502, + "learning_rate": 8.086359016109768e-06, + "loss": 0.4926, + "step": 6523 + }, + { + "epoch": 1.07141830722805, + "grad_norm": 0.2883075445690294, + "learning_rate": 8.08607572774478e-06, + "loss": 0.4749, + "step": 6524 + }, + { + "epoch": 1.0715825344363927, + "grad_norm": 0.31977848898120104, + "learning_rate": 8.08579240043148e-06, + "loss": 0.4897, + "step": 6525 + }, + { + "epoch": 1.0717467616447356, + "grad_norm": 0.27525095872824124, + "learning_rate": 8.085509034172947e-06, + "loss": 0.4904, + "step": 6526 + }, + { + "epoch": 1.0719109888530782, + "grad_norm": 0.31146255558943825, + "learning_rate": 8.085225628972259e-06, + "loss": 0.5011, + "step": 6527 + }, + { + "epoch": 1.072075216061421, + "grad_norm": 0.3464338330797061, + "learning_rate": 8.084942184832492e-06, + "loss": 0.4926, + "step": 6528 + }, + { + "epoch": 1.0722394432697637, + "grad_norm": 0.27288841993252994, + "learning_rate": 8.084658701756726e-06, + "loss": 0.4704, + "step": 6529 + }, + { + "epoch": 1.0724036704781064, + "grad_norm": 0.4372370882014702, + "learning_rate": 8.08437517974804e-06, + "loss": 0.4896, + "step": 6530 + }, + { + "epoch": 1.0725678976864492, + "grad_norm": 0.30634972416888445, + "learning_rate": 8.084091618809513e-06, + "loss": 0.4975, + "step": 6531 + }, + { + "epoch": 1.0727321248947919, + "grad_norm": 0.3221699368499719, + "learning_rate": 8.083808018944226e-06, + "loss": 0.4745, + "step": 6532 + }, + { + "epoch": 1.0728963521031347, + "grad_norm": 0.31599052239465103, + "learning_rate": 8.083524380155257e-06, + "loss": 0.4781, + "step": 6533 + }, + { + "epoch": 1.0730605793114774, + "grad_norm": 0.302902407696806, + "learning_rate": 8.083240702445687e-06, + "loss": 0.4798, + "step": 6534 + }, + { + "epoch": 1.0732248065198202, + "grad_norm": 0.29767747935360217, + "learning_rate": 8.082956985818598e-06, + "loss": 0.4879, + "step": 6535 + }, + { + "epoch": 1.0733890337281629, + "grad_norm": 0.30394027053429523, + "learning_rate": 8.08267323027707e-06, + "loss": 0.5003, + "step": 6536 + }, + { + "epoch": 1.0735532609365057, + "grad_norm": 0.2854278337518416, + "learning_rate": 8.082389435824187e-06, + "loss": 0.4773, + "step": 6537 + }, + { + "epoch": 1.0737174881448484, + "grad_norm": 0.3401948261966552, + "learning_rate": 8.08210560246303e-06, + "loss": 0.5256, + "step": 6538 + }, + { + "epoch": 1.0738817153531912, + "grad_norm": 0.287518456779464, + "learning_rate": 8.081821730196682e-06, + "loss": 0.4981, + "step": 6539 + }, + { + "epoch": 1.0740459425615339, + "grad_norm": 0.32190628608598426, + "learning_rate": 8.081537819028225e-06, + "loss": 0.4962, + "step": 6540 + }, + { + "epoch": 1.0742101697698767, + "grad_norm": 0.37003432367879086, + "learning_rate": 8.081253868960745e-06, + "loss": 0.4993, + "step": 6541 + }, + { + "epoch": 1.0743743969782193, + "grad_norm": 0.2904353068128155, + "learning_rate": 8.080969879997323e-06, + "loss": 0.4684, + "step": 6542 + }, + { + "epoch": 1.0745386241865622, + "grad_norm": 0.29501265903261076, + "learning_rate": 8.080685852141045e-06, + "loss": 0.4512, + "step": 6543 + }, + { + "epoch": 1.0747028513949048, + "grad_norm": 0.2885464571816661, + "learning_rate": 8.080401785394997e-06, + "loss": 0.5054, + "step": 6544 + }, + { + "epoch": 1.0748670786032477, + "grad_norm": 0.282323525694132, + "learning_rate": 8.08011767976226e-06, + "loss": 0.4795, + "step": 6545 + }, + { + "epoch": 1.0750313058115903, + "grad_norm": 0.31958791943190906, + "learning_rate": 8.079833535245927e-06, + "loss": 0.5158, + "step": 6546 + }, + { + "epoch": 1.075195533019933, + "grad_norm": 0.2982945132663225, + "learning_rate": 8.079549351849077e-06, + "loss": 0.4961, + "step": 6547 + }, + { + "epoch": 1.0753597602282758, + "grad_norm": 0.3665811507990705, + "learning_rate": 8.0792651295748e-06, + "loss": 0.488, + "step": 6548 + }, + { + "epoch": 1.0755239874366185, + "grad_norm": 0.324428934023983, + "learning_rate": 8.078980868426183e-06, + "loss": 0.5029, + "step": 6549 + }, + { + "epoch": 1.0756882146449613, + "grad_norm": 0.30815951965964145, + "learning_rate": 8.078696568406311e-06, + "loss": 0.4726, + "step": 6550 + }, + { + "epoch": 1.075852441853304, + "grad_norm": 0.6351629022687446, + "learning_rate": 8.078412229518273e-06, + "loss": 0.4868, + "step": 6551 + }, + { + "epoch": 1.0760166690616468, + "grad_norm": 0.27095936706763635, + "learning_rate": 8.07812785176516e-06, + "loss": 0.4846, + "step": 6552 + }, + { + "epoch": 1.0761808962699895, + "grad_norm": 0.3373418483243227, + "learning_rate": 8.077843435150056e-06, + "loss": 0.5021, + "step": 6553 + }, + { + "epoch": 1.0763451234783323, + "grad_norm": 0.2769493764982112, + "learning_rate": 8.077558979676052e-06, + "loss": 0.4995, + "step": 6554 + }, + { + "epoch": 1.076509350686675, + "grad_norm": 0.3391246782727593, + "learning_rate": 8.077274485346239e-06, + "loss": 0.4947, + "step": 6555 + }, + { + "epoch": 1.0766735778950178, + "grad_norm": 0.3194796445907453, + "learning_rate": 8.076989952163704e-06, + "loss": 0.501, + "step": 6556 + }, + { + "epoch": 1.0768378051033605, + "grad_norm": 0.2884927330411459, + "learning_rate": 8.07670538013154e-06, + "loss": 0.4765, + "step": 6557 + }, + { + "epoch": 1.0770020323117033, + "grad_norm": 0.27582021063538154, + "learning_rate": 8.076420769252837e-06, + "loss": 0.4917, + "step": 6558 + }, + { + "epoch": 1.077166259520046, + "grad_norm": 0.32878546008855475, + "learning_rate": 8.076136119530685e-06, + "loss": 0.4925, + "step": 6559 + }, + { + "epoch": 1.0773304867283888, + "grad_norm": 0.3719181139981429, + "learning_rate": 8.075851430968176e-06, + "loss": 0.4834, + "step": 6560 + }, + { + "epoch": 1.0774947139367315, + "grad_norm": 0.299817050063877, + "learning_rate": 8.075566703568402e-06, + "loss": 0.4817, + "step": 6561 + }, + { + "epoch": 1.0776589411450743, + "grad_norm": 0.27926753984629715, + "learning_rate": 8.075281937334456e-06, + "loss": 0.4684, + "step": 6562 + }, + { + "epoch": 1.077823168353417, + "grad_norm": 0.3141612633195604, + "learning_rate": 8.074997132269431e-06, + "loss": 0.4943, + "step": 6563 + }, + { + "epoch": 1.0779873955617596, + "grad_norm": 0.3227802156237344, + "learning_rate": 8.07471228837642e-06, + "loss": 0.4909, + "step": 6564 + }, + { + "epoch": 1.0781516227701025, + "grad_norm": 0.27393818022295974, + "learning_rate": 8.074427405658516e-06, + "loss": 0.479, + "step": 6565 + }, + { + "epoch": 1.078315849978445, + "grad_norm": 0.34096578885764833, + "learning_rate": 8.074142484118814e-06, + "loss": 0.4819, + "step": 6566 + }, + { + "epoch": 1.078480077186788, + "grad_norm": 0.3628942414376617, + "learning_rate": 8.073857523760407e-06, + "loss": 0.4822, + "step": 6567 + }, + { + "epoch": 1.0786443043951306, + "grad_norm": 0.3121047978071521, + "learning_rate": 8.073572524586392e-06, + "loss": 0.499, + "step": 6568 + }, + { + "epoch": 1.0788085316034735, + "grad_norm": 0.3710136396120407, + "learning_rate": 8.073287486599864e-06, + "loss": 0.5031, + "step": 6569 + }, + { + "epoch": 1.078972758811816, + "grad_norm": 0.27173145739016036, + "learning_rate": 8.073002409803917e-06, + "loss": 0.4897, + "step": 6570 + }, + { + "epoch": 1.079136986020159, + "grad_norm": 0.3607583744416979, + "learning_rate": 8.072717294201649e-06, + "loss": 0.498, + "step": 6571 + }, + { + "epoch": 1.0793012132285016, + "grad_norm": 0.33623588695032847, + "learning_rate": 8.072432139796157e-06, + "loss": 0.4989, + "step": 6572 + }, + { + "epoch": 1.0794654404368444, + "grad_norm": 0.3315744739698578, + "learning_rate": 8.072146946590536e-06, + "loss": 0.4796, + "step": 6573 + }, + { + "epoch": 1.079629667645187, + "grad_norm": 0.3222025399523138, + "learning_rate": 8.071861714587885e-06, + "loss": 0.4918, + "step": 6574 + }, + { + "epoch": 1.07979389485353, + "grad_norm": 0.31798138184786123, + "learning_rate": 8.071576443791302e-06, + "loss": 0.496, + "step": 6575 + }, + { + "epoch": 1.0799581220618726, + "grad_norm": 0.29852066241948144, + "learning_rate": 8.071291134203885e-06, + "loss": 0.4937, + "step": 6576 + }, + { + "epoch": 1.0801223492702154, + "grad_norm": 0.26278834220161956, + "learning_rate": 8.071005785828732e-06, + "loss": 0.4872, + "step": 6577 + }, + { + "epoch": 1.080286576478558, + "grad_norm": 0.2891753603047115, + "learning_rate": 8.070720398668944e-06, + "loss": 0.4777, + "step": 6578 + }, + { + "epoch": 1.080450803686901, + "grad_norm": 0.3858364653964431, + "learning_rate": 8.070434972727617e-06, + "loss": 0.5132, + "step": 6579 + }, + { + "epoch": 1.0806150308952436, + "grad_norm": 0.3764177313317822, + "learning_rate": 8.070149508007854e-06, + "loss": 0.5014, + "step": 6580 + }, + { + "epoch": 1.0807792581035862, + "grad_norm": 0.37878161708248925, + "learning_rate": 8.069864004512756e-06, + "loss": 0.4936, + "step": 6581 + }, + { + "epoch": 1.080943485311929, + "grad_norm": 0.278614098273255, + "learning_rate": 8.069578462245422e-06, + "loss": 0.4762, + "step": 6582 + }, + { + "epoch": 1.0811077125202717, + "grad_norm": 0.32093940639927887, + "learning_rate": 8.069292881208955e-06, + "loss": 0.4862, + "step": 6583 + }, + { + "epoch": 1.0812719397286146, + "grad_norm": 0.29447297248866366, + "learning_rate": 8.069007261406454e-06, + "loss": 0.4892, + "step": 6584 + }, + { + "epoch": 1.0814361669369572, + "grad_norm": 0.36869470719217107, + "learning_rate": 8.068721602841023e-06, + "loss": 0.5079, + "step": 6585 + }, + { + "epoch": 1.0816003941453, + "grad_norm": 0.28216039771339985, + "learning_rate": 8.068435905515764e-06, + "loss": 0.4853, + "step": 6586 + }, + { + "epoch": 1.0817646213536427, + "grad_norm": 0.3304883936698149, + "learning_rate": 8.068150169433781e-06, + "loss": 0.5068, + "step": 6587 + }, + { + "epoch": 1.0819288485619856, + "grad_norm": 0.33187478705588386, + "learning_rate": 8.067864394598177e-06, + "loss": 0.4933, + "step": 6588 + }, + { + "epoch": 1.0820930757703282, + "grad_norm": 0.3068115166683626, + "learning_rate": 8.067578581012054e-06, + "loss": 0.5052, + "step": 6589 + }, + { + "epoch": 1.082257302978671, + "grad_norm": 0.26991613220821475, + "learning_rate": 8.067292728678519e-06, + "loss": 0.5059, + "step": 6590 + }, + { + "epoch": 1.0824215301870137, + "grad_norm": 0.3177761633438965, + "learning_rate": 8.067006837600674e-06, + "loss": 0.4982, + "step": 6591 + }, + { + "epoch": 1.0825857573953566, + "grad_norm": 0.27743314130195196, + "learning_rate": 8.066720907781625e-06, + "loss": 0.473, + "step": 6592 + }, + { + "epoch": 1.0827499846036992, + "grad_norm": 0.27281498017793654, + "learning_rate": 8.066434939224478e-06, + "loss": 0.4999, + "step": 6593 + }, + { + "epoch": 1.082914211812042, + "grad_norm": 0.29010521029271646, + "learning_rate": 8.06614893193234e-06, + "loss": 0.4622, + "step": 6594 + }, + { + "epoch": 1.0830784390203847, + "grad_norm": 0.3875811075929337, + "learning_rate": 8.065862885908317e-06, + "loss": 0.5063, + "step": 6595 + }, + { + "epoch": 1.0832426662287276, + "grad_norm": 0.30672239793347644, + "learning_rate": 8.065576801155512e-06, + "loss": 0.4781, + "step": 6596 + }, + { + "epoch": 1.0834068934370702, + "grad_norm": 0.2790991854199382, + "learning_rate": 8.065290677677036e-06, + "loss": 0.46, + "step": 6597 + }, + { + "epoch": 1.0835711206454128, + "grad_norm": 0.37332831621147183, + "learning_rate": 8.065004515475994e-06, + "loss": 0.508, + "step": 6598 + }, + { + "epoch": 1.0837353478537557, + "grad_norm": 0.3434760999904214, + "learning_rate": 8.064718314555497e-06, + "loss": 0.4793, + "step": 6599 + }, + { + "epoch": 1.0838995750620983, + "grad_norm": 0.6233700972166917, + "learning_rate": 8.06443207491865e-06, + "loss": 0.4577, + "step": 6600 + }, + { + "epoch": 1.0840638022704412, + "grad_norm": 0.3384347060463355, + "learning_rate": 8.064145796568567e-06, + "loss": 0.4932, + "step": 6601 + }, + { + "epoch": 1.0842280294787838, + "grad_norm": 0.289153448823887, + "learning_rate": 8.063859479508352e-06, + "loss": 0.4912, + "step": 6602 + }, + { + "epoch": 1.0843922566871267, + "grad_norm": 0.26405473877925406, + "learning_rate": 8.063573123741117e-06, + "loss": 0.5015, + "step": 6603 + }, + { + "epoch": 1.0845564838954693, + "grad_norm": 0.31409076950725495, + "learning_rate": 8.063286729269971e-06, + "loss": 0.4959, + "step": 6604 + }, + { + "epoch": 1.0847207111038122, + "grad_norm": 0.30705620044601656, + "learning_rate": 8.063000296098026e-06, + "loss": 0.4956, + "step": 6605 + }, + { + "epoch": 1.0848849383121548, + "grad_norm": 0.31862613078857577, + "learning_rate": 8.062713824228393e-06, + "loss": 0.5036, + "step": 6606 + }, + { + "epoch": 1.0850491655204977, + "grad_norm": 0.7253417981589776, + "learning_rate": 8.062427313664183e-06, + "loss": 0.4844, + "step": 6607 + }, + { + "epoch": 1.0852133927288403, + "grad_norm": 0.7457709244399511, + "learning_rate": 8.062140764408505e-06, + "loss": 0.4859, + "step": 6608 + }, + { + "epoch": 1.0853776199371832, + "grad_norm": 0.300353254880477, + "learning_rate": 8.061854176464477e-06, + "loss": 0.4849, + "step": 6609 + }, + { + "epoch": 1.0855418471455258, + "grad_norm": 0.3253936123596342, + "learning_rate": 8.061567549835206e-06, + "loss": 0.4553, + "step": 6610 + }, + { + "epoch": 1.0857060743538687, + "grad_norm": 0.3120425180000209, + "learning_rate": 8.061280884523808e-06, + "loss": 0.4681, + "step": 6611 + }, + { + "epoch": 1.0858703015622113, + "grad_norm": 0.34523817729463896, + "learning_rate": 8.060994180533395e-06, + "loss": 0.4912, + "step": 6612 + }, + { + "epoch": 1.0860345287705542, + "grad_norm": 0.3159024449975521, + "learning_rate": 8.060707437867082e-06, + "loss": 0.5238, + "step": 6613 + }, + { + "epoch": 1.0861987559788968, + "grad_norm": 0.2708163187055613, + "learning_rate": 8.060420656527983e-06, + "loss": 0.4802, + "step": 6614 + }, + { + "epoch": 1.0863629831872395, + "grad_norm": 0.2717693408294249, + "learning_rate": 8.060133836519213e-06, + "loss": 0.5011, + "step": 6615 + }, + { + "epoch": 1.0865272103955823, + "grad_norm": 0.3046084952629126, + "learning_rate": 8.059846977843885e-06, + "loss": 0.4808, + "step": 6616 + }, + { + "epoch": 1.086691437603925, + "grad_norm": 0.3484788088356775, + "learning_rate": 8.059560080505119e-06, + "loss": 0.4709, + "step": 6617 + }, + { + "epoch": 1.0868556648122678, + "grad_norm": 0.4789898019185958, + "learning_rate": 8.059273144506029e-06, + "loss": 0.4782, + "step": 6618 + }, + { + "epoch": 1.0870198920206104, + "grad_norm": 0.2721149994086529, + "learning_rate": 8.058986169849727e-06, + "loss": 0.5058, + "step": 6619 + }, + { + "epoch": 1.0871841192289533, + "grad_norm": 0.5085546027446511, + "learning_rate": 8.058699156539336e-06, + "loss": 0.4936, + "step": 6620 + }, + { + "epoch": 1.087348346437296, + "grad_norm": 0.28291266607791543, + "learning_rate": 8.058412104577971e-06, + "loss": 0.4956, + "step": 6621 + }, + { + "epoch": 1.0875125736456388, + "grad_norm": 0.24750759560662303, + "learning_rate": 8.058125013968749e-06, + "loss": 0.4757, + "step": 6622 + }, + { + "epoch": 1.0876768008539814, + "grad_norm": 0.2834097176537607, + "learning_rate": 8.057837884714789e-06, + "loss": 0.4825, + "step": 6623 + }, + { + "epoch": 1.0878410280623243, + "grad_norm": 0.6745488072249444, + "learning_rate": 8.05755071681921e-06, + "loss": 0.5016, + "step": 6624 + }, + { + "epoch": 1.088005255270667, + "grad_norm": 0.26972117677597274, + "learning_rate": 8.057263510285128e-06, + "loss": 0.4905, + "step": 6625 + }, + { + "epoch": 1.0881694824790098, + "grad_norm": 0.38028687216798407, + "learning_rate": 8.056976265115665e-06, + "loss": 0.5123, + "step": 6626 + }, + { + "epoch": 1.0883337096873524, + "grad_norm": 0.32171686241552383, + "learning_rate": 8.056688981313942e-06, + "loss": 0.5023, + "step": 6627 + }, + { + "epoch": 1.0884979368956953, + "grad_norm": 0.35366407163708424, + "learning_rate": 8.056401658883075e-06, + "loss": 0.4843, + "step": 6628 + }, + { + "epoch": 1.088662164104038, + "grad_norm": 0.31785783871908235, + "learning_rate": 8.056114297826187e-06, + "loss": 0.464, + "step": 6629 + }, + { + "epoch": 1.0888263913123808, + "grad_norm": 0.37713493307433466, + "learning_rate": 8.055826898146401e-06, + "loss": 0.5046, + "step": 6630 + }, + { + "epoch": 1.0889906185207234, + "grad_norm": 0.30751278180262237, + "learning_rate": 8.055539459846836e-06, + "loss": 0.4861, + "step": 6631 + }, + { + "epoch": 1.089154845729066, + "grad_norm": 0.2679614159390897, + "learning_rate": 8.055251982930612e-06, + "loss": 0.5073, + "step": 6632 + }, + { + "epoch": 1.089319072937409, + "grad_norm": 0.3583802438166624, + "learning_rate": 8.054964467400856e-06, + "loss": 0.4893, + "step": 6633 + }, + { + "epoch": 1.0894833001457516, + "grad_norm": 0.281128336748366, + "learning_rate": 8.054676913260687e-06, + "loss": 0.4896, + "step": 6634 + }, + { + "epoch": 1.0896475273540944, + "grad_norm": 0.31164083615547156, + "learning_rate": 8.054389320513229e-06, + "loss": 0.4937, + "step": 6635 + }, + { + "epoch": 1.089811754562437, + "grad_norm": 0.2842462734493817, + "learning_rate": 8.054101689161607e-06, + "loss": 0.5003, + "step": 6636 + }, + { + "epoch": 1.08997598177078, + "grad_norm": 0.41851163082840925, + "learning_rate": 8.053814019208944e-06, + "loss": 0.4766, + "step": 6637 + }, + { + "epoch": 1.0901402089791226, + "grad_norm": 0.32067594574199665, + "learning_rate": 8.053526310658364e-06, + "loss": 0.4981, + "step": 6638 + }, + { + "epoch": 1.0903044361874654, + "grad_norm": 0.39601430427739676, + "learning_rate": 8.053238563512993e-06, + "loss": 0.4858, + "step": 6639 + }, + { + "epoch": 1.090468663395808, + "grad_norm": 0.31151104434543075, + "learning_rate": 8.052950777775953e-06, + "loss": 0.4751, + "step": 6640 + }, + { + "epoch": 1.090632890604151, + "grad_norm": 0.35616453883131216, + "learning_rate": 8.052662953450373e-06, + "loss": 0.4833, + "step": 6641 + }, + { + "epoch": 1.0907971178124936, + "grad_norm": 0.32908409486875606, + "learning_rate": 8.05237509053938e-06, + "loss": 0.5086, + "step": 6642 + }, + { + "epoch": 1.0909613450208364, + "grad_norm": 0.34134888377075057, + "learning_rate": 8.052087189046095e-06, + "loss": 0.5087, + "step": 6643 + }, + { + "epoch": 1.091125572229179, + "grad_norm": 0.47345364219699215, + "learning_rate": 8.05179924897365e-06, + "loss": 0.5156, + "step": 6644 + }, + { + "epoch": 1.091289799437522, + "grad_norm": 0.3445889661685117, + "learning_rate": 8.05151127032517e-06, + "loss": 0.4996, + "step": 6645 + }, + { + "epoch": 1.0914540266458646, + "grad_norm": 0.29551209314615595, + "learning_rate": 8.051223253103785e-06, + "loss": 0.4818, + "step": 6646 + }, + { + "epoch": 1.0916182538542074, + "grad_norm": 0.2947551790268186, + "learning_rate": 8.05093519731262e-06, + "loss": 0.4825, + "step": 6647 + }, + { + "epoch": 1.09178248106255, + "grad_norm": 0.3834351018766136, + "learning_rate": 8.050647102954806e-06, + "loss": 0.4972, + "step": 6648 + }, + { + "epoch": 1.0919467082708927, + "grad_norm": 0.3396270899894368, + "learning_rate": 8.050358970033471e-06, + "loss": 0.4774, + "step": 6649 + }, + { + "epoch": 1.0921109354792355, + "grad_norm": 0.2892429889400422, + "learning_rate": 8.050070798551745e-06, + "loss": 0.5017, + "step": 6650 + }, + { + "epoch": 1.0922751626875782, + "grad_norm": 0.2861359821069163, + "learning_rate": 8.049782588512757e-06, + "loss": 0.4789, + "step": 6651 + }, + { + "epoch": 1.092439389895921, + "grad_norm": 0.297574352808594, + "learning_rate": 8.049494339919636e-06, + "loss": 0.483, + "step": 6652 + }, + { + "epoch": 1.0926036171042637, + "grad_norm": 0.295904675266878, + "learning_rate": 8.049206052775515e-06, + "loss": 0.4759, + "step": 6653 + }, + { + "epoch": 1.0927678443126065, + "grad_norm": 0.27957006578379917, + "learning_rate": 8.048917727083526e-06, + "loss": 0.4742, + "step": 6654 + }, + { + "epoch": 1.0929320715209492, + "grad_norm": 0.33890350213174564, + "learning_rate": 8.048629362846796e-06, + "loss": 0.4757, + "step": 6655 + }, + { + "epoch": 1.093096298729292, + "grad_norm": 0.3047428157307131, + "learning_rate": 8.04834096006846e-06, + "loss": 0.4861, + "step": 6656 + }, + { + "epoch": 1.0932605259376347, + "grad_norm": 0.5228184394903771, + "learning_rate": 8.048052518751653e-06, + "loss": 0.4754, + "step": 6657 + }, + { + "epoch": 1.0934247531459775, + "grad_norm": 0.3112575437387302, + "learning_rate": 8.047764038899505e-06, + "loss": 0.4831, + "step": 6658 + }, + { + "epoch": 1.0935889803543202, + "grad_norm": 0.29681541702163144, + "learning_rate": 8.047475520515147e-06, + "loss": 0.4925, + "step": 6659 + }, + { + "epoch": 1.093753207562663, + "grad_norm": 0.280871176322509, + "learning_rate": 8.047186963601714e-06, + "loss": 0.4898, + "step": 6660 + }, + { + "epoch": 1.0939174347710057, + "grad_norm": 0.30043549760570193, + "learning_rate": 8.04689836816234e-06, + "loss": 0.4929, + "step": 6661 + }, + { + "epoch": 1.0940816619793485, + "grad_norm": 0.31300225911045476, + "learning_rate": 8.046609734200162e-06, + "loss": 0.5052, + "step": 6662 + }, + { + "epoch": 1.0942458891876912, + "grad_norm": 0.28856161920277196, + "learning_rate": 8.046321061718312e-06, + "loss": 0.4975, + "step": 6663 + }, + { + "epoch": 1.094410116396034, + "grad_norm": 0.26293407990784057, + "learning_rate": 8.046032350719928e-06, + "loss": 0.4676, + "step": 6664 + }, + { + "epoch": 1.0945743436043767, + "grad_norm": 0.4804798489350901, + "learning_rate": 8.04574360120814e-06, + "loss": 0.5155, + "step": 6665 + }, + { + "epoch": 1.0947385708127193, + "grad_norm": 0.29817475251799247, + "learning_rate": 8.045454813186092e-06, + "loss": 0.4926, + "step": 6666 + }, + { + "epoch": 1.0949027980210622, + "grad_norm": 0.2792147268849725, + "learning_rate": 8.045165986656914e-06, + "loss": 0.4694, + "step": 6667 + }, + { + "epoch": 1.0950670252294048, + "grad_norm": 0.26740986430834013, + "learning_rate": 8.044877121623747e-06, + "loss": 0.47, + "step": 6668 + }, + { + "epoch": 1.0952312524377477, + "grad_norm": 0.3175206189654057, + "learning_rate": 8.044588218089726e-06, + "loss": 0.5061, + "step": 6669 + }, + { + "epoch": 1.0953954796460903, + "grad_norm": 0.3521071238807635, + "learning_rate": 8.04429927605799e-06, + "loss": 0.498, + "step": 6670 + }, + { + "epoch": 1.0955597068544332, + "grad_norm": 0.30179167992615835, + "learning_rate": 8.044010295531676e-06, + "loss": 0.474, + "step": 6671 + }, + { + "epoch": 1.0957239340627758, + "grad_norm": 0.41781233652809985, + "learning_rate": 8.043721276513922e-06, + "loss": 0.4931, + "step": 6672 + }, + { + "epoch": 1.0958881612711187, + "grad_norm": 0.32796241643504886, + "learning_rate": 8.043432219007872e-06, + "loss": 0.4832, + "step": 6673 + }, + { + "epoch": 1.0960523884794613, + "grad_norm": 0.2986490602919634, + "learning_rate": 8.04314312301666e-06, + "loss": 0.4907, + "step": 6674 + }, + { + "epoch": 1.0962166156878042, + "grad_norm": 0.3823780932901472, + "learning_rate": 8.042853988543427e-06, + "loss": 0.4867, + "step": 6675 + }, + { + "epoch": 1.0963808428961468, + "grad_norm": 0.3210929957900738, + "learning_rate": 8.042564815591314e-06, + "loss": 0.4768, + "step": 6676 + }, + { + "epoch": 1.0965450701044896, + "grad_norm": 0.37904104957766954, + "learning_rate": 8.042275604163462e-06, + "loss": 0.4926, + "step": 6677 + }, + { + "epoch": 1.0967092973128323, + "grad_norm": 0.3349723807012331, + "learning_rate": 8.041986354263013e-06, + "loss": 0.5118, + "step": 6678 + }, + { + "epoch": 1.0968735245211751, + "grad_norm": 0.3571590062105453, + "learning_rate": 8.041697065893105e-06, + "loss": 0.4751, + "step": 6679 + }, + { + "epoch": 1.0970377517295178, + "grad_norm": 0.2921285386665655, + "learning_rate": 8.041407739056885e-06, + "loss": 0.4739, + "step": 6680 + }, + { + "epoch": 1.0972019789378606, + "grad_norm": 0.31244656526919706, + "learning_rate": 8.04111837375749e-06, + "loss": 0.4929, + "step": 6681 + }, + { + "epoch": 1.0973662061462033, + "grad_norm": 0.47267268926626577, + "learning_rate": 8.040828969998068e-06, + "loss": 0.4898, + "step": 6682 + }, + { + "epoch": 1.097530433354546, + "grad_norm": 0.2693517145732392, + "learning_rate": 8.04053952778176e-06, + "loss": 0.498, + "step": 6683 + }, + { + "epoch": 1.0976946605628888, + "grad_norm": 0.28351936482992657, + "learning_rate": 8.040250047111706e-06, + "loss": 0.4876, + "step": 6684 + }, + { + "epoch": 1.0978588877712314, + "grad_norm": 0.3594674569385834, + "learning_rate": 8.039960527991055e-06, + "loss": 0.4815, + "step": 6685 + }, + { + "epoch": 1.0980231149795743, + "grad_norm": 0.30259751217946457, + "learning_rate": 8.03967097042295e-06, + "loss": 0.5022, + "step": 6686 + }, + { + "epoch": 1.098187342187917, + "grad_norm": 0.38365337530932603, + "learning_rate": 8.039381374410536e-06, + "loss": 0.4949, + "step": 6687 + }, + { + "epoch": 1.0983515693962598, + "grad_norm": 0.30124925300942795, + "learning_rate": 8.039091739956959e-06, + "loss": 0.5001, + "step": 6688 + }, + { + "epoch": 1.0985157966046024, + "grad_norm": 0.2848395875961075, + "learning_rate": 8.03880206706536e-06, + "loss": 0.5062, + "step": 6689 + }, + { + "epoch": 1.0986800238129453, + "grad_norm": 0.4221184803884185, + "learning_rate": 8.038512355738892e-06, + "loss": 0.4814, + "step": 6690 + }, + { + "epoch": 1.098844251021288, + "grad_norm": 0.2897228472309949, + "learning_rate": 8.038222605980698e-06, + "loss": 0.4858, + "step": 6691 + }, + { + "epoch": 1.0990084782296308, + "grad_norm": 0.2999559771720914, + "learning_rate": 8.037932817793924e-06, + "loss": 0.4975, + "step": 6692 + }, + { + "epoch": 1.0991727054379734, + "grad_norm": 0.2796159071321173, + "learning_rate": 8.037642991181721e-06, + "loss": 0.4843, + "step": 6693 + }, + { + "epoch": 1.0993369326463163, + "grad_norm": 0.3118067130960556, + "learning_rate": 8.037353126147233e-06, + "loss": 0.4975, + "step": 6694 + }, + { + "epoch": 1.099501159854659, + "grad_norm": 0.3917793286041874, + "learning_rate": 8.03706322269361e-06, + "loss": 0.4984, + "step": 6695 + }, + { + "epoch": 1.0996653870630018, + "grad_norm": 0.2701048707532968, + "learning_rate": 8.036773280824e-06, + "loss": 0.4592, + "step": 6696 + }, + { + "epoch": 1.0998296142713444, + "grad_norm": 0.3657316253260058, + "learning_rate": 8.036483300541554e-06, + "loss": 0.4971, + "step": 6697 + }, + { + "epoch": 1.0999938414796873, + "grad_norm": 0.3368853629369273, + "learning_rate": 8.036193281849419e-06, + "loss": 0.478, + "step": 6698 + }, + { + "epoch": 1.10015806868803, + "grad_norm": 0.30968426218990286, + "learning_rate": 8.035903224750745e-06, + "loss": 0.5134, + "step": 6699 + }, + { + "epoch": 1.1003222958963725, + "grad_norm": 0.31875699537562413, + "learning_rate": 8.035613129248683e-06, + "loss": 0.4717, + "step": 6700 + }, + { + "epoch": 1.1004865231047154, + "grad_norm": 0.2941617351152997, + "learning_rate": 8.035322995346386e-06, + "loss": 0.4622, + "step": 6701 + }, + { + "epoch": 1.100650750313058, + "grad_norm": 0.2992618349245907, + "learning_rate": 8.035032823047001e-06, + "loss": 0.4923, + "step": 6702 + }, + { + "epoch": 1.100814977521401, + "grad_norm": 0.32832975541354775, + "learning_rate": 8.034742612353681e-06, + "loss": 0.4931, + "step": 6703 + }, + { + "epoch": 1.1009792047297435, + "grad_norm": 0.36302102867175484, + "learning_rate": 8.034452363269581e-06, + "loss": 0.4783, + "step": 6704 + }, + { + "epoch": 1.1011434319380864, + "grad_norm": 0.33724953597312485, + "learning_rate": 8.034162075797849e-06, + "loss": 0.4849, + "step": 6705 + }, + { + "epoch": 1.101307659146429, + "grad_norm": 0.26336764360879583, + "learning_rate": 8.033871749941642e-06, + "loss": 0.4979, + "step": 6706 + }, + { + "epoch": 1.101471886354772, + "grad_norm": 0.46284018414744904, + "learning_rate": 8.033581385704108e-06, + "loss": 0.4716, + "step": 6707 + }, + { + "epoch": 1.1016361135631145, + "grad_norm": 0.26767536775305845, + "learning_rate": 8.033290983088405e-06, + "loss": 0.4923, + "step": 6708 + }, + { + "epoch": 1.1018003407714574, + "grad_norm": 0.33545393017343467, + "learning_rate": 8.033000542097685e-06, + "loss": 0.4894, + "step": 6709 + }, + { + "epoch": 1.1019645679798, + "grad_norm": 0.28161470373946695, + "learning_rate": 8.032710062735103e-06, + "loss": 0.4919, + "step": 6710 + }, + { + "epoch": 1.1021287951881429, + "grad_norm": 0.29950638831768156, + "learning_rate": 8.032419545003815e-06, + "loss": 0.4772, + "step": 6711 + }, + { + "epoch": 1.1022930223964855, + "grad_norm": 0.36968967122204577, + "learning_rate": 8.032128988906975e-06, + "loss": 0.5008, + "step": 6712 + }, + { + "epoch": 1.1024572496048284, + "grad_norm": 0.3152702117734696, + "learning_rate": 8.03183839444774e-06, + "loss": 0.5156, + "step": 6713 + }, + { + "epoch": 1.102621476813171, + "grad_norm": 0.27945506823563687, + "learning_rate": 8.031547761629264e-06, + "loss": 0.4931, + "step": 6714 + }, + { + "epoch": 1.1027857040215139, + "grad_norm": 0.40158231798578, + "learning_rate": 8.031257090454704e-06, + "loss": 0.4854, + "step": 6715 + }, + { + "epoch": 1.1029499312298565, + "grad_norm": 0.3677471976180123, + "learning_rate": 8.030966380927218e-06, + "loss": 0.4895, + "step": 6716 + }, + { + "epoch": 1.1031141584381992, + "grad_norm": 0.32174666226612697, + "learning_rate": 8.030675633049964e-06, + "loss": 0.5074, + "step": 6717 + }, + { + "epoch": 1.103278385646542, + "grad_norm": 0.28039648079111684, + "learning_rate": 8.030384846826098e-06, + "loss": 0.4949, + "step": 6718 + }, + { + "epoch": 1.1034426128548847, + "grad_norm": 0.373231733104368, + "learning_rate": 8.03009402225878e-06, + "loss": 0.4826, + "step": 6719 + }, + { + "epoch": 1.1036068400632275, + "grad_norm": 0.2952935970118553, + "learning_rate": 8.029803159351167e-06, + "loss": 0.477, + "step": 6720 + }, + { + "epoch": 1.1037710672715701, + "grad_norm": 0.708722428069576, + "learning_rate": 8.029512258106419e-06, + "loss": 0.4978, + "step": 6721 + }, + { + "epoch": 1.103935294479913, + "grad_norm": 0.5013498552378116, + "learning_rate": 8.029221318527697e-06, + "loss": 0.5029, + "step": 6722 + }, + { + "epoch": 1.1040995216882556, + "grad_norm": 0.2879331351210175, + "learning_rate": 8.028930340618158e-06, + "loss": 0.4899, + "step": 6723 + }, + { + "epoch": 1.1042637488965985, + "grad_norm": 0.30600835982671126, + "learning_rate": 8.028639324380962e-06, + "loss": 0.4869, + "step": 6724 + }, + { + "epoch": 1.1044279761049411, + "grad_norm": 0.3288737516377353, + "learning_rate": 8.028348269819273e-06, + "loss": 0.4858, + "step": 6725 + }, + { + "epoch": 1.104592203313284, + "grad_norm": 0.3064727566685573, + "learning_rate": 8.02805717693625e-06, + "loss": 0.4765, + "step": 6726 + }, + { + "epoch": 1.1047564305216266, + "grad_norm": 0.3315491714374862, + "learning_rate": 8.027766045735054e-06, + "loss": 0.4951, + "step": 6727 + }, + { + "epoch": 1.1049206577299695, + "grad_norm": 0.32381105894241796, + "learning_rate": 8.02747487621885e-06, + "loss": 0.4908, + "step": 6728 + }, + { + "epoch": 1.1050848849383121, + "grad_norm": 0.29213412468763467, + "learning_rate": 8.027183668390795e-06, + "loss": 0.4844, + "step": 6729 + }, + { + "epoch": 1.105249112146655, + "grad_norm": 0.3109379295275468, + "learning_rate": 8.026892422254058e-06, + "loss": 0.4955, + "step": 6730 + }, + { + "epoch": 1.1054133393549976, + "grad_norm": 0.30931573698151227, + "learning_rate": 8.0266011378118e-06, + "loss": 0.4868, + "step": 6731 + }, + { + "epoch": 1.1055775665633405, + "grad_norm": 0.33109195922803386, + "learning_rate": 8.02630981506718e-06, + "loss": 0.4952, + "step": 6732 + }, + { + "epoch": 1.1057417937716831, + "grad_norm": 0.3782335185626781, + "learning_rate": 8.026018454023368e-06, + "loss": 0.4915, + "step": 6733 + }, + { + "epoch": 1.1059060209800258, + "grad_norm": 0.2978339135303565, + "learning_rate": 8.025727054683528e-06, + "loss": 0.5001, + "step": 6734 + }, + { + "epoch": 1.1060702481883686, + "grad_norm": 0.32795336859626906, + "learning_rate": 8.02543561705082e-06, + "loss": 0.488, + "step": 6735 + }, + { + "epoch": 1.1062344753967113, + "grad_norm": 0.4264225502151214, + "learning_rate": 8.025144141128416e-06, + "loss": 0.4796, + "step": 6736 + }, + { + "epoch": 1.1063987026050541, + "grad_norm": 0.35193668625045177, + "learning_rate": 8.024852626919476e-06, + "loss": 0.4659, + "step": 6737 + }, + { + "epoch": 1.1065629298133968, + "grad_norm": 0.5959409766547124, + "learning_rate": 8.02456107442717e-06, + "loss": 0.5001, + "step": 6738 + }, + { + "epoch": 1.1067271570217396, + "grad_norm": 0.28543948369974015, + "learning_rate": 8.024269483654663e-06, + "loss": 0.4819, + "step": 6739 + }, + { + "epoch": 1.1068913842300823, + "grad_norm": 0.35121396805542204, + "learning_rate": 8.02397785460512e-06, + "loss": 0.5072, + "step": 6740 + }, + { + "epoch": 1.1070556114384251, + "grad_norm": 0.34711974817765606, + "learning_rate": 8.023686187281715e-06, + "loss": 0.497, + "step": 6741 + }, + { + "epoch": 1.1072198386467678, + "grad_norm": 0.2914907052694788, + "learning_rate": 8.023394481687607e-06, + "loss": 0.4943, + "step": 6742 + }, + { + "epoch": 1.1073840658551106, + "grad_norm": 0.24210791352085734, + "learning_rate": 8.023102737825968e-06, + "loss": 0.4686, + "step": 6743 + }, + { + "epoch": 1.1075482930634533, + "grad_norm": 0.30686925835224993, + "learning_rate": 8.022810955699969e-06, + "loss": 0.4998, + "step": 6744 + }, + { + "epoch": 1.1077125202717961, + "grad_norm": 0.364074147936255, + "learning_rate": 8.022519135312778e-06, + "loss": 0.4706, + "step": 6745 + }, + { + "epoch": 1.1078767474801388, + "grad_norm": 0.3451141324499041, + "learning_rate": 8.02222727666756e-06, + "loss": 0.4977, + "step": 6746 + }, + { + "epoch": 1.1080409746884816, + "grad_norm": 0.4224844755920305, + "learning_rate": 8.02193537976749e-06, + "loss": 0.4832, + "step": 6747 + }, + { + "epoch": 1.1082052018968243, + "grad_norm": 0.3336599123558074, + "learning_rate": 8.021643444615738e-06, + "loss": 0.4962, + "step": 6748 + }, + { + "epoch": 1.1083694291051671, + "grad_norm": 0.29675735868901576, + "learning_rate": 8.021351471215474e-06, + "loss": 0.4864, + "step": 6749 + }, + { + "epoch": 1.1085336563135098, + "grad_norm": 0.37380530835569564, + "learning_rate": 8.021059459569865e-06, + "loss": 0.5131, + "step": 6750 + }, + { + "epoch": 1.1086978835218524, + "grad_norm": 0.3060505137161988, + "learning_rate": 8.020767409682087e-06, + "loss": 0.4857, + "step": 6751 + }, + { + "epoch": 1.1088621107301952, + "grad_norm": 0.3374431312187076, + "learning_rate": 8.020475321555313e-06, + "loss": 0.497, + "step": 6752 + }, + { + "epoch": 1.1090263379385379, + "grad_norm": 0.2895837020492894, + "learning_rate": 8.020183195192712e-06, + "loss": 0.4897, + "step": 6753 + }, + { + "epoch": 1.1091905651468807, + "grad_norm": 0.30210173515026933, + "learning_rate": 8.019891030597459e-06, + "loss": 0.4866, + "step": 6754 + }, + { + "epoch": 1.1093547923552234, + "grad_norm": 0.30843038965702, + "learning_rate": 8.019598827772726e-06, + "loss": 0.4946, + "step": 6755 + }, + { + "epoch": 1.1095190195635662, + "grad_norm": 0.35010649192439486, + "learning_rate": 8.019306586721687e-06, + "loss": 0.4874, + "step": 6756 + }, + { + "epoch": 1.1096832467719089, + "grad_norm": 0.33598334774441196, + "learning_rate": 8.019014307447516e-06, + "loss": 0.4906, + "step": 6757 + }, + { + "epoch": 1.1098474739802517, + "grad_norm": 0.3193771491741254, + "learning_rate": 8.018721989953385e-06, + "loss": 0.4726, + "step": 6758 + }, + { + "epoch": 1.1100117011885944, + "grad_norm": 0.30846186162965583, + "learning_rate": 8.018429634242472e-06, + "loss": 0.4822, + "step": 6759 + }, + { + "epoch": 1.1101759283969372, + "grad_norm": 0.2720305115460639, + "learning_rate": 8.018137240317953e-06, + "loss": 0.4925, + "step": 6760 + }, + { + "epoch": 1.1103401556052799, + "grad_norm": 0.3475860658133177, + "learning_rate": 8.017844808183002e-06, + "loss": 0.4853, + "step": 6761 + }, + { + "epoch": 1.1105043828136227, + "grad_norm": 0.2984241485105639, + "learning_rate": 8.017552337840797e-06, + "loss": 0.503, + "step": 6762 + }, + { + "epoch": 1.1106686100219654, + "grad_norm": 0.2736700880333033, + "learning_rate": 8.017259829294508e-06, + "loss": 0.4958, + "step": 6763 + }, + { + "epoch": 1.1108328372303082, + "grad_norm": 0.2947106459552869, + "learning_rate": 8.01696728254732e-06, + "loss": 0.4935, + "step": 6764 + }, + { + "epoch": 1.1109970644386509, + "grad_norm": 0.4868928828665441, + "learning_rate": 8.016674697602408e-06, + "loss": 0.4979, + "step": 6765 + }, + { + "epoch": 1.1111612916469937, + "grad_norm": 0.35004835932239026, + "learning_rate": 8.016382074462947e-06, + "loss": 0.4935, + "step": 6766 + }, + { + "epoch": 1.1113255188553364, + "grad_norm": 0.31130301478663125, + "learning_rate": 8.016089413132118e-06, + "loss": 0.5001, + "step": 6767 + }, + { + "epoch": 1.111489746063679, + "grad_norm": 0.3159981965485992, + "learning_rate": 8.0157967136131e-06, + "loss": 0.4817, + "step": 6768 + }, + { + "epoch": 1.1116539732720219, + "grad_norm": 0.3877824930328974, + "learning_rate": 8.015503975909066e-06, + "loss": 0.4923, + "step": 6769 + }, + { + "epoch": 1.1118182004803645, + "grad_norm": 0.29734466757560113, + "learning_rate": 8.015211200023204e-06, + "loss": 0.4786, + "step": 6770 + }, + { + "epoch": 1.1119824276887074, + "grad_norm": 0.5139772002706117, + "learning_rate": 8.014918385958688e-06, + "loss": 0.4958, + "step": 6771 + }, + { + "epoch": 1.11214665489705, + "grad_norm": 0.3794058659356544, + "learning_rate": 8.0146255337187e-06, + "loss": 0.4877, + "step": 6772 + }, + { + "epoch": 1.1123108821053929, + "grad_norm": 0.5224552555310031, + "learning_rate": 8.014332643306422e-06, + "loss": 0.4793, + "step": 6773 + }, + { + "epoch": 1.1124751093137355, + "grad_norm": 0.33283451191101004, + "learning_rate": 8.014039714725034e-06, + "loss": 0.4912, + "step": 6774 + }, + { + "epoch": 1.1126393365220784, + "grad_norm": 0.29502653757934966, + "learning_rate": 8.013746747977716e-06, + "loss": 0.4949, + "step": 6775 + }, + { + "epoch": 1.112803563730421, + "grad_norm": 0.3077537731044598, + "learning_rate": 8.013453743067653e-06, + "loss": 0.4841, + "step": 6776 + }, + { + "epoch": 1.1129677909387639, + "grad_norm": 0.3273063129273603, + "learning_rate": 8.013160699998025e-06, + "loss": 0.4913, + "step": 6777 + }, + { + "epoch": 1.1131320181471065, + "grad_norm": 0.3087636468252292, + "learning_rate": 8.012867618772014e-06, + "loss": 0.4965, + "step": 6778 + }, + { + "epoch": 1.1132962453554494, + "grad_norm": 0.39746359055685737, + "learning_rate": 8.012574499392805e-06, + "loss": 0.489, + "step": 6779 + }, + { + "epoch": 1.113460472563792, + "grad_norm": 0.37044284746318834, + "learning_rate": 8.012281341863583e-06, + "loss": 0.5052, + "step": 6780 + }, + { + "epoch": 1.1136246997721349, + "grad_norm": 0.3366605150154806, + "learning_rate": 8.011988146187527e-06, + "loss": 0.4752, + "step": 6781 + }, + { + "epoch": 1.1137889269804775, + "grad_norm": 0.25809968388187504, + "learning_rate": 8.011694912367826e-06, + "loss": 0.4877, + "step": 6782 + }, + { + "epoch": 1.1139531541888203, + "grad_norm": 0.326670691401543, + "learning_rate": 8.011401640407663e-06, + "loss": 0.4749, + "step": 6783 + }, + { + "epoch": 1.114117381397163, + "grad_norm": 0.28171208952437016, + "learning_rate": 8.011108330310224e-06, + "loss": 0.4985, + "step": 6784 + }, + { + "epoch": 1.1142816086055056, + "grad_norm": 0.301397371835579, + "learning_rate": 8.010814982078693e-06, + "loss": 0.4937, + "step": 6785 + }, + { + "epoch": 1.1144458358138485, + "grad_norm": 0.31271833861298604, + "learning_rate": 8.010521595716257e-06, + "loss": 0.5043, + "step": 6786 + }, + { + "epoch": 1.1146100630221911, + "grad_norm": 0.3061642359290164, + "learning_rate": 8.010228171226104e-06, + "loss": 0.4773, + "step": 6787 + }, + { + "epoch": 1.114774290230534, + "grad_norm": 0.34226686963469144, + "learning_rate": 8.009934708611418e-06, + "loss": 0.4814, + "step": 6788 + }, + { + "epoch": 1.1149385174388766, + "grad_norm": 0.2925460215378301, + "learning_rate": 8.00964120787539e-06, + "loss": 0.5067, + "step": 6789 + }, + { + "epoch": 1.1151027446472195, + "grad_norm": 0.5386012244349162, + "learning_rate": 8.009347669021203e-06, + "loss": 0.5001, + "step": 6790 + }, + { + "epoch": 1.1152669718555621, + "grad_norm": 0.3130942002159212, + "learning_rate": 8.009054092052048e-06, + "loss": 0.4763, + "step": 6791 + }, + { + "epoch": 1.115431199063905, + "grad_norm": 0.31529862231909345, + "learning_rate": 8.008760476971114e-06, + "loss": 0.5011, + "step": 6792 + }, + { + "epoch": 1.1155954262722476, + "grad_norm": 0.2964668287424008, + "learning_rate": 8.00846682378159e-06, + "loss": 0.482, + "step": 6793 + }, + { + "epoch": 1.1157596534805905, + "grad_norm": 0.47499097388877043, + "learning_rate": 8.008173132486663e-06, + "loss": 0.4886, + "step": 6794 + }, + { + "epoch": 1.1159238806889331, + "grad_norm": 0.2928459229882892, + "learning_rate": 8.007879403089523e-06, + "loss": 0.4676, + "step": 6795 + }, + { + "epoch": 1.116088107897276, + "grad_norm": 0.3433109964688705, + "learning_rate": 8.007585635593364e-06, + "loss": 0.4909, + "step": 6796 + }, + { + "epoch": 1.1162523351056186, + "grad_norm": 0.4728391511187168, + "learning_rate": 8.007291830001372e-06, + "loss": 0.4819, + "step": 6797 + }, + { + "epoch": 1.1164165623139615, + "grad_norm": 0.4378303059005657, + "learning_rate": 8.006997986316741e-06, + "loss": 0.5186, + "step": 6798 + }, + { + "epoch": 1.116580789522304, + "grad_norm": 0.31289617638300654, + "learning_rate": 8.006704104542661e-06, + "loss": 0.4973, + "step": 6799 + }, + { + "epoch": 1.116745016730647, + "grad_norm": 0.34202747263702443, + "learning_rate": 8.006410184682325e-06, + "loss": 0.47, + "step": 6800 + }, + { + "epoch": 1.1169092439389896, + "grad_norm": 0.2870020047280274, + "learning_rate": 8.006116226738924e-06, + "loss": 0.4959, + "step": 6801 + }, + { + "epoch": 1.1170734711473322, + "grad_norm": 0.35538818945126666, + "learning_rate": 8.00582223071565e-06, + "loss": 0.4819, + "step": 6802 + }, + { + "epoch": 1.117237698355675, + "grad_norm": 0.39746931662385276, + "learning_rate": 8.005528196615698e-06, + "loss": 0.4882, + "step": 6803 + }, + { + "epoch": 1.1174019255640177, + "grad_norm": 0.32267732981456915, + "learning_rate": 8.005234124442263e-06, + "loss": 0.4929, + "step": 6804 + }, + { + "epoch": 1.1175661527723606, + "grad_norm": 0.4033719198932124, + "learning_rate": 8.004940014198535e-06, + "loss": 0.5093, + "step": 6805 + }, + { + "epoch": 1.1177303799807032, + "grad_norm": 0.29773637858266944, + "learning_rate": 8.00464586588771e-06, + "loss": 0.4692, + "step": 6806 + }, + { + "epoch": 1.117894607189046, + "grad_norm": 0.34154499889118795, + "learning_rate": 8.004351679512983e-06, + "loss": 0.4915, + "step": 6807 + }, + { + "epoch": 1.1180588343973887, + "grad_norm": 0.3385191940718238, + "learning_rate": 8.004057455077549e-06, + "loss": 0.4972, + "step": 6808 + }, + { + "epoch": 1.1182230616057316, + "grad_norm": 0.31447094702417544, + "learning_rate": 8.003763192584602e-06, + "loss": 0.4847, + "step": 6809 + }, + { + "epoch": 1.1183872888140742, + "grad_norm": 0.3199578937694982, + "learning_rate": 8.003468892037342e-06, + "loss": 0.4921, + "step": 6810 + }, + { + "epoch": 1.118551516022417, + "grad_norm": 0.28372764438025105, + "learning_rate": 8.003174553438961e-06, + "loss": 0.4905, + "step": 6811 + }, + { + "epoch": 1.1187157432307597, + "grad_norm": 0.2801623722482946, + "learning_rate": 8.002880176792659e-06, + "loss": 0.4684, + "step": 6812 + }, + { + "epoch": 1.1188799704391026, + "grad_norm": 0.30831350090955995, + "learning_rate": 8.002585762101632e-06, + "loss": 0.4729, + "step": 6813 + }, + { + "epoch": 1.1190441976474452, + "grad_norm": 0.3297361277092859, + "learning_rate": 8.002291309369075e-06, + "loss": 0.4947, + "step": 6814 + }, + { + "epoch": 1.119208424855788, + "grad_norm": 0.28407434308522084, + "learning_rate": 8.001996818598192e-06, + "loss": 0.4891, + "step": 6815 + }, + { + "epoch": 1.1193726520641307, + "grad_norm": 0.3316328143968119, + "learning_rate": 8.001702289792178e-06, + "loss": 0.4746, + "step": 6816 + }, + { + "epoch": 1.1195368792724736, + "grad_norm": 0.30843174241476523, + "learning_rate": 8.001407722954228e-06, + "loss": 0.4867, + "step": 6817 + }, + { + "epoch": 1.1197011064808162, + "grad_norm": 0.30407152262582604, + "learning_rate": 8.00111311808755e-06, + "loss": 0.4799, + "step": 6818 + }, + { + "epoch": 1.1198653336891589, + "grad_norm": 0.30124797465704084, + "learning_rate": 8.000818475195335e-06, + "loss": 0.5165, + "step": 6819 + }, + { + "epoch": 1.1200295608975017, + "grad_norm": 0.27888439867394527, + "learning_rate": 8.00052379428079e-06, + "loss": 0.4949, + "step": 6820 + }, + { + "epoch": 1.1201937881058444, + "grad_norm": 0.2996848725896366, + "learning_rate": 8.00022907534711e-06, + "loss": 0.4691, + "step": 6821 + }, + { + "epoch": 1.1203580153141872, + "grad_norm": 0.49092113883894145, + "learning_rate": 7.999934318397499e-06, + "loss": 0.4864, + "step": 6822 + }, + { + "epoch": 1.1205222425225299, + "grad_norm": 0.32199904567550197, + "learning_rate": 7.99963952343516e-06, + "loss": 0.5065, + "step": 6823 + }, + { + "epoch": 1.1206864697308727, + "grad_norm": 0.3454881500764911, + "learning_rate": 7.99934469046329e-06, + "loss": 0.5026, + "step": 6824 + }, + { + "epoch": 1.1208506969392154, + "grad_norm": 0.34670731138934535, + "learning_rate": 7.999049819485094e-06, + "loss": 0.4836, + "step": 6825 + }, + { + "epoch": 1.1210149241475582, + "grad_norm": 0.40482327295583864, + "learning_rate": 7.998754910503777e-06, + "loss": 0.4726, + "step": 6826 + }, + { + "epoch": 1.1211791513559008, + "grad_norm": 0.2845574511224747, + "learning_rate": 7.998459963522537e-06, + "loss": 0.4893, + "step": 6827 + }, + { + "epoch": 1.1213433785642437, + "grad_norm": 0.2887523915174508, + "learning_rate": 7.998164978544581e-06, + "loss": 0.4788, + "step": 6828 + }, + { + "epoch": 1.1215076057725863, + "grad_norm": 0.28850627113746496, + "learning_rate": 7.99786995557311e-06, + "loss": 0.5, + "step": 6829 + }, + { + "epoch": 1.1216718329809292, + "grad_norm": 0.42010910356929393, + "learning_rate": 7.997574894611332e-06, + "loss": 0.4881, + "step": 6830 + }, + { + "epoch": 1.1218360601892718, + "grad_norm": 0.3032247075642661, + "learning_rate": 7.997279795662447e-06, + "loss": 0.4677, + "step": 6831 + }, + { + "epoch": 1.1220002873976145, + "grad_norm": 0.3286475855194463, + "learning_rate": 7.996984658729664e-06, + "loss": 0.5162, + "step": 6832 + }, + { + "epoch": 1.1221645146059573, + "grad_norm": 0.28999341860845684, + "learning_rate": 7.996689483816187e-06, + "loss": 0.4928, + "step": 6833 + }, + { + "epoch": 1.1223287418143002, + "grad_norm": 0.42759062227189226, + "learning_rate": 7.996394270925222e-06, + "loss": 0.486, + "step": 6834 + }, + { + "epoch": 1.1224929690226428, + "grad_norm": 0.3450545136947932, + "learning_rate": 7.996099020059975e-06, + "loss": 0.4977, + "step": 6835 + }, + { + "epoch": 1.1226571962309855, + "grad_norm": 0.3517990654176804, + "learning_rate": 7.995803731223652e-06, + "loss": 0.4919, + "step": 6836 + }, + { + "epoch": 1.1228214234393283, + "grad_norm": 0.30347220564861077, + "learning_rate": 7.995508404419462e-06, + "loss": 0.4698, + "step": 6837 + }, + { + "epoch": 1.122985650647671, + "grad_norm": 0.4887298383039076, + "learning_rate": 7.995213039650613e-06, + "loss": 0.469, + "step": 6838 + }, + { + "epoch": 1.1231498778560138, + "grad_norm": 0.3297509541396508, + "learning_rate": 7.994917636920311e-06, + "loss": 0.4922, + "step": 6839 + }, + { + "epoch": 1.1233141050643565, + "grad_norm": 0.36093345558286805, + "learning_rate": 7.994622196231764e-06, + "loss": 0.5015, + "step": 6840 + }, + { + "epoch": 1.1234783322726993, + "grad_norm": 0.2868360244813704, + "learning_rate": 7.994326717588181e-06, + "loss": 0.4784, + "step": 6841 + }, + { + "epoch": 1.123642559481042, + "grad_norm": 0.3860834145580953, + "learning_rate": 7.994031200992771e-06, + "loss": 0.4697, + "step": 6842 + }, + { + "epoch": 1.1238067866893848, + "grad_norm": 0.30524245277727996, + "learning_rate": 7.993735646448747e-06, + "loss": 0.4782, + "step": 6843 + }, + { + "epoch": 1.1239710138977275, + "grad_norm": 0.33538156594853824, + "learning_rate": 7.993440053959317e-06, + "loss": 0.4734, + "step": 6844 + }, + { + "epoch": 1.1241352411060703, + "grad_norm": 0.29676905678033594, + "learning_rate": 7.993144423527688e-06, + "loss": 0.4693, + "step": 6845 + }, + { + "epoch": 1.124299468314413, + "grad_norm": 0.36949417708823395, + "learning_rate": 7.992848755157078e-06, + "loss": 0.4808, + "step": 6846 + }, + { + "epoch": 1.1244636955227558, + "grad_norm": 0.28107760405872567, + "learning_rate": 7.99255304885069e-06, + "loss": 0.4771, + "step": 6847 + }, + { + "epoch": 1.1246279227310985, + "grad_norm": 0.3159370048129486, + "learning_rate": 7.992257304611742e-06, + "loss": 0.5189, + "step": 6848 + }, + { + "epoch": 1.124792149939441, + "grad_norm": 0.3667710699910642, + "learning_rate": 7.991961522443443e-06, + "loss": 0.4874, + "step": 6849 + }, + { + "epoch": 1.124956377147784, + "grad_norm": 0.4336645803076832, + "learning_rate": 7.991665702349006e-06, + "loss": 0.4809, + "step": 6850 + }, + { + "epoch": 1.1251206043561268, + "grad_norm": 0.3175651323732905, + "learning_rate": 7.991369844331644e-06, + "loss": 0.4693, + "step": 6851 + }, + { + "epoch": 1.1252848315644695, + "grad_norm": 0.31051693646801093, + "learning_rate": 7.991073948394571e-06, + "loss": 0.4779, + "step": 6852 + }, + { + "epoch": 1.125449058772812, + "grad_norm": 0.3380347830518644, + "learning_rate": 7.990778014541e-06, + "loss": 0.5012, + "step": 6853 + }, + { + "epoch": 1.125613285981155, + "grad_norm": 0.3200429128632671, + "learning_rate": 7.990482042774146e-06, + "loss": 0.4813, + "step": 6854 + }, + { + "epoch": 1.1257775131894976, + "grad_norm": 0.6531875458342193, + "learning_rate": 7.990186033097221e-06, + "loss": 0.4828, + "step": 6855 + }, + { + "epoch": 1.1259417403978405, + "grad_norm": 0.29262763987154605, + "learning_rate": 7.989889985513443e-06, + "loss": 0.4919, + "step": 6856 + }, + { + "epoch": 1.126105967606183, + "grad_norm": 0.34676785422500556, + "learning_rate": 7.989593900026025e-06, + "loss": 0.4872, + "step": 6857 + }, + { + "epoch": 1.126270194814526, + "grad_norm": 0.3232893839101977, + "learning_rate": 7.989297776638185e-06, + "loss": 0.4767, + "step": 6858 + }, + { + "epoch": 1.1264344220228686, + "grad_norm": 0.4125187177592716, + "learning_rate": 7.98900161535314e-06, + "loss": 0.4851, + "step": 6859 + }, + { + "epoch": 1.1265986492312114, + "grad_norm": 0.4946867385562897, + "learning_rate": 7.988705416174103e-06, + "loss": 0.5002, + "step": 6860 + }, + { + "epoch": 1.126762876439554, + "grad_norm": 0.3315793843980202, + "learning_rate": 7.988409179104291e-06, + "loss": 0.4835, + "step": 6861 + }, + { + "epoch": 1.126927103647897, + "grad_norm": 0.306718722386014, + "learning_rate": 7.988112904146926e-06, + "loss": 0.4587, + "step": 6862 + }, + { + "epoch": 1.1270913308562396, + "grad_norm": 0.31903874679389793, + "learning_rate": 7.987816591305222e-06, + "loss": 0.4702, + "step": 6863 + }, + { + "epoch": 1.1272555580645824, + "grad_norm": 0.2940358413813076, + "learning_rate": 7.987520240582398e-06, + "loss": 0.4732, + "step": 6864 + }, + { + "epoch": 1.127419785272925, + "grad_norm": 0.2965111002119381, + "learning_rate": 7.987223851981673e-06, + "loss": 0.4842, + "step": 6865 + }, + { + "epoch": 1.1275840124812677, + "grad_norm": 0.29814281040642143, + "learning_rate": 7.986927425506266e-06, + "loss": 0.5065, + "step": 6866 + }, + { + "epoch": 1.1277482396896106, + "grad_norm": 0.44997584875021623, + "learning_rate": 7.986630961159396e-06, + "loss": 0.4937, + "step": 6867 + }, + { + "epoch": 1.1279124668979534, + "grad_norm": 0.3079144456977793, + "learning_rate": 7.986334458944284e-06, + "loss": 0.4688, + "step": 6868 + }, + { + "epoch": 1.128076694106296, + "grad_norm": 0.28911938864641884, + "learning_rate": 7.986037918864149e-06, + "loss": 0.4743, + "step": 6869 + }, + { + "epoch": 1.1282409213146387, + "grad_norm": 0.3061321697174208, + "learning_rate": 7.985741340922214e-06, + "loss": 0.4876, + "step": 6870 + }, + { + "epoch": 1.1284051485229816, + "grad_norm": 0.31625410917552904, + "learning_rate": 7.985444725121698e-06, + "loss": 0.5166, + "step": 6871 + }, + { + "epoch": 1.1285693757313242, + "grad_norm": 0.36309716087794164, + "learning_rate": 7.985148071465822e-06, + "loss": 0.4754, + "step": 6872 + }, + { + "epoch": 1.128733602939667, + "grad_norm": 0.6592469202298972, + "learning_rate": 7.984851379957809e-06, + "loss": 0.4874, + "step": 6873 + }, + { + "epoch": 1.1288978301480097, + "grad_norm": 0.31436127933999225, + "learning_rate": 7.984554650600883e-06, + "loss": 0.4789, + "step": 6874 + }, + { + "epoch": 1.1290620573563526, + "grad_norm": 0.2779929040169858, + "learning_rate": 7.984257883398264e-06, + "loss": 0.4827, + "step": 6875 + }, + { + "epoch": 1.1292262845646952, + "grad_norm": 0.2747578546096707, + "learning_rate": 7.983961078353175e-06, + "loss": 0.5012, + "step": 6876 + }, + { + "epoch": 1.129390511773038, + "grad_norm": 0.31846168785030443, + "learning_rate": 7.983664235468845e-06, + "loss": 0.5102, + "step": 6877 + }, + { + "epoch": 1.1295547389813807, + "grad_norm": 0.771861999176361, + "learning_rate": 7.98336735474849e-06, + "loss": 0.4946, + "step": 6878 + }, + { + "epoch": 1.1297189661897236, + "grad_norm": 0.502259440692531, + "learning_rate": 7.98307043619534e-06, + "loss": 0.4755, + "step": 6879 + }, + { + "epoch": 1.1298831933980662, + "grad_norm": 0.35067185395810174, + "learning_rate": 7.982773479812616e-06, + "loss": 0.4899, + "step": 6880 + }, + { + "epoch": 1.130047420606409, + "grad_norm": 0.27701882831760466, + "learning_rate": 7.98247648560355e-06, + "loss": 0.5046, + "step": 6881 + }, + { + "epoch": 1.1302116478147517, + "grad_norm": 0.3261854870605588, + "learning_rate": 7.98217945357136e-06, + "loss": 0.4823, + "step": 6882 + }, + { + "epoch": 1.1303758750230943, + "grad_norm": 0.2753413581996859, + "learning_rate": 7.981882383719276e-06, + "loss": 0.4917, + "step": 6883 + }, + { + "epoch": 1.1305401022314372, + "grad_norm": 0.2713442153753001, + "learning_rate": 7.981585276050522e-06, + "loss": 0.4697, + "step": 6884 + }, + { + "epoch": 1.13070432943978, + "grad_norm": 0.3051670893385003, + "learning_rate": 7.981288130568328e-06, + "loss": 0.4753, + "step": 6885 + }, + { + "epoch": 1.1308685566481227, + "grad_norm": 0.3512406823319829, + "learning_rate": 7.980990947275918e-06, + "loss": 0.4971, + "step": 6886 + }, + { + "epoch": 1.1310327838564653, + "grad_norm": 0.48853751705585835, + "learning_rate": 7.980693726176525e-06, + "loss": 0.491, + "step": 6887 + }, + { + "epoch": 1.1311970110648082, + "grad_norm": 0.42627311188328976, + "learning_rate": 7.98039646727337e-06, + "loss": 0.4857, + "step": 6888 + }, + { + "epoch": 1.1313612382731508, + "grad_norm": 0.2639648626523676, + "learning_rate": 7.980099170569687e-06, + "loss": 0.4999, + "step": 6889 + }, + { + "epoch": 1.1315254654814937, + "grad_norm": 0.339761116269173, + "learning_rate": 7.979801836068703e-06, + "loss": 0.5077, + "step": 6890 + }, + { + "epoch": 1.1316896926898363, + "grad_norm": 0.3125885329528476, + "learning_rate": 7.979504463773647e-06, + "loss": 0.5013, + "step": 6891 + }, + { + "epoch": 1.1318539198981792, + "grad_norm": 0.35306176008613577, + "learning_rate": 7.979207053687749e-06, + "loss": 0.4881, + "step": 6892 + }, + { + "epoch": 1.1320181471065218, + "grad_norm": 0.29872445785240126, + "learning_rate": 7.97890960581424e-06, + "loss": 0.475, + "step": 6893 + }, + { + "epoch": 1.1321823743148647, + "grad_norm": 0.44540460265682047, + "learning_rate": 7.97861212015635e-06, + "loss": 0.4885, + "step": 6894 + }, + { + "epoch": 1.1323466015232073, + "grad_norm": 0.292544060291412, + "learning_rate": 7.978314596717308e-06, + "loss": 0.4914, + "step": 6895 + }, + { + "epoch": 1.1325108287315502, + "grad_norm": 0.3707873447225147, + "learning_rate": 7.97801703550035e-06, + "loss": 0.4705, + "step": 6896 + }, + { + "epoch": 1.1326750559398928, + "grad_norm": 0.31368514148862653, + "learning_rate": 7.977719436508702e-06, + "loss": 0.4815, + "step": 6897 + }, + { + "epoch": 1.1328392831482357, + "grad_norm": 0.3268045977778402, + "learning_rate": 7.977421799745602e-06, + "loss": 0.4857, + "step": 6898 + }, + { + "epoch": 1.1330035103565783, + "grad_norm": 0.328387273053832, + "learning_rate": 7.977124125214278e-06, + "loss": 0.4928, + "step": 6899 + }, + { + "epoch": 1.133167737564921, + "grad_norm": 0.8777662985504897, + "learning_rate": 7.976826412917966e-06, + "loss": 0.4823, + "step": 6900 + }, + { + "epoch": 1.1333319647732638, + "grad_norm": 0.30699456552514476, + "learning_rate": 7.9765286628599e-06, + "loss": 0.4806, + "step": 6901 + }, + { + "epoch": 1.1334961919816067, + "grad_norm": 0.3127291424152957, + "learning_rate": 7.976230875043309e-06, + "loss": 0.4816, + "step": 6902 + }, + { + "epoch": 1.1336604191899493, + "grad_norm": 0.3543717904664054, + "learning_rate": 7.975933049471433e-06, + "loss": 0.4662, + "step": 6903 + }, + { + "epoch": 1.133824646398292, + "grad_norm": 0.3399489359198452, + "learning_rate": 7.975635186147504e-06, + "loss": 0.5052, + "step": 6904 + }, + { + "epoch": 1.1339888736066348, + "grad_norm": 0.30131910741574397, + "learning_rate": 7.975337285074755e-06, + "loss": 0.5135, + "step": 6905 + }, + { + "epoch": 1.1341531008149774, + "grad_norm": 0.31272070176723527, + "learning_rate": 7.975039346256427e-06, + "loss": 0.5042, + "step": 6906 + }, + { + "epoch": 1.1343173280233203, + "grad_norm": 0.29675626380205483, + "learning_rate": 7.974741369695752e-06, + "loss": 0.4861, + "step": 6907 + }, + { + "epoch": 1.134481555231663, + "grad_norm": 0.37560474384127424, + "learning_rate": 7.974443355395965e-06, + "loss": 0.4789, + "step": 6908 + }, + { + "epoch": 1.1346457824400058, + "grad_norm": 0.28532039885356286, + "learning_rate": 7.974145303360305e-06, + "loss": 0.5013, + "step": 6909 + }, + { + "epoch": 1.1348100096483484, + "grad_norm": 0.9029060408738501, + "learning_rate": 7.97384721359201e-06, + "loss": 0.4825, + "step": 6910 + }, + { + "epoch": 1.1349742368566913, + "grad_norm": 0.3563699828853312, + "learning_rate": 7.973549086094317e-06, + "loss": 0.5041, + "step": 6911 + }, + { + "epoch": 1.135138464065034, + "grad_norm": 0.28934448513751987, + "learning_rate": 7.973250920870463e-06, + "loss": 0.4934, + "step": 6912 + }, + { + "epoch": 1.1353026912733768, + "grad_norm": 0.30062125993335753, + "learning_rate": 7.972952717923686e-06, + "loss": 0.464, + "step": 6913 + }, + { + "epoch": 1.1354669184817194, + "grad_norm": 0.319334843105809, + "learning_rate": 7.972654477257226e-06, + "loss": 0.4851, + "step": 6914 + }, + { + "epoch": 1.1356311456900623, + "grad_norm": 0.29289754121365025, + "learning_rate": 7.972356198874322e-06, + "loss": 0.4783, + "step": 6915 + }, + { + "epoch": 1.135795372898405, + "grad_norm": 0.5508218005907385, + "learning_rate": 7.972057882778214e-06, + "loss": 0.4843, + "step": 6916 + }, + { + "epoch": 1.1359596001067476, + "grad_norm": 0.3015173952359254, + "learning_rate": 7.97175952897214e-06, + "loss": 0.4723, + "step": 6917 + }, + { + "epoch": 1.1361238273150904, + "grad_norm": 0.29478196668659074, + "learning_rate": 7.971461137459344e-06, + "loss": 0.4849, + "step": 6918 + }, + { + "epoch": 1.1362880545234333, + "grad_norm": 0.29208606846186064, + "learning_rate": 7.971162708243062e-06, + "loss": 0.5051, + "step": 6919 + }, + { + "epoch": 1.136452281731776, + "grad_norm": 0.3201941594667582, + "learning_rate": 7.97086424132654e-06, + "loss": 0.4686, + "step": 6920 + }, + { + "epoch": 1.1366165089401186, + "grad_norm": 0.320605390702946, + "learning_rate": 7.970565736713015e-06, + "loss": 0.4832, + "step": 6921 + }, + { + "epoch": 1.1367807361484614, + "grad_norm": 0.4047041761384959, + "learning_rate": 7.970267194405732e-06, + "loss": 0.4722, + "step": 6922 + }, + { + "epoch": 1.136944963356804, + "grad_norm": 0.30560656069904657, + "learning_rate": 7.969968614407934e-06, + "loss": 0.5096, + "step": 6923 + }, + { + "epoch": 1.137109190565147, + "grad_norm": 0.2658716031775788, + "learning_rate": 7.969669996722862e-06, + "loss": 0.4676, + "step": 6924 + }, + { + "epoch": 1.1372734177734896, + "grad_norm": 0.3873747529118186, + "learning_rate": 7.96937134135376e-06, + "loss": 0.4907, + "step": 6925 + }, + { + "epoch": 1.1374376449818324, + "grad_norm": 0.31960115771207515, + "learning_rate": 7.969072648303874e-06, + "loss": 0.4802, + "step": 6926 + }, + { + "epoch": 1.137601872190175, + "grad_norm": 0.3590451481218717, + "learning_rate": 7.968773917576445e-06, + "loss": 0.4909, + "step": 6927 + }, + { + "epoch": 1.137766099398518, + "grad_norm": 0.35887748907508493, + "learning_rate": 7.968475149174718e-06, + "loss": 0.4828, + "step": 6928 + }, + { + "epoch": 1.1379303266068606, + "grad_norm": 0.343101086073062, + "learning_rate": 7.96817634310194e-06, + "loss": 0.4925, + "step": 6929 + }, + { + "epoch": 1.1380945538152034, + "grad_norm": 0.30393988234287916, + "learning_rate": 7.967877499361351e-06, + "loss": 0.4939, + "step": 6930 + }, + { + "epoch": 1.138258781023546, + "grad_norm": 0.5099165088420305, + "learning_rate": 7.967578617956203e-06, + "loss": 0.471, + "step": 6931 + }, + { + "epoch": 1.138423008231889, + "grad_norm": 0.40570362767769275, + "learning_rate": 7.96727969888974e-06, + "loss": 0.4982, + "step": 6932 + }, + { + "epoch": 1.1385872354402315, + "grad_norm": 0.2764636379439328, + "learning_rate": 7.966980742165207e-06, + "loss": 0.4871, + "step": 6933 + }, + { + "epoch": 1.1387514626485742, + "grad_norm": 0.31228463120141703, + "learning_rate": 7.966681747785852e-06, + "loss": 0.4833, + "step": 6934 + }, + { + "epoch": 1.138915689856917, + "grad_norm": 0.28009158703806936, + "learning_rate": 7.966382715754922e-06, + "loss": 0.5005, + "step": 6935 + }, + { + "epoch": 1.13907991706526, + "grad_norm": 0.30035288065758636, + "learning_rate": 7.966083646075666e-06, + "loss": 0.4989, + "step": 6936 + }, + { + "epoch": 1.1392441442736025, + "grad_norm": 0.32496233991867585, + "learning_rate": 7.96578453875133e-06, + "loss": 0.4964, + "step": 6937 + }, + { + "epoch": 1.1394083714819452, + "grad_norm": 0.27863619283479796, + "learning_rate": 7.965485393785167e-06, + "loss": 0.4853, + "step": 6938 + }, + { + "epoch": 1.139572598690288, + "grad_norm": 0.3023354281991656, + "learning_rate": 7.965186211180421e-06, + "loss": 0.477, + "step": 6939 + }, + { + "epoch": 1.1397368258986307, + "grad_norm": 0.3838169698056991, + "learning_rate": 7.964886990940344e-06, + "loss": 0.4988, + "step": 6940 + }, + { + "epoch": 1.1399010531069735, + "grad_norm": 0.3102097812889304, + "learning_rate": 7.964587733068186e-06, + "loss": 0.4733, + "step": 6941 + }, + { + "epoch": 1.1400652803153162, + "grad_norm": 0.38695000979047367, + "learning_rate": 7.964288437567195e-06, + "loss": 0.4877, + "step": 6942 + }, + { + "epoch": 1.140229507523659, + "grad_norm": 0.30817199886482777, + "learning_rate": 7.963989104440625e-06, + "loss": 0.4835, + "step": 6943 + }, + { + "epoch": 1.1403937347320017, + "grad_norm": 0.30619596819987116, + "learning_rate": 7.963689733691724e-06, + "loss": 0.4842, + "step": 6944 + }, + { + "epoch": 1.1405579619403445, + "grad_norm": 0.2729947507683516, + "learning_rate": 7.963390325323744e-06, + "loss": 0.4712, + "step": 6945 + }, + { + "epoch": 1.1407221891486872, + "grad_norm": 0.2844335128707053, + "learning_rate": 7.963090879339939e-06, + "loss": 0.5104, + "step": 6946 + }, + { + "epoch": 1.14088641635703, + "grad_norm": 0.35143865483515885, + "learning_rate": 7.962791395743559e-06, + "loss": 0.5016, + "step": 6947 + }, + { + "epoch": 1.1410506435653727, + "grad_norm": 0.29935758721236755, + "learning_rate": 7.962491874537856e-06, + "loss": 0.4927, + "step": 6948 + }, + { + "epoch": 1.1412148707737155, + "grad_norm": 0.38742203083071736, + "learning_rate": 7.962192315726086e-06, + "loss": 0.4872, + "step": 6949 + }, + { + "epoch": 1.1413790979820582, + "grad_norm": 0.26125003171023947, + "learning_rate": 7.961892719311504e-06, + "loss": 0.4861, + "step": 6950 + }, + { + "epoch": 1.1415433251904008, + "grad_norm": 0.2788446059911499, + "learning_rate": 7.961593085297357e-06, + "loss": 0.4806, + "step": 6951 + }, + { + "epoch": 1.1417075523987437, + "grad_norm": 0.37282944348130787, + "learning_rate": 7.961293413686906e-06, + "loss": 0.514, + "step": 6952 + }, + { + "epoch": 1.1418717796070865, + "grad_norm": 0.3248997890565748, + "learning_rate": 7.960993704483402e-06, + "loss": 0.4991, + "step": 6953 + }, + { + "epoch": 1.1420360068154292, + "grad_norm": 0.36815448389317, + "learning_rate": 7.960693957690101e-06, + "loss": 0.4934, + "step": 6954 + }, + { + "epoch": 1.1422002340237718, + "grad_norm": 0.43626717191432984, + "learning_rate": 7.960394173310259e-06, + "loss": 0.4954, + "step": 6955 + }, + { + "epoch": 1.1423644612321147, + "grad_norm": 0.34564209422226533, + "learning_rate": 7.96009435134713e-06, + "loss": 0.5072, + "step": 6956 + }, + { + "epoch": 1.1425286884404573, + "grad_norm": 0.3229667721598629, + "learning_rate": 7.959794491803975e-06, + "loss": 0.484, + "step": 6957 + }, + { + "epoch": 1.1426929156488002, + "grad_norm": 0.5355285268543933, + "learning_rate": 7.959494594684047e-06, + "loss": 0.5168, + "step": 6958 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3110418435011149, + "learning_rate": 7.959194659990602e-06, + "loss": 0.4927, + "step": 6959 + }, + { + "epoch": 1.1430213700654857, + "grad_norm": 0.28513444305183067, + "learning_rate": 7.958894687726902e-06, + "loss": 0.4817, + "step": 6960 + }, + { + "epoch": 1.1431855972738283, + "grad_norm": 0.27728566842644153, + "learning_rate": 7.958594677896201e-06, + "loss": 0.4884, + "step": 6961 + }, + { + "epoch": 1.1433498244821712, + "grad_norm": 0.3150109157076501, + "learning_rate": 7.958294630501761e-06, + "loss": 0.5025, + "step": 6962 + }, + { + "epoch": 1.1435140516905138, + "grad_norm": 0.37821826587535073, + "learning_rate": 7.957994545546838e-06, + "loss": 0.4677, + "step": 6963 + }, + { + "epoch": 1.1436782788988566, + "grad_norm": 0.272360968366249, + "learning_rate": 7.95769442303469e-06, + "loss": 0.4825, + "step": 6964 + }, + { + "epoch": 1.1438425061071993, + "grad_norm": 0.29025116102764614, + "learning_rate": 7.957394262968581e-06, + "loss": 0.4693, + "step": 6965 + }, + { + "epoch": 1.1440067333155421, + "grad_norm": 0.3988120980456517, + "learning_rate": 7.957094065351767e-06, + "loss": 0.4878, + "step": 6966 + }, + { + "epoch": 1.1441709605238848, + "grad_norm": 0.32215514818082297, + "learning_rate": 7.956793830187512e-06, + "loss": 0.5049, + "step": 6967 + }, + { + "epoch": 1.1443351877322274, + "grad_norm": 0.3141652271654795, + "learning_rate": 7.956493557479074e-06, + "loss": 0.4791, + "step": 6968 + }, + { + "epoch": 1.1444994149405703, + "grad_norm": 0.29312116929724086, + "learning_rate": 7.956193247229714e-06, + "loss": 0.4669, + "step": 6969 + }, + { + "epoch": 1.1446636421489131, + "grad_norm": 0.3198331201930757, + "learning_rate": 7.955892899442697e-06, + "loss": 0.4892, + "step": 6970 + }, + { + "epoch": 1.1448278693572558, + "grad_norm": 0.8979760840640667, + "learning_rate": 7.955592514121281e-06, + "loss": 0.4767, + "step": 6971 + }, + { + "epoch": 1.1449920965655984, + "grad_norm": 0.2990306712439826, + "learning_rate": 7.955292091268733e-06, + "loss": 0.5102, + "step": 6972 + }, + { + "epoch": 1.1451563237739413, + "grad_norm": 0.3053581095580193, + "learning_rate": 7.954991630888311e-06, + "loss": 0.4723, + "step": 6973 + }, + { + "epoch": 1.145320550982284, + "grad_norm": 0.3150822655293769, + "learning_rate": 7.954691132983282e-06, + "loss": 0.4704, + "step": 6974 + }, + { + "epoch": 1.1454847781906268, + "grad_norm": 0.2882659024696099, + "learning_rate": 7.954390597556908e-06, + "loss": 0.4725, + "step": 6975 + }, + { + "epoch": 1.1456490053989694, + "grad_norm": 0.2797251128466923, + "learning_rate": 7.954090024612453e-06, + "loss": 0.4743, + "step": 6976 + }, + { + "epoch": 1.1458132326073123, + "grad_norm": 0.3642369480396053, + "learning_rate": 7.953789414153183e-06, + "loss": 0.5029, + "step": 6977 + }, + { + "epoch": 1.145977459815655, + "grad_norm": 0.29543854541561576, + "learning_rate": 7.953488766182361e-06, + "loss": 0.4801, + "step": 6978 + }, + { + "epoch": 1.1461416870239978, + "grad_norm": 0.3695440138181109, + "learning_rate": 7.953188080703254e-06, + "loss": 0.4773, + "step": 6979 + }, + { + "epoch": 1.1463059142323404, + "grad_norm": 0.35804494112721486, + "learning_rate": 7.952887357719125e-06, + "loss": 0.4892, + "step": 6980 + }, + { + "epoch": 1.1464701414406833, + "grad_norm": 0.32574825202652413, + "learning_rate": 7.952586597233245e-06, + "loss": 0.5225, + "step": 6981 + }, + { + "epoch": 1.146634368649026, + "grad_norm": 0.2549369661307935, + "learning_rate": 7.952285799248875e-06, + "loss": 0.4872, + "step": 6982 + }, + { + "epoch": 1.1467985958573688, + "grad_norm": 0.2836474018881919, + "learning_rate": 7.951984963769287e-06, + "loss": 0.4807, + "step": 6983 + }, + { + "epoch": 1.1469628230657114, + "grad_norm": 0.3010588829949219, + "learning_rate": 7.951684090797744e-06, + "loss": 0.4766, + "step": 6984 + }, + { + "epoch": 1.147127050274054, + "grad_norm": 0.2774477777434259, + "learning_rate": 7.951383180337516e-06, + "loss": 0.4914, + "step": 6985 + }, + { + "epoch": 1.147291277482397, + "grad_norm": 0.43537709750406217, + "learning_rate": 7.951082232391873e-06, + "loss": 0.4822, + "step": 6986 + }, + { + "epoch": 1.1474555046907398, + "grad_norm": 0.29585741210346284, + "learning_rate": 7.950781246964079e-06, + "loss": 0.47, + "step": 6987 + }, + { + "epoch": 1.1476197318990824, + "grad_norm": 0.3355195438180755, + "learning_rate": 7.950480224057406e-06, + "loss": 0.4849, + "step": 6988 + }, + { + "epoch": 1.147783959107425, + "grad_norm": 0.32705446324079807, + "learning_rate": 7.950179163675124e-06, + "loss": 0.4889, + "step": 6989 + }, + { + "epoch": 1.147948186315768, + "grad_norm": 0.2670710136847381, + "learning_rate": 7.9498780658205e-06, + "loss": 0.4794, + "step": 6990 + }, + { + "epoch": 1.1481124135241105, + "grad_norm": 0.30118225566962875, + "learning_rate": 7.949576930496808e-06, + "loss": 0.4883, + "step": 6991 + }, + { + "epoch": 1.1482766407324534, + "grad_norm": 0.36070352710146236, + "learning_rate": 7.949275757707316e-06, + "loss": 0.477, + "step": 6992 + }, + { + "epoch": 1.148440867940796, + "grad_norm": 0.3061406159550247, + "learning_rate": 7.948974547455297e-06, + "loss": 0.4931, + "step": 6993 + }, + { + "epoch": 1.1486050951491389, + "grad_norm": 0.3545081273612038, + "learning_rate": 7.94867329974402e-06, + "loss": 0.4847, + "step": 6994 + }, + { + "epoch": 1.1487693223574815, + "grad_norm": 0.3311782995765307, + "learning_rate": 7.948372014576756e-06, + "loss": 0.4959, + "step": 6995 + }, + { + "epoch": 1.1489335495658244, + "grad_norm": 0.3077919240460935, + "learning_rate": 7.94807069195678e-06, + "loss": 0.4757, + "step": 6996 + }, + { + "epoch": 1.149097776774167, + "grad_norm": 0.2870260528487584, + "learning_rate": 7.947769331887365e-06, + "loss": 0.4615, + "step": 6997 + }, + { + "epoch": 1.1492620039825099, + "grad_norm": 0.29569179787697997, + "learning_rate": 7.947467934371783e-06, + "loss": 0.4936, + "step": 6998 + }, + { + "epoch": 1.1494262311908525, + "grad_norm": 0.35220813871941586, + "learning_rate": 7.947166499413307e-06, + "loss": 0.4593, + "step": 6999 + }, + { + "epoch": 1.1495904583991954, + "grad_norm": 0.32263584184992183, + "learning_rate": 7.946865027015212e-06, + "loss": 0.4904, + "step": 7000 + }, + { + "epoch": 1.149754685607538, + "grad_norm": 0.5166828266471525, + "learning_rate": 7.94656351718077e-06, + "loss": 0.4904, + "step": 7001 + }, + { + "epoch": 1.1499189128158807, + "grad_norm": 0.2712023974328414, + "learning_rate": 7.946261969913257e-06, + "loss": 0.4818, + "step": 7002 + }, + { + "epoch": 1.1500831400242235, + "grad_norm": 0.26915717210756196, + "learning_rate": 7.94596038521595e-06, + "loss": 0.4819, + "step": 7003 + }, + { + "epoch": 1.1502473672325664, + "grad_norm": 0.30042354024540735, + "learning_rate": 7.945658763092124e-06, + "loss": 0.4732, + "step": 7004 + }, + { + "epoch": 1.150411594440909, + "grad_norm": 0.32655935806435304, + "learning_rate": 7.94535710354505e-06, + "loss": 0.4978, + "step": 7005 + }, + { + "epoch": 1.1505758216492517, + "grad_norm": 0.3199955481342031, + "learning_rate": 7.94505540657801e-06, + "loss": 0.4744, + "step": 7006 + }, + { + "epoch": 1.1507400488575945, + "grad_norm": 0.35223648349278697, + "learning_rate": 7.94475367219428e-06, + "loss": 0.4725, + "step": 7007 + }, + { + "epoch": 1.1509042760659371, + "grad_norm": 0.34834147750497035, + "learning_rate": 7.944451900397133e-06, + "loss": 0.4773, + "step": 7008 + }, + { + "epoch": 1.15106850327428, + "grad_norm": 0.3568296672581254, + "learning_rate": 7.944150091189854e-06, + "loss": 0.495, + "step": 7009 + }, + { + "epoch": 1.1512327304826226, + "grad_norm": 0.3000946461945242, + "learning_rate": 7.943848244575712e-06, + "loss": 0.4742, + "step": 7010 + }, + { + "epoch": 1.1513969576909655, + "grad_norm": 0.2721934419714511, + "learning_rate": 7.943546360557992e-06, + "loss": 0.496, + "step": 7011 + }, + { + "epoch": 1.1515611848993081, + "grad_norm": 0.31830130435810267, + "learning_rate": 7.94324443913997e-06, + "loss": 0.508, + "step": 7012 + }, + { + "epoch": 1.151725412107651, + "grad_norm": 0.3229175169530436, + "learning_rate": 7.942942480324925e-06, + "loss": 0.4816, + "step": 7013 + }, + { + "epoch": 1.1518896393159936, + "grad_norm": 0.4111817485404657, + "learning_rate": 7.942640484116138e-06, + "loss": 0.4684, + "step": 7014 + }, + { + "epoch": 1.1520538665243365, + "grad_norm": 0.2656930494883279, + "learning_rate": 7.942338450516888e-06, + "loss": 0.458, + "step": 7015 + }, + { + "epoch": 1.1522180937326791, + "grad_norm": 0.31851059556159345, + "learning_rate": 7.942036379530456e-06, + "loss": 0.4754, + "step": 7016 + }, + { + "epoch": 1.152382320941022, + "grad_norm": 0.27602728318038483, + "learning_rate": 7.941734271160122e-06, + "loss": 0.4817, + "step": 7017 + }, + { + "epoch": 1.1525465481493646, + "grad_norm": 0.29468735688655845, + "learning_rate": 7.941432125409168e-06, + "loss": 0.4742, + "step": 7018 + }, + { + "epoch": 1.1527107753577073, + "grad_norm": 0.5489676416342217, + "learning_rate": 7.941129942280876e-06, + "loss": 0.4855, + "step": 7019 + }, + { + "epoch": 1.1528750025660501, + "grad_norm": 0.3221574182624256, + "learning_rate": 7.940827721778525e-06, + "loss": 0.4695, + "step": 7020 + }, + { + "epoch": 1.153039229774393, + "grad_norm": 0.314407501924177, + "learning_rate": 7.940525463905401e-06, + "loss": 0.4909, + "step": 7021 + }, + { + "epoch": 1.1532034569827356, + "grad_norm": 0.3598478943499774, + "learning_rate": 7.940223168664785e-06, + "loss": 0.5016, + "step": 7022 + }, + { + "epoch": 1.1533676841910783, + "grad_norm": 0.39422712873691973, + "learning_rate": 7.93992083605996e-06, + "loss": 0.4817, + "step": 7023 + }, + { + "epoch": 1.1535319113994211, + "grad_norm": 0.36165863495626405, + "learning_rate": 7.939618466094213e-06, + "loss": 0.4803, + "step": 7024 + }, + { + "epoch": 1.1536961386077638, + "grad_norm": 0.31260829963726494, + "learning_rate": 7.939316058770823e-06, + "loss": 0.486, + "step": 7025 + }, + { + "epoch": 1.1538603658161066, + "grad_norm": 0.34725257054626907, + "learning_rate": 7.939013614093078e-06, + "loss": 0.4775, + "step": 7026 + }, + { + "epoch": 1.1540245930244493, + "grad_norm": 0.2669254990323276, + "learning_rate": 7.93871113206426e-06, + "loss": 0.4889, + "step": 7027 + }, + { + "epoch": 1.1541888202327921, + "grad_norm": 0.4093792540774185, + "learning_rate": 7.938408612687657e-06, + "loss": 0.491, + "step": 7028 + }, + { + "epoch": 1.1543530474411348, + "grad_norm": 0.3290675100373528, + "learning_rate": 7.938106055966554e-06, + "loss": 0.4954, + "step": 7029 + }, + { + "epoch": 1.1545172746494776, + "grad_norm": 0.3225318096562668, + "learning_rate": 7.937803461904236e-06, + "loss": 0.4846, + "step": 7030 + }, + { + "epoch": 1.1546815018578203, + "grad_norm": 0.3516853816464303, + "learning_rate": 7.93750083050399e-06, + "loss": 0.4874, + "step": 7031 + }, + { + "epoch": 1.1548457290661631, + "grad_norm": 0.32631152573393535, + "learning_rate": 7.937198161769102e-06, + "loss": 0.4735, + "step": 7032 + }, + { + "epoch": 1.1550099562745058, + "grad_norm": 0.31907307198476026, + "learning_rate": 7.936895455702861e-06, + "loss": 0.49, + "step": 7033 + }, + { + "epoch": 1.1551741834828486, + "grad_norm": 0.27136595054747875, + "learning_rate": 7.936592712308557e-06, + "loss": 0.4598, + "step": 7034 + }, + { + "epoch": 1.1553384106911913, + "grad_norm": 0.3763844729573229, + "learning_rate": 7.93628993158947e-06, + "loss": 0.4689, + "step": 7035 + }, + { + "epoch": 1.155502637899534, + "grad_norm": 0.3195562295488287, + "learning_rate": 7.935987113548896e-06, + "loss": 0.4878, + "step": 7036 + }, + { + "epoch": 1.1556668651078768, + "grad_norm": 0.2761377095445116, + "learning_rate": 7.93568425819012e-06, + "loss": 0.4706, + "step": 7037 + }, + { + "epoch": 1.1558310923162196, + "grad_norm": 0.5413992090888955, + "learning_rate": 7.935381365516435e-06, + "loss": 0.5024, + "step": 7038 + }, + { + "epoch": 1.1559953195245622, + "grad_norm": 0.5469978575561415, + "learning_rate": 7.935078435531127e-06, + "loss": 0.4571, + "step": 7039 + }, + { + "epoch": 1.1561595467329049, + "grad_norm": 0.2823534067099165, + "learning_rate": 7.934775468237486e-06, + "loss": 0.4698, + "step": 7040 + }, + { + "epoch": 1.1563237739412477, + "grad_norm": 0.44633643276642965, + "learning_rate": 7.934472463638807e-06, + "loss": 0.4956, + "step": 7041 + }, + { + "epoch": 1.1564880011495904, + "grad_norm": 0.376102625780165, + "learning_rate": 7.934169421738377e-06, + "loss": 0.4824, + "step": 7042 + }, + { + "epoch": 1.1566522283579332, + "grad_norm": 0.27588776388549247, + "learning_rate": 7.933866342539488e-06, + "loss": 0.5206, + "step": 7043 + }, + { + "epoch": 1.1568164555662759, + "grad_norm": 0.2878482112541905, + "learning_rate": 7.933563226045431e-06, + "loss": 0.4852, + "step": 7044 + }, + { + "epoch": 1.1569806827746187, + "grad_norm": 0.2697563816295159, + "learning_rate": 7.933260072259501e-06, + "loss": 0.499, + "step": 7045 + }, + { + "epoch": 1.1571449099829614, + "grad_norm": 0.28236168172776016, + "learning_rate": 7.932956881184988e-06, + "loss": 0.5005, + "step": 7046 + }, + { + "epoch": 1.1573091371913042, + "grad_norm": 0.3338941095186326, + "learning_rate": 7.932653652825185e-06, + "loss": 0.4796, + "step": 7047 + }, + { + "epoch": 1.1574733643996469, + "grad_norm": 0.3625356119489219, + "learning_rate": 7.932350387183387e-06, + "loss": 0.5081, + "step": 7048 + }, + { + "epoch": 1.1576375916079897, + "grad_norm": 0.27136370495062984, + "learning_rate": 7.932047084262887e-06, + "loss": 0.4879, + "step": 7049 + }, + { + "epoch": 1.1578018188163324, + "grad_norm": 0.3366519673880185, + "learning_rate": 7.931743744066978e-06, + "loss": 0.491, + "step": 7050 + }, + { + "epoch": 1.1579660460246752, + "grad_norm": 0.30178495916921333, + "learning_rate": 7.931440366598956e-06, + "loss": 0.4831, + "step": 7051 + }, + { + "epoch": 1.1581302732330179, + "grad_norm": 0.32500677131083183, + "learning_rate": 7.931136951862117e-06, + "loss": 0.4977, + "step": 7052 + }, + { + "epoch": 1.1582945004413605, + "grad_norm": 0.2751069112378041, + "learning_rate": 7.930833499859752e-06, + "loss": 0.4689, + "step": 7053 + }, + { + "epoch": 1.1584587276497034, + "grad_norm": 0.3078889240750525, + "learning_rate": 7.930530010595161e-06, + "loss": 0.4922, + "step": 7054 + }, + { + "epoch": 1.1586229548580462, + "grad_norm": 0.2956108438153107, + "learning_rate": 7.93022648407164e-06, + "loss": 0.4897, + "step": 7055 + }, + { + "epoch": 1.1587871820663889, + "grad_norm": 0.2927169524393216, + "learning_rate": 7.929922920292483e-06, + "loss": 0.4975, + "step": 7056 + }, + { + "epoch": 1.1589514092747315, + "grad_norm": 0.33526725775792277, + "learning_rate": 7.929619319260988e-06, + "loss": 0.4927, + "step": 7057 + }, + { + "epoch": 1.1591156364830744, + "grad_norm": 0.4759390949788766, + "learning_rate": 7.929315680980456e-06, + "loss": 0.4908, + "step": 7058 + }, + { + "epoch": 1.159279863691417, + "grad_norm": 0.32076151526244373, + "learning_rate": 7.929012005454178e-06, + "loss": 0.4795, + "step": 7059 + }, + { + "epoch": 1.1594440908997599, + "grad_norm": 0.31178290972430867, + "learning_rate": 7.928708292685458e-06, + "loss": 0.5054, + "step": 7060 + }, + { + "epoch": 1.1596083181081025, + "grad_norm": 0.3028328443170019, + "learning_rate": 7.928404542677592e-06, + "loss": 0.4939, + "step": 7061 + }, + { + "epoch": 1.1597725453164454, + "grad_norm": 0.8078948086337088, + "learning_rate": 7.92810075543388e-06, + "loss": 0.4848, + "step": 7062 + }, + { + "epoch": 1.159936772524788, + "grad_norm": 0.2892938136222857, + "learning_rate": 7.927796930957622e-06, + "loss": 0.4998, + "step": 7063 + }, + { + "epoch": 1.1601009997331309, + "grad_norm": 0.3019318848201967, + "learning_rate": 7.927493069252115e-06, + "loss": 0.4862, + "step": 7064 + }, + { + "epoch": 1.1602652269414735, + "grad_norm": 0.2908551244366857, + "learning_rate": 7.927189170320663e-06, + "loss": 0.4978, + "step": 7065 + }, + { + "epoch": 1.1604294541498164, + "grad_norm": 0.29280416653917135, + "learning_rate": 7.926885234166562e-06, + "loss": 0.4755, + "step": 7066 + }, + { + "epoch": 1.160593681358159, + "grad_norm": 0.269957864981838, + "learning_rate": 7.926581260793119e-06, + "loss": 0.4701, + "step": 7067 + }, + { + "epoch": 1.1607579085665019, + "grad_norm": 0.30097708194571493, + "learning_rate": 7.926277250203629e-06, + "loss": 0.4644, + "step": 7068 + }, + { + "epoch": 1.1609221357748445, + "grad_norm": 0.32757186478730316, + "learning_rate": 7.925973202401399e-06, + "loss": 0.4749, + "step": 7069 + }, + { + "epoch": 1.1610863629831871, + "grad_norm": 0.40123882700762675, + "learning_rate": 7.925669117389728e-06, + "loss": 0.4788, + "step": 7070 + }, + { + "epoch": 1.16125059019153, + "grad_norm": 0.3603074875810292, + "learning_rate": 7.92536499517192e-06, + "loss": 0.4812, + "step": 7071 + }, + { + "epoch": 1.1614148173998728, + "grad_norm": 0.3698279608261931, + "learning_rate": 7.92506083575128e-06, + "loss": 0.4665, + "step": 7072 + }, + { + "epoch": 1.1615790446082155, + "grad_norm": 0.4518716080031763, + "learning_rate": 7.924756639131108e-06, + "loss": 0.4997, + "step": 7073 + }, + { + "epoch": 1.1617432718165581, + "grad_norm": 0.33189232317990797, + "learning_rate": 7.92445240531471e-06, + "loss": 0.4534, + "step": 7074 + }, + { + "epoch": 1.161907499024901, + "grad_norm": 0.3044263355830535, + "learning_rate": 7.924148134305389e-06, + "loss": 0.5074, + "step": 7075 + }, + { + "epoch": 1.1620717262332436, + "grad_norm": 0.34185010495544116, + "learning_rate": 7.92384382610645e-06, + "loss": 0.4768, + "step": 7076 + }, + { + "epoch": 1.1622359534415865, + "grad_norm": 0.3453996765799976, + "learning_rate": 7.9235394807212e-06, + "loss": 0.4863, + "step": 7077 + }, + { + "epoch": 1.1624001806499291, + "grad_norm": 0.3128815977202802, + "learning_rate": 7.923235098152943e-06, + "loss": 0.5233, + "step": 7078 + }, + { + "epoch": 1.162564407858272, + "grad_norm": 0.5247087085994546, + "learning_rate": 7.922930678404983e-06, + "loss": 0.4742, + "step": 7079 + }, + { + "epoch": 1.1627286350666146, + "grad_norm": 0.39150724559453093, + "learning_rate": 7.922626221480629e-06, + "loss": 0.4781, + "step": 7080 + }, + { + "epoch": 1.1628928622749575, + "grad_norm": 0.33804533874168374, + "learning_rate": 7.922321727383187e-06, + "loss": 0.4862, + "step": 7081 + }, + { + "epoch": 1.1630570894833, + "grad_norm": 0.43511358014238194, + "learning_rate": 7.922017196115964e-06, + "loss": 0.4706, + "step": 7082 + }, + { + "epoch": 1.163221316691643, + "grad_norm": 0.32505034890033285, + "learning_rate": 7.921712627682266e-06, + "loss": 0.483, + "step": 7083 + }, + { + "epoch": 1.1633855438999856, + "grad_norm": 0.29173852633659864, + "learning_rate": 7.921408022085404e-06, + "loss": 0.4763, + "step": 7084 + }, + { + "epoch": 1.1635497711083285, + "grad_norm": 0.32413350522873113, + "learning_rate": 7.921103379328685e-06, + "loss": 0.5107, + "step": 7085 + }, + { + "epoch": 1.163713998316671, + "grad_norm": 0.29827489864002776, + "learning_rate": 7.920798699415416e-06, + "loss": 0.4904, + "step": 7086 + }, + { + "epoch": 1.1638782255250137, + "grad_norm": 0.32881461136004475, + "learning_rate": 7.92049398234891e-06, + "loss": 0.4878, + "step": 7087 + }, + { + "epoch": 1.1640424527333566, + "grad_norm": 0.26966455996545646, + "learning_rate": 7.92018922813247e-06, + "loss": 0.4843, + "step": 7088 + }, + { + "epoch": 1.1642066799416995, + "grad_norm": 0.3524190577600752, + "learning_rate": 7.919884436769413e-06, + "loss": 0.4807, + "step": 7089 + }, + { + "epoch": 1.164370907150042, + "grad_norm": 0.2873202615000851, + "learning_rate": 7.919579608263045e-06, + "loss": 0.4889, + "step": 7090 + }, + { + "epoch": 1.1645351343583847, + "grad_norm": 0.32072446570760915, + "learning_rate": 7.91927474261668e-06, + "loss": 0.4881, + "step": 7091 + }, + { + "epoch": 1.1646993615667276, + "grad_norm": 0.35024441319253874, + "learning_rate": 7.918969839833625e-06, + "loss": 0.4794, + "step": 7092 + }, + { + "epoch": 1.1648635887750702, + "grad_norm": 0.3299835820205014, + "learning_rate": 7.918664899917194e-06, + "loss": 0.4884, + "step": 7093 + }, + { + "epoch": 1.165027815983413, + "grad_norm": 0.29717281189054834, + "learning_rate": 7.9183599228707e-06, + "loss": 0.4886, + "step": 7094 + }, + { + "epoch": 1.1651920431917557, + "grad_norm": 0.3284713919170598, + "learning_rate": 7.918054908697453e-06, + "loss": 0.4818, + "step": 7095 + }, + { + "epoch": 1.1653562704000986, + "grad_norm": 0.2925217287720715, + "learning_rate": 7.917749857400766e-06, + "loss": 0.4869, + "step": 7096 + }, + { + "epoch": 1.1655204976084412, + "grad_norm": 0.2765372710402451, + "learning_rate": 7.917444768983954e-06, + "loss": 0.4661, + "step": 7097 + }, + { + "epoch": 1.165684724816784, + "grad_norm": 0.3101792926600381, + "learning_rate": 7.917139643450331e-06, + "loss": 0.4848, + "step": 7098 + }, + { + "epoch": 1.1658489520251267, + "grad_norm": 0.2848413701567469, + "learning_rate": 7.916834480803207e-06, + "loss": 0.4973, + "step": 7099 + }, + { + "epoch": 1.1660131792334696, + "grad_norm": 0.28047689897386036, + "learning_rate": 7.9165292810459e-06, + "loss": 0.4548, + "step": 7100 + }, + { + "epoch": 1.1661774064418122, + "grad_norm": 0.35063738100165215, + "learning_rate": 7.916224044181723e-06, + "loss": 0.4869, + "step": 7101 + }, + { + "epoch": 1.166341633650155, + "grad_norm": 0.2823049704902083, + "learning_rate": 7.915918770213992e-06, + "loss": 0.4783, + "step": 7102 + }, + { + "epoch": 1.1665058608584977, + "grad_norm": 0.29208609576357075, + "learning_rate": 7.915613459146022e-06, + "loss": 0.4933, + "step": 7103 + }, + { + "epoch": 1.1666700880668404, + "grad_norm": 0.2908603084533477, + "learning_rate": 7.915308110981129e-06, + "loss": 0.4762, + "step": 7104 + }, + { + "epoch": 1.1668343152751832, + "grad_norm": 0.3142549539312152, + "learning_rate": 7.915002725722632e-06, + "loss": 0.467, + "step": 7105 + }, + { + "epoch": 1.166998542483526, + "grad_norm": 0.297802378353576, + "learning_rate": 7.914697303373843e-06, + "loss": 0.4712, + "step": 7106 + }, + { + "epoch": 1.1671627696918687, + "grad_norm": 0.30827567036324066, + "learning_rate": 7.914391843938082e-06, + "loss": 0.4782, + "step": 7107 + }, + { + "epoch": 1.1673269969002114, + "grad_norm": 0.27376585280710897, + "learning_rate": 7.914086347418667e-06, + "loss": 0.4667, + "step": 7108 + }, + { + "epoch": 1.1674912241085542, + "grad_norm": 0.2912467864119177, + "learning_rate": 7.913780813818914e-06, + "loss": 0.4891, + "step": 7109 + }, + { + "epoch": 1.1676554513168969, + "grad_norm": 0.3755970122826635, + "learning_rate": 7.913475243142145e-06, + "loss": 0.4698, + "step": 7110 + }, + { + "epoch": 1.1678196785252397, + "grad_norm": 0.3126792391825246, + "learning_rate": 7.913169635391675e-06, + "loss": 0.4962, + "step": 7111 + }, + { + "epoch": 1.1679839057335824, + "grad_norm": 0.33128634368155074, + "learning_rate": 7.912863990570826e-06, + "loss": 0.4906, + "step": 7112 + }, + { + "epoch": 1.1681481329419252, + "grad_norm": 0.276069135824882, + "learning_rate": 7.912558308682914e-06, + "loss": 0.5039, + "step": 7113 + }, + { + "epoch": 1.1683123601502678, + "grad_norm": 0.30298873479148086, + "learning_rate": 7.912252589731262e-06, + "loss": 0.4697, + "step": 7114 + }, + { + "epoch": 1.1684765873586107, + "grad_norm": 0.34034916001374615, + "learning_rate": 7.91194683371919e-06, + "loss": 0.4949, + "step": 7115 + }, + { + "epoch": 1.1686408145669533, + "grad_norm": 0.30127604093427846, + "learning_rate": 7.911641040650019e-06, + "loss": 0.4656, + "step": 7116 + }, + { + "epoch": 1.1688050417752962, + "grad_norm": 0.30448083371606166, + "learning_rate": 7.911335210527068e-06, + "loss": 0.504, + "step": 7117 + }, + { + "epoch": 1.1689692689836388, + "grad_norm": 0.3490091808051166, + "learning_rate": 7.911029343353664e-06, + "loss": 0.4747, + "step": 7118 + }, + { + "epoch": 1.1691334961919817, + "grad_norm": 0.28283313823611345, + "learning_rate": 7.910723439133123e-06, + "loss": 0.4985, + "step": 7119 + }, + { + "epoch": 1.1692977234003243, + "grad_norm": 1.1831427012153144, + "learning_rate": 7.910417497868768e-06, + "loss": 0.5003, + "step": 7120 + }, + { + "epoch": 1.169461950608667, + "grad_norm": 0.37970954472659757, + "learning_rate": 7.910111519563926e-06, + "loss": 0.4743, + "step": 7121 + }, + { + "epoch": 1.1696261778170098, + "grad_norm": 0.3060323198171699, + "learning_rate": 7.909805504221917e-06, + "loss": 0.4727, + "step": 7122 + }, + { + "epoch": 1.1697904050253527, + "grad_norm": 0.3090375579006394, + "learning_rate": 7.909499451846065e-06, + "loss": 0.4854, + "step": 7123 + }, + { + "epoch": 1.1699546322336953, + "grad_norm": 0.35422086742882625, + "learning_rate": 7.909193362439696e-06, + "loss": 0.481, + "step": 7124 + }, + { + "epoch": 1.170118859442038, + "grad_norm": 0.2833349932119131, + "learning_rate": 7.908887236006131e-06, + "loss": 0.4609, + "step": 7125 + }, + { + "epoch": 1.1702830866503808, + "grad_norm": 0.3438027843268845, + "learning_rate": 7.908581072548698e-06, + "loss": 0.4978, + "step": 7126 + }, + { + "epoch": 1.1704473138587235, + "grad_norm": 0.3084865886495486, + "learning_rate": 7.908274872070718e-06, + "loss": 0.4879, + "step": 7127 + }, + { + "epoch": 1.1706115410670663, + "grad_norm": 0.2968696229849295, + "learning_rate": 7.907968634575524e-06, + "loss": 0.4669, + "step": 7128 + }, + { + "epoch": 1.170775768275409, + "grad_norm": 0.30218235608708677, + "learning_rate": 7.907662360066435e-06, + "loss": 0.4836, + "step": 7129 + }, + { + "epoch": 1.1709399954837518, + "grad_norm": 0.29267877493515143, + "learning_rate": 7.907356048546781e-06, + "loss": 0.4828, + "step": 7130 + }, + { + "epoch": 1.1711042226920945, + "grad_norm": 0.30254596069123213, + "learning_rate": 7.907049700019888e-06, + "loss": 0.4937, + "step": 7131 + }, + { + "epoch": 1.1712684499004373, + "grad_norm": 0.3029131007987612, + "learning_rate": 7.906743314489084e-06, + "loss": 0.4709, + "step": 7132 + }, + { + "epoch": 1.17143267710878, + "grad_norm": 0.38195767468163316, + "learning_rate": 7.906436891957694e-06, + "loss": 0.4835, + "step": 7133 + }, + { + "epoch": 1.1715969043171228, + "grad_norm": 0.32624775519999805, + "learning_rate": 7.906130432429048e-06, + "loss": 0.5137, + "step": 7134 + }, + { + "epoch": 1.1717611315254655, + "grad_norm": 0.2754028052145745, + "learning_rate": 7.905823935906474e-06, + "loss": 0.4959, + "step": 7135 + }, + { + "epoch": 1.1719253587338083, + "grad_norm": 0.4444728587958453, + "learning_rate": 7.905517402393304e-06, + "loss": 0.514, + "step": 7136 + }, + { + "epoch": 1.172089585942151, + "grad_norm": 0.2726245516199059, + "learning_rate": 7.905210831892863e-06, + "loss": 0.4567, + "step": 7137 + }, + { + "epoch": 1.1722538131504936, + "grad_norm": 0.28681242761135123, + "learning_rate": 7.904904224408481e-06, + "loss": 0.4986, + "step": 7138 + }, + { + "epoch": 1.1724180403588365, + "grad_norm": 0.2756319660402748, + "learning_rate": 7.904597579943488e-06, + "loss": 0.4842, + "step": 7139 + }, + { + "epoch": 1.1725822675671793, + "grad_norm": 0.33331624186474573, + "learning_rate": 7.904290898501218e-06, + "loss": 0.4678, + "step": 7140 + }, + { + "epoch": 1.172746494775522, + "grad_norm": 0.3325946939274747, + "learning_rate": 7.903984180084999e-06, + "loss": 0.4677, + "step": 7141 + }, + { + "epoch": 1.1729107219838646, + "grad_norm": 0.2699322093677336, + "learning_rate": 7.903677424698163e-06, + "loss": 0.473, + "step": 7142 + }, + { + "epoch": 1.1730749491922075, + "grad_norm": 0.28956151873217795, + "learning_rate": 7.903370632344042e-06, + "loss": 0.4828, + "step": 7143 + }, + { + "epoch": 1.17323917640055, + "grad_norm": 0.5323259173407964, + "learning_rate": 7.903063803025965e-06, + "loss": 0.4677, + "step": 7144 + }, + { + "epoch": 1.173403403608893, + "grad_norm": 0.38857627895908703, + "learning_rate": 7.902756936747268e-06, + "loss": 0.4758, + "step": 7145 + }, + { + "epoch": 1.1735676308172356, + "grad_norm": 0.3227428663071013, + "learning_rate": 7.902450033511284e-06, + "loss": 0.4831, + "step": 7146 + }, + { + "epoch": 1.1737318580255784, + "grad_norm": 0.3086496978455175, + "learning_rate": 7.902143093321344e-06, + "loss": 0.4857, + "step": 7147 + }, + { + "epoch": 1.173896085233921, + "grad_norm": 0.32069342974778725, + "learning_rate": 7.901836116180784e-06, + "loss": 0.4907, + "step": 7148 + }, + { + "epoch": 1.174060312442264, + "grad_norm": 0.4404613405259411, + "learning_rate": 7.901529102092935e-06, + "loss": 0.4866, + "step": 7149 + }, + { + "epoch": 1.1742245396506066, + "grad_norm": 0.28651128563078515, + "learning_rate": 7.901222051061133e-06, + "loss": 0.4649, + "step": 7150 + }, + { + "epoch": 1.1743887668589494, + "grad_norm": 2.9411089773533248, + "learning_rate": 7.900914963088717e-06, + "loss": 0.4908, + "step": 7151 + }, + { + "epoch": 1.174552994067292, + "grad_norm": 0.3004882084769234, + "learning_rate": 7.900607838179015e-06, + "loss": 0.4819, + "step": 7152 + }, + { + "epoch": 1.174717221275635, + "grad_norm": 0.2929426082232998, + "learning_rate": 7.900300676335366e-06, + "loss": 0.4785, + "step": 7153 + }, + { + "epoch": 1.1748814484839776, + "grad_norm": 0.3017982722752946, + "learning_rate": 7.899993477561107e-06, + "loss": 0.4783, + "step": 7154 + }, + { + "epoch": 1.1750456756923202, + "grad_norm": 0.37263447468744393, + "learning_rate": 7.899686241859574e-06, + "loss": 0.4787, + "step": 7155 + }, + { + "epoch": 1.175209902900663, + "grad_norm": 0.2718529502001876, + "learning_rate": 7.899378969234103e-06, + "loss": 0.4677, + "step": 7156 + }, + { + "epoch": 1.175374130109006, + "grad_norm": 0.32113420311098073, + "learning_rate": 7.899071659688032e-06, + "loss": 0.4871, + "step": 7157 + }, + { + "epoch": 1.1755383573173486, + "grad_norm": 0.30453037794151244, + "learning_rate": 7.8987643132247e-06, + "loss": 0.4866, + "step": 7158 + }, + { + "epoch": 1.1757025845256912, + "grad_norm": 0.37467717111887067, + "learning_rate": 7.898456929847442e-06, + "loss": 0.4978, + "step": 7159 + }, + { + "epoch": 1.175866811734034, + "grad_norm": 0.3034542609392963, + "learning_rate": 7.898149509559599e-06, + "loss": 0.4899, + "step": 7160 + }, + { + "epoch": 1.1760310389423767, + "grad_norm": 0.40653052696984354, + "learning_rate": 7.897842052364508e-06, + "loss": 0.4876, + "step": 7161 + }, + { + "epoch": 1.1761952661507196, + "grad_norm": 0.4270247739034037, + "learning_rate": 7.89753455826551e-06, + "loss": 0.4674, + "step": 7162 + }, + { + "epoch": 1.1763594933590622, + "grad_norm": 0.3277953459600014, + "learning_rate": 7.897227027265943e-06, + "loss": 0.4772, + "step": 7163 + }, + { + "epoch": 1.176523720567405, + "grad_norm": 0.4031180802456469, + "learning_rate": 7.89691945936915e-06, + "loss": 0.5218, + "step": 7164 + }, + { + "epoch": 1.1766879477757477, + "grad_norm": 0.3228656700450739, + "learning_rate": 7.89661185457847e-06, + "loss": 0.4886, + "step": 7165 + }, + { + "epoch": 1.1768521749840906, + "grad_norm": 0.30127464136306215, + "learning_rate": 7.89630421289724e-06, + "loss": 0.4974, + "step": 7166 + }, + { + "epoch": 1.1770164021924332, + "grad_norm": 0.3407442389191083, + "learning_rate": 7.895996534328806e-06, + "loss": 0.4672, + "step": 7167 + }, + { + "epoch": 1.177180629400776, + "grad_norm": 0.3611630703926426, + "learning_rate": 7.895688818876508e-06, + "loss": 0.5189, + "step": 7168 + }, + { + "epoch": 1.1773448566091187, + "grad_norm": 0.3194743450347194, + "learning_rate": 7.895381066543691e-06, + "loss": 0.5102, + "step": 7169 + }, + { + "epoch": 1.1775090838174616, + "grad_norm": 0.33776411509241444, + "learning_rate": 7.895073277333694e-06, + "loss": 0.4825, + "step": 7170 + }, + { + "epoch": 1.1776733110258042, + "grad_norm": 0.33471683930689616, + "learning_rate": 7.89476545124986e-06, + "loss": 0.4675, + "step": 7171 + }, + { + "epoch": 1.1778375382341468, + "grad_norm": 0.3248330468687593, + "learning_rate": 7.894457588295533e-06, + "loss": 0.4876, + "step": 7172 + }, + { + "epoch": 1.1780017654424897, + "grad_norm": 0.3094418395243431, + "learning_rate": 7.894149688474058e-06, + "loss": 0.4779, + "step": 7173 + }, + { + "epoch": 1.1781659926508325, + "grad_norm": 0.4121252840483226, + "learning_rate": 7.893841751788777e-06, + "loss": 0.5082, + "step": 7174 + }, + { + "epoch": 1.1783302198591752, + "grad_norm": 0.306690976540399, + "learning_rate": 7.893533778243037e-06, + "loss": 0.4679, + "step": 7175 + }, + { + "epoch": 1.1784944470675178, + "grad_norm": 0.3098485120429855, + "learning_rate": 7.89322576784018e-06, + "loss": 0.4656, + "step": 7176 + }, + { + "epoch": 1.1786586742758607, + "grad_norm": 0.3139158217304466, + "learning_rate": 7.892917720583553e-06, + "loss": 0.4817, + "step": 7177 + }, + { + "epoch": 1.1788229014842033, + "grad_norm": 0.27150948840561384, + "learning_rate": 7.892609636476502e-06, + "loss": 0.4734, + "step": 7178 + }, + { + "epoch": 1.1789871286925462, + "grad_norm": 0.3792047308148616, + "learning_rate": 7.892301515522371e-06, + "loss": 0.4892, + "step": 7179 + }, + { + "epoch": 1.1791513559008888, + "grad_norm": 0.2695161583725051, + "learning_rate": 7.89199335772451e-06, + "loss": 0.4469, + "step": 7180 + }, + { + "epoch": 1.1793155831092317, + "grad_norm": 0.29931825739146534, + "learning_rate": 7.891685163086262e-06, + "loss": 0.4822, + "step": 7181 + }, + { + "epoch": 1.1794798103175743, + "grad_norm": 0.2696831618893344, + "learning_rate": 7.891376931610977e-06, + "loss": 0.4963, + "step": 7182 + }, + { + "epoch": 1.1796440375259172, + "grad_norm": 0.3602896096956024, + "learning_rate": 7.891068663302003e-06, + "loss": 0.4941, + "step": 7183 + }, + { + "epoch": 1.1798082647342598, + "grad_norm": 0.3380357807441332, + "learning_rate": 7.890760358162686e-06, + "loss": 0.4828, + "step": 7184 + }, + { + "epoch": 1.1799724919426027, + "grad_norm": 0.3158752432413669, + "learning_rate": 7.890452016196373e-06, + "loss": 0.4987, + "step": 7185 + }, + { + "epoch": 1.1801367191509453, + "grad_norm": 0.54489675202446, + "learning_rate": 7.89014363740642e-06, + "loss": 0.4751, + "step": 7186 + }, + { + "epoch": 1.1803009463592882, + "grad_norm": 0.2946086684484438, + "learning_rate": 7.889835221796168e-06, + "loss": 0.4927, + "step": 7187 + }, + { + "epoch": 1.1804651735676308, + "grad_norm": 0.29319451335645197, + "learning_rate": 7.889526769368971e-06, + "loss": 0.4791, + "step": 7188 + }, + { + "epoch": 1.1806294007759734, + "grad_norm": 0.3168463859588959, + "learning_rate": 7.88921828012818e-06, + "loss": 0.4695, + "step": 7189 + }, + { + "epoch": 1.1807936279843163, + "grad_norm": 0.3594514406581799, + "learning_rate": 7.888909754077142e-06, + "loss": 0.4798, + "step": 7190 + }, + { + "epoch": 1.1809578551926592, + "grad_norm": 0.2990964083755579, + "learning_rate": 7.888601191219211e-06, + "loss": 0.4916, + "step": 7191 + }, + { + "epoch": 1.1811220824010018, + "grad_norm": 0.5104588170168158, + "learning_rate": 7.888292591557738e-06, + "loss": 0.4768, + "step": 7192 + }, + { + "epoch": 1.1812863096093444, + "grad_norm": 0.31064065418444264, + "learning_rate": 7.887983955096072e-06, + "loss": 0.4828, + "step": 7193 + }, + { + "epoch": 1.1814505368176873, + "grad_norm": 0.31641511809024947, + "learning_rate": 7.887675281837568e-06, + "loss": 0.4864, + "step": 7194 + }, + { + "epoch": 1.18161476402603, + "grad_norm": 0.3876844228637263, + "learning_rate": 7.887366571785577e-06, + "loss": 0.4788, + "step": 7195 + }, + { + "epoch": 1.1817789912343728, + "grad_norm": 0.28781277770735475, + "learning_rate": 7.887057824943451e-06, + "loss": 0.4813, + "step": 7196 + }, + { + "epoch": 1.1819432184427154, + "grad_norm": 0.2780428348147511, + "learning_rate": 7.886749041314546e-06, + "loss": 0.4975, + "step": 7197 + }, + { + "epoch": 1.1821074456510583, + "grad_norm": 0.3092106232254678, + "learning_rate": 7.886440220902214e-06, + "loss": 0.4926, + "step": 7198 + }, + { + "epoch": 1.182271672859401, + "grad_norm": 0.2840760200057259, + "learning_rate": 7.88613136370981e-06, + "loss": 0.4898, + "step": 7199 + }, + { + "epoch": 1.1824359000677438, + "grad_norm": 0.27574696423953743, + "learning_rate": 7.885822469740688e-06, + "loss": 0.4891, + "step": 7200 + }, + { + "epoch": 1.1826001272760864, + "grad_norm": 0.35965149805749896, + "learning_rate": 7.885513538998203e-06, + "loss": 0.4588, + "step": 7201 + }, + { + "epoch": 1.1827643544844293, + "grad_norm": 0.33303040766200065, + "learning_rate": 7.885204571485709e-06, + "loss": 0.4753, + "step": 7202 + }, + { + "epoch": 1.182928581692772, + "grad_norm": 0.2593780964338368, + "learning_rate": 7.884895567206563e-06, + "loss": 0.4835, + "step": 7203 + }, + { + "epoch": 1.1830928089011148, + "grad_norm": 0.35610378246540597, + "learning_rate": 7.884586526164121e-06, + "loss": 0.4746, + "step": 7204 + }, + { + "epoch": 1.1832570361094574, + "grad_norm": 0.28443270166004814, + "learning_rate": 7.88427744836174e-06, + "loss": 0.4791, + "step": 7205 + }, + { + "epoch": 1.1834212633178, + "grad_norm": 0.2831758991670496, + "learning_rate": 7.883968333802774e-06, + "loss": 0.4684, + "step": 7206 + }, + { + "epoch": 1.183585490526143, + "grad_norm": 0.35471014312117216, + "learning_rate": 7.883659182490585e-06, + "loss": 0.4668, + "step": 7207 + }, + { + "epoch": 1.1837497177344858, + "grad_norm": 0.29537181388674116, + "learning_rate": 7.883349994428527e-06, + "loss": 0.4802, + "step": 7208 + }, + { + "epoch": 1.1839139449428284, + "grad_norm": 0.3368787078344288, + "learning_rate": 7.88304076961996e-06, + "loss": 0.4802, + "step": 7209 + }, + { + "epoch": 1.184078172151171, + "grad_norm": 0.36423298240411317, + "learning_rate": 7.88273150806824e-06, + "loss": 0.4846, + "step": 7210 + }, + { + "epoch": 1.184242399359514, + "grad_norm": 0.45191874292943746, + "learning_rate": 7.88242220977673e-06, + "loss": 0.4971, + "step": 7211 + }, + { + "epoch": 1.1844066265678566, + "grad_norm": 0.31730056357201625, + "learning_rate": 7.882112874748787e-06, + "loss": 0.5007, + "step": 7212 + }, + { + "epoch": 1.1845708537761994, + "grad_norm": 0.276740020652526, + "learning_rate": 7.881803502987769e-06, + "loss": 0.4603, + "step": 7213 + }, + { + "epoch": 1.184735080984542, + "grad_norm": 0.3226551874841209, + "learning_rate": 7.881494094497038e-06, + "loss": 0.4973, + "step": 7214 + }, + { + "epoch": 1.184899308192885, + "grad_norm": 0.2698997195963249, + "learning_rate": 7.881184649279956e-06, + "loss": 0.4571, + "step": 7215 + }, + { + "epoch": 1.1850635354012276, + "grad_norm": 0.3116107257091327, + "learning_rate": 7.88087516733988e-06, + "loss": 0.4939, + "step": 7216 + }, + { + "epoch": 1.1852277626095704, + "grad_norm": 0.26601114390957037, + "learning_rate": 7.880565648680174e-06, + "loss": 0.4781, + "step": 7217 + }, + { + "epoch": 1.185391989817913, + "grad_norm": 0.2960317652813218, + "learning_rate": 7.880256093304199e-06, + "loss": 0.498, + "step": 7218 + }, + { + "epoch": 1.185556217026256, + "grad_norm": 0.2897931619508775, + "learning_rate": 7.879946501215317e-06, + "loss": 0.4928, + "step": 7219 + }, + { + "epoch": 1.1857204442345985, + "grad_norm": 0.3444469864217827, + "learning_rate": 7.87963687241689e-06, + "loss": 0.4992, + "step": 7220 + }, + { + "epoch": 1.1858846714429414, + "grad_norm": 0.32963841574885944, + "learning_rate": 7.879327206912283e-06, + "loss": 0.4832, + "step": 7221 + }, + { + "epoch": 1.186048898651284, + "grad_norm": 0.27874786931557877, + "learning_rate": 7.879017504704856e-06, + "loss": 0.4807, + "step": 7222 + }, + { + "epoch": 1.1862131258596267, + "grad_norm": 0.2954496205585582, + "learning_rate": 7.878707765797975e-06, + "loss": 0.4638, + "step": 7223 + }, + { + "epoch": 1.1863773530679695, + "grad_norm": 0.2997197947778376, + "learning_rate": 7.878397990195004e-06, + "loss": 0.4811, + "step": 7224 + }, + { + "epoch": 1.1865415802763124, + "grad_norm": 0.3608949284326675, + "learning_rate": 7.878088177899307e-06, + "loss": 0.4879, + "step": 7225 + }, + { + "epoch": 1.186705807484655, + "grad_norm": 0.31512186735330405, + "learning_rate": 7.877778328914248e-06, + "loss": 0.4778, + "step": 7226 + }, + { + "epoch": 1.1868700346929977, + "grad_norm": 0.5780317877360018, + "learning_rate": 7.877468443243195e-06, + "loss": 0.4951, + "step": 7227 + }, + { + "epoch": 1.1870342619013405, + "grad_norm": 0.30742943688087454, + "learning_rate": 7.877158520889509e-06, + "loss": 0.4756, + "step": 7228 + }, + { + "epoch": 1.1871984891096832, + "grad_norm": 0.30278431149830287, + "learning_rate": 7.87684856185656e-06, + "loss": 0.4714, + "step": 7229 + }, + { + "epoch": 1.187362716318026, + "grad_norm": 0.3118677206225516, + "learning_rate": 7.876538566147713e-06, + "loss": 0.4795, + "step": 7230 + }, + { + "epoch": 1.1875269435263687, + "grad_norm": 0.3327275128495535, + "learning_rate": 7.876228533766335e-06, + "loss": 0.485, + "step": 7231 + }, + { + "epoch": 1.1876911707347115, + "grad_norm": 0.280947840200177, + "learning_rate": 7.875918464715795e-06, + "loss": 0.4733, + "step": 7232 + }, + { + "epoch": 1.1878553979430542, + "grad_norm": 0.27631423496822494, + "learning_rate": 7.875608358999456e-06, + "loss": 0.4769, + "step": 7233 + }, + { + "epoch": 1.188019625151397, + "grad_norm": 0.3424215714956797, + "learning_rate": 7.875298216620692e-06, + "loss": 0.4745, + "step": 7234 + }, + { + "epoch": 1.1881838523597397, + "grad_norm": 0.3392612659281871, + "learning_rate": 7.874988037582868e-06, + "loss": 0.4821, + "step": 7235 + }, + { + "epoch": 1.1883480795680825, + "grad_norm": 0.30768246487911133, + "learning_rate": 7.874677821889352e-06, + "loss": 0.492, + "step": 7236 + }, + { + "epoch": 1.1885123067764252, + "grad_norm": 0.4768362934562472, + "learning_rate": 7.874367569543516e-06, + "loss": 0.4709, + "step": 7237 + }, + { + "epoch": 1.188676533984768, + "grad_norm": 0.39163590605246273, + "learning_rate": 7.874057280548727e-06, + "loss": 0.4987, + "step": 7238 + }, + { + "epoch": 1.1888407611931107, + "grad_norm": 0.48949358688016253, + "learning_rate": 7.873746954908358e-06, + "loss": 0.473, + "step": 7239 + }, + { + "epoch": 1.1890049884014533, + "grad_norm": 0.34669067573595047, + "learning_rate": 7.873436592625775e-06, + "loss": 0.4871, + "step": 7240 + }, + { + "epoch": 1.1891692156097962, + "grad_norm": 0.2881379597778867, + "learning_rate": 7.873126193704353e-06, + "loss": 0.4835, + "step": 7241 + }, + { + "epoch": 1.189333442818139, + "grad_norm": 0.28928702283893437, + "learning_rate": 7.872815758147463e-06, + "loss": 0.5041, + "step": 7242 + }, + { + "epoch": 1.1894976700264817, + "grad_norm": 0.2779046323729139, + "learning_rate": 7.872505285958475e-06, + "loss": 0.4861, + "step": 7243 + }, + { + "epoch": 1.1896618972348243, + "grad_norm": 0.32171186966240967, + "learning_rate": 7.872194777140761e-06, + "loss": 0.4708, + "step": 7244 + }, + { + "epoch": 1.1898261244431672, + "grad_norm": 0.33224800587019176, + "learning_rate": 7.871884231697693e-06, + "loss": 0.4752, + "step": 7245 + }, + { + "epoch": 1.1899903516515098, + "grad_norm": 0.39098922706360334, + "learning_rate": 7.871573649632646e-06, + "loss": 0.4722, + "step": 7246 + }, + { + "epoch": 1.1901545788598527, + "grad_norm": 0.260711638664843, + "learning_rate": 7.871263030948992e-06, + "loss": 0.4923, + "step": 7247 + }, + { + "epoch": 1.1903188060681953, + "grad_norm": 0.3056696630001395, + "learning_rate": 7.870952375650105e-06, + "loss": 0.4753, + "step": 7248 + }, + { + "epoch": 1.1904830332765381, + "grad_norm": 0.33626309268971577, + "learning_rate": 7.870641683739358e-06, + "loss": 0.4845, + "step": 7249 + }, + { + "epoch": 1.1906472604848808, + "grad_norm": 0.40524575271035757, + "learning_rate": 7.870330955220124e-06, + "loss": 0.476, + "step": 7250 + }, + { + "epoch": 1.1908114876932236, + "grad_norm": 0.30297205462394605, + "learning_rate": 7.870020190095783e-06, + "loss": 0.4827, + "step": 7251 + }, + { + "epoch": 1.1909757149015663, + "grad_norm": 0.2851543936766989, + "learning_rate": 7.869709388369705e-06, + "loss": 0.4808, + "step": 7252 + }, + { + "epoch": 1.1911399421099091, + "grad_norm": 0.27115976850848544, + "learning_rate": 7.869398550045268e-06, + "loss": 0.4858, + "step": 7253 + }, + { + "epoch": 1.1913041693182518, + "grad_norm": 0.30476809755351303, + "learning_rate": 7.86908767512585e-06, + "loss": 0.4876, + "step": 7254 + }, + { + "epoch": 1.1914683965265946, + "grad_norm": 0.3758066090495282, + "learning_rate": 7.868776763614824e-06, + "loss": 0.4615, + "step": 7255 + }, + { + "epoch": 1.1916326237349373, + "grad_norm": 0.2936697469025561, + "learning_rate": 7.868465815515568e-06, + "loss": 0.4939, + "step": 7256 + }, + { + "epoch": 1.19179685094328, + "grad_norm": 0.2704607304163568, + "learning_rate": 7.868154830831458e-06, + "loss": 0.4794, + "step": 7257 + }, + { + "epoch": 1.1919610781516228, + "grad_norm": 0.28176731888861567, + "learning_rate": 7.867843809565873e-06, + "loss": 0.4723, + "step": 7258 + }, + { + "epoch": 1.1921253053599656, + "grad_norm": 0.2760536908218363, + "learning_rate": 7.86753275172219e-06, + "loss": 0.4917, + "step": 7259 + }, + { + "epoch": 1.1922895325683083, + "grad_norm": 0.25631381946862203, + "learning_rate": 7.86722165730379e-06, + "loss": 0.4652, + "step": 7260 + }, + { + "epoch": 1.192453759776651, + "grad_norm": 0.27318642320256536, + "learning_rate": 7.866910526314049e-06, + "loss": 0.4479, + "step": 7261 + }, + { + "epoch": 1.1926179869849938, + "grad_norm": 0.28956412253272856, + "learning_rate": 7.866599358756347e-06, + "loss": 0.4779, + "step": 7262 + }, + { + "epoch": 1.1927822141933364, + "grad_norm": 0.30001643869690897, + "learning_rate": 7.866288154634064e-06, + "loss": 0.4873, + "step": 7263 + }, + { + "epoch": 1.1929464414016793, + "grad_norm": 0.2924444418420769, + "learning_rate": 7.86597691395058e-06, + "loss": 0.4865, + "step": 7264 + }, + { + "epoch": 1.193110668610022, + "grad_norm": 0.25822309694150847, + "learning_rate": 7.865665636709275e-06, + "loss": 0.4937, + "step": 7265 + }, + { + "epoch": 1.1932748958183648, + "grad_norm": 0.2582323654371515, + "learning_rate": 7.865354322913529e-06, + "loss": 0.4789, + "step": 7266 + }, + { + "epoch": 1.1934391230267074, + "grad_norm": 0.2560281240282641, + "learning_rate": 7.865042972566723e-06, + "loss": 0.4958, + "step": 7267 + }, + { + "epoch": 1.1936033502350503, + "grad_norm": 0.283105334712507, + "learning_rate": 7.86473158567224e-06, + "loss": 0.4857, + "step": 7268 + }, + { + "epoch": 1.193767577443393, + "grad_norm": 0.3074559301853208, + "learning_rate": 7.864420162233464e-06, + "loss": 0.4822, + "step": 7269 + }, + { + "epoch": 1.1939318046517358, + "grad_norm": 0.272089654585132, + "learning_rate": 7.864108702253773e-06, + "loss": 0.4826, + "step": 7270 + }, + { + "epoch": 1.1940960318600784, + "grad_norm": 0.34574826869825065, + "learning_rate": 7.863797205736552e-06, + "loss": 0.4898, + "step": 7271 + }, + { + "epoch": 1.1942602590684213, + "grad_norm": 0.3400107585169252, + "learning_rate": 7.863485672685183e-06, + "loss": 0.4679, + "step": 7272 + }, + { + "epoch": 1.194424486276764, + "grad_norm": 0.2794512133883541, + "learning_rate": 7.863174103103053e-06, + "loss": 0.4935, + "step": 7273 + }, + { + "epoch": 1.1945887134851065, + "grad_norm": 0.26107892204001254, + "learning_rate": 7.862862496993541e-06, + "loss": 0.4756, + "step": 7274 + }, + { + "epoch": 1.1947529406934494, + "grad_norm": 0.3010930525172073, + "learning_rate": 7.862550854360033e-06, + "loss": 0.4778, + "step": 7275 + }, + { + "epoch": 1.1949171679017923, + "grad_norm": 0.31706413409182155, + "learning_rate": 7.862239175205915e-06, + "loss": 0.4893, + "step": 7276 + }, + { + "epoch": 1.195081395110135, + "grad_norm": 0.6518163187078756, + "learning_rate": 7.861927459534572e-06, + "loss": 0.4701, + "step": 7277 + }, + { + "epoch": 1.1952456223184775, + "grad_norm": 0.2829791035958546, + "learning_rate": 7.86161570734939e-06, + "loss": 0.4871, + "step": 7278 + }, + { + "epoch": 1.1954098495268204, + "grad_norm": 0.2739474278242147, + "learning_rate": 7.861303918653752e-06, + "loss": 0.4755, + "step": 7279 + }, + { + "epoch": 1.195574076735163, + "grad_norm": 0.3436202620943284, + "learning_rate": 7.860992093451049e-06, + "loss": 0.4733, + "step": 7280 + }, + { + "epoch": 1.1957383039435059, + "grad_norm": 0.30182634928669677, + "learning_rate": 7.860680231744663e-06, + "loss": 0.5101, + "step": 7281 + }, + { + "epoch": 1.1959025311518485, + "grad_norm": 0.2987955559095261, + "learning_rate": 7.860368333537984e-06, + "loss": 0.5102, + "step": 7282 + }, + { + "epoch": 1.1960667583601914, + "grad_norm": 0.2696173051533773, + "learning_rate": 7.860056398834399e-06, + "loss": 0.4914, + "step": 7283 + }, + { + "epoch": 1.196230985568534, + "grad_norm": 0.3588548931620607, + "learning_rate": 7.859744427637295e-06, + "loss": 0.4714, + "step": 7284 + }, + { + "epoch": 1.1963952127768769, + "grad_norm": 0.2960177348685412, + "learning_rate": 7.85943241995006e-06, + "loss": 0.487, + "step": 7285 + }, + { + "epoch": 1.1965594399852195, + "grad_norm": 0.3321192088445762, + "learning_rate": 7.859120375776086e-06, + "loss": 0.4574, + "step": 7286 + }, + { + "epoch": 1.1967236671935624, + "grad_norm": 0.3047274250456088, + "learning_rate": 7.85880829511876e-06, + "loss": 0.4767, + "step": 7287 + }, + { + "epoch": 1.196887894401905, + "grad_norm": 0.2940277359499893, + "learning_rate": 7.85849617798147e-06, + "loss": 0.4727, + "step": 7288 + }, + { + "epoch": 1.1970521216102479, + "grad_norm": 0.31805435075245675, + "learning_rate": 7.858184024367606e-06, + "loss": 0.5016, + "step": 7289 + }, + { + "epoch": 1.1972163488185905, + "grad_norm": 0.2893430047178316, + "learning_rate": 7.857871834280562e-06, + "loss": 0.473, + "step": 7290 + }, + { + "epoch": 1.1973805760269332, + "grad_norm": 0.3214483934935793, + "learning_rate": 7.857559607723724e-06, + "loss": 0.4891, + "step": 7291 + }, + { + "epoch": 1.197544803235276, + "grad_norm": 0.2779040042258734, + "learning_rate": 7.857247344700485e-06, + "loss": 0.4857, + "step": 7292 + }, + { + "epoch": 1.1977090304436186, + "grad_norm": 0.29268673694745784, + "learning_rate": 7.85693504521424e-06, + "loss": 0.5038, + "step": 7293 + }, + { + "epoch": 1.1978732576519615, + "grad_norm": 0.2583632166117344, + "learning_rate": 7.856622709268375e-06, + "loss": 0.4702, + "step": 7294 + }, + { + "epoch": 1.1980374848603041, + "grad_norm": 0.3039569672751797, + "learning_rate": 7.856310336866284e-06, + "loss": 0.4916, + "step": 7295 + }, + { + "epoch": 1.198201712068647, + "grad_norm": 0.37623591673858825, + "learning_rate": 7.855997928011363e-06, + "loss": 0.4649, + "step": 7296 + }, + { + "epoch": 1.1983659392769896, + "grad_norm": 0.32792556540288026, + "learning_rate": 7.855685482707001e-06, + "loss": 0.5132, + "step": 7297 + }, + { + "epoch": 1.1985301664853325, + "grad_norm": 0.33159113984877747, + "learning_rate": 7.855373000956595e-06, + "loss": 0.4862, + "step": 7298 + }, + { + "epoch": 1.1986943936936751, + "grad_norm": 0.46686228289895976, + "learning_rate": 7.855060482763534e-06, + "loss": 0.5074, + "step": 7299 + }, + { + "epoch": 1.198858620902018, + "grad_norm": 0.29765525293464823, + "learning_rate": 7.854747928131219e-06, + "loss": 0.4779, + "step": 7300 + }, + { + "epoch": 1.1990228481103606, + "grad_norm": 0.31934509278801926, + "learning_rate": 7.854435337063037e-06, + "loss": 0.4852, + "step": 7301 + }, + { + "epoch": 1.1991870753187035, + "grad_norm": 0.424679981759697, + "learning_rate": 7.85412270956239e-06, + "loss": 0.4786, + "step": 7302 + }, + { + "epoch": 1.1993513025270461, + "grad_norm": 0.28126109328181276, + "learning_rate": 7.853810045632668e-06, + "loss": 0.4889, + "step": 7303 + }, + { + "epoch": 1.199515529735389, + "grad_norm": 0.27650129426773495, + "learning_rate": 7.853497345277272e-06, + "loss": 0.4622, + "step": 7304 + }, + { + "epoch": 1.1996797569437316, + "grad_norm": 0.3398188905283428, + "learning_rate": 7.853184608499593e-06, + "loss": 0.4689, + "step": 7305 + }, + { + "epoch": 1.1998439841520745, + "grad_norm": 0.298516041385809, + "learning_rate": 7.852871835303031e-06, + "loss": 0.4656, + "step": 7306 + }, + { + "epoch": 1.2000082113604171, + "grad_norm": 0.24499209339877884, + "learning_rate": 7.852559025690981e-06, + "loss": 0.4924, + "step": 7307 + }, + { + "epoch": 1.2001724385687598, + "grad_norm": 0.27863042979543257, + "learning_rate": 7.852246179666844e-06, + "loss": 0.4795, + "step": 7308 + }, + { + "epoch": 1.2003366657771026, + "grad_norm": 0.2756647160445179, + "learning_rate": 7.851933297234012e-06, + "loss": 0.4754, + "step": 7309 + }, + { + "epoch": 1.2005008929854453, + "grad_norm": 0.2831834434790002, + "learning_rate": 7.85162037839589e-06, + "loss": 0.4737, + "step": 7310 + }, + { + "epoch": 1.2006651201937881, + "grad_norm": 0.29162631607674616, + "learning_rate": 7.851307423155871e-06, + "loss": 0.4983, + "step": 7311 + }, + { + "epoch": 1.2008293474021308, + "grad_norm": 0.27672574832406416, + "learning_rate": 7.850994431517356e-06, + "loss": 0.4777, + "step": 7312 + }, + { + "epoch": 1.2009935746104736, + "grad_norm": 0.27869712581342154, + "learning_rate": 7.850681403483745e-06, + "loss": 0.506, + "step": 7313 + }, + { + "epoch": 1.2011578018188163, + "grad_norm": 0.2923731692650365, + "learning_rate": 7.850368339058438e-06, + "loss": 0.503, + "step": 7314 + }, + { + "epoch": 1.2013220290271591, + "grad_norm": 0.42420781724762624, + "learning_rate": 7.850055238244835e-06, + "loss": 0.472, + "step": 7315 + }, + { + "epoch": 1.2014862562355018, + "grad_norm": 0.2648715107534324, + "learning_rate": 7.849742101046333e-06, + "loss": 0.4963, + "step": 7316 + }, + { + "epoch": 1.2016504834438446, + "grad_norm": 0.2982734322965936, + "learning_rate": 7.849428927466338e-06, + "loss": 0.4765, + "step": 7317 + }, + { + "epoch": 1.2018147106521873, + "grad_norm": 0.34924463673787454, + "learning_rate": 7.849115717508252e-06, + "loss": 0.4945, + "step": 7318 + }, + { + "epoch": 1.2019789378605301, + "grad_norm": 0.3021561206760538, + "learning_rate": 7.84880247117547e-06, + "loss": 0.4746, + "step": 7319 + }, + { + "epoch": 1.2021431650688728, + "grad_norm": 0.4022430865737298, + "learning_rate": 7.848489188471401e-06, + "loss": 0.4897, + "step": 7320 + }, + { + "epoch": 1.2023073922772156, + "grad_norm": 0.28673004576717576, + "learning_rate": 7.848175869399444e-06, + "loss": 0.4809, + "step": 7321 + }, + { + "epoch": 1.2024716194855583, + "grad_norm": 0.2604773400575898, + "learning_rate": 7.847862513963003e-06, + "loss": 0.4712, + "step": 7322 + }, + { + "epoch": 1.2026358466939011, + "grad_norm": 0.44028520566728796, + "learning_rate": 7.847549122165481e-06, + "loss": 0.4975, + "step": 7323 + }, + { + "epoch": 1.2028000739022437, + "grad_norm": 0.30359081977857466, + "learning_rate": 7.847235694010283e-06, + "loss": 0.4754, + "step": 7324 + }, + { + "epoch": 1.2029643011105864, + "grad_norm": 0.2688194720858126, + "learning_rate": 7.846922229500812e-06, + "loss": 0.4809, + "step": 7325 + }, + { + "epoch": 1.2031285283189292, + "grad_norm": 0.27441461039552933, + "learning_rate": 7.846608728640471e-06, + "loss": 0.4881, + "step": 7326 + }, + { + "epoch": 1.2032927555272719, + "grad_norm": 0.36663350552704727, + "learning_rate": 7.846295191432668e-06, + "loss": 0.4883, + "step": 7327 + }, + { + "epoch": 1.2034569827356147, + "grad_norm": 0.30093440887610545, + "learning_rate": 7.845981617880808e-06, + "loss": 0.4791, + "step": 7328 + }, + { + "epoch": 1.2036212099439574, + "grad_norm": 0.30653428630193286, + "learning_rate": 7.845668007988292e-06, + "loss": 0.47, + "step": 7329 + }, + { + "epoch": 1.2037854371523002, + "grad_norm": 0.2898402343328568, + "learning_rate": 7.845354361758533e-06, + "loss": 0.4795, + "step": 7330 + }, + { + "epoch": 1.2039496643606429, + "grad_norm": 0.2890752297414382, + "learning_rate": 7.845040679194934e-06, + "loss": 0.4865, + "step": 7331 + }, + { + "epoch": 1.2041138915689857, + "grad_norm": 0.3624219698154435, + "learning_rate": 7.8447269603009e-06, + "loss": 0.475, + "step": 7332 + }, + { + "epoch": 1.2042781187773284, + "grad_norm": 0.296336578656, + "learning_rate": 7.844413205079842e-06, + "loss": 0.4954, + "step": 7333 + }, + { + "epoch": 1.2044423459856712, + "grad_norm": 0.35399910890682923, + "learning_rate": 7.844099413535167e-06, + "loss": 0.4947, + "step": 7334 + }, + { + "epoch": 1.2046065731940139, + "grad_norm": 0.33519400632511764, + "learning_rate": 7.843785585670279e-06, + "loss": 0.4739, + "step": 7335 + }, + { + "epoch": 1.2047708004023567, + "grad_norm": 0.28240279450470623, + "learning_rate": 7.843471721488593e-06, + "loss": 0.5025, + "step": 7336 + }, + { + "epoch": 1.2049350276106994, + "grad_norm": 0.25326684831156665, + "learning_rate": 7.843157820993515e-06, + "loss": 0.4706, + "step": 7337 + }, + { + "epoch": 1.2050992548190422, + "grad_norm": 0.29493194722350735, + "learning_rate": 7.84284388418845e-06, + "loss": 0.4717, + "step": 7338 + }, + { + "epoch": 1.2052634820273849, + "grad_norm": 0.298157102180501, + "learning_rate": 7.842529911076815e-06, + "loss": 0.5083, + "step": 7339 + }, + { + "epoch": 1.2054277092357277, + "grad_norm": 0.4151324217642813, + "learning_rate": 7.842215901662014e-06, + "loss": 0.4794, + "step": 7340 + }, + { + "epoch": 1.2055919364440704, + "grad_norm": 0.2826353275894072, + "learning_rate": 7.841901855947462e-06, + "loss": 0.4877, + "step": 7341 + }, + { + "epoch": 1.205756163652413, + "grad_norm": 0.28516782000663105, + "learning_rate": 7.841587773936568e-06, + "loss": 0.4773, + "step": 7342 + }, + { + "epoch": 1.2059203908607559, + "grad_norm": 0.3039940380114767, + "learning_rate": 7.841273655632741e-06, + "loss": 0.4945, + "step": 7343 + }, + { + "epoch": 1.2060846180690985, + "grad_norm": 0.40356339790866297, + "learning_rate": 7.840959501039397e-06, + "loss": 0.4859, + "step": 7344 + }, + { + "epoch": 1.2062488452774414, + "grad_norm": 0.5442637135737383, + "learning_rate": 7.840645310159945e-06, + "loss": 0.474, + "step": 7345 + }, + { + "epoch": 1.206413072485784, + "grad_norm": 0.3455945345487415, + "learning_rate": 7.840331082997799e-06, + "loss": 0.5056, + "step": 7346 + }, + { + "epoch": 1.2065772996941269, + "grad_norm": 0.3203479224628378, + "learning_rate": 7.840016819556369e-06, + "loss": 0.491, + "step": 7347 + }, + { + "epoch": 1.2067415269024695, + "grad_norm": 0.43912639657814323, + "learning_rate": 7.839702519839071e-06, + "loss": 0.4847, + "step": 7348 + }, + { + "epoch": 1.2069057541108124, + "grad_norm": 0.2913019474705546, + "learning_rate": 7.839388183849318e-06, + "loss": 0.4542, + "step": 7349 + }, + { + "epoch": 1.207069981319155, + "grad_norm": 0.2997673311165085, + "learning_rate": 7.839073811590524e-06, + "loss": 0.451, + "step": 7350 + }, + { + "epoch": 1.2072342085274979, + "grad_norm": 0.32497023477104164, + "learning_rate": 7.838759403066103e-06, + "loss": 0.4607, + "step": 7351 + }, + { + "epoch": 1.2073984357358405, + "grad_norm": 0.30168217147672366, + "learning_rate": 7.83844495827947e-06, + "loss": 0.4608, + "step": 7352 + }, + { + "epoch": 1.2075626629441834, + "grad_norm": 0.3171359606091761, + "learning_rate": 7.83813047723404e-06, + "loss": 0.4863, + "step": 7353 + }, + { + "epoch": 1.207726890152526, + "grad_norm": 0.3601274627022798, + "learning_rate": 7.83781595993323e-06, + "loss": 0.4696, + "step": 7354 + }, + { + "epoch": 1.2078911173608688, + "grad_norm": 0.3203665019807861, + "learning_rate": 7.837501406380452e-06, + "loss": 0.4717, + "step": 7355 + }, + { + "epoch": 1.2080553445692115, + "grad_norm": 0.29351737271594, + "learning_rate": 7.837186816579128e-06, + "loss": 0.4904, + "step": 7356 + }, + { + "epoch": 1.2082195717775543, + "grad_norm": 0.33220095890415596, + "learning_rate": 7.83687219053267e-06, + "loss": 0.4878, + "step": 7357 + }, + { + "epoch": 1.208383798985897, + "grad_norm": 0.2975976385334508, + "learning_rate": 7.836557528244497e-06, + "loss": 0.4774, + "step": 7358 + }, + { + "epoch": 1.2085480261942396, + "grad_norm": 0.277639522422605, + "learning_rate": 7.836242829718028e-06, + "loss": 0.4827, + "step": 7359 + }, + { + "epoch": 1.2087122534025825, + "grad_norm": 0.28230791338424405, + "learning_rate": 7.835928094956677e-06, + "loss": 0.4858, + "step": 7360 + }, + { + "epoch": 1.2088764806109251, + "grad_norm": 0.3070274198241027, + "learning_rate": 7.835613323963867e-06, + "loss": 0.4732, + "step": 7361 + }, + { + "epoch": 1.209040707819268, + "grad_norm": 0.3168714949165464, + "learning_rate": 7.835298516743014e-06, + "loss": 0.4867, + "step": 7362 + }, + { + "epoch": 1.2092049350276106, + "grad_norm": 0.3453218427128881, + "learning_rate": 7.834983673297537e-06, + "loss": 0.4556, + "step": 7363 + }, + { + "epoch": 1.2093691622359535, + "grad_norm": 0.3528769701781255, + "learning_rate": 7.834668793630856e-06, + "loss": 0.4985, + "step": 7364 + }, + { + "epoch": 1.2095333894442961, + "grad_norm": 0.2638158286045824, + "learning_rate": 7.834353877746391e-06, + "loss": 0.4675, + "step": 7365 + }, + { + "epoch": 1.209697616652639, + "grad_norm": 0.2788721286113114, + "learning_rate": 7.834038925647563e-06, + "loss": 0.4796, + "step": 7366 + }, + { + "epoch": 1.2098618438609816, + "grad_norm": 0.31074874505065736, + "learning_rate": 7.833723937337792e-06, + "loss": 0.4873, + "step": 7367 + }, + { + "epoch": 1.2100260710693245, + "grad_norm": 0.2616105603314999, + "learning_rate": 7.8334089128205e-06, + "loss": 0.4853, + "step": 7368 + }, + { + "epoch": 1.210190298277667, + "grad_norm": 0.25894275590866295, + "learning_rate": 7.833093852099104e-06, + "loss": 0.4763, + "step": 7369 + }, + { + "epoch": 1.21035452548601, + "grad_norm": 0.28972598700545293, + "learning_rate": 7.832778755177034e-06, + "loss": 0.4718, + "step": 7370 + }, + { + "epoch": 1.2105187526943526, + "grad_norm": 0.53511817324183, + "learning_rate": 7.832463622057705e-06, + "loss": 0.478, + "step": 7371 + }, + { + "epoch": 1.2106829799026955, + "grad_norm": 0.2908338880957059, + "learning_rate": 7.832148452744544e-06, + "loss": 0.4872, + "step": 7372 + }, + { + "epoch": 1.210847207111038, + "grad_norm": 0.31807390385130946, + "learning_rate": 7.83183324724097e-06, + "loss": 0.492, + "step": 7373 + }, + { + "epoch": 1.211011434319381, + "grad_norm": 0.296720479252511, + "learning_rate": 7.831518005550412e-06, + "loss": 0.4774, + "step": 7374 + }, + { + "epoch": 1.2111756615277236, + "grad_norm": 0.2966182688407565, + "learning_rate": 7.831202727676287e-06, + "loss": 0.4725, + "step": 7375 + }, + { + "epoch": 1.2113398887360662, + "grad_norm": 0.2888443450249788, + "learning_rate": 7.830887413622027e-06, + "loss": 0.4726, + "step": 7376 + }, + { + "epoch": 1.211504115944409, + "grad_norm": 0.27658756184306366, + "learning_rate": 7.830572063391049e-06, + "loss": 0.4877, + "step": 7377 + }, + { + "epoch": 1.2116683431527517, + "grad_norm": 0.3201453442232231, + "learning_rate": 7.830256676986785e-06, + "loss": 0.4801, + "step": 7378 + }, + { + "epoch": 1.2118325703610946, + "grad_norm": 0.30096927890993763, + "learning_rate": 7.829941254412654e-06, + "loss": 0.4828, + "step": 7379 + }, + { + "epoch": 1.2119967975694372, + "grad_norm": 0.3084680100763514, + "learning_rate": 7.829625795672085e-06, + "loss": 0.4981, + "step": 7380 + }, + { + "epoch": 1.21216102477778, + "grad_norm": 0.2678693496989439, + "learning_rate": 7.829310300768505e-06, + "loss": 0.4669, + "step": 7381 + }, + { + "epoch": 1.2123252519861227, + "grad_norm": 0.2697735354847363, + "learning_rate": 7.828994769705339e-06, + "loss": 0.4929, + "step": 7382 + }, + { + "epoch": 1.2124894791944656, + "grad_norm": 0.2560319566439462, + "learning_rate": 7.828679202486015e-06, + "loss": 0.4841, + "step": 7383 + }, + { + "epoch": 1.2126537064028082, + "grad_norm": 0.2836939551834787, + "learning_rate": 7.828363599113959e-06, + "loss": 0.4697, + "step": 7384 + }, + { + "epoch": 1.212817933611151, + "grad_norm": 0.31007946274580067, + "learning_rate": 7.828047959592601e-06, + "loss": 0.4882, + "step": 7385 + }, + { + "epoch": 1.2129821608194937, + "grad_norm": 0.29459524995287784, + "learning_rate": 7.827732283925366e-06, + "loss": 0.482, + "step": 7386 + }, + { + "epoch": 1.2131463880278366, + "grad_norm": 0.3793586584509073, + "learning_rate": 7.827416572115686e-06, + "loss": 0.4817, + "step": 7387 + }, + { + "epoch": 1.2133106152361792, + "grad_norm": 0.2644897980491223, + "learning_rate": 7.827100824166988e-06, + "loss": 0.4794, + "step": 7388 + }, + { + "epoch": 1.213474842444522, + "grad_norm": 0.3017168977924796, + "learning_rate": 7.826785040082702e-06, + "loss": 0.4818, + "step": 7389 + }, + { + "epoch": 1.2136390696528647, + "grad_norm": 0.305708816789366, + "learning_rate": 7.826469219866257e-06, + "loss": 0.4802, + "step": 7390 + }, + { + "epoch": 1.2138032968612076, + "grad_norm": 0.3059340241183472, + "learning_rate": 7.826153363521082e-06, + "loss": 0.5047, + "step": 7391 + }, + { + "epoch": 1.2139675240695502, + "grad_norm": 0.40313192821803495, + "learning_rate": 7.82583747105061e-06, + "loss": 0.497, + "step": 7392 + }, + { + "epoch": 1.2141317512778929, + "grad_norm": 0.3177682689036567, + "learning_rate": 7.82552154245827e-06, + "loss": 0.4912, + "step": 7393 + }, + { + "epoch": 1.2142959784862357, + "grad_norm": 0.33379735245100434, + "learning_rate": 7.825205577747495e-06, + "loss": 0.4787, + "step": 7394 + }, + { + "epoch": 1.2144602056945784, + "grad_norm": 0.32610810464507667, + "learning_rate": 7.824889576921718e-06, + "loss": 0.4641, + "step": 7395 + }, + { + "epoch": 1.2146244329029212, + "grad_norm": 0.2576373012534115, + "learning_rate": 7.824573539984367e-06, + "loss": 0.48, + "step": 7396 + }, + { + "epoch": 1.2147886601112639, + "grad_norm": 0.2962684067235093, + "learning_rate": 7.824257466938875e-06, + "loss": 0.4809, + "step": 7397 + }, + { + "epoch": 1.2149528873196067, + "grad_norm": 0.3373698620542774, + "learning_rate": 7.823941357788679e-06, + "loss": 0.4949, + "step": 7398 + }, + { + "epoch": 1.2151171145279493, + "grad_norm": 0.2714003069990936, + "learning_rate": 7.823625212537206e-06, + "loss": 0.4791, + "step": 7399 + }, + { + "epoch": 1.2152813417362922, + "grad_norm": 0.27004087404298466, + "learning_rate": 7.823309031187897e-06, + "loss": 0.4778, + "step": 7400 + }, + { + "epoch": 1.2154455689446348, + "grad_norm": 0.27836482022600734, + "learning_rate": 7.82299281374418e-06, + "loss": 0.4814, + "step": 7401 + }, + { + "epoch": 1.2156097961529777, + "grad_norm": 0.5709071029748667, + "learning_rate": 7.822676560209493e-06, + "loss": 0.4845, + "step": 7402 + }, + { + "epoch": 1.2157740233613203, + "grad_norm": 0.28200824916519507, + "learning_rate": 7.822360270587269e-06, + "loss": 0.4959, + "step": 7403 + }, + { + "epoch": 1.2159382505696632, + "grad_norm": 0.36164400269974106, + "learning_rate": 7.822043944880943e-06, + "loss": 0.4751, + "step": 7404 + }, + { + "epoch": 1.2161024777780058, + "grad_norm": 0.2920172636322637, + "learning_rate": 7.821727583093951e-06, + "loss": 0.4898, + "step": 7405 + }, + { + "epoch": 1.2162667049863487, + "grad_norm": 0.27259579242872134, + "learning_rate": 7.821411185229732e-06, + "loss": 0.4832, + "step": 7406 + }, + { + "epoch": 1.2164309321946913, + "grad_norm": 2.1964313211670277, + "learning_rate": 7.821094751291716e-06, + "loss": 0.4811, + "step": 7407 + }, + { + "epoch": 1.2165951594030342, + "grad_norm": 0.3163824618854596, + "learning_rate": 7.820778281283346e-06, + "loss": 0.4965, + "step": 7408 + }, + { + "epoch": 1.2167593866113768, + "grad_norm": 0.29345503086766606, + "learning_rate": 7.820461775208056e-06, + "loss": 0.4871, + "step": 7409 + }, + { + "epoch": 1.2169236138197195, + "grad_norm": 0.30196703868592945, + "learning_rate": 7.820145233069284e-06, + "loss": 0.4605, + "step": 7410 + }, + { + "epoch": 1.2170878410280623, + "grad_norm": 0.28311895504982654, + "learning_rate": 7.81982865487047e-06, + "loss": 0.4912, + "step": 7411 + }, + { + "epoch": 1.217252068236405, + "grad_norm": 0.30339328791642767, + "learning_rate": 7.819512040615047e-06, + "loss": 0.4609, + "step": 7412 + }, + { + "epoch": 1.2174162954447478, + "grad_norm": 0.2596969919404455, + "learning_rate": 7.819195390306459e-06, + "loss": 0.4753, + "step": 7413 + }, + { + "epoch": 1.2175805226530905, + "grad_norm": 0.322969273895278, + "learning_rate": 7.818878703948144e-06, + "loss": 0.4794, + "step": 7414 + }, + { + "epoch": 1.2177447498614333, + "grad_norm": 0.3519640745006145, + "learning_rate": 7.818561981543541e-06, + "loss": 0.4773, + "step": 7415 + }, + { + "epoch": 1.217908977069776, + "grad_norm": 0.34107543982912136, + "learning_rate": 7.81824522309609e-06, + "loss": 0.492, + "step": 7416 + }, + { + "epoch": 1.2180732042781188, + "grad_norm": 0.3281282771747284, + "learning_rate": 7.817928428609229e-06, + "loss": 0.4803, + "step": 7417 + }, + { + "epoch": 1.2182374314864615, + "grad_norm": 0.5308962228651787, + "learning_rate": 7.817611598086403e-06, + "loss": 0.4642, + "step": 7418 + }, + { + "epoch": 1.2184016586948043, + "grad_norm": 0.26900387954254784, + "learning_rate": 7.81729473153105e-06, + "loss": 0.4797, + "step": 7419 + }, + { + "epoch": 1.218565885903147, + "grad_norm": 0.28024744617753733, + "learning_rate": 7.816977828946612e-06, + "loss": 0.4846, + "step": 7420 + }, + { + "epoch": 1.2187301131114898, + "grad_norm": 0.2647335194613172, + "learning_rate": 7.816660890336532e-06, + "loss": 0.4755, + "step": 7421 + }, + { + "epoch": 1.2188943403198325, + "grad_norm": 0.2945285536442789, + "learning_rate": 7.816343915704252e-06, + "loss": 0.4746, + "step": 7422 + }, + { + "epoch": 1.2190585675281753, + "grad_norm": 0.2835965254364738, + "learning_rate": 7.816026905053214e-06, + "loss": 0.4782, + "step": 7423 + }, + { + "epoch": 1.219222794736518, + "grad_norm": 0.4352423901004788, + "learning_rate": 7.815709858386861e-06, + "loss": 0.481, + "step": 7424 + }, + { + "epoch": 1.2193870219448608, + "grad_norm": 0.2687199290132344, + "learning_rate": 7.815392775708639e-06, + "loss": 0.4907, + "step": 7425 + }, + { + "epoch": 1.2195512491532035, + "grad_norm": 0.26676255690653605, + "learning_rate": 7.815075657021986e-06, + "loss": 0.4787, + "step": 7426 + }, + { + "epoch": 1.219715476361546, + "grad_norm": 0.32625127264289355, + "learning_rate": 7.814758502330352e-06, + "loss": 0.4822, + "step": 7427 + }, + { + "epoch": 1.219879703569889, + "grad_norm": 0.2957996126267077, + "learning_rate": 7.814441311637179e-06, + "loss": 0.4793, + "step": 7428 + }, + { + "epoch": 1.2200439307782316, + "grad_norm": 0.3095382988258092, + "learning_rate": 7.814124084945911e-06, + "loss": 0.4866, + "step": 7429 + }, + { + "epoch": 1.2202081579865744, + "grad_norm": 0.2886374847171023, + "learning_rate": 7.813806822259996e-06, + "loss": 0.4791, + "step": 7430 + }, + { + "epoch": 1.220372385194917, + "grad_norm": 0.3268027407819441, + "learning_rate": 7.81348952358288e-06, + "loss": 0.4972, + "step": 7431 + }, + { + "epoch": 1.22053661240326, + "grad_norm": 0.3406074181446285, + "learning_rate": 7.813172188918005e-06, + "loss": 0.5041, + "step": 7432 + }, + { + "epoch": 1.2207008396116026, + "grad_norm": 0.4363014651036817, + "learning_rate": 7.81285481826882e-06, + "loss": 0.466, + "step": 7433 + }, + { + "epoch": 1.2208650668199454, + "grad_norm": 0.27869275838433155, + "learning_rate": 7.812537411638776e-06, + "loss": 0.4781, + "step": 7434 + }, + { + "epoch": 1.221029294028288, + "grad_norm": 0.28443886078659325, + "learning_rate": 7.812219969031313e-06, + "loss": 0.4733, + "step": 7435 + }, + { + "epoch": 1.221193521236631, + "grad_norm": 0.2876297976501121, + "learning_rate": 7.811902490449884e-06, + "loss": 0.4638, + "step": 7436 + }, + { + "epoch": 1.2213577484449736, + "grad_norm": 0.27526069765232986, + "learning_rate": 7.811584975897936e-06, + "loss": 0.4671, + "step": 7437 + }, + { + "epoch": 1.2215219756533164, + "grad_norm": 0.2595326036832628, + "learning_rate": 7.811267425378915e-06, + "loss": 0.4738, + "step": 7438 + }, + { + "epoch": 1.221686202861659, + "grad_norm": 0.47780546288542847, + "learning_rate": 7.810949838896273e-06, + "loss": 0.4714, + "step": 7439 + }, + { + "epoch": 1.221850430070002, + "grad_norm": 0.3353387377929998, + "learning_rate": 7.81063221645346e-06, + "loss": 0.4911, + "step": 7440 + }, + { + "epoch": 1.2220146572783446, + "grad_norm": 0.2867479853754556, + "learning_rate": 7.81031455805392e-06, + "loss": 0.4739, + "step": 7441 + }, + { + "epoch": 1.2221788844866874, + "grad_norm": 0.2873199750403933, + "learning_rate": 7.80999686370111e-06, + "loss": 0.4777, + "step": 7442 + }, + { + "epoch": 1.22234311169503, + "grad_norm": 0.2786622828477527, + "learning_rate": 7.809679133398477e-06, + "loss": 0.4834, + "step": 7443 + }, + { + "epoch": 1.2225073389033727, + "grad_norm": 0.26644504322703355, + "learning_rate": 7.809361367149472e-06, + "loss": 0.4781, + "step": 7444 + }, + { + "epoch": 1.2226715661117156, + "grad_norm": 0.32504077445550017, + "learning_rate": 7.809043564957546e-06, + "loss": 0.4964, + "step": 7445 + }, + { + "epoch": 1.2228357933200582, + "grad_norm": 0.3443886570485489, + "learning_rate": 7.808725726826152e-06, + "loss": 0.4831, + "step": 7446 + }, + { + "epoch": 1.223000020528401, + "grad_norm": 0.3078921394902557, + "learning_rate": 7.808407852758741e-06, + "loss": 0.486, + "step": 7447 + }, + { + "epoch": 1.2231642477367437, + "grad_norm": 0.3137312338134038, + "learning_rate": 7.808089942758765e-06, + "loss": 0.4817, + "step": 7448 + }, + { + "epoch": 1.2233284749450866, + "grad_norm": 0.29602008603252766, + "learning_rate": 7.80777199682968e-06, + "loss": 0.4811, + "step": 7449 + }, + { + "epoch": 1.2234927021534292, + "grad_norm": 0.33542775379730577, + "learning_rate": 7.807454014974935e-06, + "loss": 0.4842, + "step": 7450 + }, + { + "epoch": 1.223656929361772, + "grad_norm": 0.28587329682265583, + "learning_rate": 7.807135997197983e-06, + "loss": 0.4821, + "step": 7451 + }, + { + "epoch": 1.2238211565701147, + "grad_norm": 0.31582625696235567, + "learning_rate": 7.806817943502283e-06, + "loss": 0.4764, + "step": 7452 + }, + { + "epoch": 1.2239853837784576, + "grad_norm": 0.4508737429529132, + "learning_rate": 7.806499853891286e-06, + "loss": 0.4682, + "step": 7453 + }, + { + "epoch": 1.2241496109868002, + "grad_norm": 0.34199337453924816, + "learning_rate": 7.806181728368447e-06, + "loss": 0.4876, + "step": 7454 + }, + { + "epoch": 1.224313838195143, + "grad_norm": 0.29162050236465775, + "learning_rate": 7.805863566937222e-06, + "loss": 0.4816, + "step": 7455 + }, + { + "epoch": 1.2244780654034857, + "grad_norm": 0.25706162141912203, + "learning_rate": 7.805545369601068e-06, + "loss": 0.4749, + "step": 7456 + }, + { + "epoch": 1.2246422926118286, + "grad_norm": 0.30467782555406403, + "learning_rate": 7.805227136363438e-06, + "loss": 0.482, + "step": 7457 + }, + { + "epoch": 1.2248065198201712, + "grad_norm": 0.4620126635135206, + "learning_rate": 7.804908867227787e-06, + "loss": 0.4814, + "step": 7458 + }, + { + "epoch": 1.224970747028514, + "grad_norm": 0.6847698761546367, + "learning_rate": 7.804590562197577e-06, + "loss": 0.4663, + "step": 7459 + }, + { + "epoch": 1.2251349742368567, + "grad_norm": 0.3919829953997119, + "learning_rate": 7.80427222127626e-06, + "loss": 0.46, + "step": 7460 + }, + { + "epoch": 1.2252992014451993, + "grad_norm": 0.9675079986361146, + "learning_rate": 7.803953844467296e-06, + "loss": 0.4948, + "step": 7461 + }, + { + "epoch": 1.2254634286535422, + "grad_norm": 0.4048339109198711, + "learning_rate": 7.803635431774145e-06, + "loss": 0.4751, + "step": 7462 + }, + { + "epoch": 1.2256276558618848, + "grad_norm": 0.33359033850303504, + "learning_rate": 7.80331698320026e-06, + "loss": 0.4903, + "step": 7463 + }, + { + "epoch": 1.2257918830702277, + "grad_norm": 0.4551179514390609, + "learning_rate": 7.802998498749104e-06, + "loss": 0.4831, + "step": 7464 + }, + { + "epoch": 1.2259561102785703, + "grad_norm": 0.2962278157330623, + "learning_rate": 7.802679978424136e-06, + "loss": 0.4894, + "step": 7465 + }, + { + "epoch": 1.2261203374869132, + "grad_norm": 0.3567481952133428, + "learning_rate": 7.802361422228812e-06, + "loss": 0.4846, + "step": 7466 + }, + { + "epoch": 1.2262845646952558, + "grad_norm": 0.2693500702419807, + "learning_rate": 7.802042830166594e-06, + "loss": 0.4673, + "step": 7467 + }, + { + "epoch": 1.2264487919035987, + "grad_norm": 0.2679369801617365, + "learning_rate": 7.801724202240943e-06, + "loss": 0.4873, + "step": 7468 + }, + { + "epoch": 1.2266130191119413, + "grad_norm": 0.28647992777890036, + "learning_rate": 7.801405538455317e-06, + "loss": 0.4603, + "step": 7469 + }, + { + "epoch": 1.2267772463202842, + "grad_norm": 0.2850138121930835, + "learning_rate": 7.801086838813181e-06, + "loss": 0.4809, + "step": 7470 + }, + { + "epoch": 1.2269414735286268, + "grad_norm": 0.3343754442632032, + "learning_rate": 7.800768103317991e-06, + "loss": 0.4717, + "step": 7471 + }, + { + "epoch": 1.2271057007369697, + "grad_norm": 0.3380996961344888, + "learning_rate": 7.800449331973215e-06, + "loss": 0.4688, + "step": 7472 + }, + { + "epoch": 1.2272699279453123, + "grad_norm": 0.2828645657673943, + "learning_rate": 7.80013052478231e-06, + "loss": 0.4826, + "step": 7473 + }, + { + "epoch": 1.2274341551536552, + "grad_norm": 0.34488922487887297, + "learning_rate": 7.799811681748743e-06, + "loss": 0.4963, + "step": 7474 + }, + { + "epoch": 1.2275983823619978, + "grad_norm": 0.36977738285825107, + "learning_rate": 7.799492802875973e-06, + "loss": 0.4799, + "step": 7475 + }, + { + "epoch": 1.2277626095703407, + "grad_norm": 0.4107523883100698, + "learning_rate": 7.799173888167465e-06, + "loss": 0.4791, + "step": 7476 + }, + { + "epoch": 1.2279268367786833, + "grad_norm": 0.3065610046874601, + "learning_rate": 7.798854937626682e-06, + "loss": 0.4837, + "step": 7477 + }, + { + "epoch": 1.228091063987026, + "grad_norm": 0.3125906559459154, + "learning_rate": 7.79853595125709e-06, + "loss": 0.4933, + "step": 7478 + }, + { + "epoch": 1.2282552911953688, + "grad_norm": 0.2626573945619913, + "learning_rate": 7.79821692906215e-06, + "loss": 0.4714, + "step": 7479 + }, + { + "epoch": 1.2284195184037114, + "grad_norm": 0.3571791073570016, + "learning_rate": 7.797897871045332e-06, + "loss": 0.4705, + "step": 7480 + }, + { + "epoch": 1.2285837456120543, + "grad_norm": 0.34360557571526473, + "learning_rate": 7.797578777210096e-06, + "loss": 0.4663, + "step": 7481 + }, + { + "epoch": 1.228747972820397, + "grad_norm": 0.2963914639094682, + "learning_rate": 7.797259647559912e-06, + "loss": 0.4723, + "step": 7482 + }, + { + "epoch": 1.2289122000287398, + "grad_norm": 0.27699861933825076, + "learning_rate": 7.796940482098244e-06, + "loss": 0.4731, + "step": 7483 + }, + { + "epoch": 1.2290764272370824, + "grad_norm": 0.2969103632510052, + "learning_rate": 7.796621280828558e-06, + "loss": 0.4761, + "step": 7484 + }, + { + "epoch": 1.2292406544454253, + "grad_norm": 0.3065597940547537, + "learning_rate": 7.796302043754321e-06, + "loss": 0.4505, + "step": 7485 + }, + { + "epoch": 1.229404881653768, + "grad_norm": 0.26417503495718664, + "learning_rate": 7.795982770879e-06, + "loss": 0.4748, + "step": 7486 + }, + { + "epoch": 1.2295691088621108, + "grad_norm": 0.3398290499434709, + "learning_rate": 7.795663462206067e-06, + "loss": 0.4669, + "step": 7487 + }, + { + "epoch": 1.2297333360704534, + "grad_norm": 0.34742823434001574, + "learning_rate": 7.795344117738982e-06, + "loss": 0.4781, + "step": 7488 + }, + { + "epoch": 1.2298975632787963, + "grad_norm": 0.2945219892926102, + "learning_rate": 7.795024737481219e-06, + "loss": 0.4687, + "step": 7489 + }, + { + "epoch": 1.230061790487139, + "grad_norm": 0.2770494663562303, + "learning_rate": 7.794705321436248e-06, + "loss": 0.4587, + "step": 7490 + }, + { + "epoch": 1.2302260176954818, + "grad_norm": 0.2968149118488759, + "learning_rate": 7.794385869607532e-06, + "loss": 0.511, + "step": 7491 + }, + { + "epoch": 1.2303902449038244, + "grad_norm": 0.2915104293412883, + "learning_rate": 7.794066381998546e-06, + "loss": 0.4689, + "step": 7492 + }, + { + "epoch": 1.2305544721121673, + "grad_norm": 0.35989630798640004, + "learning_rate": 7.793746858612759e-06, + "loss": 0.4944, + "step": 7493 + }, + { + "epoch": 1.23071869932051, + "grad_norm": 0.3141717355163613, + "learning_rate": 7.79342729945364e-06, + "loss": 0.4933, + "step": 7494 + }, + { + "epoch": 1.2308829265288526, + "grad_norm": 0.3191568033697543, + "learning_rate": 7.793107704524659e-06, + "loss": 0.4767, + "step": 7495 + }, + { + "epoch": 1.2310471537371954, + "grad_norm": 0.3239132079874789, + "learning_rate": 7.792788073829289e-06, + "loss": 0.4672, + "step": 7496 + }, + { + "epoch": 1.231211380945538, + "grad_norm": 0.2635865983108721, + "learning_rate": 7.792468407371e-06, + "loss": 0.4478, + "step": 7497 + }, + { + "epoch": 1.231375608153881, + "grad_norm": 0.38292422598333736, + "learning_rate": 7.792148705153266e-06, + "loss": 0.4812, + "step": 7498 + }, + { + "epoch": 1.2315398353622236, + "grad_norm": 0.37145374460876307, + "learning_rate": 7.791828967179559e-06, + "loss": 0.4876, + "step": 7499 + }, + { + "epoch": 1.2317040625705664, + "grad_norm": 0.28915250611709403, + "learning_rate": 7.791509193453348e-06, + "loss": 0.4792, + "step": 7500 + }, + { + "epoch": 1.231868289778909, + "grad_norm": 0.2889110278668834, + "learning_rate": 7.79118938397811e-06, + "loss": 0.4905, + "step": 7501 + }, + { + "epoch": 1.232032516987252, + "grad_norm": 0.33366376283958143, + "learning_rate": 7.790869538757317e-06, + "loss": 0.5038, + "step": 7502 + }, + { + "epoch": 1.2321967441955946, + "grad_norm": 0.44289224941063765, + "learning_rate": 7.790549657794443e-06, + "loss": 0.4876, + "step": 7503 + }, + { + "epoch": 1.2323609714039374, + "grad_norm": 0.3136464858676508, + "learning_rate": 7.790229741092962e-06, + "loss": 0.4955, + "step": 7504 + }, + { + "epoch": 1.23252519861228, + "grad_norm": 0.2531177671121733, + "learning_rate": 7.78990978865635e-06, + "loss": 0.4962, + "step": 7505 + }, + { + "epoch": 1.232689425820623, + "grad_norm": 0.2978682970598931, + "learning_rate": 7.789589800488081e-06, + "loss": 0.5, + "step": 7506 + }, + { + "epoch": 1.2328536530289655, + "grad_norm": 0.2534835272168865, + "learning_rate": 7.789269776591631e-06, + "loss": 0.4681, + "step": 7507 + }, + { + "epoch": 1.2330178802373084, + "grad_norm": 0.35502693503125016, + "learning_rate": 7.788949716970472e-06, + "loss": 0.4868, + "step": 7508 + }, + { + "epoch": 1.233182107445651, + "grad_norm": 0.35494605281913755, + "learning_rate": 7.788629621628084e-06, + "loss": 0.4862, + "step": 7509 + }, + { + "epoch": 1.233346334653994, + "grad_norm": 0.2796519161526337, + "learning_rate": 7.788309490567945e-06, + "loss": 0.4809, + "step": 7510 + }, + { + "epoch": 1.2335105618623365, + "grad_norm": 0.35202315535595735, + "learning_rate": 7.787989323793527e-06, + "loss": 0.4927, + "step": 7511 + }, + { + "epoch": 1.2336747890706792, + "grad_norm": 0.27572529786791367, + "learning_rate": 7.787669121308312e-06, + "loss": 0.477, + "step": 7512 + }, + { + "epoch": 1.233839016279022, + "grad_norm": 0.29646983151833184, + "learning_rate": 7.787348883115774e-06, + "loss": 0.4731, + "step": 7513 + }, + { + "epoch": 1.2340032434873647, + "grad_norm": 0.32889863862531893, + "learning_rate": 7.787028609219394e-06, + "loss": 0.4951, + "step": 7514 + }, + { + "epoch": 1.2341674706957075, + "grad_norm": 0.2655153867934256, + "learning_rate": 7.78670829962265e-06, + "loss": 0.4506, + "step": 7515 + }, + { + "epoch": 1.2343316979040502, + "grad_norm": 0.2663148975867709, + "learning_rate": 7.786387954329018e-06, + "loss": 0.4742, + "step": 7516 + }, + { + "epoch": 1.234495925112393, + "grad_norm": 0.42508777196790876, + "learning_rate": 7.786067573341982e-06, + "loss": 0.4826, + "step": 7517 + }, + { + "epoch": 1.2346601523207357, + "grad_norm": 0.3041477860219649, + "learning_rate": 7.785747156665018e-06, + "loss": 0.4699, + "step": 7518 + }, + { + "epoch": 1.2348243795290785, + "grad_norm": 0.2897416257015565, + "learning_rate": 7.785426704301607e-06, + "loss": 0.4883, + "step": 7519 + }, + { + "epoch": 1.2349886067374212, + "grad_norm": 0.2720161031746429, + "learning_rate": 7.785106216255229e-06, + "loss": 0.4819, + "step": 7520 + }, + { + "epoch": 1.235152833945764, + "grad_norm": 0.3006375841709048, + "learning_rate": 7.784785692529365e-06, + "loss": 0.4817, + "step": 7521 + }, + { + "epoch": 1.2353170611541067, + "grad_norm": 0.31301572775303516, + "learning_rate": 7.784465133127498e-06, + "loss": 0.4642, + "step": 7522 + }, + { + "epoch": 1.2354812883624495, + "grad_norm": 0.3852031934863473, + "learning_rate": 7.784144538053108e-06, + "loss": 0.4686, + "step": 7523 + }, + { + "epoch": 1.2356455155707922, + "grad_norm": 0.2872787747457896, + "learning_rate": 7.783823907309676e-06, + "loss": 0.4778, + "step": 7524 + }, + { + "epoch": 1.235809742779135, + "grad_norm": 0.2864937598952176, + "learning_rate": 7.783503240900686e-06, + "loss": 0.4869, + "step": 7525 + }, + { + "epoch": 1.2359739699874777, + "grad_norm": 0.3873222568675411, + "learning_rate": 7.78318253882962e-06, + "loss": 0.4727, + "step": 7526 + }, + { + "epoch": 1.2361381971958205, + "grad_norm": 0.32706793356824965, + "learning_rate": 7.782861801099963e-06, + "loss": 0.4931, + "step": 7527 + }, + { + "epoch": 1.2363024244041632, + "grad_norm": 0.24977174505164065, + "learning_rate": 7.782541027715195e-06, + "loss": 0.4436, + "step": 7528 + }, + { + "epoch": 1.2364666516125058, + "grad_norm": 0.31267245459003645, + "learning_rate": 7.782220218678804e-06, + "loss": 0.4731, + "step": 7529 + }, + { + "epoch": 1.2366308788208487, + "grad_norm": 0.33351244334836483, + "learning_rate": 7.78189937399427e-06, + "loss": 0.4922, + "step": 7530 + }, + { + "epoch": 1.2367951060291913, + "grad_norm": 0.3155971525806248, + "learning_rate": 7.781578493665083e-06, + "loss": 0.5006, + "step": 7531 + }, + { + "epoch": 1.2369593332375342, + "grad_norm": 0.3519882716741187, + "learning_rate": 7.78125757769472e-06, + "loss": 0.4795, + "step": 7532 + }, + { + "epoch": 1.2371235604458768, + "grad_norm": 0.2979601040823767, + "learning_rate": 7.780936626086675e-06, + "loss": 0.4766, + "step": 7533 + }, + { + "epoch": 1.2372877876542197, + "grad_norm": 0.3063945709411937, + "learning_rate": 7.78061563884443e-06, + "loss": 0.4789, + "step": 7534 + }, + { + "epoch": 1.2374520148625623, + "grad_norm": 0.26722072179018536, + "learning_rate": 7.780294615971471e-06, + "loss": 0.4838, + "step": 7535 + }, + { + "epoch": 1.2376162420709051, + "grad_norm": 0.292009730274553, + "learning_rate": 7.779973557471285e-06, + "loss": 0.4987, + "step": 7536 + }, + { + "epoch": 1.2377804692792478, + "grad_norm": 0.2807020434939495, + "learning_rate": 7.77965246334736e-06, + "loss": 0.4715, + "step": 7537 + }, + { + "epoch": 1.2379446964875906, + "grad_norm": 0.3453670092832338, + "learning_rate": 7.77933133360318e-06, + "loss": 0.4902, + "step": 7538 + }, + { + "epoch": 1.2381089236959333, + "grad_norm": 0.31388854164530383, + "learning_rate": 7.779010168242236e-06, + "loss": 0.4856, + "step": 7539 + }, + { + "epoch": 1.2382731509042761, + "grad_norm": 0.2867020260405545, + "learning_rate": 7.778688967268017e-06, + "loss": 0.4847, + "step": 7540 + }, + { + "epoch": 1.2384373781126188, + "grad_norm": 0.3438963236599197, + "learning_rate": 7.77836773068401e-06, + "loss": 0.5006, + "step": 7541 + }, + { + "epoch": 1.2386016053209616, + "grad_norm": 0.4297254179891695, + "learning_rate": 7.778046458493703e-06, + "loss": 0.4747, + "step": 7542 + }, + { + "epoch": 1.2387658325293043, + "grad_norm": 0.3582137581158092, + "learning_rate": 7.777725150700587e-06, + "loss": 0.4832, + "step": 7543 + }, + { + "epoch": 1.2389300597376471, + "grad_norm": 0.29072289657454703, + "learning_rate": 7.777403807308148e-06, + "loss": 0.4966, + "step": 7544 + }, + { + "epoch": 1.2390942869459898, + "grad_norm": 0.3252671784530234, + "learning_rate": 7.777082428319884e-06, + "loss": 0.4866, + "step": 7545 + }, + { + "epoch": 1.2392585141543324, + "grad_norm": 0.2908401782457425, + "learning_rate": 7.776761013739277e-06, + "loss": 0.4755, + "step": 7546 + }, + { + "epoch": 1.2394227413626753, + "grad_norm": 0.28684276655151986, + "learning_rate": 7.776439563569825e-06, + "loss": 0.4761, + "step": 7547 + }, + { + "epoch": 1.239586968571018, + "grad_norm": 0.30531377495350137, + "learning_rate": 7.776118077815012e-06, + "loss": 0.4831, + "step": 7548 + }, + { + "epoch": 1.2397511957793608, + "grad_norm": 0.29723211564359786, + "learning_rate": 7.775796556478336e-06, + "loss": 0.4801, + "step": 7549 + }, + { + "epoch": 1.2399154229877034, + "grad_norm": 0.2936682730249869, + "learning_rate": 7.775474999563285e-06, + "loss": 0.4832, + "step": 7550 + }, + { + "epoch": 1.2400796501960463, + "grad_norm": 0.3018141932268422, + "learning_rate": 7.775153407073353e-06, + "loss": 0.4906, + "step": 7551 + }, + { + "epoch": 1.240243877404389, + "grad_norm": 0.2787629798347041, + "learning_rate": 7.774831779012033e-06, + "loss": 0.4896, + "step": 7552 + }, + { + "epoch": 1.2404081046127318, + "grad_norm": 0.34481809463993834, + "learning_rate": 7.774510115382818e-06, + "loss": 0.4659, + "step": 7553 + }, + { + "epoch": 1.2405723318210744, + "grad_norm": 0.2779282824687869, + "learning_rate": 7.774188416189201e-06, + "loss": 0.4546, + "step": 7554 + }, + { + "epoch": 1.2407365590294173, + "grad_norm": 0.2786407520838788, + "learning_rate": 7.773866681434676e-06, + "loss": 0.4798, + "step": 7555 + }, + { + "epoch": 1.24090078623776, + "grad_norm": 0.3285343236852004, + "learning_rate": 7.77354491112274e-06, + "loss": 0.4797, + "step": 7556 + }, + { + "epoch": 1.2410650134461028, + "grad_norm": 0.37506765944836495, + "learning_rate": 7.773223105256883e-06, + "loss": 0.4845, + "step": 7557 + }, + { + "epoch": 1.2412292406544454, + "grad_norm": 0.35685647641502605, + "learning_rate": 7.772901263840605e-06, + "loss": 0.4877, + "step": 7558 + }, + { + "epoch": 1.2413934678627883, + "grad_norm": 0.3358896496566247, + "learning_rate": 7.772579386877396e-06, + "loss": 0.49, + "step": 7559 + }, + { + "epoch": 1.241557695071131, + "grad_norm": 0.3514289943534612, + "learning_rate": 7.772257474370757e-06, + "loss": 0.4744, + "step": 7560 + }, + { + "epoch": 1.2417219222794738, + "grad_norm": 0.2937933570234482, + "learning_rate": 7.771935526324183e-06, + "loss": 0.4843, + "step": 7561 + }, + { + "epoch": 1.2418861494878164, + "grad_norm": 0.3393020982420418, + "learning_rate": 7.77161354274117e-06, + "loss": 0.4573, + "step": 7562 + }, + { + "epoch": 1.242050376696159, + "grad_norm": 0.3228901072188093, + "learning_rate": 7.771291523625214e-06, + "loss": 0.5017, + "step": 7563 + }, + { + "epoch": 1.242214603904502, + "grad_norm": 0.3065450686112153, + "learning_rate": 7.770969468979814e-06, + "loss": 0.4758, + "step": 7564 + }, + { + "epoch": 1.2423788311128445, + "grad_norm": 0.3106520824315073, + "learning_rate": 7.770647378808469e-06, + "loss": 0.4759, + "step": 7565 + }, + { + "epoch": 1.2425430583211874, + "grad_norm": 0.2937174607831136, + "learning_rate": 7.770325253114674e-06, + "loss": 0.4622, + "step": 7566 + }, + { + "epoch": 1.24270728552953, + "grad_norm": 0.3386310405929065, + "learning_rate": 7.770003091901928e-06, + "loss": 0.4851, + "step": 7567 + }, + { + "epoch": 1.2428715127378729, + "grad_norm": 0.34444528064525604, + "learning_rate": 7.769680895173733e-06, + "loss": 0.4878, + "step": 7568 + }, + { + "epoch": 1.2430357399462155, + "grad_norm": 0.27978430576428204, + "learning_rate": 7.769358662933584e-06, + "loss": 0.4847, + "step": 7569 + }, + { + "epoch": 1.2431999671545584, + "grad_norm": 0.31389488030986795, + "learning_rate": 7.769036395184987e-06, + "loss": 0.4745, + "step": 7570 + }, + { + "epoch": 1.243364194362901, + "grad_norm": 0.27170554485549553, + "learning_rate": 7.768714091931436e-06, + "loss": 0.4794, + "step": 7571 + }, + { + "epoch": 1.2435284215712439, + "grad_norm": 0.33136051639556463, + "learning_rate": 7.768391753176434e-06, + "loss": 0.4644, + "step": 7572 + }, + { + "epoch": 1.2436926487795865, + "grad_norm": 0.3093556203425737, + "learning_rate": 7.768069378923483e-06, + "loss": 0.4646, + "step": 7573 + }, + { + "epoch": 1.2438568759879294, + "grad_norm": 0.46973993506400613, + "learning_rate": 7.767746969176082e-06, + "loss": 0.4632, + "step": 7574 + }, + { + "epoch": 1.244021103196272, + "grad_norm": 0.26908039743071843, + "learning_rate": 7.767424523937735e-06, + "loss": 0.4705, + "step": 7575 + }, + { + "epoch": 1.2441853304046149, + "grad_norm": 0.37290345238525935, + "learning_rate": 7.767102043211942e-06, + "loss": 0.4944, + "step": 7576 + }, + { + "epoch": 1.2443495576129575, + "grad_norm": 0.43611265222170165, + "learning_rate": 7.766779527002208e-06, + "loss": 0.4909, + "step": 7577 + }, + { + "epoch": 1.2445137848213004, + "grad_norm": 0.3328354020728991, + "learning_rate": 7.766456975312032e-06, + "loss": 0.4696, + "step": 7578 + }, + { + "epoch": 1.244678012029643, + "grad_norm": 0.25915177451173665, + "learning_rate": 7.766134388144921e-06, + "loss": 0.4875, + "step": 7579 + }, + { + "epoch": 1.2448422392379856, + "grad_norm": 0.25832700528645586, + "learning_rate": 7.765811765504377e-06, + "loss": 0.4837, + "step": 7580 + }, + { + "epoch": 1.2450064664463285, + "grad_norm": 0.30829434869947336, + "learning_rate": 7.765489107393903e-06, + "loss": 0.4748, + "step": 7581 + }, + { + "epoch": 1.2451706936546711, + "grad_norm": 0.30836619497139556, + "learning_rate": 7.765166413817006e-06, + "loss": 0.486, + "step": 7582 + }, + { + "epoch": 1.245334920863014, + "grad_norm": 0.6948548750504677, + "learning_rate": 7.76484368477719e-06, + "loss": 0.4691, + "step": 7583 + }, + { + "epoch": 1.2454991480713566, + "grad_norm": 0.2700607921883031, + "learning_rate": 7.764520920277958e-06, + "loss": 0.473, + "step": 7584 + }, + { + "epoch": 1.2456633752796995, + "grad_norm": 0.3288894558422325, + "learning_rate": 7.764198120322816e-06, + "loss": 0.4831, + "step": 7585 + }, + { + "epoch": 1.2458276024880421, + "grad_norm": 0.3254518062997147, + "learning_rate": 7.763875284915272e-06, + "loss": 0.4723, + "step": 7586 + }, + { + "epoch": 1.245991829696385, + "grad_norm": 0.3169491475419649, + "learning_rate": 7.76355241405883e-06, + "loss": 0.5002, + "step": 7587 + }, + { + "epoch": 1.2461560569047276, + "grad_norm": 0.2896778965883232, + "learning_rate": 7.763229507757e-06, + "loss": 0.4784, + "step": 7588 + }, + { + "epoch": 1.2463202841130705, + "grad_norm": 0.6643183902372932, + "learning_rate": 7.762906566013287e-06, + "loss": 0.4641, + "step": 7589 + }, + { + "epoch": 1.2464845113214131, + "grad_norm": 0.4694379861367071, + "learning_rate": 7.762583588831197e-06, + "loss": 0.4951, + "step": 7590 + }, + { + "epoch": 1.246648738529756, + "grad_norm": 0.2714033681351259, + "learning_rate": 7.76226057621424e-06, + "loss": 0.4512, + "step": 7591 + }, + { + "epoch": 1.2468129657380986, + "grad_norm": 0.33738219660603047, + "learning_rate": 7.761937528165923e-06, + "loss": 0.4711, + "step": 7592 + }, + { + "epoch": 1.2469771929464415, + "grad_norm": 0.31785803759257947, + "learning_rate": 7.761614444689755e-06, + "loss": 0.4799, + "step": 7593 + }, + { + "epoch": 1.2471414201547841, + "grad_norm": 0.2944630094956895, + "learning_rate": 7.761291325789244e-06, + "loss": 0.4857, + "step": 7594 + }, + { + "epoch": 1.247305647363127, + "grad_norm": 0.2539993541834688, + "learning_rate": 7.760968171467903e-06, + "loss": 0.4545, + "step": 7595 + }, + { + "epoch": 1.2474698745714696, + "grad_norm": 0.31121532100643007, + "learning_rate": 7.760644981729238e-06, + "loss": 0.4742, + "step": 7596 + }, + { + "epoch": 1.2476341017798123, + "grad_norm": 0.41726789416464144, + "learning_rate": 7.76032175657676e-06, + "loss": 0.4569, + "step": 7597 + }, + { + "epoch": 1.2477983289881551, + "grad_norm": 0.2858322392854143, + "learning_rate": 7.759998496013981e-06, + "loss": 0.4628, + "step": 7598 + }, + { + "epoch": 1.2479625561964978, + "grad_norm": 0.29840578768448217, + "learning_rate": 7.759675200044411e-06, + "loss": 0.4865, + "step": 7599 + }, + { + "epoch": 1.2481267834048406, + "grad_norm": 0.429100208558146, + "learning_rate": 7.75935186867156e-06, + "loss": 0.4707, + "step": 7600 + }, + { + "epoch": 1.2482910106131833, + "grad_norm": 0.2838003969767729, + "learning_rate": 7.759028501898942e-06, + "loss": 0.4709, + "step": 7601 + }, + { + "epoch": 1.2484552378215261, + "grad_norm": 0.3557099389351078, + "learning_rate": 7.758705099730069e-06, + "loss": 0.4723, + "step": 7602 + }, + { + "epoch": 1.2486194650298688, + "grad_norm": 0.2870847693915563, + "learning_rate": 7.75838166216845e-06, + "loss": 0.4866, + "step": 7603 + }, + { + "epoch": 1.2487836922382116, + "grad_norm": 0.3188591937844555, + "learning_rate": 7.758058189217604e-06, + "loss": 0.4486, + "step": 7604 + }, + { + "epoch": 1.2489479194465543, + "grad_norm": 0.37056779120212635, + "learning_rate": 7.757734680881036e-06, + "loss": 0.4855, + "step": 7605 + }, + { + "epoch": 1.2491121466548971, + "grad_norm": 0.3639198034440745, + "learning_rate": 7.757411137162267e-06, + "loss": 0.4747, + "step": 7606 + }, + { + "epoch": 1.2492763738632398, + "grad_norm": 0.2787655583803941, + "learning_rate": 7.757087558064806e-06, + "loss": 0.4783, + "step": 7607 + }, + { + "epoch": 1.2494406010715826, + "grad_norm": 0.30779539880108575, + "learning_rate": 7.756763943592173e-06, + "loss": 0.4965, + "step": 7608 + }, + { + "epoch": 1.2496048282799253, + "grad_norm": 0.31669536490305483, + "learning_rate": 7.756440293747877e-06, + "loss": 0.4912, + "step": 7609 + }, + { + "epoch": 1.249769055488268, + "grad_norm": 0.308687391665335, + "learning_rate": 7.756116608535436e-06, + "loss": 0.4889, + "step": 7610 + }, + { + "epoch": 1.2499332826966107, + "grad_norm": 0.38901222664933416, + "learning_rate": 7.755792887958365e-06, + "loss": 0.4977, + "step": 7611 + }, + { + "epoch": 1.2500975099049536, + "grad_norm": 0.28324939107336244, + "learning_rate": 7.75546913202018e-06, + "loss": 0.4828, + "step": 7612 + }, + { + "epoch": 1.2502617371132962, + "grad_norm": 0.4111706793850341, + "learning_rate": 7.755145340724396e-06, + "loss": 0.4756, + "step": 7613 + }, + { + "epoch": 1.2504259643216389, + "grad_norm": 0.3162486727419526, + "learning_rate": 7.754821514074534e-06, + "loss": 0.4869, + "step": 7614 + }, + { + "epoch": 1.2505901915299817, + "grad_norm": 0.30369645916751725, + "learning_rate": 7.754497652074106e-06, + "loss": 0.498, + "step": 7615 + }, + { + "epoch": 1.2507544187383246, + "grad_norm": 0.2973675199028055, + "learning_rate": 7.754173754726631e-06, + "loss": 0.4724, + "step": 7616 + }, + { + "epoch": 1.2509186459466672, + "grad_norm": 0.3635026371907883, + "learning_rate": 7.75384982203563e-06, + "loss": 0.4735, + "step": 7617 + }, + { + "epoch": 1.2510828731550099, + "grad_norm": 0.3335327209309586, + "learning_rate": 7.753525854004618e-06, + "loss": 0.4845, + "step": 7618 + }, + { + "epoch": 1.2512471003633527, + "grad_norm": 0.27222796938938104, + "learning_rate": 7.753201850637111e-06, + "loss": 0.4745, + "step": 7619 + }, + { + "epoch": 1.2514113275716954, + "grad_norm": 0.25174104281048826, + "learning_rate": 7.752877811936634e-06, + "loss": 0.4695, + "step": 7620 + }, + { + "epoch": 1.2515755547800382, + "grad_norm": 0.37642279840391096, + "learning_rate": 7.752553737906702e-06, + "loss": 0.4846, + "step": 7621 + }, + { + "epoch": 1.2517397819883809, + "grad_norm": 0.28660532539995426, + "learning_rate": 7.752229628550837e-06, + "loss": 0.4891, + "step": 7622 + }, + { + "epoch": 1.2519040091967237, + "grad_norm": 0.5222307981778282, + "learning_rate": 7.75190548387256e-06, + "loss": 0.4841, + "step": 7623 + }, + { + "epoch": 1.2520682364050664, + "grad_norm": 0.2689431118979005, + "learning_rate": 7.751581303875387e-06, + "loss": 0.4654, + "step": 7624 + }, + { + "epoch": 1.2522324636134092, + "grad_norm": 0.26136425839250754, + "learning_rate": 7.751257088562843e-06, + "loss": 0.4892, + "step": 7625 + }, + { + "epoch": 1.2523966908217519, + "grad_norm": 0.3392486862232706, + "learning_rate": 7.75093283793845e-06, + "loss": 0.4785, + "step": 7626 + }, + { + "epoch": 1.2525609180300945, + "grad_norm": 0.28026937023228876, + "learning_rate": 7.750608552005726e-06, + "loss": 0.4753, + "step": 7627 + }, + { + "epoch": 1.2527251452384374, + "grad_norm": 0.3107954217489059, + "learning_rate": 7.750284230768194e-06, + "loss": 0.4883, + "step": 7628 + }, + { + "epoch": 1.2528893724467802, + "grad_norm": 0.3049446665147886, + "learning_rate": 7.749959874229381e-06, + "loss": 0.4796, + "step": 7629 + }, + { + "epoch": 1.2530535996551229, + "grad_norm": 0.3168276384106462, + "learning_rate": 7.749635482392802e-06, + "loss": 0.4955, + "step": 7630 + }, + { + "epoch": 1.2532178268634655, + "grad_norm": 0.3235340299163566, + "learning_rate": 7.749311055261989e-06, + "loss": 0.481, + "step": 7631 + }, + { + "epoch": 1.2533820540718084, + "grad_norm": 0.39371662529450346, + "learning_rate": 7.748986592840457e-06, + "loss": 0.4861, + "step": 7632 + }, + { + "epoch": 1.2535462812801512, + "grad_norm": 0.285427242350543, + "learning_rate": 7.748662095131736e-06, + "loss": 0.498, + "step": 7633 + }, + { + "epoch": 1.2537105084884939, + "grad_norm": 0.33650249587902675, + "learning_rate": 7.748337562139348e-06, + "loss": 0.4865, + "step": 7634 + }, + { + "epoch": 1.2538747356968365, + "grad_norm": 0.3027523754521188, + "learning_rate": 7.748012993866817e-06, + "loss": 0.4752, + "step": 7635 + }, + { + "epoch": 1.2540389629051794, + "grad_norm": 0.2806626536948512, + "learning_rate": 7.747688390317672e-06, + "loss": 0.4701, + "step": 7636 + }, + { + "epoch": 1.254203190113522, + "grad_norm": 0.2646422086313468, + "learning_rate": 7.747363751495434e-06, + "loss": 0.4891, + "step": 7637 + }, + { + "epoch": 1.2543674173218649, + "grad_norm": 0.273441689961552, + "learning_rate": 7.747039077403631e-06, + "loss": 0.4606, + "step": 7638 + }, + { + "epoch": 1.2545316445302075, + "grad_norm": 0.39997955334967183, + "learning_rate": 7.746714368045788e-06, + "loss": 0.4665, + "step": 7639 + }, + { + "epoch": 1.2546958717385504, + "grad_norm": 0.3156118656661959, + "learning_rate": 7.746389623425435e-06, + "loss": 0.4848, + "step": 7640 + }, + { + "epoch": 1.254860098946893, + "grad_norm": 0.2993686116712013, + "learning_rate": 7.746064843546096e-06, + "loss": 0.4766, + "step": 7641 + }, + { + "epoch": 1.2550243261552358, + "grad_norm": 0.35049350059175216, + "learning_rate": 7.745740028411296e-06, + "loss": 0.4811, + "step": 7642 + }, + { + "epoch": 1.2551885533635785, + "grad_norm": 0.33841545284685837, + "learning_rate": 7.74541517802457e-06, + "loss": 0.4875, + "step": 7643 + }, + { + "epoch": 1.2553527805719211, + "grad_norm": 0.29653348815931907, + "learning_rate": 7.745090292389438e-06, + "loss": 0.4834, + "step": 7644 + }, + { + "epoch": 1.255517007780264, + "grad_norm": 1.0619524130211613, + "learning_rate": 7.744765371509437e-06, + "loss": 0.4715, + "step": 7645 + }, + { + "epoch": 1.2556812349886068, + "grad_norm": 0.33283296033041115, + "learning_rate": 7.744440415388089e-06, + "loss": 0.4661, + "step": 7646 + }, + { + "epoch": 1.2558454621969495, + "grad_norm": 0.3195752649112358, + "learning_rate": 7.744115424028925e-06, + "loss": 0.4981, + "step": 7647 + }, + { + "epoch": 1.2560096894052921, + "grad_norm": 0.2546867337134892, + "learning_rate": 7.74379039743548e-06, + "loss": 0.4771, + "step": 7648 + }, + { + "epoch": 1.256173916613635, + "grad_norm": 0.30738186122930333, + "learning_rate": 7.743465335611276e-06, + "loss": 0.4666, + "step": 7649 + }, + { + "epoch": 1.2563381438219778, + "grad_norm": 0.27859639311481776, + "learning_rate": 7.74314023855985e-06, + "loss": 0.4712, + "step": 7650 + }, + { + "epoch": 1.2565023710303205, + "grad_norm": 0.3330781418753308, + "learning_rate": 7.742815106284728e-06, + "loss": 0.4659, + "step": 7651 + }, + { + "epoch": 1.2566665982386631, + "grad_norm": 0.32030910397101686, + "learning_rate": 7.742489938789444e-06, + "loss": 0.4489, + "step": 7652 + }, + { + "epoch": 1.256830825447006, + "grad_norm": 0.358633123705577, + "learning_rate": 7.74216473607753e-06, + "loss": 0.455, + "step": 7653 + }, + { + "epoch": 1.2569950526553486, + "grad_norm": 0.28410974154650376, + "learning_rate": 7.741839498152515e-06, + "loss": 0.4801, + "step": 7654 + }, + { + "epoch": 1.2571592798636915, + "grad_norm": 0.34427076281168334, + "learning_rate": 7.741514225017935e-06, + "loss": 0.487, + "step": 7655 + }, + { + "epoch": 1.257323507072034, + "grad_norm": 0.31974128073632185, + "learning_rate": 7.741188916677321e-06, + "loss": 0.4781, + "step": 7656 + }, + { + "epoch": 1.257487734280377, + "grad_norm": 0.2841276269841283, + "learning_rate": 7.740863573134208e-06, + "loss": 0.4656, + "step": 7657 + }, + { + "epoch": 1.2576519614887196, + "grad_norm": 0.29801297035583985, + "learning_rate": 7.740538194392126e-06, + "loss": 0.492, + "step": 7658 + }, + { + "epoch": 1.2578161886970625, + "grad_norm": 0.40026325549229824, + "learning_rate": 7.740212780454611e-06, + "loss": 0.4843, + "step": 7659 + }, + { + "epoch": 1.257980415905405, + "grad_norm": 0.3383340220937869, + "learning_rate": 7.739887331325199e-06, + "loss": 0.4584, + "step": 7660 + }, + { + "epoch": 1.2581446431137477, + "grad_norm": 0.3879046565559537, + "learning_rate": 7.73956184700742e-06, + "loss": 0.4712, + "step": 7661 + }, + { + "epoch": 1.2583088703220906, + "grad_norm": 0.3046193115336133, + "learning_rate": 7.739236327504814e-06, + "loss": 0.4757, + "step": 7662 + }, + { + "epoch": 1.2584730975304335, + "grad_norm": 0.27583639779518276, + "learning_rate": 7.738910772820915e-06, + "loss": 0.4625, + "step": 7663 + }, + { + "epoch": 1.258637324738776, + "grad_norm": 0.2743611344103936, + "learning_rate": 7.738585182959257e-06, + "loss": 0.4819, + "step": 7664 + }, + { + "epoch": 1.2588015519471187, + "grad_norm": 0.26396529926757517, + "learning_rate": 7.73825955792338e-06, + "loss": 0.4645, + "step": 7665 + }, + { + "epoch": 1.2589657791554616, + "grad_norm": 0.33484568225629724, + "learning_rate": 7.737933897716815e-06, + "loss": 0.483, + "step": 7666 + }, + { + "epoch": 1.2591300063638045, + "grad_norm": 0.2855404254518291, + "learning_rate": 7.737608202343104e-06, + "loss": 0.4797, + "step": 7667 + }, + { + "epoch": 1.259294233572147, + "grad_norm": 0.2678897122032849, + "learning_rate": 7.737282471805782e-06, + "loss": 0.4677, + "step": 7668 + }, + { + "epoch": 1.2594584607804897, + "grad_norm": 0.33378647809677103, + "learning_rate": 7.736956706108388e-06, + "loss": 0.4698, + "step": 7669 + }, + { + "epoch": 1.2596226879888326, + "grad_norm": 0.33762949882200577, + "learning_rate": 7.736630905254458e-06, + "loss": 0.4698, + "step": 7670 + }, + { + "epoch": 1.2597869151971752, + "grad_norm": 0.28816229123616627, + "learning_rate": 7.736305069247535e-06, + "loss": 0.4851, + "step": 7671 + }, + { + "epoch": 1.259951142405518, + "grad_norm": 0.3474911045704335, + "learning_rate": 7.735979198091151e-06, + "loss": 0.4729, + "step": 7672 + }, + { + "epoch": 1.2601153696138607, + "grad_norm": 0.31218960292473513, + "learning_rate": 7.735653291788851e-06, + "loss": 0.4663, + "step": 7673 + }, + { + "epoch": 1.2602795968222036, + "grad_norm": 0.2741703738720939, + "learning_rate": 7.735327350344173e-06, + "loss": 0.4788, + "step": 7674 + }, + { + "epoch": 1.2604438240305462, + "grad_norm": 0.3220346800939815, + "learning_rate": 7.735001373760658e-06, + "loss": 0.4785, + "step": 7675 + }, + { + "epoch": 1.260608051238889, + "grad_norm": 0.2687168544207819, + "learning_rate": 7.734675362041843e-06, + "loss": 0.4707, + "step": 7676 + }, + { + "epoch": 1.2607722784472317, + "grad_norm": 0.2882314916411854, + "learning_rate": 7.734349315191272e-06, + "loss": 0.4771, + "step": 7677 + }, + { + "epoch": 1.2609365056555744, + "grad_norm": 0.27978276123566304, + "learning_rate": 7.734023233212484e-06, + "loss": 0.4883, + "step": 7678 + }, + { + "epoch": 1.2611007328639172, + "grad_norm": 0.4947967595175015, + "learning_rate": 7.733697116109024e-06, + "loss": 0.4965, + "step": 7679 + }, + { + "epoch": 1.26126496007226, + "grad_norm": 0.3388665890482255, + "learning_rate": 7.73337096388443e-06, + "loss": 0.4687, + "step": 7680 + }, + { + "epoch": 1.2614291872806027, + "grad_norm": 0.34797616819323746, + "learning_rate": 7.733044776542248e-06, + "loss": 0.4794, + "step": 7681 + }, + { + "epoch": 1.2615934144889454, + "grad_norm": 0.3238567132030185, + "learning_rate": 7.732718554086017e-06, + "loss": 0.4868, + "step": 7682 + }, + { + "epoch": 1.2617576416972882, + "grad_norm": 0.2742114952715846, + "learning_rate": 7.732392296519283e-06, + "loss": 0.4742, + "step": 7683 + }, + { + "epoch": 1.261921868905631, + "grad_norm": 0.29131539497818637, + "learning_rate": 7.732066003845588e-06, + "loss": 0.4896, + "step": 7684 + }, + { + "epoch": 1.2620860961139737, + "grad_norm": 0.2622760343595224, + "learning_rate": 7.731739676068477e-06, + "loss": 0.4703, + "step": 7685 + }, + { + "epoch": 1.2622503233223163, + "grad_norm": 0.2930264769742631, + "learning_rate": 7.731413313191492e-06, + "loss": 0.4515, + "step": 7686 + }, + { + "epoch": 1.2624145505306592, + "grad_norm": 0.34566046733120337, + "learning_rate": 7.731086915218181e-06, + "loss": 0.4722, + "step": 7687 + }, + { + "epoch": 1.2625787777390018, + "grad_norm": 0.39056124524545094, + "learning_rate": 7.730760482152085e-06, + "loss": 0.4809, + "step": 7688 + }, + { + "epoch": 1.2627430049473447, + "grad_norm": 0.3505071210062579, + "learning_rate": 7.730434013996753e-06, + "loss": 0.4887, + "step": 7689 + }, + { + "epoch": 1.2629072321556873, + "grad_norm": 0.34205645309234606, + "learning_rate": 7.730107510755729e-06, + "loss": 0.4932, + "step": 7690 + }, + { + "epoch": 1.2630714593640302, + "grad_norm": 0.29161143835530734, + "learning_rate": 7.729780972432559e-06, + "loss": 0.4854, + "step": 7691 + }, + { + "epoch": 1.2632356865723728, + "grad_norm": 0.45514430334380673, + "learning_rate": 7.729454399030791e-06, + "loss": 0.4945, + "step": 7692 + }, + { + "epoch": 1.2633999137807157, + "grad_norm": 0.3141566231439699, + "learning_rate": 7.72912779055397e-06, + "loss": 0.4761, + "step": 7693 + }, + { + "epoch": 1.2635641409890583, + "grad_norm": 0.3559790524001432, + "learning_rate": 7.728801147005643e-06, + "loss": 0.4822, + "step": 7694 + }, + { + "epoch": 1.263728368197401, + "grad_norm": 0.3237951014226304, + "learning_rate": 7.728474468389361e-06, + "loss": 0.5001, + "step": 7695 + }, + { + "epoch": 1.2638925954057438, + "grad_norm": 0.3288019447855348, + "learning_rate": 7.72814775470867e-06, + "loss": 0.4917, + "step": 7696 + }, + { + "epoch": 1.2640568226140867, + "grad_norm": 0.431996411415779, + "learning_rate": 7.727821005967117e-06, + "loss": 0.4715, + "step": 7697 + }, + { + "epoch": 1.2642210498224293, + "grad_norm": 0.2891597431196658, + "learning_rate": 7.727494222168252e-06, + "loss": 0.4691, + "step": 7698 + }, + { + "epoch": 1.264385277030772, + "grad_norm": 0.37428064068458555, + "learning_rate": 7.727167403315625e-06, + "loss": 0.4835, + "step": 7699 + }, + { + "epoch": 1.2645495042391148, + "grad_norm": 0.3053769528132182, + "learning_rate": 7.726840549412784e-06, + "loss": 0.4808, + "step": 7700 + }, + { + "epoch": 1.2647137314474577, + "grad_norm": 0.4850739518874018, + "learning_rate": 7.726513660463282e-06, + "loss": 0.4627, + "step": 7701 + }, + { + "epoch": 1.2648779586558003, + "grad_norm": 0.2920598155904834, + "learning_rate": 7.726186736470666e-06, + "loss": 0.4685, + "step": 7702 + }, + { + "epoch": 1.265042185864143, + "grad_norm": 0.338510521081132, + "learning_rate": 7.725859777438487e-06, + "loss": 0.4743, + "step": 7703 + }, + { + "epoch": 1.2652064130724858, + "grad_norm": 0.4296571563713848, + "learning_rate": 7.725532783370298e-06, + "loss": 0.4923, + "step": 7704 + }, + { + "epoch": 1.2653706402808285, + "grad_norm": 0.30765920212094816, + "learning_rate": 7.725205754269648e-06, + "loss": 0.5024, + "step": 7705 + }, + { + "epoch": 1.2655348674891713, + "grad_norm": 0.2709899542946473, + "learning_rate": 7.724878690140093e-06, + "loss": 0.4717, + "step": 7706 + }, + { + "epoch": 1.265699094697514, + "grad_norm": 0.29725395484723255, + "learning_rate": 7.724551590985182e-06, + "loss": 0.4627, + "step": 7707 + }, + { + "epoch": 1.2658633219058568, + "grad_norm": 0.4053905691221062, + "learning_rate": 7.724224456808465e-06, + "loss": 0.4852, + "step": 7708 + }, + { + "epoch": 1.2660275491141995, + "grad_norm": 0.3116877740483153, + "learning_rate": 7.723897287613502e-06, + "loss": 0.4735, + "step": 7709 + }, + { + "epoch": 1.2661917763225423, + "grad_norm": 0.35449971632410593, + "learning_rate": 7.72357008340384e-06, + "loss": 0.4981, + "step": 7710 + }, + { + "epoch": 1.266356003530885, + "grad_norm": 0.3482399264073418, + "learning_rate": 7.723242844183038e-06, + "loss": 0.4552, + "step": 7711 + }, + { + "epoch": 1.2665202307392276, + "grad_norm": 0.437774132734577, + "learning_rate": 7.722915569954646e-06, + "loss": 0.4631, + "step": 7712 + }, + { + "epoch": 1.2666844579475705, + "grad_norm": 0.26839427210078487, + "learning_rate": 7.72258826072222e-06, + "loss": 0.495, + "step": 7713 + }, + { + "epoch": 1.2668486851559133, + "grad_norm": 0.31217056993105924, + "learning_rate": 7.722260916489313e-06, + "loss": 0.4844, + "step": 7714 + }, + { + "epoch": 1.267012912364256, + "grad_norm": 0.4914939353825111, + "learning_rate": 7.721933537259483e-06, + "loss": 0.4797, + "step": 7715 + }, + { + "epoch": 1.2671771395725986, + "grad_norm": 0.29182947550725524, + "learning_rate": 7.721606123036288e-06, + "loss": 0.4833, + "step": 7716 + }, + { + "epoch": 1.2673413667809414, + "grad_norm": 0.37793151937577985, + "learning_rate": 7.721278673823278e-06, + "loss": 0.4669, + "step": 7717 + }, + { + "epoch": 1.2675055939892843, + "grad_norm": 0.26048313020401365, + "learning_rate": 7.720951189624013e-06, + "loss": 0.4815, + "step": 7718 + }, + { + "epoch": 1.267669821197627, + "grad_norm": 0.3111343689668257, + "learning_rate": 7.720623670442048e-06, + "loss": 0.471, + "step": 7719 + }, + { + "epoch": 1.2678340484059696, + "grad_norm": 0.3135420875235267, + "learning_rate": 7.720296116280944e-06, + "loss": 0.4746, + "step": 7720 + }, + { + "epoch": 1.2679982756143124, + "grad_norm": 0.2976984908795594, + "learning_rate": 7.719968527144253e-06, + "loss": 0.4635, + "step": 7721 + }, + { + "epoch": 1.268162502822655, + "grad_norm": 0.2756385116643583, + "learning_rate": 7.719640903035538e-06, + "loss": 0.4821, + "step": 7722 + }, + { + "epoch": 1.268326730030998, + "grad_norm": 0.3310928364944098, + "learning_rate": 7.719313243958353e-06, + "loss": 0.5135, + "step": 7723 + }, + { + "epoch": 1.2684909572393406, + "grad_norm": 0.2898540745776658, + "learning_rate": 7.71898554991626e-06, + "loss": 0.4747, + "step": 7724 + }, + { + "epoch": 1.2686551844476834, + "grad_norm": 0.29846089427282313, + "learning_rate": 7.718657820912816e-06, + "loss": 0.4645, + "step": 7725 + }, + { + "epoch": 1.268819411656026, + "grad_norm": 0.4277166211567597, + "learning_rate": 7.718330056951582e-06, + "loss": 0.4626, + "step": 7726 + }, + { + "epoch": 1.268983638864369, + "grad_norm": 0.3386584451128976, + "learning_rate": 7.718002258036117e-06, + "loss": 0.45, + "step": 7727 + }, + { + "epoch": 1.2691478660727116, + "grad_norm": 0.31044845044724056, + "learning_rate": 7.717674424169983e-06, + "loss": 0.493, + "step": 7728 + }, + { + "epoch": 1.2693120932810542, + "grad_norm": 0.28929110658363827, + "learning_rate": 7.717346555356737e-06, + "loss": 0.4995, + "step": 7729 + }, + { + "epoch": 1.269476320489397, + "grad_norm": 0.2750277317962798, + "learning_rate": 7.717018651599942e-06, + "loss": 0.4523, + "step": 7730 + }, + { + "epoch": 1.26964054769774, + "grad_norm": 0.2646663022346015, + "learning_rate": 7.71669071290316e-06, + "loss": 0.4684, + "step": 7731 + }, + { + "epoch": 1.2698047749060826, + "grad_norm": 0.2722408985365008, + "learning_rate": 7.716362739269952e-06, + "loss": 0.4867, + "step": 7732 + }, + { + "epoch": 1.2699690021144252, + "grad_norm": 0.2813196075417855, + "learning_rate": 7.71603473070388e-06, + "loss": 0.5011, + "step": 7733 + }, + { + "epoch": 1.270133229322768, + "grad_norm": 0.2811195028154342, + "learning_rate": 7.715706687208507e-06, + "loss": 0.4883, + "step": 7734 + }, + { + "epoch": 1.270297456531111, + "grad_norm": 0.3121290061085945, + "learning_rate": 7.715378608787394e-06, + "loss": 0.5022, + "step": 7735 + }, + { + "epoch": 1.2704616837394536, + "grad_norm": 0.29219405088342026, + "learning_rate": 7.715050495444108e-06, + "loss": 0.4765, + "step": 7736 + }, + { + "epoch": 1.2706259109477962, + "grad_norm": 0.29988768120792714, + "learning_rate": 7.71472234718221e-06, + "loss": 0.4849, + "step": 7737 + }, + { + "epoch": 1.270790138156139, + "grad_norm": 0.30856320601214104, + "learning_rate": 7.714394164005264e-06, + "loss": 0.4753, + "step": 7738 + }, + { + "epoch": 1.2709543653644817, + "grad_norm": 0.29738267786753775, + "learning_rate": 7.714065945916834e-06, + "loss": 0.4698, + "step": 7739 + }, + { + "epoch": 1.2711185925728246, + "grad_norm": 0.3323745523516455, + "learning_rate": 7.713737692920488e-06, + "loss": 0.4704, + "step": 7740 + }, + { + "epoch": 1.2712828197811672, + "grad_norm": 0.2512031804860789, + "learning_rate": 7.713409405019786e-06, + "loss": 0.4737, + "step": 7741 + }, + { + "epoch": 1.27144704698951, + "grad_norm": 0.28820433890993685, + "learning_rate": 7.713081082218297e-06, + "loss": 0.4625, + "step": 7742 + }, + { + "epoch": 1.2716112741978527, + "grad_norm": 0.3977011594936401, + "learning_rate": 7.712752724519588e-06, + "loss": 0.4781, + "step": 7743 + }, + { + "epoch": 1.2717755014061956, + "grad_norm": 0.26281093424723, + "learning_rate": 7.712424331927221e-06, + "loss": 0.4773, + "step": 7744 + }, + { + "epoch": 1.2719397286145382, + "grad_norm": 0.3895924508235183, + "learning_rate": 7.712095904444767e-06, + "loss": 0.4865, + "step": 7745 + }, + { + "epoch": 1.2721039558228808, + "grad_norm": 0.6117971589272038, + "learning_rate": 7.71176744207579e-06, + "loss": 0.4621, + "step": 7746 + }, + { + "epoch": 1.2722681830312237, + "grad_norm": 0.2916185512611372, + "learning_rate": 7.71143894482386e-06, + "loss": 0.4791, + "step": 7747 + }, + { + "epoch": 1.2724324102395665, + "grad_norm": 0.3215675610815016, + "learning_rate": 7.711110412692543e-06, + "loss": 0.4743, + "step": 7748 + }, + { + "epoch": 1.2725966374479092, + "grad_norm": 0.3985246567089694, + "learning_rate": 7.710781845685406e-06, + "loss": 0.4826, + "step": 7749 + }, + { + "epoch": 1.2727608646562518, + "grad_norm": 0.3815067560152665, + "learning_rate": 7.710453243806021e-06, + "loss": 0.4847, + "step": 7750 + }, + { + "epoch": 1.2729250918645947, + "grad_norm": 0.2545415166282443, + "learning_rate": 7.710124607057954e-06, + "loss": 0.452, + "step": 7751 + }, + { + "epoch": 1.2730893190729375, + "grad_norm": 0.3512974958871999, + "learning_rate": 7.709795935444777e-06, + "loss": 0.4545, + "step": 7752 + }, + { + "epoch": 1.2732535462812802, + "grad_norm": 0.33763955275343127, + "learning_rate": 7.709467228970056e-06, + "loss": 0.4819, + "step": 7753 + }, + { + "epoch": 1.2734177734896228, + "grad_norm": 0.47180854525995025, + "learning_rate": 7.709138487637365e-06, + "loss": 0.4773, + "step": 7754 + }, + { + "epoch": 1.2735820006979657, + "grad_norm": 0.2883546400469497, + "learning_rate": 7.708809711450272e-06, + "loss": 0.4654, + "step": 7755 + }, + { + "epoch": 1.2737462279063083, + "grad_norm": 0.27563610891719326, + "learning_rate": 7.708480900412348e-06, + "loss": 0.4801, + "step": 7756 + }, + { + "epoch": 1.2739104551146512, + "grad_norm": 0.5716890214745179, + "learning_rate": 7.708152054527165e-06, + "loss": 0.4688, + "step": 7757 + }, + { + "epoch": 1.2740746823229938, + "grad_norm": 0.42195057385213214, + "learning_rate": 7.707823173798295e-06, + "loss": 0.4796, + "step": 7758 + }, + { + "epoch": 1.2742389095313367, + "grad_norm": 0.30176871840975483, + "learning_rate": 7.707494258229308e-06, + "loss": 0.4812, + "step": 7759 + }, + { + "epoch": 1.2744031367396793, + "grad_norm": 0.3565361831398607, + "learning_rate": 7.707165307823778e-06, + "loss": 0.4662, + "step": 7760 + }, + { + "epoch": 1.2745673639480222, + "grad_norm": 0.29370991851580663, + "learning_rate": 7.706836322585278e-06, + "loss": 0.485, + "step": 7761 + }, + { + "epoch": 1.2747315911563648, + "grad_norm": 0.3047484919897347, + "learning_rate": 7.70650730251738e-06, + "loss": 0.4779, + "step": 7762 + }, + { + "epoch": 1.2748958183647074, + "grad_norm": 0.28808269074598614, + "learning_rate": 7.706178247623659e-06, + "loss": 0.4763, + "step": 7763 + }, + { + "epoch": 1.2750600455730503, + "grad_norm": 0.27915183570391694, + "learning_rate": 7.705849157907686e-06, + "loss": 0.4792, + "step": 7764 + }, + { + "epoch": 1.2752242727813932, + "grad_norm": 0.32806606132369737, + "learning_rate": 7.705520033373038e-06, + "loss": 0.4698, + "step": 7765 + }, + { + "epoch": 1.2753884999897358, + "grad_norm": 0.3089376020988689, + "learning_rate": 7.70519087402329e-06, + "loss": 0.5008, + "step": 7766 + }, + { + "epoch": 1.2755527271980784, + "grad_norm": 0.2725056465017377, + "learning_rate": 7.704861679862013e-06, + "loss": 0.4721, + "step": 7767 + }, + { + "epoch": 1.2757169544064213, + "grad_norm": 0.28418412299316054, + "learning_rate": 7.704532450892785e-06, + "loss": 0.4714, + "step": 7768 + }, + { + "epoch": 1.2758811816147642, + "grad_norm": 0.3165971984097728, + "learning_rate": 7.704203187119183e-06, + "loss": 0.4929, + "step": 7769 + }, + { + "epoch": 1.2760454088231068, + "grad_norm": 0.36671004266675833, + "learning_rate": 7.703873888544782e-06, + "loss": 0.4695, + "step": 7770 + }, + { + "epoch": 1.2762096360314494, + "grad_norm": 0.2693544975307786, + "learning_rate": 7.703544555173158e-06, + "loss": 0.476, + "step": 7771 + }, + { + "epoch": 1.2763738632397923, + "grad_norm": 0.3594750493142773, + "learning_rate": 7.703215187007889e-06, + "loss": 0.4489, + "step": 7772 + }, + { + "epoch": 1.276538090448135, + "grad_norm": 0.3247458570973557, + "learning_rate": 7.70288578405255e-06, + "loss": 0.4897, + "step": 7773 + }, + { + "epoch": 1.2767023176564778, + "grad_norm": 0.31715952464757263, + "learning_rate": 7.702556346310721e-06, + "loss": 0.4879, + "step": 7774 + }, + { + "epoch": 1.2768665448648204, + "grad_norm": 0.3040453912621078, + "learning_rate": 7.70222687378598e-06, + "loss": 0.4747, + "step": 7775 + }, + { + "epoch": 1.2770307720731633, + "grad_norm": 0.35248094697874277, + "learning_rate": 7.701897366481903e-06, + "loss": 0.4936, + "step": 7776 + }, + { + "epoch": 1.277194999281506, + "grad_norm": 0.28411064669334896, + "learning_rate": 7.70156782440207e-06, + "loss": 0.4849, + "step": 7777 + }, + { + "epoch": 1.2773592264898488, + "grad_norm": 0.30851415055717885, + "learning_rate": 7.701238247550064e-06, + "loss": 0.4883, + "step": 7778 + }, + { + "epoch": 1.2775234536981914, + "grad_norm": 0.30100077430210254, + "learning_rate": 7.700908635929458e-06, + "loss": 0.4926, + "step": 7779 + }, + { + "epoch": 1.277687680906534, + "grad_norm": 0.3256247158390222, + "learning_rate": 7.700578989543835e-06, + "loss": 0.4769, + "step": 7780 + }, + { + "epoch": 1.277851908114877, + "grad_norm": 0.3831007490176088, + "learning_rate": 7.700249308396775e-06, + "loss": 0.481, + "step": 7781 + }, + { + "epoch": 1.2780161353232198, + "grad_norm": 0.2900015324255877, + "learning_rate": 7.699919592491862e-06, + "loss": 0.4581, + "step": 7782 + }, + { + "epoch": 1.2781803625315624, + "grad_norm": 0.39968965012855234, + "learning_rate": 7.699589841832671e-06, + "loss": 0.475, + "step": 7783 + }, + { + "epoch": 1.278344589739905, + "grad_norm": 0.2987916540815722, + "learning_rate": 7.699260056422787e-06, + "loss": 0.4799, + "step": 7784 + }, + { + "epoch": 1.278508816948248, + "grad_norm": 0.3031895511173132, + "learning_rate": 7.69893023626579e-06, + "loss": 0.4982, + "step": 7785 + }, + { + "epoch": 1.2786730441565908, + "grad_norm": 0.3036242316048294, + "learning_rate": 7.698600381365264e-06, + "loss": 0.4725, + "step": 7786 + }, + { + "epoch": 1.2788372713649334, + "grad_norm": 0.29859693726042974, + "learning_rate": 7.698270491724793e-06, + "loss": 0.4765, + "step": 7787 + }, + { + "epoch": 1.279001498573276, + "grad_norm": 0.2845879852561358, + "learning_rate": 7.697940567347956e-06, + "loss": 0.4864, + "step": 7788 + }, + { + "epoch": 1.279165725781619, + "grad_norm": 0.28163700973601685, + "learning_rate": 7.697610608238338e-06, + "loss": 0.4703, + "step": 7789 + }, + { + "epoch": 1.2793299529899615, + "grad_norm": 0.30036853676830083, + "learning_rate": 7.697280614399523e-06, + "loss": 0.4867, + "step": 7790 + }, + { + "epoch": 1.2794941801983044, + "grad_norm": 0.28760448854099513, + "learning_rate": 7.696950585835094e-06, + "loss": 0.4448, + "step": 7791 + }, + { + "epoch": 1.279658407406647, + "grad_norm": 0.33460690891254224, + "learning_rate": 7.696620522548638e-06, + "loss": 0.4686, + "step": 7792 + }, + { + "epoch": 1.27982263461499, + "grad_norm": 0.27290421235975976, + "learning_rate": 7.696290424543737e-06, + "loss": 0.4735, + "step": 7793 + }, + { + "epoch": 1.2799868618233325, + "grad_norm": 0.2896345427090911, + "learning_rate": 7.695960291823978e-06, + "loss": 0.4585, + "step": 7794 + }, + { + "epoch": 1.2801510890316754, + "grad_norm": 0.2884151664673828, + "learning_rate": 7.695630124392945e-06, + "loss": 0.459, + "step": 7795 + }, + { + "epoch": 1.280315316240018, + "grad_norm": 0.29439858268755925, + "learning_rate": 7.695299922254224e-06, + "loss": 0.4635, + "step": 7796 + }, + { + "epoch": 1.2804795434483607, + "grad_norm": 0.27699445691313457, + "learning_rate": 7.694969685411404e-06, + "loss": 0.484, + "step": 7797 + }, + { + "epoch": 1.2806437706567035, + "grad_norm": 0.24163729471658402, + "learning_rate": 7.694639413868068e-06, + "loss": 0.4605, + "step": 7798 + }, + { + "epoch": 1.2808079978650464, + "grad_norm": 0.2896967081115697, + "learning_rate": 7.694309107627806e-06, + "loss": 0.4856, + "step": 7799 + }, + { + "epoch": 1.280972225073389, + "grad_norm": 0.299584030087436, + "learning_rate": 7.693978766694204e-06, + "loss": 0.452, + "step": 7800 + }, + { + "epoch": 1.2811364522817317, + "grad_norm": 0.3385113952854699, + "learning_rate": 7.693648391070851e-06, + "loss": 0.4516, + "step": 7801 + }, + { + "epoch": 1.2813006794900745, + "grad_norm": 0.2772793930227556, + "learning_rate": 7.693317980761334e-06, + "loss": 0.4786, + "step": 7802 + }, + { + "epoch": 1.2814649066984174, + "grad_norm": 0.3340403620740822, + "learning_rate": 7.69298753576924e-06, + "loss": 0.485, + "step": 7803 + }, + { + "epoch": 1.28162913390676, + "grad_norm": 0.3315139362421689, + "learning_rate": 7.692657056098163e-06, + "loss": 0.4865, + "step": 7804 + }, + { + "epoch": 1.2817933611151027, + "grad_norm": 0.36606393979166196, + "learning_rate": 7.692326541751687e-06, + "loss": 0.4633, + "step": 7805 + }, + { + "epoch": 1.2819575883234455, + "grad_norm": 0.2958031648909456, + "learning_rate": 7.691995992733404e-06, + "loss": 0.4859, + "step": 7806 + }, + { + "epoch": 1.2821218155317882, + "grad_norm": 0.29278420815146317, + "learning_rate": 7.691665409046905e-06, + "loss": 0.4689, + "step": 7807 + }, + { + "epoch": 1.282286042740131, + "grad_norm": 0.26629433728377444, + "learning_rate": 7.69133479069578e-06, + "loss": 0.4652, + "step": 7808 + }, + { + "epoch": 1.2824502699484737, + "grad_norm": 0.3085504664475122, + "learning_rate": 7.691004137683617e-06, + "loss": 0.4806, + "step": 7809 + }, + { + "epoch": 1.2826144971568165, + "grad_norm": 0.31694594797826553, + "learning_rate": 7.69067345001401e-06, + "loss": 0.4751, + "step": 7810 + }, + { + "epoch": 1.2827787243651592, + "grad_norm": 0.31604221751382056, + "learning_rate": 7.690342727690553e-06, + "loss": 0.4662, + "step": 7811 + }, + { + "epoch": 1.282942951573502, + "grad_norm": 0.3576539317488584, + "learning_rate": 7.690011970716833e-06, + "loss": 0.483, + "step": 7812 + }, + { + "epoch": 1.2831071787818447, + "grad_norm": 0.26640826289893155, + "learning_rate": 7.689681179096443e-06, + "loss": 0.4585, + "step": 7813 + }, + { + "epoch": 1.2832714059901873, + "grad_norm": 0.24764426877232484, + "learning_rate": 7.68935035283298e-06, + "loss": 0.4626, + "step": 7814 + }, + { + "epoch": 1.2834356331985302, + "grad_norm": 0.31402171978360294, + "learning_rate": 7.689019491930033e-06, + "loss": 0.4765, + "step": 7815 + }, + { + "epoch": 1.283599860406873, + "grad_norm": 0.33472276789419364, + "learning_rate": 7.688688596391197e-06, + "loss": 0.4848, + "step": 7816 + }, + { + "epoch": 1.2837640876152157, + "grad_norm": 0.31467631849619093, + "learning_rate": 7.688357666220065e-06, + "loss": 0.4696, + "step": 7817 + }, + { + "epoch": 1.2839283148235583, + "grad_norm": 0.5620271778383376, + "learning_rate": 7.688026701420233e-06, + "loss": 0.4873, + "step": 7818 + }, + { + "epoch": 1.2840925420319012, + "grad_norm": 0.30644020342381995, + "learning_rate": 7.687695701995295e-06, + "loss": 0.5001, + "step": 7819 + }, + { + "epoch": 1.284256769240244, + "grad_norm": 0.31653718755727756, + "learning_rate": 7.687364667948842e-06, + "loss": 0.4556, + "step": 7820 + }, + { + "epoch": 1.2844209964485866, + "grad_norm": 0.35087942099619734, + "learning_rate": 7.687033599284475e-06, + "loss": 0.4497, + "step": 7821 + }, + { + "epoch": 1.2845852236569293, + "grad_norm": 0.28672157890856287, + "learning_rate": 7.686702496005788e-06, + "loss": 0.4981, + "step": 7822 + }, + { + "epoch": 1.2847494508652721, + "grad_norm": 0.3079510574279276, + "learning_rate": 7.686371358116374e-06, + "loss": 0.4902, + "step": 7823 + }, + { + "epoch": 1.2849136780736148, + "grad_norm": 0.44588293878026053, + "learning_rate": 7.686040185619835e-06, + "loss": 0.46, + "step": 7824 + }, + { + "epoch": 1.2850779052819576, + "grad_norm": 0.872447022198342, + "learning_rate": 7.685708978519764e-06, + "loss": 0.459, + "step": 7825 + }, + { + "epoch": 1.2852421324903003, + "grad_norm": 0.3303561048803215, + "learning_rate": 7.68537773681976e-06, + "loss": 0.4975, + "step": 7826 + }, + { + "epoch": 1.2854063596986431, + "grad_norm": 0.26456162856250737, + "learning_rate": 7.685046460523419e-06, + "loss": 0.4888, + "step": 7827 + }, + { + "epoch": 1.2855705869069858, + "grad_norm": 0.2816527615338236, + "learning_rate": 7.684715149634339e-06, + "loss": 0.4782, + "step": 7828 + }, + { + "epoch": 1.2857348141153286, + "grad_norm": 0.3147258571690476, + "learning_rate": 7.68438380415612e-06, + "loss": 0.4844, + "step": 7829 + }, + { + "epoch": 1.2858990413236713, + "grad_norm": 0.27745651039854247, + "learning_rate": 7.68405242409236e-06, + "loss": 0.4556, + "step": 7830 + }, + { + "epoch": 1.286063268532014, + "grad_norm": 0.30628636649669544, + "learning_rate": 7.683721009446657e-06, + "loss": 0.4868, + "step": 7831 + }, + { + "epoch": 1.2862274957403568, + "grad_norm": 0.42664379936326835, + "learning_rate": 7.683389560222612e-06, + "loss": 0.4784, + "step": 7832 + }, + { + "epoch": 1.2863917229486996, + "grad_norm": 0.2680221112365354, + "learning_rate": 7.683058076423825e-06, + "loss": 0.4547, + "step": 7833 + }, + { + "epoch": 1.2865559501570423, + "grad_norm": 0.28850192993663787, + "learning_rate": 7.682726558053896e-06, + "loss": 0.4816, + "step": 7834 + }, + { + "epoch": 1.286720177365385, + "grad_norm": 0.3826116769850467, + "learning_rate": 7.682395005116424e-06, + "loss": 0.4604, + "step": 7835 + }, + { + "epoch": 1.2868844045737278, + "grad_norm": 0.3269271472240874, + "learning_rate": 7.682063417615011e-06, + "loss": 0.4856, + "step": 7836 + }, + { + "epoch": 1.2870486317820706, + "grad_norm": 0.2903246693555103, + "learning_rate": 7.681731795553259e-06, + "loss": 0.4717, + "step": 7837 + }, + { + "epoch": 1.2872128589904133, + "grad_norm": 0.40635716934363103, + "learning_rate": 7.681400138934768e-06, + "loss": 0.4641, + "step": 7838 + }, + { + "epoch": 1.287377086198756, + "grad_norm": 2.2006250297078624, + "learning_rate": 7.681068447763143e-06, + "loss": 0.4772, + "step": 7839 + }, + { + "epoch": 1.2875413134070988, + "grad_norm": 0.25645959340248886, + "learning_rate": 7.680736722041985e-06, + "loss": 0.4555, + "step": 7840 + }, + { + "epoch": 1.2877055406154414, + "grad_norm": 0.29428794743469366, + "learning_rate": 7.680404961774898e-06, + "loss": 0.4554, + "step": 7841 + }, + { + "epoch": 1.2878697678237843, + "grad_norm": 0.504873026867374, + "learning_rate": 7.680073166965482e-06, + "loss": 0.4737, + "step": 7842 + }, + { + "epoch": 1.288033995032127, + "grad_norm": 0.3206722968962623, + "learning_rate": 7.679741337617344e-06, + "loss": 0.4555, + "step": 7843 + }, + { + "epoch": 1.2881982222404698, + "grad_norm": 0.31375416139386475, + "learning_rate": 7.679409473734085e-06, + "loss": 0.4731, + "step": 7844 + }, + { + "epoch": 1.2883624494488124, + "grad_norm": 0.32465719686723155, + "learning_rate": 7.67907757531931e-06, + "loss": 0.4608, + "step": 7845 + }, + { + "epoch": 1.2885266766571553, + "grad_norm": 0.2873096643272258, + "learning_rate": 7.678745642376627e-06, + "loss": 0.4706, + "step": 7846 + }, + { + "epoch": 1.288690903865498, + "grad_norm": 0.28816537852812213, + "learning_rate": 7.67841367490964e-06, + "loss": 0.4673, + "step": 7847 + }, + { + "epoch": 1.2888551310738405, + "grad_norm": 0.27675067172937057, + "learning_rate": 7.67808167292195e-06, + "loss": 0.4654, + "step": 7848 + }, + { + "epoch": 1.2890193582821834, + "grad_norm": 0.3456711175077695, + "learning_rate": 7.67774963641717e-06, + "loss": 0.4813, + "step": 7849 + }, + { + "epoch": 1.2891835854905263, + "grad_norm": 0.3354036100274437, + "learning_rate": 7.677417565398899e-06, + "loss": 0.4839, + "step": 7850 + }, + { + "epoch": 1.289347812698869, + "grad_norm": 0.6324881168177896, + "learning_rate": 7.677085459870749e-06, + "loss": 0.4587, + "step": 7851 + }, + { + "epoch": 1.2895120399072115, + "grad_norm": 0.3504917226050069, + "learning_rate": 7.676753319836324e-06, + "loss": 0.4733, + "step": 7852 + }, + { + "epoch": 1.2896762671155544, + "grad_norm": 0.44980333945469475, + "learning_rate": 7.676421145299233e-06, + "loss": 0.4676, + "step": 7853 + }, + { + "epoch": 1.2898404943238972, + "grad_norm": 0.5372650657765722, + "learning_rate": 7.676088936263084e-06, + "loss": 0.4819, + "step": 7854 + }, + { + "epoch": 1.2900047215322399, + "grad_norm": 0.3343313623537713, + "learning_rate": 7.675756692731483e-06, + "loss": 0.4794, + "step": 7855 + }, + { + "epoch": 1.2901689487405825, + "grad_norm": 0.3946859710241651, + "learning_rate": 7.67542441470804e-06, + "loss": 0.476, + "step": 7856 + }, + { + "epoch": 1.2903331759489254, + "grad_norm": 0.2813570879176587, + "learning_rate": 7.675092102196365e-06, + "loss": 0.4871, + "step": 7857 + }, + { + "epoch": 1.290497403157268, + "grad_norm": 0.3379571926965503, + "learning_rate": 7.674759755200064e-06, + "loss": 0.4802, + "step": 7858 + }, + { + "epoch": 1.2906616303656109, + "grad_norm": 0.44337435809416115, + "learning_rate": 7.67442737372275e-06, + "loss": 0.5086, + "step": 7859 + }, + { + "epoch": 1.2908258575739535, + "grad_norm": 0.2620657068472747, + "learning_rate": 7.674094957768031e-06, + "loss": 0.4507, + "step": 7860 + }, + { + "epoch": 1.2909900847822964, + "grad_norm": 0.39448810420551583, + "learning_rate": 7.673762507339517e-06, + "loss": 0.4757, + "step": 7861 + }, + { + "epoch": 1.291154311990639, + "grad_norm": 0.27602612398995907, + "learning_rate": 7.67343002244082e-06, + "loss": 0.4791, + "step": 7862 + }, + { + "epoch": 1.2913185391989819, + "grad_norm": 0.30145053047947384, + "learning_rate": 7.67309750307555e-06, + "loss": 0.4637, + "step": 7863 + }, + { + "epoch": 1.2914827664073245, + "grad_norm": 0.28617067481182934, + "learning_rate": 7.672764949247322e-06, + "loss": 0.4892, + "step": 7864 + }, + { + "epoch": 1.2916469936156671, + "grad_norm": 0.30999809485905655, + "learning_rate": 7.672432360959743e-06, + "loss": 0.5012, + "step": 7865 + }, + { + "epoch": 1.29181122082401, + "grad_norm": 0.32598640649592947, + "learning_rate": 7.672099738216427e-06, + "loss": 0.4727, + "step": 7866 + }, + { + "epoch": 1.2919754480323529, + "grad_norm": 0.4536033430129751, + "learning_rate": 7.671767081020988e-06, + "loss": 0.4617, + "step": 7867 + }, + { + "epoch": 1.2921396752406955, + "grad_norm": 0.2727844305112922, + "learning_rate": 7.671434389377038e-06, + "loss": 0.4555, + "step": 7868 + }, + { + "epoch": 1.2923039024490381, + "grad_norm": 0.3128387369899632, + "learning_rate": 7.67110166328819e-06, + "loss": 0.4667, + "step": 7869 + }, + { + "epoch": 1.292468129657381, + "grad_norm": 0.314249956724488, + "learning_rate": 7.670768902758058e-06, + "loss": 0.4856, + "step": 7870 + }, + { + "epoch": 1.2926323568657239, + "grad_norm": 0.4280519629179726, + "learning_rate": 7.670436107790254e-06, + "loss": 0.4689, + "step": 7871 + }, + { + "epoch": 1.2927965840740665, + "grad_norm": 0.289677597791496, + "learning_rate": 7.670103278388398e-06, + "loss": 0.4686, + "step": 7872 + }, + { + "epoch": 1.2929608112824091, + "grad_norm": 0.2946877863022075, + "learning_rate": 7.6697704145561e-06, + "loss": 0.5096, + "step": 7873 + }, + { + "epoch": 1.293125038490752, + "grad_norm": 0.2894735076390649, + "learning_rate": 7.669437516296976e-06, + "loss": 0.4614, + "step": 7874 + }, + { + "epoch": 1.2932892656990946, + "grad_norm": 0.3028986464007853, + "learning_rate": 7.669104583614642e-06, + "loss": 0.4541, + "step": 7875 + }, + { + "epoch": 1.2934534929074375, + "grad_norm": 0.25582922737852953, + "learning_rate": 7.668771616512716e-06, + "loss": 0.4725, + "step": 7876 + }, + { + "epoch": 1.2936177201157801, + "grad_norm": 0.33755488887573115, + "learning_rate": 7.668438614994812e-06, + "loss": 0.4787, + "step": 7877 + }, + { + "epoch": 1.293781947324123, + "grad_norm": 0.32217811133152874, + "learning_rate": 7.668105579064546e-06, + "loss": 0.4938, + "step": 7878 + }, + { + "epoch": 1.2939461745324656, + "grad_norm": 0.30142798654133546, + "learning_rate": 7.667772508725538e-06, + "loss": 0.4816, + "step": 7879 + }, + { + "epoch": 1.2941104017408085, + "grad_norm": 0.26134890830428037, + "learning_rate": 7.667439403981402e-06, + "loss": 0.4805, + "step": 7880 + }, + { + "epoch": 1.2942746289491511, + "grad_norm": 0.2859171992401748, + "learning_rate": 7.66710626483576e-06, + "loss": 0.4585, + "step": 7881 + }, + { + "epoch": 1.2944388561574938, + "grad_norm": 0.27490980788645614, + "learning_rate": 7.666773091292227e-06, + "loss": 0.4662, + "step": 7882 + }, + { + "epoch": 1.2946030833658366, + "grad_norm": 0.29612051837616454, + "learning_rate": 7.666439883354421e-06, + "loss": 0.4681, + "step": 7883 + }, + { + "epoch": 1.2947673105741795, + "grad_norm": 0.3558706635350218, + "learning_rate": 7.666106641025965e-06, + "loss": 0.4668, + "step": 7884 + }, + { + "epoch": 1.2949315377825221, + "grad_norm": 0.29081743807681926, + "learning_rate": 7.665773364310476e-06, + "loss": 0.4673, + "step": 7885 + }, + { + "epoch": 1.2950957649908648, + "grad_norm": 0.36587349094750693, + "learning_rate": 7.665440053211571e-06, + "loss": 0.472, + "step": 7886 + }, + { + "epoch": 1.2952599921992076, + "grad_norm": 0.3846805506898003, + "learning_rate": 7.665106707732875e-06, + "loss": 0.478, + "step": 7887 + }, + { + "epoch": 1.2954242194075505, + "grad_norm": 0.4763302799103845, + "learning_rate": 7.664773327878005e-06, + "loss": 0.4822, + "step": 7888 + }, + { + "epoch": 1.2955884466158931, + "grad_norm": 0.2659489183122764, + "learning_rate": 7.664439913650583e-06, + "loss": 0.4482, + "step": 7889 + }, + { + "epoch": 1.2957526738242358, + "grad_norm": 0.3211381900041531, + "learning_rate": 7.66410646505423e-06, + "loss": 0.4634, + "step": 7890 + }, + { + "epoch": 1.2959169010325786, + "grad_norm": 0.3289264312374959, + "learning_rate": 7.663772982092569e-06, + "loss": 0.4912, + "step": 7891 + }, + { + "epoch": 1.2960811282409213, + "grad_norm": 0.2874286485731477, + "learning_rate": 7.66343946476922e-06, + "loss": 0.4758, + "step": 7892 + }, + { + "epoch": 1.2962453554492641, + "grad_norm": 0.29145863615207435, + "learning_rate": 7.663105913087804e-06, + "loss": 0.4948, + "step": 7893 + }, + { + "epoch": 1.2964095826576068, + "grad_norm": 0.3151056218253248, + "learning_rate": 7.662772327051947e-06, + "loss": 0.4864, + "step": 7894 + }, + { + "epoch": 1.2965738098659496, + "grad_norm": 0.5383538039811437, + "learning_rate": 7.662438706665272e-06, + "loss": 0.4827, + "step": 7895 + }, + { + "epoch": 1.2967380370742922, + "grad_norm": 0.2904499074975075, + "learning_rate": 7.662105051931401e-06, + "loss": 0.4577, + "step": 7896 + }, + { + "epoch": 1.296902264282635, + "grad_norm": 0.3404388178106021, + "learning_rate": 7.661771362853958e-06, + "loss": 0.4973, + "step": 7897 + }, + { + "epoch": 1.2970664914909777, + "grad_norm": 0.28420348043237087, + "learning_rate": 7.661437639436565e-06, + "loss": 0.4746, + "step": 7898 + }, + { + "epoch": 1.2972307186993204, + "grad_norm": 0.31300648105957757, + "learning_rate": 7.661103881682851e-06, + "loss": 0.4718, + "step": 7899 + }, + { + "epoch": 1.2973949459076632, + "grad_norm": 0.2617389825483887, + "learning_rate": 7.660770089596437e-06, + "loss": 0.5036, + "step": 7900 + }, + { + "epoch": 1.297559173116006, + "grad_norm": 0.3558329374409586, + "learning_rate": 7.660436263180954e-06, + "loss": 0.4704, + "step": 7901 + }, + { + "epoch": 1.2977234003243487, + "grad_norm": 0.3296553098157969, + "learning_rate": 7.66010240244002e-06, + "loss": 0.4836, + "step": 7902 + }, + { + "epoch": 1.2978876275326914, + "grad_norm": 0.40087534912194606, + "learning_rate": 7.659768507377265e-06, + "loss": 0.4807, + "step": 7903 + }, + { + "epoch": 1.2980518547410342, + "grad_norm": 0.29088669339411083, + "learning_rate": 7.659434577996318e-06, + "loss": 0.4739, + "step": 7904 + }, + { + "epoch": 1.298216081949377, + "grad_norm": 0.2990246246306228, + "learning_rate": 7.659100614300798e-06, + "loss": 0.4529, + "step": 7905 + }, + { + "epoch": 1.2983803091577197, + "grad_norm": 0.29901740330002713, + "learning_rate": 7.658766616294343e-06, + "loss": 0.4725, + "step": 7906 + }, + { + "epoch": 1.2985445363660624, + "grad_norm": 0.30059464854419815, + "learning_rate": 7.65843258398057e-06, + "loss": 0.4856, + "step": 7907 + }, + { + "epoch": 1.2987087635744052, + "grad_norm": 0.6435372599150829, + "learning_rate": 7.658098517363115e-06, + "loss": 0.4717, + "step": 7908 + }, + { + "epoch": 1.2988729907827479, + "grad_norm": 0.3214157341766446, + "learning_rate": 7.657764416445601e-06, + "loss": 0.4679, + "step": 7909 + }, + { + "epoch": 1.2990372179910907, + "grad_norm": 0.324097294636231, + "learning_rate": 7.65743028123166e-06, + "loss": 0.458, + "step": 7910 + }, + { + "epoch": 1.2992014451994334, + "grad_norm": 0.3110100490491375, + "learning_rate": 7.657096111724917e-06, + "loss": 0.46, + "step": 7911 + }, + { + "epoch": 1.2993656724077762, + "grad_norm": 0.3124904806207106, + "learning_rate": 7.656761907929006e-06, + "loss": 0.4666, + "step": 7912 + }, + { + "epoch": 1.2995298996161189, + "grad_norm": 0.29953706182151885, + "learning_rate": 7.656427669847557e-06, + "loss": 0.5017, + "step": 7913 + }, + { + "epoch": 1.2996941268244617, + "grad_norm": 0.3084623975455468, + "learning_rate": 7.656093397484195e-06, + "loss": 0.4756, + "step": 7914 + }, + { + "epoch": 1.2998583540328044, + "grad_norm": 0.2995223537477973, + "learning_rate": 7.655759090842554e-06, + "loss": 0.4865, + "step": 7915 + }, + { + "epoch": 1.300022581241147, + "grad_norm": 0.34171160928676775, + "learning_rate": 7.655424749926265e-06, + "loss": 0.4924, + "step": 7916 + }, + { + "epoch": 1.3001868084494899, + "grad_norm": 0.37593951569139283, + "learning_rate": 7.655090374738958e-06, + "loss": 0.4758, + "step": 7917 + }, + { + "epoch": 1.3003510356578327, + "grad_norm": 0.27259654106749204, + "learning_rate": 7.654755965284266e-06, + "loss": 0.4855, + "step": 7918 + }, + { + "epoch": 1.3005152628661754, + "grad_norm": 0.26982803154783935, + "learning_rate": 7.65442152156582e-06, + "loss": 0.4638, + "step": 7919 + }, + { + "epoch": 1.300679490074518, + "grad_norm": 0.3358575968821194, + "learning_rate": 7.654087043587253e-06, + "loss": 0.4822, + "step": 7920 + }, + { + "epoch": 1.3008437172828609, + "grad_norm": 0.3025242716671943, + "learning_rate": 7.653752531352197e-06, + "loss": 0.4854, + "step": 7921 + }, + { + "epoch": 1.3010079444912037, + "grad_norm": 0.31977526421106844, + "learning_rate": 7.653417984864286e-06, + "loss": 0.4781, + "step": 7922 + }, + { + "epoch": 1.3011721716995464, + "grad_norm": 0.4255861102576953, + "learning_rate": 7.653083404127154e-06, + "loss": 0.4695, + "step": 7923 + }, + { + "epoch": 1.301336398907889, + "grad_norm": 0.32376807035451516, + "learning_rate": 7.652748789144432e-06, + "loss": 0.4724, + "step": 7924 + }, + { + "epoch": 1.3015006261162319, + "grad_norm": 0.2621931897315843, + "learning_rate": 7.652414139919758e-06, + "loss": 0.483, + "step": 7925 + }, + { + "epoch": 1.3016648533245745, + "grad_norm": 0.32369868128194706, + "learning_rate": 7.652079456456765e-06, + "loss": 0.4788, + "step": 7926 + }, + { + "epoch": 1.3018290805329173, + "grad_norm": 0.29915004458420025, + "learning_rate": 7.651744738759086e-06, + "loss": 0.4656, + "step": 7927 + }, + { + "epoch": 1.30199330774126, + "grad_norm": 0.3552626648407864, + "learning_rate": 7.65140998683036e-06, + "loss": 0.4942, + "step": 7928 + }, + { + "epoch": 1.3021575349496028, + "grad_norm": 0.28752600760195246, + "learning_rate": 7.65107520067422e-06, + "loss": 0.4736, + "step": 7929 + }, + { + "epoch": 1.3023217621579455, + "grad_norm": 0.3278934170298695, + "learning_rate": 7.650740380294304e-06, + "loss": 0.4824, + "step": 7930 + }, + { + "epoch": 1.3024859893662883, + "grad_norm": 0.29068672152221015, + "learning_rate": 7.650405525694247e-06, + "loss": 0.4952, + "step": 7931 + }, + { + "epoch": 1.302650216574631, + "grad_norm": 0.36659481274056716, + "learning_rate": 7.650070636877686e-06, + "loss": 0.4905, + "step": 7932 + }, + { + "epoch": 1.3028144437829736, + "grad_norm": 0.318759886416517, + "learning_rate": 7.64973571384826e-06, + "loss": 0.4993, + "step": 7933 + }, + { + "epoch": 1.3029786709913165, + "grad_norm": 0.4421733290213734, + "learning_rate": 7.649400756609603e-06, + "loss": 0.4704, + "step": 7934 + }, + { + "epoch": 1.3031428981996593, + "grad_norm": 0.4854735090770185, + "learning_rate": 7.649065765165356e-06, + "loss": 0.4819, + "step": 7935 + }, + { + "epoch": 1.303307125408002, + "grad_norm": 0.3538652860607975, + "learning_rate": 7.648730739519159e-06, + "loss": 0.4599, + "step": 7936 + }, + { + "epoch": 1.3034713526163446, + "grad_norm": 0.2711419157816641, + "learning_rate": 7.648395679674645e-06, + "loss": 0.4653, + "step": 7937 + }, + { + "epoch": 1.3036355798246875, + "grad_norm": 0.28791299089951694, + "learning_rate": 7.648060585635457e-06, + "loss": 0.467, + "step": 7938 + }, + { + "epoch": 1.3037998070330303, + "grad_norm": 0.3147023306380566, + "learning_rate": 7.647725457405235e-06, + "loss": 0.4785, + "step": 7939 + }, + { + "epoch": 1.303964034241373, + "grad_norm": 0.33694405249481874, + "learning_rate": 7.647390294987618e-06, + "loss": 0.4759, + "step": 7940 + }, + { + "epoch": 1.3041282614497156, + "grad_norm": 0.3704706312364689, + "learning_rate": 7.647055098386243e-06, + "loss": 0.4731, + "step": 7941 + }, + { + "epoch": 1.3042924886580585, + "grad_norm": 0.2981416710176911, + "learning_rate": 7.646719867604756e-06, + "loss": 0.4727, + "step": 7942 + }, + { + "epoch": 1.304456715866401, + "grad_norm": 0.4236048668542745, + "learning_rate": 7.646384602646794e-06, + "loss": 0.491, + "step": 7943 + }, + { + "epoch": 1.304620943074744, + "grad_norm": 0.25920675415941064, + "learning_rate": 7.646049303516001e-06, + "loss": 0.4647, + "step": 7944 + }, + { + "epoch": 1.3047851702830866, + "grad_norm": 0.37288961353740263, + "learning_rate": 7.645713970216015e-06, + "loss": 0.4729, + "step": 7945 + }, + { + "epoch": 1.3049493974914295, + "grad_norm": 0.30280530467800054, + "learning_rate": 7.645378602750481e-06, + "loss": 0.4951, + "step": 7946 + }, + { + "epoch": 1.305113624699772, + "grad_norm": 0.32637097512312524, + "learning_rate": 7.645043201123042e-06, + "loss": 0.4863, + "step": 7947 + }, + { + "epoch": 1.305277851908115, + "grad_norm": 0.46033399936430586, + "learning_rate": 7.64470776533734e-06, + "loss": 0.4814, + "step": 7948 + }, + { + "epoch": 1.3054420791164576, + "grad_norm": 0.31590207279548094, + "learning_rate": 7.644372295397015e-06, + "loss": 0.4821, + "step": 7949 + }, + { + "epoch": 1.3056063063248002, + "grad_norm": 0.31853698788592905, + "learning_rate": 7.644036791305715e-06, + "loss": 0.4544, + "step": 7950 + }, + { + "epoch": 1.305770533533143, + "grad_norm": 0.28684806650613004, + "learning_rate": 7.643701253067082e-06, + "loss": 0.4912, + "step": 7951 + }, + { + "epoch": 1.305934760741486, + "grad_norm": 0.43154634307997536, + "learning_rate": 7.64336568068476e-06, + "loss": 0.4867, + "step": 7952 + }, + { + "epoch": 1.3060989879498286, + "grad_norm": 0.3614722309268757, + "learning_rate": 7.643030074162395e-06, + "loss": 0.4644, + "step": 7953 + }, + { + "epoch": 1.3062632151581712, + "grad_norm": 0.2847346531183879, + "learning_rate": 7.64269443350363e-06, + "loss": 0.4463, + "step": 7954 + }, + { + "epoch": 1.306427442366514, + "grad_norm": 0.33721951652964166, + "learning_rate": 7.642358758712112e-06, + "loss": 0.4786, + "step": 7955 + }, + { + "epoch": 1.306591669574857, + "grad_norm": 0.29914397820153493, + "learning_rate": 7.642023049791485e-06, + "loss": 0.4622, + "step": 7956 + }, + { + "epoch": 1.3067558967831996, + "grad_norm": 0.2687785996804088, + "learning_rate": 7.641687306745399e-06, + "loss": 0.4709, + "step": 7957 + }, + { + "epoch": 1.3069201239915422, + "grad_norm": 0.31963062378427437, + "learning_rate": 7.641351529577494e-06, + "loss": 0.468, + "step": 7958 + }, + { + "epoch": 1.307084351199885, + "grad_norm": 0.545023022599308, + "learning_rate": 7.641015718291425e-06, + "loss": 0.4806, + "step": 7959 + }, + { + "epoch": 1.3072485784082277, + "grad_norm": 0.395362345706082, + "learning_rate": 7.640679872890832e-06, + "loss": 0.4628, + "step": 7960 + }, + { + "epoch": 1.3074128056165706, + "grad_norm": 0.466981966839366, + "learning_rate": 7.640343993379368e-06, + "loss": 0.4682, + "step": 7961 + }, + { + "epoch": 1.3075770328249132, + "grad_norm": 0.28939268969208437, + "learning_rate": 7.640008079760676e-06, + "loss": 0.4592, + "step": 7962 + }, + { + "epoch": 1.307741260033256, + "grad_norm": 0.33538113413961285, + "learning_rate": 7.639672132038407e-06, + "loss": 0.4806, + "step": 7963 + }, + { + "epoch": 1.3079054872415987, + "grad_norm": 0.32099795596270486, + "learning_rate": 7.639336150216211e-06, + "loss": 0.482, + "step": 7964 + }, + { + "epoch": 1.3080697144499416, + "grad_norm": 0.553802750895834, + "learning_rate": 7.639000134297735e-06, + "loss": 0.4876, + "step": 7965 + }, + { + "epoch": 1.3082339416582842, + "grad_norm": 0.34601641793654353, + "learning_rate": 7.638664084286629e-06, + "loss": 0.4836, + "step": 7966 + }, + { + "epoch": 1.3083981688666269, + "grad_norm": 0.2989365043791791, + "learning_rate": 7.638328000186545e-06, + "loss": 0.4683, + "step": 7967 + }, + { + "epoch": 1.3085623960749697, + "grad_norm": 0.3029078532588946, + "learning_rate": 7.63799188200113e-06, + "loss": 0.4701, + "step": 7968 + }, + { + "epoch": 1.3087266232833126, + "grad_norm": 0.4104382751629648, + "learning_rate": 7.637655729734036e-06, + "loss": 0.4682, + "step": 7969 + }, + { + "epoch": 1.3088908504916552, + "grad_norm": 0.3401069351664016, + "learning_rate": 7.637319543388913e-06, + "loss": 0.4816, + "step": 7970 + }, + { + "epoch": 1.3090550776999978, + "grad_norm": 0.36051234077198613, + "learning_rate": 7.63698332296941e-06, + "loss": 0.4704, + "step": 7971 + }, + { + "epoch": 1.3092193049083407, + "grad_norm": 0.2680635234976543, + "learning_rate": 7.636647068479188e-06, + "loss": 0.4862, + "step": 7972 + }, + { + "epoch": 1.3093835321166836, + "grad_norm": 0.35280649807851294, + "learning_rate": 7.636310779921889e-06, + "loss": 0.4844, + "step": 7973 + }, + { + "epoch": 1.3095477593250262, + "grad_norm": 0.3241893700421079, + "learning_rate": 7.63597445730117e-06, + "loss": 0.482, + "step": 7974 + }, + { + "epoch": 1.3097119865333688, + "grad_norm": 0.25658079785257215, + "learning_rate": 7.635638100620683e-06, + "loss": 0.4656, + "step": 7975 + }, + { + "epoch": 1.3098762137417117, + "grad_norm": 0.340790886688055, + "learning_rate": 7.63530170988408e-06, + "loss": 0.4806, + "step": 7976 + }, + { + "epoch": 1.3100404409500543, + "grad_norm": 0.29263625469920773, + "learning_rate": 7.634965285095018e-06, + "loss": 0.4735, + "step": 7977 + }, + { + "epoch": 1.3102046681583972, + "grad_norm": 0.31652479596424954, + "learning_rate": 7.634628826257148e-06, + "loss": 0.4759, + "step": 7978 + }, + { + "epoch": 1.3103688953667398, + "grad_norm": 0.3299859845201978, + "learning_rate": 7.634292333374123e-06, + "loss": 0.4708, + "step": 7979 + }, + { + "epoch": 1.3105331225750827, + "grad_norm": 0.272066769077068, + "learning_rate": 7.633955806449603e-06, + "loss": 0.4941, + "step": 7980 + }, + { + "epoch": 1.3106973497834253, + "grad_norm": 0.31916471197677193, + "learning_rate": 7.633619245487237e-06, + "loss": 0.4492, + "step": 7981 + }, + { + "epoch": 1.3108615769917682, + "grad_norm": 0.41008229525586465, + "learning_rate": 7.633282650490684e-06, + "loss": 0.4817, + "step": 7982 + }, + { + "epoch": 1.3110258042001108, + "grad_norm": 0.46510580333747115, + "learning_rate": 7.632946021463597e-06, + "loss": 0.4693, + "step": 7983 + }, + { + "epoch": 1.3111900314084535, + "grad_norm": 0.26202627385773686, + "learning_rate": 7.632609358409637e-06, + "loss": 0.4582, + "step": 7984 + }, + { + "epoch": 1.3113542586167963, + "grad_norm": 0.5539807098934775, + "learning_rate": 7.632272661332454e-06, + "loss": 0.4755, + "step": 7985 + }, + { + "epoch": 1.3115184858251392, + "grad_norm": 0.28471144410474203, + "learning_rate": 7.63193593023571e-06, + "loss": 0.4628, + "step": 7986 + }, + { + "epoch": 1.3116827130334818, + "grad_norm": 0.2806760601317828, + "learning_rate": 7.631599165123058e-06, + "loss": 0.4716, + "step": 7987 + }, + { + "epoch": 1.3118469402418245, + "grad_norm": 0.3691693460679819, + "learning_rate": 7.631262365998161e-06, + "loss": 0.4798, + "step": 7988 + }, + { + "epoch": 1.3120111674501673, + "grad_norm": 0.2821735353713559, + "learning_rate": 7.630925532864672e-06, + "loss": 0.4818, + "step": 7989 + }, + { + "epoch": 1.3121753946585102, + "grad_norm": 0.34444638448337844, + "learning_rate": 7.630588665726253e-06, + "loss": 0.4925, + "step": 7990 + }, + { + "epoch": 1.3123396218668528, + "grad_norm": 0.33577779366945215, + "learning_rate": 7.63025176458656e-06, + "loss": 0.4806, + "step": 7991 + }, + { + "epoch": 1.3125038490751955, + "grad_norm": 0.30897078274720075, + "learning_rate": 7.629914829449253e-06, + "loss": 0.4806, + "step": 7992 + }, + { + "epoch": 1.3126680762835383, + "grad_norm": 0.390373369768928, + "learning_rate": 7.629577860317991e-06, + "loss": 0.5017, + "step": 7993 + }, + { + "epoch": 1.312832303491881, + "grad_norm": 0.3872526842133121, + "learning_rate": 7.6292408571964354e-06, + "loss": 0.4827, + "step": 7994 + }, + { + "epoch": 1.3129965307002238, + "grad_norm": 0.32955819048590035, + "learning_rate": 7.6289038200882445e-06, + "loss": 0.4687, + "step": 7995 + }, + { + "epoch": 1.3131607579085665, + "grad_norm": 0.31306415443858815, + "learning_rate": 7.628566748997081e-06, + "loss": 0.4652, + "step": 7996 + }, + { + "epoch": 1.3133249851169093, + "grad_norm": 0.34561030406592913, + "learning_rate": 7.628229643926603e-06, + "loss": 0.475, + "step": 7997 + }, + { + "epoch": 1.313489212325252, + "grad_norm": 0.35034799088508667, + "learning_rate": 7.627892504880474e-06, + "loss": 0.4701, + "step": 7998 + }, + { + "epoch": 1.3136534395335948, + "grad_norm": 0.2711573129803233, + "learning_rate": 7.627555331862355e-06, + "loss": 0.4778, + "step": 7999 + }, + { + "epoch": 1.3138176667419375, + "grad_norm": 0.376753307536093, + "learning_rate": 7.627218124875908e-06, + "loss": 0.4861, + "step": 8000 + }, + { + "epoch": 1.31398189395028, + "grad_norm": 0.3478626411436554, + "learning_rate": 7.626880883924795e-06, + "loss": 0.4648, + "step": 8001 + }, + { + "epoch": 1.314146121158623, + "grad_norm": 0.29665293395553316, + "learning_rate": 7.62654360901268e-06, + "loss": 0.484, + "step": 8002 + }, + { + "epoch": 1.3143103483669658, + "grad_norm": 0.3763583038100201, + "learning_rate": 7.626206300143224e-06, + "loss": 0.4835, + "step": 8003 + }, + { + "epoch": 1.3144745755753084, + "grad_norm": 0.33950295810942993, + "learning_rate": 7.625868957320092e-06, + "loss": 0.4712, + "step": 8004 + }, + { + "epoch": 1.314638802783651, + "grad_norm": 0.2764303498324206, + "learning_rate": 7.62553158054695e-06, + "loss": 0.4672, + "step": 8005 + }, + { + "epoch": 1.314803029991994, + "grad_norm": 0.2943201514727324, + "learning_rate": 7.625194169827458e-06, + "loss": 0.488, + "step": 8006 + }, + { + "epoch": 1.3149672572003368, + "grad_norm": 0.3697002180213209, + "learning_rate": 7.6248567251652825e-06, + "loss": 0.4727, + "step": 8007 + }, + { + "epoch": 1.3151314844086794, + "grad_norm": 0.3757351595733627, + "learning_rate": 7.6245192465640885e-06, + "loss": 0.4865, + "step": 8008 + }, + { + "epoch": 1.315295711617022, + "grad_norm": 0.3738711600535506, + "learning_rate": 7.624181734027541e-06, + "loss": 0.4717, + "step": 8009 + }, + { + "epoch": 1.315459938825365, + "grad_norm": 0.28050608814768224, + "learning_rate": 7.623844187559308e-06, + "loss": 0.5125, + "step": 8010 + }, + { + "epoch": 1.3156241660337076, + "grad_norm": 0.2919003234877574, + "learning_rate": 7.623506607163052e-06, + "loss": 0.4539, + "step": 8011 + }, + { + "epoch": 1.3157883932420504, + "grad_norm": 0.3622981290018002, + "learning_rate": 7.6231689928424415e-06, + "loss": 0.4625, + "step": 8012 + }, + { + "epoch": 1.315952620450393, + "grad_norm": 0.3516883118340059, + "learning_rate": 7.622831344601143e-06, + "loss": 0.4845, + "step": 8013 + }, + { + "epoch": 1.316116847658736, + "grad_norm": 0.29660929220781496, + "learning_rate": 7.622493662442823e-06, + "loss": 0.4642, + "step": 8014 + }, + { + "epoch": 1.3162810748670786, + "grad_norm": 1.2368299556551743, + "learning_rate": 7.622155946371151e-06, + "loss": 0.4798, + "step": 8015 + }, + { + "epoch": 1.3164453020754214, + "grad_norm": 0.29717759444454805, + "learning_rate": 7.621818196389793e-06, + "loss": 0.4756, + "step": 8016 + }, + { + "epoch": 1.316609529283764, + "grad_norm": 0.3918627797621546, + "learning_rate": 7.621480412502418e-06, + "loss": 0.4788, + "step": 8017 + }, + { + "epoch": 1.3167737564921067, + "grad_norm": 0.29543561155814907, + "learning_rate": 7.621142594712694e-06, + "loss": 0.4981, + "step": 8018 + }, + { + "epoch": 1.3169379837004496, + "grad_norm": 0.37159328406059056, + "learning_rate": 7.620804743024291e-06, + "loss": 0.4948, + "step": 8019 + }, + { + "epoch": 1.3171022109087924, + "grad_norm": 0.2783048589916877, + "learning_rate": 7.620466857440879e-06, + "loss": 0.4717, + "step": 8020 + }, + { + "epoch": 1.317266438117135, + "grad_norm": 0.306567811369454, + "learning_rate": 7.620128937966125e-06, + "loss": 0.4724, + "step": 8021 + }, + { + "epoch": 1.3174306653254777, + "grad_norm": 0.29327978868608195, + "learning_rate": 7.619790984603702e-06, + "loss": 0.4633, + "step": 8022 + }, + { + "epoch": 1.3175948925338206, + "grad_norm": 0.3917006265643016, + "learning_rate": 7.61945299735728e-06, + "loss": 0.4604, + "step": 8023 + }, + { + "epoch": 1.3177591197421634, + "grad_norm": 0.2667837817112557, + "learning_rate": 7.619114976230528e-06, + "loss": 0.4541, + "step": 8024 + }, + { + "epoch": 1.317923346950506, + "grad_norm": 0.35010204950872126, + "learning_rate": 7.6187769212271194e-06, + "loss": 0.4935, + "step": 8025 + }, + { + "epoch": 1.3180875741588487, + "grad_norm": 0.29046284897377767, + "learning_rate": 7.618438832350725e-06, + "loss": 0.4803, + "step": 8026 + }, + { + "epoch": 1.3182518013671916, + "grad_norm": 0.33231245117304176, + "learning_rate": 7.618100709605017e-06, + "loss": 0.469, + "step": 8027 + }, + { + "epoch": 1.3184160285755342, + "grad_norm": 0.29343771355212916, + "learning_rate": 7.617762552993667e-06, + "loss": 0.4845, + "step": 8028 + }, + { + "epoch": 1.318580255783877, + "grad_norm": 0.3313127989483271, + "learning_rate": 7.617424362520349e-06, + "loss": 0.4435, + "step": 8029 + }, + { + "epoch": 1.3187444829922197, + "grad_norm": 0.28290016169336146, + "learning_rate": 7.617086138188733e-06, + "loss": 0.4678, + "step": 8030 + }, + { + "epoch": 1.3189087102005626, + "grad_norm": 0.38666467517347725, + "learning_rate": 7.616747880002497e-06, + "loss": 0.4674, + "step": 8031 + }, + { + "epoch": 1.3190729374089052, + "grad_norm": 0.356353790147301, + "learning_rate": 7.616409587965312e-06, + "loss": 0.4725, + "step": 8032 + }, + { + "epoch": 1.319237164617248, + "grad_norm": 0.40378474599862085, + "learning_rate": 7.616071262080853e-06, + "loss": 0.4639, + "step": 8033 + }, + { + "epoch": 1.3194013918255907, + "grad_norm": 0.3306098327327851, + "learning_rate": 7.6157329023527925e-06, + "loss": 0.4802, + "step": 8034 + }, + { + "epoch": 1.3195656190339333, + "grad_norm": 0.35127743283140145, + "learning_rate": 7.61539450878481e-06, + "loss": 0.4667, + "step": 8035 + }, + { + "epoch": 1.3197298462422762, + "grad_norm": 0.4309638643922434, + "learning_rate": 7.615056081380577e-06, + "loss": 0.4838, + "step": 8036 + }, + { + "epoch": 1.319894073450619, + "grad_norm": 0.3355719844565293, + "learning_rate": 7.6147176201437695e-06, + "loss": 0.4957, + "step": 8037 + }, + { + "epoch": 1.3200583006589617, + "grad_norm": 0.5136141703832557, + "learning_rate": 7.614379125078063e-06, + "loss": 0.4786, + "step": 8038 + }, + { + "epoch": 1.3202225278673043, + "grad_norm": 0.4938624681910936, + "learning_rate": 7.614040596187138e-06, + "loss": 0.4649, + "step": 8039 + }, + { + "epoch": 1.3203867550756472, + "grad_norm": 0.3426636485394977, + "learning_rate": 7.613702033474667e-06, + "loss": 0.4791, + "step": 8040 + }, + { + "epoch": 1.32055098228399, + "grad_norm": 0.29924421972371945, + "learning_rate": 7.613363436944328e-06, + "loss": 0.4634, + "step": 8041 + }, + { + "epoch": 1.3207152094923327, + "grad_norm": 0.37481981284715726, + "learning_rate": 7.613024806599799e-06, + "loss": 0.4718, + "step": 8042 + }, + { + "epoch": 1.3208794367006753, + "grad_norm": 0.2831538659040023, + "learning_rate": 7.612686142444757e-06, + "loss": 0.4685, + "step": 8043 + }, + { + "epoch": 1.3210436639090182, + "grad_norm": 0.29528741651177665, + "learning_rate": 7.612347444482883e-06, + "loss": 0.4519, + "step": 8044 + }, + { + "epoch": 1.3212078911173608, + "grad_norm": 0.34103817691768396, + "learning_rate": 7.612008712717853e-06, + "loss": 0.4884, + "step": 8045 + }, + { + "epoch": 1.3213721183257037, + "grad_norm": 0.32873295054672647, + "learning_rate": 7.611669947153346e-06, + "loss": 0.4951, + "step": 8046 + }, + { + "epoch": 1.3215363455340463, + "grad_norm": 0.3487948323682672, + "learning_rate": 7.611331147793042e-06, + "loss": 0.4863, + "step": 8047 + }, + { + "epoch": 1.3217005727423892, + "grad_norm": 0.3518588634886226, + "learning_rate": 7.610992314640621e-06, + "loss": 0.4878, + "step": 8048 + }, + { + "epoch": 1.3218647999507318, + "grad_norm": 0.3851542782176887, + "learning_rate": 7.610653447699763e-06, + "loss": 0.4799, + "step": 8049 + }, + { + "epoch": 1.3220290271590747, + "grad_norm": 0.30015345022801126, + "learning_rate": 7.610314546974146e-06, + "loss": 0.4742, + "step": 8050 + }, + { + "epoch": 1.3221932543674173, + "grad_norm": 0.3396771981317586, + "learning_rate": 7.6099756124674555e-06, + "loss": 0.4692, + "step": 8051 + }, + { + "epoch": 1.32235748157576, + "grad_norm": 0.37353944059171346, + "learning_rate": 7.6096366441833686e-06, + "loss": 0.4923, + "step": 8052 + }, + { + "epoch": 1.3225217087841028, + "grad_norm": 0.3611586443903467, + "learning_rate": 7.609297642125568e-06, + "loss": 0.4695, + "step": 8053 + }, + { + "epoch": 1.3226859359924457, + "grad_norm": 0.3391887708192555, + "learning_rate": 7.6089586062977375e-06, + "loss": 0.4752, + "step": 8054 + }, + { + "epoch": 1.3228501632007883, + "grad_norm": 0.3614387838925698, + "learning_rate": 7.608619536703557e-06, + "loss": 0.4739, + "step": 8055 + }, + { + "epoch": 1.323014390409131, + "grad_norm": 0.3365756410105476, + "learning_rate": 7.608280433346709e-06, + "loss": 0.4798, + "step": 8056 + }, + { + "epoch": 1.3231786176174738, + "grad_norm": 1.238861339205873, + "learning_rate": 7.607941296230878e-06, + "loss": 0.4753, + "step": 8057 + }, + { + "epoch": 1.3233428448258167, + "grad_norm": 0.3307128371330611, + "learning_rate": 7.6076021253597465e-06, + "loss": 0.4914, + "step": 8058 + }, + { + "epoch": 1.3235070720341593, + "grad_norm": 0.4128137716597536, + "learning_rate": 7.607262920736999e-06, + "loss": 0.4686, + "step": 8059 + }, + { + "epoch": 1.323671299242502, + "grad_norm": 0.32301645197658924, + "learning_rate": 7.606923682366318e-06, + "loss": 0.4168, + "step": 8060 + }, + { + "epoch": 1.3238355264508448, + "grad_norm": 0.2996180664900655, + "learning_rate": 7.60658441025139e-06, + "loss": 0.4777, + "step": 8061 + }, + { + "epoch": 1.3239997536591874, + "grad_norm": 0.3167191179377588, + "learning_rate": 7.606245104395898e-06, + "loss": 0.4795, + "step": 8062 + }, + { + "epoch": 1.3241639808675303, + "grad_norm": 0.32137763185269175, + "learning_rate": 7.605905764803528e-06, + "loss": 0.4715, + "step": 8063 + }, + { + "epoch": 1.324328208075873, + "grad_norm": 0.47425814179048575, + "learning_rate": 7.6055663914779665e-06, + "loss": 0.477, + "step": 8064 + }, + { + "epoch": 1.3244924352842158, + "grad_norm": 0.38687364650229183, + "learning_rate": 7.605226984422899e-06, + "loss": 0.47, + "step": 8065 + }, + { + "epoch": 1.3246566624925584, + "grad_norm": 0.29405401978587575, + "learning_rate": 7.60488754364201e-06, + "loss": 0.4682, + "step": 8066 + }, + { + "epoch": 1.3248208897009013, + "grad_norm": 0.3612032326513613, + "learning_rate": 7.604548069138988e-06, + "loss": 0.4877, + "step": 8067 + }, + { + "epoch": 1.324985116909244, + "grad_norm": 0.3739762267080125, + "learning_rate": 7.604208560917519e-06, + "loss": 0.4728, + "step": 8068 + }, + { + "epoch": 1.3251493441175866, + "grad_norm": 0.33058668071605135, + "learning_rate": 7.603869018981292e-06, + "loss": 0.4707, + "step": 8069 + }, + { + "epoch": 1.3253135713259294, + "grad_norm": 0.5793876431825817, + "learning_rate": 7.603529443333993e-06, + "loss": 0.491, + "step": 8070 + }, + { + "epoch": 1.3254777985342723, + "grad_norm": 0.2881885125717379, + "learning_rate": 7.603189833979311e-06, + "loss": 0.4674, + "step": 8071 + }, + { + "epoch": 1.325642025742615, + "grad_norm": 0.2923464384400088, + "learning_rate": 7.602850190920933e-06, + "loss": 0.4742, + "step": 8072 + }, + { + "epoch": 1.3258062529509576, + "grad_norm": 0.32004321913986356, + "learning_rate": 7.602510514162551e-06, + "loss": 0.4909, + "step": 8073 + }, + { + "epoch": 1.3259704801593004, + "grad_norm": 0.3242085527775499, + "learning_rate": 7.602170803707852e-06, + "loss": 0.4895, + "step": 8074 + }, + { + "epoch": 1.3261347073676433, + "grad_norm": 0.31932451799372075, + "learning_rate": 7.601831059560525e-06, + "loss": 0.4472, + "step": 8075 + }, + { + "epoch": 1.326298934575986, + "grad_norm": 0.43480385093250357, + "learning_rate": 7.6014912817242615e-06, + "loss": 0.5229, + "step": 8076 + }, + { + "epoch": 1.3264631617843285, + "grad_norm": 0.28575102479694664, + "learning_rate": 7.601151470202752e-06, + "loss": 0.4712, + "step": 8077 + }, + { + "epoch": 1.3266273889926714, + "grad_norm": 0.3413516866373773, + "learning_rate": 7.600811624999685e-06, + "loss": 0.4893, + "step": 8078 + }, + { + "epoch": 1.326791616201014, + "grad_norm": 0.4762227758293843, + "learning_rate": 7.600471746118754e-06, + "loss": 0.4593, + "step": 8079 + }, + { + "epoch": 1.326955843409357, + "grad_norm": 0.36642531096793896, + "learning_rate": 7.600131833563648e-06, + "loss": 0.4828, + "step": 8080 + }, + { + "epoch": 1.3271200706176995, + "grad_norm": 0.24844728932647275, + "learning_rate": 7.599791887338061e-06, + "loss": 0.4486, + "step": 8081 + }, + { + "epoch": 1.3272842978260424, + "grad_norm": 0.2761053375616054, + "learning_rate": 7.599451907445685e-06, + "loss": 0.4666, + "step": 8082 + }, + { + "epoch": 1.327448525034385, + "grad_norm": 0.35201886560818063, + "learning_rate": 7.599111893890211e-06, + "loss": 0.4635, + "step": 8083 + }, + { + "epoch": 1.327612752242728, + "grad_norm": 0.33488597740115933, + "learning_rate": 7.598771846675333e-06, + "loss": 0.4694, + "step": 8084 + }, + { + "epoch": 1.3277769794510705, + "grad_norm": 2.0433814286588077, + "learning_rate": 7.598431765804745e-06, + "loss": 0.472, + "step": 8085 + }, + { + "epoch": 1.3279412066594132, + "grad_norm": 0.28476359738830287, + "learning_rate": 7.598091651282138e-06, + "loss": 0.4753, + "step": 8086 + }, + { + "epoch": 1.328105433867756, + "grad_norm": 0.3091484184216086, + "learning_rate": 7.597751503111208e-06, + "loss": 0.4704, + "step": 8087 + }, + { + "epoch": 1.328269661076099, + "grad_norm": 0.2939492958850838, + "learning_rate": 7.597411321295649e-06, + "loss": 0.4881, + "step": 8088 + }, + { + "epoch": 1.3284338882844415, + "grad_norm": 0.2938888929723579, + "learning_rate": 7.597071105839155e-06, + "loss": 0.4922, + "step": 8089 + }, + { + "epoch": 1.3285981154927842, + "grad_norm": 0.30286546203247877, + "learning_rate": 7.596730856745423e-06, + "loss": 0.4762, + "step": 8090 + }, + { + "epoch": 1.328762342701127, + "grad_norm": 0.30359089033544046, + "learning_rate": 7.5963905740181465e-06, + "loss": 0.4806, + "step": 8091 + }, + { + "epoch": 1.3289265699094697, + "grad_norm": 0.3143376216387629, + "learning_rate": 7.5960502576610206e-06, + "loss": 0.4909, + "step": 8092 + }, + { + "epoch": 1.3290907971178125, + "grad_norm": 0.38876383498655837, + "learning_rate": 7.5957099076777445e-06, + "loss": 0.4568, + "step": 8093 + }, + { + "epoch": 1.3292550243261552, + "grad_norm": 0.36562505267623785, + "learning_rate": 7.595369524072013e-06, + "loss": 0.4806, + "step": 8094 + }, + { + "epoch": 1.329419251534498, + "grad_norm": 0.34457728082214156, + "learning_rate": 7.595029106847523e-06, + "loss": 0.4746, + "step": 8095 + }, + { + "epoch": 1.3295834787428407, + "grad_norm": 0.3324520354871399, + "learning_rate": 7.59468865600797e-06, + "loss": 0.4903, + "step": 8096 + }, + { + "epoch": 1.3297477059511835, + "grad_norm": 0.4155866763230899, + "learning_rate": 7.594348171557055e-06, + "loss": 0.4703, + "step": 8097 + }, + { + "epoch": 1.3299119331595262, + "grad_norm": 0.2969915001045424, + "learning_rate": 7.594007653498475e-06, + "loss": 0.4858, + "step": 8098 + }, + { + "epoch": 1.330076160367869, + "grad_norm": 0.30978801552869756, + "learning_rate": 7.593667101835927e-06, + "loss": 0.4635, + "step": 8099 + }, + { + "epoch": 1.3302403875762117, + "grad_norm": 0.34181601930545924, + "learning_rate": 7.593326516573111e-06, + "loss": 0.4701, + "step": 8100 + }, + { + "epoch": 1.3304046147845545, + "grad_norm": 0.42274058666425973, + "learning_rate": 7.592985897713724e-06, + "loss": 0.4826, + "step": 8101 + }, + { + "epoch": 1.3305688419928972, + "grad_norm": 0.492798755390736, + "learning_rate": 7.592645245261468e-06, + "loss": 0.4745, + "step": 8102 + }, + { + "epoch": 1.3307330692012398, + "grad_norm": 0.2959789659905488, + "learning_rate": 7.5923045592200425e-06, + "loss": 0.4788, + "step": 8103 + }, + { + "epoch": 1.3308972964095827, + "grad_norm": 0.2900045872919727, + "learning_rate": 7.591963839593147e-06, + "loss": 0.4897, + "step": 8104 + }, + { + "epoch": 1.3310615236179255, + "grad_norm": 0.28604574921354087, + "learning_rate": 7.591623086384482e-06, + "loss": 0.4801, + "step": 8105 + }, + { + "epoch": 1.3312257508262682, + "grad_norm": 0.2883666824407346, + "learning_rate": 7.5912822995977485e-06, + "loss": 0.4641, + "step": 8106 + }, + { + "epoch": 1.3313899780346108, + "grad_norm": 0.3693527994487577, + "learning_rate": 7.590941479236647e-06, + "loss": 0.497, + "step": 8107 + }, + { + "epoch": 1.3315542052429536, + "grad_norm": 0.26417888929256916, + "learning_rate": 7.590600625304882e-06, + "loss": 0.4961, + "step": 8108 + }, + { + "epoch": 1.3317184324512963, + "grad_norm": 0.37961262463328665, + "learning_rate": 7.590259737806151e-06, + "loss": 0.4758, + "step": 8109 + }, + { + "epoch": 1.3318826596596391, + "grad_norm": 0.32640878987499683, + "learning_rate": 7.5899188167441596e-06, + "loss": 0.4956, + "step": 8110 + }, + { + "epoch": 1.3320468868679818, + "grad_norm": 0.29588985784763766, + "learning_rate": 7.589577862122611e-06, + "loss": 0.4828, + "step": 8111 + }, + { + "epoch": 1.3322111140763246, + "grad_norm": 0.37197198970391443, + "learning_rate": 7.589236873945205e-06, + "loss": 0.4528, + "step": 8112 + }, + { + "epoch": 1.3323753412846673, + "grad_norm": 0.370013610959104, + "learning_rate": 7.588895852215649e-06, + "loss": 0.4599, + "step": 8113 + }, + { + "epoch": 1.3325395684930101, + "grad_norm": 0.37106657556119116, + "learning_rate": 7.588554796937643e-06, + "loss": 0.4644, + "step": 8114 + }, + { + "epoch": 1.3327037957013528, + "grad_norm": 0.3269685481961678, + "learning_rate": 7.588213708114895e-06, + "loss": 0.4845, + "step": 8115 + }, + { + "epoch": 1.3328680229096956, + "grad_norm": 0.2555741764001781, + "learning_rate": 7.587872585751108e-06, + "loss": 0.4721, + "step": 8116 + }, + { + "epoch": 1.3330322501180383, + "grad_norm": 0.3073360793567854, + "learning_rate": 7.587531429849986e-06, + "loss": 0.4881, + "step": 8117 + }, + { + "epoch": 1.3331964773263811, + "grad_norm": 0.48663839387542973, + "learning_rate": 7.587190240415235e-06, + "loss": 0.4736, + "step": 8118 + }, + { + "epoch": 1.3333607045347238, + "grad_norm": 0.37921822868911753, + "learning_rate": 7.58684901745056e-06, + "loss": 0.487, + "step": 8119 + }, + { + "epoch": 1.3335249317430664, + "grad_norm": 0.2710430476175914, + "learning_rate": 7.586507760959668e-06, + "loss": 0.4553, + "step": 8120 + }, + { + "epoch": 1.3336891589514093, + "grad_norm": 0.2823588509201183, + "learning_rate": 7.586166470946265e-06, + "loss": 0.4723, + "step": 8121 + }, + { + "epoch": 1.3338533861597521, + "grad_norm": 0.3588907283766576, + "learning_rate": 7.585825147414058e-06, + "loss": 0.4588, + "step": 8122 + }, + { + "epoch": 1.3340176133680948, + "grad_norm": 0.26534977430795154, + "learning_rate": 7.585483790366755e-06, + "loss": 0.4488, + "step": 8123 + }, + { + "epoch": 1.3341818405764374, + "grad_norm": 0.36041667654909365, + "learning_rate": 7.58514239980806e-06, + "loss": 0.4778, + "step": 8124 + }, + { + "epoch": 1.3343460677847803, + "grad_norm": 0.3025737628651881, + "learning_rate": 7.584800975741684e-06, + "loss": 0.486, + "step": 8125 + }, + { + "epoch": 1.334510294993123, + "grad_norm": 0.2634336667104377, + "learning_rate": 7.584459518171334e-06, + "loss": 0.4659, + "step": 8126 + }, + { + "epoch": 1.3346745222014658, + "grad_norm": 0.44720072697100516, + "learning_rate": 7.58411802710072e-06, + "loss": 0.4965, + "step": 8127 + }, + { + "epoch": 1.3348387494098084, + "grad_norm": 0.33297381374331897, + "learning_rate": 7.58377650253355e-06, + "loss": 0.5003, + "step": 8128 + }, + { + "epoch": 1.3350029766181513, + "grad_norm": 0.2732531330453581, + "learning_rate": 7.583434944473531e-06, + "loss": 0.4742, + "step": 8129 + }, + { + "epoch": 1.335167203826494, + "grad_norm": 0.314812564000472, + "learning_rate": 7.583093352924377e-06, + "loss": 0.4602, + "step": 8130 + }, + { + "epoch": 1.3353314310348368, + "grad_norm": 0.3506246269150991, + "learning_rate": 7.582751727889795e-06, + "loss": 0.4655, + "step": 8131 + }, + { + "epoch": 1.3354956582431794, + "grad_norm": 0.31631151263064144, + "learning_rate": 7.582410069373497e-06, + "loss": 0.4862, + "step": 8132 + }, + { + "epoch": 1.3356598854515223, + "grad_norm": 0.3378754290915035, + "learning_rate": 7.582068377379192e-06, + "loss": 0.4874, + "step": 8133 + }, + { + "epoch": 1.335824112659865, + "grad_norm": 0.2950314734983047, + "learning_rate": 7.581726651910592e-06, + "loss": 0.4602, + "step": 8134 + }, + { + "epoch": 1.3359883398682078, + "grad_norm": 0.37993331588181634, + "learning_rate": 7.58138489297141e-06, + "loss": 0.485, + "step": 8135 + }, + { + "epoch": 1.3361525670765504, + "grad_norm": 0.47312803234745243, + "learning_rate": 7.5810431005653555e-06, + "loss": 0.4602, + "step": 8136 + }, + { + "epoch": 1.336316794284893, + "grad_norm": 0.38400139770922437, + "learning_rate": 7.580701274696141e-06, + "loss": 0.473, + "step": 8137 + }, + { + "epoch": 1.3364810214932359, + "grad_norm": 0.2713798872467751, + "learning_rate": 7.58035941536748e-06, + "loss": 0.4735, + "step": 8138 + }, + { + "epoch": 1.3366452487015787, + "grad_norm": 0.28391273813649176, + "learning_rate": 7.580017522583085e-06, + "loss": 0.4917, + "step": 8139 + }, + { + "epoch": 1.3368094759099214, + "grad_norm": 0.29927465149936233, + "learning_rate": 7.57967559634667e-06, + "loss": 0.4704, + "step": 8140 + }, + { + "epoch": 1.336973703118264, + "grad_norm": 0.36460727628287165, + "learning_rate": 7.579333636661947e-06, + "loss": 0.4906, + "step": 8141 + }, + { + "epoch": 1.3371379303266069, + "grad_norm": 0.33819147075200523, + "learning_rate": 7.578991643532631e-06, + "loss": 0.4861, + "step": 8142 + }, + { + "epoch": 1.3373021575349495, + "grad_norm": 0.29856704467356415, + "learning_rate": 7.578649616962437e-06, + "loss": 0.4814, + "step": 8143 + }, + { + "epoch": 1.3374663847432924, + "grad_norm": 0.3243878064563001, + "learning_rate": 7.57830755695508e-06, + "loss": 0.474, + "step": 8144 + }, + { + "epoch": 1.337630611951635, + "grad_norm": 0.3260830025390945, + "learning_rate": 7.577965463514273e-06, + "loss": 0.4439, + "step": 8145 + }, + { + "epoch": 1.3377948391599779, + "grad_norm": 0.3561533434330576, + "learning_rate": 7.577623336643734e-06, + "loss": 0.4819, + "step": 8146 + }, + { + "epoch": 1.3379590663683205, + "grad_norm": 0.40282604646571896, + "learning_rate": 7.5772811763471765e-06, + "loss": 0.4731, + "step": 8147 + }, + { + "epoch": 1.3381232935766634, + "grad_norm": 0.4354568605464644, + "learning_rate": 7.576938982628319e-06, + "loss": 0.4791, + "step": 8148 + }, + { + "epoch": 1.338287520785006, + "grad_norm": 0.27024614355661286, + "learning_rate": 7.5765967554908766e-06, + "loss": 0.4534, + "step": 8149 + }, + { + "epoch": 1.3384517479933489, + "grad_norm": 0.2800449426831548, + "learning_rate": 7.576254494938565e-06, + "loss": 0.4638, + "step": 8150 + }, + { + "epoch": 1.3386159752016915, + "grad_norm": 0.31469804284271313, + "learning_rate": 7.5759122009751034e-06, + "loss": 0.4672, + "step": 8151 + }, + { + "epoch": 1.3387802024100344, + "grad_norm": 0.31981087249106815, + "learning_rate": 7.575569873604211e-06, + "loss": 0.4754, + "step": 8152 + }, + { + "epoch": 1.338944429618377, + "grad_norm": 0.32041798503279073, + "learning_rate": 7.575227512829601e-06, + "loss": 0.4542, + "step": 8153 + }, + { + "epoch": 1.3391086568267196, + "grad_norm": 0.30671216747447894, + "learning_rate": 7.574885118654997e-06, + "loss": 0.4687, + "step": 8154 + }, + { + "epoch": 1.3392728840350625, + "grad_norm": 0.2954637648278154, + "learning_rate": 7.574542691084114e-06, + "loss": 0.4976, + "step": 8155 + }, + { + "epoch": 1.3394371112434054, + "grad_norm": 0.3081145232331669, + "learning_rate": 7.574200230120672e-06, + "loss": 0.4592, + "step": 8156 + }, + { + "epoch": 1.339601338451748, + "grad_norm": 0.320559291908532, + "learning_rate": 7.573857735768392e-06, + "loss": 0.4742, + "step": 8157 + }, + { + "epoch": 1.3397655656600906, + "grad_norm": 0.3365256596562173, + "learning_rate": 7.573515208030992e-06, + "loss": 0.4914, + "step": 8158 + }, + { + "epoch": 1.3399297928684335, + "grad_norm": 0.2751284881552089, + "learning_rate": 7.5731726469121925e-06, + "loss": 0.489, + "step": 8159 + }, + { + "epoch": 1.3400940200767761, + "grad_norm": 0.37688645099946766, + "learning_rate": 7.572830052415716e-06, + "loss": 0.4933, + "step": 8160 + }, + { + "epoch": 1.340258247285119, + "grad_norm": 0.30999225372323935, + "learning_rate": 7.572487424545282e-06, + "loss": 0.4993, + "step": 8161 + }, + { + "epoch": 1.3404224744934616, + "grad_norm": 0.33954222243627585, + "learning_rate": 7.572144763304609e-06, + "loss": 0.4623, + "step": 8162 + }, + { + "epoch": 1.3405867017018045, + "grad_norm": 0.35287127368966964, + "learning_rate": 7.571802068697424e-06, + "loss": 0.4674, + "step": 8163 + }, + { + "epoch": 1.3407509289101471, + "grad_norm": 0.26149503644123145, + "learning_rate": 7.571459340727444e-06, + "loss": 0.4895, + "step": 8164 + }, + { + "epoch": 1.34091515611849, + "grad_norm": 0.340594769554577, + "learning_rate": 7.5711165793983955e-06, + "loss": 0.4668, + "step": 8165 + }, + { + "epoch": 1.3410793833268326, + "grad_norm": 0.35235304957633606, + "learning_rate": 7.570773784714e-06, + "loss": 0.4808, + "step": 8166 + }, + { + "epoch": 1.3412436105351755, + "grad_norm": 0.2674302744225442, + "learning_rate": 7.570430956677978e-06, + "loss": 0.4555, + "step": 8167 + }, + { + "epoch": 1.3414078377435181, + "grad_norm": 0.3802926912537419, + "learning_rate": 7.570088095294056e-06, + "loss": 0.4788, + "step": 8168 + }, + { + "epoch": 1.341572064951861, + "grad_norm": 0.3044697674369472, + "learning_rate": 7.569745200565956e-06, + "loss": 0.479, + "step": 8169 + }, + { + "epoch": 1.3417362921602036, + "grad_norm": 0.5955916581848941, + "learning_rate": 7.569402272497403e-06, + "loss": 0.4623, + "step": 8170 + }, + { + "epoch": 1.3419005193685463, + "grad_norm": 0.31407739496613557, + "learning_rate": 7.569059311092121e-06, + "loss": 0.4711, + "step": 8171 + }, + { + "epoch": 1.3420647465768891, + "grad_norm": 0.34767475507542017, + "learning_rate": 7.568716316353837e-06, + "loss": 0.5016, + "step": 8172 + }, + { + "epoch": 1.342228973785232, + "grad_norm": 0.38120882129971084, + "learning_rate": 7.568373288286274e-06, + "loss": 0.4619, + "step": 8173 + }, + { + "epoch": 1.3423932009935746, + "grad_norm": 0.33282327887772417, + "learning_rate": 7.568030226893158e-06, + "loss": 0.4911, + "step": 8174 + }, + { + "epoch": 1.3425574282019173, + "grad_norm": 0.2965917498704802, + "learning_rate": 7.567687132178216e-06, + "loss": 0.4884, + "step": 8175 + }, + { + "epoch": 1.3427216554102601, + "grad_norm": 0.3396648386159076, + "learning_rate": 7.567344004145172e-06, + "loss": 0.483, + "step": 8176 + }, + { + "epoch": 1.3428858826186028, + "grad_norm": 0.30333232393757004, + "learning_rate": 7.567000842797754e-06, + "loss": 0.4851, + "step": 8177 + }, + { + "epoch": 1.3430501098269456, + "grad_norm": 0.34618357769248426, + "learning_rate": 7.56665764813969e-06, + "loss": 0.4906, + "step": 8178 + }, + { + "epoch": 1.3432143370352883, + "grad_norm": 0.3789647075081895, + "learning_rate": 7.566314420174707e-06, + "loss": 0.4798, + "step": 8179 + }, + { + "epoch": 1.3433785642436311, + "grad_norm": 0.8248885820482107, + "learning_rate": 7.565971158906533e-06, + "loss": 0.4696, + "step": 8180 + }, + { + "epoch": 1.3435427914519738, + "grad_norm": 0.34738798654195413, + "learning_rate": 7.565627864338896e-06, + "loss": 0.4731, + "step": 8181 + }, + { + "epoch": 1.3437070186603166, + "grad_norm": 0.3001257042915786, + "learning_rate": 7.565284536475523e-06, + "loss": 0.4761, + "step": 8182 + }, + { + "epoch": 1.3438712458686592, + "grad_norm": 0.29395541996703284, + "learning_rate": 7.564941175320145e-06, + "loss": 0.4704, + "step": 8183 + }, + { + "epoch": 1.344035473077002, + "grad_norm": 0.3490565668842923, + "learning_rate": 7.564597780876489e-06, + "loss": 0.484, + "step": 8184 + }, + { + "epoch": 1.3441997002853447, + "grad_norm": 0.33784447165967335, + "learning_rate": 7.564254353148286e-06, + "loss": 0.4551, + "step": 8185 + }, + { + "epoch": 1.3443639274936876, + "grad_norm": 0.31079920003562245, + "learning_rate": 7.563910892139268e-06, + "loss": 0.4629, + "step": 8186 + }, + { + "epoch": 1.3445281547020302, + "grad_norm": 0.339855134296473, + "learning_rate": 7.563567397853162e-06, + "loss": 0.4723, + "step": 8187 + }, + { + "epoch": 1.3446923819103729, + "grad_norm": 0.31361048502666455, + "learning_rate": 7.5632238702937e-06, + "loss": 0.4626, + "step": 8188 + }, + { + "epoch": 1.3448566091187157, + "grad_norm": 0.30916236843543365, + "learning_rate": 7.562880309464612e-06, + "loss": 0.4885, + "step": 8189 + }, + { + "epoch": 1.3450208363270586, + "grad_norm": 0.42762086725224197, + "learning_rate": 7.562536715369632e-06, + "loss": 0.4798, + "step": 8190 + }, + { + "epoch": 1.3451850635354012, + "grad_norm": 0.29582310475373164, + "learning_rate": 7.562193088012489e-06, + "loss": 0.4757, + "step": 8191 + }, + { + "epoch": 1.3453492907437439, + "grad_norm": 0.2885250979215616, + "learning_rate": 7.561849427396916e-06, + "loss": 0.4677, + "step": 8192 + }, + { + "epoch": 1.3455135179520867, + "grad_norm": 0.3149369224702306, + "learning_rate": 7.561505733526646e-06, + "loss": 0.4801, + "step": 8193 + }, + { + "epoch": 1.3456777451604294, + "grad_norm": 0.3230766779512786, + "learning_rate": 7.561162006405413e-06, + "loss": 0.4705, + "step": 8194 + }, + { + "epoch": 1.3458419723687722, + "grad_norm": 0.2664636359519095, + "learning_rate": 7.560818246036948e-06, + "loss": 0.4632, + "step": 8195 + }, + { + "epoch": 1.3460061995771149, + "grad_norm": 0.33555822669646035, + "learning_rate": 7.560474452424984e-06, + "loss": 0.4661, + "step": 8196 + }, + { + "epoch": 1.3461704267854577, + "grad_norm": 0.6934805707048038, + "learning_rate": 7.560130625573259e-06, + "loss": 0.4862, + "step": 8197 + }, + { + "epoch": 1.3463346539938004, + "grad_norm": 0.2607196059610446, + "learning_rate": 7.559786765485503e-06, + "loss": 0.481, + "step": 8198 + }, + { + "epoch": 1.3464988812021432, + "grad_norm": 0.32406956402984705, + "learning_rate": 7.559442872165452e-06, + "loss": 0.4648, + "step": 8199 + }, + { + "epoch": 1.3466631084104859, + "grad_norm": 0.23869746457513058, + "learning_rate": 7.5590989456168425e-06, + "loss": 0.4656, + "step": 8200 + }, + { + "epoch": 1.3468273356188287, + "grad_norm": 0.3208825001499969, + "learning_rate": 7.558754985843408e-06, + "loss": 0.4713, + "step": 8201 + }, + { + "epoch": 1.3469915628271714, + "grad_norm": 0.2841932547090098, + "learning_rate": 7.558410992848886e-06, + "loss": 0.4806, + "step": 8202 + }, + { + "epoch": 1.3471557900355142, + "grad_norm": 0.3941615072914976, + "learning_rate": 7.55806696663701e-06, + "loss": 0.462, + "step": 8203 + }, + { + "epoch": 1.3473200172438569, + "grad_norm": 0.30906597354873233, + "learning_rate": 7.557722907211518e-06, + "loss": 0.4597, + "step": 8204 + }, + { + "epoch": 1.3474842444521995, + "grad_norm": 0.3261144107035512, + "learning_rate": 7.557378814576148e-06, + "loss": 0.4767, + "step": 8205 + }, + { + "epoch": 1.3476484716605424, + "grad_norm": 0.3231943091793371, + "learning_rate": 7.557034688734636e-06, + "loss": 0.4903, + "step": 8206 + }, + { + "epoch": 1.3478126988688852, + "grad_norm": 0.31891508433673865, + "learning_rate": 7.556690529690719e-06, + "loss": 0.4661, + "step": 8207 + }, + { + "epoch": 1.3479769260772279, + "grad_norm": 0.3561392861742652, + "learning_rate": 7.556346337448135e-06, + "loss": 0.468, + "step": 8208 + }, + { + "epoch": 1.3481411532855705, + "grad_norm": 0.29269560637612674, + "learning_rate": 7.556002112010623e-06, + "loss": 0.489, + "step": 8209 + }, + { + "epoch": 1.3483053804939134, + "grad_norm": 0.29961111960033693, + "learning_rate": 7.555657853381921e-06, + "loss": 0.4822, + "step": 8210 + }, + { + "epoch": 1.348469607702256, + "grad_norm": 0.37927679526581437, + "learning_rate": 7.55531356156577e-06, + "loss": 0.475, + "step": 8211 + }, + { + "epoch": 1.3486338349105988, + "grad_norm": 0.2841023304201933, + "learning_rate": 7.554969236565906e-06, + "loss": 0.4762, + "step": 8212 + }, + { + "epoch": 1.3487980621189415, + "grad_norm": 0.28326627175304997, + "learning_rate": 7.554624878386071e-06, + "loss": 0.4959, + "step": 8213 + }, + { + "epoch": 1.3489622893272843, + "grad_norm": 0.41227391057862817, + "learning_rate": 7.5542804870300035e-06, + "loss": 0.4711, + "step": 8214 + }, + { + "epoch": 1.349126516535627, + "grad_norm": 0.3915072018838373, + "learning_rate": 7.553936062501448e-06, + "loss": 0.481, + "step": 8215 + }, + { + "epoch": 1.3492907437439698, + "grad_norm": 0.3381486335554688, + "learning_rate": 7.55359160480414e-06, + "loss": 0.4789, + "step": 8216 + }, + { + "epoch": 1.3494549709523125, + "grad_norm": 0.27882630443105566, + "learning_rate": 7.553247113941822e-06, + "loss": 0.4697, + "step": 8217 + }, + { + "epoch": 1.3496191981606553, + "grad_norm": 0.31148174098022713, + "learning_rate": 7.552902589918237e-06, + "loss": 0.4677, + "step": 8218 + }, + { + "epoch": 1.349783425368998, + "grad_norm": 0.32014703747449885, + "learning_rate": 7.552558032737128e-06, + "loss": 0.4803, + "step": 8219 + }, + { + "epoch": 1.3499476525773408, + "grad_norm": 0.3466674862984574, + "learning_rate": 7.552213442402233e-06, + "loss": 0.4739, + "step": 8220 + }, + { + "epoch": 1.3501118797856835, + "grad_norm": 0.33280537484605616, + "learning_rate": 7.551868818917298e-06, + "loss": 0.4627, + "step": 8221 + }, + { + "epoch": 1.3502761069940261, + "grad_norm": 0.2603930956385385, + "learning_rate": 7.551524162286065e-06, + "loss": 0.4662, + "step": 8222 + }, + { + "epoch": 1.350440334202369, + "grad_norm": 0.2930172645612939, + "learning_rate": 7.551179472512278e-06, + "loss": 0.5013, + "step": 8223 + }, + { + "epoch": 1.3506045614107118, + "grad_norm": 0.4860934666600133, + "learning_rate": 7.5508347495996785e-06, + "loss": 0.4903, + "step": 8224 + }, + { + "epoch": 1.3507687886190545, + "grad_norm": 0.29072814081065385, + "learning_rate": 7.5504899935520135e-06, + "loss": 0.4827, + "step": 8225 + }, + { + "epoch": 1.350933015827397, + "grad_norm": 0.28925745501679684, + "learning_rate": 7.550145204373025e-06, + "loss": 0.467, + "step": 8226 + }, + { + "epoch": 1.35109724303574, + "grad_norm": 0.3215258109778846, + "learning_rate": 7.549800382066458e-06, + "loss": 0.4546, + "step": 8227 + }, + { + "epoch": 1.3512614702440826, + "grad_norm": 0.3339575673451271, + "learning_rate": 7.549455526636061e-06, + "loss": 0.4796, + "step": 8228 + }, + { + "epoch": 1.3514256974524255, + "grad_norm": 0.2885210154394203, + "learning_rate": 7.549110638085574e-06, + "loss": 0.449, + "step": 8229 + }, + { + "epoch": 1.351589924660768, + "grad_norm": 0.2797488004637664, + "learning_rate": 7.548765716418745e-06, + "loss": 0.4589, + "step": 8230 + }, + { + "epoch": 1.351754151869111, + "grad_norm": 0.42563485332320505, + "learning_rate": 7.5484207616393225e-06, + "loss": 0.4715, + "step": 8231 + }, + { + "epoch": 1.3519183790774536, + "grad_norm": 0.32789103541201964, + "learning_rate": 7.548075773751052e-06, + "loss": 0.4707, + "step": 8232 + }, + { + "epoch": 1.3520826062857965, + "grad_norm": 0.3307854208254755, + "learning_rate": 7.547730752757679e-06, + "loss": 0.4846, + "step": 8233 + }, + { + "epoch": 1.352246833494139, + "grad_norm": 0.33558511584102185, + "learning_rate": 7.547385698662949e-06, + "loss": 0.4713, + "step": 8234 + }, + { + "epoch": 1.352411060702482, + "grad_norm": 0.5575831635879408, + "learning_rate": 7.547040611470615e-06, + "loss": 0.4816, + "step": 8235 + }, + { + "epoch": 1.3525752879108246, + "grad_norm": 0.29420650994646813, + "learning_rate": 7.546695491184422e-06, + "loss": 0.4757, + "step": 8236 + }, + { + "epoch": 1.3527395151191675, + "grad_norm": 0.43134384746264715, + "learning_rate": 7.546350337808117e-06, + "loss": 0.458, + "step": 8237 + }, + { + "epoch": 1.35290374232751, + "grad_norm": 0.28010941089448865, + "learning_rate": 7.546005151345451e-06, + "loss": 0.461, + "step": 8238 + }, + { + "epoch": 1.3530679695358527, + "grad_norm": 0.4136198715585237, + "learning_rate": 7.545659931800171e-06, + "loss": 0.474, + "step": 8239 + }, + { + "epoch": 1.3532321967441956, + "grad_norm": 0.2773429745089072, + "learning_rate": 7.5453146791760295e-06, + "loss": 0.4636, + "step": 8240 + }, + { + "epoch": 1.3533964239525385, + "grad_norm": 0.4222897509697245, + "learning_rate": 7.544969393476774e-06, + "loss": 0.4792, + "step": 8241 + }, + { + "epoch": 1.353560651160881, + "grad_norm": 0.30400034421879774, + "learning_rate": 7.544624074706155e-06, + "loss": 0.482, + "step": 8242 + }, + { + "epoch": 1.3537248783692237, + "grad_norm": 0.3404611704418366, + "learning_rate": 7.544278722867922e-06, + "loss": 0.4801, + "step": 8243 + }, + { + "epoch": 1.3538891055775666, + "grad_norm": 0.3795342780211706, + "learning_rate": 7.543933337965828e-06, + "loss": 0.4357, + "step": 8244 + }, + { + "epoch": 1.3540533327859092, + "grad_norm": 0.2967089858210644, + "learning_rate": 7.543587920003622e-06, + "loss": 0.4621, + "step": 8245 + }, + { + "epoch": 1.354217559994252, + "grad_norm": 0.27784864211749893, + "learning_rate": 7.543242468985057e-06, + "loss": 0.4604, + "step": 8246 + }, + { + "epoch": 1.3543817872025947, + "grad_norm": 0.47344567937835635, + "learning_rate": 7.542896984913885e-06, + "loss": 0.4714, + "step": 8247 + }, + { + "epoch": 1.3545460144109376, + "grad_norm": 0.3088897586133829, + "learning_rate": 7.542551467793858e-06, + "loss": 0.4707, + "step": 8248 + }, + { + "epoch": 1.3547102416192802, + "grad_norm": 0.3436137560670549, + "learning_rate": 7.542205917628729e-06, + "loss": 0.4804, + "step": 8249 + }, + { + "epoch": 1.354874468827623, + "grad_norm": 0.35880366032473915, + "learning_rate": 7.54186033442225e-06, + "loss": 0.4797, + "step": 8250 + }, + { + "epoch": 1.3550386960359657, + "grad_norm": 0.4984973501764961, + "learning_rate": 7.541514718178174e-06, + "loss": 0.4688, + "step": 8251 + }, + { + "epoch": 1.3552029232443086, + "grad_norm": 0.33572784628150065, + "learning_rate": 7.541169068900258e-06, + "loss": 0.4816, + "step": 8252 + }, + { + "epoch": 1.3553671504526512, + "grad_norm": 0.2685346799680185, + "learning_rate": 7.540823386592252e-06, + "loss": 0.4689, + "step": 8253 + }, + { + "epoch": 1.355531377660994, + "grad_norm": 0.32616794767271073, + "learning_rate": 7.540477671257913e-06, + "loss": 0.4785, + "step": 8254 + }, + { + "epoch": 1.3556956048693367, + "grad_norm": 0.297867131998836, + "learning_rate": 7.540131922900995e-06, + "loss": 0.4597, + "step": 8255 + }, + { + "epoch": 1.3558598320776793, + "grad_norm": 0.4019680636042429, + "learning_rate": 7.5397861415252526e-06, + "loss": 0.4575, + "step": 8256 + }, + { + "epoch": 1.3560240592860222, + "grad_norm": 0.3480970659977098, + "learning_rate": 7.539440327134442e-06, + "loss": 0.4863, + "step": 8257 + }, + { + "epoch": 1.356188286494365, + "grad_norm": 0.2997521494595294, + "learning_rate": 7.53909447973232e-06, + "loss": 0.4875, + "step": 8258 + }, + { + "epoch": 1.3563525137027077, + "grad_norm": 0.3412320676442106, + "learning_rate": 7.538748599322642e-06, + "loss": 0.4846, + "step": 8259 + }, + { + "epoch": 1.3565167409110503, + "grad_norm": 0.30124717220604347, + "learning_rate": 7.538402685909164e-06, + "loss": 0.4913, + "step": 8260 + }, + { + "epoch": 1.3566809681193932, + "grad_norm": 0.5869707815625042, + "learning_rate": 7.538056739495643e-06, + "loss": 0.4619, + "step": 8261 + }, + { + "epoch": 1.3568451953277358, + "grad_norm": 0.41886149665065575, + "learning_rate": 7.537710760085837e-06, + "loss": 0.4819, + "step": 8262 + }, + { + "epoch": 1.3570094225360787, + "grad_norm": 0.3076873203590548, + "learning_rate": 7.537364747683502e-06, + "loss": 0.461, + "step": 8263 + }, + { + "epoch": 1.3571736497444213, + "grad_norm": 0.339881383453682, + "learning_rate": 7.537018702292401e-06, + "loss": 0.4584, + "step": 8264 + }, + { + "epoch": 1.3573378769527642, + "grad_norm": 0.4871044368615955, + "learning_rate": 7.536672623916286e-06, + "loss": 0.4749, + "step": 8265 + }, + { + "epoch": 1.3575021041611068, + "grad_norm": 0.2778485736129083, + "learning_rate": 7.5363265125589195e-06, + "loss": 0.4515, + "step": 8266 + }, + { + "epoch": 1.3576663313694497, + "grad_norm": 0.29364065488596336, + "learning_rate": 7.535980368224061e-06, + "loss": 0.4816, + "step": 8267 + }, + { + "epoch": 1.3578305585777923, + "grad_norm": 0.2838595820478707, + "learning_rate": 7.5356341909154665e-06, + "loss": 0.4566, + "step": 8268 + }, + { + "epoch": 1.3579947857861352, + "grad_norm": 0.3392614946332992, + "learning_rate": 7.5352879806369e-06, + "loss": 0.4608, + "step": 8269 + }, + { + "epoch": 1.3581590129944778, + "grad_norm": 0.3391061577152166, + "learning_rate": 7.5349417373921175e-06, + "loss": 0.4797, + "step": 8270 + }, + { + "epoch": 1.3583232402028207, + "grad_norm": 0.3055348595500309, + "learning_rate": 7.534595461184884e-06, + "loss": 0.4872, + "step": 8271 + }, + { + "epoch": 1.3584874674111633, + "grad_norm": 0.3142799258083078, + "learning_rate": 7.534249152018957e-06, + "loss": 0.4582, + "step": 8272 + }, + { + "epoch": 1.358651694619506, + "grad_norm": 0.314329556767126, + "learning_rate": 7.533902809898098e-06, + "loss": 0.4672, + "step": 8273 + }, + { + "epoch": 1.3588159218278488, + "grad_norm": 0.2967056278298423, + "learning_rate": 7.533556434826072e-06, + "loss": 0.476, + "step": 8274 + }, + { + "epoch": 1.3589801490361917, + "grad_norm": 0.2837665629837183, + "learning_rate": 7.533210026806636e-06, + "loss": 0.4681, + "step": 8275 + }, + { + "epoch": 1.3591443762445343, + "grad_norm": 0.32916697819824303, + "learning_rate": 7.532863585843556e-06, + "loss": 0.4799, + "step": 8276 + }, + { + "epoch": 1.359308603452877, + "grad_norm": 0.3801296291612379, + "learning_rate": 7.532517111940593e-06, + "loss": 0.482, + "step": 8277 + }, + { + "epoch": 1.3594728306612198, + "grad_norm": 0.30518418701039557, + "learning_rate": 7.5321706051015115e-06, + "loss": 0.4902, + "step": 8278 + }, + { + "epoch": 1.3596370578695625, + "grad_norm": 0.31494213222449563, + "learning_rate": 7.531824065330073e-06, + "loss": 0.4801, + "step": 8279 + }, + { + "epoch": 1.3598012850779053, + "grad_norm": 0.27913060649487614, + "learning_rate": 7.5314774926300425e-06, + "loss": 0.4674, + "step": 8280 + }, + { + "epoch": 1.359965512286248, + "grad_norm": 0.4272270596215226, + "learning_rate": 7.531130887005185e-06, + "loss": 0.4564, + "step": 8281 + }, + { + "epoch": 1.3601297394945908, + "grad_norm": 0.33419238881683055, + "learning_rate": 7.5307842484592625e-06, + "loss": 0.4829, + "step": 8282 + }, + { + "epoch": 1.3602939667029335, + "grad_norm": 0.3286978349267236, + "learning_rate": 7.530437576996042e-06, + "loss": 0.4811, + "step": 8283 + }, + { + "epoch": 1.3604581939112763, + "grad_norm": 0.29069776505360395, + "learning_rate": 7.530090872619287e-06, + "loss": 0.4869, + "step": 8284 + }, + { + "epoch": 1.360622421119619, + "grad_norm": 0.3464972039264677, + "learning_rate": 7.529744135332765e-06, + "loss": 0.4772, + "step": 8285 + }, + { + "epoch": 1.3607866483279618, + "grad_norm": 0.2661431197402841, + "learning_rate": 7.52939736514024e-06, + "loss": 0.4928, + "step": 8286 + }, + { + "epoch": 1.3609508755363044, + "grad_norm": 0.3143496844872588, + "learning_rate": 7.5290505620454785e-06, + "loss": 0.4915, + "step": 8287 + }, + { + "epoch": 1.3611151027446473, + "grad_norm": 0.379624731415056, + "learning_rate": 7.528703726052248e-06, + "loss": 0.4606, + "step": 8288 + }, + { + "epoch": 1.36127932995299, + "grad_norm": 0.3309339659074846, + "learning_rate": 7.528356857164315e-06, + "loss": 0.4516, + "step": 8289 + }, + { + "epoch": 1.3614435571613326, + "grad_norm": 0.30095414860162006, + "learning_rate": 7.5280099553854495e-06, + "loss": 0.453, + "step": 8290 + }, + { + "epoch": 1.3616077843696754, + "grad_norm": 0.5214225092991446, + "learning_rate": 7.527663020719415e-06, + "loss": 0.4589, + "step": 8291 + }, + { + "epoch": 1.3617720115780183, + "grad_norm": 0.29060618148335327, + "learning_rate": 7.52731605316998e-06, + "loss": 0.4746, + "step": 8292 + }, + { + "epoch": 1.361936238786361, + "grad_norm": 0.3803658344262675, + "learning_rate": 7.526969052740916e-06, + "loss": 0.4697, + "step": 8293 + }, + { + "epoch": 1.3621004659947036, + "grad_norm": 0.34410484988053686, + "learning_rate": 7.52662201943599e-06, + "loss": 0.4707, + "step": 8294 + }, + { + "epoch": 1.3622646932030464, + "grad_norm": 0.32125725777151576, + "learning_rate": 7.52627495325897e-06, + "loss": 0.472, + "step": 8295 + }, + { + "epoch": 1.362428920411389, + "grad_norm": 0.3261931445987548, + "learning_rate": 7.525927854213627e-06, + "loss": 0.4928, + "step": 8296 + }, + { + "epoch": 1.362593147619732, + "grad_norm": 0.4204108817507413, + "learning_rate": 7.52558072230373e-06, + "loss": 0.4595, + "step": 8297 + }, + { + "epoch": 1.3627573748280746, + "grad_norm": 0.32134642277397796, + "learning_rate": 7.5252335575330514e-06, + "loss": 0.4782, + "step": 8298 + }, + { + "epoch": 1.3629216020364174, + "grad_norm": 0.3515706624302091, + "learning_rate": 7.524886359905357e-06, + "loss": 0.4944, + "step": 8299 + }, + { + "epoch": 1.36308582924476, + "grad_norm": 0.3573167400693006, + "learning_rate": 7.5245391294244225e-06, + "loss": 0.4852, + "step": 8300 + }, + { + "epoch": 1.363250056453103, + "grad_norm": 0.3120269607498423, + "learning_rate": 7.524191866094016e-06, + "loss": 0.4793, + "step": 8301 + }, + { + "epoch": 1.3634142836614456, + "grad_norm": 0.30100585842420313, + "learning_rate": 7.523844569917912e-06, + "loss": 0.4567, + "step": 8302 + }, + { + "epoch": 1.3635785108697884, + "grad_norm": 0.33931025334201326, + "learning_rate": 7.523497240899881e-06, + "loss": 0.495, + "step": 8303 + }, + { + "epoch": 1.363742738078131, + "grad_norm": 0.30310450315920817, + "learning_rate": 7.523149879043694e-06, + "loss": 0.4844, + "step": 8304 + }, + { + "epoch": 1.363906965286474, + "grad_norm": 0.36035296781002274, + "learning_rate": 7.522802484353125e-06, + "loss": 0.4844, + "step": 8305 + }, + { + "epoch": 1.3640711924948166, + "grad_norm": 0.32849349322708093, + "learning_rate": 7.522455056831948e-06, + "loss": 0.4785, + "step": 8306 + }, + { + "epoch": 1.3642354197031592, + "grad_norm": 0.3416519194854029, + "learning_rate": 7.522107596483934e-06, + "loss": 0.4575, + "step": 8307 + }, + { + "epoch": 1.364399646911502, + "grad_norm": 0.29685825846548575, + "learning_rate": 7.5217601033128604e-06, + "loss": 0.4852, + "step": 8308 + }, + { + "epoch": 1.364563874119845, + "grad_norm": 0.31485021328089774, + "learning_rate": 7.5214125773224975e-06, + "loss": 0.4671, + "step": 8309 + }, + { + "epoch": 1.3647281013281876, + "grad_norm": 0.28896520824254684, + "learning_rate": 7.5210650185166205e-06, + "loss": 0.4626, + "step": 8310 + }, + { + "epoch": 1.3648923285365302, + "grad_norm": 0.3173799606341833, + "learning_rate": 7.520717426899007e-06, + "loss": 0.4505, + "step": 8311 + }, + { + "epoch": 1.365056555744873, + "grad_norm": 0.45661343907823004, + "learning_rate": 7.520369802473429e-06, + "loss": 0.46, + "step": 8312 + }, + { + "epoch": 1.3652207829532157, + "grad_norm": 0.31662459661797954, + "learning_rate": 7.520022145243664e-06, + "loss": 0.4689, + "step": 8313 + }, + { + "epoch": 1.3653850101615586, + "grad_norm": 0.3206366426822329, + "learning_rate": 7.5196744552134866e-06, + "loss": 0.4767, + "step": 8314 + }, + { + "epoch": 1.3655492373699012, + "grad_norm": 0.31121042162784335, + "learning_rate": 7.519326732386674e-06, + "loss": 0.4463, + "step": 8315 + }, + { + "epoch": 1.365713464578244, + "grad_norm": 0.3287820896136919, + "learning_rate": 7.518978976767001e-06, + "loss": 0.4833, + "step": 8316 + }, + { + "epoch": 1.3658776917865867, + "grad_norm": 0.44335887820682335, + "learning_rate": 7.518631188358249e-06, + "loss": 0.4892, + "step": 8317 + }, + { + "epoch": 1.3660419189949295, + "grad_norm": 0.2763335722270975, + "learning_rate": 7.51828336716419e-06, + "loss": 0.4862, + "step": 8318 + }, + { + "epoch": 1.3662061462032722, + "grad_norm": 0.3209361127665086, + "learning_rate": 7.517935513188605e-06, + "loss": 0.4563, + "step": 8319 + }, + { + "epoch": 1.366370373411615, + "grad_norm": 0.35302259974899775, + "learning_rate": 7.517587626435271e-06, + "loss": 0.4781, + "step": 8320 + }, + { + "epoch": 1.3665346006199577, + "grad_norm": 0.3165049719005419, + "learning_rate": 7.5172397069079656e-06, + "loss": 0.4508, + "step": 8321 + }, + { + "epoch": 1.3666988278283005, + "grad_norm": 0.292617607086799, + "learning_rate": 7.516891754610469e-06, + "loss": 0.4726, + "step": 8322 + }, + { + "epoch": 1.3668630550366432, + "grad_norm": 0.3238974592916656, + "learning_rate": 7.5165437695465605e-06, + "loss": 0.4742, + "step": 8323 + }, + { + "epoch": 1.3670272822449858, + "grad_norm": 0.32845712957705675, + "learning_rate": 7.516195751720018e-06, + "loss": 0.4692, + "step": 8324 + }, + { + "epoch": 1.3671915094533287, + "grad_norm": 0.3973395741657114, + "learning_rate": 7.515847701134623e-06, + "loss": 0.4554, + "step": 8325 + }, + { + "epoch": 1.3673557366616715, + "grad_norm": 0.3248357261446677, + "learning_rate": 7.5154996177941544e-06, + "loss": 0.4755, + "step": 8326 + }, + { + "epoch": 1.3675199638700142, + "grad_norm": 0.29548108080992236, + "learning_rate": 7.515151501702392e-06, + "loss": 0.5038, + "step": 8327 + }, + { + "epoch": 1.3676841910783568, + "grad_norm": 0.3793355108500688, + "learning_rate": 7.514803352863119e-06, + "loss": 0.4573, + "step": 8328 + }, + { + "epoch": 1.3678484182866997, + "grad_norm": 0.28719236328501363, + "learning_rate": 7.5144551712801146e-06, + "loss": 0.4873, + "step": 8329 + }, + { + "epoch": 1.3680126454950423, + "grad_norm": 0.31485206904199836, + "learning_rate": 7.514106956957162e-06, + "loss": 0.4553, + "step": 8330 + }, + { + "epoch": 1.3681768727033852, + "grad_norm": 0.4207883658274008, + "learning_rate": 7.513758709898041e-06, + "loss": 0.4754, + "step": 8331 + }, + { + "epoch": 1.3683410999117278, + "grad_norm": 0.3500059465239506, + "learning_rate": 7.513410430106538e-06, + "loss": 0.4904, + "step": 8332 + }, + { + "epoch": 1.3685053271200707, + "grad_norm": 0.29361003110067885, + "learning_rate": 7.5130621175864295e-06, + "loss": 0.4701, + "step": 8333 + }, + { + "epoch": 1.3686695543284133, + "grad_norm": 0.36962502570741995, + "learning_rate": 7.512713772341504e-06, + "loss": 0.4786, + "step": 8334 + }, + { + "epoch": 1.3688337815367562, + "grad_norm": 0.3727744016142195, + "learning_rate": 7.512365394375543e-06, + "loss": 0.4797, + "step": 8335 + }, + { + "epoch": 1.3689980087450988, + "grad_norm": 0.2792130647553743, + "learning_rate": 7.512016983692329e-06, + "loss": 0.4687, + "step": 8336 + }, + { + "epoch": 1.3691622359534417, + "grad_norm": 0.29179763332648856, + "learning_rate": 7.511668540295648e-06, + "loss": 0.481, + "step": 8337 + }, + { + "epoch": 1.3693264631617843, + "grad_norm": 0.3281601698226462, + "learning_rate": 7.5113200641892826e-06, + "loss": 0.4811, + "step": 8338 + }, + { + "epoch": 1.3694906903701272, + "grad_norm": 0.32319827408143587, + "learning_rate": 7.510971555377019e-06, + "loss": 0.4744, + "step": 8339 + }, + { + "epoch": 1.3696549175784698, + "grad_norm": 0.48402519535000715, + "learning_rate": 7.510623013862643e-06, + "loss": 0.4692, + "step": 8340 + }, + { + "epoch": 1.3698191447868124, + "grad_norm": 0.36689549011407746, + "learning_rate": 7.510274439649938e-06, + "loss": 0.472, + "step": 8341 + }, + { + "epoch": 1.3699833719951553, + "grad_norm": 0.2931911030026804, + "learning_rate": 7.509925832742691e-06, + "loss": 0.4815, + "step": 8342 + }, + { + "epoch": 1.3701475992034982, + "grad_norm": 0.38794231413739944, + "learning_rate": 7.5095771931446874e-06, + "loss": 0.4806, + "step": 8343 + }, + { + "epoch": 1.3703118264118408, + "grad_norm": 0.32982445067061356, + "learning_rate": 7.509228520859716e-06, + "loss": 0.4655, + "step": 8344 + }, + { + "epoch": 1.3704760536201834, + "grad_norm": 0.29391263723942956, + "learning_rate": 7.508879815891561e-06, + "loss": 0.4733, + "step": 8345 + }, + { + "epoch": 1.3706402808285263, + "grad_norm": 0.35197760791067445, + "learning_rate": 7.50853107824401e-06, + "loss": 0.4762, + "step": 8346 + }, + { + "epoch": 1.370804508036869, + "grad_norm": 0.3392993683378087, + "learning_rate": 7.508182307920853e-06, + "loss": 0.4749, + "step": 8347 + }, + { + "epoch": 1.3709687352452118, + "grad_norm": 0.4550315133155526, + "learning_rate": 7.507833504925876e-06, + "loss": 0.461, + "step": 8348 + }, + { + "epoch": 1.3711329624535544, + "grad_norm": 0.31311924905014327, + "learning_rate": 7.507484669262869e-06, + "loss": 0.4714, + "step": 8349 + }, + { + "epoch": 1.3712971896618973, + "grad_norm": 0.39457589400869614, + "learning_rate": 7.507135800935618e-06, + "loss": 0.4842, + "step": 8350 + }, + { + "epoch": 1.37146141687024, + "grad_norm": 0.32116871340010233, + "learning_rate": 7.506786899947914e-06, + "loss": 0.4808, + "step": 8351 + }, + { + "epoch": 1.3716256440785828, + "grad_norm": 0.28265646058257143, + "learning_rate": 7.506437966303546e-06, + "loss": 0.4592, + "step": 8352 + }, + { + "epoch": 1.3717898712869254, + "grad_norm": 0.2988692411952469, + "learning_rate": 7.5060890000063035e-06, + "loss": 0.4847, + "step": 8353 + }, + { + "epoch": 1.3719540984952683, + "grad_norm": 0.3962595768036921, + "learning_rate": 7.505740001059977e-06, + "loss": 0.4694, + "step": 8354 + }, + { + "epoch": 1.372118325703611, + "grad_norm": 0.27664766973560395, + "learning_rate": 7.5053909694683575e-06, + "loss": 0.4652, + "step": 8355 + }, + { + "epoch": 1.3722825529119538, + "grad_norm": 0.34306614080647385, + "learning_rate": 7.505041905235234e-06, + "loss": 0.4609, + "step": 8356 + }, + { + "epoch": 1.3724467801202964, + "grad_norm": 0.3243406451551074, + "learning_rate": 7.5046928083644e-06, + "loss": 0.4756, + "step": 8357 + }, + { + "epoch": 1.372611007328639, + "grad_norm": 0.32581858539649494, + "learning_rate": 7.504343678859645e-06, + "loss": 0.4865, + "step": 8358 + }, + { + "epoch": 1.372775234536982, + "grad_norm": 0.3316606832659808, + "learning_rate": 7.5039945167247625e-06, + "loss": 0.4613, + "step": 8359 + }, + { + "epoch": 1.3729394617453248, + "grad_norm": 0.3043342240235991, + "learning_rate": 7.503645321963543e-06, + "loss": 0.4445, + "step": 8360 + }, + { + "epoch": 1.3731036889536674, + "grad_norm": 0.36325843544559216, + "learning_rate": 7.503296094579782e-06, + "loss": 0.4766, + "step": 8361 + }, + { + "epoch": 1.37326791616201, + "grad_norm": 0.3295513937739328, + "learning_rate": 7.502946834577269e-06, + "loss": 0.4764, + "step": 8362 + }, + { + "epoch": 1.373432143370353, + "grad_norm": 0.29301402397809956, + "learning_rate": 7.5025975419597995e-06, + "loss": 0.453, + "step": 8363 + }, + { + "epoch": 1.3735963705786955, + "grad_norm": 0.3265871905901499, + "learning_rate": 7.502248216731166e-06, + "loss": 0.485, + "step": 8364 + }, + { + "epoch": 1.3737605977870384, + "grad_norm": 0.3307848184863391, + "learning_rate": 7.501898858895163e-06, + "loss": 0.482, + "step": 8365 + }, + { + "epoch": 1.373924824995381, + "grad_norm": 0.318539651593937, + "learning_rate": 7.501549468455586e-06, + "loss": 0.4635, + "step": 8366 + }, + { + "epoch": 1.374089052203724, + "grad_norm": 0.4299176845852357, + "learning_rate": 7.501200045416228e-06, + "loss": 0.4746, + "step": 8367 + }, + { + "epoch": 1.3742532794120665, + "grad_norm": 0.33954283941162516, + "learning_rate": 7.500850589780885e-06, + "loss": 0.4581, + "step": 8368 + }, + { + "epoch": 1.3744175066204094, + "grad_norm": 0.4102003323616825, + "learning_rate": 7.5005011015533515e-06, + "loss": 0.4652, + "step": 8369 + }, + { + "epoch": 1.374581733828752, + "grad_norm": 0.37286895421142485, + "learning_rate": 7.500151580737423e-06, + "loss": 0.4749, + "step": 8370 + }, + { + "epoch": 1.374745961037095, + "grad_norm": 0.4328474063000734, + "learning_rate": 7.4998020273368985e-06, + "loss": 0.4734, + "step": 8371 + }, + { + "epoch": 1.3749101882454375, + "grad_norm": 0.3229313804670453, + "learning_rate": 7.499452441355571e-06, + "loss": 0.4854, + "step": 8372 + }, + { + "epoch": 1.3750744154537804, + "grad_norm": 0.338741035436834, + "learning_rate": 7.49910282279724e-06, + "loss": 0.4485, + "step": 8373 + }, + { + "epoch": 1.375238642662123, + "grad_norm": 0.3835064638344377, + "learning_rate": 7.498753171665702e-06, + "loss": 0.4763, + "step": 8374 + }, + { + "epoch": 1.3754028698704657, + "grad_norm": 0.3824516415335608, + "learning_rate": 7.498403487964754e-06, + "loss": 0.4885, + "step": 8375 + }, + { + "epoch": 1.3755670970788085, + "grad_norm": 0.3196719968516, + "learning_rate": 7.4980537716981935e-06, + "loss": 0.4901, + "step": 8376 + }, + { + "epoch": 1.3757313242871514, + "grad_norm": 0.3553565763368463, + "learning_rate": 7.49770402286982e-06, + "loss": 0.4437, + "step": 8377 + }, + { + "epoch": 1.375895551495494, + "grad_norm": 0.4544423987932916, + "learning_rate": 7.49735424148343e-06, + "loss": 0.4518, + "step": 8378 + }, + { + "epoch": 1.3760597787038367, + "grad_norm": 0.3659010510622629, + "learning_rate": 7.497004427542827e-06, + "loss": 0.5014, + "step": 8379 + }, + { + "epoch": 1.3762240059121795, + "grad_norm": 0.30706994581649516, + "learning_rate": 7.4966545810518046e-06, + "loss": 0.4702, + "step": 8380 + }, + { + "epoch": 1.3763882331205222, + "grad_norm": 0.3120020596631399, + "learning_rate": 7.496304702014165e-06, + "loss": 0.4603, + "step": 8381 + }, + { + "epoch": 1.376552460328865, + "grad_norm": 0.3428900780388786, + "learning_rate": 7.49595479043371e-06, + "loss": 0.4851, + "step": 8382 + }, + { + "epoch": 1.3767166875372077, + "grad_norm": 0.28232317726051354, + "learning_rate": 7.495604846314236e-06, + "loss": 0.473, + "step": 8383 + }, + { + "epoch": 1.3768809147455505, + "grad_norm": 0.40601124361359603, + "learning_rate": 7.495254869659548e-06, + "loss": 0.4825, + "step": 8384 + }, + { + "epoch": 1.3770451419538932, + "grad_norm": 0.30385377720199874, + "learning_rate": 7.494904860473446e-06, + "loss": 0.4827, + "step": 8385 + }, + { + "epoch": 1.377209369162236, + "grad_norm": 0.3585977445148269, + "learning_rate": 7.494554818759729e-06, + "loss": 0.4686, + "step": 8386 + }, + { + "epoch": 1.3773735963705787, + "grad_norm": 0.296032640267518, + "learning_rate": 7.4942047445222005e-06, + "loss": 0.4749, + "step": 8387 + }, + { + "epoch": 1.3775378235789215, + "grad_norm": 0.3929412698877934, + "learning_rate": 7.493854637764663e-06, + "loss": 0.471, + "step": 8388 + }, + { + "epoch": 1.3777020507872642, + "grad_norm": 0.3584205071049299, + "learning_rate": 7.493504498490919e-06, + "loss": 0.4703, + "step": 8389 + }, + { + "epoch": 1.377866277995607, + "grad_norm": 0.26009475523382186, + "learning_rate": 7.49315432670477e-06, + "loss": 0.4767, + "step": 8390 + }, + { + "epoch": 1.3780305052039497, + "grad_norm": 0.5168875923714398, + "learning_rate": 7.492804122410021e-06, + "loss": 0.467, + "step": 8391 + }, + { + "epoch": 1.3781947324122923, + "grad_norm": 0.2963401806601922, + "learning_rate": 7.492453885610474e-06, + "loss": 0.4768, + "step": 8392 + }, + { + "epoch": 1.3783589596206351, + "grad_norm": 0.31659003611238296, + "learning_rate": 7.492103616309933e-06, + "loss": 0.4711, + "step": 8393 + }, + { + "epoch": 1.378523186828978, + "grad_norm": 0.33403184376342293, + "learning_rate": 7.491753314512205e-06, + "loss": 0.4788, + "step": 8394 + }, + { + "epoch": 1.3786874140373206, + "grad_norm": 0.6322963767384263, + "learning_rate": 7.491402980221091e-06, + "loss": 0.484, + "step": 8395 + }, + { + "epoch": 1.3788516412456633, + "grad_norm": 0.3105545863957514, + "learning_rate": 7.491052613440398e-06, + "loss": 0.4804, + "step": 8396 + }, + { + "epoch": 1.3790158684540061, + "grad_norm": 0.37750790683795843, + "learning_rate": 7.4907022141739305e-06, + "loss": 0.4725, + "step": 8397 + }, + { + "epoch": 1.3791800956623488, + "grad_norm": 4.6596300526714955, + "learning_rate": 7.490351782425494e-06, + "loss": 0.4773, + "step": 8398 + }, + { + "epoch": 1.3793443228706916, + "grad_norm": 0.35738475045237333, + "learning_rate": 7.490001318198896e-06, + "loss": 0.4715, + "step": 8399 + }, + { + "epoch": 1.3795085500790343, + "grad_norm": 0.2786126847265779, + "learning_rate": 7.489650821497942e-06, + "loss": 0.4735, + "step": 8400 + }, + { + "epoch": 1.3796727772873771, + "grad_norm": 0.33428832912866097, + "learning_rate": 7.489300292326438e-06, + "loss": 0.4639, + "step": 8401 + }, + { + "epoch": 1.3798370044957198, + "grad_norm": 0.30964514582994457, + "learning_rate": 7.4889497306881924e-06, + "loss": 0.466, + "step": 8402 + }, + { + "epoch": 1.3800012317040626, + "grad_norm": 0.2889957503949191, + "learning_rate": 7.488599136587012e-06, + "loss": 0.4659, + "step": 8403 + }, + { + "epoch": 1.3801654589124053, + "grad_norm": 0.3925535751253334, + "learning_rate": 7.488248510026704e-06, + "loss": 0.4676, + "step": 8404 + }, + { + "epoch": 1.3803296861207481, + "grad_norm": 0.4829764742485803, + "learning_rate": 7.487897851011077e-06, + "loss": 0.4857, + "step": 8405 + }, + { + "epoch": 1.3804939133290908, + "grad_norm": 0.5775087390936786, + "learning_rate": 7.4875471595439395e-06, + "loss": 0.4627, + "step": 8406 + }, + { + "epoch": 1.3806581405374336, + "grad_norm": 0.41355222116583273, + "learning_rate": 7.4871964356291015e-06, + "loss": 0.5126, + "step": 8407 + }, + { + "epoch": 1.3808223677457763, + "grad_norm": 0.43117946888282094, + "learning_rate": 7.4868456792703715e-06, + "loss": 0.4567, + "step": 8408 + }, + { + "epoch": 1.380986594954119, + "grad_norm": 0.5106408016749285, + "learning_rate": 7.486494890471557e-06, + "loss": 0.4632, + "step": 8409 + }, + { + "epoch": 1.3811508221624618, + "grad_norm": 0.4436767147179429, + "learning_rate": 7.48614406923647e-06, + "loss": 0.4679, + "step": 8410 + }, + { + "epoch": 1.3813150493708046, + "grad_norm": 0.4034600956616641, + "learning_rate": 7.4857932155689216e-06, + "loss": 0.4875, + "step": 8411 + }, + { + "epoch": 1.3814792765791473, + "grad_norm": 0.33297679444132877, + "learning_rate": 7.485442329472721e-06, + "loss": 0.4634, + "step": 8412 + }, + { + "epoch": 1.38164350378749, + "grad_norm": 0.3543346865632646, + "learning_rate": 7.485091410951679e-06, + "loss": 0.4761, + "step": 8413 + }, + { + "epoch": 1.3818077309958328, + "grad_norm": 0.34842494937452245, + "learning_rate": 7.484740460009608e-06, + "loss": 0.4822, + "step": 8414 + }, + { + "epoch": 1.3819719582041754, + "grad_norm": 0.36443048186557747, + "learning_rate": 7.484389476650317e-06, + "loss": 0.4781, + "step": 8415 + }, + { + "epoch": 1.3821361854125183, + "grad_norm": 0.343766357021871, + "learning_rate": 7.484038460877623e-06, + "loss": 0.4706, + "step": 8416 + }, + { + "epoch": 1.382300412620861, + "grad_norm": 0.32675746001653816, + "learning_rate": 7.483687412695334e-06, + "loss": 0.4476, + "step": 8417 + }, + { + "epoch": 1.3824646398292038, + "grad_norm": 0.3221113117267582, + "learning_rate": 7.483336332107262e-06, + "loss": 0.4647, + "step": 8418 + }, + { + "epoch": 1.3826288670375464, + "grad_norm": 0.3215797449648108, + "learning_rate": 7.482985219117225e-06, + "loss": 0.4721, + "step": 8419 + }, + { + "epoch": 1.3827930942458893, + "grad_norm": 0.2928507581174783, + "learning_rate": 7.482634073729034e-06, + "loss": 0.4581, + "step": 8420 + }, + { + "epoch": 1.382957321454232, + "grad_norm": 0.35576242260440516, + "learning_rate": 7.482282895946501e-06, + "loss": 0.4617, + "step": 8421 + }, + { + "epoch": 1.3831215486625748, + "grad_norm": 0.3366737168411468, + "learning_rate": 7.481931685773442e-06, + "loss": 0.4536, + "step": 8422 + }, + { + "epoch": 1.3832857758709174, + "grad_norm": 0.46285205063668183, + "learning_rate": 7.481580443213671e-06, + "loss": 0.4628, + "step": 8423 + }, + { + "epoch": 1.3834500030792602, + "grad_norm": 0.2907414212608209, + "learning_rate": 7.481229168271003e-06, + "loss": 0.4778, + "step": 8424 + }, + { + "epoch": 1.3836142302876029, + "grad_norm": 0.3830255011938799, + "learning_rate": 7.480877860949253e-06, + "loss": 0.483, + "step": 8425 + }, + { + "epoch": 1.3837784574959455, + "grad_norm": 0.3698854430170395, + "learning_rate": 7.480526521252237e-06, + "loss": 0.4766, + "step": 8426 + }, + { + "epoch": 1.3839426847042884, + "grad_norm": 0.41376734309397795, + "learning_rate": 7.480175149183771e-06, + "loss": 0.491, + "step": 8427 + }, + { + "epoch": 1.3841069119126312, + "grad_norm": 0.39701920731416607, + "learning_rate": 7.479823744747669e-06, + "loss": 0.4913, + "step": 8428 + }, + { + "epoch": 1.3842711391209739, + "grad_norm": 0.2992212234035658, + "learning_rate": 7.479472307947752e-06, + "loss": 0.4996, + "step": 8429 + }, + { + "epoch": 1.3844353663293165, + "grad_norm": 0.3945568889015645, + "learning_rate": 7.479120838787832e-06, + "loss": 0.4843, + "step": 8430 + }, + { + "epoch": 1.3845995935376594, + "grad_norm": 0.32718184784481347, + "learning_rate": 7.478769337271729e-06, + "loss": 0.4688, + "step": 8431 + }, + { + "epoch": 1.384763820746002, + "grad_norm": 0.40322396401045213, + "learning_rate": 7.478417803403262e-06, + "loss": 0.4774, + "step": 8432 + }, + { + "epoch": 1.3849280479543449, + "grad_norm": 0.31278115808345475, + "learning_rate": 7.4780662371862454e-06, + "loss": 0.4887, + "step": 8433 + }, + { + "epoch": 1.3850922751626875, + "grad_norm": 0.2880177748586995, + "learning_rate": 7.4777146386245e-06, + "loss": 0.4622, + "step": 8434 + }, + { + "epoch": 1.3852565023710304, + "grad_norm": 0.2922145141022478, + "learning_rate": 7.477363007721842e-06, + "loss": 0.472, + "step": 8435 + }, + { + "epoch": 1.385420729579373, + "grad_norm": 0.2595927763167588, + "learning_rate": 7.477011344482097e-06, + "loss": 0.4598, + "step": 8436 + }, + { + "epoch": 1.3855849567877159, + "grad_norm": 0.3827827123321603, + "learning_rate": 7.4766596489090765e-06, + "loss": 0.4768, + "step": 8437 + }, + { + "epoch": 1.3857491839960585, + "grad_norm": 0.3046199138275548, + "learning_rate": 7.476307921006603e-06, + "loss": 0.4708, + "step": 8438 + }, + { + "epoch": 1.3859134112044014, + "grad_norm": 0.36047761509934007, + "learning_rate": 7.475956160778499e-06, + "loss": 0.4779, + "step": 8439 + }, + { + "epoch": 1.386077638412744, + "grad_norm": 0.2876281404368738, + "learning_rate": 7.475604368228583e-06, + "loss": 0.4676, + "step": 8440 + }, + { + "epoch": 1.3862418656210869, + "grad_norm": 0.3415868267007575, + "learning_rate": 7.475252543360676e-06, + "loss": 0.4821, + "step": 8441 + }, + { + "epoch": 1.3864060928294295, + "grad_norm": 0.3071306454877626, + "learning_rate": 7.474900686178598e-06, + "loss": 0.4661, + "step": 8442 + }, + { + "epoch": 1.3865703200377721, + "grad_norm": 0.3816098854453923, + "learning_rate": 7.474548796686172e-06, + "loss": 0.4777, + "step": 8443 + }, + { + "epoch": 1.386734547246115, + "grad_norm": 0.26175587149765556, + "learning_rate": 7.474196874887219e-06, + "loss": 0.4637, + "step": 8444 + }, + { + "epoch": 1.3868987744544579, + "grad_norm": 0.3957223210428203, + "learning_rate": 7.473844920785564e-06, + "loss": 0.4536, + "step": 8445 + }, + { + "epoch": 1.3870630016628005, + "grad_norm": 0.42887048967429414, + "learning_rate": 7.473492934385025e-06, + "loss": 0.4608, + "step": 8446 + }, + { + "epoch": 1.3872272288711431, + "grad_norm": 0.3039193509641038, + "learning_rate": 7.473140915689428e-06, + "loss": 0.487, + "step": 8447 + }, + { + "epoch": 1.387391456079486, + "grad_norm": 0.2924723132481886, + "learning_rate": 7.472788864702596e-06, + "loss": 0.4658, + "step": 8448 + }, + { + "epoch": 1.3875556832878286, + "grad_norm": 0.3104130699213828, + "learning_rate": 7.4724367814283515e-06, + "loss": 0.4661, + "step": 8449 + }, + { + "epoch": 1.3877199104961715, + "grad_norm": 0.2516987078850876, + "learning_rate": 7.472084665870519e-06, + "loss": 0.459, + "step": 8450 + }, + { + "epoch": 1.3878841377045141, + "grad_norm": 0.3246890939691412, + "learning_rate": 7.4717325180329246e-06, + "loss": 0.481, + "step": 8451 + }, + { + "epoch": 1.388048364912857, + "grad_norm": 0.3079833527004406, + "learning_rate": 7.4713803379193885e-06, + "loss": 0.4644, + "step": 8452 + }, + { + "epoch": 1.3882125921211996, + "grad_norm": 0.29180556133406804, + "learning_rate": 7.47102812553374e-06, + "loss": 0.4614, + "step": 8453 + }, + { + "epoch": 1.3883768193295425, + "grad_norm": 0.2886969888932696, + "learning_rate": 7.470675880879802e-06, + "loss": 0.4736, + "step": 8454 + }, + { + "epoch": 1.3885410465378851, + "grad_norm": 0.32198851042761334, + "learning_rate": 7.470323603961402e-06, + "loss": 0.4488, + "step": 8455 + }, + { + "epoch": 1.388705273746228, + "grad_norm": 0.27850170833038657, + "learning_rate": 7.469971294782366e-06, + "loss": 0.4719, + "step": 8456 + }, + { + "epoch": 1.3888695009545706, + "grad_norm": 0.3167504981040049, + "learning_rate": 7.469618953346519e-06, + "loss": 0.4733, + "step": 8457 + }, + { + "epoch": 1.3890337281629135, + "grad_norm": 0.2952002433702235, + "learning_rate": 7.469266579657688e-06, + "loss": 0.4689, + "step": 8458 + }, + { + "epoch": 1.3891979553712561, + "grad_norm": 0.3653309348288008, + "learning_rate": 7.468914173719701e-06, + "loss": 0.4991, + "step": 8459 + }, + { + "epoch": 1.3893621825795988, + "grad_norm": 0.33786108288862315, + "learning_rate": 7.468561735536384e-06, + "loss": 0.4691, + "step": 8460 + }, + { + "epoch": 1.3895264097879416, + "grad_norm": 0.38459079965390597, + "learning_rate": 7.468209265111568e-06, + "loss": 0.4428, + "step": 8461 + }, + { + "epoch": 1.3896906369962845, + "grad_norm": 0.26526099944477444, + "learning_rate": 7.467856762449077e-06, + "loss": 0.4388, + "step": 8462 + }, + { + "epoch": 1.3898548642046271, + "grad_norm": 0.29889257287695886, + "learning_rate": 7.467504227552743e-06, + "loss": 0.4768, + "step": 8463 + }, + { + "epoch": 1.3900190914129698, + "grad_norm": 0.4112551605074209, + "learning_rate": 7.467151660426393e-06, + "loss": 0.473, + "step": 8464 + }, + { + "epoch": 1.3901833186213126, + "grad_norm": 0.41186499581502484, + "learning_rate": 7.466799061073857e-06, + "loss": 0.475, + "step": 8465 + }, + { + "epoch": 1.3903475458296553, + "grad_norm": 0.5636465016362382, + "learning_rate": 7.466446429498963e-06, + "loss": 0.469, + "step": 8466 + }, + { + "epoch": 1.3905117730379981, + "grad_norm": 0.32784216972789626, + "learning_rate": 7.4660937657055426e-06, + "loss": 0.4753, + "step": 8467 + }, + { + "epoch": 1.3906760002463407, + "grad_norm": 0.357908475606493, + "learning_rate": 7.465741069697428e-06, + "loss": 0.488, + "step": 8468 + }, + { + "epoch": 1.3908402274546836, + "grad_norm": 0.33925619079683195, + "learning_rate": 7.4653883414784445e-06, + "loss": 0.472, + "step": 8469 + }, + { + "epoch": 1.3910044546630262, + "grad_norm": 0.29056325498193813, + "learning_rate": 7.465035581052428e-06, + "loss": 0.4641, + "step": 8470 + }, + { + "epoch": 1.391168681871369, + "grad_norm": 0.3130794677226531, + "learning_rate": 7.464682788423206e-06, + "loss": 0.4607, + "step": 8471 + }, + { + "epoch": 1.3913329090797117, + "grad_norm": 0.2999242321369178, + "learning_rate": 7.464329963594613e-06, + "loss": 0.4615, + "step": 8472 + }, + { + "epoch": 1.3914971362880546, + "grad_norm": 0.31033618826953613, + "learning_rate": 7.463977106570481e-06, + "loss": 0.4693, + "step": 8473 + }, + { + "epoch": 1.3916613634963972, + "grad_norm": 0.40690685776686436, + "learning_rate": 7.463624217354641e-06, + "loss": 0.4811, + "step": 8474 + }, + { + "epoch": 1.39182559070474, + "grad_norm": 0.5071406863141119, + "learning_rate": 7.463271295950926e-06, + "loss": 0.4661, + "step": 8475 + }, + { + "epoch": 1.3919898179130827, + "grad_norm": 0.33631701642143286, + "learning_rate": 7.462918342363169e-06, + "loss": 0.4771, + "step": 8476 + }, + { + "epoch": 1.3921540451214254, + "grad_norm": 0.30037934836463953, + "learning_rate": 7.462565356595202e-06, + "loss": 0.4802, + "step": 8477 + }, + { + "epoch": 1.3923182723297682, + "grad_norm": 0.3193277779391227, + "learning_rate": 7.4622123386508635e-06, + "loss": 0.4876, + "step": 8478 + }, + { + "epoch": 1.392482499538111, + "grad_norm": 0.270676534414637, + "learning_rate": 7.461859288533983e-06, + "loss": 0.4594, + "step": 8479 + }, + { + "epoch": 1.3926467267464537, + "grad_norm": 0.4127141520008906, + "learning_rate": 7.461506206248397e-06, + "loss": 0.4653, + "step": 8480 + }, + { + "epoch": 1.3928109539547964, + "grad_norm": 0.4991721815266884, + "learning_rate": 7.46115309179794e-06, + "loss": 0.4844, + "step": 8481 + }, + { + "epoch": 1.3929751811631392, + "grad_norm": 0.29698984397598005, + "learning_rate": 7.460799945186447e-06, + "loss": 0.4613, + "step": 8482 + }, + { + "epoch": 1.3931394083714819, + "grad_norm": 0.3248795253734255, + "learning_rate": 7.4604467664177535e-06, + "loss": 0.4638, + "step": 8483 + }, + { + "epoch": 1.3933036355798247, + "grad_norm": 0.37999833355929574, + "learning_rate": 7.460093555495695e-06, + "loss": 0.483, + "step": 8484 + }, + { + "epoch": 1.3934678627881674, + "grad_norm": 0.3038401995582087, + "learning_rate": 7.459740312424108e-06, + "loss": 0.4846, + "step": 8485 + }, + { + "epoch": 1.3936320899965102, + "grad_norm": 0.33297402648887964, + "learning_rate": 7.459387037206831e-06, + "loss": 0.4907, + "step": 8486 + }, + { + "epoch": 1.3937963172048529, + "grad_norm": 0.5686034853236254, + "learning_rate": 7.459033729847697e-06, + "loss": 0.4726, + "step": 8487 + }, + { + "epoch": 1.3939605444131957, + "grad_norm": 0.3948521265247895, + "learning_rate": 7.458680390350547e-06, + "loss": 0.4641, + "step": 8488 + }, + { + "epoch": 1.3941247716215384, + "grad_norm": 0.45439200880210123, + "learning_rate": 7.458327018719214e-06, + "loss": 0.4778, + "step": 8489 + }, + { + "epoch": 1.3942889988298812, + "grad_norm": 0.3042339933458967, + "learning_rate": 7.457973614957541e-06, + "loss": 0.4634, + "step": 8490 + }, + { + "epoch": 1.3944532260382239, + "grad_norm": 0.3305137784668754, + "learning_rate": 7.457620179069366e-06, + "loss": 0.4842, + "step": 8491 + }, + { + "epoch": 1.3946174532465667, + "grad_norm": 0.27760176510723344, + "learning_rate": 7.457266711058524e-06, + "loss": 0.4752, + "step": 8492 + }, + { + "epoch": 1.3947816804549094, + "grad_norm": 0.4102659742863658, + "learning_rate": 7.4569132109288555e-06, + "loss": 0.4685, + "step": 8493 + }, + { + "epoch": 1.394945907663252, + "grad_norm": 0.31841425627424486, + "learning_rate": 7.456559678684201e-06, + "loss": 0.469, + "step": 8494 + }, + { + "epoch": 1.3951101348715949, + "grad_norm": 0.2770736991011022, + "learning_rate": 7.456206114328402e-06, + "loss": 0.4522, + "step": 8495 + }, + { + "epoch": 1.3952743620799377, + "grad_norm": 0.36425396742082483, + "learning_rate": 7.455852517865292e-06, + "loss": 0.4661, + "step": 8496 + }, + { + "epoch": 1.3954385892882804, + "grad_norm": 0.29309115832848304, + "learning_rate": 7.4554988892987186e-06, + "loss": 0.4566, + "step": 8497 + }, + { + "epoch": 1.395602816496623, + "grad_norm": 0.3467605734878834, + "learning_rate": 7.455145228632518e-06, + "loss": 0.4851, + "step": 8498 + }, + { + "epoch": 1.3957670437049658, + "grad_norm": 0.2803211107231899, + "learning_rate": 7.454791535870533e-06, + "loss": 0.4698, + "step": 8499 + }, + { + "epoch": 1.3959312709133085, + "grad_norm": 0.30707032322686445, + "learning_rate": 7.454437811016605e-06, + "loss": 0.4637, + "step": 8500 + }, + { + "epoch": 1.3960954981216513, + "grad_norm": 0.292408272146633, + "learning_rate": 7.454084054074575e-06, + "loss": 0.4647, + "step": 8501 + }, + { + "epoch": 1.396259725329994, + "grad_norm": 0.3176060833917574, + "learning_rate": 7.453730265048285e-06, + "loss": 0.4789, + "step": 8502 + }, + { + "epoch": 1.3964239525383368, + "grad_norm": 0.2712677509040151, + "learning_rate": 7.45337644394158e-06, + "loss": 0.4637, + "step": 8503 + }, + { + "epoch": 1.3965881797466795, + "grad_norm": 0.2992868914528555, + "learning_rate": 7.453022590758301e-06, + "loss": 0.4662, + "step": 8504 + }, + { + "epoch": 1.3967524069550223, + "grad_norm": 0.2862272436304518, + "learning_rate": 7.45266870550229e-06, + "loss": 0.4684, + "step": 8505 + }, + { + "epoch": 1.396916634163365, + "grad_norm": 0.34703558397614, + "learning_rate": 7.452314788177391e-06, + "loss": 0.4745, + "step": 8506 + }, + { + "epoch": 1.3970808613717078, + "grad_norm": 0.3593107459557686, + "learning_rate": 7.451960838787452e-06, + "loss": 0.4701, + "step": 8507 + }, + { + "epoch": 1.3972450885800505, + "grad_norm": 0.27753636142750543, + "learning_rate": 7.451606857336312e-06, + "loss": 0.454, + "step": 8508 + }, + { + "epoch": 1.3974093157883933, + "grad_norm": 0.32519945938169814, + "learning_rate": 7.4512528438278174e-06, + "loss": 0.4653, + "step": 8509 + }, + { + "epoch": 1.397573542996736, + "grad_norm": 0.37636601508996753, + "learning_rate": 7.4508987982658135e-06, + "loss": 0.4723, + "step": 8510 + }, + { + "epoch": 1.3977377702050786, + "grad_norm": 0.3242174929140608, + "learning_rate": 7.450544720654145e-06, + "loss": 0.4673, + "step": 8511 + }, + { + "epoch": 1.3979019974134215, + "grad_norm": 0.3152017954856842, + "learning_rate": 7.4501906109966595e-06, + "loss": 0.4684, + "step": 8512 + }, + { + "epoch": 1.3980662246217643, + "grad_norm": 0.273731924885685, + "learning_rate": 7.4498364692971996e-06, + "loss": 0.4686, + "step": 8513 + }, + { + "epoch": 1.398230451830107, + "grad_norm": 0.4412982637682514, + "learning_rate": 7.449482295559614e-06, + "loss": 0.4689, + "step": 8514 + }, + { + "epoch": 1.3983946790384496, + "grad_norm": 0.6275413982705256, + "learning_rate": 7.4491280897877475e-06, + "loss": 0.4428, + "step": 8515 + }, + { + "epoch": 1.3985589062467925, + "grad_norm": 0.3754192144297303, + "learning_rate": 7.44877385198545e-06, + "loss": 0.4798, + "step": 8516 + }, + { + "epoch": 1.398723133455135, + "grad_norm": 1.2042144009390472, + "learning_rate": 7.448419582156568e-06, + "loss": 0.4711, + "step": 8517 + }, + { + "epoch": 1.398887360663478, + "grad_norm": 0.32201331477470474, + "learning_rate": 7.448065280304946e-06, + "loss": 0.4691, + "step": 8518 + }, + { + "epoch": 1.3990515878718206, + "grad_norm": 0.36748107501569693, + "learning_rate": 7.447710946434438e-06, + "loss": 0.4736, + "step": 8519 + }, + { + "epoch": 1.3992158150801635, + "grad_norm": 0.30196407746713133, + "learning_rate": 7.447356580548886e-06, + "loss": 0.461, + "step": 8520 + }, + { + "epoch": 1.399380042288506, + "grad_norm": 0.40934242162584306, + "learning_rate": 7.447002182652143e-06, + "loss": 0.4674, + "step": 8521 + }, + { + "epoch": 1.399544269496849, + "grad_norm": 0.2764712262536976, + "learning_rate": 7.446647752748056e-06, + "loss": 0.442, + "step": 8522 + }, + { + "epoch": 1.3997084967051916, + "grad_norm": 0.2964002281736302, + "learning_rate": 7.446293290840475e-06, + "loss": 0.466, + "step": 8523 + }, + { + "epoch": 1.3998727239135345, + "grad_norm": 0.31240139575336234, + "learning_rate": 7.4459387969332514e-06, + "loss": 0.4669, + "step": 8524 + }, + { + "epoch": 1.400036951121877, + "grad_norm": 0.5236529764809225, + "learning_rate": 7.4455842710302346e-06, + "loss": 0.4811, + "step": 8525 + }, + { + "epoch": 1.40020117833022, + "grad_norm": 0.2690862497073942, + "learning_rate": 7.445229713135273e-06, + "loss": 0.4989, + "step": 8526 + }, + { + "epoch": 1.4003654055385626, + "grad_norm": 0.31760863779848236, + "learning_rate": 7.444875123252219e-06, + "loss": 0.4686, + "step": 8527 + }, + { + "epoch": 1.4005296327469052, + "grad_norm": 0.3106443897577975, + "learning_rate": 7.444520501384925e-06, + "loss": 0.4796, + "step": 8528 + }, + { + "epoch": 1.400693859955248, + "grad_norm": 0.45939652043207874, + "learning_rate": 7.44416584753724e-06, + "loss": 0.465, + "step": 8529 + }, + { + "epoch": 1.400858087163591, + "grad_norm": 0.3001121880109803, + "learning_rate": 7.443811161713018e-06, + "loss": 0.4606, + "step": 8530 + }, + { + "epoch": 1.4010223143719336, + "grad_norm": 0.32939005836550994, + "learning_rate": 7.443456443916111e-06, + "loss": 0.4665, + "step": 8531 + }, + { + "epoch": 1.4011865415802762, + "grad_norm": 0.31886006819648094, + "learning_rate": 7.443101694150371e-06, + "loss": 0.4913, + "step": 8532 + }, + { + "epoch": 1.401350768788619, + "grad_norm": 0.2781872322411299, + "learning_rate": 7.442746912419649e-06, + "loss": 0.4668, + "step": 8533 + }, + { + "epoch": 1.4015149959969617, + "grad_norm": 0.9270944124758137, + "learning_rate": 7.442392098727801e-06, + "loss": 0.4763, + "step": 8534 + }, + { + "epoch": 1.4016792232053046, + "grad_norm": 0.35113450993008694, + "learning_rate": 7.442037253078681e-06, + "loss": 0.465, + "step": 8535 + }, + { + "epoch": 1.4018434504136472, + "grad_norm": 0.4559463003574399, + "learning_rate": 7.441682375476141e-06, + "loss": 0.4837, + "step": 8536 + }, + { + "epoch": 1.40200767762199, + "grad_norm": 0.4216442006644967, + "learning_rate": 7.441327465924038e-06, + "loss": 0.4726, + "step": 8537 + }, + { + "epoch": 1.4021719048303327, + "grad_norm": 0.2786121891687586, + "learning_rate": 7.440972524426222e-06, + "loss": 0.4744, + "step": 8538 + }, + { + "epoch": 1.4023361320386756, + "grad_norm": 0.38727303750505243, + "learning_rate": 7.440617550986552e-06, + "loss": 0.4792, + "step": 8539 + }, + { + "epoch": 1.4025003592470182, + "grad_norm": 0.30560195767242304, + "learning_rate": 7.4402625456088826e-06, + "loss": 0.4695, + "step": 8540 + }, + { + "epoch": 1.402664586455361, + "grad_norm": 0.28880668327933817, + "learning_rate": 7.43990750829707e-06, + "loss": 0.4721, + "step": 8541 + }, + { + "epoch": 1.4028288136637037, + "grad_norm": 0.4172695597566841, + "learning_rate": 7.439552439054967e-06, + "loss": 0.4622, + "step": 8542 + }, + { + "epoch": 1.4029930408720466, + "grad_norm": 0.30842138351354353, + "learning_rate": 7.439197337886435e-06, + "loss": 0.4837, + "step": 8543 + }, + { + "epoch": 1.4031572680803892, + "grad_norm": 0.29297139265934846, + "learning_rate": 7.438842204795327e-06, + "loss": 0.4716, + "step": 8544 + }, + { + "epoch": 1.4033214952887318, + "grad_norm": 0.59448376485058, + "learning_rate": 7.4384870397855e-06, + "loss": 0.4695, + "step": 8545 + }, + { + "epoch": 1.4034857224970747, + "grad_norm": 0.27269194109427797, + "learning_rate": 7.438131842860813e-06, + "loss": 0.4929, + "step": 8546 + }, + { + "epoch": 1.4036499497054176, + "grad_norm": 0.341391040294893, + "learning_rate": 7.437776614025125e-06, + "loss": 0.4908, + "step": 8547 + }, + { + "epoch": 1.4038141769137602, + "grad_norm": 0.32200140091490614, + "learning_rate": 7.4374213532822915e-06, + "loss": 0.4745, + "step": 8548 + }, + { + "epoch": 1.4039784041221028, + "grad_norm": 0.2661207928603546, + "learning_rate": 7.437066060636174e-06, + "loss": 0.4899, + "step": 8549 + }, + { + "epoch": 1.4041426313304457, + "grad_norm": 0.29696741496995765, + "learning_rate": 7.436710736090627e-06, + "loss": 0.4584, + "step": 8550 + }, + { + "epoch": 1.4043068585387883, + "grad_norm": 0.29306385562883797, + "learning_rate": 7.436355379649513e-06, + "loss": 0.4588, + "step": 8551 + }, + { + "epoch": 1.4044710857471312, + "grad_norm": 0.3970937075104934, + "learning_rate": 7.43599999131669e-06, + "loss": 0.4891, + "step": 8552 + }, + { + "epoch": 1.4046353129554738, + "grad_norm": 0.34058582880145505, + "learning_rate": 7.435644571096019e-06, + "loss": 0.4523, + "step": 8553 + }, + { + "epoch": 1.4047995401638167, + "grad_norm": 0.45701364527626853, + "learning_rate": 7.435289118991359e-06, + "loss": 0.4677, + "step": 8554 + }, + { + "epoch": 1.4049637673721593, + "grad_norm": 0.31318959131377144, + "learning_rate": 7.434933635006573e-06, + "loss": 0.4598, + "step": 8555 + }, + { + "epoch": 1.4051279945805022, + "grad_norm": 0.3489953705021745, + "learning_rate": 7.4345781191455184e-06, + "loss": 0.4679, + "step": 8556 + }, + { + "epoch": 1.4052922217888448, + "grad_norm": 0.5725145140493672, + "learning_rate": 7.434222571412059e-06, + "loss": 0.4708, + "step": 8557 + }, + { + "epoch": 1.4054564489971877, + "grad_norm": 0.2986176145115553, + "learning_rate": 7.433866991810055e-06, + "loss": 0.4637, + "step": 8558 + }, + { + "epoch": 1.4056206762055303, + "grad_norm": 0.3467896030354416, + "learning_rate": 7.433511380343369e-06, + "loss": 0.4869, + "step": 8559 + }, + { + "epoch": 1.4057849034138732, + "grad_norm": 0.5115851153170545, + "learning_rate": 7.433155737015863e-06, + "loss": 0.4801, + "step": 8560 + }, + { + "epoch": 1.4059491306222158, + "grad_norm": 0.3898500557859638, + "learning_rate": 7.432800061831401e-06, + "loss": 0.4636, + "step": 8561 + }, + { + "epoch": 1.4061133578305585, + "grad_norm": 0.3072252086568954, + "learning_rate": 7.432444354793844e-06, + "loss": 0.4574, + "step": 8562 + }, + { + "epoch": 1.4062775850389013, + "grad_norm": 0.38470436228122484, + "learning_rate": 7.432088615907057e-06, + "loss": 0.4824, + "step": 8563 + }, + { + "epoch": 1.4064418122472442, + "grad_norm": 0.35917326620407675, + "learning_rate": 7.431732845174901e-06, + "loss": 0.4609, + "step": 8564 + }, + { + "epoch": 1.4066060394555868, + "grad_norm": 0.3190169868757679, + "learning_rate": 7.4313770426012435e-06, + "loss": 0.4755, + "step": 8565 + }, + { + "epoch": 1.4067702666639295, + "grad_norm": 0.30937207655898136, + "learning_rate": 7.4310212081899475e-06, + "loss": 0.4747, + "step": 8566 + }, + { + "epoch": 1.4069344938722723, + "grad_norm": 0.30277533292576975, + "learning_rate": 7.430665341944877e-06, + "loss": 0.4801, + "step": 8567 + }, + { + "epoch": 1.407098721080615, + "grad_norm": 0.3104919922663232, + "learning_rate": 7.430309443869896e-06, + "loss": 0.4733, + "step": 8568 + }, + { + "epoch": 1.4072629482889578, + "grad_norm": 0.4631342370065941, + "learning_rate": 7.429953513968873e-06, + "loss": 0.4611, + "step": 8569 + }, + { + "epoch": 1.4074271754973005, + "grad_norm": 0.2910622862956142, + "learning_rate": 7.429597552245673e-06, + "loss": 0.4707, + "step": 8570 + }, + { + "epoch": 1.4075914027056433, + "grad_norm": 0.3070539773881218, + "learning_rate": 7.429241558704159e-06, + "loss": 0.4674, + "step": 8571 + }, + { + "epoch": 1.407755629913986, + "grad_norm": 0.3795506728011231, + "learning_rate": 7.428885533348201e-06, + "loss": 0.4866, + "step": 8572 + }, + { + "epoch": 1.4079198571223288, + "grad_norm": 0.3507655176583296, + "learning_rate": 7.428529476181664e-06, + "loss": 0.4611, + "step": 8573 + }, + { + "epoch": 1.4080840843306714, + "grad_norm": 0.34017407358390006, + "learning_rate": 7.428173387208416e-06, + "loss": 0.4712, + "step": 8574 + }, + { + "epoch": 1.4082483115390143, + "grad_norm": 0.30794027444638433, + "learning_rate": 7.427817266432324e-06, + "loss": 0.4707, + "step": 8575 + }, + { + "epoch": 1.408412538747357, + "grad_norm": 0.3143973834229577, + "learning_rate": 7.427461113857256e-06, + "loss": 0.451, + "step": 8576 + }, + { + "epoch": 1.4085767659556998, + "grad_norm": 0.5979945971913595, + "learning_rate": 7.42710492948708e-06, + "loss": 0.4639, + "step": 8577 + }, + { + "epoch": 1.4087409931640424, + "grad_norm": 0.2887234277853099, + "learning_rate": 7.426748713325664e-06, + "loss": 0.47, + "step": 8578 + }, + { + "epoch": 1.408905220372385, + "grad_norm": 0.33756145494788314, + "learning_rate": 7.426392465376879e-06, + "loss": 0.4775, + "step": 8579 + }, + { + "epoch": 1.409069447580728, + "grad_norm": 0.2800702567687854, + "learning_rate": 7.426036185644591e-06, + "loss": 0.4517, + "step": 8580 + }, + { + "epoch": 1.4092336747890708, + "grad_norm": 0.46678552703550924, + "learning_rate": 7.425679874132672e-06, + "loss": 0.4797, + "step": 8581 + }, + { + "epoch": 1.4093979019974134, + "grad_norm": 0.3356274085717647, + "learning_rate": 7.42532353084499e-06, + "loss": 0.4575, + "step": 8582 + }, + { + "epoch": 1.409562129205756, + "grad_norm": 0.3116451615310341, + "learning_rate": 7.424967155785418e-06, + "loss": 0.4616, + "step": 8583 + }, + { + "epoch": 1.409726356414099, + "grad_norm": 0.2696206283265771, + "learning_rate": 7.424610748957823e-06, + "loss": 0.4636, + "step": 8584 + }, + { + "epoch": 1.4098905836224416, + "grad_norm": 0.33772238707792296, + "learning_rate": 7.424254310366079e-06, + "loss": 0.4704, + "step": 8585 + }, + { + "epoch": 1.4100548108307844, + "grad_norm": 0.3055990131999076, + "learning_rate": 7.4238978400140565e-06, + "loss": 0.476, + "step": 8586 + }, + { + "epoch": 1.410219038039127, + "grad_norm": 0.3551794459285267, + "learning_rate": 7.423541337905626e-06, + "loss": 0.4573, + "step": 8587 + }, + { + "epoch": 1.41038326524747, + "grad_norm": 0.3186811436613112, + "learning_rate": 7.4231848040446605e-06, + "loss": 0.4709, + "step": 8588 + }, + { + "epoch": 1.4105474924558126, + "grad_norm": 0.49816413275093707, + "learning_rate": 7.4228282384350315e-06, + "loss": 0.4923, + "step": 8589 + }, + { + "epoch": 1.4107117196641554, + "grad_norm": 0.35030694382788935, + "learning_rate": 7.4224716410806126e-06, + "loss": 0.4781, + "step": 8590 + }, + { + "epoch": 1.410875946872498, + "grad_norm": 0.30048156452761454, + "learning_rate": 7.422115011985278e-06, + "loss": 0.4796, + "step": 8591 + }, + { + "epoch": 1.411040174080841, + "grad_norm": 0.3196508862449292, + "learning_rate": 7.421758351152898e-06, + "loss": 0.4611, + "step": 8592 + }, + { + "epoch": 1.4112044012891836, + "grad_norm": 0.4013596427282612, + "learning_rate": 7.421401658587347e-06, + "loss": 0.4694, + "step": 8593 + }, + { + "epoch": 1.4113686284975264, + "grad_norm": 0.2652885562504778, + "learning_rate": 7.4210449342924995e-06, + "loss": 0.4387, + "step": 8594 + }, + { + "epoch": 1.411532855705869, + "grad_norm": 0.30374163694020784, + "learning_rate": 7.4206881782722305e-06, + "loss": 0.4537, + "step": 8595 + }, + { + "epoch": 1.4116970829142117, + "grad_norm": 0.4612477117901834, + "learning_rate": 7.420331390530415e-06, + "loss": 0.4827, + "step": 8596 + }, + { + "epoch": 1.4118613101225546, + "grad_norm": 0.2844872627001613, + "learning_rate": 7.419974571070927e-06, + "loss": 0.4617, + "step": 8597 + }, + { + "epoch": 1.4120255373308974, + "grad_norm": 0.27905217514415914, + "learning_rate": 7.419617719897642e-06, + "loss": 0.4682, + "step": 8598 + }, + { + "epoch": 1.41218976453924, + "grad_norm": 0.27318352991817796, + "learning_rate": 7.4192608370144355e-06, + "loss": 0.4905, + "step": 8599 + }, + { + "epoch": 1.4123539917475827, + "grad_norm": 0.3544091023529306, + "learning_rate": 7.418903922425187e-06, + "loss": 0.4396, + "step": 8600 + }, + { + "epoch": 1.4125182189559256, + "grad_norm": 0.32549410255272687, + "learning_rate": 7.418546976133766e-06, + "loss": 0.457, + "step": 8601 + }, + { + "epoch": 1.4126824461642682, + "grad_norm": 0.33155795928650694, + "learning_rate": 7.418189998144056e-06, + "loss": 0.4621, + "step": 8602 + }, + { + "epoch": 1.412846673372611, + "grad_norm": 0.2963843090235932, + "learning_rate": 7.417832988459932e-06, + "loss": 0.4527, + "step": 8603 + }, + { + "epoch": 1.4130109005809537, + "grad_norm": 0.3009253973532018, + "learning_rate": 7.417475947085269e-06, + "loss": 0.4651, + "step": 8604 + }, + { + "epoch": 1.4131751277892965, + "grad_norm": 0.31122937905474224, + "learning_rate": 7.4171188740239475e-06, + "loss": 0.4778, + "step": 8605 + }, + { + "epoch": 1.4133393549976392, + "grad_norm": 0.35058399361945486, + "learning_rate": 7.416761769279846e-06, + "loss": 0.4452, + "step": 8606 + }, + { + "epoch": 1.413503582205982, + "grad_norm": 0.32076975303698535, + "learning_rate": 7.4164046328568404e-06, + "loss": 0.441, + "step": 8607 + }, + { + "epoch": 1.4136678094143247, + "grad_norm": 0.5070947218880237, + "learning_rate": 7.416047464758812e-06, + "loss": 0.4604, + "step": 8608 + }, + { + "epoch": 1.4138320366226675, + "grad_norm": 0.37100148779464004, + "learning_rate": 7.415690264989639e-06, + "loss": 0.4645, + "step": 8609 + }, + { + "epoch": 1.4139962638310102, + "grad_norm": 0.3308140782663967, + "learning_rate": 7.415333033553201e-06, + "loss": 0.47, + "step": 8610 + }, + { + "epoch": 1.414160491039353, + "grad_norm": 0.36507773865455445, + "learning_rate": 7.414975770453378e-06, + "loss": 0.4506, + "step": 8611 + }, + { + "epoch": 1.4143247182476957, + "grad_norm": 0.3049033070114644, + "learning_rate": 7.414618475694051e-06, + "loss": 0.4712, + "step": 8612 + }, + { + "epoch": 1.4144889454560383, + "grad_norm": 0.3199448512706157, + "learning_rate": 7.414261149279099e-06, + "loss": 0.4779, + "step": 8613 + }, + { + "epoch": 1.4146531726643812, + "grad_norm": 0.4204919239325658, + "learning_rate": 7.413903791212403e-06, + "loss": 0.4661, + "step": 8614 + }, + { + "epoch": 1.414817399872724, + "grad_norm": 0.31521651354486385, + "learning_rate": 7.413546401497846e-06, + "loss": 0.4816, + "step": 8615 + }, + { + "epoch": 1.4149816270810667, + "grad_norm": 0.35333273072663923, + "learning_rate": 7.413188980139309e-06, + "loss": 0.4628, + "step": 8616 + }, + { + "epoch": 1.4151458542894093, + "grad_norm": 0.289281221874642, + "learning_rate": 7.412831527140672e-06, + "loss": 0.4563, + "step": 8617 + }, + { + "epoch": 1.4153100814977522, + "grad_norm": 0.32144669061901465, + "learning_rate": 7.412474042505819e-06, + "loss": 0.4588, + "step": 8618 + }, + { + "epoch": 1.4154743087060948, + "grad_norm": 0.46956177346878347, + "learning_rate": 7.412116526238633e-06, + "loss": 0.4575, + "step": 8619 + }, + { + "epoch": 1.4156385359144377, + "grad_norm": 0.38669633047785396, + "learning_rate": 7.411758978342996e-06, + "loss": 0.4939, + "step": 8620 + }, + { + "epoch": 1.4158027631227803, + "grad_norm": 0.3337701977281101, + "learning_rate": 7.411401398822792e-06, + "loss": 0.455, + "step": 8621 + }, + { + "epoch": 1.4159669903311232, + "grad_norm": 0.27205267115254267, + "learning_rate": 7.411043787681904e-06, + "loss": 0.4595, + "step": 8622 + }, + { + "epoch": 1.4161312175394658, + "grad_norm": 0.31901257150921336, + "learning_rate": 7.410686144924216e-06, + "loss": 0.4744, + "step": 8623 + }, + { + "epoch": 1.4162954447478087, + "grad_norm": 0.29990230673339774, + "learning_rate": 7.410328470553614e-06, + "loss": 0.4677, + "step": 8624 + }, + { + "epoch": 1.4164596719561513, + "grad_norm": 0.432034413195062, + "learning_rate": 7.409970764573981e-06, + "loss": 0.4718, + "step": 8625 + }, + { + "epoch": 1.4166238991644942, + "grad_norm": 0.45117493263050745, + "learning_rate": 7.409613026989202e-06, + "loss": 0.4503, + "step": 8626 + }, + { + "epoch": 1.4167881263728368, + "grad_norm": 0.288399028366705, + "learning_rate": 7.409255257803164e-06, + "loss": 0.477, + "step": 8627 + }, + { + "epoch": 1.4169523535811797, + "grad_norm": 0.2742665130026796, + "learning_rate": 7.40889745701975e-06, + "loss": 0.4793, + "step": 8628 + }, + { + "epoch": 1.4171165807895223, + "grad_norm": 0.28388631767832506, + "learning_rate": 7.408539624642849e-06, + "loss": 0.4707, + "step": 8629 + }, + { + "epoch": 1.417280807997865, + "grad_norm": 0.3762696631629608, + "learning_rate": 7.408181760676345e-06, + "loss": 0.4742, + "step": 8630 + }, + { + "epoch": 1.4174450352062078, + "grad_norm": 0.30604710663862517, + "learning_rate": 7.407823865124126e-06, + "loss": 0.4893, + "step": 8631 + }, + { + "epoch": 1.4176092624145507, + "grad_norm": 0.30730097863583844, + "learning_rate": 7.407465937990079e-06, + "loss": 0.4583, + "step": 8632 + }, + { + "epoch": 1.4177734896228933, + "grad_norm": 0.28177094476792225, + "learning_rate": 7.407107979278093e-06, + "loss": 0.4893, + "step": 8633 + }, + { + "epoch": 1.417937716831236, + "grad_norm": 0.3339766813808621, + "learning_rate": 7.406749988992052e-06, + "loss": 0.4782, + "step": 8634 + }, + { + "epoch": 1.4181019440395788, + "grad_norm": 0.3550560311986116, + "learning_rate": 7.4063919671358456e-06, + "loss": 0.4521, + "step": 8635 + }, + { + "epoch": 1.4182661712479214, + "grad_norm": 0.31422102493092624, + "learning_rate": 7.406033913713365e-06, + "loss": 0.4574, + "step": 8636 + }, + { + "epoch": 1.4184303984562643, + "grad_norm": 0.36862943811813514, + "learning_rate": 7.405675828728497e-06, + "loss": 0.4612, + "step": 8637 + }, + { + "epoch": 1.418594625664607, + "grad_norm": 0.3823455427636954, + "learning_rate": 7.405317712185129e-06, + "loss": 0.4642, + "step": 8638 + }, + { + "epoch": 1.4187588528729498, + "grad_norm": 0.6697363617447964, + "learning_rate": 7.4049595640871534e-06, + "loss": 0.4611, + "step": 8639 + }, + { + "epoch": 1.4189230800812924, + "grad_norm": 0.3082877536536946, + "learning_rate": 7.404601384438458e-06, + "loss": 0.4553, + "step": 8640 + }, + { + "epoch": 1.4190873072896353, + "grad_norm": 0.3866697717486414, + "learning_rate": 7.404243173242936e-06, + "loss": 0.4717, + "step": 8641 + }, + { + "epoch": 1.419251534497978, + "grad_norm": 0.3095639973516765, + "learning_rate": 7.403884930504474e-06, + "loss": 0.4899, + "step": 8642 + }, + { + "epoch": 1.4194157617063208, + "grad_norm": 0.3281301085749949, + "learning_rate": 7.403526656226965e-06, + "loss": 0.4433, + "step": 8643 + }, + { + "epoch": 1.4195799889146634, + "grad_norm": 0.3093984334312015, + "learning_rate": 7.4031683504142985e-06, + "loss": 0.4617, + "step": 8644 + }, + { + "epoch": 1.4197442161230063, + "grad_norm": 0.3132767270598841, + "learning_rate": 7.402810013070369e-06, + "loss": 0.4648, + "step": 8645 + }, + { + "epoch": 1.419908443331349, + "grad_norm": 0.35555840733129107, + "learning_rate": 7.4024516441990665e-06, + "loss": 0.4825, + "step": 8646 + }, + { + "epoch": 1.4200726705396916, + "grad_norm": 0.3605843369817561, + "learning_rate": 7.402093243804283e-06, + "loss": 0.5036, + "step": 8647 + }, + { + "epoch": 1.4202368977480344, + "grad_norm": 0.31417513230023925, + "learning_rate": 7.401734811889911e-06, + "loss": 0.4756, + "step": 8648 + }, + { + "epoch": 1.4204011249563773, + "grad_norm": 0.3143056077749063, + "learning_rate": 7.401376348459846e-06, + "loss": 0.463, + "step": 8649 + }, + { + "epoch": 1.42056535216472, + "grad_norm": 0.33212448977759, + "learning_rate": 7.401017853517978e-06, + "loss": 0.4869, + "step": 8650 + }, + { + "epoch": 1.4207295793730625, + "grad_norm": 0.4137208235695577, + "learning_rate": 7.400659327068202e-06, + "loss": 0.4866, + "step": 8651 + }, + { + "epoch": 1.4208938065814054, + "grad_norm": 0.3671152000569904, + "learning_rate": 7.400300769114411e-06, + "loss": 0.4672, + "step": 8652 + }, + { + "epoch": 1.421058033789748, + "grad_norm": 0.36134698599484183, + "learning_rate": 7.399942179660502e-06, + "loss": 0.4803, + "step": 8653 + }, + { + "epoch": 1.421222260998091, + "grad_norm": 0.3226969808681514, + "learning_rate": 7.399583558710367e-06, + "loss": 0.4564, + "step": 8654 + }, + { + "epoch": 1.4213864882064335, + "grad_norm": 0.3360087789001294, + "learning_rate": 7.399224906267901e-06, + "loss": 0.4695, + "step": 8655 + }, + { + "epoch": 1.4215507154147764, + "grad_norm": 0.3999377209987789, + "learning_rate": 7.398866222337e-06, + "loss": 0.4524, + "step": 8656 + }, + { + "epoch": 1.421714942623119, + "grad_norm": 0.2619494941802104, + "learning_rate": 7.39850750692156e-06, + "loss": 0.4738, + "step": 8657 + }, + { + "epoch": 1.421879169831462, + "grad_norm": 0.3377812599960666, + "learning_rate": 7.398148760025479e-06, + "loss": 0.4783, + "step": 8658 + }, + { + "epoch": 1.4220433970398045, + "grad_norm": 0.5649331309834287, + "learning_rate": 7.397789981652648e-06, + "loss": 0.4686, + "step": 8659 + }, + { + "epoch": 1.4222076242481472, + "grad_norm": 0.266427925002802, + "learning_rate": 7.3974311718069685e-06, + "loss": 0.4613, + "step": 8660 + }, + { + "epoch": 1.42237185145649, + "grad_norm": 0.3444625103911929, + "learning_rate": 7.397072330492334e-06, + "loss": 0.4537, + "step": 8661 + }, + { + "epoch": 1.422536078664833, + "grad_norm": 0.34059735816769793, + "learning_rate": 7.396713457712646e-06, + "loss": 0.4899, + "step": 8662 + }, + { + "epoch": 1.4227003058731755, + "grad_norm": 0.3790439525703914, + "learning_rate": 7.396354553471799e-06, + "loss": 0.4625, + "step": 8663 + }, + { + "epoch": 1.4228645330815182, + "grad_norm": 0.39460946921143725, + "learning_rate": 7.3959956177736906e-06, + "loss": 0.4553, + "step": 8664 + }, + { + "epoch": 1.423028760289861, + "grad_norm": 0.2817981735232756, + "learning_rate": 7.3956366506222225e-06, + "loss": 0.4691, + "step": 8665 + }, + { + "epoch": 1.4231929874982039, + "grad_norm": 0.4378902762621768, + "learning_rate": 7.39527765202129e-06, + "loss": 0.4843, + "step": 8666 + }, + { + "epoch": 1.4233572147065465, + "grad_norm": 0.975584940866872, + "learning_rate": 7.394918621974795e-06, + "loss": 0.4811, + "step": 8667 + }, + { + "epoch": 1.4235214419148892, + "grad_norm": 0.7960565831153253, + "learning_rate": 7.394559560486634e-06, + "loss": 0.4538, + "step": 8668 + }, + { + "epoch": 1.423685669123232, + "grad_norm": 0.30691411758843296, + "learning_rate": 7.394200467560708e-06, + "loss": 0.4838, + "step": 8669 + }, + { + "epoch": 1.4238498963315747, + "grad_norm": 0.2963936182755594, + "learning_rate": 7.39384134320092e-06, + "loss": 0.4711, + "step": 8670 + }, + { + "epoch": 1.4240141235399175, + "grad_norm": 0.4976685910588599, + "learning_rate": 7.393482187411165e-06, + "loss": 0.4713, + "step": 8671 + }, + { + "epoch": 1.4241783507482602, + "grad_norm": 0.3144718758400609, + "learning_rate": 7.393123000195349e-06, + "loss": 0.4841, + "step": 8672 + }, + { + "epoch": 1.424342577956603, + "grad_norm": 0.30284813504731734, + "learning_rate": 7.392763781557369e-06, + "loss": 0.4538, + "step": 8673 + }, + { + "epoch": 1.4245068051649457, + "grad_norm": 0.34444613471749935, + "learning_rate": 7.3924045315011294e-06, + "loss": 0.4684, + "step": 8674 + }, + { + "epoch": 1.4246710323732885, + "grad_norm": 0.30198678685548713, + "learning_rate": 7.39204525003053e-06, + "loss": 0.4796, + "step": 8675 + }, + { + "epoch": 1.4248352595816312, + "grad_norm": 0.29695760309471625, + "learning_rate": 7.391685937149474e-06, + "loss": 0.4972, + "step": 8676 + }, + { + "epoch": 1.4249994867899738, + "grad_norm": 0.3142088847097196, + "learning_rate": 7.391326592861863e-06, + "loss": 0.4796, + "step": 8677 + }, + { + "epoch": 1.4251637139983167, + "grad_norm": 0.3184273247226719, + "learning_rate": 7.390967217171602e-06, + "loss": 0.4991, + "step": 8678 + }, + { + "epoch": 1.4253279412066595, + "grad_norm": 0.3155069256631096, + "learning_rate": 7.390607810082593e-06, + "loss": 0.4629, + "step": 8679 + }, + { + "epoch": 1.4254921684150021, + "grad_norm": 0.39120624749395266, + "learning_rate": 7.390248371598738e-06, + "loss": 0.4489, + "step": 8680 + }, + { + "epoch": 1.4256563956233448, + "grad_norm": 0.3245465147182549, + "learning_rate": 7.389888901723942e-06, + "loss": 0.4579, + "step": 8681 + }, + { + "epoch": 1.4258206228316876, + "grad_norm": 0.3394055494225815, + "learning_rate": 7.38952940046211e-06, + "loss": 0.4757, + "step": 8682 + }, + { + "epoch": 1.4259848500400305, + "grad_norm": 0.32787591201901345, + "learning_rate": 7.389169867817145e-06, + "loss": 0.4814, + "step": 8683 + }, + { + "epoch": 1.4261490772483731, + "grad_norm": 0.49850515920788757, + "learning_rate": 7.388810303792953e-06, + "loss": 0.4779, + "step": 8684 + }, + { + "epoch": 1.4263133044567158, + "grad_norm": 0.4000878439452594, + "learning_rate": 7.388450708393439e-06, + "loss": 0.4735, + "step": 8685 + }, + { + "epoch": 1.4264775316650586, + "grad_norm": 0.3884268909927122, + "learning_rate": 7.388091081622508e-06, + "loss": 0.4615, + "step": 8686 + }, + { + "epoch": 1.4266417588734013, + "grad_norm": 0.47546552185240776, + "learning_rate": 7.387731423484068e-06, + "loss": 0.4673, + "step": 8687 + }, + { + "epoch": 1.4268059860817441, + "grad_norm": 0.3451691022903009, + "learning_rate": 7.387371733982022e-06, + "loss": 0.4527, + "step": 8688 + }, + { + "epoch": 1.4269702132900868, + "grad_norm": 0.3351213061571729, + "learning_rate": 7.387012013120278e-06, + "loss": 0.4505, + "step": 8689 + }, + { + "epoch": 1.4271344404984296, + "grad_norm": 0.33697957511100923, + "learning_rate": 7.386652260902743e-06, + "loss": 0.4526, + "step": 8690 + }, + { + "epoch": 1.4272986677067723, + "grad_norm": 0.3288480616443145, + "learning_rate": 7.3862924773333266e-06, + "loss": 0.4605, + "step": 8691 + }, + { + "epoch": 1.4274628949151151, + "grad_norm": 0.2741337484391843, + "learning_rate": 7.385932662415932e-06, + "loss": 0.4733, + "step": 8692 + }, + { + "epoch": 1.4276271221234578, + "grad_norm": 0.35755160262639774, + "learning_rate": 7.38557281615447e-06, + "loss": 0.4764, + "step": 8693 + }, + { + "epoch": 1.4277913493318004, + "grad_norm": 0.42377404035966615, + "learning_rate": 7.3852129385528476e-06, + "loss": 0.4806, + "step": 8694 + }, + { + "epoch": 1.4279555765401433, + "grad_norm": 0.3601017434270535, + "learning_rate": 7.3848530296149756e-06, + "loss": 0.4878, + "step": 8695 + }, + { + "epoch": 1.4281198037484861, + "grad_norm": 0.2642369260132183, + "learning_rate": 7.38449308934476e-06, + "loss": 0.46, + "step": 8696 + }, + { + "epoch": 1.4282840309568288, + "grad_norm": 0.3231395826179842, + "learning_rate": 7.3841331177461114e-06, + "loss": 0.4513, + "step": 8697 + }, + { + "epoch": 1.4284482581651714, + "grad_norm": 0.5167362775726638, + "learning_rate": 7.38377311482294e-06, + "loss": 0.4731, + "step": 8698 + }, + { + "epoch": 1.4286124853735143, + "grad_norm": 0.31768177825252264, + "learning_rate": 7.383413080579156e-06, + "loss": 0.4808, + "step": 8699 + }, + { + "epoch": 1.4287767125818571, + "grad_norm": 0.2670760327678991, + "learning_rate": 7.383053015018668e-06, + "loss": 0.4463, + "step": 8700 + }, + { + "epoch": 1.4289409397901998, + "grad_norm": 0.7385714108339203, + "learning_rate": 7.382692918145388e-06, + "loss": 0.4714, + "step": 8701 + }, + { + "epoch": 1.4291051669985424, + "grad_norm": 0.3665760851986582, + "learning_rate": 7.382332789963226e-06, + "loss": 0.45, + "step": 8702 + }, + { + "epoch": 1.4292693942068853, + "grad_norm": 0.37412406306082946, + "learning_rate": 7.381972630476095e-06, + "loss": 0.4606, + "step": 8703 + }, + { + "epoch": 1.429433621415228, + "grad_norm": 0.282501205455874, + "learning_rate": 7.381612439687906e-06, + "loss": 0.4713, + "step": 8704 + }, + { + "epoch": 1.4295978486235708, + "grad_norm": 0.3175716080706612, + "learning_rate": 7.3812522176025705e-06, + "loss": 0.4505, + "step": 8705 + }, + { + "epoch": 1.4297620758319134, + "grad_norm": 0.30021046264890927, + "learning_rate": 7.380891964224001e-06, + "loss": 0.468, + "step": 8706 + }, + { + "epoch": 1.4299263030402563, + "grad_norm": 0.36843733896681896, + "learning_rate": 7.38053167955611e-06, + "loss": 0.4797, + "step": 8707 + }, + { + "epoch": 1.430090530248599, + "grad_norm": 0.4091002966536038, + "learning_rate": 7.380171363602812e-06, + "loss": 0.4429, + "step": 8708 + }, + { + "epoch": 1.4302547574569417, + "grad_norm": 0.34868960892958695, + "learning_rate": 7.379811016368018e-06, + "loss": 0.4822, + "step": 8709 + }, + { + "epoch": 1.4304189846652844, + "grad_norm": 0.30935125392275414, + "learning_rate": 7.379450637855644e-06, + "loss": 0.4717, + "step": 8710 + }, + { + "epoch": 1.430583211873627, + "grad_norm": 0.39039071388522256, + "learning_rate": 7.379090228069602e-06, + "loss": 0.4512, + "step": 8711 + }, + { + "epoch": 1.4307474390819699, + "grad_norm": 0.34733265965538085, + "learning_rate": 7.378729787013809e-06, + "loss": 0.4544, + "step": 8712 + }, + { + "epoch": 1.4309116662903127, + "grad_norm": 0.30168326488643704, + "learning_rate": 7.3783693146921765e-06, + "loss": 0.4871, + "step": 8713 + }, + { + "epoch": 1.4310758934986554, + "grad_norm": 0.30953253420001814, + "learning_rate": 7.378008811108622e-06, + "loss": 0.4725, + "step": 8714 + }, + { + "epoch": 1.431240120706998, + "grad_norm": 0.38832020065310613, + "learning_rate": 7.377648276267061e-06, + "loss": 0.4744, + "step": 8715 + }, + { + "epoch": 1.4314043479153409, + "grad_norm": 0.3305266235225147, + "learning_rate": 7.377287710171408e-06, + "loss": 0.4819, + "step": 8716 + }, + { + "epoch": 1.4315685751236837, + "grad_norm": 0.34863300149843024, + "learning_rate": 7.376927112825579e-06, + "loss": 0.4465, + "step": 8717 + }, + { + "epoch": 1.4317328023320264, + "grad_norm": 0.3006649152307014, + "learning_rate": 7.376566484233492e-06, + "loss": 0.4851, + "step": 8718 + }, + { + "epoch": 1.431897029540369, + "grad_norm": 0.4097936421003854, + "learning_rate": 7.3762058243990615e-06, + "loss": 0.4564, + "step": 8719 + }, + { + "epoch": 1.4320612567487119, + "grad_norm": 0.29230226483539684, + "learning_rate": 7.3758451333262075e-06, + "loss": 0.4433, + "step": 8720 + }, + { + "epoch": 1.4322254839570545, + "grad_norm": 0.48553587471592025, + "learning_rate": 7.375484411018845e-06, + "loss": 0.4547, + "step": 8721 + }, + { + "epoch": 1.4323897111653974, + "grad_norm": 0.322278646726219, + "learning_rate": 7.375123657480893e-06, + "loss": 0.4635, + "step": 8722 + }, + { + "epoch": 1.43255393837374, + "grad_norm": 0.3545662737503202, + "learning_rate": 7.374762872716269e-06, + "loss": 0.4687, + "step": 8723 + }, + { + "epoch": 1.4327181655820829, + "grad_norm": 0.3099671112478105, + "learning_rate": 7.374402056728893e-06, + "loss": 0.4638, + "step": 8724 + }, + { + "epoch": 1.4328823927904255, + "grad_norm": 0.3716005362691296, + "learning_rate": 7.374041209522682e-06, + "loss": 0.4784, + "step": 8725 + }, + { + "epoch": 1.4330466199987684, + "grad_norm": 0.27171238151285476, + "learning_rate": 7.373680331101554e-06, + "loss": 0.4728, + "step": 8726 + }, + { + "epoch": 1.433210847207111, + "grad_norm": 0.29662911135939474, + "learning_rate": 7.373319421469432e-06, + "loss": 0.4717, + "step": 8727 + }, + { + "epoch": 1.4333750744154536, + "grad_norm": 0.2888872805223644, + "learning_rate": 7.3729584806302346e-06, + "loss": 0.4556, + "step": 8728 + }, + { + "epoch": 1.4335393016237965, + "grad_norm": 0.8980499551263118, + "learning_rate": 7.372597508587881e-06, + "loss": 0.4434, + "step": 8729 + }, + { + "epoch": 1.4337035288321394, + "grad_norm": 0.31920864527645193, + "learning_rate": 7.372236505346292e-06, + "loss": 0.4707, + "step": 8730 + }, + { + "epoch": 1.433867756040482, + "grad_norm": 0.31494192651957037, + "learning_rate": 7.371875470909388e-06, + "loss": 0.4751, + "step": 8731 + }, + { + "epoch": 1.4340319832488246, + "grad_norm": 0.3646059146697081, + "learning_rate": 7.371514405281091e-06, + "loss": 0.4542, + "step": 8732 + }, + { + "epoch": 1.4341962104571675, + "grad_norm": 0.29678542187972806, + "learning_rate": 7.371153308465324e-06, + "loss": 0.4688, + "step": 8733 + }, + { + "epoch": 1.4343604376655104, + "grad_norm": 0.30307859935590387, + "learning_rate": 7.370792180466006e-06, + "loss": 0.4811, + "step": 8734 + }, + { + "epoch": 1.434524664873853, + "grad_norm": 0.35944445084086313, + "learning_rate": 7.370431021287059e-06, + "loss": 0.4892, + "step": 8735 + }, + { + "epoch": 1.4346888920821956, + "grad_norm": 0.3054816830042426, + "learning_rate": 7.37006983093241e-06, + "loss": 0.4489, + "step": 8736 + }, + { + "epoch": 1.4348531192905385, + "grad_norm": 0.3480155627029634, + "learning_rate": 7.369708609405977e-06, + "loss": 0.4521, + "step": 8737 + }, + { + "epoch": 1.4350173464988811, + "grad_norm": 0.33385036419258024, + "learning_rate": 7.369347356711686e-06, + "loss": 0.4694, + "step": 8738 + }, + { + "epoch": 1.435181573707224, + "grad_norm": 0.2680938800862615, + "learning_rate": 7.368986072853459e-06, + "loss": 0.4641, + "step": 8739 + }, + { + "epoch": 1.4353458009155666, + "grad_norm": 0.45577800534453167, + "learning_rate": 7.36862475783522e-06, + "loss": 0.4672, + "step": 8740 + }, + { + "epoch": 1.4355100281239095, + "grad_norm": 0.3124628033703303, + "learning_rate": 7.3682634116608955e-06, + "loss": 0.4613, + "step": 8741 + }, + { + "epoch": 1.4356742553322521, + "grad_norm": 0.29964203647181226, + "learning_rate": 7.367902034334407e-06, + "loss": 0.4702, + "step": 8742 + }, + { + "epoch": 1.435838482540595, + "grad_norm": 0.3592963394992377, + "learning_rate": 7.367540625859681e-06, + "loss": 0.4741, + "step": 8743 + }, + { + "epoch": 1.4360027097489376, + "grad_norm": 0.3646717741074932, + "learning_rate": 7.367179186240642e-06, + "loss": 0.459, + "step": 8744 + }, + { + "epoch": 1.4361669369572803, + "grad_norm": 0.3194017770756867, + "learning_rate": 7.366817715481216e-06, + "loss": 0.4661, + "step": 8745 + }, + { + "epoch": 1.4363311641656231, + "grad_norm": 0.8041761901096787, + "learning_rate": 7.36645621358533e-06, + "loss": 0.4723, + "step": 8746 + }, + { + "epoch": 1.436495391373966, + "grad_norm": 0.3466083547248436, + "learning_rate": 7.36609468055691e-06, + "loss": 0.4528, + "step": 8747 + }, + { + "epoch": 1.4366596185823086, + "grad_norm": 0.43236098119183647, + "learning_rate": 7.36573311639988e-06, + "loss": 0.4912, + "step": 8748 + }, + { + "epoch": 1.4368238457906513, + "grad_norm": 0.48201449975080524, + "learning_rate": 7.365371521118171e-06, + "loss": 0.4662, + "step": 8749 + }, + { + "epoch": 1.4369880729989941, + "grad_norm": 0.2897789566298985, + "learning_rate": 7.365009894715706e-06, + "loss": 0.4697, + "step": 8750 + }, + { + "epoch": 1.437152300207337, + "grad_norm": 0.4285770698212275, + "learning_rate": 7.364648237196416e-06, + "loss": 0.4645, + "step": 8751 + }, + { + "epoch": 1.4373165274156796, + "grad_norm": 0.30895575283505117, + "learning_rate": 7.364286548564226e-06, + "loss": 0.4635, + "step": 8752 + }, + { + "epoch": 1.4374807546240222, + "grad_norm": 0.3878587785336352, + "learning_rate": 7.3639248288230685e-06, + "loss": 0.4717, + "step": 8753 + }, + { + "epoch": 1.437644981832365, + "grad_norm": 0.31587243157295675, + "learning_rate": 7.3635630779768694e-06, + "loss": 0.4807, + "step": 8754 + }, + { + "epoch": 1.4378092090407077, + "grad_norm": 0.29345313599614753, + "learning_rate": 7.363201296029556e-06, + "loss": 0.4743, + "step": 8755 + }, + { + "epoch": 1.4379734362490506, + "grad_norm": 0.29824132861417796, + "learning_rate": 7.362839482985061e-06, + "loss": 0.4708, + "step": 8756 + }, + { + "epoch": 1.4381376634573932, + "grad_norm": 0.3150943702529251, + "learning_rate": 7.362477638847312e-06, + "loss": 0.4803, + "step": 8757 + }, + { + "epoch": 1.438301890665736, + "grad_norm": 0.3035150249486316, + "learning_rate": 7.362115763620241e-06, + "loss": 0.4642, + "step": 8758 + }, + { + "epoch": 1.4384661178740787, + "grad_norm": 0.49228834262052035, + "learning_rate": 7.361753857307775e-06, + "loss": 0.4644, + "step": 8759 + }, + { + "epoch": 1.4386303450824216, + "grad_norm": 0.425149859034616, + "learning_rate": 7.3613919199138464e-06, + "loss": 0.4573, + "step": 8760 + }, + { + "epoch": 1.4387945722907642, + "grad_norm": 0.38725281483202917, + "learning_rate": 7.361029951442388e-06, + "loss": 0.4717, + "step": 8761 + }, + { + "epoch": 1.4389587994991069, + "grad_norm": 0.304654791341761, + "learning_rate": 7.360667951897329e-06, + "loss": 0.475, + "step": 8762 + }, + { + "epoch": 1.4391230267074497, + "grad_norm": 0.29352327279914797, + "learning_rate": 7.3603059212826e-06, + "loss": 0.4814, + "step": 8763 + }, + { + "epoch": 1.4392872539157926, + "grad_norm": 0.39246611782479257, + "learning_rate": 7.359943859602135e-06, + "loss": 0.4847, + "step": 8764 + }, + { + "epoch": 1.4394514811241352, + "grad_norm": 0.3311896456985306, + "learning_rate": 7.359581766859867e-06, + "loss": 0.4603, + "step": 8765 + }, + { + "epoch": 1.4396157083324779, + "grad_norm": 0.359044957795356, + "learning_rate": 7.359219643059727e-06, + "loss": 0.4447, + "step": 8766 + }, + { + "epoch": 1.4397799355408207, + "grad_norm": 0.26938397873450154, + "learning_rate": 7.3588574882056485e-06, + "loss": 0.4674, + "step": 8767 + }, + { + "epoch": 1.4399441627491636, + "grad_norm": 0.4728328295191253, + "learning_rate": 7.358495302301566e-06, + "loss": 0.4584, + "step": 8768 + }, + { + "epoch": 1.4401083899575062, + "grad_norm": 0.28764939085722807, + "learning_rate": 7.35813308535141e-06, + "loss": 0.5044, + "step": 8769 + }, + { + "epoch": 1.4402726171658489, + "grad_norm": 0.64332414790041, + "learning_rate": 7.357770837359119e-06, + "loss": 0.4699, + "step": 8770 + }, + { + "epoch": 1.4404368443741917, + "grad_norm": 0.26700818401796017, + "learning_rate": 7.357408558328623e-06, + "loss": 0.4667, + "step": 8771 + }, + { + "epoch": 1.4406010715825344, + "grad_norm": 0.3857951942674286, + "learning_rate": 7.35704624826386e-06, + "loss": 0.4705, + "step": 8772 + }, + { + "epoch": 1.4407652987908772, + "grad_norm": 0.2941631079784591, + "learning_rate": 7.356683907168762e-06, + "loss": 0.4678, + "step": 8773 + }, + { + "epoch": 1.4409295259992199, + "grad_norm": 0.35313285672785943, + "learning_rate": 7.356321535047269e-06, + "loss": 0.4609, + "step": 8774 + }, + { + "epoch": 1.4410937532075627, + "grad_norm": 0.30558355477754984, + "learning_rate": 7.355959131903313e-06, + "loss": 0.491, + "step": 8775 + }, + { + "epoch": 1.4412579804159054, + "grad_norm": 0.30028824124065273, + "learning_rate": 7.355596697740828e-06, + "loss": 0.4782, + "step": 8776 + }, + { + "epoch": 1.4414222076242482, + "grad_norm": 0.3974043321955806, + "learning_rate": 7.355234232563758e-06, + "loss": 0.466, + "step": 8777 + }, + { + "epoch": 1.4415864348325909, + "grad_norm": 0.32176678659026303, + "learning_rate": 7.354871736376031e-06, + "loss": 0.4519, + "step": 8778 + }, + { + "epoch": 1.4417506620409335, + "grad_norm": 0.32792249317240885, + "learning_rate": 7.354509209181591e-06, + "loss": 0.4738, + "step": 8779 + }, + { + "epoch": 1.4419148892492764, + "grad_norm": 0.30658790737339253, + "learning_rate": 7.354146650984372e-06, + "loss": 0.462, + "step": 8780 + }, + { + "epoch": 1.4420791164576192, + "grad_norm": 0.4592510866299841, + "learning_rate": 7.35378406178831e-06, + "loss": 0.4529, + "step": 8781 + }, + { + "epoch": 1.4422433436659619, + "grad_norm": 0.2998740701505465, + "learning_rate": 7.353421441597348e-06, + "loss": 0.4682, + "step": 8782 + }, + { + "epoch": 1.4424075708743045, + "grad_norm": 0.355911208435906, + "learning_rate": 7.353058790415422e-06, + "loss": 0.4655, + "step": 8783 + }, + { + "epoch": 1.4425717980826473, + "grad_norm": 0.32644863293812215, + "learning_rate": 7.352696108246469e-06, + "loss": 0.4618, + "step": 8784 + }, + { + "epoch": 1.4427360252909902, + "grad_norm": 0.2730444038859821, + "learning_rate": 7.352333395094429e-06, + "loss": 0.4931, + "step": 8785 + }, + { + "epoch": 1.4429002524993328, + "grad_norm": 0.32896780601935, + "learning_rate": 7.351970650963243e-06, + "loss": 0.4527, + "step": 8786 + }, + { + "epoch": 1.4430644797076755, + "grad_norm": 0.3141642761360534, + "learning_rate": 7.35160787585685e-06, + "loss": 0.4731, + "step": 8787 + }, + { + "epoch": 1.4432287069160183, + "grad_norm": 0.3259242243345612, + "learning_rate": 7.35124506977919e-06, + "loss": 0.472, + "step": 8788 + }, + { + "epoch": 1.443392934124361, + "grad_norm": 0.28421921594410116, + "learning_rate": 7.350882232734202e-06, + "loss": 0.4423, + "step": 8789 + }, + { + "epoch": 1.4435571613327038, + "grad_norm": 0.3161737021968298, + "learning_rate": 7.350519364725829e-06, + "loss": 0.4716, + "step": 8790 + }, + { + "epoch": 1.4437213885410465, + "grad_norm": 0.2932958787210009, + "learning_rate": 7.350156465758012e-06, + "loss": 0.4774, + "step": 8791 + }, + { + "epoch": 1.4438856157493893, + "grad_norm": 0.6697209643610852, + "learning_rate": 7.3497935358346894e-06, + "loss": 0.4357, + "step": 8792 + }, + { + "epoch": 1.444049842957732, + "grad_norm": 0.3264213977418509, + "learning_rate": 7.349430574959807e-06, + "loss": 0.4725, + "step": 8793 + }, + { + "epoch": 1.4442140701660748, + "grad_norm": 0.32557575314880266, + "learning_rate": 7.349067583137305e-06, + "loss": 0.4615, + "step": 8794 + }, + { + "epoch": 1.4443782973744175, + "grad_norm": 0.31900811247252975, + "learning_rate": 7.348704560371126e-06, + "loss": 0.4705, + "step": 8795 + }, + { + "epoch": 1.4445425245827601, + "grad_norm": 0.35273304255974214, + "learning_rate": 7.348341506665211e-06, + "loss": 0.4836, + "step": 8796 + }, + { + "epoch": 1.444706751791103, + "grad_norm": 0.34808125964253456, + "learning_rate": 7.347978422023507e-06, + "loss": 0.4652, + "step": 8797 + }, + { + "epoch": 1.4448709789994458, + "grad_norm": 0.4687086087072129, + "learning_rate": 7.347615306449954e-06, + "loss": 0.473, + "step": 8798 + }, + { + "epoch": 1.4450352062077885, + "grad_norm": 0.3125715957895599, + "learning_rate": 7.347252159948498e-06, + "loss": 0.4606, + "step": 8799 + }, + { + "epoch": 1.445199433416131, + "grad_norm": 0.2823950209487057, + "learning_rate": 7.346888982523081e-06, + "loss": 0.4761, + "step": 8800 + }, + { + "epoch": 1.445363660624474, + "grad_norm": 0.27396154510210274, + "learning_rate": 7.3465257741776495e-06, + "loss": 0.4841, + "step": 8801 + }, + { + "epoch": 1.4455278878328168, + "grad_norm": 0.38488801653092736, + "learning_rate": 7.346162534916148e-06, + "loss": 0.4717, + "step": 8802 + }, + { + "epoch": 1.4456921150411595, + "grad_norm": 0.3296881924583521, + "learning_rate": 7.34579926474252e-06, + "loss": 0.4636, + "step": 8803 + }, + { + "epoch": 1.445856342249502, + "grad_norm": 0.4285832646198502, + "learning_rate": 7.345435963660713e-06, + "loss": 0.4575, + "step": 8804 + }, + { + "epoch": 1.446020569457845, + "grad_norm": 0.34978061084114653, + "learning_rate": 7.345072631674672e-06, + "loss": 0.4777, + "step": 8805 + }, + { + "epoch": 1.4461847966661876, + "grad_norm": 0.308759693755405, + "learning_rate": 7.344709268788342e-06, + "loss": 0.4674, + "step": 8806 + }, + { + "epoch": 1.4463490238745305, + "grad_norm": 0.37776335856045, + "learning_rate": 7.344345875005671e-06, + "loss": 0.4734, + "step": 8807 + }, + { + "epoch": 1.446513251082873, + "grad_norm": 0.30213707597068723, + "learning_rate": 7.343982450330605e-06, + "loss": 0.4677, + "step": 8808 + }, + { + "epoch": 1.446677478291216, + "grad_norm": 0.2863664625153726, + "learning_rate": 7.343618994767093e-06, + "loss": 0.4757, + "step": 8809 + }, + { + "epoch": 1.4468417054995586, + "grad_norm": 0.4567100127507095, + "learning_rate": 7.343255508319079e-06, + "loss": 0.4826, + "step": 8810 + }, + { + "epoch": 1.4470059327079015, + "grad_norm": 0.3235014519664871, + "learning_rate": 7.342891990990514e-06, + "loss": 0.4877, + "step": 8811 + }, + { + "epoch": 1.447170159916244, + "grad_norm": 0.48932063009730403, + "learning_rate": 7.342528442785346e-06, + "loss": 0.4597, + "step": 8812 + }, + { + "epoch": 1.4473343871245867, + "grad_norm": 0.29500404335881325, + "learning_rate": 7.34216486370752e-06, + "loss": 0.4617, + "step": 8813 + }, + { + "epoch": 1.4474986143329296, + "grad_norm": 0.32921988580938477, + "learning_rate": 7.341801253760988e-06, + "loss": 0.4409, + "step": 8814 + }, + { + "epoch": 1.4476628415412724, + "grad_norm": 0.32024711284584884, + "learning_rate": 7.341437612949699e-06, + "loss": 0.459, + "step": 8815 + }, + { + "epoch": 1.447827068749615, + "grad_norm": 0.2953490698527201, + "learning_rate": 7.341073941277602e-06, + "loss": 0.4596, + "step": 8816 + }, + { + "epoch": 1.4479912959579577, + "grad_norm": 0.3474854112500576, + "learning_rate": 7.340710238748646e-06, + "loss": 0.4713, + "step": 8817 + }, + { + "epoch": 1.4481555231663006, + "grad_norm": 0.6411897652901434, + "learning_rate": 7.340346505366782e-06, + "loss": 0.4792, + "step": 8818 + }, + { + "epoch": 1.4483197503746434, + "grad_norm": 0.3431135250449065, + "learning_rate": 7.3399827411359615e-06, + "loss": 0.4734, + "step": 8819 + }, + { + "epoch": 1.448483977582986, + "grad_norm": 0.3095387941404658, + "learning_rate": 7.3396189460601325e-06, + "loss": 0.4672, + "step": 8820 + }, + { + "epoch": 1.4486482047913287, + "grad_norm": 0.29983920611558473, + "learning_rate": 7.339255120143251e-06, + "loss": 0.4751, + "step": 8821 + }, + { + "epoch": 1.4488124319996716, + "grad_norm": 0.29211550519830737, + "learning_rate": 7.338891263389263e-06, + "loss": 0.4871, + "step": 8822 + }, + { + "epoch": 1.4489766592080142, + "grad_norm": 0.3023119216930271, + "learning_rate": 7.338527375802123e-06, + "loss": 0.4751, + "step": 8823 + }, + { + "epoch": 1.449140886416357, + "grad_norm": 0.2741409984560532, + "learning_rate": 7.338163457385783e-06, + "loss": 0.4576, + "step": 8824 + }, + { + "epoch": 1.4493051136246997, + "grad_norm": 0.7815452059668843, + "learning_rate": 7.337799508144196e-06, + "loss": 0.477, + "step": 8825 + }, + { + "epoch": 1.4494693408330426, + "grad_norm": 0.28544871860888393, + "learning_rate": 7.337435528081315e-06, + "loss": 0.4815, + "step": 8826 + }, + { + "epoch": 1.4496335680413852, + "grad_norm": 0.3701675073844943, + "learning_rate": 7.33707151720109e-06, + "loss": 0.478, + "step": 8827 + }, + { + "epoch": 1.449797795249728, + "grad_norm": 0.32291784357585807, + "learning_rate": 7.336707475507479e-06, + "loss": 0.4671, + "step": 8828 + }, + { + "epoch": 1.4499620224580707, + "grad_norm": 0.2915111220168, + "learning_rate": 7.336343403004434e-06, + "loss": 0.4804, + "step": 8829 + }, + { + "epoch": 1.4501262496664133, + "grad_norm": 0.3140881121761215, + "learning_rate": 7.335979299695907e-06, + "loss": 0.4604, + "step": 8830 + }, + { + "epoch": 1.4502904768747562, + "grad_norm": 0.2802099567639804, + "learning_rate": 7.335615165585857e-06, + "loss": 0.4703, + "step": 8831 + }, + { + "epoch": 1.450454704083099, + "grad_norm": 0.3050059252111744, + "learning_rate": 7.335251000678235e-06, + "loss": 0.4572, + "step": 8832 + }, + { + "epoch": 1.4506189312914417, + "grad_norm": 0.3919127908954277, + "learning_rate": 7.334886804976999e-06, + "loss": 0.4758, + "step": 8833 + }, + { + "epoch": 1.4507831584997843, + "grad_norm": 0.31520124295877416, + "learning_rate": 7.334522578486102e-06, + "loss": 0.4934, + "step": 8834 + }, + { + "epoch": 1.4509473857081272, + "grad_norm": 0.302542810820108, + "learning_rate": 7.334158321209502e-06, + "loss": 0.4637, + "step": 8835 + }, + { + "epoch": 1.45111161291647, + "grad_norm": 0.2983970505298541, + "learning_rate": 7.333794033151153e-06, + "loss": 0.456, + "step": 8836 + }, + { + "epoch": 1.4512758401248127, + "grad_norm": 0.2992793685686754, + "learning_rate": 7.333429714315014e-06, + "loss": 0.4765, + "step": 8837 + }, + { + "epoch": 1.4514400673331553, + "grad_norm": 0.30608183883212225, + "learning_rate": 7.333065364705039e-06, + "loss": 0.4544, + "step": 8838 + }, + { + "epoch": 1.4516042945414982, + "grad_norm": 0.33612070244669445, + "learning_rate": 7.332700984325188e-06, + "loss": 0.4473, + "step": 8839 + }, + { + "epoch": 1.4517685217498408, + "grad_norm": 0.36256371284524325, + "learning_rate": 7.332336573179417e-06, + "loss": 0.4773, + "step": 8840 + }, + { + "epoch": 1.4519327489581837, + "grad_norm": 0.5163192135177029, + "learning_rate": 7.331972131271683e-06, + "loss": 0.4803, + "step": 8841 + }, + { + "epoch": 1.4520969761665263, + "grad_norm": 0.291303436118226, + "learning_rate": 7.331607658605947e-06, + "loss": 0.4666, + "step": 8842 + }, + { + "epoch": 1.4522612033748692, + "grad_norm": 0.30054038001077393, + "learning_rate": 7.331243155186165e-06, + "loss": 0.4596, + "step": 8843 + }, + { + "epoch": 1.4524254305832118, + "grad_norm": 0.32552628814577383, + "learning_rate": 7.330878621016298e-06, + "loss": 0.4655, + "step": 8844 + }, + { + "epoch": 1.4525896577915547, + "grad_norm": 0.3255490210170379, + "learning_rate": 7.330514056100302e-06, + "loss": 0.4728, + "step": 8845 + }, + { + "epoch": 1.4527538849998973, + "grad_norm": 0.3732452570361322, + "learning_rate": 7.330149460442139e-06, + "loss": 0.4678, + "step": 8846 + }, + { + "epoch": 1.45291811220824, + "grad_norm": 0.3403884811461902, + "learning_rate": 7.329784834045769e-06, + "loss": 0.473, + "step": 8847 + }, + { + "epoch": 1.4530823394165828, + "grad_norm": 0.35037990887786574, + "learning_rate": 7.329420176915151e-06, + "loss": 0.4578, + "step": 8848 + }, + { + "epoch": 1.4532465666249257, + "grad_norm": 0.33906292449742215, + "learning_rate": 7.329055489054248e-06, + "loss": 0.473, + "step": 8849 + }, + { + "epoch": 1.4534107938332683, + "grad_norm": 0.3476995567742656, + "learning_rate": 7.3286907704670175e-06, + "loss": 0.489, + "step": 8850 + }, + { + "epoch": 1.453575021041611, + "grad_norm": 0.3307868667516689, + "learning_rate": 7.328326021157423e-06, + "loss": 0.4775, + "step": 8851 + }, + { + "epoch": 1.4537392482499538, + "grad_norm": 0.47335856697556894, + "learning_rate": 7.327961241129423e-06, + "loss": 0.471, + "step": 8852 + }, + { + "epoch": 1.4539034754582967, + "grad_norm": 0.27687802887216756, + "learning_rate": 7.327596430386984e-06, + "loss": 0.4493, + "step": 8853 + }, + { + "epoch": 1.4540677026666393, + "grad_norm": 0.3294214659206888, + "learning_rate": 7.327231588934065e-06, + "loss": 0.4644, + "step": 8854 + }, + { + "epoch": 1.454231929874982, + "grad_norm": 0.321441907890736, + "learning_rate": 7.326866716774629e-06, + "loss": 0.4733, + "step": 8855 + }, + { + "epoch": 1.4543961570833248, + "grad_norm": 0.3409296447226374, + "learning_rate": 7.326501813912639e-06, + "loss": 0.4459, + "step": 8856 + }, + { + "epoch": 1.4545603842916675, + "grad_norm": 0.40546821658975873, + "learning_rate": 7.326136880352058e-06, + "loss": 0.4736, + "step": 8857 + }, + { + "epoch": 1.4547246115000103, + "grad_norm": 0.3157113879724182, + "learning_rate": 7.325771916096853e-06, + "loss": 0.4581, + "step": 8858 + }, + { + "epoch": 1.454888838708353, + "grad_norm": 0.43633886623309787, + "learning_rate": 7.325406921150981e-06, + "loss": 0.4638, + "step": 8859 + }, + { + "epoch": 1.4550530659166958, + "grad_norm": 0.6933423819272897, + "learning_rate": 7.3250418955184115e-06, + "loss": 0.4936, + "step": 8860 + }, + { + "epoch": 1.4552172931250384, + "grad_norm": 0.30856150710693425, + "learning_rate": 7.324676839203108e-06, + "loss": 0.4739, + "step": 8861 + }, + { + "epoch": 1.4553815203333813, + "grad_norm": 0.37647065666549806, + "learning_rate": 7.324311752209035e-06, + "loss": 0.4774, + "step": 8862 + }, + { + "epoch": 1.455545747541724, + "grad_norm": 0.31474858543251255, + "learning_rate": 7.323946634540156e-06, + "loss": 0.4775, + "step": 8863 + }, + { + "epoch": 1.4557099747500666, + "grad_norm": 0.35550058465976125, + "learning_rate": 7.32358148620044e-06, + "loss": 0.475, + "step": 8864 + }, + { + "epoch": 1.4558742019584094, + "grad_norm": 0.3180664131895398, + "learning_rate": 7.32321630719385e-06, + "loss": 0.4534, + "step": 8865 + }, + { + "epoch": 1.4560384291667523, + "grad_norm": 0.4056724611982341, + "learning_rate": 7.322851097524354e-06, + "loss": 0.4915, + "step": 8866 + }, + { + "epoch": 1.456202656375095, + "grad_norm": 0.3267146528014822, + "learning_rate": 7.322485857195916e-06, + "loss": 0.4957, + "step": 8867 + }, + { + "epoch": 1.4563668835834376, + "grad_norm": 0.429087209244443, + "learning_rate": 7.322120586212507e-06, + "loss": 0.4619, + "step": 8868 + }, + { + "epoch": 1.4565311107917804, + "grad_norm": 0.2921345299614119, + "learning_rate": 7.32175528457809e-06, + "loss": 0.4468, + "step": 8869 + }, + { + "epoch": 1.4566953380001233, + "grad_norm": 0.326412519394861, + "learning_rate": 7.321389952296635e-06, + "loss": 0.4696, + "step": 8870 + }, + { + "epoch": 1.456859565208466, + "grad_norm": 0.30064282080456756, + "learning_rate": 7.32102458937211e-06, + "loss": 0.4659, + "step": 8871 + }, + { + "epoch": 1.4570237924168086, + "grad_norm": 0.3096831053057281, + "learning_rate": 7.320659195808482e-06, + "loss": 0.4621, + "step": 8872 + }, + { + "epoch": 1.4571880196251514, + "grad_norm": 0.29535434369055585, + "learning_rate": 7.32029377160972e-06, + "loss": 0.4546, + "step": 8873 + }, + { + "epoch": 1.457352246833494, + "grad_norm": 0.32787363679249254, + "learning_rate": 7.319928316779792e-06, + "loss": 0.4522, + "step": 8874 + }, + { + "epoch": 1.457516474041837, + "grad_norm": 0.35674569817713586, + "learning_rate": 7.3195628313226685e-06, + "loss": 0.4668, + "step": 8875 + }, + { + "epoch": 1.4576807012501796, + "grad_norm": 0.3534886154900476, + "learning_rate": 7.31919731524232e-06, + "loss": 0.463, + "step": 8876 + }, + { + "epoch": 1.4578449284585224, + "grad_norm": 0.37215114639067504, + "learning_rate": 7.318831768542713e-06, + "loss": 0.4606, + "step": 8877 + }, + { + "epoch": 1.458009155666865, + "grad_norm": 0.412766233405986, + "learning_rate": 7.31846619122782e-06, + "loss": 0.4659, + "step": 8878 + }, + { + "epoch": 1.458173382875208, + "grad_norm": 0.322259459292066, + "learning_rate": 7.318100583301612e-06, + "loss": 0.4712, + "step": 8879 + }, + { + "epoch": 1.4583376100835506, + "grad_norm": 0.38762871022432, + "learning_rate": 7.31773494476806e-06, + "loss": 0.4848, + "step": 8880 + }, + { + "epoch": 1.4585018372918932, + "grad_norm": 0.33110964123463305, + "learning_rate": 7.317369275631131e-06, + "loss": 0.4534, + "step": 8881 + }, + { + "epoch": 1.458666064500236, + "grad_norm": 0.29537116751045295, + "learning_rate": 7.317003575894802e-06, + "loss": 0.4618, + "step": 8882 + }, + { + "epoch": 1.458830291708579, + "grad_norm": 0.3093608228609226, + "learning_rate": 7.316637845563043e-06, + "loss": 0.4649, + "step": 8883 + }, + { + "epoch": 1.4589945189169216, + "grad_norm": 0.3557175296933889, + "learning_rate": 7.316272084639824e-06, + "loss": 0.4524, + "step": 8884 + }, + { + "epoch": 1.4591587461252642, + "grad_norm": 0.3408766965625624, + "learning_rate": 7.31590629312912e-06, + "loss": 0.4615, + "step": 8885 + }, + { + "epoch": 1.459322973333607, + "grad_norm": 0.31371414979257384, + "learning_rate": 7.315540471034903e-06, + "loss": 0.4605, + "step": 8886 + }, + { + "epoch": 1.45948720054195, + "grad_norm": 0.3204753006979815, + "learning_rate": 7.3151746183611485e-06, + "loss": 0.4685, + "step": 8887 + }, + { + "epoch": 1.4596514277502926, + "grad_norm": 0.2763367425927962, + "learning_rate": 7.314808735111825e-06, + "loss": 0.4723, + "step": 8888 + }, + { + "epoch": 1.4598156549586352, + "grad_norm": 0.30501998510386186, + "learning_rate": 7.314442821290911e-06, + "loss": 0.4755, + "step": 8889 + }, + { + "epoch": 1.459979882166978, + "grad_norm": 0.31957072046857943, + "learning_rate": 7.314076876902378e-06, + "loss": 0.4508, + "step": 8890 + }, + { + "epoch": 1.4601441093753207, + "grad_norm": 0.4569959723681631, + "learning_rate": 7.313710901950203e-06, + "loss": 0.4711, + "step": 8891 + }, + { + "epoch": 1.4603083365836635, + "grad_norm": 0.3275277934527127, + "learning_rate": 7.3133448964383575e-06, + "loss": 0.4531, + "step": 8892 + }, + { + "epoch": 1.4604725637920062, + "grad_norm": 0.31450641388610223, + "learning_rate": 7.312978860370818e-06, + "loss": 0.4699, + "step": 8893 + }, + { + "epoch": 1.460636791000349, + "grad_norm": 0.35690484276864787, + "learning_rate": 7.312612793751563e-06, + "loss": 0.4571, + "step": 8894 + }, + { + "epoch": 1.4608010182086917, + "grad_norm": 0.32055550173392267, + "learning_rate": 7.312246696584564e-06, + "loss": 0.4687, + "step": 8895 + }, + { + "epoch": 1.4609652454170345, + "grad_norm": 0.31626984369788685, + "learning_rate": 7.311880568873799e-06, + "loss": 0.4789, + "step": 8896 + }, + { + "epoch": 1.4611294726253772, + "grad_norm": 0.27444977756984074, + "learning_rate": 7.311514410623244e-06, + "loss": 0.4762, + "step": 8897 + }, + { + "epoch": 1.4612936998337198, + "grad_norm": 0.2779313804007929, + "learning_rate": 7.311148221836878e-06, + "loss": 0.4485, + "step": 8898 + }, + { + "epoch": 1.4614579270420627, + "grad_norm": 0.32831057129001473, + "learning_rate": 7.310782002518676e-06, + "loss": 0.4724, + "step": 8899 + }, + { + "epoch": 1.4616221542504055, + "grad_norm": 0.36700689821516946, + "learning_rate": 7.310415752672616e-06, + "loss": 0.4913, + "step": 8900 + }, + { + "epoch": 1.4617863814587482, + "grad_norm": 0.24873549957452207, + "learning_rate": 7.310049472302676e-06, + "loss": 0.4701, + "step": 8901 + }, + { + "epoch": 1.4619506086670908, + "grad_norm": 0.271400064217122, + "learning_rate": 7.3096831614128336e-06, + "loss": 0.4849, + "step": 8902 + }, + { + "epoch": 1.4621148358754337, + "grad_norm": 0.30503298928358774, + "learning_rate": 7.309316820007066e-06, + "loss": 0.4793, + "step": 8903 + }, + { + "epoch": 1.4622790630837765, + "grad_norm": 0.32951277679543467, + "learning_rate": 7.3089504480893565e-06, + "loss": 0.4634, + "step": 8904 + }, + { + "epoch": 1.4624432902921192, + "grad_norm": 0.3415409673036727, + "learning_rate": 7.30858404566368e-06, + "loss": 0.456, + "step": 8905 + }, + { + "epoch": 1.4626075175004618, + "grad_norm": 0.2969992645784687, + "learning_rate": 7.308217612734019e-06, + "loss": 0.4706, + "step": 8906 + }, + { + "epoch": 1.4627717447088047, + "grad_norm": 0.28013964963023535, + "learning_rate": 7.30785114930435e-06, + "loss": 0.4544, + "step": 8907 + }, + { + "epoch": 1.4629359719171473, + "grad_norm": 0.32904919821824175, + "learning_rate": 7.307484655378656e-06, + "loss": 0.4442, + "step": 8908 + }, + { + "epoch": 1.4631001991254902, + "grad_norm": 0.4378861663903591, + "learning_rate": 7.307118130960917e-06, + "loss": 0.4411, + "step": 8909 + }, + { + "epoch": 1.4632644263338328, + "grad_norm": 0.2887836195173391, + "learning_rate": 7.306751576055111e-06, + "loss": 0.4565, + "step": 8910 + }, + { + "epoch": 1.4634286535421757, + "grad_norm": 0.3921748038982994, + "learning_rate": 7.306384990665223e-06, + "loss": 0.4707, + "step": 8911 + }, + { + "epoch": 1.4635928807505183, + "grad_norm": 0.33692717526667476, + "learning_rate": 7.306018374795234e-06, + "loss": 0.4521, + "step": 8912 + }, + { + "epoch": 1.4637571079588612, + "grad_norm": 0.3331839944553744, + "learning_rate": 7.305651728449123e-06, + "loss": 0.4676, + "step": 8913 + }, + { + "epoch": 1.4639213351672038, + "grad_norm": 0.4512463545321785, + "learning_rate": 7.305285051630875e-06, + "loss": 0.4694, + "step": 8914 + }, + { + "epoch": 1.4640855623755464, + "grad_norm": 0.3429881607623171, + "learning_rate": 7.3049183443444694e-06, + "loss": 0.4444, + "step": 8915 + }, + { + "epoch": 1.4642497895838893, + "grad_norm": 0.2916652689698767, + "learning_rate": 7.304551606593892e-06, + "loss": 0.4534, + "step": 8916 + }, + { + "epoch": 1.4644140167922322, + "grad_norm": 0.3177553803478125, + "learning_rate": 7.3041848383831254e-06, + "loss": 0.4787, + "step": 8917 + }, + { + "epoch": 1.4645782440005748, + "grad_norm": 0.2936657930249479, + "learning_rate": 7.303818039716152e-06, + "loss": 0.4576, + "step": 8918 + }, + { + "epoch": 1.4647424712089174, + "grad_norm": 0.3535099457184482, + "learning_rate": 7.303451210596957e-06, + "loss": 0.4742, + "step": 8919 + }, + { + "epoch": 1.4649066984172603, + "grad_norm": 0.36301749310474385, + "learning_rate": 7.303084351029522e-06, + "loss": 0.4771, + "step": 8920 + }, + { + "epoch": 1.4650709256256031, + "grad_norm": 0.27236820017666563, + "learning_rate": 7.302717461017834e-06, + "loss": 0.476, + "step": 8921 + }, + { + "epoch": 1.4652351528339458, + "grad_norm": 0.31864032094301475, + "learning_rate": 7.302350540565877e-06, + "loss": 0.4626, + "step": 8922 + }, + { + "epoch": 1.4653993800422884, + "grad_norm": 0.44159276047496465, + "learning_rate": 7.301983589677637e-06, + "loss": 0.4661, + "step": 8923 + }, + { + "epoch": 1.4655636072506313, + "grad_norm": 0.6163261811363203, + "learning_rate": 7.301616608357096e-06, + "loss": 0.4687, + "step": 8924 + }, + { + "epoch": 1.465727834458974, + "grad_norm": 0.6132487217057926, + "learning_rate": 7.301249596608244e-06, + "loss": 0.4746, + "step": 8925 + }, + { + "epoch": 1.4658920616673168, + "grad_norm": 0.30355394160206417, + "learning_rate": 7.300882554435065e-06, + "loss": 0.4899, + "step": 8926 + }, + { + "epoch": 1.4660562888756594, + "grad_norm": 0.3028776881337748, + "learning_rate": 7.3005154818415446e-06, + "loss": 0.4624, + "step": 8927 + }, + { + "epoch": 1.4662205160840023, + "grad_norm": 0.3490294656355712, + "learning_rate": 7.300148378831672e-06, + "loss": 0.4762, + "step": 8928 + }, + { + "epoch": 1.466384743292345, + "grad_norm": 0.38672288617898776, + "learning_rate": 7.2997812454094325e-06, + "loss": 0.4563, + "step": 8929 + }, + { + "epoch": 1.4665489705006878, + "grad_norm": 0.3460975897825717, + "learning_rate": 7.2994140815788146e-06, + "loss": 0.4558, + "step": 8930 + }, + { + "epoch": 1.4667131977090304, + "grad_norm": 0.3817812381648602, + "learning_rate": 7.299046887343805e-06, + "loss": 0.4676, + "step": 8931 + }, + { + "epoch": 1.466877424917373, + "grad_norm": 0.29091502397352326, + "learning_rate": 7.298679662708392e-06, + "loss": 0.4535, + "step": 8932 + }, + { + "epoch": 1.467041652125716, + "grad_norm": 0.3720886033235574, + "learning_rate": 7.298312407676565e-06, + "loss": 0.4719, + "step": 8933 + }, + { + "epoch": 1.4672058793340588, + "grad_norm": 0.2982734402976886, + "learning_rate": 7.297945122252312e-06, + "loss": 0.4642, + "step": 8934 + }, + { + "epoch": 1.4673701065424014, + "grad_norm": 0.2767453925146361, + "learning_rate": 7.297577806439622e-06, + "loss": 0.4618, + "step": 8935 + }, + { + "epoch": 1.467534333750744, + "grad_norm": 0.36345402905506835, + "learning_rate": 7.297210460242484e-06, + "loss": 0.4714, + "step": 8936 + }, + { + "epoch": 1.467698560959087, + "grad_norm": 0.32274055597623746, + "learning_rate": 7.2968430836648885e-06, + "loss": 0.4605, + "step": 8937 + }, + { + "epoch": 1.4678627881674298, + "grad_norm": 0.2545913876862383, + "learning_rate": 7.2964756767108265e-06, + "loss": 0.4743, + "step": 8938 + }, + { + "epoch": 1.4680270153757724, + "grad_norm": 0.3158754146163357, + "learning_rate": 7.296108239384287e-06, + "loss": 0.4785, + "step": 8939 + }, + { + "epoch": 1.468191242584115, + "grad_norm": 0.2791997204542906, + "learning_rate": 7.2957407716892604e-06, + "loss": 0.4799, + "step": 8940 + }, + { + "epoch": 1.468355469792458, + "grad_norm": 0.26346926896439843, + "learning_rate": 7.295373273629739e-06, + "loss": 0.481, + "step": 8941 + }, + { + "epoch": 1.4685196970008005, + "grad_norm": 0.27357834356219624, + "learning_rate": 7.295005745209713e-06, + "loss": 0.446, + "step": 8942 + }, + { + "epoch": 1.4686839242091434, + "grad_norm": 0.2852558176896654, + "learning_rate": 7.294638186433175e-06, + "loss": 0.4663, + "step": 8943 + }, + { + "epoch": 1.468848151417486, + "grad_norm": 0.3705262899315953, + "learning_rate": 7.294270597304117e-06, + "loss": 0.4699, + "step": 8944 + }, + { + "epoch": 1.469012378625829, + "grad_norm": 0.31750281046154766, + "learning_rate": 7.2939029778265295e-06, + "loss": 0.4582, + "step": 8945 + }, + { + "epoch": 1.4691766058341715, + "grad_norm": 0.3038648764501812, + "learning_rate": 7.29353532800441e-06, + "loss": 0.4509, + "step": 8946 + }, + { + "epoch": 1.4693408330425144, + "grad_norm": 0.29723681969502824, + "learning_rate": 7.293167647841745e-06, + "loss": 0.472, + "step": 8947 + }, + { + "epoch": 1.469505060250857, + "grad_norm": 0.3876434334700834, + "learning_rate": 7.292799937342534e-06, + "loss": 0.4829, + "step": 8948 + }, + { + "epoch": 1.4696692874591997, + "grad_norm": 0.2695956978075607, + "learning_rate": 7.292432196510766e-06, + "loss": 0.449, + "step": 8949 + }, + { + "epoch": 1.4698335146675425, + "grad_norm": 0.2974485527980798, + "learning_rate": 7.292064425350438e-06, + "loss": 0.4747, + "step": 8950 + }, + { + "epoch": 1.4699977418758854, + "grad_norm": 0.2859895877854272, + "learning_rate": 7.2916966238655424e-06, + "loss": 0.448, + "step": 8951 + }, + { + "epoch": 1.470161969084228, + "grad_norm": 0.31309097836143424, + "learning_rate": 7.291328792060075e-06, + "loss": 0.4844, + "step": 8952 + }, + { + "epoch": 1.4703261962925707, + "grad_norm": 0.31950330577085617, + "learning_rate": 7.290960929938032e-06, + "loss": 0.4677, + "step": 8953 + }, + { + "epoch": 1.4704904235009135, + "grad_norm": 0.356031776293735, + "learning_rate": 7.290593037503405e-06, + "loss": 0.4631, + "step": 8954 + }, + { + "epoch": 1.4706546507092564, + "grad_norm": 0.3892179659188917, + "learning_rate": 7.290225114760195e-06, + "loss": 0.4788, + "step": 8955 + }, + { + "epoch": 1.470818877917599, + "grad_norm": 0.3088060110020469, + "learning_rate": 7.289857161712393e-06, + "loss": 0.4773, + "step": 8956 + }, + { + "epoch": 1.4709831051259417, + "grad_norm": 0.34766766878995486, + "learning_rate": 7.289489178363997e-06, + "loss": 0.4641, + "step": 8957 + }, + { + "epoch": 1.4711473323342845, + "grad_norm": 0.987689661044155, + "learning_rate": 7.289121164719006e-06, + "loss": 0.4616, + "step": 8958 + }, + { + "epoch": 1.4713115595426272, + "grad_norm": 0.37323779225069614, + "learning_rate": 7.288753120781414e-06, + "loss": 0.4725, + "step": 8959 + }, + { + "epoch": 1.47147578675097, + "grad_norm": 0.2999592106766716, + "learning_rate": 7.288385046555218e-06, + "loss": 0.4874, + "step": 8960 + }, + { + "epoch": 1.4716400139593127, + "grad_norm": 0.4071471939682784, + "learning_rate": 7.288016942044418e-06, + "loss": 0.476, + "step": 8961 + }, + { + "epoch": 1.4718042411676555, + "grad_norm": 0.4291504258791555, + "learning_rate": 7.287648807253012e-06, + "loss": 0.4571, + "step": 8962 + }, + { + "epoch": 1.4719684683759982, + "grad_norm": 0.3502198595250842, + "learning_rate": 7.287280642184996e-06, + "loss": 0.4828, + "step": 8963 + }, + { + "epoch": 1.472132695584341, + "grad_norm": 0.375414310412938, + "learning_rate": 7.286912446844369e-06, + "loss": 0.4879, + "step": 8964 + }, + { + "epoch": 1.4722969227926836, + "grad_norm": 0.26277910260284426, + "learning_rate": 7.286544221235134e-06, + "loss": 0.4556, + "step": 8965 + }, + { + "epoch": 1.4724611500010263, + "grad_norm": 0.31490149171126364, + "learning_rate": 7.286175965361285e-06, + "loss": 0.4638, + "step": 8966 + }, + { + "epoch": 1.4726253772093691, + "grad_norm": 0.2892208531501333, + "learning_rate": 7.285807679226825e-06, + "loss": 0.4601, + "step": 8967 + }, + { + "epoch": 1.472789604417712, + "grad_norm": 0.2918932647093718, + "learning_rate": 7.285439362835751e-06, + "loss": 0.4796, + "step": 8968 + }, + { + "epoch": 1.4729538316260546, + "grad_norm": 0.2945099295955126, + "learning_rate": 7.285071016192067e-06, + "loss": 0.4636, + "step": 8969 + }, + { + "epoch": 1.4731180588343973, + "grad_norm": 0.28597820917530037, + "learning_rate": 7.28470263929977e-06, + "loss": 0.4905, + "step": 8970 + }, + { + "epoch": 1.4732822860427401, + "grad_norm": 0.4109484557655828, + "learning_rate": 7.284334232162864e-06, + "loss": 0.4654, + "step": 8971 + }, + { + "epoch": 1.473446513251083, + "grad_norm": 0.2621363643613177, + "learning_rate": 7.283965794785346e-06, + "loss": 0.4665, + "step": 8972 + }, + { + "epoch": 1.4736107404594256, + "grad_norm": 0.3483351477212068, + "learning_rate": 7.283597327171223e-06, + "loss": 0.4772, + "step": 8973 + }, + { + "epoch": 1.4737749676677683, + "grad_norm": 0.3508989330249181, + "learning_rate": 7.2832288293244935e-06, + "loss": 0.4476, + "step": 8974 + }, + { + "epoch": 1.4739391948761111, + "grad_norm": 0.44165630426238167, + "learning_rate": 7.282860301249162e-06, + "loss": 0.4698, + "step": 8975 + }, + { + "epoch": 1.4741034220844538, + "grad_norm": 0.3086075564657962, + "learning_rate": 7.2824917429492275e-06, + "loss": 0.4756, + "step": 8976 + }, + { + "epoch": 1.4742676492927966, + "grad_norm": 0.5470619925050496, + "learning_rate": 7.282123154428696e-06, + "loss": 0.4881, + "step": 8977 + }, + { + "epoch": 1.4744318765011393, + "grad_norm": 0.28503437164757617, + "learning_rate": 7.28175453569157e-06, + "loss": 0.4909, + "step": 8978 + }, + { + "epoch": 1.4745961037094821, + "grad_norm": 0.3183139987359893, + "learning_rate": 7.281385886741852e-06, + "loss": 0.4508, + "step": 8979 + }, + { + "epoch": 1.4747603309178248, + "grad_norm": 0.30594851299796155, + "learning_rate": 7.281017207583548e-06, + "loss": 0.4369, + "step": 8980 + }, + { + "epoch": 1.4749245581261676, + "grad_norm": 0.32208508578505773, + "learning_rate": 7.28064849822066e-06, + "loss": 0.471, + "step": 8981 + }, + { + "epoch": 1.4750887853345103, + "grad_norm": 0.35597240221197657, + "learning_rate": 7.280279758657194e-06, + "loss": 0.4673, + "step": 8982 + }, + { + "epoch": 1.475253012542853, + "grad_norm": 0.30726013389705537, + "learning_rate": 7.2799109888971544e-06, + "loss": 0.4376, + "step": 8983 + }, + { + "epoch": 1.4754172397511958, + "grad_norm": 0.3061948703740364, + "learning_rate": 7.279542188944548e-06, + "loss": 0.4453, + "step": 8984 + }, + { + "epoch": 1.4755814669595386, + "grad_norm": 0.3801963399207698, + "learning_rate": 7.279173358803376e-06, + "loss": 0.4601, + "step": 8985 + }, + { + "epoch": 1.4757456941678813, + "grad_norm": 0.30531711719679955, + "learning_rate": 7.2788044984776475e-06, + "loss": 0.4602, + "step": 8986 + }, + { + "epoch": 1.475909921376224, + "grad_norm": 0.26931105089669216, + "learning_rate": 7.2784356079713695e-06, + "loss": 0.458, + "step": 8987 + }, + { + "epoch": 1.4760741485845668, + "grad_norm": 0.3842988883695888, + "learning_rate": 7.278066687288547e-06, + "loss": 0.4869, + "step": 8988 + }, + { + "epoch": 1.4762383757929096, + "grad_norm": 0.28218722853074585, + "learning_rate": 7.277697736433186e-06, + "loss": 0.4642, + "step": 8989 + }, + { + "epoch": 1.4764026030012523, + "grad_norm": 0.2897705573643494, + "learning_rate": 7.277328755409295e-06, + "loss": 0.4625, + "step": 8990 + }, + { + "epoch": 1.476566830209595, + "grad_norm": 0.27586866292250783, + "learning_rate": 7.276959744220881e-06, + "loss": 0.4664, + "step": 8991 + }, + { + "epoch": 1.4767310574179378, + "grad_norm": 0.2827774853066872, + "learning_rate": 7.276590702871954e-06, + "loss": 0.4744, + "step": 8992 + }, + { + "epoch": 1.4768952846262804, + "grad_norm": 0.32702311421279057, + "learning_rate": 7.276221631366516e-06, + "loss": 0.475, + "step": 8993 + }, + { + "epoch": 1.4770595118346233, + "grad_norm": 0.2617829134294582, + "learning_rate": 7.275852529708582e-06, + "loss": 0.4705, + "step": 8994 + }, + { + "epoch": 1.4772237390429659, + "grad_norm": 0.47054078391175314, + "learning_rate": 7.275483397902159e-06, + "loss": 0.4546, + "step": 8995 + }, + { + "epoch": 1.4773879662513087, + "grad_norm": 0.28021517153771475, + "learning_rate": 7.275114235951256e-06, + "loss": 0.4753, + "step": 8996 + }, + { + "epoch": 1.4775521934596514, + "grad_norm": 0.26960945938330666, + "learning_rate": 7.274745043859882e-06, + "loss": 0.4722, + "step": 8997 + }, + { + "epoch": 1.4777164206679942, + "grad_norm": 0.34790989377611464, + "learning_rate": 7.274375821632045e-06, + "loss": 0.4666, + "step": 8998 + }, + { + "epoch": 1.4778806478763369, + "grad_norm": 0.2692479263324211, + "learning_rate": 7.274006569271758e-06, + "loss": 0.4628, + "step": 8999 + }, + { + "epoch": 1.4780448750846795, + "grad_norm": 0.429200616981824, + "learning_rate": 7.273637286783031e-06, + "loss": 0.4727, + "step": 9000 + }, + { + "epoch": 1.4782091022930224, + "grad_norm": 0.27891585980617317, + "learning_rate": 7.273267974169874e-06, + "loss": 0.4539, + "step": 9001 + }, + { + "epoch": 1.4783733295013652, + "grad_norm": 0.3124890295725687, + "learning_rate": 7.272898631436298e-06, + "loss": 0.4667, + "step": 9002 + }, + { + "epoch": 1.4785375567097079, + "grad_norm": 0.3246369076421457, + "learning_rate": 7.272529258586314e-06, + "loss": 0.4608, + "step": 9003 + }, + { + "epoch": 1.4787017839180505, + "grad_norm": 0.4909357585608995, + "learning_rate": 7.272159855623936e-06, + "loss": 0.4267, + "step": 9004 + }, + { + "epoch": 1.4788660111263934, + "grad_norm": 0.2962438853516577, + "learning_rate": 7.271790422553172e-06, + "loss": 0.4808, + "step": 9005 + }, + { + "epoch": 1.4790302383347362, + "grad_norm": 0.3504770230621263, + "learning_rate": 7.27142095937804e-06, + "loss": 0.4711, + "step": 9006 + }, + { + "epoch": 1.4791944655430789, + "grad_norm": 0.2911014714288107, + "learning_rate": 7.271051466102547e-06, + "loss": 0.4596, + "step": 9007 + }, + { + "epoch": 1.4793586927514215, + "grad_norm": 0.31010532252304307, + "learning_rate": 7.27068194273071e-06, + "loss": 0.4781, + "step": 9008 + }, + { + "epoch": 1.4795229199597644, + "grad_norm": 0.286613279240997, + "learning_rate": 7.270312389266542e-06, + "loss": 0.4617, + "step": 9009 + }, + { + "epoch": 1.479687147168107, + "grad_norm": 0.3897309711832461, + "learning_rate": 7.2699428057140545e-06, + "loss": 0.4781, + "step": 9010 + }, + { + "epoch": 1.4798513743764499, + "grad_norm": 0.37326616468971, + "learning_rate": 7.269573192077263e-06, + "loss": 0.4661, + "step": 9011 + }, + { + "epoch": 1.4800156015847925, + "grad_norm": 0.2772497866687941, + "learning_rate": 7.269203548360182e-06, + "loss": 0.4527, + "step": 9012 + }, + { + "epoch": 1.4801798287931354, + "grad_norm": 0.2775398257236335, + "learning_rate": 7.2688338745668264e-06, + "loss": 0.4574, + "step": 9013 + }, + { + "epoch": 1.480344056001478, + "grad_norm": 0.365805191968904, + "learning_rate": 7.26846417070121e-06, + "loss": 0.4674, + "step": 9014 + }, + { + "epoch": 1.4805082832098209, + "grad_norm": 0.2995131322182461, + "learning_rate": 7.26809443676735e-06, + "loss": 0.4625, + "step": 9015 + }, + { + "epoch": 1.4806725104181635, + "grad_norm": 0.2920748910867749, + "learning_rate": 7.2677246727692605e-06, + "loss": 0.449, + "step": 9016 + }, + { + "epoch": 1.4808367376265061, + "grad_norm": 0.3134085998958177, + "learning_rate": 7.267354878710957e-06, + "loss": 0.4752, + "step": 9017 + }, + { + "epoch": 1.481000964834849, + "grad_norm": 0.2910762798061273, + "learning_rate": 7.266985054596457e-06, + "loss": 0.4668, + "step": 9018 + }, + { + "epoch": 1.4811651920431919, + "grad_norm": 0.3620826935634441, + "learning_rate": 7.266615200429778e-06, + "loss": 0.4694, + "step": 9019 + }, + { + "epoch": 1.4813294192515345, + "grad_norm": 0.3239612400513165, + "learning_rate": 7.266245316214935e-06, + "loss": 0.4685, + "step": 9020 + }, + { + "epoch": 1.4814936464598771, + "grad_norm": 0.512868777403178, + "learning_rate": 7.265875401955947e-06, + "loss": 0.4471, + "step": 9021 + }, + { + "epoch": 1.48165787366822, + "grad_norm": 0.35965132682584766, + "learning_rate": 7.265505457656831e-06, + "loss": 0.475, + "step": 9022 + }, + { + "epoch": 1.4818221008765629, + "grad_norm": 0.3268419705604959, + "learning_rate": 7.265135483321604e-06, + "loss": 0.4804, + "step": 9023 + }, + { + "epoch": 1.4819863280849055, + "grad_norm": 0.3992553047998647, + "learning_rate": 7.264765478954286e-06, + "loss": 0.449, + "step": 9024 + }, + { + "epoch": 1.4821505552932481, + "grad_norm": 0.2918082085432597, + "learning_rate": 7.264395444558895e-06, + "loss": 0.4584, + "step": 9025 + }, + { + "epoch": 1.482314782501591, + "grad_norm": 0.270882114076403, + "learning_rate": 7.264025380139448e-06, + "loss": 0.4664, + "step": 9026 + }, + { + "epoch": 1.4824790097099336, + "grad_norm": 0.3532222289683168, + "learning_rate": 7.263655285699966e-06, + "loss": 0.4719, + "step": 9027 + }, + { + "epoch": 1.4826432369182765, + "grad_norm": 0.2849963487764551, + "learning_rate": 7.263285161244469e-06, + "loss": 0.4576, + "step": 9028 + }, + { + "epoch": 1.4828074641266191, + "grad_norm": 0.3139206838920788, + "learning_rate": 7.262915006776978e-06, + "loss": 0.4721, + "step": 9029 + }, + { + "epoch": 1.482971691334962, + "grad_norm": 0.48344010886321853, + "learning_rate": 7.26254482230151e-06, + "loss": 0.4539, + "step": 9030 + }, + { + "epoch": 1.4831359185433046, + "grad_norm": 0.3287835530250805, + "learning_rate": 7.262174607822088e-06, + "loss": 0.4511, + "step": 9031 + }, + { + "epoch": 1.4833001457516475, + "grad_norm": 0.2750891894079266, + "learning_rate": 7.261804363342731e-06, + "loss": 0.4718, + "step": 9032 + }, + { + "epoch": 1.4834643729599901, + "grad_norm": 0.3425506478423419, + "learning_rate": 7.261434088867463e-06, + "loss": 0.4768, + "step": 9033 + }, + { + "epoch": 1.4836286001683328, + "grad_norm": 0.298876986879865, + "learning_rate": 7.261063784400304e-06, + "loss": 0.4703, + "step": 9034 + }, + { + "epoch": 1.4837928273766756, + "grad_norm": 0.34455142421650525, + "learning_rate": 7.260693449945274e-06, + "loss": 0.478, + "step": 9035 + }, + { + "epoch": 1.4839570545850185, + "grad_norm": 0.34054511742199567, + "learning_rate": 7.260323085506398e-06, + "loss": 0.4653, + "step": 9036 + }, + { + "epoch": 1.4841212817933611, + "grad_norm": 0.2849754511923055, + "learning_rate": 7.259952691087697e-06, + "loss": 0.4581, + "step": 9037 + }, + { + "epoch": 1.4842855090017038, + "grad_norm": 0.31908669245109694, + "learning_rate": 7.259582266693196e-06, + "loss": 0.4709, + "step": 9038 + }, + { + "epoch": 1.4844497362100466, + "grad_norm": 0.261760821048014, + "learning_rate": 7.259211812326916e-06, + "loss": 0.4595, + "step": 9039 + }, + { + "epoch": 1.4846139634183895, + "grad_norm": 0.5699588523422938, + "learning_rate": 7.258841327992879e-06, + "loss": 0.4435, + "step": 9040 + }, + { + "epoch": 1.484778190626732, + "grad_norm": 0.3436625253191333, + "learning_rate": 7.258470813695112e-06, + "loss": 0.49, + "step": 9041 + }, + { + "epoch": 1.4849424178350747, + "grad_norm": 0.29229791987285086, + "learning_rate": 7.258100269437637e-06, + "loss": 0.448, + "step": 9042 + }, + { + "epoch": 1.4851066450434176, + "grad_norm": 0.2696646005456477, + "learning_rate": 7.257729695224482e-06, + "loss": 0.4741, + "step": 9043 + }, + { + "epoch": 1.4852708722517602, + "grad_norm": 0.5564414897893172, + "learning_rate": 7.257359091059668e-06, + "loss": 0.4651, + "step": 9044 + }, + { + "epoch": 1.485435099460103, + "grad_norm": 0.37273520474470473, + "learning_rate": 7.256988456947221e-06, + "loss": 0.4658, + "step": 9045 + }, + { + "epoch": 1.4855993266684457, + "grad_norm": 0.26984808619858086, + "learning_rate": 7.256617792891168e-06, + "loss": 0.4763, + "step": 9046 + }, + { + "epoch": 1.4857635538767886, + "grad_norm": 0.30726132225802555, + "learning_rate": 7.256247098895533e-06, + "loss": 0.4564, + "step": 9047 + }, + { + "epoch": 1.4859277810851312, + "grad_norm": 0.2964204921364908, + "learning_rate": 7.255876374964341e-06, + "loss": 0.449, + "step": 9048 + }, + { + "epoch": 1.486092008293474, + "grad_norm": 0.34428474234477524, + "learning_rate": 7.255505621101623e-06, + "loss": 0.4738, + "step": 9049 + }, + { + "epoch": 1.4862562355018167, + "grad_norm": 0.3100524445547254, + "learning_rate": 7.255134837311402e-06, + "loss": 0.4513, + "step": 9050 + }, + { + "epoch": 1.4864204627101594, + "grad_norm": 0.2968065640191466, + "learning_rate": 7.254764023597705e-06, + "loss": 0.4488, + "step": 9051 + }, + { + "epoch": 1.4865846899185022, + "grad_norm": 0.42991865968143095, + "learning_rate": 7.254393179964561e-06, + "loss": 0.4656, + "step": 9052 + }, + { + "epoch": 1.486748917126845, + "grad_norm": 0.2840509670038126, + "learning_rate": 7.254022306415996e-06, + "loss": 0.4576, + "step": 9053 + }, + { + "epoch": 1.4869131443351877, + "grad_norm": 0.27885545284201946, + "learning_rate": 7.25365140295604e-06, + "loss": 0.4566, + "step": 9054 + }, + { + "epoch": 1.4870773715435304, + "grad_norm": 0.2880230279300889, + "learning_rate": 7.253280469588722e-06, + "loss": 0.467, + "step": 9055 + }, + { + "epoch": 1.4872415987518732, + "grad_norm": 0.2713198311101982, + "learning_rate": 7.252909506318067e-06, + "loss": 0.4737, + "step": 9056 + }, + { + "epoch": 1.487405825960216, + "grad_norm": 0.29087205139831046, + "learning_rate": 7.252538513148108e-06, + "loss": 0.457, + "step": 9057 + }, + { + "epoch": 1.4875700531685587, + "grad_norm": 0.3218582414140392, + "learning_rate": 7.2521674900828705e-06, + "loss": 0.4679, + "step": 9058 + }, + { + "epoch": 1.4877342803769014, + "grad_norm": 0.3022988004594291, + "learning_rate": 7.251796437126388e-06, + "loss": 0.4762, + "step": 9059 + }, + { + "epoch": 1.4878985075852442, + "grad_norm": 0.3237420410141972, + "learning_rate": 7.251425354282689e-06, + "loss": 0.4806, + "step": 9060 + }, + { + "epoch": 1.4880627347935869, + "grad_norm": 0.40148076135141486, + "learning_rate": 7.251054241555803e-06, + "loss": 0.4479, + "step": 9061 + }, + { + "epoch": 1.4882269620019297, + "grad_norm": 0.3021622550741741, + "learning_rate": 7.250683098949761e-06, + "loss": 0.4489, + "step": 9062 + }, + { + "epoch": 1.4883911892102724, + "grad_norm": 0.2737397256477465, + "learning_rate": 7.250311926468595e-06, + "loss": 0.4622, + "step": 9063 + }, + { + "epoch": 1.4885554164186152, + "grad_norm": 0.2838829341107318, + "learning_rate": 7.249940724116335e-06, + "loss": 0.4788, + "step": 9064 + }, + { + "epoch": 1.4887196436269579, + "grad_norm": 0.2900387237367032, + "learning_rate": 7.2495694918970125e-06, + "loss": 0.5016, + "step": 9065 + }, + { + "epoch": 1.4888838708353007, + "grad_norm": 0.32723225864852523, + "learning_rate": 7.249198229814661e-06, + "loss": 0.4635, + "step": 9066 + }, + { + "epoch": 1.4890480980436434, + "grad_norm": 0.2929153755364235, + "learning_rate": 7.248826937873313e-06, + "loss": 0.4627, + "step": 9067 + }, + { + "epoch": 1.489212325251986, + "grad_norm": 0.29032930548652564, + "learning_rate": 7.248455616076998e-06, + "loss": 0.4736, + "step": 9068 + }, + { + "epoch": 1.4893765524603289, + "grad_norm": 0.35280273211851365, + "learning_rate": 7.248084264429751e-06, + "loss": 0.473, + "step": 9069 + }, + { + "epoch": 1.4895407796686717, + "grad_norm": 0.46850827088761615, + "learning_rate": 7.2477128829356055e-06, + "loss": 0.4495, + "step": 9070 + }, + { + "epoch": 1.4897050068770143, + "grad_norm": 0.3028158132164672, + "learning_rate": 7.247341471598596e-06, + "loss": 0.4797, + "step": 9071 + }, + { + "epoch": 1.489869234085357, + "grad_norm": 0.3222872181362865, + "learning_rate": 7.2469700304227535e-06, + "loss": 0.4687, + "step": 9072 + }, + { + "epoch": 1.4900334612936998, + "grad_norm": 0.3353242450473158, + "learning_rate": 7.246598559412115e-06, + "loss": 0.4442, + "step": 9073 + }, + { + "epoch": 1.4901976885020427, + "grad_norm": 0.3147282673915163, + "learning_rate": 7.246227058570714e-06, + "loss": 0.4727, + "step": 9074 + }, + { + "epoch": 1.4903619157103853, + "grad_norm": 0.26213956343160233, + "learning_rate": 7.2458555279025836e-06, + "loss": 0.4803, + "step": 9075 + }, + { + "epoch": 1.490526142918728, + "grad_norm": 0.43566408018354824, + "learning_rate": 7.245483967411762e-06, + "loss": 0.4805, + "step": 9076 + }, + { + "epoch": 1.4906903701270708, + "grad_norm": 0.49105090744394064, + "learning_rate": 7.2451123771022816e-06, + "loss": 0.4697, + "step": 9077 + }, + { + "epoch": 1.4908545973354135, + "grad_norm": 0.5978965467093454, + "learning_rate": 7.244740756978181e-06, + "loss": 0.4664, + "step": 9078 + }, + { + "epoch": 1.4910188245437563, + "grad_norm": 0.34767763702904925, + "learning_rate": 7.2443691070434955e-06, + "loss": 0.4618, + "step": 9079 + }, + { + "epoch": 1.491183051752099, + "grad_norm": 0.2835559414850179, + "learning_rate": 7.2439974273022625e-06, + "loss": 0.4952, + "step": 9080 + }, + { + "epoch": 1.4913472789604418, + "grad_norm": 0.36899619802931427, + "learning_rate": 7.243625717758516e-06, + "loss": 0.4588, + "step": 9081 + }, + { + "epoch": 1.4915115061687845, + "grad_norm": 0.31211856962328216, + "learning_rate": 7.243253978416294e-06, + "loss": 0.4719, + "step": 9082 + }, + { + "epoch": 1.4916757333771273, + "grad_norm": 0.2795658714787976, + "learning_rate": 7.242882209279637e-06, + "loss": 0.4635, + "step": 9083 + }, + { + "epoch": 1.49183996058547, + "grad_norm": 0.25916023604519495, + "learning_rate": 7.242510410352581e-06, + "loss": 0.4682, + "step": 9084 + }, + { + "epoch": 1.4920041877938126, + "grad_norm": 0.3129340495290174, + "learning_rate": 7.242138581639162e-06, + "loss": 0.4615, + "step": 9085 + }, + { + "epoch": 1.4921684150021555, + "grad_norm": 0.36526210508797624, + "learning_rate": 7.24176672314342e-06, + "loss": 0.4804, + "step": 9086 + }, + { + "epoch": 1.4923326422104983, + "grad_norm": 0.37940069172741353, + "learning_rate": 7.241394834869395e-06, + "loss": 0.444, + "step": 9087 + }, + { + "epoch": 1.492496869418841, + "grad_norm": 0.36743810559285073, + "learning_rate": 7.241022916821124e-06, + "loss": 0.4617, + "step": 9088 + }, + { + "epoch": 1.4926610966271836, + "grad_norm": 0.3399224851236104, + "learning_rate": 7.240650969002647e-06, + "loss": 0.4767, + "step": 9089 + }, + { + "epoch": 1.4928253238355265, + "grad_norm": 0.3568026906661504, + "learning_rate": 7.2402789914180045e-06, + "loss": 0.4583, + "step": 9090 + }, + { + "epoch": 1.4929895510438693, + "grad_norm": 0.47928165555028224, + "learning_rate": 7.239906984071238e-06, + "loss": 0.4717, + "step": 9091 + }, + { + "epoch": 1.493153778252212, + "grad_norm": 0.3438429100691715, + "learning_rate": 7.239534946966384e-06, + "loss": 0.4975, + "step": 9092 + }, + { + "epoch": 1.4933180054605546, + "grad_norm": 0.37410444420126543, + "learning_rate": 7.239162880107485e-06, + "loss": 0.4614, + "step": 9093 + }, + { + "epoch": 1.4934822326688975, + "grad_norm": 0.3130268740405012, + "learning_rate": 7.238790783498583e-06, + "loss": 0.4601, + "step": 9094 + }, + { + "epoch": 1.49364645987724, + "grad_norm": 0.34707548690935847, + "learning_rate": 7.238418657143716e-06, + "loss": 0.4751, + "step": 9095 + }, + { + "epoch": 1.493810687085583, + "grad_norm": 0.2975015764973547, + "learning_rate": 7.2380465010469316e-06, + "loss": 0.4566, + "step": 9096 + }, + { + "epoch": 1.4939749142939256, + "grad_norm": 0.2923288441893135, + "learning_rate": 7.237674315212267e-06, + "loss": 0.4669, + "step": 9097 + }, + { + "epoch": 1.4941391415022685, + "grad_norm": 0.31449192041777363, + "learning_rate": 7.237302099643766e-06, + "loss": 0.4665, + "step": 9098 + }, + { + "epoch": 1.494303368710611, + "grad_norm": 0.4705678262595334, + "learning_rate": 7.23692985434547e-06, + "loss": 0.4765, + "step": 9099 + }, + { + "epoch": 1.494467595918954, + "grad_norm": 0.3617709294721401, + "learning_rate": 7.236557579321424e-06, + "loss": 0.4689, + "step": 9100 + }, + { + "epoch": 1.4946318231272966, + "grad_norm": 0.3301324253026518, + "learning_rate": 7.23618527457567e-06, + "loss": 0.442, + "step": 9101 + }, + { + "epoch": 1.4947960503356392, + "grad_norm": 0.658918156487996, + "learning_rate": 7.235812940112252e-06, + "loss": 0.4592, + "step": 9102 + }, + { + "epoch": 1.494960277543982, + "grad_norm": 1.0723152036270371, + "learning_rate": 7.235440575935215e-06, + "loss": 0.472, + "step": 9103 + }, + { + "epoch": 1.495124504752325, + "grad_norm": 0.32671311937006464, + "learning_rate": 7.235068182048599e-06, + "loss": 0.4698, + "step": 9104 + }, + { + "epoch": 1.4952887319606676, + "grad_norm": 0.28952442856594846, + "learning_rate": 7.234695758456454e-06, + "loss": 0.4607, + "step": 9105 + }, + { + "epoch": 1.4954529591690102, + "grad_norm": 0.5227314578459401, + "learning_rate": 7.234323305162822e-06, + "loss": 0.4777, + "step": 9106 + }, + { + "epoch": 1.495617186377353, + "grad_norm": 0.2734681071879501, + "learning_rate": 7.233950822171748e-06, + "loss": 0.4682, + "step": 9107 + }, + { + "epoch": 1.495781413585696, + "grad_norm": 0.3778217861495209, + "learning_rate": 7.233578309487279e-06, + "loss": 0.46, + "step": 9108 + }, + { + "epoch": 1.4959456407940386, + "grad_norm": 0.3616470314992457, + "learning_rate": 7.23320576711346e-06, + "loss": 0.4457, + "step": 9109 + }, + { + "epoch": 1.4961098680023812, + "grad_norm": 0.31428696869504247, + "learning_rate": 7.232833195054337e-06, + "loss": 0.4759, + "step": 9110 + }, + { + "epoch": 1.496274095210724, + "grad_norm": 0.3320566406078576, + "learning_rate": 7.232460593313957e-06, + "loss": 0.4645, + "step": 9111 + }, + { + "epoch": 1.4964383224190667, + "grad_norm": 0.5345790329651504, + "learning_rate": 7.232087961896366e-06, + "loss": 0.4563, + "step": 9112 + }, + { + "epoch": 1.4966025496274096, + "grad_norm": 0.41574619219644277, + "learning_rate": 7.231715300805613e-06, + "loss": 0.473, + "step": 9113 + }, + { + "epoch": 1.4967667768357522, + "grad_norm": 0.2652127144284407, + "learning_rate": 7.231342610045744e-06, + "loss": 0.4606, + "step": 9114 + }, + { + "epoch": 1.496931004044095, + "grad_norm": 0.2908442547923296, + "learning_rate": 7.230969889620806e-06, + "loss": 0.4607, + "step": 9115 + }, + { + "epoch": 1.4970952312524377, + "grad_norm": 0.3965280190532817, + "learning_rate": 7.230597139534848e-06, + "loss": 0.465, + "step": 9116 + }, + { + "epoch": 1.4972594584607806, + "grad_norm": 0.37558384455595123, + "learning_rate": 7.230224359791918e-06, + "loss": 0.4845, + "step": 9117 + }, + { + "epoch": 1.4974236856691232, + "grad_norm": 0.2828617605706177, + "learning_rate": 7.2298515503960665e-06, + "loss": 0.4719, + "step": 9118 + }, + { + "epoch": 1.4975879128774658, + "grad_norm": 0.28998875121414835, + "learning_rate": 7.229478711351341e-06, + "loss": 0.4722, + "step": 9119 + }, + { + "epoch": 1.4977521400858087, + "grad_norm": 0.3699351941513517, + "learning_rate": 7.229105842661792e-06, + "loss": 0.4716, + "step": 9120 + }, + { + "epoch": 1.4979163672941516, + "grad_norm": 0.4265125664411172, + "learning_rate": 7.228732944331468e-06, + "loss": 0.4693, + "step": 9121 + }, + { + "epoch": 1.4980805945024942, + "grad_norm": 0.4204329568659064, + "learning_rate": 7.228360016364418e-06, + "loss": 0.4519, + "step": 9122 + }, + { + "epoch": 1.4982448217108368, + "grad_norm": 0.5794156378337116, + "learning_rate": 7.227987058764696e-06, + "loss": 0.4771, + "step": 9123 + }, + { + "epoch": 1.4984090489191797, + "grad_norm": 0.2791439561851294, + "learning_rate": 7.22761407153635e-06, + "loss": 0.4829, + "step": 9124 + }, + { + "epoch": 1.4985732761275226, + "grad_norm": 0.6543863320758504, + "learning_rate": 7.227241054683431e-06, + "loss": 0.466, + "step": 9125 + }, + { + "epoch": 1.4987375033358652, + "grad_norm": 0.4130277000795869, + "learning_rate": 7.2268680082099915e-06, + "loss": 0.4615, + "step": 9126 + }, + { + "epoch": 1.4989017305442078, + "grad_norm": 0.29982715625006073, + "learning_rate": 7.226494932120081e-06, + "loss": 0.4564, + "step": 9127 + }, + { + "epoch": 1.4990659577525507, + "grad_norm": 0.35911683308681835, + "learning_rate": 7.226121826417755e-06, + "loss": 0.4566, + "step": 9128 + }, + { + "epoch": 1.4992301849608933, + "grad_norm": 0.3561038189163182, + "learning_rate": 7.225748691107063e-06, + "loss": 0.4506, + "step": 9129 + }, + { + "epoch": 1.4993944121692362, + "grad_norm": 0.31919930536086555, + "learning_rate": 7.225375526192059e-06, + "loss": 0.4602, + "step": 9130 + }, + { + "epoch": 1.4995586393775788, + "grad_norm": 0.37857621222149634, + "learning_rate": 7.225002331676795e-06, + "loss": 0.4588, + "step": 9131 + }, + { + "epoch": 1.4997228665859217, + "grad_norm": 0.29728581761591794, + "learning_rate": 7.224629107565324e-06, + "loss": 0.4722, + "step": 9132 + }, + { + "epoch": 1.4998870937942643, + "grad_norm": 0.4309870688263262, + "learning_rate": 7.224255853861701e-06, + "loss": 0.4506, + "step": 9133 + }, + { + "epoch": 1.5000513210026072, + "grad_norm": 0.4385984762113458, + "learning_rate": 7.22388257056998e-06, + "loss": 0.4754, + "step": 9134 + }, + { + "epoch": 1.5002155482109498, + "grad_norm": 0.37723360627303476, + "learning_rate": 7.2235092576942125e-06, + "loss": 0.4835, + "step": 9135 + }, + { + "epoch": 1.5003797754192925, + "grad_norm": 0.2969754460478771, + "learning_rate": 7.223135915238455e-06, + "loss": 0.4753, + "step": 9136 + }, + { + "epoch": 1.5005440026276353, + "grad_norm": 0.3254557370821571, + "learning_rate": 7.222762543206763e-06, + "loss": 0.4764, + "step": 9137 + }, + { + "epoch": 1.5007082298359782, + "grad_norm": 0.3534731655018746, + "learning_rate": 7.222389141603192e-06, + "loss": 0.4517, + "step": 9138 + }, + { + "epoch": 1.5008724570443208, + "grad_norm": 1.0771222470195647, + "learning_rate": 7.222015710431795e-06, + "loss": 0.4836, + "step": 9139 + }, + { + "epoch": 1.5010366842526635, + "grad_norm": 0.4629425359743406, + "learning_rate": 7.221642249696629e-06, + "loss": 0.4474, + "step": 9140 + }, + { + "epoch": 1.5012009114610063, + "grad_norm": 0.3712616941989587, + "learning_rate": 7.221268759401751e-06, + "loss": 0.4755, + "step": 9141 + }, + { + "epoch": 1.5013651386693492, + "grad_norm": 0.334863403671556, + "learning_rate": 7.220895239551218e-06, + "loss": 0.4748, + "step": 9142 + }, + { + "epoch": 1.5015293658776918, + "grad_norm": 0.3404372632720664, + "learning_rate": 7.220521690149084e-06, + "loss": 0.4634, + "step": 9143 + }, + { + "epoch": 1.5016935930860345, + "grad_norm": 0.3129244116574231, + "learning_rate": 7.220148111199409e-06, + "loss": 0.4562, + "step": 9144 + }, + { + "epoch": 1.5018578202943773, + "grad_norm": 0.3654382271943735, + "learning_rate": 7.219774502706248e-06, + "loss": 0.4505, + "step": 9145 + }, + { + "epoch": 1.5020220475027202, + "grad_norm": 0.35624390931397154, + "learning_rate": 7.21940086467366e-06, + "loss": 0.4513, + "step": 9146 + }, + { + "epoch": 1.5021862747110628, + "grad_norm": 0.3228160726866767, + "learning_rate": 7.219027197105705e-06, + "loss": 0.4384, + "step": 9147 + }, + { + "epoch": 1.5023505019194054, + "grad_norm": 0.3362682026085022, + "learning_rate": 7.218653500006438e-06, + "loss": 0.4613, + "step": 9148 + }, + { + "epoch": 1.502514729127748, + "grad_norm": 0.3517407041807582, + "learning_rate": 7.21827977337992e-06, + "loss": 0.4515, + "step": 9149 + }, + { + "epoch": 1.502678956336091, + "grad_norm": 0.3059612901310846, + "learning_rate": 7.217906017230208e-06, + "loss": 0.4642, + "step": 9150 + }, + { + "epoch": 1.5028431835444338, + "grad_norm": 0.3005958427302342, + "learning_rate": 7.217532231561363e-06, + "loss": 0.4691, + "step": 9151 + }, + { + "epoch": 1.5030074107527764, + "grad_norm": 0.5129411936571094, + "learning_rate": 7.217158416377445e-06, + "loss": 0.465, + "step": 9152 + }, + { + "epoch": 1.503171637961119, + "grad_norm": 0.3666288166046643, + "learning_rate": 7.216784571682513e-06, + "loss": 0.4509, + "step": 9153 + }, + { + "epoch": 1.503335865169462, + "grad_norm": 0.294422504507227, + "learning_rate": 7.216410697480627e-06, + "loss": 0.4554, + "step": 9154 + }, + { + "epoch": 1.5035000923778048, + "grad_norm": 0.3113601579543826, + "learning_rate": 7.21603679377585e-06, + "loss": 0.4828, + "step": 9155 + }, + { + "epoch": 1.5036643195861474, + "grad_norm": 0.30263614060555033, + "learning_rate": 7.215662860572238e-06, + "loss": 0.4694, + "step": 9156 + }, + { + "epoch": 1.50382854679449, + "grad_norm": 0.4734181151005649, + "learning_rate": 7.2152888978738585e-06, + "loss": 0.4762, + "step": 9157 + }, + { + "epoch": 1.503992774002833, + "grad_norm": 0.3287225146005883, + "learning_rate": 7.214914905684769e-06, + "loss": 0.4666, + "step": 9158 + }, + { + "epoch": 1.5041570012111758, + "grad_norm": 0.305886165071671, + "learning_rate": 7.214540884009032e-06, + "loss": 0.4496, + "step": 9159 + }, + { + "epoch": 1.5043212284195184, + "grad_norm": 0.3011882096853734, + "learning_rate": 7.214166832850711e-06, + "loss": 0.4582, + "step": 9160 + }, + { + "epoch": 1.504485455627861, + "grad_norm": 0.41488963999633277, + "learning_rate": 7.213792752213867e-06, + "loss": 0.4648, + "step": 9161 + }, + { + "epoch": 1.504649682836204, + "grad_norm": 0.31544674799165684, + "learning_rate": 7.213418642102564e-06, + "loss": 0.4844, + "step": 9162 + }, + { + "epoch": 1.5048139100445468, + "grad_norm": 0.28949199485026034, + "learning_rate": 7.213044502520866e-06, + "loss": 0.4671, + "step": 9163 + }, + { + "epoch": 1.5049781372528894, + "grad_norm": 0.2939590327333749, + "learning_rate": 7.212670333472835e-06, + "loss": 0.4771, + "step": 9164 + }, + { + "epoch": 1.505142364461232, + "grad_norm": 0.3670101412740805, + "learning_rate": 7.212296134962533e-06, + "loss": 0.4688, + "step": 9165 + }, + { + "epoch": 1.5053065916695747, + "grad_norm": 0.28723258461262186, + "learning_rate": 7.2119219069940296e-06, + "loss": 0.4862, + "step": 9166 + }, + { + "epoch": 1.5054708188779176, + "grad_norm": 0.32542175887326896, + "learning_rate": 7.2115476495713846e-06, + "loss": 0.4553, + "step": 9167 + }, + { + "epoch": 1.5056350460862604, + "grad_norm": 0.3007208342378376, + "learning_rate": 7.211173362698664e-06, + "loss": 0.4659, + "step": 9168 + }, + { + "epoch": 1.505799273294603, + "grad_norm": 0.38737030393235344, + "learning_rate": 7.210799046379935e-06, + "loss": 0.4686, + "step": 9169 + }, + { + "epoch": 1.5059635005029457, + "grad_norm": 0.26963051081694284, + "learning_rate": 7.210424700619259e-06, + "loss": 0.4454, + "step": 9170 + }, + { + "epoch": 1.5061277277112886, + "grad_norm": 0.34262459734970785, + "learning_rate": 7.210050325420705e-06, + "loss": 0.4565, + "step": 9171 + }, + { + "epoch": 1.5062919549196314, + "grad_norm": 0.5174600911956027, + "learning_rate": 7.209675920788338e-06, + "loss": 0.4668, + "step": 9172 + }, + { + "epoch": 1.506456182127974, + "grad_norm": 0.29129980320508153, + "learning_rate": 7.209301486726226e-06, + "loss": 0.4603, + "step": 9173 + }, + { + "epoch": 1.5066204093363167, + "grad_norm": 0.2999281854094833, + "learning_rate": 7.208927023238432e-06, + "loss": 0.4459, + "step": 9174 + }, + { + "epoch": 1.5067846365446596, + "grad_norm": 0.34509789834469307, + "learning_rate": 7.208552530329026e-06, + "loss": 0.4825, + "step": 9175 + }, + { + "epoch": 1.5069488637530024, + "grad_norm": 0.33035112975237557, + "learning_rate": 7.208178008002076e-06, + "loss": 0.4466, + "step": 9176 + }, + { + "epoch": 1.507113090961345, + "grad_norm": 0.33090831360597694, + "learning_rate": 7.2078034562616465e-06, + "loss": 0.4671, + "step": 9177 + }, + { + "epoch": 1.5072773181696877, + "grad_norm": 0.2959356681925396, + "learning_rate": 7.207428875111809e-06, + "loss": 0.4513, + "step": 9178 + }, + { + "epoch": 1.5074415453780305, + "grad_norm": 0.27555625602430894, + "learning_rate": 7.207054264556629e-06, + "loss": 0.4824, + "step": 9179 + }, + { + "epoch": 1.5076057725863734, + "grad_norm": 0.3831379474711666, + "learning_rate": 7.206679624600177e-06, + "loss": 0.4436, + "step": 9180 + }, + { + "epoch": 1.507769999794716, + "grad_norm": 0.3182477305511219, + "learning_rate": 7.206304955246521e-06, + "loss": 0.4693, + "step": 9181 + }, + { + "epoch": 1.5079342270030587, + "grad_norm": 0.3604807686672664, + "learning_rate": 7.2059302564997295e-06, + "loss": 0.4604, + "step": 9182 + }, + { + "epoch": 1.5080984542114013, + "grad_norm": 0.5300038312700972, + "learning_rate": 7.205555528363875e-06, + "loss": 0.4699, + "step": 9183 + }, + { + "epoch": 1.5082626814197442, + "grad_norm": 0.2701038850534607, + "learning_rate": 7.205180770843024e-06, + "loss": 0.4949, + "step": 9184 + }, + { + "epoch": 1.508426908628087, + "grad_norm": 0.7164451544357832, + "learning_rate": 7.204805983941249e-06, + "loss": 0.4636, + "step": 9185 + }, + { + "epoch": 1.5085911358364297, + "grad_norm": 0.45159203892050004, + "learning_rate": 7.20443116766262e-06, + "loss": 0.4948, + "step": 9186 + }, + { + "epoch": 1.5087553630447723, + "grad_norm": 0.28798609716907286, + "learning_rate": 7.204056322011208e-06, + "loss": 0.4756, + "step": 9187 + }, + { + "epoch": 1.5089195902531152, + "grad_norm": 0.2880361608967198, + "learning_rate": 7.203681446991084e-06, + "loss": 0.4714, + "step": 9188 + }, + { + "epoch": 1.509083817461458, + "grad_norm": 0.5247905124145498, + "learning_rate": 7.2033065426063176e-06, + "loss": 0.4727, + "step": 9189 + }, + { + "epoch": 1.5092480446698007, + "grad_norm": 0.6860853909871801, + "learning_rate": 7.202931608860984e-06, + "loss": 0.4934, + "step": 9190 + }, + { + "epoch": 1.5094122718781433, + "grad_norm": 0.28275800142480045, + "learning_rate": 7.202556645759153e-06, + "loss": 0.4777, + "step": 9191 + }, + { + "epoch": 1.5095764990864862, + "grad_norm": 0.31027777276533725, + "learning_rate": 7.2021816533049e-06, + "loss": 0.4692, + "step": 9192 + }, + { + "epoch": 1.509740726294829, + "grad_norm": 0.3172137811682593, + "learning_rate": 7.2018066315022925e-06, + "loss": 0.4735, + "step": 9193 + }, + { + "epoch": 1.5099049535031717, + "grad_norm": 0.3372622416767625, + "learning_rate": 7.201431580355408e-06, + "loss": 0.4537, + "step": 9194 + }, + { + "epoch": 1.5100691807115143, + "grad_norm": 0.31133923346230863, + "learning_rate": 7.201056499868319e-06, + "loss": 0.4743, + "step": 9195 + }, + { + "epoch": 1.5102334079198572, + "grad_norm": 0.3781581429663542, + "learning_rate": 7.2006813900451e-06, + "loss": 0.4559, + "step": 9196 + }, + { + "epoch": 1.5103976351282, + "grad_norm": 0.30980865474592634, + "learning_rate": 7.2003062508898224e-06, + "loss": 0.471, + "step": 9197 + }, + { + "epoch": 1.5105618623365427, + "grad_norm": 0.3164800874921807, + "learning_rate": 7.1999310824065624e-06, + "loss": 0.4684, + "step": 9198 + }, + { + "epoch": 1.5107260895448853, + "grad_norm": 0.33008071514670867, + "learning_rate": 7.199555884599395e-06, + "loss": 0.4589, + "step": 9199 + }, + { + "epoch": 1.510890316753228, + "grad_norm": 0.3651681256005668, + "learning_rate": 7.199180657472395e-06, + "loss": 0.4586, + "step": 9200 + }, + { + "epoch": 1.5110545439615708, + "grad_norm": 0.3513853825676322, + "learning_rate": 7.198805401029636e-06, + "loss": 0.4498, + "step": 9201 + }, + { + "epoch": 1.5112187711699137, + "grad_norm": 0.3528677624533436, + "learning_rate": 7.1984301152751956e-06, + "loss": 0.4606, + "step": 9202 + }, + { + "epoch": 1.5113829983782563, + "grad_norm": 0.4279584123728494, + "learning_rate": 7.198054800213151e-06, + "loss": 0.452, + "step": 9203 + }, + { + "epoch": 1.511547225586599, + "grad_norm": 0.2921779445976517, + "learning_rate": 7.1976794558475745e-06, + "loss": 0.4739, + "step": 9204 + }, + { + "epoch": 1.5117114527949418, + "grad_norm": 0.2706453446276023, + "learning_rate": 7.1973040821825465e-06, + "loss": 0.4623, + "step": 9205 + }, + { + "epoch": 1.5118756800032846, + "grad_norm": 0.2520182801619176, + "learning_rate": 7.196928679222141e-06, + "loss": 0.4442, + "step": 9206 + }, + { + "epoch": 1.5120399072116273, + "grad_norm": 0.29407078062875935, + "learning_rate": 7.196553246970438e-06, + "loss": 0.4711, + "step": 9207 + }, + { + "epoch": 1.51220413441997, + "grad_norm": 0.3291641852823142, + "learning_rate": 7.196177785431513e-06, + "loss": 0.4488, + "step": 9208 + }, + { + "epoch": 1.5123683616283128, + "grad_norm": 0.314166634713391, + "learning_rate": 7.195802294609444e-06, + "loss": 0.458, + "step": 9209 + }, + { + "epoch": 1.5125325888366556, + "grad_norm": 0.30014099078366124, + "learning_rate": 7.195426774508311e-06, + "loss": 0.4725, + "step": 9210 + }, + { + "epoch": 1.5126968160449983, + "grad_norm": 0.28804167027539607, + "learning_rate": 7.19505122513219e-06, + "loss": 0.4813, + "step": 9211 + }, + { + "epoch": 1.512861043253341, + "grad_norm": 0.27774905359300783, + "learning_rate": 7.194675646485161e-06, + "loss": 0.4516, + "step": 9212 + }, + { + "epoch": 1.5130252704616838, + "grad_norm": 0.3376729483173663, + "learning_rate": 7.194300038571305e-06, + "loss": 0.4664, + "step": 9213 + }, + { + "epoch": 1.5131894976700266, + "grad_norm": 0.2990700174753199, + "learning_rate": 7.193924401394699e-06, + "loss": 0.4613, + "step": 9214 + }, + { + "epoch": 1.5133537248783693, + "grad_norm": 0.2570384432016577, + "learning_rate": 7.193548734959423e-06, + "loss": 0.4556, + "step": 9215 + }, + { + "epoch": 1.513517952086712, + "grad_norm": 0.3070126716863017, + "learning_rate": 7.193173039269558e-06, + "loss": 0.464, + "step": 9216 + }, + { + "epoch": 1.5136821792950546, + "grad_norm": 0.29290536910157305, + "learning_rate": 7.1927973143291835e-06, + "loss": 0.4396, + "step": 9217 + }, + { + "epoch": 1.5138464065033974, + "grad_norm": 0.3581538018351269, + "learning_rate": 7.192421560142382e-06, + "loss": 0.4615, + "step": 9218 + }, + { + "epoch": 1.5140106337117403, + "grad_norm": 0.36202199207173685, + "learning_rate": 7.192045776713232e-06, + "loss": 0.4596, + "step": 9219 + }, + { + "epoch": 1.514174860920083, + "grad_norm": 0.29895826521978897, + "learning_rate": 7.191669964045818e-06, + "loss": 0.466, + "step": 9220 + }, + { + "epoch": 1.5143390881284255, + "grad_norm": 0.3183924651940098, + "learning_rate": 7.191294122144217e-06, + "loss": 0.48, + "step": 9221 + }, + { + "epoch": 1.5145033153367684, + "grad_norm": 0.28198083034908455, + "learning_rate": 7.190918251012517e-06, + "loss": 0.4663, + "step": 9222 + }, + { + "epoch": 1.5146675425451113, + "grad_norm": 0.3303474365929383, + "learning_rate": 7.190542350654796e-06, + "loss": 0.4923, + "step": 9223 + }, + { + "epoch": 1.514831769753454, + "grad_norm": 0.4024297610145994, + "learning_rate": 7.190166421075138e-06, + "loss": 0.4718, + "step": 9224 + }, + { + "epoch": 1.5149959969617965, + "grad_norm": 0.25837992364309836, + "learning_rate": 7.189790462277626e-06, + "loss": 0.4564, + "step": 9225 + }, + { + "epoch": 1.5151602241701394, + "grad_norm": 0.39812967678782274, + "learning_rate": 7.1894144742663435e-06, + "loss": 0.4519, + "step": 9226 + }, + { + "epoch": 1.5153244513784823, + "grad_norm": 0.3450635706066045, + "learning_rate": 7.189038457045372e-06, + "loss": 0.453, + "step": 9227 + }, + { + "epoch": 1.515488678586825, + "grad_norm": 0.2783889625191982, + "learning_rate": 7.1886624106188e-06, + "loss": 0.475, + "step": 9228 + }, + { + "epoch": 1.5156529057951675, + "grad_norm": 0.3376557375783973, + "learning_rate": 7.1882863349907076e-06, + "loss": 0.4526, + "step": 9229 + }, + { + "epoch": 1.5158171330035104, + "grad_norm": 0.2896971589896287, + "learning_rate": 7.187910230165181e-06, + "loss": 0.4553, + "step": 9230 + }, + { + "epoch": 1.5159813602118533, + "grad_norm": 0.3157032612136202, + "learning_rate": 7.187534096146304e-06, + "loss": 0.472, + "step": 9231 + }, + { + "epoch": 1.516145587420196, + "grad_norm": 0.30994446974518053, + "learning_rate": 7.1871579329381625e-06, + "loss": 0.4639, + "step": 9232 + }, + { + "epoch": 1.5163098146285385, + "grad_norm": 0.33537568493394626, + "learning_rate": 7.186781740544842e-06, + "loss": 0.4527, + "step": 9233 + }, + { + "epoch": 1.5164740418368812, + "grad_norm": 0.3319121755581918, + "learning_rate": 7.18640551897043e-06, + "loss": 0.4741, + "step": 9234 + }, + { + "epoch": 1.516638269045224, + "grad_norm": 0.29375157989929307, + "learning_rate": 7.18602926821901e-06, + "loss": 0.4805, + "step": 9235 + }, + { + "epoch": 1.516802496253567, + "grad_norm": 0.278719060087101, + "learning_rate": 7.185652988294668e-06, + "loss": 0.459, + "step": 9236 + }, + { + "epoch": 1.5169667234619095, + "grad_norm": 0.28645821773642316, + "learning_rate": 7.185276679201494e-06, + "loss": 0.4561, + "step": 9237 + }, + { + "epoch": 1.5171309506702522, + "grad_norm": 0.3963936354834165, + "learning_rate": 7.184900340943574e-06, + "loss": 0.4494, + "step": 9238 + }, + { + "epoch": 1.517295177878595, + "grad_norm": 0.26055817683213023, + "learning_rate": 7.184523973524993e-06, + "loss": 0.4488, + "step": 9239 + }, + { + "epoch": 1.5174594050869379, + "grad_norm": 0.31956318401914047, + "learning_rate": 7.184147576949841e-06, + "loss": 0.4846, + "step": 9240 + }, + { + "epoch": 1.5176236322952805, + "grad_norm": 0.6295143635584098, + "learning_rate": 7.183771151222205e-06, + "loss": 0.4712, + "step": 9241 + }, + { + "epoch": 1.5177878595036232, + "grad_norm": 0.35754560129622853, + "learning_rate": 7.183394696346175e-06, + "loss": 0.4528, + "step": 9242 + }, + { + "epoch": 1.517952086711966, + "grad_norm": 0.44717398192916935, + "learning_rate": 7.1830182123258376e-06, + "loss": 0.4685, + "step": 9243 + }, + { + "epoch": 1.5181163139203089, + "grad_norm": 0.44012743422950795, + "learning_rate": 7.1826416991652835e-06, + "loss": 0.4705, + "step": 9244 + }, + { + "epoch": 1.5182805411286515, + "grad_norm": 0.26859917683990014, + "learning_rate": 7.182265156868599e-06, + "loss": 0.4438, + "step": 9245 + }, + { + "epoch": 1.5184447683369942, + "grad_norm": 0.28892403464626515, + "learning_rate": 7.181888585439879e-06, + "loss": 0.4714, + "step": 9246 + }, + { + "epoch": 1.518608995545337, + "grad_norm": 0.38304813283172645, + "learning_rate": 7.181511984883208e-06, + "loss": 0.4867, + "step": 9247 + }, + { + "epoch": 1.5187732227536799, + "grad_norm": 0.30305384307379923, + "learning_rate": 7.181135355202679e-06, + "loss": 0.4791, + "step": 9248 + }, + { + "epoch": 1.5189374499620225, + "grad_norm": 0.3155490544939365, + "learning_rate": 7.180758696402382e-06, + "loss": 0.4766, + "step": 9249 + }, + { + "epoch": 1.5191016771703652, + "grad_norm": 0.2906595932861458, + "learning_rate": 7.180382008486409e-06, + "loss": 0.4532, + "step": 9250 + }, + { + "epoch": 1.5192659043787078, + "grad_norm": 0.4474126938917336, + "learning_rate": 7.18000529145885e-06, + "loss": 0.4571, + "step": 9251 + }, + { + "epoch": 1.5194301315870506, + "grad_norm": 0.31794116167934067, + "learning_rate": 7.179628545323797e-06, + "loss": 0.4786, + "step": 9252 + }, + { + "epoch": 1.5195943587953935, + "grad_norm": 0.3263806739083232, + "learning_rate": 7.1792517700853405e-06, + "loss": 0.4735, + "step": 9253 + }, + { + "epoch": 1.5197585860037361, + "grad_norm": 0.3827263780224837, + "learning_rate": 7.178874965747575e-06, + "loss": 0.4625, + "step": 9254 + }, + { + "epoch": 1.5199228132120788, + "grad_norm": 0.3192602339087632, + "learning_rate": 7.178498132314591e-06, + "loss": 0.4676, + "step": 9255 + }, + { + "epoch": 1.5200870404204216, + "grad_norm": 0.39440388028059575, + "learning_rate": 7.1781212697904815e-06, + "loss": 0.4611, + "step": 9256 + }, + { + "epoch": 1.5202512676287645, + "grad_norm": 0.32399627276062054, + "learning_rate": 7.177744378179342e-06, + "loss": 0.4473, + "step": 9257 + }, + { + "epoch": 1.5204154948371071, + "grad_norm": 0.2790344377044542, + "learning_rate": 7.177367457485262e-06, + "loss": 0.4805, + "step": 9258 + }, + { + "epoch": 1.5205797220454498, + "grad_norm": 0.3458950067863732, + "learning_rate": 7.176990507712338e-06, + "loss": 0.4459, + "step": 9259 + }, + { + "epoch": 1.5207439492537926, + "grad_norm": 0.3416751416590733, + "learning_rate": 7.176613528864664e-06, + "loss": 0.4549, + "step": 9260 + }, + { + "epoch": 1.5209081764621355, + "grad_norm": 0.3240434085579008, + "learning_rate": 7.176236520946333e-06, + "loss": 0.4541, + "step": 9261 + }, + { + "epoch": 1.5210724036704781, + "grad_norm": 0.28624661378771976, + "learning_rate": 7.175859483961441e-06, + "loss": 0.4687, + "step": 9262 + }, + { + "epoch": 1.5212366308788208, + "grad_norm": 0.31156116458197464, + "learning_rate": 7.175482417914081e-06, + "loss": 0.4775, + "step": 9263 + }, + { + "epoch": 1.5214008580871636, + "grad_norm": 0.3201213181079633, + "learning_rate": 7.1751053228083495e-06, + "loss": 0.4542, + "step": 9264 + }, + { + "epoch": 1.5215650852955065, + "grad_norm": 0.2896366480561736, + "learning_rate": 7.174728198648343e-06, + "loss": 0.4613, + "step": 9265 + }, + { + "epoch": 1.5217293125038491, + "grad_norm": 1.5454545712227965, + "learning_rate": 7.174351045438156e-06, + "loss": 0.4528, + "step": 9266 + }, + { + "epoch": 1.5218935397121918, + "grad_norm": 0.4130809329340727, + "learning_rate": 7.173973863181886e-06, + "loss": 0.4612, + "step": 9267 + }, + { + "epoch": 1.5220577669205344, + "grad_norm": 0.5392643599819068, + "learning_rate": 7.173596651883629e-06, + "loss": 0.4734, + "step": 9268 + }, + { + "epoch": 1.5222219941288773, + "grad_norm": 0.29554384020455526, + "learning_rate": 7.173219411547483e-06, + "loss": 0.4572, + "step": 9269 + }, + { + "epoch": 1.5223862213372201, + "grad_norm": 0.37929201827686776, + "learning_rate": 7.172842142177543e-06, + "loss": 0.4761, + "step": 9270 + }, + { + "epoch": 1.5225504485455628, + "grad_norm": 0.27942306578844844, + "learning_rate": 7.172464843777907e-06, + "loss": 0.4546, + "step": 9271 + }, + { + "epoch": 1.5227146757539054, + "grad_norm": 0.31283795812998494, + "learning_rate": 7.172087516352674e-06, + "loss": 0.4632, + "step": 9272 + }, + { + "epoch": 1.5228789029622483, + "grad_norm": 0.28612405394145773, + "learning_rate": 7.171710159905943e-06, + "loss": 0.4581, + "step": 9273 + }, + { + "epoch": 1.5230431301705911, + "grad_norm": 0.4814978265268251, + "learning_rate": 7.171332774441809e-06, + "loss": 0.4654, + "step": 9274 + }, + { + "epoch": 1.5232073573789338, + "grad_norm": 0.3974772117244747, + "learning_rate": 7.170955359964373e-06, + "loss": 0.4625, + "step": 9275 + }, + { + "epoch": 1.5233715845872764, + "grad_norm": 0.3978478495668505, + "learning_rate": 7.170577916477736e-06, + "loss": 0.4508, + "step": 9276 + }, + { + "epoch": 1.5235358117956193, + "grad_norm": 0.47194095448274265, + "learning_rate": 7.170200443985993e-06, + "loss": 0.4553, + "step": 9277 + }, + { + "epoch": 1.5237000390039621, + "grad_norm": 0.37774460441156743, + "learning_rate": 7.1698229424932476e-06, + "loss": 0.4612, + "step": 9278 + }, + { + "epoch": 1.5238642662123048, + "grad_norm": 0.36763814623347385, + "learning_rate": 7.169445412003598e-06, + "loss": 0.4604, + "step": 9279 + }, + { + "epoch": 1.5240284934206474, + "grad_norm": 0.5037516746821502, + "learning_rate": 7.169067852521144e-06, + "loss": 0.478, + "step": 9280 + }, + { + "epoch": 1.5241927206289902, + "grad_norm": 0.4212263940251124, + "learning_rate": 7.1686902640499876e-06, + "loss": 0.4634, + "step": 9281 + }, + { + "epoch": 1.524356947837333, + "grad_norm": 0.639577271055163, + "learning_rate": 7.168312646594228e-06, + "loss": 0.4659, + "step": 9282 + }, + { + "epoch": 1.5245211750456757, + "grad_norm": 0.31425965396324984, + "learning_rate": 7.16793500015797e-06, + "loss": 0.458, + "step": 9283 + }, + { + "epoch": 1.5246854022540184, + "grad_norm": 0.3549092300339072, + "learning_rate": 7.167557324745312e-06, + "loss": 0.4847, + "step": 9284 + }, + { + "epoch": 1.524849629462361, + "grad_norm": 0.464940002037064, + "learning_rate": 7.167179620360357e-06, + "loss": 0.4851, + "step": 9285 + }, + { + "epoch": 1.5250138566707039, + "grad_norm": 0.7404536442112363, + "learning_rate": 7.166801887007208e-06, + "loss": 0.4645, + "step": 9286 + }, + { + "epoch": 1.5251780838790467, + "grad_norm": 0.3797169402295166, + "learning_rate": 7.166424124689965e-06, + "loss": 0.4729, + "step": 9287 + }, + { + "epoch": 1.5253423110873894, + "grad_norm": 0.4800510956035743, + "learning_rate": 7.1660463334127345e-06, + "loss": 0.473, + "step": 9288 + }, + { + "epoch": 1.525506538295732, + "grad_norm": 0.29060595534750716, + "learning_rate": 7.165668513179617e-06, + "loss": 0.4534, + "step": 9289 + }, + { + "epoch": 1.5256707655040749, + "grad_norm": 0.30087575863625227, + "learning_rate": 7.165290663994715e-06, + "loss": 0.4956, + "step": 9290 + }, + { + "epoch": 1.5258349927124177, + "grad_norm": 0.339029574512295, + "learning_rate": 7.1649127858621354e-06, + "loss": 0.4774, + "step": 9291 + }, + { + "epoch": 1.5259992199207604, + "grad_norm": 0.2916719047991023, + "learning_rate": 7.164534878785982e-06, + "loss": 0.4867, + "step": 9292 + }, + { + "epoch": 1.526163447129103, + "grad_norm": 0.30316356818056006, + "learning_rate": 7.1641569427703585e-06, + "loss": 0.4491, + "step": 9293 + }, + { + "epoch": 1.5263276743374459, + "grad_norm": 0.34009506124918254, + "learning_rate": 7.163778977819368e-06, + "loss": 0.4506, + "step": 9294 + }, + { + "epoch": 1.5264919015457887, + "grad_norm": 0.3158249532017147, + "learning_rate": 7.163400983937117e-06, + "loss": 0.4728, + "step": 9295 + }, + { + "epoch": 1.5266561287541314, + "grad_norm": 0.31123665592368355, + "learning_rate": 7.163022961127711e-06, + "loss": 0.4595, + "step": 9296 + }, + { + "epoch": 1.526820355962474, + "grad_norm": 0.3281051407634187, + "learning_rate": 7.162644909395256e-06, + "loss": 0.4652, + "step": 9297 + }, + { + "epoch": 1.5269845831708169, + "grad_norm": 0.29028559973315327, + "learning_rate": 7.162266828743857e-06, + "loss": 0.469, + "step": 9298 + }, + { + "epoch": 1.5271488103791597, + "grad_norm": 0.26500384107962766, + "learning_rate": 7.161888719177622e-06, + "loss": 0.4717, + "step": 9299 + }, + { + "epoch": 1.5273130375875024, + "grad_norm": 0.3146412183956378, + "learning_rate": 7.161510580700656e-06, + "loss": 0.4669, + "step": 9300 + }, + { + "epoch": 1.527477264795845, + "grad_norm": 0.30019999592266705, + "learning_rate": 7.161132413317068e-06, + "loss": 0.4657, + "step": 9301 + }, + { + "epoch": 1.5276414920041876, + "grad_norm": 0.28169330642062956, + "learning_rate": 7.160754217030962e-06, + "loss": 0.4525, + "step": 9302 + }, + { + "epoch": 1.5278057192125305, + "grad_norm": 0.5323783758611011, + "learning_rate": 7.1603759918464476e-06, + "loss": 0.4877, + "step": 9303 + }, + { + "epoch": 1.5279699464208734, + "grad_norm": 0.2690038165753098, + "learning_rate": 7.159997737767632e-06, + "loss": 0.4695, + "step": 9304 + }, + { + "epoch": 1.528134173629216, + "grad_norm": 0.709221517163938, + "learning_rate": 7.159619454798625e-06, + "loss": 0.4505, + "step": 9305 + }, + { + "epoch": 1.5282984008375586, + "grad_norm": 0.26711462113418744, + "learning_rate": 7.159241142943533e-06, + "loss": 0.4658, + "step": 9306 + }, + { + "epoch": 1.5284626280459015, + "grad_norm": 0.3039516845191958, + "learning_rate": 7.158862802206466e-06, + "loss": 0.4671, + "step": 9307 + }, + { + "epoch": 1.5286268552542444, + "grad_norm": 0.2631380237679185, + "learning_rate": 7.158484432591534e-06, + "loss": 0.457, + "step": 9308 + }, + { + "epoch": 1.528791082462587, + "grad_norm": 0.34880055704945134, + "learning_rate": 7.158106034102846e-06, + "loss": 0.459, + "step": 9309 + }, + { + "epoch": 1.5289553096709296, + "grad_norm": 0.4332345447097457, + "learning_rate": 7.1577276067445094e-06, + "loss": 0.481, + "step": 9310 + }, + { + "epoch": 1.5291195368792725, + "grad_norm": 0.2701428254345847, + "learning_rate": 7.157349150520636e-06, + "loss": 0.464, + "step": 9311 + }, + { + "epoch": 1.5292837640876153, + "grad_norm": 0.3191308013759308, + "learning_rate": 7.156970665435338e-06, + "loss": 0.461, + "step": 9312 + }, + { + "epoch": 1.529447991295958, + "grad_norm": 0.3030079025076875, + "learning_rate": 7.156592151492722e-06, + "loss": 0.4735, + "step": 9313 + }, + { + "epoch": 1.5296122185043006, + "grad_norm": 0.33155877053915367, + "learning_rate": 7.156213608696904e-06, + "loss": 0.5012, + "step": 9314 + }, + { + "epoch": 1.5297764457126435, + "grad_norm": 0.3130320047585034, + "learning_rate": 7.155835037051993e-06, + "loss": 0.4917, + "step": 9315 + }, + { + "epoch": 1.5299406729209863, + "grad_norm": 0.36060204593966044, + "learning_rate": 7.155456436562098e-06, + "loss": 0.4803, + "step": 9316 + }, + { + "epoch": 1.530104900129329, + "grad_norm": 0.32398573562386035, + "learning_rate": 7.155077807231336e-06, + "loss": 0.4495, + "step": 9317 + }, + { + "epoch": 1.5302691273376716, + "grad_norm": 0.27751479721468064, + "learning_rate": 7.154699149063816e-06, + "loss": 0.4826, + "step": 9318 + }, + { + "epoch": 1.5304333545460143, + "grad_norm": 0.42071383808180096, + "learning_rate": 7.1543204620636505e-06, + "loss": 0.4529, + "step": 9319 + }, + { + "epoch": 1.5305975817543571, + "grad_norm": 0.44110763680049825, + "learning_rate": 7.153941746234953e-06, + "loss": 0.4987, + "step": 9320 + }, + { + "epoch": 1.5307618089627, + "grad_norm": 0.289458468189357, + "learning_rate": 7.153563001581838e-06, + "loss": 0.4733, + "step": 9321 + }, + { + "epoch": 1.5309260361710426, + "grad_norm": 0.2512213588331616, + "learning_rate": 7.153184228108419e-06, + "loss": 0.4549, + "step": 9322 + }, + { + "epoch": 1.5310902633793853, + "grad_norm": 0.3004273617854931, + "learning_rate": 7.152805425818807e-06, + "loss": 0.454, + "step": 9323 + }, + { + "epoch": 1.5312544905877281, + "grad_norm": 0.32267924554730515, + "learning_rate": 7.152426594717119e-06, + "loss": 0.4796, + "step": 9324 + }, + { + "epoch": 1.531418717796071, + "grad_norm": 0.2957664116913034, + "learning_rate": 7.152047734807469e-06, + "loss": 0.4688, + "step": 9325 + }, + { + "epoch": 1.5315829450044136, + "grad_norm": 0.29607329025830087, + "learning_rate": 7.151668846093971e-06, + "loss": 0.4735, + "step": 9326 + }, + { + "epoch": 1.5317471722127562, + "grad_norm": 0.34564464756084806, + "learning_rate": 7.15128992858074e-06, + "loss": 0.4746, + "step": 9327 + }, + { + "epoch": 1.531911399421099, + "grad_norm": 0.3632349137268046, + "learning_rate": 7.1509109822718915e-06, + "loss": 0.4638, + "step": 9328 + }, + { + "epoch": 1.532075626629442, + "grad_norm": 0.3528582463422724, + "learning_rate": 7.150532007171542e-06, + "loss": 0.4739, + "step": 9329 + }, + { + "epoch": 1.5322398538377846, + "grad_norm": 0.2851865961310054, + "learning_rate": 7.150153003283807e-06, + "loss": 0.4625, + "step": 9330 + }, + { + "epoch": 1.5324040810461272, + "grad_norm": 0.2644495220903839, + "learning_rate": 7.149773970612804e-06, + "loss": 0.4744, + "step": 9331 + }, + { + "epoch": 1.53256830825447, + "grad_norm": 0.26070085543493976, + "learning_rate": 7.149394909162648e-06, + "loss": 0.4608, + "step": 9332 + }, + { + "epoch": 1.532732535462813, + "grad_norm": 0.29613010697864284, + "learning_rate": 7.149015818937456e-06, + "loss": 0.4521, + "step": 9333 + }, + { + "epoch": 1.5328967626711556, + "grad_norm": 0.31117686961305685, + "learning_rate": 7.148636699941347e-06, + "loss": 0.4644, + "step": 9334 + }, + { + "epoch": 1.5330609898794982, + "grad_norm": 0.29211819423313984, + "learning_rate": 7.148257552178438e-06, + "loss": 0.4821, + "step": 9335 + }, + { + "epoch": 1.5332252170878409, + "grad_norm": 0.2703181498755695, + "learning_rate": 7.147878375652844e-06, + "loss": 0.4721, + "step": 9336 + }, + { + "epoch": 1.5333894442961837, + "grad_norm": 0.33278351582547366, + "learning_rate": 7.147499170368688e-06, + "loss": 0.4536, + "step": 9337 + }, + { + "epoch": 1.5335536715045266, + "grad_norm": 0.28749069490029466, + "learning_rate": 7.1471199363300845e-06, + "loss": 0.4548, + "step": 9338 + }, + { + "epoch": 1.5337178987128692, + "grad_norm": 0.29073060662806555, + "learning_rate": 7.146740673541155e-06, + "loss": 0.4615, + "step": 9339 + }, + { + "epoch": 1.5338821259212119, + "grad_norm": 0.4272828402384313, + "learning_rate": 7.146361382006019e-06, + "loss": 0.4344, + "step": 9340 + }, + { + "epoch": 1.5340463531295547, + "grad_norm": 0.28147195710433215, + "learning_rate": 7.145982061728792e-06, + "loss": 0.4752, + "step": 9341 + }, + { + "epoch": 1.5342105803378976, + "grad_norm": 0.3439436486971339, + "learning_rate": 7.145602712713598e-06, + "loss": 0.4666, + "step": 9342 + }, + { + "epoch": 1.5343748075462402, + "grad_norm": 0.3488722878311697, + "learning_rate": 7.145223334964556e-06, + "loss": 0.4615, + "step": 9343 + }, + { + "epoch": 1.5345390347545829, + "grad_norm": 0.30250466922681474, + "learning_rate": 7.144843928485786e-06, + "loss": 0.4663, + "step": 9344 + }, + { + "epoch": 1.5347032619629257, + "grad_norm": 0.2913076112847636, + "learning_rate": 7.144464493281407e-06, + "loss": 0.493, + "step": 9345 + }, + { + "epoch": 1.5348674891712686, + "grad_norm": 0.27574286461367503, + "learning_rate": 7.144085029355544e-06, + "loss": 0.4568, + "step": 9346 + }, + { + "epoch": 1.5350317163796112, + "grad_norm": 0.3085587250021239, + "learning_rate": 7.143705536712316e-06, + "loss": 0.459, + "step": 9347 + }, + { + "epoch": 1.5351959435879539, + "grad_norm": 0.42885403672499844, + "learning_rate": 7.143326015355844e-06, + "loss": 0.4581, + "step": 9348 + }, + { + "epoch": 1.5353601707962967, + "grad_norm": 0.2740916414682097, + "learning_rate": 7.14294646529025e-06, + "loss": 0.4518, + "step": 9349 + }, + { + "epoch": 1.5355243980046396, + "grad_norm": 0.3450236813009234, + "learning_rate": 7.1425668865196585e-06, + "loss": 0.4523, + "step": 9350 + }, + { + "epoch": 1.5356886252129822, + "grad_norm": 0.38502119714992583, + "learning_rate": 7.14218727904819e-06, + "loss": 0.4633, + "step": 9351 + }, + { + "epoch": 1.5358528524213249, + "grad_norm": 0.4006288598907935, + "learning_rate": 7.1418076428799685e-06, + "loss": 0.4618, + "step": 9352 + }, + { + "epoch": 1.5360170796296675, + "grad_norm": 0.5293557699101507, + "learning_rate": 7.141427978019116e-06, + "loss": 0.4661, + "step": 9353 + }, + { + "epoch": 1.5361813068380104, + "grad_norm": 0.3574612680794133, + "learning_rate": 7.141048284469758e-06, + "loss": 0.4801, + "step": 9354 + }, + { + "epoch": 1.5363455340463532, + "grad_norm": 0.3274060576534963, + "learning_rate": 7.1406685622360174e-06, + "loss": 0.4723, + "step": 9355 + }, + { + "epoch": 1.5365097612546958, + "grad_norm": 0.3126087621576872, + "learning_rate": 7.140288811322017e-06, + "loss": 0.4491, + "step": 9356 + }, + { + "epoch": 1.5366739884630385, + "grad_norm": 0.43435643215596625, + "learning_rate": 7.139909031731883e-06, + "loss": 0.476, + "step": 9357 + }, + { + "epoch": 1.5368382156713813, + "grad_norm": 0.2922725077472286, + "learning_rate": 7.139529223469738e-06, + "loss": 0.4591, + "step": 9358 + }, + { + "epoch": 1.5370024428797242, + "grad_norm": 0.33144720651032605, + "learning_rate": 7.139149386539711e-06, + "loss": 0.4807, + "step": 9359 + }, + { + "epoch": 1.5371666700880668, + "grad_norm": 0.2945145824570666, + "learning_rate": 7.138769520945925e-06, + "loss": 0.4484, + "step": 9360 + }, + { + "epoch": 1.5373308972964095, + "grad_norm": 0.2699345270357702, + "learning_rate": 7.138389626692504e-06, + "loss": 0.4663, + "step": 9361 + }, + { + "epoch": 1.5374951245047523, + "grad_norm": 0.37423556894532223, + "learning_rate": 7.138009703783577e-06, + "loss": 0.4629, + "step": 9362 + }, + { + "epoch": 1.5376593517130952, + "grad_norm": 0.28643072132489733, + "learning_rate": 7.137629752223268e-06, + "loss": 0.4626, + "step": 9363 + }, + { + "epoch": 1.5378235789214378, + "grad_norm": 0.4011805145508984, + "learning_rate": 7.137249772015707e-06, + "loss": 0.4598, + "step": 9364 + }, + { + "epoch": 1.5379878061297805, + "grad_norm": 0.28589490123057115, + "learning_rate": 7.136869763165017e-06, + "loss": 0.4401, + "step": 9365 + }, + { + "epoch": 1.5381520333381233, + "grad_norm": 0.33402197609435524, + "learning_rate": 7.136489725675328e-06, + "loss": 0.4718, + "step": 9366 + }, + { + "epoch": 1.5383162605464662, + "grad_norm": 0.3515709813075105, + "learning_rate": 7.136109659550765e-06, + "loss": 0.4692, + "step": 9367 + }, + { + "epoch": 1.5384804877548088, + "grad_norm": 0.727042993615268, + "learning_rate": 7.13572956479546e-06, + "loss": 0.4434, + "step": 9368 + }, + { + "epoch": 1.5386447149631515, + "grad_norm": 0.2803236782587894, + "learning_rate": 7.135349441413538e-06, + "loss": 0.4727, + "step": 9369 + }, + { + "epoch": 1.538808942171494, + "grad_norm": 0.3226815325060115, + "learning_rate": 7.134969289409126e-06, + "loss": 0.4747, + "step": 9370 + }, + { + "epoch": 1.538973169379837, + "grad_norm": 0.36235437219598443, + "learning_rate": 7.134589108786357e-06, + "loss": 0.4468, + "step": 9371 + }, + { + "epoch": 1.5391373965881798, + "grad_norm": 0.35625948487457887, + "learning_rate": 7.134208899549359e-06, + "loss": 0.4486, + "step": 9372 + }, + { + "epoch": 1.5393016237965225, + "grad_norm": 0.34735670003053626, + "learning_rate": 7.133828661702259e-06, + "loss": 0.4335, + "step": 9373 + }, + { + "epoch": 1.539465851004865, + "grad_norm": 0.34199356720688007, + "learning_rate": 7.133448395249189e-06, + "loss": 0.4774, + "step": 9374 + }, + { + "epoch": 1.539630078213208, + "grad_norm": 0.3075669052642566, + "learning_rate": 7.133068100194278e-06, + "loss": 0.466, + "step": 9375 + }, + { + "epoch": 1.5397943054215508, + "grad_norm": 0.30933420404936507, + "learning_rate": 7.132687776541658e-06, + "loss": 0.4629, + "step": 9376 + }, + { + "epoch": 1.5399585326298935, + "grad_norm": 0.3167400643057456, + "learning_rate": 7.132307424295457e-06, + "loss": 0.4694, + "step": 9377 + }, + { + "epoch": 1.540122759838236, + "grad_norm": 0.3330410083602585, + "learning_rate": 7.1319270434598095e-06, + "loss": 0.4541, + "step": 9378 + }, + { + "epoch": 1.540286987046579, + "grad_norm": 0.2608004017915481, + "learning_rate": 7.131546634038843e-06, + "loss": 0.4609, + "step": 9379 + }, + { + "epoch": 1.5404512142549218, + "grad_norm": 0.32756145155673744, + "learning_rate": 7.131166196036692e-06, + "loss": 0.4679, + "step": 9380 + }, + { + "epoch": 1.5406154414632645, + "grad_norm": 0.3215737601475467, + "learning_rate": 7.130785729457487e-06, + "loss": 0.4712, + "step": 9381 + }, + { + "epoch": 1.540779668671607, + "grad_norm": 0.34381508752868223, + "learning_rate": 7.13040523430536e-06, + "loss": 0.4399, + "step": 9382 + }, + { + "epoch": 1.54094389587995, + "grad_norm": 0.3004485729292794, + "learning_rate": 7.1300247105844455e-06, + "loss": 0.4567, + "step": 9383 + }, + { + "epoch": 1.5411081230882928, + "grad_norm": 0.37942435111803424, + "learning_rate": 7.1296441582988745e-06, + "loss": 0.4677, + "step": 9384 + }, + { + "epoch": 1.5412723502966355, + "grad_norm": 0.4357473073673098, + "learning_rate": 7.129263577452781e-06, + "loss": 0.4518, + "step": 9385 + }, + { + "epoch": 1.541436577504978, + "grad_norm": 0.3087949525062071, + "learning_rate": 7.128882968050298e-06, + "loss": 0.4724, + "step": 9386 + }, + { + "epoch": 1.5416008047133207, + "grad_norm": 0.35014647578381863, + "learning_rate": 7.128502330095558e-06, + "loss": 0.46, + "step": 9387 + }, + { + "epoch": 1.5417650319216636, + "grad_norm": 0.3146885120825772, + "learning_rate": 7.1281216635926985e-06, + "loss": 0.4583, + "step": 9388 + }, + { + "epoch": 1.5419292591300064, + "grad_norm": 0.3136789279242204, + "learning_rate": 7.127740968545852e-06, + "loss": 0.4843, + "step": 9389 + }, + { + "epoch": 1.542093486338349, + "grad_norm": 0.5298025260222496, + "learning_rate": 7.127360244959151e-06, + "loss": 0.4812, + "step": 9390 + }, + { + "epoch": 1.5422577135466917, + "grad_norm": 0.3844808012012636, + "learning_rate": 7.126979492836736e-06, + "loss": 0.4544, + "step": 9391 + }, + { + "epoch": 1.5424219407550346, + "grad_norm": 0.39924791256605835, + "learning_rate": 7.126598712182736e-06, + "loss": 0.479, + "step": 9392 + }, + { + "epoch": 1.5425861679633774, + "grad_norm": 0.3602246778936246, + "learning_rate": 7.1262179030012925e-06, + "loss": 0.4783, + "step": 9393 + }, + { + "epoch": 1.54275039517172, + "grad_norm": 0.2732202036898396, + "learning_rate": 7.1258370652965375e-06, + "loss": 0.4785, + "step": 9394 + }, + { + "epoch": 1.5429146223800627, + "grad_norm": 0.3263140799564156, + "learning_rate": 7.125456199072609e-06, + "loss": 0.4563, + "step": 9395 + }, + { + "epoch": 1.5430788495884056, + "grad_norm": 0.3548333323535569, + "learning_rate": 7.125075304333642e-06, + "loss": 0.4517, + "step": 9396 + }, + { + "epoch": 1.5432430767967484, + "grad_norm": 0.8067372128015823, + "learning_rate": 7.1246943810837745e-06, + "loss": 0.453, + "step": 9397 + }, + { + "epoch": 1.543407304005091, + "grad_norm": 0.26504405806007353, + "learning_rate": 7.1243134293271445e-06, + "loss": 0.4349, + "step": 9398 + }, + { + "epoch": 1.5435715312134337, + "grad_norm": 0.45675103704189807, + "learning_rate": 7.123932449067888e-06, + "loss": 0.4476, + "step": 9399 + }, + { + "epoch": 1.5437357584217766, + "grad_norm": 0.32637847065326686, + "learning_rate": 7.123551440310144e-06, + "loss": 0.4862, + "step": 9400 + }, + { + "epoch": 1.5438999856301194, + "grad_norm": 0.33195374509870645, + "learning_rate": 7.1231704030580516e-06, + "loss": 0.4596, + "step": 9401 + }, + { + "epoch": 1.544064212838462, + "grad_norm": 0.255934132190599, + "learning_rate": 7.122789337315745e-06, + "loss": 0.4678, + "step": 9402 + }, + { + "epoch": 1.5442284400468047, + "grad_norm": 0.33693985909373964, + "learning_rate": 7.122408243087367e-06, + "loss": 0.4686, + "step": 9403 + }, + { + "epoch": 1.5443926672551473, + "grad_norm": 0.2996963732838922, + "learning_rate": 7.122027120377055e-06, + "loss": 0.4839, + "step": 9404 + }, + { + "epoch": 1.5445568944634902, + "grad_norm": 0.2773729203223255, + "learning_rate": 7.12164596918895e-06, + "loss": 0.4862, + "step": 9405 + }, + { + "epoch": 1.544721121671833, + "grad_norm": 0.25544315255098893, + "learning_rate": 7.121264789527189e-06, + "loss": 0.4691, + "step": 9406 + }, + { + "epoch": 1.5448853488801757, + "grad_norm": 0.3592750822464238, + "learning_rate": 7.120883581395914e-06, + "loss": 0.4467, + "step": 9407 + }, + { + "epoch": 1.5450495760885183, + "grad_norm": 0.28607035669715075, + "learning_rate": 7.120502344799264e-06, + "loss": 0.4544, + "step": 9408 + }, + { + "epoch": 1.5452138032968612, + "grad_norm": 0.25531993325233804, + "learning_rate": 7.120121079741381e-06, + "loss": 0.4449, + "step": 9409 + }, + { + "epoch": 1.545378030505204, + "grad_norm": 0.5343782428139733, + "learning_rate": 7.119739786226406e-06, + "loss": 0.473, + "step": 9410 + }, + { + "epoch": 1.5455422577135467, + "grad_norm": 0.2864075940659775, + "learning_rate": 7.119358464258478e-06, + "loss": 0.4694, + "step": 9411 + }, + { + "epoch": 1.5457064849218893, + "grad_norm": 0.2715392085229134, + "learning_rate": 7.118977113841741e-06, + "loss": 0.4486, + "step": 9412 + }, + { + "epoch": 1.5458707121302322, + "grad_norm": 0.3593589381329291, + "learning_rate": 7.1185957349803355e-06, + "loss": 0.4578, + "step": 9413 + }, + { + "epoch": 1.546034939338575, + "grad_norm": 0.3399272636588334, + "learning_rate": 7.118214327678404e-06, + "loss": 0.4822, + "step": 9414 + }, + { + "epoch": 1.5461991665469177, + "grad_norm": 0.3378526292093369, + "learning_rate": 7.1178328919400895e-06, + "loss": 0.459, + "step": 9415 + }, + { + "epoch": 1.5463633937552603, + "grad_norm": 0.29955207988397575, + "learning_rate": 7.117451427769532e-06, + "loss": 0.4605, + "step": 9416 + }, + { + "epoch": 1.5465276209636032, + "grad_norm": 0.31826406899690846, + "learning_rate": 7.117069935170879e-06, + "loss": 0.4502, + "step": 9417 + }, + { + "epoch": 1.546691848171946, + "grad_norm": 0.3419910247342894, + "learning_rate": 7.11668841414827e-06, + "loss": 0.481, + "step": 9418 + }, + { + "epoch": 1.5468560753802887, + "grad_norm": 0.3465516444944875, + "learning_rate": 7.1163068647058515e-06, + "loss": 0.4589, + "step": 9419 + }, + { + "epoch": 1.5470203025886313, + "grad_norm": 0.3299902597621926, + "learning_rate": 7.115925286847767e-06, + "loss": 0.4424, + "step": 9420 + }, + { + "epoch": 1.547184529796974, + "grad_norm": 0.30635057158537654, + "learning_rate": 7.115543680578159e-06, + "loss": 0.4842, + "step": 9421 + }, + { + "epoch": 1.5473487570053168, + "grad_norm": 0.34957654661682797, + "learning_rate": 7.115162045901173e-06, + "loss": 0.4825, + "step": 9422 + }, + { + "epoch": 1.5475129842136597, + "grad_norm": 0.2953423418282763, + "learning_rate": 7.114780382820955e-06, + "loss": 0.4719, + "step": 9423 + }, + { + "epoch": 1.5476772114220023, + "grad_norm": 0.3780817633359112, + "learning_rate": 7.1143986913416495e-06, + "loss": 0.4778, + "step": 9424 + }, + { + "epoch": 1.547841438630345, + "grad_norm": 0.34843838102503016, + "learning_rate": 7.114016971467401e-06, + "loss": 0.4526, + "step": 9425 + }, + { + "epoch": 1.5480056658386878, + "grad_norm": 0.355449539427888, + "learning_rate": 7.113635223202358e-06, + "loss": 0.4825, + "step": 9426 + }, + { + "epoch": 1.5481698930470307, + "grad_norm": 0.3234611083942094, + "learning_rate": 7.113253446550665e-06, + "loss": 0.4869, + "step": 9427 + }, + { + "epoch": 1.5483341202553733, + "grad_norm": 0.307306483199121, + "learning_rate": 7.112871641516466e-06, + "loss": 0.4694, + "step": 9428 + }, + { + "epoch": 1.548498347463716, + "grad_norm": 0.30794715743718676, + "learning_rate": 7.112489808103912e-06, + "loss": 0.4873, + "step": 9429 + }, + { + "epoch": 1.5486625746720588, + "grad_norm": 0.47467914042047893, + "learning_rate": 7.1121079463171485e-06, + "loss": 0.475, + "step": 9430 + }, + { + "epoch": 1.5488268018804017, + "grad_norm": 0.29763731599021, + "learning_rate": 7.111726056160322e-06, + "loss": 0.4722, + "step": 9431 + }, + { + "epoch": 1.5489910290887443, + "grad_norm": 0.5091962443448468, + "learning_rate": 7.1113441376375815e-06, + "loss": 0.4707, + "step": 9432 + }, + { + "epoch": 1.549155256297087, + "grad_norm": 0.37582684669527816, + "learning_rate": 7.110962190753074e-06, + "loss": 0.4578, + "step": 9433 + }, + { + "epoch": 1.5493194835054298, + "grad_norm": 0.28575068074639237, + "learning_rate": 7.110580215510948e-06, + "loss": 0.4823, + "step": 9434 + }, + { + "epoch": 1.5494837107137727, + "grad_norm": 0.2629592017210659, + "learning_rate": 7.1101982119153535e-06, + "loss": 0.4744, + "step": 9435 + }, + { + "epoch": 1.5496479379221153, + "grad_norm": 0.30478574127256014, + "learning_rate": 7.109816179970438e-06, + "loss": 0.4651, + "step": 9436 + }, + { + "epoch": 1.549812165130458, + "grad_norm": 0.30396636516802095, + "learning_rate": 7.10943411968035e-06, + "loss": 0.4578, + "step": 9437 + }, + { + "epoch": 1.5499763923388006, + "grad_norm": 0.34806655997770647, + "learning_rate": 7.109052031049241e-06, + "loss": 0.451, + "step": 9438 + }, + { + "epoch": 1.5501406195471434, + "grad_norm": 0.3268077745593211, + "learning_rate": 7.108669914081259e-06, + "loss": 0.4668, + "step": 9439 + }, + { + "epoch": 1.5503048467554863, + "grad_norm": 0.34825895403338536, + "learning_rate": 7.108287768780558e-06, + "loss": 0.4656, + "step": 9440 + }, + { + "epoch": 1.550469073963829, + "grad_norm": 0.34054761958330737, + "learning_rate": 7.107905595151283e-06, + "loss": 0.4579, + "step": 9441 + }, + { + "epoch": 1.5506333011721716, + "grad_norm": 0.32724562169367966, + "learning_rate": 7.107523393197588e-06, + "loss": 0.451, + "step": 9442 + }, + { + "epoch": 1.5507975283805144, + "grad_norm": 0.30315162837296666, + "learning_rate": 7.107141162923624e-06, + "loss": 0.4703, + "step": 9443 + }, + { + "epoch": 1.5509617555888573, + "grad_norm": 0.3294099327301469, + "learning_rate": 7.1067589043335415e-06, + "loss": 0.4831, + "step": 9444 + }, + { + "epoch": 1.5511259827972, + "grad_norm": 0.3131121838015957, + "learning_rate": 7.106376617431493e-06, + "loss": 0.4784, + "step": 9445 + }, + { + "epoch": 1.5512902100055426, + "grad_norm": 0.4850003427166011, + "learning_rate": 7.105994302221629e-06, + "loss": 0.4584, + "step": 9446 + }, + { + "epoch": 1.5514544372138854, + "grad_norm": 0.2824536374442186, + "learning_rate": 7.105611958708103e-06, + "loss": 0.4684, + "step": 9447 + }, + { + "epoch": 1.5516186644222283, + "grad_norm": 0.26487668424818395, + "learning_rate": 7.105229586895069e-06, + "loss": 0.4706, + "step": 9448 + }, + { + "epoch": 1.551782891630571, + "grad_norm": 0.30186016930036735, + "learning_rate": 7.104847186786678e-06, + "loss": 0.4763, + "step": 9449 + }, + { + "epoch": 1.5519471188389136, + "grad_norm": 0.2944963864288857, + "learning_rate": 7.104464758387083e-06, + "loss": 0.4645, + "step": 9450 + }, + { + "epoch": 1.5521113460472564, + "grad_norm": 0.34167036610371465, + "learning_rate": 7.104082301700439e-06, + "loss": 0.4659, + "step": 9451 + }, + { + "epoch": 1.5522755732555993, + "grad_norm": 0.29168994835833856, + "learning_rate": 7.103699816730898e-06, + "loss": 0.4683, + "step": 9452 + }, + { + "epoch": 1.552439800463942, + "grad_norm": 0.31947429820977025, + "learning_rate": 7.1033173034826165e-06, + "loss": 0.4653, + "step": 9453 + }, + { + "epoch": 1.5526040276722846, + "grad_norm": 0.3599259614381269, + "learning_rate": 7.102934761959746e-06, + "loss": 0.4491, + "step": 9454 + }, + { + "epoch": 1.5527682548806272, + "grad_norm": 0.3837674128150548, + "learning_rate": 7.102552192166445e-06, + "loss": 0.4579, + "step": 9455 + }, + { + "epoch": 1.55293248208897, + "grad_norm": 0.5835016569566674, + "learning_rate": 7.102169594106867e-06, + "loss": 0.4753, + "step": 9456 + }, + { + "epoch": 1.553096709297313, + "grad_norm": 0.35557963697118555, + "learning_rate": 7.101786967785166e-06, + "loss": 0.4752, + "step": 9457 + }, + { + "epoch": 1.5532609365056556, + "grad_norm": 0.36659569428207733, + "learning_rate": 7.101404313205499e-06, + "loss": 0.4628, + "step": 9458 + }, + { + "epoch": 1.5534251637139982, + "grad_norm": 0.3163237320621313, + "learning_rate": 7.101021630372021e-06, + "loss": 0.4731, + "step": 9459 + }, + { + "epoch": 1.553589390922341, + "grad_norm": 0.2771310527959539, + "learning_rate": 7.1006389192888896e-06, + "loss": 0.4713, + "step": 9460 + }, + { + "epoch": 1.553753618130684, + "grad_norm": 0.305105596101068, + "learning_rate": 7.100256179960261e-06, + "loss": 0.4741, + "step": 9461 + }, + { + "epoch": 1.5539178453390265, + "grad_norm": 0.31147161003315604, + "learning_rate": 7.099873412390292e-06, + "loss": 0.4736, + "step": 9462 + }, + { + "epoch": 1.5540820725473692, + "grad_norm": 0.2704854712401249, + "learning_rate": 7.099490616583138e-06, + "loss": 0.4605, + "step": 9463 + }, + { + "epoch": 1.554246299755712, + "grad_norm": 0.468442175046904, + "learning_rate": 7.099107792542962e-06, + "loss": 0.4638, + "step": 9464 + }, + { + "epoch": 1.554410526964055, + "grad_norm": 0.2761597581553079, + "learning_rate": 7.098724940273914e-06, + "loss": 0.4456, + "step": 9465 + }, + { + "epoch": 1.5545747541723975, + "grad_norm": 0.3244344994528494, + "learning_rate": 7.098342059780159e-06, + "loss": 0.4405, + "step": 9466 + }, + { + "epoch": 1.5547389813807402, + "grad_norm": 0.4858686845931506, + "learning_rate": 7.097959151065853e-06, + "loss": 0.4572, + "step": 9467 + }, + { + "epoch": 1.554903208589083, + "grad_norm": 0.24869488822776611, + "learning_rate": 7.097576214135153e-06, + "loss": 0.457, + "step": 9468 + }, + { + "epoch": 1.555067435797426, + "grad_norm": 0.3540991970589224, + "learning_rate": 7.097193248992222e-06, + "loss": 0.4747, + "step": 9469 + }, + { + "epoch": 1.5552316630057685, + "grad_norm": 0.3304816779100368, + "learning_rate": 7.096810255641214e-06, + "loss": 0.449, + "step": 9470 + }, + { + "epoch": 1.5553958902141112, + "grad_norm": 0.38004786342989794, + "learning_rate": 7.096427234086294e-06, + "loss": 0.4592, + "step": 9471 + }, + { + "epoch": 1.5555601174224538, + "grad_norm": 0.27383675654332823, + "learning_rate": 7.096044184331619e-06, + "loss": 0.4678, + "step": 9472 + }, + { + "epoch": 1.5557243446307967, + "grad_norm": 0.3790453414043705, + "learning_rate": 7.09566110638135e-06, + "loss": 0.4699, + "step": 9473 + }, + { + "epoch": 1.5558885718391395, + "grad_norm": 0.38324765092279667, + "learning_rate": 7.095278000239646e-06, + "loss": 0.4563, + "step": 9474 + }, + { + "epoch": 1.5560527990474822, + "grad_norm": 0.33095846652110594, + "learning_rate": 7.094894865910672e-06, + "loss": 0.4408, + "step": 9475 + }, + { + "epoch": 1.5562170262558248, + "grad_norm": 0.3003482283168898, + "learning_rate": 7.094511703398586e-06, + "loss": 0.448, + "step": 9476 + }, + { + "epoch": 1.5563812534641677, + "grad_norm": 0.2829919076745514, + "learning_rate": 7.094128512707549e-06, + "loss": 0.4416, + "step": 9477 + }, + { + "epoch": 1.5565454806725105, + "grad_norm": 0.4384164662797856, + "learning_rate": 7.093745293841726e-06, + "loss": 0.4616, + "step": 9478 + }, + { + "epoch": 1.5567097078808532, + "grad_norm": 0.31266180818887007, + "learning_rate": 7.093362046805277e-06, + "loss": 0.46, + "step": 9479 + }, + { + "epoch": 1.5568739350891958, + "grad_norm": 0.27640585641465, + "learning_rate": 7.092978771602363e-06, + "loss": 0.4869, + "step": 9480 + }, + { + "epoch": 1.5570381622975387, + "grad_norm": 0.3111860240242668, + "learning_rate": 7.0925954682371495e-06, + "loss": 0.4859, + "step": 9481 + }, + { + "epoch": 1.5572023895058815, + "grad_norm": 0.26418192548126807, + "learning_rate": 7.092212136713798e-06, + "loss": 0.4423, + "step": 9482 + }, + { + "epoch": 1.5573666167142242, + "grad_norm": 0.32839296991893036, + "learning_rate": 7.0918287770364725e-06, + "loss": 0.4608, + "step": 9483 + }, + { + "epoch": 1.5575308439225668, + "grad_norm": 0.29188899721436623, + "learning_rate": 7.091445389209336e-06, + "loss": 0.4668, + "step": 9484 + }, + { + "epoch": 1.5576950711309097, + "grad_norm": 0.7384461938939119, + "learning_rate": 7.091061973236555e-06, + "loss": 0.4934, + "step": 9485 + }, + { + "epoch": 1.5578592983392525, + "grad_norm": 0.325973263250149, + "learning_rate": 7.0906785291222905e-06, + "loss": 0.4679, + "step": 9486 + }, + { + "epoch": 1.5580235255475952, + "grad_norm": 0.4237636097354834, + "learning_rate": 7.090295056870706e-06, + "loss": 0.447, + "step": 9487 + }, + { + "epoch": 1.5581877527559378, + "grad_norm": 0.27640337167974977, + "learning_rate": 7.089911556485971e-06, + "loss": 0.4606, + "step": 9488 + }, + { + "epoch": 1.5583519799642804, + "grad_norm": 0.2826604533459565, + "learning_rate": 7.08952802797225e-06, + "loss": 0.4604, + "step": 9489 + }, + { + "epoch": 1.5585162071726233, + "grad_norm": 0.33442396415131775, + "learning_rate": 7.089144471333703e-06, + "loss": 0.4526, + "step": 9490 + }, + { + "epoch": 1.5586804343809662, + "grad_norm": 0.3254632769404679, + "learning_rate": 7.088760886574502e-06, + "loss": 0.4542, + "step": 9491 + }, + { + "epoch": 1.5588446615893088, + "grad_norm": 0.3473371450325595, + "learning_rate": 7.08837727369881e-06, + "loss": 0.4719, + "step": 9492 + }, + { + "epoch": 1.5590088887976514, + "grad_norm": 0.3354920157988492, + "learning_rate": 7.087993632710796e-06, + "loss": 0.4702, + "step": 9493 + }, + { + "epoch": 1.5591731160059943, + "grad_norm": 0.30499776435563025, + "learning_rate": 7.087609963614622e-06, + "loss": 0.4514, + "step": 9494 + }, + { + "epoch": 1.5593373432143371, + "grad_norm": 0.3328412721849873, + "learning_rate": 7.08722626641446e-06, + "loss": 0.4576, + "step": 9495 + }, + { + "epoch": 1.5595015704226798, + "grad_norm": 0.31693393803170694, + "learning_rate": 7.086842541114474e-06, + "loss": 0.4571, + "step": 9496 + }, + { + "epoch": 1.5596657976310224, + "grad_norm": 0.35129703393382067, + "learning_rate": 7.086458787718834e-06, + "loss": 0.4543, + "step": 9497 + }, + { + "epoch": 1.5598300248393653, + "grad_norm": 0.294075165507447, + "learning_rate": 7.086075006231707e-06, + "loss": 0.4705, + "step": 9498 + }, + { + "epoch": 1.5599942520477081, + "grad_norm": 0.3011262860718532, + "learning_rate": 7.085691196657259e-06, + "loss": 0.464, + "step": 9499 + }, + { + "epoch": 1.5601584792560508, + "grad_norm": 0.2934159176282551, + "learning_rate": 7.085307358999662e-06, + "loss": 0.4623, + "step": 9500 + }, + { + "epoch": 1.5603227064643934, + "grad_norm": 0.31840048492371464, + "learning_rate": 7.084923493263085e-06, + "loss": 0.435, + "step": 9501 + }, + { + "epoch": 1.5604869336727363, + "grad_norm": 0.38778892493126904, + "learning_rate": 7.084539599451693e-06, + "loss": 0.4533, + "step": 9502 + }, + { + "epoch": 1.5606511608810791, + "grad_norm": 0.3121985866882979, + "learning_rate": 7.084155677569659e-06, + "loss": 0.476, + "step": 9503 + }, + { + "epoch": 1.5608153880894218, + "grad_norm": 0.4007484244591844, + "learning_rate": 7.0837717276211524e-06, + "loss": 0.4822, + "step": 9504 + }, + { + "epoch": 1.5609796152977644, + "grad_norm": 0.3096911825413162, + "learning_rate": 7.0833877496103425e-06, + "loss": 0.4582, + "step": 9505 + }, + { + "epoch": 1.561143842506107, + "grad_norm": 0.3039014707731322, + "learning_rate": 7.0830037435414e-06, + "loss": 0.4361, + "step": 9506 + }, + { + "epoch": 1.56130806971445, + "grad_norm": 0.3185728949963143, + "learning_rate": 7.082619709418496e-06, + "loss": 0.4464, + "step": 9507 + }, + { + "epoch": 1.5614722969227928, + "grad_norm": 0.29381394262617466, + "learning_rate": 7.0822356472458e-06, + "loss": 0.4765, + "step": 9508 + }, + { + "epoch": 1.5616365241311354, + "grad_norm": 0.30664391728474316, + "learning_rate": 7.081851557027485e-06, + "loss": 0.4621, + "step": 9509 + }, + { + "epoch": 1.561800751339478, + "grad_norm": 0.5913865040452077, + "learning_rate": 7.0814674387677216e-06, + "loss": 0.4729, + "step": 9510 + }, + { + "epoch": 1.561964978547821, + "grad_norm": 0.31667754417883187, + "learning_rate": 7.081083292470681e-06, + "loss": 0.4783, + "step": 9511 + }, + { + "epoch": 1.5621292057561638, + "grad_norm": 0.3476930914201371, + "learning_rate": 7.080699118140538e-06, + "loss": 0.4527, + "step": 9512 + }, + { + "epoch": 1.5622934329645064, + "grad_norm": 0.31491112655895354, + "learning_rate": 7.080314915781463e-06, + "loss": 0.462, + "step": 9513 + }, + { + "epoch": 1.562457660172849, + "grad_norm": 0.3392259237310825, + "learning_rate": 7.07993068539763e-06, + "loss": 0.477, + "step": 9514 + }, + { + "epoch": 1.562621887381192, + "grad_norm": 0.34770909353808194, + "learning_rate": 7.079546426993211e-06, + "loss": 0.4507, + "step": 9515 + }, + { + "epoch": 1.5627861145895348, + "grad_norm": 0.31253636621680053, + "learning_rate": 7.0791621405723785e-06, + "loss": 0.4686, + "step": 9516 + }, + { + "epoch": 1.5629503417978774, + "grad_norm": 0.319327369627498, + "learning_rate": 7.078777826139309e-06, + "loss": 0.4569, + "step": 9517 + }, + { + "epoch": 1.56311456900622, + "grad_norm": 0.30529715870836194, + "learning_rate": 7.078393483698175e-06, + "loss": 0.453, + "step": 9518 + }, + { + "epoch": 1.563278796214563, + "grad_norm": 0.265733050448355, + "learning_rate": 7.078009113253151e-06, + "loss": 0.4722, + "step": 9519 + }, + { + "epoch": 1.5634430234229058, + "grad_norm": 0.31408912922552107, + "learning_rate": 7.077624714808411e-06, + "loss": 0.4641, + "step": 9520 + }, + { + "epoch": 1.5636072506312484, + "grad_norm": 0.3132960285785013, + "learning_rate": 7.077240288368131e-06, + "loss": 0.454, + "step": 9521 + }, + { + "epoch": 1.563771477839591, + "grad_norm": 0.3063188137151771, + "learning_rate": 7.076855833936486e-06, + "loss": 0.4662, + "step": 9522 + }, + { + "epoch": 1.5639357050479337, + "grad_norm": 0.2720545580356203, + "learning_rate": 7.0764713515176505e-06, + "loss": 0.4629, + "step": 9523 + }, + { + "epoch": 1.5640999322562765, + "grad_norm": 0.2956064257938657, + "learning_rate": 7.076086841115802e-06, + "loss": 0.4465, + "step": 9524 + }, + { + "epoch": 1.5642641594646194, + "grad_norm": 0.2800826541100162, + "learning_rate": 7.075702302735116e-06, + "loss": 0.4575, + "step": 9525 + }, + { + "epoch": 1.564428386672962, + "grad_norm": 0.41341271140566976, + "learning_rate": 7.075317736379768e-06, + "loss": 0.4594, + "step": 9526 + }, + { + "epoch": 1.5645926138813047, + "grad_norm": 0.34150965202806, + "learning_rate": 7.0749331420539365e-06, + "loss": 0.4561, + "step": 9527 + }, + { + "epoch": 1.5647568410896475, + "grad_norm": 0.24492469743953443, + "learning_rate": 7.074548519761797e-06, + "loss": 0.4475, + "step": 9528 + }, + { + "epoch": 1.5649210682979904, + "grad_norm": 0.34562330583944856, + "learning_rate": 7.074163869507528e-06, + "loss": 0.472, + "step": 9529 + }, + { + "epoch": 1.565085295506333, + "grad_norm": 0.292325354044597, + "learning_rate": 7.073779191295306e-06, + "loss": 0.4573, + "step": 9530 + }, + { + "epoch": 1.5652495227146757, + "grad_norm": 0.4229715578432917, + "learning_rate": 7.0733944851293116e-06, + "loss": 0.4577, + "step": 9531 + }, + { + "epoch": 1.5654137499230185, + "grad_norm": 0.32096121679499595, + "learning_rate": 7.0730097510137204e-06, + "loss": 0.4503, + "step": 9532 + }, + { + "epoch": 1.5655779771313614, + "grad_norm": 0.27339951728229317, + "learning_rate": 7.072624988952711e-06, + "loss": 0.4557, + "step": 9533 + }, + { + "epoch": 1.565742204339704, + "grad_norm": 0.30943985914503824, + "learning_rate": 7.0722401989504634e-06, + "loss": 0.4557, + "step": 9534 + }, + { + "epoch": 1.5659064315480467, + "grad_norm": 0.33863811933806, + "learning_rate": 7.071855381011157e-06, + "loss": 0.4532, + "step": 9535 + }, + { + "epoch": 1.5660706587563895, + "grad_norm": 0.32722389646224836, + "learning_rate": 7.071470535138969e-06, + "loss": 0.4534, + "step": 9536 + }, + { + "epoch": 1.5662348859647324, + "grad_norm": 0.3616589600678771, + "learning_rate": 7.071085661338083e-06, + "loss": 0.4596, + "step": 9537 + }, + { + "epoch": 1.566399113173075, + "grad_norm": 0.290065563681788, + "learning_rate": 7.070700759612676e-06, + "loss": 0.4419, + "step": 9538 + }, + { + "epoch": 1.5665633403814176, + "grad_norm": 0.3681681484784924, + "learning_rate": 7.070315829966932e-06, + "loss": 0.4674, + "step": 9539 + }, + { + "epoch": 1.5667275675897603, + "grad_norm": 0.29433431026892365, + "learning_rate": 7.069930872405026e-06, + "loss": 0.4681, + "step": 9540 + }, + { + "epoch": 1.5668917947981031, + "grad_norm": 0.28165471409438464, + "learning_rate": 7.069545886931144e-06, + "loss": 0.4381, + "step": 9541 + }, + { + "epoch": 1.567056022006446, + "grad_norm": 0.2646752230437082, + "learning_rate": 7.069160873549464e-06, + "loss": 0.4711, + "step": 9542 + }, + { + "epoch": 1.5672202492147886, + "grad_norm": 0.29628517431852486, + "learning_rate": 7.068775832264172e-06, + "loss": 0.4607, + "step": 9543 + }, + { + "epoch": 1.5673844764231313, + "grad_norm": 0.44146222206987495, + "learning_rate": 7.068390763079445e-06, + "loss": 0.4721, + "step": 9544 + }, + { + "epoch": 1.5675487036314741, + "grad_norm": 0.3452702849381547, + "learning_rate": 7.068005665999467e-06, + "loss": 0.4667, + "step": 9545 + }, + { + "epoch": 1.567712930839817, + "grad_norm": 0.32130074805891345, + "learning_rate": 7.067620541028422e-06, + "loss": 0.4643, + "step": 9546 + }, + { + "epoch": 1.5678771580481596, + "grad_norm": 0.2856839609238298, + "learning_rate": 7.0672353881704915e-06, + "loss": 0.45, + "step": 9547 + }, + { + "epoch": 1.5680413852565023, + "grad_norm": 0.41361729342399817, + "learning_rate": 7.066850207429859e-06, + "loss": 0.457, + "step": 9548 + }, + { + "epoch": 1.5682056124648451, + "grad_norm": 0.4783024490657136, + "learning_rate": 7.066464998810707e-06, + "loss": 0.4673, + "step": 9549 + }, + { + "epoch": 1.568369839673188, + "grad_norm": 0.26854722532828895, + "learning_rate": 7.066079762317221e-06, + "loss": 0.4533, + "step": 9550 + }, + { + "epoch": 1.5685340668815306, + "grad_norm": 0.3243123197398592, + "learning_rate": 7.0656944979535836e-06, + "loss": 0.4703, + "step": 9551 + }, + { + "epoch": 1.5686982940898733, + "grad_norm": 0.31882320116029533, + "learning_rate": 7.06530920572398e-06, + "loss": 0.4334, + "step": 9552 + }, + { + "epoch": 1.5688625212982161, + "grad_norm": 0.3153813981363007, + "learning_rate": 7.064923885632595e-06, + "loss": 0.4629, + "step": 9553 + }, + { + "epoch": 1.569026748506559, + "grad_norm": 0.32022797273174436, + "learning_rate": 7.064538537683612e-06, + "loss": 0.4655, + "step": 9554 + }, + { + "epoch": 1.5691909757149016, + "grad_norm": 0.3000384583865678, + "learning_rate": 7.0641531618812165e-06, + "loss": 0.4616, + "step": 9555 + }, + { + "epoch": 1.5693552029232443, + "grad_norm": 0.3007245952874813, + "learning_rate": 7.063767758229597e-06, + "loss": 0.458, + "step": 9556 + }, + { + "epoch": 1.569519430131587, + "grad_norm": 0.3370373862995186, + "learning_rate": 7.063382326732936e-06, + "loss": 0.4564, + "step": 9557 + }, + { + "epoch": 1.5696836573399298, + "grad_norm": 0.2668854475743722, + "learning_rate": 7.062996867395421e-06, + "loss": 0.4637, + "step": 9558 + }, + { + "epoch": 1.5698478845482726, + "grad_norm": 0.32082269614047243, + "learning_rate": 7.062611380221239e-06, + "loss": 0.4557, + "step": 9559 + }, + { + "epoch": 1.5700121117566153, + "grad_norm": 0.2642372498332476, + "learning_rate": 7.0622258652145755e-06, + "loss": 0.459, + "step": 9560 + }, + { + "epoch": 1.570176338964958, + "grad_norm": 0.32119739559651794, + "learning_rate": 7.061840322379618e-06, + "loss": 0.4826, + "step": 9561 + }, + { + "epoch": 1.5703405661733008, + "grad_norm": 0.33573520572974747, + "learning_rate": 7.061454751720556e-06, + "loss": 0.4773, + "step": 9562 + }, + { + "epoch": 1.5705047933816436, + "grad_norm": 0.3727906802239041, + "learning_rate": 7.061069153241572e-06, + "loss": 0.467, + "step": 9563 + }, + { + "epoch": 1.5706690205899863, + "grad_norm": 0.25635954624939483, + "learning_rate": 7.06068352694686e-06, + "loss": 0.4725, + "step": 9564 + }, + { + "epoch": 1.570833247798329, + "grad_norm": 0.3569861822095909, + "learning_rate": 7.060297872840604e-06, + "loss": 0.4674, + "step": 9565 + }, + { + "epoch": 1.5709974750066718, + "grad_norm": 0.4167333878551979, + "learning_rate": 7.059912190926995e-06, + "loss": 0.464, + "step": 9566 + }, + { + "epoch": 1.5711617022150146, + "grad_norm": 0.2988223076469179, + "learning_rate": 7.05952648121022e-06, + "loss": 0.4883, + "step": 9567 + }, + { + "epoch": 1.5713259294233572, + "grad_norm": 0.2830256329297102, + "learning_rate": 7.059140743694471e-06, + "loss": 0.4476, + "step": 9568 + }, + { + "epoch": 1.5714901566316999, + "grad_norm": 0.372258375978891, + "learning_rate": 7.058754978383934e-06, + "loss": 0.4893, + "step": 9569 + }, + { + "epoch": 1.5716543838400427, + "grad_norm": 0.2654171969672735, + "learning_rate": 7.0583691852828e-06, + "loss": 0.4525, + "step": 9570 + }, + { + "epoch": 1.5718186110483856, + "grad_norm": 0.280300815255278, + "learning_rate": 7.0579833643952605e-06, + "loss": 0.455, + "step": 9571 + }, + { + "epoch": 1.5719828382567282, + "grad_norm": 0.5008140822399365, + "learning_rate": 7.0575975157255044e-06, + "loss": 0.4672, + "step": 9572 + }, + { + "epoch": 1.5721470654650709, + "grad_norm": 0.31955287284252826, + "learning_rate": 7.057211639277725e-06, + "loss": 0.477, + "step": 9573 + }, + { + "epoch": 1.5723112926734135, + "grad_norm": 0.3489108249379957, + "learning_rate": 7.05682573505611e-06, + "loss": 0.4773, + "step": 9574 + }, + { + "epoch": 1.5724755198817564, + "grad_norm": 0.35593903885934475, + "learning_rate": 7.056439803064851e-06, + "loss": 0.4744, + "step": 9575 + }, + { + "epoch": 1.5726397470900992, + "grad_norm": 0.28457851992077404, + "learning_rate": 7.056053843308141e-06, + "loss": 0.4511, + "step": 9576 + }, + { + "epoch": 1.5728039742984419, + "grad_norm": 0.27240800898902373, + "learning_rate": 7.0556678557901725e-06, + "loss": 0.4603, + "step": 9577 + }, + { + "epoch": 1.5729682015067845, + "grad_norm": 0.2727708545101182, + "learning_rate": 7.055281840515136e-06, + "loss": 0.4814, + "step": 9578 + }, + { + "epoch": 1.5731324287151274, + "grad_norm": 0.31236355231569163, + "learning_rate": 7.054895797487223e-06, + "loss": 0.4415, + "step": 9579 + }, + { + "epoch": 1.5732966559234702, + "grad_norm": 0.27149555085526944, + "learning_rate": 7.054509726710629e-06, + "loss": 0.4602, + "step": 9580 + }, + { + "epoch": 1.5734608831318129, + "grad_norm": 0.26192212609758236, + "learning_rate": 7.054123628189548e-06, + "loss": 0.473, + "step": 9581 + }, + { + "epoch": 1.5736251103401555, + "grad_norm": 0.2604095204435766, + "learning_rate": 7.0537375019281705e-06, + "loss": 0.4786, + "step": 9582 + }, + { + "epoch": 1.5737893375484984, + "grad_norm": 0.3173000671655432, + "learning_rate": 7.05335134793069e-06, + "loss": 0.4431, + "step": 9583 + }, + { + "epoch": 1.5739535647568412, + "grad_norm": 0.4249155458956311, + "learning_rate": 7.052965166201301e-06, + "loss": 0.4422, + "step": 9584 + }, + { + "epoch": 1.5741177919651839, + "grad_norm": 0.326714175509031, + "learning_rate": 7.052578956744201e-06, + "loss": 0.4533, + "step": 9585 + }, + { + "epoch": 1.5742820191735265, + "grad_norm": 0.30076342807026163, + "learning_rate": 7.052192719563582e-06, + "loss": 0.4646, + "step": 9586 + }, + { + "epoch": 1.5744462463818694, + "grad_norm": 0.5574470434140718, + "learning_rate": 7.0518064546636375e-06, + "loss": 0.4785, + "step": 9587 + }, + { + "epoch": 1.5746104735902122, + "grad_norm": 0.3899021496848957, + "learning_rate": 7.051420162048565e-06, + "loss": 0.4645, + "step": 9588 + }, + { + "epoch": 1.5747747007985549, + "grad_norm": 0.5009365869861512, + "learning_rate": 7.051033841722559e-06, + "loss": 0.4616, + "step": 9589 + }, + { + "epoch": 1.5749389280068975, + "grad_norm": 0.3026411124363173, + "learning_rate": 7.050647493689816e-06, + "loss": 0.4617, + "step": 9590 + }, + { + "epoch": 1.5751031552152401, + "grad_norm": 0.29061469419822633, + "learning_rate": 7.050261117954531e-06, + "loss": 0.4668, + "step": 9591 + }, + { + "epoch": 1.575267382423583, + "grad_norm": 0.29963184439358226, + "learning_rate": 7.049874714520902e-06, + "loss": 0.4807, + "step": 9592 + }, + { + "epoch": 1.5754316096319259, + "grad_norm": 0.32685838124755867, + "learning_rate": 7.049488283393124e-06, + "loss": 0.4554, + "step": 9593 + }, + { + "epoch": 1.5755958368402685, + "grad_norm": 0.3098084135045518, + "learning_rate": 7.049101824575395e-06, + "loss": 0.4631, + "step": 9594 + }, + { + "epoch": 1.5757600640486111, + "grad_norm": 0.5182353120987377, + "learning_rate": 7.048715338071913e-06, + "loss": 0.4556, + "step": 9595 + }, + { + "epoch": 1.575924291256954, + "grad_norm": 0.33384098807841417, + "learning_rate": 7.048328823886873e-06, + "loss": 0.4752, + "step": 9596 + }, + { + "epoch": 1.5760885184652969, + "grad_norm": 0.5106129343711809, + "learning_rate": 7.047942282024477e-06, + "loss": 0.4458, + "step": 9597 + }, + { + "epoch": 1.5762527456736395, + "grad_norm": 0.2779558813719126, + "learning_rate": 7.047555712488921e-06, + "loss": 0.4627, + "step": 9598 + }, + { + "epoch": 1.5764169728819821, + "grad_norm": 0.3036165546910705, + "learning_rate": 7.047169115284401e-06, + "loss": 0.4542, + "step": 9599 + }, + { + "epoch": 1.576581200090325, + "grad_norm": 0.2813306295945631, + "learning_rate": 7.0467824904151205e-06, + "loss": 0.4574, + "step": 9600 + }, + { + "epoch": 1.5767454272986678, + "grad_norm": 0.3756631418000488, + "learning_rate": 7.046395837885276e-06, + "loss": 0.4665, + "step": 9601 + }, + { + "epoch": 1.5769096545070105, + "grad_norm": 0.3014834902402672, + "learning_rate": 7.046009157699068e-06, + "loss": 0.4686, + "step": 9602 + }, + { + "epoch": 1.5770738817153531, + "grad_norm": 0.37031689284883956, + "learning_rate": 7.045622449860695e-06, + "loss": 0.438, + "step": 9603 + }, + { + "epoch": 1.577238108923696, + "grad_norm": 0.5091529417201783, + "learning_rate": 7.045235714374358e-06, + "loss": 0.4596, + "step": 9604 + }, + { + "epoch": 1.5774023361320388, + "grad_norm": 0.316980806826266, + "learning_rate": 7.044848951244255e-06, + "loss": 0.4717, + "step": 9605 + }, + { + "epoch": 1.5775665633403815, + "grad_norm": 0.27563114539777, + "learning_rate": 7.04446216047459e-06, + "loss": 0.4652, + "step": 9606 + }, + { + "epoch": 1.5777307905487241, + "grad_norm": 0.3254061465093312, + "learning_rate": 7.044075342069563e-06, + "loss": 0.4721, + "step": 9607 + }, + { + "epoch": 1.5778950177570668, + "grad_norm": 0.3070070243971381, + "learning_rate": 7.0436884960333746e-06, + "loss": 0.4451, + "step": 9608 + }, + { + "epoch": 1.5780592449654096, + "grad_norm": 0.3046083263392374, + "learning_rate": 7.0433016223702255e-06, + "loss": 0.4646, + "step": 9609 + }, + { + "epoch": 1.5782234721737525, + "grad_norm": 0.3356082246504402, + "learning_rate": 7.04291472108432e-06, + "loss": 0.4773, + "step": 9610 + }, + { + "epoch": 1.578387699382095, + "grad_norm": 0.3154556445315861, + "learning_rate": 7.042527792179857e-06, + "loss": 0.4668, + "step": 9611 + }, + { + "epoch": 1.5785519265904377, + "grad_norm": 0.34093873199245545, + "learning_rate": 7.042140835661042e-06, + "loss": 0.4478, + "step": 9612 + }, + { + "epoch": 1.5787161537987806, + "grad_norm": 0.4638435804483628, + "learning_rate": 7.041753851532076e-06, + "loss": 0.4545, + "step": 9613 + }, + { + "epoch": 1.5788803810071235, + "grad_norm": 0.3080869937789505, + "learning_rate": 7.041366839797163e-06, + "loss": 0.4676, + "step": 9614 + }, + { + "epoch": 1.579044608215466, + "grad_norm": 0.3208085604236869, + "learning_rate": 7.040979800460505e-06, + "loss": 0.4618, + "step": 9615 + }, + { + "epoch": 1.5792088354238087, + "grad_norm": 0.37304515332678506, + "learning_rate": 7.040592733526307e-06, + "loss": 0.47, + "step": 9616 + }, + { + "epoch": 1.5793730626321516, + "grad_norm": 0.3489077569804723, + "learning_rate": 7.0402056389987716e-06, + "loss": 0.4719, + "step": 9617 + }, + { + "epoch": 1.5795372898404945, + "grad_norm": 0.3204051769035944, + "learning_rate": 7.039818516882105e-06, + "loss": 0.4643, + "step": 9618 + }, + { + "epoch": 1.579701517048837, + "grad_norm": 0.33360715457799445, + "learning_rate": 7.039431367180509e-06, + "loss": 0.4711, + "step": 9619 + }, + { + "epoch": 1.5798657442571797, + "grad_norm": 0.3188657125688107, + "learning_rate": 7.03904418989819e-06, + "loss": 0.478, + "step": 9620 + }, + { + "epoch": 1.5800299714655226, + "grad_norm": 0.3406891273173139, + "learning_rate": 7.038656985039353e-06, + "loss": 0.464, + "step": 9621 + }, + { + "epoch": 1.5801941986738655, + "grad_norm": 0.3344841323401284, + "learning_rate": 7.038269752608205e-06, + "loss": 0.4628, + "step": 9622 + }, + { + "epoch": 1.580358425882208, + "grad_norm": 0.2962372762879127, + "learning_rate": 7.0378824926089485e-06, + "loss": 0.4557, + "step": 9623 + }, + { + "epoch": 1.5805226530905507, + "grad_norm": 0.30877525105222614, + "learning_rate": 7.0374952050457925e-06, + "loss": 0.4545, + "step": 9624 + }, + { + "epoch": 1.5806868802988934, + "grad_norm": 0.5755131888295433, + "learning_rate": 7.037107889922941e-06, + "loss": 0.4472, + "step": 9625 + }, + { + "epoch": 1.5808511075072362, + "grad_norm": 0.25748170687714833, + "learning_rate": 7.036720547244602e-06, + "loss": 0.4641, + "step": 9626 + }, + { + "epoch": 1.581015334715579, + "grad_norm": 0.36057283649245553, + "learning_rate": 7.0363331770149826e-06, + "loss": 0.468, + "step": 9627 + }, + { + "epoch": 1.5811795619239217, + "grad_norm": 0.327370587050576, + "learning_rate": 7.035945779238288e-06, + "loss": 0.4446, + "step": 9628 + }, + { + "epoch": 1.5813437891322644, + "grad_norm": 0.32141525897584117, + "learning_rate": 7.035558353918728e-06, + "loss": 0.4408, + "step": 9629 + }, + { + "epoch": 1.5815080163406072, + "grad_norm": 0.2675636946309375, + "learning_rate": 7.035170901060509e-06, + "loss": 0.461, + "step": 9630 + }, + { + "epoch": 1.58167224354895, + "grad_norm": 0.3485598277383404, + "learning_rate": 7.034783420667841e-06, + "loss": 0.4599, + "step": 9631 + }, + { + "epoch": 1.5818364707572927, + "grad_norm": 0.40961186549980955, + "learning_rate": 7.03439591274493e-06, + "loss": 0.4485, + "step": 9632 + }, + { + "epoch": 1.5820006979656354, + "grad_norm": 0.3150450672652272, + "learning_rate": 7.034008377295987e-06, + "loss": 0.4458, + "step": 9633 + }, + { + "epoch": 1.5821649251739782, + "grad_norm": 0.2925616187190679, + "learning_rate": 7.033620814325219e-06, + "loss": 0.4864, + "step": 9634 + }, + { + "epoch": 1.582329152382321, + "grad_norm": 0.33606955689744944, + "learning_rate": 7.033233223836837e-06, + "loss": 0.466, + "step": 9635 + }, + { + "epoch": 1.5824933795906637, + "grad_norm": 0.4824169072771863, + "learning_rate": 7.03284560583505e-06, + "loss": 0.4528, + "step": 9636 + }, + { + "epoch": 1.5826576067990064, + "grad_norm": 0.2906977714611012, + "learning_rate": 7.032457960324066e-06, + "loss": 0.4537, + "step": 9637 + }, + { + "epoch": 1.5828218340073492, + "grad_norm": 0.279391411557559, + "learning_rate": 7.032070287308099e-06, + "loss": 0.4664, + "step": 9638 + }, + { + "epoch": 1.582986061215692, + "grad_norm": 0.321052045706845, + "learning_rate": 7.031682586791356e-06, + "loss": 0.4575, + "step": 9639 + }, + { + "epoch": 1.5831502884240347, + "grad_norm": 0.3399100775023264, + "learning_rate": 7.03129485877805e-06, + "loss": 0.4729, + "step": 9640 + }, + { + "epoch": 1.5833145156323774, + "grad_norm": 0.32977031429239967, + "learning_rate": 7.030907103272391e-06, + "loss": 0.4716, + "step": 9641 + }, + { + "epoch": 1.58347874284072, + "grad_norm": 0.31242153963448743, + "learning_rate": 7.0305193202785905e-06, + "loss": 0.473, + "step": 9642 + }, + { + "epoch": 1.5836429700490628, + "grad_norm": 0.3019615032920058, + "learning_rate": 7.030131509800861e-06, + "loss": 0.4414, + "step": 9643 + }, + { + "epoch": 1.5838071972574057, + "grad_norm": 0.3483822784558959, + "learning_rate": 7.029743671843415e-06, + "loss": 0.4646, + "step": 9644 + }, + { + "epoch": 1.5839714244657483, + "grad_norm": 0.2991796623781081, + "learning_rate": 7.029355806410462e-06, + "loss": 0.4656, + "step": 9645 + }, + { + "epoch": 1.584135651674091, + "grad_norm": 0.36838639847658394, + "learning_rate": 7.028967913506217e-06, + "loss": 0.4784, + "step": 9646 + }, + { + "epoch": 1.5842998788824338, + "grad_norm": 0.30208778163975347, + "learning_rate": 7.028579993134892e-06, + "loss": 0.4827, + "step": 9647 + }, + { + "epoch": 1.5844641060907767, + "grad_norm": 0.27407402707941875, + "learning_rate": 7.028192045300701e-06, + "loss": 0.4668, + "step": 9648 + }, + { + "epoch": 1.5846283332991193, + "grad_norm": 0.2618196125378437, + "learning_rate": 7.027804070007858e-06, + "loss": 0.4807, + "step": 9649 + }, + { + "epoch": 1.584792560507462, + "grad_norm": 0.3152951828088361, + "learning_rate": 7.027416067260574e-06, + "loss": 0.4632, + "step": 9650 + }, + { + "epoch": 1.5849567877158048, + "grad_norm": 0.3463447699192336, + "learning_rate": 7.027028037063066e-06, + "loss": 0.45, + "step": 9651 + }, + { + "epoch": 1.5851210149241477, + "grad_norm": 0.2943307574825071, + "learning_rate": 7.0266399794195465e-06, + "loss": 0.4598, + "step": 9652 + }, + { + "epoch": 1.5852852421324903, + "grad_norm": 0.3006986761693188, + "learning_rate": 7.026251894334231e-06, + "loss": 0.4703, + "step": 9653 + }, + { + "epoch": 1.585449469340833, + "grad_norm": 0.3068826226953967, + "learning_rate": 7.025863781811335e-06, + "loss": 0.4569, + "step": 9654 + }, + { + "epoch": 1.5856136965491758, + "grad_norm": 0.2975266953823899, + "learning_rate": 7.025475641855074e-06, + "loss": 0.4541, + "step": 9655 + }, + { + "epoch": 1.5857779237575187, + "grad_norm": 0.3719118805987487, + "learning_rate": 7.025087474469661e-06, + "loss": 0.4693, + "step": 9656 + }, + { + "epoch": 1.5859421509658613, + "grad_norm": 0.316004408532983, + "learning_rate": 7.024699279659314e-06, + "loss": 0.4846, + "step": 9657 + }, + { + "epoch": 1.586106378174204, + "grad_norm": 0.2750964430209116, + "learning_rate": 7.024311057428249e-06, + "loss": 0.45, + "step": 9658 + }, + { + "epoch": 1.5862706053825466, + "grad_norm": 0.25877806352237737, + "learning_rate": 7.023922807780682e-06, + "loss": 0.4712, + "step": 9659 + }, + { + "epoch": 1.5864348325908895, + "grad_norm": 0.30726318388169743, + "learning_rate": 7.023534530720832e-06, + "loss": 0.4505, + "step": 9660 + }, + { + "epoch": 1.5865990597992323, + "grad_norm": 0.277175420733913, + "learning_rate": 7.0231462262529125e-06, + "loss": 0.4462, + "step": 9661 + }, + { + "epoch": 1.586763287007575, + "grad_norm": 0.2480687810862842, + "learning_rate": 7.022757894381143e-06, + "loss": 0.4578, + "step": 9662 + }, + { + "epoch": 1.5869275142159176, + "grad_norm": 0.310062585639788, + "learning_rate": 7.02236953510974e-06, + "loss": 0.4529, + "step": 9663 + }, + { + "epoch": 1.5870917414242605, + "grad_norm": 0.29278447216754283, + "learning_rate": 7.021981148442923e-06, + "loss": 0.4479, + "step": 9664 + }, + { + "epoch": 1.5872559686326033, + "grad_norm": 0.34205414183900085, + "learning_rate": 7.0215927343849095e-06, + "loss": 0.4567, + "step": 9665 + }, + { + "epoch": 1.587420195840946, + "grad_norm": 0.30235224417403145, + "learning_rate": 7.021204292939917e-06, + "loss": 0.4504, + "step": 9666 + }, + { + "epoch": 1.5875844230492886, + "grad_norm": 0.3204273273921009, + "learning_rate": 7.020815824112166e-06, + "loss": 0.4555, + "step": 9667 + }, + { + "epoch": 1.5877486502576315, + "grad_norm": 0.29059833973456584, + "learning_rate": 7.020427327905875e-06, + "loss": 0.4524, + "step": 9668 + }, + { + "epoch": 1.5879128774659743, + "grad_norm": 0.312532284678278, + "learning_rate": 7.0200388043252635e-06, + "loss": 0.4531, + "step": 9669 + }, + { + "epoch": 1.588077104674317, + "grad_norm": 0.4001305657269944, + "learning_rate": 7.019650253374552e-06, + "loss": 0.4649, + "step": 9670 + }, + { + "epoch": 1.5882413318826596, + "grad_norm": 0.29994775751366703, + "learning_rate": 7.019261675057958e-06, + "loss": 0.454, + "step": 9671 + }, + { + "epoch": 1.5884055590910025, + "grad_norm": 0.3719043680773999, + "learning_rate": 7.018873069379705e-06, + "loss": 0.4631, + "step": 9672 + }, + { + "epoch": 1.5885697862993453, + "grad_norm": 0.2949768760769632, + "learning_rate": 7.018484436344012e-06, + "loss": 0.4696, + "step": 9673 + }, + { + "epoch": 1.588734013507688, + "grad_norm": 0.28422629123361914, + "learning_rate": 7.018095775955099e-06, + "loss": 0.4618, + "step": 9674 + }, + { + "epoch": 1.5888982407160306, + "grad_norm": 0.27145460449253184, + "learning_rate": 7.0177070882171895e-06, + "loss": 0.4839, + "step": 9675 + }, + { + "epoch": 1.5890624679243732, + "grad_norm": 0.3286664513768836, + "learning_rate": 7.017318373134504e-06, + "loss": 0.4594, + "step": 9676 + }, + { + "epoch": 1.589226695132716, + "grad_norm": 0.3619168102990762, + "learning_rate": 7.0169296307112635e-06, + "loss": 0.4444, + "step": 9677 + }, + { + "epoch": 1.589390922341059, + "grad_norm": 0.28446907903214214, + "learning_rate": 7.0165408609516916e-06, + "loss": 0.4572, + "step": 9678 + }, + { + "epoch": 1.5895551495494016, + "grad_norm": 0.352917143530713, + "learning_rate": 7.0161520638600105e-06, + "loss": 0.4514, + "step": 9679 + }, + { + "epoch": 1.5897193767577442, + "grad_norm": 0.35002679253958296, + "learning_rate": 7.01576323944044e-06, + "loss": 0.4588, + "step": 9680 + }, + { + "epoch": 1.589883603966087, + "grad_norm": 0.32197653074915533, + "learning_rate": 7.015374387697208e-06, + "loss": 0.4674, + "step": 9681 + }, + { + "epoch": 1.59004783117443, + "grad_norm": 0.29434146515200116, + "learning_rate": 7.014985508634535e-06, + "loss": 0.4843, + "step": 9682 + }, + { + "epoch": 1.5902120583827726, + "grad_norm": 0.32987470643692046, + "learning_rate": 7.014596602256644e-06, + "loss": 0.4927, + "step": 9683 + }, + { + "epoch": 1.5903762855911152, + "grad_norm": 0.43707017640769624, + "learning_rate": 7.014207668567761e-06, + "loss": 0.4806, + "step": 9684 + }, + { + "epoch": 1.590540512799458, + "grad_norm": 0.37174042084246073, + "learning_rate": 7.013818707572109e-06, + "loss": 0.483, + "step": 9685 + }, + { + "epoch": 1.590704740007801, + "grad_norm": 0.29865649575691555, + "learning_rate": 7.013429719273912e-06, + "loss": 0.4724, + "step": 9686 + }, + { + "epoch": 1.5908689672161436, + "grad_norm": 0.28389824575492095, + "learning_rate": 7.0130407036773955e-06, + "loss": 0.4617, + "step": 9687 + }, + { + "epoch": 1.5910331944244862, + "grad_norm": 0.4856752735113594, + "learning_rate": 7.0126516607867845e-06, + "loss": 0.4588, + "step": 9688 + }, + { + "epoch": 1.591197421632829, + "grad_norm": 0.3280719264639045, + "learning_rate": 7.012262590606304e-06, + "loss": 0.4744, + "step": 9689 + }, + { + "epoch": 1.591361648841172, + "grad_norm": 0.3017016566131072, + "learning_rate": 7.011873493140181e-06, + "loss": 0.4561, + "step": 9690 + }, + { + "epoch": 1.5915258760495146, + "grad_norm": 0.27235999254119797, + "learning_rate": 7.011484368392639e-06, + "loss": 0.4589, + "step": 9691 + }, + { + "epoch": 1.5916901032578572, + "grad_norm": 0.293826082990073, + "learning_rate": 7.011095216367906e-06, + "loss": 0.4538, + "step": 9692 + }, + { + "epoch": 1.5918543304661998, + "grad_norm": 0.30234293241414956, + "learning_rate": 7.010706037070209e-06, + "loss": 0.461, + "step": 9693 + }, + { + "epoch": 1.5920185576745427, + "grad_norm": 0.2807863448579779, + "learning_rate": 7.010316830503775e-06, + "loss": 0.4579, + "step": 9694 + }, + { + "epoch": 1.5921827848828856, + "grad_norm": 0.25374035826197255, + "learning_rate": 7.009927596672829e-06, + "loss": 0.4783, + "step": 9695 + }, + { + "epoch": 1.5923470120912282, + "grad_norm": 0.28295051020587353, + "learning_rate": 7.009538335581601e-06, + "loss": 0.4487, + "step": 9696 + }, + { + "epoch": 1.5925112392995708, + "grad_norm": 0.9402708462138283, + "learning_rate": 7.0091490472343165e-06, + "loss": 0.4652, + "step": 9697 + }, + { + "epoch": 1.5926754665079137, + "grad_norm": 0.3089325539254181, + "learning_rate": 7.008759731635206e-06, + "loss": 0.4736, + "step": 9698 + }, + { + "epoch": 1.5928396937162566, + "grad_norm": 0.28210806545171774, + "learning_rate": 7.008370388788496e-06, + "loss": 0.4424, + "step": 9699 + }, + { + "epoch": 1.5930039209245992, + "grad_norm": 0.33788709073616296, + "learning_rate": 7.007981018698415e-06, + "loss": 0.4705, + "step": 9700 + }, + { + "epoch": 1.5931681481329418, + "grad_norm": 0.2810052632243497, + "learning_rate": 7.007591621369193e-06, + "loss": 0.4618, + "step": 9701 + }, + { + "epoch": 1.5933323753412847, + "grad_norm": 0.3861673845526481, + "learning_rate": 7.00720219680506e-06, + "loss": 0.453, + "step": 9702 + }, + { + "epoch": 1.5934966025496275, + "grad_norm": 0.29160915803694276, + "learning_rate": 7.006812745010243e-06, + "loss": 0.4733, + "step": 9703 + }, + { + "epoch": 1.5936608297579702, + "grad_norm": 0.33914569659292165, + "learning_rate": 7.006423265988972e-06, + "loss": 0.4515, + "step": 9704 + }, + { + "epoch": 1.5938250569663128, + "grad_norm": 0.2550988195503919, + "learning_rate": 7.006033759745481e-06, + "loss": 0.4619, + "step": 9705 + }, + { + "epoch": 1.5939892841746557, + "grad_norm": 0.29945375984337574, + "learning_rate": 7.005644226283997e-06, + "loss": 0.4713, + "step": 9706 + }, + { + "epoch": 1.5941535113829983, + "grad_norm": 0.25626086854928126, + "learning_rate": 7.005254665608751e-06, + "loss": 0.4694, + "step": 9707 + }, + { + "epoch": 1.5943177385913412, + "grad_norm": 0.2478955917014123, + "learning_rate": 7.004865077723974e-06, + "loss": 0.4469, + "step": 9708 + }, + { + "epoch": 1.5944819657996838, + "grad_norm": 0.261073698927775, + "learning_rate": 7.0044754626339e-06, + "loss": 0.4424, + "step": 9709 + }, + { + "epoch": 1.5946461930080265, + "grad_norm": 0.29944897321085306, + "learning_rate": 7.0040858203427555e-06, + "loss": 0.4598, + "step": 9710 + }, + { + "epoch": 1.5948104202163693, + "grad_norm": 0.3029852969342873, + "learning_rate": 7.003696150854777e-06, + "loss": 0.4568, + "step": 9711 + }, + { + "epoch": 1.5949746474247122, + "grad_norm": 0.3545481979767949, + "learning_rate": 7.003306454174195e-06, + "loss": 0.4739, + "step": 9712 + }, + { + "epoch": 1.5951388746330548, + "grad_norm": 0.2915600645900402, + "learning_rate": 7.002916730305242e-06, + "loss": 0.4731, + "step": 9713 + }, + { + "epoch": 1.5953031018413975, + "grad_norm": 0.30375434938571066, + "learning_rate": 7.00252697925215e-06, + "loss": 0.4594, + "step": 9714 + }, + { + "epoch": 1.5954673290497403, + "grad_norm": 0.3858113833425676, + "learning_rate": 7.002137201019153e-06, + "loss": 0.4711, + "step": 9715 + }, + { + "epoch": 1.5956315562580832, + "grad_norm": 0.306509544782491, + "learning_rate": 7.001747395610485e-06, + "loss": 0.4478, + "step": 9716 + }, + { + "epoch": 1.5957957834664258, + "grad_norm": 0.3747588996266856, + "learning_rate": 7.001357563030378e-06, + "loss": 0.4668, + "step": 9717 + }, + { + "epoch": 1.5959600106747684, + "grad_norm": 0.24590682956768997, + "learning_rate": 7.000967703283067e-06, + "loss": 0.4767, + "step": 9718 + }, + { + "epoch": 1.5961242378831113, + "grad_norm": 0.389786665880499, + "learning_rate": 7.000577816372787e-06, + "loss": 0.4499, + "step": 9719 + }, + { + "epoch": 1.5962884650914542, + "grad_norm": 0.3980182584891584, + "learning_rate": 7.0001879023037704e-06, + "loss": 0.4691, + "step": 9720 + }, + { + "epoch": 1.5964526922997968, + "grad_norm": 1.5256900334767118, + "learning_rate": 6.999797961080255e-06, + "loss": 0.4411, + "step": 9721 + }, + { + "epoch": 1.5966169195081394, + "grad_norm": 0.341679700793873, + "learning_rate": 6.999407992706472e-06, + "loss": 0.4636, + "step": 9722 + }, + { + "epoch": 1.5967811467164823, + "grad_norm": 0.3189785275812659, + "learning_rate": 6.999017997186662e-06, + "loss": 0.4502, + "step": 9723 + }, + { + "epoch": 1.596945373924825, + "grad_norm": 0.3096324528487003, + "learning_rate": 6.998627974525056e-06, + "loss": 0.4658, + "step": 9724 + }, + { + "epoch": 1.5971096011331678, + "grad_norm": 0.32705409411953407, + "learning_rate": 6.998237924725891e-06, + "loss": 0.473, + "step": 9725 + }, + { + "epoch": 1.5972738283415104, + "grad_norm": 0.2877085709462597, + "learning_rate": 6.997847847793406e-06, + "loss": 0.4643, + "step": 9726 + }, + { + "epoch": 1.597438055549853, + "grad_norm": 0.3923418706541653, + "learning_rate": 6.997457743731836e-06, + "loss": 0.4555, + "step": 9727 + }, + { + "epoch": 1.597602282758196, + "grad_norm": 0.3107603017489276, + "learning_rate": 6.997067612545416e-06, + "loss": 0.4443, + "step": 9728 + }, + { + "epoch": 1.5977665099665388, + "grad_norm": 0.32297011284910904, + "learning_rate": 6.996677454238386e-06, + "loss": 0.4674, + "step": 9729 + }, + { + "epoch": 1.5979307371748814, + "grad_norm": 0.31297648005174855, + "learning_rate": 6.996287268814981e-06, + "loss": 0.4502, + "step": 9730 + }, + { + "epoch": 1.598094964383224, + "grad_norm": 0.33511246862645244, + "learning_rate": 6.9958970562794435e-06, + "loss": 0.4729, + "step": 9731 + }, + { + "epoch": 1.598259191591567, + "grad_norm": 0.34563244047173136, + "learning_rate": 6.995506816636005e-06, + "loss": 0.4593, + "step": 9732 + }, + { + "epoch": 1.5984234187999098, + "grad_norm": 0.2745624451603022, + "learning_rate": 6.9951165498889085e-06, + "loss": 0.4536, + "step": 9733 + }, + { + "epoch": 1.5985876460082524, + "grad_norm": 0.3344724710839866, + "learning_rate": 6.994726256042392e-06, + "loss": 0.4574, + "step": 9734 + }, + { + "epoch": 1.598751873216595, + "grad_norm": 0.9641068009989658, + "learning_rate": 6.994335935100693e-06, + "loss": 0.4694, + "step": 9735 + }, + { + "epoch": 1.598916100424938, + "grad_norm": 0.32579840476566196, + "learning_rate": 6.993945587068053e-06, + "loss": 0.4849, + "step": 9736 + }, + { + "epoch": 1.5990803276332808, + "grad_norm": 0.33205921777574166, + "learning_rate": 6.993555211948709e-06, + "loss": 0.4692, + "step": 9737 + }, + { + "epoch": 1.5992445548416234, + "grad_norm": 0.2770176558361628, + "learning_rate": 6.993164809746901e-06, + "loss": 0.4531, + "step": 9738 + }, + { + "epoch": 1.599408782049966, + "grad_norm": 0.2972289466677687, + "learning_rate": 6.992774380466872e-06, + "loss": 0.4647, + "step": 9739 + }, + { + "epoch": 1.599573009258309, + "grad_norm": 0.5161134035043563, + "learning_rate": 6.99238392411286e-06, + "loss": 0.4444, + "step": 9740 + }, + { + "epoch": 1.5997372364666516, + "grad_norm": 0.3632176478786719, + "learning_rate": 6.991993440689107e-06, + "loss": 0.4536, + "step": 9741 + }, + { + "epoch": 1.5999014636749944, + "grad_norm": 0.3972599771589535, + "learning_rate": 6.991602930199853e-06, + "loss": 0.4565, + "step": 9742 + }, + { + "epoch": 1.600065690883337, + "grad_norm": 0.34852573709915075, + "learning_rate": 6.991212392649341e-06, + "loss": 0.4395, + "step": 9743 + }, + { + "epoch": 1.6002299180916797, + "grad_norm": 0.378242100179647, + "learning_rate": 6.990821828041809e-06, + "loss": 0.4815, + "step": 9744 + }, + { + "epoch": 1.6003941453000226, + "grad_norm": 0.2997246411725691, + "learning_rate": 6.990431236381503e-06, + "loss": 0.4667, + "step": 9745 + }, + { + "epoch": 1.6005583725083654, + "grad_norm": 0.2928757734583082, + "learning_rate": 6.990040617672663e-06, + "loss": 0.464, + "step": 9746 + }, + { + "epoch": 1.600722599716708, + "grad_norm": 0.2671624667674582, + "learning_rate": 6.989649971919531e-06, + "loss": 0.4643, + "step": 9747 + }, + { + "epoch": 1.6008868269250507, + "grad_norm": 0.2615783823407928, + "learning_rate": 6.989259299126353e-06, + "loss": 0.45, + "step": 9748 + }, + { + "epoch": 1.6010510541333935, + "grad_norm": 0.31506464269600354, + "learning_rate": 6.988868599297368e-06, + "loss": 0.456, + "step": 9749 + }, + { + "epoch": 1.6012152813417364, + "grad_norm": 0.2857207363724411, + "learning_rate": 6.988477872436822e-06, + "loss": 0.4596, + "step": 9750 + }, + { + "epoch": 1.601379508550079, + "grad_norm": 0.3117589736533758, + "learning_rate": 6.988087118548959e-06, + "loss": 0.4498, + "step": 9751 + }, + { + "epoch": 1.6015437357584217, + "grad_norm": 0.26792546533856765, + "learning_rate": 6.9876963376380206e-06, + "loss": 0.4418, + "step": 9752 + }, + { + "epoch": 1.6017079629667645, + "grad_norm": 0.3510529580662092, + "learning_rate": 6.987305529708253e-06, + "loss": 0.4754, + "step": 9753 + }, + { + "epoch": 1.6018721901751074, + "grad_norm": 0.34665871073162313, + "learning_rate": 6.986914694763899e-06, + "loss": 0.4495, + "step": 9754 + }, + { + "epoch": 1.60203641738345, + "grad_norm": 0.2704560117494492, + "learning_rate": 6.986523832809207e-06, + "loss": 0.4563, + "step": 9755 + }, + { + "epoch": 1.6022006445917927, + "grad_norm": 0.39103153382891753, + "learning_rate": 6.986132943848418e-06, + "loss": 0.4657, + "step": 9756 + }, + { + "epoch": 1.6023648718001355, + "grad_norm": 0.27479927593805215, + "learning_rate": 6.985742027885779e-06, + "loss": 0.4703, + "step": 9757 + }, + { + "epoch": 1.6025290990084782, + "grad_norm": 0.34785620793652167, + "learning_rate": 6.985351084925537e-06, + "loss": 0.4459, + "step": 9758 + }, + { + "epoch": 1.602693326216821, + "grad_norm": 0.3060687588327717, + "learning_rate": 6.984960114971936e-06, + "loss": 0.4774, + "step": 9759 + }, + { + "epoch": 1.6028575534251637, + "grad_norm": 0.37563535546956034, + "learning_rate": 6.9845691180292235e-06, + "loss": 0.4805, + "step": 9760 + }, + { + "epoch": 1.6030217806335063, + "grad_norm": 0.24377779296879026, + "learning_rate": 6.984178094101647e-06, + "loss": 0.4809, + "step": 9761 + }, + { + "epoch": 1.6031860078418492, + "grad_norm": 0.2571146440045776, + "learning_rate": 6.983787043193452e-06, + "loss": 0.4634, + "step": 9762 + }, + { + "epoch": 1.603350235050192, + "grad_norm": 0.3778507318993349, + "learning_rate": 6.983395965308885e-06, + "loss": 0.4464, + "step": 9763 + }, + { + "epoch": 1.6035144622585347, + "grad_norm": 0.30104596005073675, + "learning_rate": 6.983004860452195e-06, + "loss": 0.463, + "step": 9764 + }, + { + "epoch": 1.6036786894668773, + "grad_norm": 0.29668588814681535, + "learning_rate": 6.982613728627629e-06, + "loss": 0.4557, + "step": 9765 + }, + { + "epoch": 1.6038429166752202, + "grad_norm": 0.4248514538815655, + "learning_rate": 6.982222569839436e-06, + "loss": 0.4238, + "step": 9766 + }, + { + "epoch": 1.604007143883563, + "grad_norm": 0.36176526973408535, + "learning_rate": 6.981831384091863e-06, + "loss": 0.4639, + "step": 9767 + }, + { + "epoch": 1.6041713710919057, + "grad_norm": 0.3367393011027692, + "learning_rate": 6.981440171389158e-06, + "loss": 0.4729, + "step": 9768 + }, + { + "epoch": 1.6043355983002483, + "grad_norm": 0.29669673074478037, + "learning_rate": 6.981048931735574e-06, + "loss": 0.4804, + "step": 9769 + }, + { + "epoch": 1.6044998255085912, + "grad_norm": 0.2980190900348066, + "learning_rate": 6.980657665135357e-06, + "loss": 0.4661, + "step": 9770 + }, + { + "epoch": 1.604664052716934, + "grad_norm": 0.332201811865359, + "learning_rate": 6.980266371592756e-06, + "loss": 0.4418, + "step": 9771 + }, + { + "epoch": 1.6048282799252767, + "grad_norm": 0.40769861897148146, + "learning_rate": 6.979875051112023e-06, + "loss": 0.4509, + "step": 9772 + }, + { + "epoch": 1.6049925071336193, + "grad_norm": 0.28501944575324484, + "learning_rate": 6.979483703697408e-06, + "loss": 0.4693, + "step": 9773 + }, + { + "epoch": 1.6051567343419622, + "grad_norm": 0.2760017459121368, + "learning_rate": 6.979092329353159e-06, + "loss": 0.4627, + "step": 9774 + }, + { + "epoch": 1.6053209615503048, + "grad_norm": 0.3752488518830528, + "learning_rate": 6.978700928083527e-06, + "loss": 0.4763, + "step": 9775 + }, + { + "epoch": 1.6054851887586477, + "grad_norm": 0.3251274051588068, + "learning_rate": 6.9783094998927655e-06, + "loss": 0.4671, + "step": 9776 + }, + { + "epoch": 1.6056494159669903, + "grad_norm": 0.36151954149272725, + "learning_rate": 6.977918044785125e-06, + "loss": 0.453, + "step": 9777 + }, + { + "epoch": 1.605813643175333, + "grad_norm": 0.37377513986589384, + "learning_rate": 6.9775265627648565e-06, + "loss": 0.4644, + "step": 9778 + }, + { + "epoch": 1.6059778703836758, + "grad_norm": 0.28806517291802397, + "learning_rate": 6.977135053836211e-06, + "loss": 0.4641, + "step": 9779 + }, + { + "epoch": 1.6061420975920186, + "grad_norm": 0.25543690612044667, + "learning_rate": 6.976743518003443e-06, + "loss": 0.4595, + "step": 9780 + }, + { + "epoch": 1.6063063248003613, + "grad_norm": 0.2927473928733643, + "learning_rate": 6.976351955270803e-06, + "loss": 0.4852, + "step": 9781 + }, + { + "epoch": 1.606470552008704, + "grad_norm": 0.32663703524442445, + "learning_rate": 6.975960365642544e-06, + "loss": 0.4828, + "step": 9782 + }, + { + "epoch": 1.6066347792170468, + "grad_norm": 0.33126405619204863, + "learning_rate": 6.9755687491229195e-06, + "loss": 0.4574, + "step": 9783 + }, + { + "epoch": 1.6067990064253896, + "grad_norm": 0.27115302052887735, + "learning_rate": 6.975177105716184e-06, + "loss": 0.4554, + "step": 9784 + }, + { + "epoch": 1.6069632336337323, + "grad_norm": 0.5938177654376247, + "learning_rate": 6.974785435426588e-06, + "loss": 0.4748, + "step": 9785 + }, + { + "epoch": 1.607127460842075, + "grad_norm": 0.3709570909137645, + "learning_rate": 6.974393738258388e-06, + "loss": 0.4737, + "step": 9786 + }, + { + "epoch": 1.6072916880504178, + "grad_norm": 0.29217409026482316, + "learning_rate": 6.974002014215839e-06, + "loss": 0.4594, + "step": 9787 + }, + { + "epoch": 1.6074559152587606, + "grad_norm": 0.5522762635217223, + "learning_rate": 6.973610263303191e-06, + "loss": 0.4405, + "step": 9788 + }, + { + "epoch": 1.6076201424671033, + "grad_norm": 0.2700062333742084, + "learning_rate": 6.973218485524704e-06, + "loss": 0.4527, + "step": 9789 + }, + { + "epoch": 1.607784369675446, + "grad_norm": 0.2752977343062548, + "learning_rate": 6.972826680884631e-06, + "loss": 0.4579, + "step": 9790 + }, + { + "epoch": 1.6079485968837888, + "grad_norm": 0.351632135170519, + "learning_rate": 6.972434849387226e-06, + "loss": 0.4488, + "step": 9791 + }, + { + "epoch": 1.6081128240921314, + "grad_norm": 0.30396405938296145, + "learning_rate": 6.972042991036747e-06, + "loss": 0.4552, + "step": 9792 + }, + { + "epoch": 1.6082770513004743, + "grad_norm": 0.2923401297274902, + "learning_rate": 6.971651105837449e-06, + "loss": 0.4671, + "step": 9793 + }, + { + "epoch": 1.608441278508817, + "grad_norm": 0.36307972078813605, + "learning_rate": 6.971259193793588e-06, + "loss": 0.4382, + "step": 9794 + }, + { + "epoch": 1.6086055057171595, + "grad_norm": 0.31702386181625597, + "learning_rate": 6.970867254909422e-06, + "loss": 0.4675, + "step": 9795 + }, + { + "epoch": 1.6087697329255024, + "grad_norm": 0.3142731818838145, + "learning_rate": 6.970475289189204e-06, + "loss": 0.459, + "step": 9796 + }, + { + "epoch": 1.6089339601338453, + "grad_norm": 0.36241745736101644, + "learning_rate": 6.970083296637196e-06, + "loss": 0.4748, + "step": 9797 + }, + { + "epoch": 1.609098187342188, + "grad_norm": 0.3470417056667347, + "learning_rate": 6.969691277257652e-06, + "loss": 0.469, + "step": 9798 + }, + { + "epoch": 1.6092624145505305, + "grad_norm": 0.41937451840307666, + "learning_rate": 6.969299231054831e-06, + "loss": 0.4564, + "step": 9799 + }, + { + "epoch": 1.6094266417588734, + "grad_norm": 0.3262573052858504, + "learning_rate": 6.9689071580329905e-06, + "loss": 0.4587, + "step": 9800 + }, + { + "epoch": 1.6095908689672163, + "grad_norm": 0.2947421917980309, + "learning_rate": 6.96851505819639e-06, + "loss": 0.4533, + "step": 9801 + }, + { + "epoch": 1.609755096175559, + "grad_norm": 0.2521016894403766, + "learning_rate": 6.968122931549288e-06, + "loss": 0.4593, + "step": 9802 + }, + { + "epoch": 1.6099193233839015, + "grad_norm": 0.2955673416323675, + "learning_rate": 6.9677307780959416e-06, + "loss": 0.4666, + "step": 9803 + }, + { + "epoch": 1.6100835505922444, + "grad_norm": 0.27659905950201386, + "learning_rate": 6.96733859784061e-06, + "loss": 0.4489, + "step": 9804 + }, + { + "epoch": 1.6102477778005873, + "grad_norm": 0.3404542615909959, + "learning_rate": 6.966946390787554e-06, + "loss": 0.4602, + "step": 9805 + }, + { + "epoch": 1.61041200500893, + "grad_norm": 0.2845051029226587, + "learning_rate": 6.9665541569410355e-06, + "loss": 0.4692, + "step": 9806 + }, + { + "epoch": 1.6105762322172725, + "grad_norm": 0.38260420051370825, + "learning_rate": 6.96616189630531e-06, + "loss": 0.4759, + "step": 9807 + }, + { + "epoch": 1.6107404594256154, + "grad_norm": 0.26683465973181375, + "learning_rate": 6.96576960888464e-06, + "loss": 0.4739, + "step": 9808 + }, + { + "epoch": 1.610904686633958, + "grad_norm": 0.384929446621508, + "learning_rate": 6.965377294683286e-06, + "loss": 0.438, + "step": 9809 + }, + { + "epoch": 1.6110689138423009, + "grad_norm": 0.38374467343084134, + "learning_rate": 6.964984953705509e-06, + "loss": 0.4729, + "step": 9810 + }, + { + "epoch": 1.6112331410506435, + "grad_norm": 0.2694526018583905, + "learning_rate": 6.964592585955571e-06, + "loss": 0.4557, + "step": 9811 + }, + { + "epoch": 1.6113973682589862, + "grad_norm": 0.3115303986468197, + "learning_rate": 6.964200191437732e-06, + "loss": 0.4506, + "step": 9812 + }, + { + "epoch": 1.611561595467329, + "grad_norm": 0.3035293723273839, + "learning_rate": 6.963807770156254e-06, + "loss": 0.4641, + "step": 9813 + }, + { + "epoch": 1.6117258226756719, + "grad_norm": 0.2992017924804873, + "learning_rate": 6.963415322115402e-06, + "loss": 0.4536, + "step": 9814 + }, + { + "epoch": 1.6118900498840145, + "grad_norm": 0.31667569525527806, + "learning_rate": 6.963022847319434e-06, + "loss": 0.4551, + "step": 9815 + }, + { + "epoch": 1.6120542770923572, + "grad_norm": 0.2540364674293299, + "learning_rate": 6.962630345772615e-06, + "loss": 0.4708, + "step": 9816 + }, + { + "epoch": 1.6122185043007, + "grad_norm": 0.4442076282948899, + "learning_rate": 6.962237817479207e-06, + "loss": 0.4743, + "step": 9817 + }, + { + "epoch": 1.6123827315090429, + "grad_norm": 0.2903014811736007, + "learning_rate": 6.961845262443474e-06, + "loss": 0.4645, + "step": 9818 + }, + { + "epoch": 1.6125469587173855, + "grad_norm": 0.36913889456903465, + "learning_rate": 6.96145268066968e-06, + "loss": 0.4619, + "step": 9819 + }, + { + "epoch": 1.6127111859257282, + "grad_norm": 0.4006840087480322, + "learning_rate": 6.961060072162087e-06, + "loss": 0.4707, + "step": 9820 + }, + { + "epoch": 1.612875413134071, + "grad_norm": 0.3353180038710587, + "learning_rate": 6.960667436924961e-06, + "loss": 0.4773, + "step": 9821 + }, + { + "epoch": 1.6130396403424139, + "grad_norm": 0.28696136203207506, + "learning_rate": 6.960274774962565e-06, + "loss": 0.4412, + "step": 9822 + }, + { + "epoch": 1.6132038675507565, + "grad_norm": 0.2864030214755703, + "learning_rate": 6.959882086279166e-06, + "loss": 0.4622, + "step": 9823 + }, + { + "epoch": 1.6133680947590991, + "grad_norm": 0.2985379366016787, + "learning_rate": 6.959489370879026e-06, + "loss": 0.4611, + "step": 9824 + }, + { + "epoch": 1.613532321967442, + "grad_norm": 0.28314934992789903, + "learning_rate": 6.959096628766411e-06, + "loss": 0.4729, + "step": 9825 + }, + { + "epoch": 1.6136965491757846, + "grad_norm": 0.38930289972293297, + "learning_rate": 6.9587038599455874e-06, + "loss": 0.4515, + "step": 9826 + }, + { + "epoch": 1.6138607763841275, + "grad_norm": 0.787582127701228, + "learning_rate": 6.958311064420822e-06, + "loss": 0.4569, + "step": 9827 + }, + { + "epoch": 1.6140250035924701, + "grad_norm": 0.27450397102919527, + "learning_rate": 6.957918242196379e-06, + "loss": 0.4619, + "step": 9828 + }, + { + "epoch": 1.6141892308008128, + "grad_norm": 0.4634130614046554, + "learning_rate": 6.957525393276526e-06, + "loss": 0.4574, + "step": 9829 + }, + { + "epoch": 1.6143534580091556, + "grad_norm": 0.32687976750827596, + "learning_rate": 6.957132517665529e-06, + "loss": 0.4478, + "step": 9830 + }, + { + "epoch": 1.6145176852174985, + "grad_norm": 0.2749625418731786, + "learning_rate": 6.9567396153676556e-06, + "loss": 0.4787, + "step": 9831 + }, + { + "epoch": 1.6146819124258411, + "grad_norm": 0.2686148625927595, + "learning_rate": 6.956346686387174e-06, + "loss": 0.4641, + "step": 9832 + }, + { + "epoch": 1.6148461396341838, + "grad_norm": 0.27749373643361147, + "learning_rate": 6.955953730728349e-06, + "loss": 0.4478, + "step": 9833 + }, + { + "epoch": 1.6150103668425266, + "grad_norm": 0.33035669372922283, + "learning_rate": 6.95556074839545e-06, + "loss": 0.4574, + "step": 9834 + }, + { + "epoch": 1.6151745940508695, + "grad_norm": 0.319743000157342, + "learning_rate": 6.955167739392747e-06, + "loss": 0.4471, + "step": 9835 + }, + { + "epoch": 1.6153388212592121, + "grad_norm": 0.26117931070035566, + "learning_rate": 6.954774703724506e-06, + "loss": 0.4639, + "step": 9836 + }, + { + "epoch": 1.6155030484675548, + "grad_norm": 0.27760241909976285, + "learning_rate": 6.9543816413949965e-06, + "loss": 0.4572, + "step": 9837 + }, + { + "epoch": 1.6156672756758976, + "grad_norm": 0.2998866914995418, + "learning_rate": 6.953988552408487e-06, + "loss": 0.4551, + "step": 9838 + }, + { + "epoch": 1.6158315028842405, + "grad_norm": 0.30021599310248603, + "learning_rate": 6.953595436769248e-06, + "loss": 0.4615, + "step": 9839 + }, + { + "epoch": 1.6159957300925831, + "grad_norm": 0.31256899332836824, + "learning_rate": 6.953202294481548e-06, + "loss": 0.4541, + "step": 9840 + }, + { + "epoch": 1.6161599573009258, + "grad_norm": 0.3899196834494022, + "learning_rate": 6.952809125549658e-06, + "loss": 0.4771, + "step": 9841 + }, + { + "epoch": 1.6163241845092686, + "grad_norm": 0.3010082901322875, + "learning_rate": 6.952415929977848e-06, + "loss": 0.4719, + "step": 9842 + }, + { + "epoch": 1.6164884117176113, + "grad_norm": 0.3576168449635639, + "learning_rate": 6.952022707770387e-06, + "loss": 0.4404, + "step": 9843 + }, + { + "epoch": 1.6166526389259541, + "grad_norm": 0.2760702244129268, + "learning_rate": 6.951629458931548e-06, + "loss": 0.4578, + "step": 9844 + }, + { + "epoch": 1.6168168661342968, + "grad_norm": 0.2749007165277947, + "learning_rate": 6.951236183465601e-06, + "loss": 0.45, + "step": 9845 + }, + { + "epoch": 1.6169810933426394, + "grad_norm": 0.31390505673084157, + "learning_rate": 6.950842881376816e-06, + "loss": 0.4495, + "step": 9846 + }, + { + "epoch": 1.6171453205509823, + "grad_norm": 0.2868629433154881, + "learning_rate": 6.9504495526694675e-06, + "loss": 0.4534, + "step": 9847 + }, + { + "epoch": 1.6173095477593251, + "grad_norm": 0.312588550829176, + "learning_rate": 6.9500561973478264e-06, + "loss": 0.4478, + "step": 9848 + }, + { + "epoch": 1.6174737749676678, + "grad_norm": 0.2958477322250829, + "learning_rate": 6.949662815416163e-06, + "loss": 0.464, + "step": 9849 + }, + { + "epoch": 1.6176380021760104, + "grad_norm": 0.29871300762114433, + "learning_rate": 6.949269406878752e-06, + "loss": 0.4636, + "step": 9850 + }, + { + "epoch": 1.6178022293843533, + "grad_norm": 0.40407588926968097, + "learning_rate": 6.9488759717398645e-06, + "loss": 0.4673, + "step": 9851 + }, + { + "epoch": 1.6179664565926961, + "grad_norm": 0.27940144990316745, + "learning_rate": 6.948482510003776e-06, + "loss": 0.4671, + "step": 9852 + }, + { + "epoch": 1.6181306838010387, + "grad_norm": 0.30202586554252114, + "learning_rate": 6.948089021674758e-06, + "loss": 0.4433, + "step": 9853 + }, + { + "epoch": 1.6182949110093814, + "grad_norm": 0.285583505265655, + "learning_rate": 6.947695506757084e-06, + "loss": 0.4786, + "step": 9854 + }, + { + "epoch": 1.6184591382177242, + "grad_norm": 0.39905382824632013, + "learning_rate": 6.947301965255029e-06, + "loss": 0.4724, + "step": 9855 + }, + { + "epoch": 1.618623365426067, + "grad_norm": 0.307863136962308, + "learning_rate": 6.946908397172866e-06, + "loss": 0.4232, + "step": 9856 + }, + { + "epoch": 1.6187875926344097, + "grad_norm": 0.43057042653410355, + "learning_rate": 6.946514802514872e-06, + "loss": 0.451, + "step": 9857 + }, + { + "epoch": 1.6189518198427524, + "grad_norm": 0.3171082589247781, + "learning_rate": 6.94612118128532e-06, + "loss": 0.4396, + "step": 9858 + }, + { + "epoch": 1.6191160470510952, + "grad_norm": 0.29330682282221376, + "learning_rate": 6.945727533488483e-06, + "loss": 0.4546, + "step": 9859 + }, + { + "epoch": 1.6192802742594379, + "grad_norm": 0.2870797937588179, + "learning_rate": 6.945333859128642e-06, + "loss": 0.4634, + "step": 9860 + }, + { + "epoch": 1.6194445014677807, + "grad_norm": 0.3051040740644912, + "learning_rate": 6.944940158210067e-06, + "loss": 0.4536, + "step": 9861 + }, + { + "epoch": 1.6196087286761234, + "grad_norm": 0.41361444535474173, + "learning_rate": 6.9445464307370375e-06, + "loss": 0.458, + "step": 9862 + }, + { + "epoch": 1.619772955884466, + "grad_norm": 0.25243414567729117, + "learning_rate": 6.944152676713828e-06, + "loss": 0.4663, + "step": 9863 + }, + { + "epoch": 1.6199371830928089, + "grad_norm": 0.3815051393474824, + "learning_rate": 6.943758896144715e-06, + "loss": 0.444, + "step": 9864 + }, + { + "epoch": 1.6201014103011517, + "grad_norm": 0.38218490759074814, + "learning_rate": 6.943365089033979e-06, + "loss": 0.4714, + "step": 9865 + }, + { + "epoch": 1.6202656375094944, + "grad_norm": 0.28814253453168126, + "learning_rate": 6.942971255385893e-06, + "loss": 0.4753, + "step": 9866 + }, + { + "epoch": 1.620429864717837, + "grad_norm": 0.337629238841208, + "learning_rate": 6.9425773952047335e-06, + "loss": 0.4506, + "step": 9867 + }, + { + "epoch": 1.6205940919261799, + "grad_norm": 0.25032929018512107, + "learning_rate": 6.942183508494782e-06, + "loss": 0.4624, + "step": 9868 + }, + { + "epoch": 1.6207583191345227, + "grad_norm": 0.28775404020259854, + "learning_rate": 6.941789595260315e-06, + "loss": 0.4854, + "step": 9869 + }, + { + "epoch": 1.6209225463428654, + "grad_norm": 0.28379877031843614, + "learning_rate": 6.941395655505611e-06, + "loss": 0.4747, + "step": 9870 + }, + { + "epoch": 1.621086773551208, + "grad_norm": 0.8202653712403721, + "learning_rate": 6.941001689234946e-06, + "loss": 0.4387, + "step": 9871 + }, + { + "epoch": 1.6212510007595509, + "grad_norm": 0.42955934255178224, + "learning_rate": 6.940607696452603e-06, + "loss": 0.4522, + "step": 9872 + }, + { + "epoch": 1.6214152279678937, + "grad_norm": 0.3259530932672982, + "learning_rate": 6.940213677162859e-06, + "loss": 0.4552, + "step": 9873 + }, + { + "epoch": 1.6215794551762364, + "grad_norm": 0.38494136138535234, + "learning_rate": 6.939819631369992e-06, + "loss": 0.4636, + "step": 9874 + }, + { + "epoch": 1.621743682384579, + "grad_norm": 0.2880001166580629, + "learning_rate": 6.939425559078285e-06, + "loss": 0.4682, + "step": 9875 + }, + { + "epoch": 1.6219079095929219, + "grad_norm": 0.2530128315191015, + "learning_rate": 6.939031460292016e-06, + "loss": 0.4465, + "step": 9876 + }, + { + "epoch": 1.6220721368012645, + "grad_norm": 0.29707957324021506, + "learning_rate": 6.938637335015466e-06, + "loss": 0.4585, + "step": 9877 + }, + { + "epoch": 1.6222363640096074, + "grad_norm": 0.2835923151500123, + "learning_rate": 6.938243183252914e-06, + "loss": 0.4799, + "step": 9878 + }, + { + "epoch": 1.62240059121795, + "grad_norm": 0.3185739450507327, + "learning_rate": 6.937849005008643e-06, + "loss": 0.4754, + "step": 9879 + }, + { + "epoch": 1.6225648184262926, + "grad_norm": 0.2816869463340398, + "learning_rate": 6.937454800286933e-06, + "loss": 0.4432, + "step": 9880 + }, + { + "epoch": 1.6227290456346355, + "grad_norm": 0.2856348714662334, + "learning_rate": 6.937060569092066e-06, + "loss": 0.4584, + "step": 9881 + }, + { + "epoch": 1.6228932728429784, + "grad_norm": 0.39852623208599847, + "learning_rate": 6.936666311428324e-06, + "loss": 0.4808, + "step": 9882 + }, + { + "epoch": 1.623057500051321, + "grad_norm": 0.2881186208127396, + "learning_rate": 6.936272027299987e-06, + "loss": 0.4663, + "step": 9883 + }, + { + "epoch": 1.6232217272596636, + "grad_norm": 0.28622926544540417, + "learning_rate": 6.935877716711341e-06, + "loss": 0.4604, + "step": 9884 + }, + { + "epoch": 1.6233859544680065, + "grad_norm": 0.25634085668901097, + "learning_rate": 6.935483379666665e-06, + "loss": 0.4489, + "step": 9885 + }, + { + "epoch": 1.6235501816763493, + "grad_norm": 0.35586491846358337, + "learning_rate": 6.935089016170243e-06, + "loss": 0.4747, + "step": 9886 + }, + { + "epoch": 1.623714408884692, + "grad_norm": 0.45630619008435724, + "learning_rate": 6.934694626226359e-06, + "loss": 0.4803, + "step": 9887 + }, + { + "epoch": 1.6238786360930346, + "grad_norm": 0.414835910725295, + "learning_rate": 6.934300209839296e-06, + "loss": 0.4619, + "step": 9888 + }, + { + "epoch": 1.6240428633013775, + "grad_norm": 0.39488106594808875, + "learning_rate": 6.933905767013337e-06, + "loss": 0.4804, + "step": 9889 + }, + { + "epoch": 1.6242070905097203, + "grad_norm": 0.25434236302766866, + "learning_rate": 6.933511297752769e-06, + "loss": 0.4511, + "step": 9890 + }, + { + "epoch": 1.624371317718063, + "grad_norm": 0.2852061649295925, + "learning_rate": 6.93311680206187e-06, + "loss": 0.4452, + "step": 9891 + }, + { + "epoch": 1.6245355449264056, + "grad_norm": 0.26183435681063855, + "learning_rate": 6.932722279944933e-06, + "loss": 0.443, + "step": 9892 + }, + { + "epoch": 1.6246997721347485, + "grad_norm": 0.3371400824807553, + "learning_rate": 6.932327731406235e-06, + "loss": 0.4562, + "step": 9893 + }, + { + "epoch": 1.6248639993430911, + "grad_norm": 0.259702383047695, + "learning_rate": 6.931933156450068e-06, + "loss": 0.453, + "step": 9894 + }, + { + "epoch": 1.625028226551434, + "grad_norm": 0.2845006703877606, + "learning_rate": 6.931538555080712e-06, + "loss": 0.4577, + "step": 9895 + }, + { + "epoch": 1.6251924537597766, + "grad_norm": 0.2900661570644532, + "learning_rate": 6.931143927302455e-06, + "loss": 0.4455, + "step": 9896 + }, + { + "epoch": 1.6253566809681192, + "grad_norm": 0.2464969379598356, + "learning_rate": 6.930749273119583e-06, + "loss": 0.4546, + "step": 9897 + }, + { + "epoch": 1.625520908176462, + "grad_norm": 0.2571761908465857, + "learning_rate": 6.930354592536384e-06, + "loss": 0.4719, + "step": 9898 + }, + { + "epoch": 1.625685135384805, + "grad_norm": 0.3131139124840701, + "learning_rate": 6.929959885557142e-06, + "loss": 0.4357, + "step": 9899 + }, + { + "epoch": 1.6258493625931476, + "grad_norm": 0.2794163509230121, + "learning_rate": 6.929565152186145e-06, + "loss": 0.4645, + "step": 9900 + }, + { + "epoch": 1.6260135898014902, + "grad_norm": 0.3279839951388422, + "learning_rate": 6.9291703924276795e-06, + "loss": 0.4668, + "step": 9901 + }, + { + "epoch": 1.626177817009833, + "grad_norm": 0.28360673429548217, + "learning_rate": 6.928775606286034e-06, + "loss": 0.4366, + "step": 9902 + }, + { + "epoch": 1.626342044218176, + "grad_norm": 0.26088450799046986, + "learning_rate": 6.928380793765495e-06, + "loss": 0.4621, + "step": 9903 + }, + { + "epoch": 1.6265062714265186, + "grad_norm": 0.31179713297315687, + "learning_rate": 6.927985954870352e-06, + "loss": 0.4639, + "step": 9904 + }, + { + "epoch": 1.6266704986348612, + "grad_norm": 0.31748956718889826, + "learning_rate": 6.927591089604894e-06, + "loss": 0.4539, + "step": 9905 + }, + { + "epoch": 1.626834725843204, + "grad_norm": 0.2803655727982917, + "learning_rate": 6.927196197973406e-06, + "loss": 0.4618, + "step": 9906 + }, + { + "epoch": 1.626998953051547, + "grad_norm": 0.27920456966557644, + "learning_rate": 6.926801279980181e-06, + "loss": 0.4754, + "step": 9907 + }, + { + "epoch": 1.6271631802598896, + "grad_norm": 0.35933877215535787, + "learning_rate": 6.926406335629506e-06, + "loss": 0.4694, + "step": 9908 + }, + { + "epoch": 1.6273274074682322, + "grad_norm": 0.32910657102073315, + "learning_rate": 6.926011364925671e-06, + "loss": 0.4563, + "step": 9909 + }, + { + "epoch": 1.627491634676575, + "grad_norm": 0.28716458893408375, + "learning_rate": 6.925616367872966e-06, + "loss": 0.4486, + "step": 9910 + }, + { + "epoch": 1.6276558618849177, + "grad_norm": 0.3223186286773691, + "learning_rate": 6.92522134447568e-06, + "loss": 0.4487, + "step": 9911 + }, + { + "epoch": 1.6278200890932606, + "grad_norm": 0.28144093144592025, + "learning_rate": 6.924826294738104e-06, + "loss": 0.4602, + "step": 9912 + }, + { + "epoch": 1.6279843163016032, + "grad_norm": 0.30356735564352616, + "learning_rate": 6.924431218664529e-06, + "loss": 0.4542, + "step": 9913 + }, + { + "epoch": 1.6281485435099459, + "grad_norm": 0.27955424056774203, + "learning_rate": 6.9240361162592466e-06, + "loss": 0.4532, + "step": 9914 + }, + { + "epoch": 1.6283127707182887, + "grad_norm": 0.27703469310058093, + "learning_rate": 6.923640987526547e-06, + "loss": 0.4641, + "step": 9915 + }, + { + "epoch": 1.6284769979266316, + "grad_norm": 0.2741045415309227, + "learning_rate": 6.92324583247072e-06, + "loss": 0.4624, + "step": 9916 + }, + { + "epoch": 1.6286412251349742, + "grad_norm": 0.566318151594307, + "learning_rate": 6.9228506510960595e-06, + "loss": 0.4463, + "step": 9917 + }, + { + "epoch": 1.6288054523433169, + "grad_norm": 0.4132857679172401, + "learning_rate": 6.922455443406858e-06, + "loss": 0.4639, + "step": 9918 + }, + { + "epoch": 1.6289696795516597, + "grad_norm": 0.3322732173475824, + "learning_rate": 6.922060209407407e-06, + "loss": 0.4523, + "step": 9919 + }, + { + "epoch": 1.6291339067600026, + "grad_norm": 0.26147560000120657, + "learning_rate": 6.921664949102e-06, + "loss": 0.432, + "step": 9920 + }, + { + "epoch": 1.6292981339683452, + "grad_norm": 0.26057303179752594, + "learning_rate": 6.921269662494927e-06, + "loss": 0.4612, + "step": 9921 + }, + { + "epoch": 1.6294623611766879, + "grad_norm": 0.40536918485757667, + "learning_rate": 6.9208743495904846e-06, + "loss": 0.4676, + "step": 9922 + }, + { + "epoch": 1.6296265883850307, + "grad_norm": 0.2811103795445873, + "learning_rate": 6.920479010392964e-06, + "loss": 0.4848, + "step": 9923 + }, + { + "epoch": 1.6297908155933736, + "grad_norm": 0.28369942619293775, + "learning_rate": 6.920083644906659e-06, + "loss": 0.4575, + "step": 9924 + }, + { + "epoch": 1.6299550428017162, + "grad_norm": 0.9300496984621315, + "learning_rate": 6.919688253135867e-06, + "loss": 0.4596, + "step": 9925 + }, + { + "epoch": 1.6301192700100589, + "grad_norm": 0.629522730790281, + "learning_rate": 6.919292835084879e-06, + "loss": 0.4461, + "step": 9926 + }, + { + "epoch": 1.6302834972184017, + "grad_norm": 0.2516198118246472, + "learning_rate": 6.91889739075799e-06, + "loss": 0.4709, + "step": 9927 + }, + { + "epoch": 1.6304477244267443, + "grad_norm": 0.3026698120551612, + "learning_rate": 6.918501920159496e-06, + "loss": 0.4531, + "step": 9928 + }, + { + "epoch": 1.6306119516350872, + "grad_norm": 0.26043558936113975, + "learning_rate": 6.9181064232936926e-06, + "loss": 0.454, + "step": 9929 + }, + { + "epoch": 1.6307761788434298, + "grad_norm": 0.2549099028558008, + "learning_rate": 6.917710900164873e-06, + "loss": 0.4673, + "step": 9930 + }, + { + "epoch": 1.6309404060517725, + "grad_norm": 0.3021568552153403, + "learning_rate": 6.917315350777335e-06, + "loss": 0.4532, + "step": 9931 + }, + { + "epoch": 1.6311046332601153, + "grad_norm": 0.3000004873374244, + "learning_rate": 6.916919775135374e-06, + "loss": 0.4772, + "step": 9932 + }, + { + "epoch": 1.6312688604684582, + "grad_norm": 0.32110553303472866, + "learning_rate": 6.916524173243285e-06, + "loss": 0.4524, + "step": 9933 + }, + { + "epoch": 1.6314330876768008, + "grad_norm": 0.31698803998851016, + "learning_rate": 6.916128545105368e-06, + "loss": 0.4602, + "step": 9934 + }, + { + "epoch": 1.6315973148851435, + "grad_norm": 0.28149338690697373, + "learning_rate": 6.915732890725915e-06, + "loss": 0.4535, + "step": 9935 + }, + { + "epoch": 1.6317615420934863, + "grad_norm": 0.3373716032595957, + "learning_rate": 6.91533721010923e-06, + "loss": 0.4603, + "step": 9936 + }, + { + "epoch": 1.6319257693018292, + "grad_norm": 0.2547340712685884, + "learning_rate": 6.914941503259604e-06, + "loss": 0.4513, + "step": 9937 + }, + { + "epoch": 1.6320899965101718, + "grad_norm": 0.2737540154463338, + "learning_rate": 6.9145457701813365e-06, + "loss": 0.4644, + "step": 9938 + }, + { + "epoch": 1.6322542237185145, + "grad_norm": 0.3191457901931209, + "learning_rate": 6.9141500108787284e-06, + "loss": 0.4609, + "step": 9939 + }, + { + "epoch": 1.6324184509268573, + "grad_norm": 0.30132448751628016, + "learning_rate": 6.913754225356075e-06, + "loss": 0.4529, + "step": 9940 + }, + { + "epoch": 1.6325826781352002, + "grad_norm": 0.3397790738337766, + "learning_rate": 6.9133584136176755e-06, + "loss": 0.4414, + "step": 9941 + }, + { + "epoch": 1.6327469053435428, + "grad_norm": 0.294003400100843, + "learning_rate": 6.91296257566783e-06, + "loss": 0.4663, + "step": 9942 + }, + { + "epoch": 1.6329111325518855, + "grad_norm": 0.3010856698778934, + "learning_rate": 6.912566711510836e-06, + "loss": 0.4773, + "step": 9943 + }, + { + "epoch": 1.6330753597602283, + "grad_norm": 0.4470639674793449, + "learning_rate": 6.912170821150994e-06, + "loss": 0.4485, + "step": 9944 + }, + { + "epoch": 1.633239586968571, + "grad_norm": 0.3158933653529828, + "learning_rate": 6.911774904592605e-06, + "loss": 0.4678, + "step": 9945 + }, + { + "epoch": 1.6334038141769138, + "grad_norm": 0.35320107780642485, + "learning_rate": 6.911378961839966e-06, + "loss": 0.4766, + "step": 9946 + }, + { + "epoch": 1.6335680413852565, + "grad_norm": 0.32146417670514194, + "learning_rate": 6.9109829928973794e-06, + "loss": 0.4507, + "step": 9947 + }, + { + "epoch": 1.633732268593599, + "grad_norm": 0.39318528095594146, + "learning_rate": 6.910586997769147e-06, + "loss": 0.4354, + "step": 9948 + }, + { + "epoch": 1.633896495801942, + "grad_norm": 0.31839866812565076, + "learning_rate": 6.910190976459568e-06, + "loss": 0.4575, + "step": 9949 + }, + { + "epoch": 1.6340607230102848, + "grad_norm": 0.4040545593174407, + "learning_rate": 6.9097949289729416e-06, + "loss": 0.4536, + "step": 9950 + }, + { + "epoch": 1.6342249502186275, + "grad_norm": 0.3204970703934227, + "learning_rate": 6.909398855313572e-06, + "loss": 0.4437, + "step": 9951 + }, + { + "epoch": 1.63438917742697, + "grad_norm": 0.35954942581069016, + "learning_rate": 6.90900275548576e-06, + "loss": 0.4657, + "step": 9952 + }, + { + "epoch": 1.634553404635313, + "grad_norm": 0.3009896626505442, + "learning_rate": 6.908606629493809e-06, + "loss": 0.4508, + "step": 9953 + }, + { + "epoch": 1.6347176318436558, + "grad_norm": 0.32257264785454315, + "learning_rate": 6.908210477342019e-06, + "loss": 0.4328, + "step": 9954 + }, + { + "epoch": 1.6348818590519985, + "grad_norm": 0.4503379019831764, + "learning_rate": 6.907814299034695e-06, + "loss": 0.4706, + "step": 9955 + }, + { + "epoch": 1.635046086260341, + "grad_norm": 0.3434774458899971, + "learning_rate": 6.907418094576138e-06, + "loss": 0.4926, + "step": 9956 + }, + { + "epoch": 1.635210313468684, + "grad_norm": 0.4039715002404461, + "learning_rate": 6.9070218639706535e-06, + "loss": 0.4568, + "step": 9957 + }, + { + "epoch": 1.6353745406770268, + "grad_norm": 0.32002629237152624, + "learning_rate": 6.906625607222541e-06, + "loss": 0.4694, + "step": 9958 + }, + { + "epoch": 1.6355387678853694, + "grad_norm": 0.28880005235928163, + "learning_rate": 6.906229324336109e-06, + "loss": 0.4752, + "step": 9959 + }, + { + "epoch": 1.635702995093712, + "grad_norm": 0.300776456819465, + "learning_rate": 6.905833015315657e-06, + "loss": 0.469, + "step": 9960 + }, + { + "epoch": 1.635867222302055, + "grad_norm": 0.3689131755069731, + "learning_rate": 6.905436680165493e-06, + "loss": 0.4483, + "step": 9961 + }, + { + "epoch": 1.6360314495103976, + "grad_norm": 0.34492706183414207, + "learning_rate": 6.905040318889919e-06, + "loss": 0.4574, + "step": 9962 + }, + { + "epoch": 1.6361956767187404, + "grad_norm": 0.3119084180868803, + "learning_rate": 6.904643931493241e-06, + "loss": 0.4612, + "step": 9963 + }, + { + "epoch": 1.636359903927083, + "grad_norm": 0.25489213577479997, + "learning_rate": 6.904247517979764e-06, + "loss": 0.4543, + "step": 9964 + }, + { + "epoch": 1.6365241311354257, + "grad_norm": 0.29549691545907814, + "learning_rate": 6.903851078353795e-06, + "loss": 0.4733, + "step": 9965 + }, + { + "epoch": 1.6366883583437686, + "grad_norm": 0.28335212002696497, + "learning_rate": 6.903454612619636e-06, + "loss": 0.4608, + "step": 9966 + }, + { + "epoch": 1.6368525855521114, + "grad_norm": 0.2993104367804374, + "learning_rate": 6.903058120781597e-06, + "loss": 0.4672, + "step": 9967 + }, + { + "epoch": 1.637016812760454, + "grad_norm": 0.28332822889276027, + "learning_rate": 6.902661602843981e-06, + "loss": 0.4487, + "step": 9968 + }, + { + "epoch": 1.6371810399687967, + "grad_norm": 0.25729560828720943, + "learning_rate": 6.9022650588110985e-06, + "loss": 0.4184, + "step": 9969 + }, + { + "epoch": 1.6373452671771396, + "grad_norm": 0.30264712506892016, + "learning_rate": 6.901868488687251e-06, + "loss": 0.4596, + "step": 9970 + }, + { + "epoch": 1.6375094943854824, + "grad_norm": 0.2672502724121429, + "learning_rate": 6.901471892476751e-06, + "loss": 0.4603, + "step": 9971 + }, + { + "epoch": 1.637673721593825, + "grad_norm": 0.28382573152561147, + "learning_rate": 6.901075270183901e-06, + "loss": 0.4805, + "step": 9972 + }, + { + "epoch": 1.6378379488021677, + "grad_norm": 0.29678729782940755, + "learning_rate": 6.900678621813014e-06, + "loss": 0.4696, + "step": 9973 + }, + { + "epoch": 1.6380021760105106, + "grad_norm": 0.2438202408021163, + "learning_rate": 6.900281947368394e-06, + "loss": 0.4582, + "step": 9974 + }, + { + "epoch": 1.6381664032188534, + "grad_norm": 0.3363567609882665, + "learning_rate": 6.8998852468543495e-06, + "loss": 0.4349, + "step": 9975 + }, + { + "epoch": 1.638330630427196, + "grad_norm": 0.2897556757604139, + "learning_rate": 6.8994885202751905e-06, + "loss": 0.4581, + "step": 9976 + }, + { + "epoch": 1.6384948576355387, + "grad_norm": 0.3406024616786714, + "learning_rate": 6.899091767635226e-06, + "loss": 0.4669, + "step": 9977 + }, + { + "epoch": 1.6386590848438816, + "grad_norm": 0.3571028862398934, + "learning_rate": 6.8986949889387655e-06, + "loss": 0.4622, + "step": 9978 + }, + { + "epoch": 1.6388233120522242, + "grad_norm": 0.28486072032707144, + "learning_rate": 6.898298184190116e-06, + "loss": 0.4584, + "step": 9979 + }, + { + "epoch": 1.638987539260567, + "grad_norm": 0.6487043597141194, + "learning_rate": 6.897901353393588e-06, + "loss": 0.474, + "step": 9980 + }, + { + "epoch": 1.6391517664689097, + "grad_norm": 0.2955606209398538, + "learning_rate": 6.897504496553493e-06, + "loss": 0.4613, + "step": 9981 + }, + { + "epoch": 1.6393159936772523, + "grad_norm": 0.3960711855428685, + "learning_rate": 6.89710761367414e-06, + "loss": 0.4384, + "step": 9982 + }, + { + "epoch": 1.6394802208855952, + "grad_norm": 0.32856200591871165, + "learning_rate": 6.8967107047598405e-06, + "loss": 0.4834, + "step": 9983 + }, + { + "epoch": 1.639644448093938, + "grad_norm": 0.2726115586771324, + "learning_rate": 6.896313769814905e-06, + "loss": 0.4681, + "step": 9984 + }, + { + "epoch": 1.6398086753022807, + "grad_norm": 0.2689421461804761, + "learning_rate": 6.895916808843643e-06, + "loss": 0.4533, + "step": 9985 + }, + { + "epoch": 1.6399729025106233, + "grad_norm": 0.2627988789746831, + "learning_rate": 6.895519821850368e-06, + "loss": 0.4709, + "step": 9986 + }, + { + "epoch": 1.6401371297189662, + "grad_norm": 0.32357716022869204, + "learning_rate": 6.895122808839391e-06, + "loss": 0.4752, + "step": 9987 + }, + { + "epoch": 1.640301356927309, + "grad_norm": 0.35351722543048564, + "learning_rate": 6.894725769815023e-06, + "loss": 0.4596, + "step": 9988 + }, + { + "epoch": 1.6404655841356517, + "grad_norm": 0.30283147058524224, + "learning_rate": 6.894328704781578e-06, + "loss": 0.4602, + "step": 9989 + }, + { + "epoch": 1.6406298113439943, + "grad_norm": 0.32871065213422807, + "learning_rate": 6.893931613743367e-06, + "loss": 0.4398, + "step": 9990 + }, + { + "epoch": 1.6407940385523372, + "grad_norm": 0.2909440710280135, + "learning_rate": 6.893534496704704e-06, + "loss": 0.4562, + "step": 9991 + }, + { + "epoch": 1.64095826576068, + "grad_norm": 0.26730344436772585, + "learning_rate": 6.893137353669899e-06, + "loss": 0.4363, + "step": 9992 + }, + { + "epoch": 1.6411224929690227, + "grad_norm": 0.3766542819585991, + "learning_rate": 6.892740184643271e-06, + "loss": 0.4577, + "step": 9993 + }, + { + "epoch": 1.6412867201773653, + "grad_norm": 0.2845863213745585, + "learning_rate": 6.8923429896291295e-06, + "loss": 0.4547, + "step": 9994 + }, + { + "epoch": 1.6414509473857082, + "grad_norm": 0.28734066949876436, + "learning_rate": 6.891945768631789e-06, + "loss": 0.47, + "step": 9995 + }, + { + "epoch": 1.6416151745940508, + "grad_norm": 0.4035762230136988, + "learning_rate": 6.891548521655563e-06, + "loss": 0.4615, + "step": 9996 + }, + { + "epoch": 1.6417794018023937, + "grad_norm": 0.27005177748526993, + "learning_rate": 6.891151248704769e-06, + "loss": 0.4527, + "step": 9997 + }, + { + "epoch": 1.6419436290107363, + "grad_norm": 1.0453460937427654, + "learning_rate": 6.890753949783719e-06, + "loss": 0.4578, + "step": 9998 + }, + { + "epoch": 1.642107856219079, + "grad_norm": 0.38637735669787354, + "learning_rate": 6.8903566248967296e-06, + "loss": 0.4528, + "step": 9999 + }, + { + "epoch": 1.6422720834274218, + "grad_norm": 0.26584240115120206, + "learning_rate": 6.889959274048115e-06, + "loss": 0.4463, + "step": 10000 + }, + { + "epoch": 1.6424363106357647, + "grad_norm": 0.390070342324435, + "learning_rate": 6.889561897242191e-06, + "loss": 0.4599, + "step": 10001 + }, + { + "epoch": 1.6426005378441073, + "grad_norm": 0.24050547042083945, + "learning_rate": 6.889164494483274e-06, + "loss": 0.4414, + "step": 10002 + }, + { + "epoch": 1.64276476505245, + "grad_norm": 0.2653471004908911, + "learning_rate": 6.888767065775681e-06, + "loss": 0.4624, + "step": 10003 + }, + { + "epoch": 1.6429289922607928, + "grad_norm": 0.2749182908648262, + "learning_rate": 6.888369611123726e-06, + "loss": 0.4613, + "step": 10004 + }, + { + "epoch": 1.6430932194691357, + "grad_norm": 0.3815791887017438, + "learning_rate": 6.887972130531728e-06, + "loss": 0.445, + "step": 10005 + }, + { + "epoch": 1.6432574466774783, + "grad_norm": 0.3873361429992468, + "learning_rate": 6.887574624004002e-06, + "loss": 0.4693, + "step": 10006 + }, + { + "epoch": 1.643421673885821, + "grad_norm": 0.40267621321284247, + "learning_rate": 6.8871770915448666e-06, + "loss": 0.4621, + "step": 10007 + }, + { + "epoch": 1.6435859010941638, + "grad_norm": 0.2863259481455292, + "learning_rate": 6.886779533158642e-06, + "loss": 0.4698, + "step": 10008 + }, + { + "epoch": 1.6437501283025067, + "grad_norm": 0.3518634637082607, + "learning_rate": 6.88638194884964e-06, + "loss": 0.4403, + "step": 10009 + }, + { + "epoch": 1.6439143555108493, + "grad_norm": 0.2604027741918767, + "learning_rate": 6.885984338622183e-06, + "loss": 0.4512, + "step": 10010 + }, + { + "epoch": 1.644078582719192, + "grad_norm": 0.28233015945831985, + "learning_rate": 6.8855867024805904e-06, + "loss": 0.4521, + "step": 10011 + }, + { + "epoch": 1.6442428099275348, + "grad_norm": 0.3275936381929935, + "learning_rate": 6.885189040429179e-06, + "loss": 0.4734, + "step": 10012 + }, + { + "epoch": 1.6444070371358774, + "grad_norm": 0.2791456316545117, + "learning_rate": 6.884791352472266e-06, + "loss": 0.4579, + "step": 10013 + }, + { + "epoch": 1.6445712643442203, + "grad_norm": 0.32228759885779773, + "learning_rate": 6.884393638614173e-06, + "loss": 0.4712, + "step": 10014 + }, + { + "epoch": 1.644735491552563, + "grad_norm": 0.43417519786081893, + "learning_rate": 6.88399589885922e-06, + "loss": 0.4557, + "step": 10015 + }, + { + "epoch": 1.6448997187609056, + "grad_norm": 0.36224933578030566, + "learning_rate": 6.8835981332117256e-06, + "loss": 0.4663, + "step": 10016 + }, + { + "epoch": 1.6450639459692484, + "grad_norm": 0.4044672096369628, + "learning_rate": 6.883200341676011e-06, + "loss": 0.4646, + "step": 10017 + }, + { + "epoch": 1.6452281731775913, + "grad_norm": 0.5066739693604935, + "learning_rate": 6.882802524256395e-06, + "loss": 0.4484, + "step": 10018 + }, + { + "epoch": 1.645392400385934, + "grad_norm": 0.31496716290507054, + "learning_rate": 6.8824046809571985e-06, + "loss": 0.4647, + "step": 10019 + }, + { + "epoch": 1.6455566275942766, + "grad_norm": 0.28390866682913946, + "learning_rate": 6.882006811782745e-06, + "loss": 0.4529, + "step": 10020 + }, + { + "epoch": 1.6457208548026194, + "grad_norm": 0.3395815099915771, + "learning_rate": 6.881608916737352e-06, + "loss": 0.4805, + "step": 10021 + }, + { + "epoch": 1.6458850820109623, + "grad_norm": 0.4099494282068759, + "learning_rate": 6.881210995825344e-06, + "loss": 0.4478, + "step": 10022 + }, + { + "epoch": 1.646049309219305, + "grad_norm": 0.30184948710719806, + "learning_rate": 6.880813049051043e-06, + "loss": 0.45, + "step": 10023 + }, + { + "epoch": 1.6462135364276476, + "grad_norm": 0.38529826858875593, + "learning_rate": 6.880415076418768e-06, + "loss": 0.4433, + "step": 10024 + }, + { + "epoch": 1.6463777636359904, + "grad_norm": 0.3318275138640135, + "learning_rate": 6.880017077932844e-06, + "loss": 0.4388, + "step": 10025 + }, + { + "epoch": 1.6465419908443333, + "grad_norm": 0.5649606434374844, + "learning_rate": 6.879619053597593e-06, + "loss": 0.4452, + "step": 10026 + }, + { + "epoch": 1.646706218052676, + "grad_norm": 0.30558762969639924, + "learning_rate": 6.879221003417338e-06, + "loss": 0.4533, + "step": 10027 + }, + { + "epoch": 1.6468704452610186, + "grad_norm": 0.3017474883745442, + "learning_rate": 6.878822927396402e-06, + "loss": 0.4363, + "step": 10028 + }, + { + "epoch": 1.6470346724693614, + "grad_norm": 0.3065943760906928, + "learning_rate": 6.878424825539108e-06, + "loss": 0.4582, + "step": 10029 + }, + { + "epoch": 1.647198899677704, + "grad_norm": 0.33933242129544633, + "learning_rate": 6.8780266978497805e-06, + "loss": 0.4664, + "step": 10030 + }, + { + "epoch": 1.647363126886047, + "grad_norm": 0.6614223853975514, + "learning_rate": 6.877628544332744e-06, + "loss": 0.4433, + "step": 10031 + }, + { + "epoch": 1.6475273540943896, + "grad_norm": 0.3048403727802867, + "learning_rate": 6.877230364992322e-06, + "loss": 0.4547, + "step": 10032 + }, + { + "epoch": 1.6476915813027322, + "grad_norm": 0.29375069237212753, + "learning_rate": 6.87683215983284e-06, + "loss": 0.4735, + "step": 10033 + }, + { + "epoch": 1.647855808511075, + "grad_norm": 0.2877086753406038, + "learning_rate": 6.876433928858621e-06, + "loss": 0.4635, + "step": 10034 + }, + { + "epoch": 1.648020035719418, + "grad_norm": 0.3292038999675611, + "learning_rate": 6.8760356720739906e-06, + "loss": 0.4748, + "step": 10035 + }, + { + "epoch": 1.6481842629277605, + "grad_norm": 0.28454457736434846, + "learning_rate": 6.875637389483278e-06, + "loss": 0.4575, + "step": 10036 + }, + { + "epoch": 1.6483484901361032, + "grad_norm": 0.46952622537098326, + "learning_rate": 6.875239081090805e-06, + "loss": 0.4685, + "step": 10037 + }, + { + "epoch": 1.648512717344446, + "grad_norm": 0.48947938405660096, + "learning_rate": 6.8748407469008975e-06, + "loss": 0.4523, + "step": 10038 + }, + { + "epoch": 1.648676944552789, + "grad_norm": 0.296064157512238, + "learning_rate": 6.874442386917883e-06, + "loss": 0.4504, + "step": 10039 + }, + { + "epoch": 1.6488411717611315, + "grad_norm": 0.29222940294924704, + "learning_rate": 6.87404400114609e-06, + "loss": 0.463, + "step": 10040 + }, + { + "epoch": 1.6490053989694742, + "grad_norm": 0.3095877404451654, + "learning_rate": 6.873645589589842e-06, + "loss": 0.4656, + "step": 10041 + }, + { + "epoch": 1.649169626177817, + "grad_norm": 0.6592237090493654, + "learning_rate": 6.8732471522534675e-06, + "loss": 0.4323, + "step": 10042 + }, + { + "epoch": 1.64933385338616, + "grad_norm": 0.36761926537069284, + "learning_rate": 6.872848689141294e-06, + "loss": 0.4479, + "step": 10043 + }, + { + "epoch": 1.6494980805945025, + "grad_norm": 0.4189296399366135, + "learning_rate": 6.872450200257648e-06, + "loss": 0.4572, + "step": 10044 + }, + { + "epoch": 1.6496623078028452, + "grad_norm": 0.3915907380874519, + "learning_rate": 6.872051685606861e-06, + "loss": 0.4438, + "step": 10045 + }, + { + "epoch": 1.649826535011188, + "grad_norm": 0.7110489512948105, + "learning_rate": 6.871653145193258e-06, + "loss": 0.4649, + "step": 10046 + }, + { + "epoch": 1.6499907622195307, + "grad_norm": 0.34437855980356424, + "learning_rate": 6.871254579021168e-06, + "loss": 0.4823, + "step": 10047 + }, + { + "epoch": 1.6501549894278735, + "grad_norm": 0.30378428528681245, + "learning_rate": 6.8708559870949205e-06, + "loss": 0.4661, + "step": 10048 + }, + { + "epoch": 1.6503192166362162, + "grad_norm": 0.3439466847782822, + "learning_rate": 6.8704573694188455e-06, + "loss": 0.4542, + "step": 10049 + }, + { + "epoch": 1.6504834438445588, + "grad_norm": 0.27429610500978663, + "learning_rate": 6.870058725997269e-06, + "loss": 0.4437, + "step": 10050 + }, + { + "epoch": 1.6506476710529017, + "grad_norm": 0.3041048998211635, + "learning_rate": 6.8696600568345235e-06, + "loss": 0.4444, + "step": 10051 + }, + { + "epoch": 1.6508118982612445, + "grad_norm": 0.3819352975581534, + "learning_rate": 6.869261361934939e-06, + "loss": 0.4556, + "step": 10052 + }, + { + "epoch": 1.6509761254695872, + "grad_norm": 0.3133404616783929, + "learning_rate": 6.8688626413028455e-06, + "loss": 0.4369, + "step": 10053 + }, + { + "epoch": 1.6511403526779298, + "grad_norm": 0.30859949689052946, + "learning_rate": 6.868463894942572e-06, + "loss": 0.4505, + "step": 10054 + }, + { + "epoch": 1.6513045798862727, + "grad_norm": 0.2569151609312324, + "learning_rate": 6.868065122858452e-06, + "loss": 0.4494, + "step": 10055 + }, + { + "epoch": 1.6514688070946155, + "grad_norm": 0.29726416618644635, + "learning_rate": 6.867666325054813e-06, + "loss": 0.4643, + "step": 10056 + }, + { + "epoch": 1.6516330343029582, + "grad_norm": 0.30981990269827364, + "learning_rate": 6.86726750153599e-06, + "loss": 0.4526, + "step": 10057 + }, + { + "epoch": 1.6517972615113008, + "grad_norm": 0.3135006032323148, + "learning_rate": 6.866868652306312e-06, + "loss": 0.4547, + "step": 10058 + }, + { + "epoch": 1.6519614887196437, + "grad_norm": 0.31218503934827935, + "learning_rate": 6.866469777370111e-06, + "loss": 0.4615, + "step": 10059 + }, + { + "epoch": 1.6521257159279865, + "grad_norm": 0.28950250972669206, + "learning_rate": 6.8660708767317204e-06, + "loss": 0.4606, + "step": 10060 + }, + { + "epoch": 1.6522899431363292, + "grad_norm": 0.32905922986746405, + "learning_rate": 6.865671950395474e-06, + "loss": 0.4595, + "step": 10061 + }, + { + "epoch": 1.6524541703446718, + "grad_norm": 0.28969266444342057, + "learning_rate": 6.8652729983656995e-06, + "loss": 0.4603, + "step": 10062 + }, + { + "epoch": 1.6526183975530147, + "grad_norm": 0.39683467383680576, + "learning_rate": 6.8648740206467345e-06, + "loss": 0.4676, + "step": 10063 + }, + { + "epoch": 1.6527826247613573, + "grad_norm": 0.5331896140060073, + "learning_rate": 6.864475017242911e-06, + "loss": 0.4477, + "step": 10064 + }, + { + "epoch": 1.6529468519697001, + "grad_norm": 0.2953487815232206, + "learning_rate": 6.8640759881585635e-06, + "loss": 0.4531, + "step": 10065 + }, + { + "epoch": 1.6531110791780428, + "grad_norm": 0.3145480336655805, + "learning_rate": 6.863676933398024e-06, + "loss": 0.4483, + "step": 10066 + }, + { + "epoch": 1.6532753063863854, + "grad_norm": 0.27112246157431824, + "learning_rate": 6.863277852965627e-06, + "loss": 0.4821, + "step": 10067 + }, + { + "epoch": 1.6534395335947283, + "grad_norm": 0.33070118687121697, + "learning_rate": 6.862878746865708e-06, + "loss": 0.47, + "step": 10068 + }, + { + "epoch": 1.6536037608030711, + "grad_norm": 0.3125265983720962, + "learning_rate": 6.8624796151026e-06, + "loss": 0.4612, + "step": 10069 + }, + { + "epoch": 1.6537679880114138, + "grad_norm": 0.3305711511682732, + "learning_rate": 6.862080457680641e-06, + "loss": 0.4693, + "step": 10070 + }, + { + "epoch": 1.6539322152197564, + "grad_norm": 0.29359255539804346, + "learning_rate": 6.861681274604163e-06, + "loss": 0.4603, + "step": 10071 + }, + { + "epoch": 1.6540964424280993, + "grad_norm": 0.3449893242439603, + "learning_rate": 6.861282065877503e-06, + "loss": 0.4705, + "step": 10072 + }, + { + "epoch": 1.6542606696364421, + "grad_norm": 0.44332745359425263, + "learning_rate": 6.860882831504996e-06, + "loss": 0.4621, + "step": 10073 + }, + { + "epoch": 1.6544248968447848, + "grad_norm": 0.3169582268764672, + "learning_rate": 6.86048357149098e-06, + "loss": 0.4606, + "step": 10074 + }, + { + "epoch": 1.6545891240531274, + "grad_norm": 0.2812684006764611, + "learning_rate": 6.860084285839787e-06, + "loss": 0.4624, + "step": 10075 + }, + { + "epoch": 1.6547533512614703, + "grad_norm": 0.3030134510583051, + "learning_rate": 6.85968497455576e-06, + "loss": 0.4628, + "step": 10076 + }, + { + "epoch": 1.6549175784698131, + "grad_norm": 0.5621866248997615, + "learning_rate": 6.859285637643231e-06, + "loss": 0.4475, + "step": 10077 + }, + { + "epoch": 1.6550818056781558, + "grad_norm": 0.2846808009420695, + "learning_rate": 6.85888627510654e-06, + "loss": 0.4467, + "step": 10078 + }, + { + "epoch": 1.6552460328864984, + "grad_norm": 0.5413683398262233, + "learning_rate": 6.858486886950022e-06, + "loss": 0.4525, + "step": 10079 + }, + { + "epoch": 1.6554102600948413, + "grad_norm": 0.26694224344114137, + "learning_rate": 6.858087473178015e-06, + "loss": 0.4613, + "step": 10080 + }, + { + "epoch": 1.655574487303184, + "grad_norm": 0.31430274465533675, + "learning_rate": 6.85768803379486e-06, + "loss": 0.4487, + "step": 10081 + }, + { + "epoch": 1.6557387145115268, + "grad_norm": 0.3048388379627296, + "learning_rate": 6.8572885688048935e-06, + "loss": 0.4686, + "step": 10082 + }, + { + "epoch": 1.6559029417198694, + "grad_norm": 0.4029978714207957, + "learning_rate": 6.856889078212452e-06, + "loss": 0.4556, + "step": 10083 + }, + { + "epoch": 1.656067168928212, + "grad_norm": 0.2713347612783681, + "learning_rate": 6.856489562021877e-06, + "loss": 0.4522, + "step": 10084 + }, + { + "epoch": 1.656231396136555, + "grad_norm": 0.3227466412547536, + "learning_rate": 6.856090020237507e-06, + "loss": 0.4526, + "step": 10085 + }, + { + "epoch": 1.6563956233448978, + "grad_norm": 0.30791311562056967, + "learning_rate": 6.855690452863681e-06, + "loss": 0.4426, + "step": 10086 + }, + { + "epoch": 1.6565598505532404, + "grad_norm": 0.28536385334132947, + "learning_rate": 6.85529085990474e-06, + "loss": 0.4503, + "step": 10087 + }, + { + "epoch": 1.656724077761583, + "grad_norm": 0.3003422483466558, + "learning_rate": 6.854891241365023e-06, + "loss": 0.484, + "step": 10088 + }, + { + "epoch": 1.656888304969926, + "grad_norm": 0.3026878855756908, + "learning_rate": 6.8544915972488685e-06, + "loss": 0.4474, + "step": 10089 + }, + { + "epoch": 1.6570525321782688, + "grad_norm": 0.2629547019528613, + "learning_rate": 6.85409192756062e-06, + "loss": 0.4556, + "step": 10090 + }, + { + "epoch": 1.6572167593866114, + "grad_norm": 0.5892721961165369, + "learning_rate": 6.8536922323046175e-06, + "loss": 0.4635, + "step": 10091 + }, + { + "epoch": 1.657380986594954, + "grad_norm": 0.31233894549114827, + "learning_rate": 6.8532925114852005e-06, + "loss": 0.4504, + "step": 10092 + }, + { + "epoch": 1.657545213803297, + "grad_norm": 0.9927882559085216, + "learning_rate": 6.852892765106712e-06, + "loss": 0.4446, + "step": 10093 + }, + { + "epoch": 1.6577094410116398, + "grad_norm": 0.27389626760484753, + "learning_rate": 6.852492993173493e-06, + "loss": 0.4438, + "step": 10094 + }, + { + "epoch": 1.6578736682199824, + "grad_norm": 0.28999814533998897, + "learning_rate": 6.852093195689886e-06, + "loss": 0.4749, + "step": 10095 + }, + { + "epoch": 1.658037895428325, + "grad_norm": 0.28344610002066106, + "learning_rate": 6.851693372660232e-06, + "loss": 0.4621, + "step": 10096 + }, + { + "epoch": 1.6582021226366677, + "grad_norm": 0.29129088168434936, + "learning_rate": 6.851293524088875e-06, + "loss": 0.4675, + "step": 10097 + }, + { + "epoch": 1.6583663498450105, + "grad_norm": 0.26928559045198974, + "learning_rate": 6.850893649980156e-06, + "loss": 0.4559, + "step": 10098 + }, + { + "epoch": 1.6585305770533534, + "grad_norm": 0.3608111549504681, + "learning_rate": 6.85049375033842e-06, + "loss": 0.4795, + "step": 10099 + }, + { + "epoch": 1.658694804261696, + "grad_norm": 0.31003894818762734, + "learning_rate": 6.850093825168009e-06, + "loss": 0.4503, + "step": 10100 + }, + { + "epoch": 1.6588590314700387, + "grad_norm": 0.2982802514934513, + "learning_rate": 6.849693874473266e-06, + "loss": 0.4603, + "step": 10101 + }, + { + "epoch": 1.6590232586783815, + "grad_norm": 0.36819864548273795, + "learning_rate": 6.849293898258537e-06, + "loss": 0.4573, + "step": 10102 + }, + { + "epoch": 1.6591874858867244, + "grad_norm": 0.3051585703764991, + "learning_rate": 6.8488938965281645e-06, + "loss": 0.4663, + "step": 10103 + }, + { + "epoch": 1.659351713095067, + "grad_norm": 0.33465803831927327, + "learning_rate": 6.848493869286493e-06, + "loss": 0.4502, + "step": 10104 + }, + { + "epoch": 1.6595159403034097, + "grad_norm": 0.38292256300891603, + "learning_rate": 6.848093816537868e-06, + "loss": 0.455, + "step": 10105 + }, + { + "epoch": 1.6596801675117525, + "grad_norm": 0.4288122393071528, + "learning_rate": 6.847693738286633e-06, + "loss": 0.4597, + "step": 10106 + }, + { + "epoch": 1.6598443947200954, + "grad_norm": 0.24811801866347535, + "learning_rate": 6.847293634537135e-06, + "loss": 0.4505, + "step": 10107 + }, + { + "epoch": 1.660008621928438, + "grad_norm": 0.3088836539894647, + "learning_rate": 6.846893505293719e-06, + "loss": 0.4509, + "step": 10108 + }, + { + "epoch": 1.6601728491367806, + "grad_norm": 0.3003313846361625, + "learning_rate": 6.846493350560729e-06, + "loss": 0.4523, + "step": 10109 + }, + { + "epoch": 1.6603370763451235, + "grad_norm": 0.5284141833491072, + "learning_rate": 6.846093170342515e-06, + "loss": 0.4548, + "step": 10110 + }, + { + "epoch": 1.6605013035534664, + "grad_norm": 0.3918097946018369, + "learning_rate": 6.84569296464342e-06, + "loss": 0.4502, + "step": 10111 + }, + { + "epoch": 1.660665530761809, + "grad_norm": 0.3162058034183177, + "learning_rate": 6.84529273346779e-06, + "loss": 0.4537, + "step": 10112 + }, + { + "epoch": 1.6608297579701516, + "grad_norm": 0.3712791403292991, + "learning_rate": 6.844892476819973e-06, + "loss": 0.4753, + "step": 10113 + }, + { + "epoch": 1.6609939851784943, + "grad_norm": 0.2919157998372654, + "learning_rate": 6.84449219470432e-06, + "loss": 0.4555, + "step": 10114 + }, + { + "epoch": 1.6611582123868371, + "grad_norm": 0.3444696305060915, + "learning_rate": 6.844091887125172e-06, + "loss": 0.4596, + "step": 10115 + }, + { + "epoch": 1.66132243959518, + "grad_norm": 0.3506674472551571, + "learning_rate": 6.843691554086882e-06, + "loss": 0.4696, + "step": 10116 + }, + { + "epoch": 1.6614866668035226, + "grad_norm": 0.3444951783000774, + "learning_rate": 6.8432911955937935e-06, + "loss": 0.4573, + "step": 10117 + }, + { + "epoch": 1.6616508940118653, + "grad_norm": 0.4158734419460173, + "learning_rate": 6.8428908116502595e-06, + "loss": 0.4593, + "step": 10118 + }, + { + "epoch": 1.6618151212202081, + "grad_norm": 0.30382622051819197, + "learning_rate": 6.842490402260625e-06, + "loss": 0.4731, + "step": 10119 + }, + { + "epoch": 1.661979348428551, + "grad_norm": 0.2730468701565896, + "learning_rate": 6.8420899674292405e-06, + "loss": 0.4655, + "step": 10120 + }, + { + "epoch": 1.6621435756368936, + "grad_norm": 0.3113059653283631, + "learning_rate": 6.8416895071604545e-06, + "loss": 0.4715, + "step": 10121 + }, + { + "epoch": 1.6623078028452363, + "grad_norm": 0.4526111377968724, + "learning_rate": 6.841289021458617e-06, + "loss": 0.4748, + "step": 10122 + }, + { + "epoch": 1.6624720300535791, + "grad_norm": 0.3076876962554403, + "learning_rate": 6.840888510328075e-06, + "loss": 0.4424, + "step": 10123 + }, + { + "epoch": 1.662636257261922, + "grad_norm": 0.4323855586209853, + "learning_rate": 6.840487973773183e-06, + "loss": 0.4513, + "step": 10124 + }, + { + "epoch": 1.6628004844702646, + "grad_norm": 0.2948138042150791, + "learning_rate": 6.840087411798289e-06, + "loss": 0.4572, + "step": 10125 + }, + { + "epoch": 1.6629647116786073, + "grad_norm": 0.3235230615740369, + "learning_rate": 6.839686824407742e-06, + "loss": 0.4566, + "step": 10126 + }, + { + "epoch": 1.6631289388869501, + "grad_norm": 0.35818271322707407, + "learning_rate": 6.8392862116058945e-06, + "loss": 0.4557, + "step": 10127 + }, + { + "epoch": 1.663293166095293, + "grad_norm": 0.28832099909946796, + "learning_rate": 6.8388855733970975e-06, + "loss": 0.464, + "step": 10128 + }, + { + "epoch": 1.6634573933036356, + "grad_norm": 0.32200541250106973, + "learning_rate": 6.838484909785702e-06, + "loss": 0.4468, + "step": 10129 + }, + { + "epoch": 1.6636216205119783, + "grad_norm": 0.2759151795686372, + "learning_rate": 6.838084220776061e-06, + "loss": 0.4549, + "step": 10130 + }, + { + "epoch": 1.663785847720321, + "grad_norm": 0.38764451430397096, + "learning_rate": 6.8376835063725216e-06, + "loss": 0.4591, + "step": 10131 + }, + { + "epoch": 1.6639500749286638, + "grad_norm": 0.3689015443052184, + "learning_rate": 6.837282766579442e-06, + "loss": 0.4512, + "step": 10132 + }, + { + "epoch": 1.6641143021370066, + "grad_norm": 0.3136526314101458, + "learning_rate": 6.836882001401171e-06, + "loss": 0.4673, + "step": 10133 + }, + { + "epoch": 1.6642785293453493, + "grad_norm": 0.29260575496034674, + "learning_rate": 6.836481210842064e-06, + "loss": 0.4675, + "step": 10134 + }, + { + "epoch": 1.664442756553692, + "grad_norm": 0.3097745397723282, + "learning_rate": 6.83608039490647e-06, + "loss": 0.4534, + "step": 10135 + }, + { + "epoch": 1.6646069837620348, + "grad_norm": 0.2974482169941179, + "learning_rate": 6.835679553598746e-06, + "loss": 0.4403, + "step": 10136 + }, + { + "epoch": 1.6647712109703776, + "grad_norm": 0.5267663399483836, + "learning_rate": 6.835278686923242e-06, + "loss": 0.4649, + "step": 10137 + }, + { + "epoch": 1.6649354381787203, + "grad_norm": 0.28156728840426715, + "learning_rate": 6.834877794884314e-06, + "loss": 0.4638, + "step": 10138 + }, + { + "epoch": 1.6650996653870629, + "grad_norm": 0.44869677574882944, + "learning_rate": 6.834476877486318e-06, + "loss": 0.453, + "step": 10139 + }, + { + "epoch": 1.6652638925954057, + "grad_norm": 0.30123810086379144, + "learning_rate": 6.8340759347336056e-06, + "loss": 0.4601, + "step": 10140 + }, + { + "epoch": 1.6654281198037486, + "grad_norm": 0.5795294618527556, + "learning_rate": 6.833674966630533e-06, + "loss": 0.4615, + "step": 10141 + }, + { + "epoch": 1.6655923470120912, + "grad_norm": 0.40046742999378, + "learning_rate": 6.833273973181453e-06, + "loss": 0.47, + "step": 10142 + }, + { + "epoch": 1.6657565742204339, + "grad_norm": 0.2708138308319611, + "learning_rate": 6.832872954390723e-06, + "loss": 0.4649, + "step": 10143 + }, + { + "epoch": 1.6659208014287767, + "grad_norm": 0.4570428523808035, + "learning_rate": 6.832471910262696e-06, + "loss": 0.4627, + "step": 10144 + }, + { + "epoch": 1.6660850286371196, + "grad_norm": 0.4293640775402717, + "learning_rate": 6.8320708408017305e-06, + "loss": 0.4579, + "step": 10145 + }, + { + "epoch": 1.6662492558454622, + "grad_norm": 0.2685026647291335, + "learning_rate": 6.83166974601218e-06, + "loss": 0.4509, + "step": 10146 + }, + { + "epoch": 1.6664134830538049, + "grad_norm": 0.31228397957090265, + "learning_rate": 6.831268625898402e-06, + "loss": 0.479, + "step": 10147 + }, + { + "epoch": 1.6665777102621475, + "grad_norm": 0.3137721705986118, + "learning_rate": 6.830867480464754e-06, + "loss": 0.4437, + "step": 10148 + }, + { + "epoch": 1.6667419374704904, + "grad_norm": 0.5708636843935152, + "learning_rate": 6.830466309715593e-06, + "loss": 0.4512, + "step": 10149 + }, + { + "epoch": 1.6669061646788332, + "grad_norm": 0.2653212103007819, + "learning_rate": 6.830065113655272e-06, + "loss": 0.4496, + "step": 10150 + }, + { + "epoch": 1.6670703918871759, + "grad_norm": 0.7572840674435325, + "learning_rate": 6.829663892288155e-06, + "loss": 0.4401, + "step": 10151 + }, + { + "epoch": 1.6672346190955185, + "grad_norm": 0.425478297583331, + "learning_rate": 6.829262645618592e-06, + "loss": 0.4681, + "step": 10152 + }, + { + "epoch": 1.6673988463038614, + "grad_norm": 0.2559404822460795, + "learning_rate": 6.8288613736509485e-06, + "loss": 0.4491, + "step": 10153 + }, + { + "epoch": 1.6675630735122042, + "grad_norm": 0.2704039864595664, + "learning_rate": 6.828460076389577e-06, + "loss": 0.4651, + "step": 10154 + }, + { + "epoch": 1.6677273007205469, + "grad_norm": 0.29909003380134896, + "learning_rate": 6.82805875383884e-06, + "loss": 0.4391, + "step": 10155 + }, + { + "epoch": 1.6678915279288895, + "grad_norm": 0.3071232266044963, + "learning_rate": 6.827657406003092e-06, + "loss": 0.4543, + "step": 10156 + }, + { + "epoch": 1.6680557551372324, + "grad_norm": 0.33138094181705335, + "learning_rate": 6.8272560328866965e-06, + "loss": 0.4758, + "step": 10157 + }, + { + "epoch": 1.6682199823455752, + "grad_norm": 0.7843415418694715, + "learning_rate": 6.826854634494011e-06, + "loss": 0.4437, + "step": 10158 + }, + { + "epoch": 1.6683842095539179, + "grad_norm": 0.2925059010158654, + "learning_rate": 6.8264532108293936e-06, + "loss": 0.4652, + "step": 10159 + }, + { + "epoch": 1.6685484367622605, + "grad_norm": 0.31203921925175077, + "learning_rate": 6.826051761897205e-06, + "loss": 0.4407, + "step": 10160 + }, + { + "epoch": 1.6687126639706034, + "grad_norm": 0.3378418281840202, + "learning_rate": 6.825650287701807e-06, + "loss": 0.4396, + "step": 10161 + }, + { + "epoch": 1.6688768911789462, + "grad_norm": 0.6728722464644683, + "learning_rate": 6.825248788247557e-06, + "loss": 0.4732, + "step": 10162 + }, + { + "epoch": 1.6690411183872889, + "grad_norm": 0.4629013914353093, + "learning_rate": 6.82484726353882e-06, + "loss": 0.4691, + "step": 10163 + }, + { + "epoch": 1.6692053455956315, + "grad_norm": 0.31808519111750916, + "learning_rate": 6.824445713579954e-06, + "loss": 0.4594, + "step": 10164 + }, + { + "epoch": 1.6693695728039741, + "grad_norm": 0.2995549769221818, + "learning_rate": 6.824044138375318e-06, + "loss": 0.4584, + "step": 10165 + }, + { + "epoch": 1.669533800012317, + "grad_norm": 1.8078725563334275, + "learning_rate": 6.823642537929278e-06, + "loss": 0.4445, + "step": 10166 + }, + { + "epoch": 1.6696980272206599, + "grad_norm": 0.2907891043954342, + "learning_rate": 6.823240912246193e-06, + "loss": 0.4537, + "step": 10167 + }, + { + "epoch": 1.6698622544290025, + "grad_norm": 0.5084326002415936, + "learning_rate": 6.8228392613304285e-06, + "loss": 0.4627, + "step": 10168 + }, + { + "epoch": 1.6700264816373451, + "grad_norm": 0.29511217662791017, + "learning_rate": 6.822437585186341e-06, + "loss": 0.4779, + "step": 10169 + }, + { + "epoch": 1.670190708845688, + "grad_norm": 0.2693223126141609, + "learning_rate": 6.822035883818299e-06, + "loss": 0.4466, + "step": 10170 + }, + { + "epoch": 1.6703549360540308, + "grad_norm": 0.3933448320738575, + "learning_rate": 6.821634157230661e-06, + "loss": 0.4772, + "step": 10171 + }, + { + "epoch": 1.6705191632623735, + "grad_norm": 0.2942491456962361, + "learning_rate": 6.821232405427791e-06, + "loss": 0.452, + "step": 10172 + }, + { + "epoch": 1.6706833904707161, + "grad_norm": 0.29925065753639407, + "learning_rate": 6.820830628414056e-06, + "loss": 0.4626, + "step": 10173 + }, + { + "epoch": 1.670847617679059, + "grad_norm": 0.3433489098652025, + "learning_rate": 6.820428826193816e-06, + "loss": 0.4565, + "step": 10174 + }, + { + "epoch": 1.6710118448874018, + "grad_norm": 0.297289729417751, + "learning_rate": 6.820026998771435e-06, + "loss": 0.468, + "step": 10175 + }, + { + "epoch": 1.6711760720957445, + "grad_norm": 0.2536699644456774, + "learning_rate": 6.819625146151278e-06, + "loss": 0.4534, + "step": 10176 + }, + { + "epoch": 1.6713402993040871, + "grad_norm": 0.25733154085588766, + "learning_rate": 6.81922326833771e-06, + "loss": 0.4698, + "step": 10177 + }, + { + "epoch": 1.67150452651243, + "grad_norm": 0.7738151356879273, + "learning_rate": 6.818821365335097e-06, + "loss": 0.4686, + "step": 10178 + }, + { + "epoch": 1.6716687537207728, + "grad_norm": 0.2918725037566939, + "learning_rate": 6.818419437147802e-06, + "loss": 0.4822, + "step": 10179 + }, + { + "epoch": 1.6718329809291155, + "grad_norm": 0.3340710811833185, + "learning_rate": 6.81801748378019e-06, + "loss": 0.4595, + "step": 10180 + }, + { + "epoch": 1.6719972081374581, + "grad_norm": 0.30776634460261865, + "learning_rate": 6.817615505236627e-06, + "loss": 0.4676, + "step": 10181 + }, + { + "epoch": 1.6721614353458008, + "grad_norm": 0.2942163724177741, + "learning_rate": 6.81721350152148e-06, + "loss": 0.4607, + "step": 10182 + }, + { + "epoch": 1.6723256625541436, + "grad_norm": 0.6801375014109976, + "learning_rate": 6.816811472639116e-06, + "loss": 0.4689, + "step": 10183 + }, + { + "epoch": 1.6724898897624865, + "grad_norm": 0.31395840415757875, + "learning_rate": 6.8164094185938986e-06, + "loss": 0.4637, + "step": 10184 + }, + { + "epoch": 1.672654116970829, + "grad_norm": 0.3058483374593646, + "learning_rate": 6.816007339390195e-06, + "loss": 0.4504, + "step": 10185 + }, + { + "epoch": 1.6728183441791717, + "grad_norm": 0.31385256374761467, + "learning_rate": 6.815605235032374e-06, + "loss": 0.4518, + "step": 10186 + }, + { + "epoch": 1.6729825713875146, + "grad_norm": 0.29066102734867516, + "learning_rate": 6.815203105524803e-06, + "loss": 0.4303, + "step": 10187 + }, + { + "epoch": 1.6731467985958575, + "grad_norm": 0.3231898610109705, + "learning_rate": 6.814800950871848e-06, + "loss": 0.4578, + "step": 10188 + }, + { + "epoch": 1.6733110258042, + "grad_norm": 0.305140836240839, + "learning_rate": 6.8143987710778764e-06, + "loss": 0.4656, + "step": 10189 + }, + { + "epoch": 1.6734752530125427, + "grad_norm": 0.301483387719651, + "learning_rate": 6.813996566147257e-06, + "loss": 0.4879, + "step": 10190 + }, + { + "epoch": 1.6736394802208856, + "grad_norm": 1.2463700741489872, + "learning_rate": 6.813594336084359e-06, + "loss": 0.4543, + "step": 10191 + }, + { + "epoch": 1.6738037074292285, + "grad_norm": 0.39223622977769473, + "learning_rate": 6.81319208089355e-06, + "loss": 0.4681, + "step": 10192 + }, + { + "epoch": 1.673967934637571, + "grad_norm": 0.3176465043128236, + "learning_rate": 6.812789800579198e-06, + "loss": 0.4456, + "step": 10193 + }, + { + "epoch": 1.6741321618459137, + "grad_norm": 0.2903606994722687, + "learning_rate": 6.812387495145675e-06, + "loss": 0.4687, + "step": 10194 + }, + { + "epoch": 1.6742963890542566, + "grad_norm": 0.3094943548301401, + "learning_rate": 6.811985164597348e-06, + "loss": 0.4565, + "step": 10195 + }, + { + "epoch": 1.6744606162625995, + "grad_norm": 0.4317000637297288, + "learning_rate": 6.811582808938587e-06, + "loss": 0.449, + "step": 10196 + }, + { + "epoch": 1.674624843470942, + "grad_norm": 0.45232778899457615, + "learning_rate": 6.8111804281737636e-06, + "loss": 0.4724, + "step": 10197 + }, + { + "epoch": 1.6747890706792847, + "grad_norm": 0.33289301304418345, + "learning_rate": 6.810778022307245e-06, + "loss": 0.4624, + "step": 10198 + }, + { + "epoch": 1.6749532978876274, + "grad_norm": 0.8288740572151884, + "learning_rate": 6.810375591343405e-06, + "loss": 0.4414, + "step": 10199 + }, + { + "epoch": 1.6751175250959702, + "grad_norm": 0.5574232589287477, + "learning_rate": 6.809973135286613e-06, + "loss": 0.4157, + "step": 10200 + }, + { + "epoch": 1.675281752304313, + "grad_norm": 0.40580094718204485, + "learning_rate": 6.809570654141239e-06, + "loss": 0.4702, + "step": 10201 + }, + { + "epoch": 1.6754459795126557, + "grad_norm": 0.337536087662044, + "learning_rate": 6.809168147911656e-06, + "loss": 0.4573, + "step": 10202 + }, + { + "epoch": 1.6756102067209984, + "grad_norm": 0.3024412512663363, + "learning_rate": 6.808765616602236e-06, + "loss": 0.4379, + "step": 10203 + }, + { + "epoch": 1.6757744339293412, + "grad_norm": 0.36910762147354104, + "learning_rate": 6.808363060217348e-06, + "loss": 0.4482, + "step": 10204 + }, + { + "epoch": 1.675938661137684, + "grad_norm": 0.4531128443431689, + "learning_rate": 6.8079604787613664e-06, + "loss": 0.4615, + "step": 10205 + }, + { + "epoch": 1.6761028883460267, + "grad_norm": 0.25406746010569026, + "learning_rate": 6.8075578722386646e-06, + "loss": 0.4625, + "step": 10206 + }, + { + "epoch": 1.6762671155543694, + "grad_norm": 0.4489461763784889, + "learning_rate": 6.807155240653614e-06, + "loss": 0.4557, + "step": 10207 + }, + { + "epoch": 1.6764313427627122, + "grad_norm": 0.32368261589902353, + "learning_rate": 6.806752584010586e-06, + "loss": 0.4512, + "step": 10208 + }, + { + "epoch": 1.676595569971055, + "grad_norm": 0.30499813536039555, + "learning_rate": 6.8063499023139565e-06, + "loss": 0.4653, + "step": 10209 + }, + { + "epoch": 1.6767597971793977, + "grad_norm": 0.28669103974464233, + "learning_rate": 6.805947195568096e-06, + "loss": 0.4656, + "step": 10210 + }, + { + "epoch": 1.6769240243877404, + "grad_norm": 0.342212151906084, + "learning_rate": 6.805544463777383e-06, + "loss": 0.4629, + "step": 10211 + }, + { + "epoch": 1.6770882515960832, + "grad_norm": 0.2878312678429544, + "learning_rate": 6.805141706946188e-06, + "loss": 0.4674, + "step": 10212 + }, + { + "epoch": 1.677252478804426, + "grad_norm": 0.2932394604869272, + "learning_rate": 6.804738925078885e-06, + "loss": 0.4713, + "step": 10213 + }, + { + "epoch": 1.6774167060127687, + "grad_norm": 0.29840250165017845, + "learning_rate": 6.8043361181798515e-06, + "loss": 0.4747, + "step": 10214 + }, + { + "epoch": 1.6775809332211113, + "grad_norm": 0.3088400439236326, + "learning_rate": 6.803933286253458e-06, + "loss": 0.4733, + "step": 10215 + }, + { + "epoch": 1.677745160429454, + "grad_norm": 0.3900409979120372, + "learning_rate": 6.803530429304084e-06, + "loss": 0.4675, + "step": 10216 + }, + { + "epoch": 1.6779093876377968, + "grad_norm": 0.30596119386700804, + "learning_rate": 6.803127547336104e-06, + "loss": 0.4477, + "step": 10217 + }, + { + "epoch": 1.6780736148461397, + "grad_norm": 0.3396698020804204, + "learning_rate": 6.802724640353891e-06, + "loss": 0.4533, + "step": 10218 + }, + { + "epoch": 1.6782378420544823, + "grad_norm": 0.36186525495151045, + "learning_rate": 6.802321708361823e-06, + "loss": 0.4737, + "step": 10219 + }, + { + "epoch": 1.678402069262825, + "grad_norm": 0.32655803334282024, + "learning_rate": 6.8019187513642775e-06, + "loss": 0.4307, + "step": 10220 + }, + { + "epoch": 1.6785662964711678, + "grad_norm": 0.2625813564960309, + "learning_rate": 6.801515769365629e-06, + "loss": 0.4823, + "step": 10221 + }, + { + "epoch": 1.6787305236795107, + "grad_norm": 0.29774752894997863, + "learning_rate": 6.801112762370254e-06, + "loss": 0.4524, + "step": 10222 + }, + { + "epoch": 1.6788947508878533, + "grad_norm": 0.3127505478958254, + "learning_rate": 6.800709730382531e-06, + "loss": 0.4403, + "step": 10223 + }, + { + "epoch": 1.679058978096196, + "grad_norm": 0.30376527080229004, + "learning_rate": 6.8003066734068374e-06, + "loss": 0.4531, + "step": 10224 + }, + { + "epoch": 1.6792232053045388, + "grad_norm": 0.3136688315705886, + "learning_rate": 6.799903591447548e-06, + "loss": 0.4698, + "step": 10225 + }, + { + "epoch": 1.6793874325128817, + "grad_norm": 0.28186818901257743, + "learning_rate": 6.799500484509046e-06, + "loss": 0.4312, + "step": 10226 + }, + { + "epoch": 1.6795516597212243, + "grad_norm": 0.30242428453226355, + "learning_rate": 6.799097352595704e-06, + "loss": 0.4545, + "step": 10227 + }, + { + "epoch": 1.679715886929567, + "grad_norm": 0.33302698471723835, + "learning_rate": 6.798694195711903e-06, + "loss": 0.4716, + "step": 10228 + }, + { + "epoch": 1.6798801141379098, + "grad_norm": 0.34080784140890574, + "learning_rate": 6.798291013862023e-06, + "loss": 0.4677, + "step": 10229 + }, + { + "epoch": 1.6800443413462527, + "grad_norm": 0.3374671449414891, + "learning_rate": 6.797887807050439e-06, + "loss": 0.4534, + "step": 10230 + }, + { + "epoch": 1.6802085685545953, + "grad_norm": 0.3153387804540685, + "learning_rate": 6.797484575281535e-06, + "loss": 0.463, + "step": 10231 + }, + { + "epoch": 1.680372795762938, + "grad_norm": 0.402795624677278, + "learning_rate": 6.797081318559686e-06, + "loss": 0.4644, + "step": 10232 + }, + { + "epoch": 1.6805370229712806, + "grad_norm": 0.30953687154793585, + "learning_rate": 6.796678036889275e-06, + "loss": 0.4674, + "step": 10233 + }, + { + "epoch": 1.6807012501796235, + "grad_norm": 0.3317048558219616, + "learning_rate": 6.79627473027468e-06, + "loss": 0.4857, + "step": 10234 + }, + { + "epoch": 1.6808654773879663, + "grad_norm": 0.3802312298770638, + "learning_rate": 6.795871398720282e-06, + "loss": 0.4726, + "step": 10235 + }, + { + "epoch": 1.681029704596309, + "grad_norm": 0.2997172652219126, + "learning_rate": 6.795468042230464e-06, + "loss": 0.4712, + "step": 10236 + }, + { + "epoch": 1.6811939318046516, + "grad_norm": 0.3335225931142483, + "learning_rate": 6.795064660809604e-06, + "loss": 0.4387, + "step": 10237 + }, + { + "epoch": 1.6813581590129945, + "grad_norm": 0.2824447347067514, + "learning_rate": 6.7946612544620825e-06, + "loss": 0.4538, + "step": 10238 + }, + { + "epoch": 1.6815223862213373, + "grad_norm": 0.29554559306343914, + "learning_rate": 6.794257823192282e-06, + "loss": 0.4443, + "step": 10239 + }, + { + "epoch": 1.68168661342968, + "grad_norm": 0.2963918421520866, + "learning_rate": 6.793854367004585e-06, + "loss": 0.4639, + "step": 10240 + }, + { + "epoch": 1.6818508406380226, + "grad_norm": 0.36615082692498374, + "learning_rate": 6.793450885903374e-06, + "loss": 0.4534, + "step": 10241 + }, + { + "epoch": 1.6820150678463655, + "grad_norm": 0.32269538559501254, + "learning_rate": 6.793047379893027e-06, + "loss": 0.4743, + "step": 10242 + }, + { + "epoch": 1.6821792950547083, + "grad_norm": 0.35870366829155603, + "learning_rate": 6.7926438489779315e-06, + "loss": 0.4643, + "step": 10243 + }, + { + "epoch": 1.682343522263051, + "grad_norm": 0.3287341790332416, + "learning_rate": 6.792240293162467e-06, + "loss": 0.4611, + "step": 10244 + }, + { + "epoch": 1.6825077494713936, + "grad_norm": 0.36393328763510463, + "learning_rate": 6.791836712451018e-06, + "loss": 0.4727, + "step": 10245 + }, + { + "epoch": 1.6826719766797364, + "grad_norm": 0.32174905680140364, + "learning_rate": 6.791433106847968e-06, + "loss": 0.4446, + "step": 10246 + }, + { + "epoch": 1.6828362038880793, + "grad_norm": 0.341965634713859, + "learning_rate": 6.7910294763577e-06, + "loss": 0.4682, + "step": 10247 + }, + { + "epoch": 1.683000431096422, + "grad_norm": 0.3729904454177416, + "learning_rate": 6.790625820984597e-06, + "loss": 0.4374, + "step": 10248 + }, + { + "epoch": 1.6831646583047646, + "grad_norm": 0.3070748646257641, + "learning_rate": 6.790222140733044e-06, + "loss": 0.4516, + "step": 10249 + }, + { + "epoch": 1.6833288855131072, + "grad_norm": 0.33665130707619334, + "learning_rate": 6.789818435607426e-06, + "loss": 0.4436, + "step": 10250 + }, + { + "epoch": 1.68349311272145, + "grad_norm": 0.2835961697412146, + "learning_rate": 6.789414705612128e-06, + "loss": 0.4449, + "step": 10251 + }, + { + "epoch": 1.683657339929793, + "grad_norm": 0.33480743026989657, + "learning_rate": 6.7890109507515315e-06, + "loss": 0.4691, + "step": 10252 + }, + { + "epoch": 1.6838215671381356, + "grad_norm": 0.3456039394184217, + "learning_rate": 6.788607171030025e-06, + "loss": 0.4514, + "step": 10253 + }, + { + "epoch": 1.6839857943464782, + "grad_norm": 0.27234082924457936, + "learning_rate": 6.788203366451993e-06, + "loss": 0.4447, + "step": 10254 + }, + { + "epoch": 1.684150021554821, + "grad_norm": 0.34880013350903516, + "learning_rate": 6.78779953702182e-06, + "loss": 0.4709, + "step": 10255 + }, + { + "epoch": 1.684314248763164, + "grad_norm": 0.33395070260387977, + "learning_rate": 6.787395682743895e-06, + "loss": 0.4554, + "step": 10256 + }, + { + "epoch": 1.6844784759715066, + "grad_norm": 0.28524661530688, + "learning_rate": 6.786991803622602e-06, + "loss": 0.485, + "step": 10257 + }, + { + "epoch": 1.6846427031798492, + "grad_norm": 0.3068358586063563, + "learning_rate": 6.786587899662327e-06, + "loss": 0.474, + "step": 10258 + }, + { + "epoch": 1.684806930388192, + "grad_norm": 0.2848755321460969, + "learning_rate": 6.786183970867458e-06, + "loss": 0.4548, + "step": 10259 + }, + { + "epoch": 1.684971157596535, + "grad_norm": 0.3643720044509586, + "learning_rate": 6.785780017242382e-06, + "loss": 0.4391, + "step": 10260 + }, + { + "epoch": 1.6851353848048776, + "grad_norm": 0.2542353434490754, + "learning_rate": 6.785376038791486e-06, + "loss": 0.4491, + "step": 10261 + }, + { + "epoch": 1.6852996120132202, + "grad_norm": 0.6157108993224596, + "learning_rate": 6.784972035519159e-06, + "loss": 0.4631, + "step": 10262 + }, + { + "epoch": 1.685463839221563, + "grad_norm": 0.2588069234838315, + "learning_rate": 6.784568007429786e-06, + "loss": 0.4453, + "step": 10263 + }, + { + "epoch": 1.685628066429906, + "grad_norm": 0.3417690274270287, + "learning_rate": 6.784163954527755e-06, + "loss": 0.4588, + "step": 10264 + }, + { + "epoch": 1.6857922936382486, + "grad_norm": 0.41849951712013345, + "learning_rate": 6.7837598768174595e-06, + "loss": 0.4612, + "step": 10265 + }, + { + "epoch": 1.6859565208465912, + "grad_norm": 0.7638550414307494, + "learning_rate": 6.783355774303284e-06, + "loss": 0.4342, + "step": 10266 + }, + { + "epoch": 1.6861207480549338, + "grad_norm": 0.34278375489175383, + "learning_rate": 6.782951646989617e-06, + "loss": 0.4427, + "step": 10267 + }, + { + "epoch": 1.6862849752632767, + "grad_norm": 0.36146825412263345, + "learning_rate": 6.7825474948808495e-06, + "loss": 0.4443, + "step": 10268 + }, + { + "epoch": 1.6864492024716196, + "grad_norm": 0.29089142141868596, + "learning_rate": 6.782143317981371e-06, + "loss": 0.4551, + "step": 10269 + }, + { + "epoch": 1.6866134296799622, + "grad_norm": 0.30031960919690487, + "learning_rate": 6.78173911629557e-06, + "loss": 0.4703, + "step": 10270 + }, + { + "epoch": 1.6867776568883048, + "grad_norm": 0.3250331596103951, + "learning_rate": 6.78133488982784e-06, + "loss": 0.4714, + "step": 10271 + }, + { + "epoch": 1.6869418840966477, + "grad_norm": 0.3971300159279936, + "learning_rate": 6.780930638582566e-06, + "loss": 0.4546, + "step": 10272 + }, + { + "epoch": 1.6871061113049906, + "grad_norm": 0.29162352290664173, + "learning_rate": 6.7805263625641394e-06, + "loss": 0.4755, + "step": 10273 + }, + { + "epoch": 1.6872703385133332, + "grad_norm": 0.26532718169105196, + "learning_rate": 6.780122061776957e-06, + "loss": 0.443, + "step": 10274 + }, + { + "epoch": 1.6874345657216758, + "grad_norm": 0.2722156753431404, + "learning_rate": 6.7797177362254035e-06, + "loss": 0.4593, + "step": 10275 + }, + { + "epoch": 1.6875987929300187, + "grad_norm": 0.26193853080166557, + "learning_rate": 6.779313385913872e-06, + "loss": 0.4471, + "step": 10276 + }, + { + "epoch": 1.6877630201383615, + "grad_norm": 0.37316229355423847, + "learning_rate": 6.778909010846754e-06, + "loss": 0.4663, + "step": 10277 + }, + { + "epoch": 1.6879272473467042, + "grad_norm": 0.3047724813156029, + "learning_rate": 6.778504611028443e-06, + "loss": 0.4729, + "step": 10278 + }, + { + "epoch": 1.6880914745550468, + "grad_norm": 0.6649079988395612, + "learning_rate": 6.778100186463331e-06, + "loss": 0.4553, + "step": 10279 + }, + { + "epoch": 1.6882557017633897, + "grad_norm": 0.29209916143149306, + "learning_rate": 6.777695737155809e-06, + "loss": 0.4535, + "step": 10280 + }, + { + "epoch": 1.6884199289717325, + "grad_norm": 0.31165383824990045, + "learning_rate": 6.77729126311027e-06, + "loss": 0.4529, + "step": 10281 + }, + { + "epoch": 1.6885841561800752, + "grad_norm": 0.30989188157862446, + "learning_rate": 6.776886764331108e-06, + "loss": 0.4538, + "step": 10282 + }, + { + "epoch": 1.6887483833884178, + "grad_norm": 0.3244201798525414, + "learning_rate": 6.776482240822715e-06, + "loss": 0.4484, + "step": 10283 + }, + { + "epoch": 1.6889126105967605, + "grad_norm": 0.29740012853549874, + "learning_rate": 6.776077692589484e-06, + "loss": 0.4392, + "step": 10284 + }, + { + "epoch": 1.6890768378051033, + "grad_norm": 0.31728604602189964, + "learning_rate": 6.775673119635812e-06, + "loss": 0.4747, + "step": 10285 + }, + { + "epoch": 1.6892410650134462, + "grad_norm": 0.25451478660174215, + "learning_rate": 6.775268521966091e-06, + "loss": 0.4455, + "step": 10286 + }, + { + "epoch": 1.6894052922217888, + "grad_norm": 0.28516624095149246, + "learning_rate": 6.774863899584714e-06, + "loss": 0.4463, + "step": 10287 + }, + { + "epoch": 1.6895695194301315, + "grad_norm": 0.38342411007047644, + "learning_rate": 6.774459252496077e-06, + "loss": 0.4654, + "step": 10288 + }, + { + "epoch": 1.6897337466384743, + "grad_norm": 0.26453823657971653, + "learning_rate": 6.774054580704576e-06, + "loss": 0.4571, + "step": 10289 + }, + { + "epoch": 1.6898979738468172, + "grad_norm": 0.2918895135048132, + "learning_rate": 6.773649884214603e-06, + "loss": 0.443, + "step": 10290 + }, + { + "epoch": 1.6900622010551598, + "grad_norm": 0.3251405163584019, + "learning_rate": 6.773245163030557e-06, + "loss": 0.4578, + "step": 10291 + }, + { + "epoch": 1.6902264282635024, + "grad_norm": 0.2912874050807492, + "learning_rate": 6.772840417156831e-06, + "loss": 0.4555, + "step": 10292 + }, + { + "epoch": 1.6903906554718453, + "grad_norm": 0.30295294439343956, + "learning_rate": 6.77243564659782e-06, + "loss": 0.4802, + "step": 10293 + }, + { + "epoch": 1.6905548826801882, + "grad_norm": 0.2914439106468315, + "learning_rate": 6.7720308513579255e-06, + "loss": 0.4621, + "step": 10294 + }, + { + "epoch": 1.6907191098885308, + "grad_norm": 0.30084357916317045, + "learning_rate": 6.77162603144154e-06, + "loss": 0.4574, + "step": 10295 + }, + { + "epoch": 1.6908833370968734, + "grad_norm": 0.31309506377140034, + "learning_rate": 6.771221186853059e-06, + "loss": 0.4383, + "step": 10296 + }, + { + "epoch": 1.6910475643052163, + "grad_norm": 0.32088881385022494, + "learning_rate": 6.770816317596882e-06, + "loss": 0.4492, + "step": 10297 + }, + { + "epoch": 1.6912117915135592, + "grad_norm": 0.2979143781946795, + "learning_rate": 6.770411423677406e-06, + "loss": 0.4547, + "step": 10298 + }, + { + "epoch": 1.6913760187219018, + "grad_norm": 0.24331679942835663, + "learning_rate": 6.770006505099029e-06, + "loss": 0.4569, + "step": 10299 + }, + { + "epoch": 1.6915402459302444, + "grad_norm": 0.29674736960822723, + "learning_rate": 6.769601561866147e-06, + "loss": 0.452, + "step": 10300 + }, + { + "epoch": 1.691704473138587, + "grad_norm": 0.27199210828461945, + "learning_rate": 6.76919659398316e-06, + "loss": 0.4464, + "step": 10301 + }, + { + "epoch": 1.69186870034693, + "grad_norm": 0.26438612536458783, + "learning_rate": 6.7687916014544635e-06, + "loss": 0.4382, + "step": 10302 + }, + { + "epoch": 1.6920329275552728, + "grad_norm": 0.32917048248376596, + "learning_rate": 6.768386584284458e-06, + "loss": 0.4457, + "step": 10303 + }, + { + "epoch": 1.6921971547636154, + "grad_norm": 0.33675980630175334, + "learning_rate": 6.767981542477545e-06, + "loss": 0.4518, + "step": 10304 + }, + { + "epoch": 1.692361381971958, + "grad_norm": 0.32689547936397234, + "learning_rate": 6.767576476038119e-06, + "loss": 0.457, + "step": 10305 + }, + { + "epoch": 1.692525609180301, + "grad_norm": 0.33346484391816406, + "learning_rate": 6.767171384970583e-06, + "loss": 0.4844, + "step": 10306 + }, + { + "epoch": 1.6926898363886438, + "grad_norm": 0.308072789313566, + "learning_rate": 6.766766269279333e-06, + "loss": 0.4584, + "step": 10307 + }, + { + "epoch": 1.6928540635969864, + "grad_norm": 0.287864748449835, + "learning_rate": 6.7663611289687725e-06, + "loss": 0.4439, + "step": 10308 + }, + { + "epoch": 1.693018290805329, + "grad_norm": 0.3082917399631894, + "learning_rate": 6.765955964043302e-06, + "loss": 0.4588, + "step": 10309 + }, + { + "epoch": 1.693182518013672, + "grad_norm": 0.29546090273406916, + "learning_rate": 6.765550774507317e-06, + "loss": 0.4353, + "step": 10310 + }, + { + "epoch": 1.6933467452220148, + "grad_norm": 0.2858280741500684, + "learning_rate": 6.765145560365224e-06, + "loss": 0.4591, + "step": 10311 + }, + { + "epoch": 1.6935109724303574, + "grad_norm": 0.32008558469929194, + "learning_rate": 6.7647403216214205e-06, + "loss": 0.4529, + "step": 10312 + }, + { + "epoch": 1.6936751996387, + "grad_norm": 0.3146694953864376, + "learning_rate": 6.764335058280309e-06, + "loss": 0.4696, + "step": 10313 + }, + { + "epoch": 1.693839426847043, + "grad_norm": 0.3219358380569886, + "learning_rate": 6.7639297703462916e-06, + "loss": 0.4394, + "step": 10314 + }, + { + "epoch": 1.6940036540553858, + "grad_norm": 0.3004978132549195, + "learning_rate": 6.763524457823768e-06, + "loss": 0.4607, + "step": 10315 + }, + { + "epoch": 1.6941678812637284, + "grad_norm": 0.2733719772784754, + "learning_rate": 6.763119120717143e-06, + "loss": 0.467, + "step": 10316 + }, + { + "epoch": 1.694332108472071, + "grad_norm": 0.2706089474104434, + "learning_rate": 6.762713759030817e-06, + "loss": 0.449, + "step": 10317 + }, + { + "epoch": 1.6944963356804137, + "grad_norm": 0.42130473705280985, + "learning_rate": 6.762308372769194e-06, + "loss": 0.4529, + "step": 10318 + }, + { + "epoch": 1.6946605628887565, + "grad_norm": 0.36806728788529874, + "learning_rate": 6.761902961936676e-06, + "loss": 0.4709, + "step": 10319 + }, + { + "epoch": 1.6948247900970994, + "grad_norm": 0.30950909755417966, + "learning_rate": 6.761497526537668e-06, + "loss": 0.4662, + "step": 10320 + }, + { + "epoch": 1.694989017305442, + "grad_norm": 0.25509018884086976, + "learning_rate": 6.761092066576569e-06, + "loss": 0.4302, + "step": 10321 + }, + { + "epoch": 1.6951532445137847, + "grad_norm": 0.3542012605063986, + "learning_rate": 6.760686582057787e-06, + "loss": 0.4785, + "step": 10322 + }, + { + "epoch": 1.6953174717221275, + "grad_norm": 0.27189101345521893, + "learning_rate": 6.760281072985725e-06, + "loss": 0.471, + "step": 10323 + }, + { + "epoch": 1.6954816989304704, + "grad_norm": 0.2690817881529284, + "learning_rate": 6.7598755393647855e-06, + "loss": 0.4621, + "step": 10324 + }, + { + "epoch": 1.695645926138813, + "grad_norm": 0.2796687617198455, + "learning_rate": 6.759469981199375e-06, + "loss": 0.4409, + "step": 10325 + }, + { + "epoch": 1.6958101533471557, + "grad_norm": 0.34430847635917744, + "learning_rate": 6.7590643984938965e-06, + "loss": 0.4662, + "step": 10326 + }, + { + "epoch": 1.6959743805554985, + "grad_norm": 0.3545725970327559, + "learning_rate": 6.7586587912527575e-06, + "loss": 0.4882, + "step": 10327 + }, + { + "epoch": 1.6961386077638414, + "grad_norm": 0.45759440356995434, + "learning_rate": 6.758253159480362e-06, + "loss": 0.4548, + "step": 10328 + }, + { + "epoch": 1.696302834972184, + "grad_norm": 0.3371943712343659, + "learning_rate": 6.757847503181114e-06, + "loss": 0.4493, + "step": 10329 + }, + { + "epoch": 1.6964670621805267, + "grad_norm": 0.38712254849700195, + "learning_rate": 6.757441822359422e-06, + "loss": 0.4553, + "step": 10330 + }, + { + "epoch": 1.6966312893888695, + "grad_norm": 0.367661804842924, + "learning_rate": 6.757036117019689e-06, + "loss": 0.4605, + "step": 10331 + }, + { + "epoch": 1.6967955165972124, + "grad_norm": 0.28797269409836235, + "learning_rate": 6.756630387166324e-06, + "loss": 0.4527, + "step": 10332 + }, + { + "epoch": 1.696959743805555, + "grad_norm": 0.31382333583278105, + "learning_rate": 6.756224632803734e-06, + "loss": 0.4749, + "step": 10333 + }, + { + "epoch": 1.6971239710138977, + "grad_norm": 0.3185347219714828, + "learning_rate": 6.755818853936323e-06, + "loss": 0.4722, + "step": 10334 + }, + { + "epoch": 1.6972881982222403, + "grad_norm": 0.31584226638721585, + "learning_rate": 6.755413050568501e-06, + "loss": 0.4729, + "step": 10335 + }, + { + "epoch": 1.6974524254305832, + "grad_norm": 0.3821555035764605, + "learning_rate": 6.755007222704674e-06, + "loss": 0.468, + "step": 10336 + }, + { + "epoch": 1.697616652638926, + "grad_norm": 0.3381119959701247, + "learning_rate": 6.754601370349249e-06, + "loss": 0.4728, + "step": 10337 + }, + { + "epoch": 1.6977808798472687, + "grad_norm": 0.26045264306014676, + "learning_rate": 6.754195493506635e-06, + "loss": 0.4314, + "step": 10338 + }, + { + "epoch": 1.6979451070556113, + "grad_norm": 0.29197349217378316, + "learning_rate": 6.753789592181241e-06, + "loss": 0.4717, + "step": 10339 + }, + { + "epoch": 1.6981093342639542, + "grad_norm": 0.341468042844591, + "learning_rate": 6.753383666377474e-06, + "loss": 0.4558, + "step": 10340 + }, + { + "epoch": 1.698273561472297, + "grad_norm": 0.2817026922401525, + "learning_rate": 6.752977716099744e-06, + "loss": 0.4563, + "step": 10341 + }, + { + "epoch": 1.6984377886806397, + "grad_norm": 0.29656847976847733, + "learning_rate": 6.752571741352459e-06, + "loss": 0.4578, + "step": 10342 + }, + { + "epoch": 1.6986020158889823, + "grad_norm": 0.28247072661654443, + "learning_rate": 6.752165742140029e-06, + "loss": 0.439, + "step": 10343 + }, + { + "epoch": 1.6987662430973252, + "grad_norm": 0.28792432296827525, + "learning_rate": 6.751759718466862e-06, + "loss": 0.454, + "step": 10344 + }, + { + "epoch": 1.698930470305668, + "grad_norm": 0.3713145261232129, + "learning_rate": 6.751353670337371e-06, + "loss": 0.4646, + "step": 10345 + }, + { + "epoch": 1.6990946975140107, + "grad_norm": 0.3210942286713483, + "learning_rate": 6.7509475977559614e-06, + "loss": 0.4533, + "step": 10346 + }, + { + "epoch": 1.6992589247223533, + "grad_norm": 0.25200029858425016, + "learning_rate": 6.750541500727048e-06, + "loss": 0.4567, + "step": 10347 + }, + { + "epoch": 1.6994231519306962, + "grad_norm": 0.2854102830910053, + "learning_rate": 6.7501353792550404e-06, + "loss": 0.4514, + "step": 10348 + }, + { + "epoch": 1.699587379139039, + "grad_norm": 0.4609283687810497, + "learning_rate": 6.749729233344347e-06, + "loss": 0.462, + "step": 10349 + }, + { + "epoch": 1.6997516063473816, + "grad_norm": 0.2920906286091975, + "learning_rate": 6.749323062999382e-06, + "loss": 0.4496, + "step": 10350 + }, + { + "epoch": 1.6999158335557243, + "grad_norm": 0.34357740596563463, + "learning_rate": 6.748916868224554e-06, + "loss": 0.467, + "step": 10351 + }, + { + "epoch": 1.700080060764067, + "grad_norm": 0.35815701259371335, + "learning_rate": 6.748510649024277e-06, + "loss": 0.4445, + "step": 10352 + }, + { + "epoch": 1.7002442879724098, + "grad_norm": 0.2697156568134558, + "learning_rate": 6.748104405402963e-06, + "loss": 0.4506, + "step": 10353 + }, + { + "epoch": 1.7004085151807526, + "grad_norm": 0.3074531144110258, + "learning_rate": 6.747698137365023e-06, + "loss": 0.4523, + "step": 10354 + }, + { + "epoch": 1.7005727423890953, + "grad_norm": 0.33685069490300174, + "learning_rate": 6.7472918449148695e-06, + "loss": 0.4725, + "step": 10355 + }, + { + "epoch": 1.700736969597438, + "grad_norm": 0.3240178685891051, + "learning_rate": 6.746885528056915e-06, + "loss": 0.4745, + "step": 10356 + }, + { + "epoch": 1.7009011968057808, + "grad_norm": 0.30519208673786224, + "learning_rate": 6.746479186795573e-06, + "loss": 0.4722, + "step": 10357 + }, + { + "epoch": 1.7010654240141236, + "grad_norm": 0.284689567099896, + "learning_rate": 6.746072821135258e-06, + "loss": 0.469, + "step": 10358 + }, + { + "epoch": 1.7012296512224663, + "grad_norm": 0.3561436791068111, + "learning_rate": 6.745666431080382e-06, + "loss": 0.4613, + "step": 10359 + }, + { + "epoch": 1.701393878430809, + "grad_norm": 0.2637723404608694, + "learning_rate": 6.745260016635358e-06, + "loss": 0.4644, + "step": 10360 + }, + { + "epoch": 1.7015581056391518, + "grad_norm": 0.26636461148580687, + "learning_rate": 6.744853577804601e-06, + "loss": 0.4464, + "step": 10361 + }, + { + "epoch": 1.7017223328474946, + "grad_norm": 1.0327093958429159, + "learning_rate": 6.744447114592526e-06, + "loss": 0.4665, + "step": 10362 + }, + { + "epoch": 1.7018865600558373, + "grad_norm": 0.2762304703339504, + "learning_rate": 6.744040627003549e-06, + "loss": 0.473, + "step": 10363 + }, + { + "epoch": 1.70205078726418, + "grad_norm": 0.41730054692606017, + "learning_rate": 6.743634115042082e-06, + "loss": 0.4709, + "step": 10364 + }, + { + "epoch": 1.7022150144725228, + "grad_norm": 0.25904651355072494, + "learning_rate": 6.743227578712539e-06, + "loss": 0.444, + "step": 10365 + }, + { + "epoch": 1.7023792416808656, + "grad_norm": 0.980301374420022, + "learning_rate": 6.74282101801934e-06, + "loss": 0.4497, + "step": 10366 + }, + { + "epoch": 1.7025434688892083, + "grad_norm": 0.299626853323015, + "learning_rate": 6.742414432966896e-06, + "loss": 0.4636, + "step": 10367 + }, + { + "epoch": 1.702707696097551, + "grad_norm": 0.3115433025902387, + "learning_rate": 6.742007823559627e-06, + "loss": 0.4485, + "step": 10368 + }, + { + "epoch": 1.7028719233058935, + "grad_norm": 0.3507573006468152, + "learning_rate": 6.741601189801946e-06, + "loss": 0.456, + "step": 10369 + }, + { + "epoch": 1.7030361505142364, + "grad_norm": 0.36607655888975515, + "learning_rate": 6.741194531698271e-06, + "loss": 0.4585, + "step": 10370 + }, + { + "epoch": 1.7032003777225793, + "grad_norm": 0.283069528317907, + "learning_rate": 6.740787849253018e-06, + "loss": 0.4582, + "step": 10371 + }, + { + "epoch": 1.703364604930922, + "grad_norm": 0.3830473238004654, + "learning_rate": 6.740381142470605e-06, + "loss": 0.4409, + "step": 10372 + }, + { + "epoch": 1.7035288321392645, + "grad_norm": 0.2768537602884907, + "learning_rate": 6.739974411355448e-06, + "loss": 0.4537, + "step": 10373 + }, + { + "epoch": 1.7036930593476074, + "grad_norm": 0.3490536084340397, + "learning_rate": 6.739567655911965e-06, + "loss": 0.453, + "step": 10374 + }, + { + "epoch": 1.7038572865559503, + "grad_norm": 0.36771767087617424, + "learning_rate": 6.739160876144575e-06, + "loss": 0.4766, + "step": 10375 + }, + { + "epoch": 1.704021513764293, + "grad_norm": 0.2878841697769496, + "learning_rate": 6.738754072057693e-06, + "loss": 0.4759, + "step": 10376 + }, + { + "epoch": 1.7041857409726355, + "grad_norm": 0.3020927112421301, + "learning_rate": 6.738347243655741e-06, + "loss": 0.4874, + "step": 10377 + }, + { + "epoch": 1.7043499681809784, + "grad_norm": 0.29368059878224284, + "learning_rate": 6.737940390943134e-06, + "loss": 0.4657, + "step": 10378 + }, + { + "epoch": 1.7045141953893213, + "grad_norm": 0.339697488464221, + "learning_rate": 6.7375335139242936e-06, + "loss": 0.4602, + "step": 10379 + }, + { + "epoch": 1.704678422597664, + "grad_norm": 0.4986765984982801, + "learning_rate": 6.737126612603637e-06, + "loss": 0.4584, + "step": 10380 + }, + { + "epoch": 1.7048426498060065, + "grad_norm": 0.28694916511887175, + "learning_rate": 6.7367196869855845e-06, + "loss": 0.447, + "step": 10381 + }, + { + "epoch": 1.7050068770143494, + "grad_norm": 0.37107709671599887, + "learning_rate": 6.736312737074557e-06, + "loss": 0.4554, + "step": 10382 + }, + { + "epoch": 1.7051711042226922, + "grad_norm": 0.32138012549222017, + "learning_rate": 6.735905762874972e-06, + "loss": 0.4549, + "step": 10383 + }, + { + "epoch": 1.7053353314310349, + "grad_norm": 0.29340690471890696, + "learning_rate": 6.73549876439125e-06, + "loss": 0.4647, + "step": 10384 + }, + { + "epoch": 1.7054995586393775, + "grad_norm": 0.35984360098887486, + "learning_rate": 6.735091741627811e-06, + "loss": 0.4483, + "step": 10385 + }, + { + "epoch": 1.7056637858477202, + "grad_norm": 0.2811151158720807, + "learning_rate": 6.734684694589078e-06, + "loss": 0.4787, + "step": 10386 + }, + { + "epoch": 1.705828013056063, + "grad_norm": 0.3031459072365189, + "learning_rate": 6.7342776232794725e-06, + "loss": 0.4511, + "step": 10387 + }, + { + "epoch": 1.7059922402644059, + "grad_norm": 0.2571523484193839, + "learning_rate": 6.733870527703411e-06, + "loss": 0.4524, + "step": 10388 + }, + { + "epoch": 1.7061564674727485, + "grad_norm": 0.3361334913411509, + "learning_rate": 6.733463407865319e-06, + "loss": 0.4628, + "step": 10389 + }, + { + "epoch": 1.7063206946810912, + "grad_norm": 0.27863555359391223, + "learning_rate": 6.733056263769616e-06, + "loss": 0.4745, + "step": 10390 + }, + { + "epoch": 1.706484921889434, + "grad_norm": 0.31072955149297077, + "learning_rate": 6.732649095420726e-06, + "loss": 0.457, + "step": 10391 + }, + { + "epoch": 1.7066491490977769, + "grad_norm": 0.388313133889824, + "learning_rate": 6.73224190282307e-06, + "loss": 0.4623, + "step": 10392 + }, + { + "epoch": 1.7068133763061195, + "grad_norm": 1.0394211962031206, + "learning_rate": 6.73183468598107e-06, + "loss": 0.4495, + "step": 10393 + }, + { + "epoch": 1.7069776035144621, + "grad_norm": 0.40572430108763996, + "learning_rate": 6.73142744489915e-06, + "loss": 0.4471, + "step": 10394 + }, + { + "epoch": 1.707141830722805, + "grad_norm": 0.242324759201867, + "learning_rate": 6.731020179581732e-06, + "loss": 0.4317, + "step": 10395 + }, + { + "epoch": 1.7073060579311479, + "grad_norm": 0.31949901700226574, + "learning_rate": 6.7306128900332405e-06, + "loss": 0.4409, + "step": 10396 + }, + { + "epoch": 1.7074702851394905, + "grad_norm": 0.2662188504303161, + "learning_rate": 6.730205576258099e-06, + "loss": 0.4576, + "step": 10397 + }, + { + "epoch": 1.7076345123478331, + "grad_norm": 0.34371474136704083, + "learning_rate": 6.7297982382607295e-06, + "loss": 0.4603, + "step": 10398 + }, + { + "epoch": 1.707798739556176, + "grad_norm": 0.5706657024675674, + "learning_rate": 6.7293908760455575e-06, + "loss": 0.4607, + "step": 10399 + }, + { + "epoch": 1.7079629667645189, + "grad_norm": 0.4611294102411975, + "learning_rate": 6.728983489617008e-06, + "loss": 0.4773, + "step": 10400 + }, + { + "epoch": 1.7081271939728615, + "grad_norm": 0.2924681797853017, + "learning_rate": 6.728576078979503e-06, + "loss": 0.4431, + "step": 10401 + }, + { + "epoch": 1.7082914211812041, + "grad_norm": 0.34761075327557933, + "learning_rate": 6.7281686441374705e-06, + "loss": 0.4424, + "step": 10402 + }, + { + "epoch": 1.7084556483895468, + "grad_norm": 0.26384299959420315, + "learning_rate": 6.727761185095334e-06, + "loss": 0.4524, + "step": 10403 + }, + { + "epoch": 1.7086198755978896, + "grad_norm": 0.3084678291137836, + "learning_rate": 6.727353701857519e-06, + "loss": 0.4659, + "step": 10404 + }, + { + "epoch": 1.7087841028062325, + "grad_norm": 0.2901345922105834, + "learning_rate": 6.7269461944284525e-06, + "loss": 0.4788, + "step": 10405 + }, + { + "epoch": 1.7089483300145751, + "grad_norm": 0.27440293975090463, + "learning_rate": 6.7265386628125584e-06, + "loss": 0.4579, + "step": 10406 + }, + { + "epoch": 1.7091125572229178, + "grad_norm": 0.3972081442525347, + "learning_rate": 6.726131107014264e-06, + "loss": 0.4322, + "step": 10407 + }, + { + "epoch": 1.7092767844312606, + "grad_norm": 0.334535225703064, + "learning_rate": 6.7257235270379955e-06, + "loss": 0.4618, + "step": 10408 + }, + { + "epoch": 1.7094410116396035, + "grad_norm": 0.2765674491027994, + "learning_rate": 6.725315922888179e-06, + "loss": 0.4544, + "step": 10409 + }, + { + "epoch": 1.7096052388479461, + "grad_norm": 0.34975187365135374, + "learning_rate": 6.724908294569242e-06, + "loss": 0.4603, + "step": 10410 + }, + { + "epoch": 1.7097694660562888, + "grad_norm": 0.2856071808433099, + "learning_rate": 6.724500642085614e-06, + "loss": 0.4412, + "step": 10411 + }, + { + "epoch": 1.7099336932646316, + "grad_norm": 0.8417017909558503, + "learning_rate": 6.7240929654417196e-06, + "loss": 0.4508, + "step": 10412 + }, + { + "epoch": 1.7100979204729745, + "grad_norm": 0.2939192815435623, + "learning_rate": 6.723685264641986e-06, + "loss": 0.4666, + "step": 10413 + }, + { + "epoch": 1.7102621476813171, + "grad_norm": 0.26199379189429295, + "learning_rate": 6.723277539690842e-06, + "loss": 0.4474, + "step": 10414 + }, + { + "epoch": 1.7104263748896598, + "grad_norm": 0.29879744281049325, + "learning_rate": 6.7228697905927184e-06, + "loss": 0.4793, + "step": 10415 + }, + { + "epoch": 1.7105906020980026, + "grad_norm": 0.2918493779231738, + "learning_rate": 6.722462017352042e-06, + "loss": 0.4618, + "step": 10416 + }, + { + "epoch": 1.7107548293063455, + "grad_norm": 0.3098008325301708, + "learning_rate": 6.722054219973242e-06, + "loss": 0.4552, + "step": 10417 + }, + { + "epoch": 1.7109190565146881, + "grad_norm": 0.3165158551796311, + "learning_rate": 6.721646398460745e-06, + "loss": 0.4707, + "step": 10418 + }, + { + "epoch": 1.7110832837230308, + "grad_norm": 0.30672549273761096, + "learning_rate": 6.721238552818983e-06, + "loss": 0.4417, + "step": 10419 + }, + { + "epoch": 1.7112475109313734, + "grad_norm": 0.2584646055648473, + "learning_rate": 6.7208306830523844e-06, + "loss": 0.4383, + "step": 10420 + }, + { + "epoch": 1.7114117381397163, + "grad_norm": 0.2717102838108191, + "learning_rate": 6.720422789165382e-06, + "loss": 0.4503, + "step": 10421 + }, + { + "epoch": 1.7115759653480591, + "grad_norm": 0.37441039548738764, + "learning_rate": 6.720014871162402e-06, + "loss": 0.448, + "step": 10422 + }, + { + "epoch": 1.7117401925564018, + "grad_norm": 0.3053067938918457, + "learning_rate": 6.719606929047875e-06, + "loss": 0.4555, + "step": 10423 + }, + { + "epoch": 1.7119044197647444, + "grad_norm": 0.33951489319508066, + "learning_rate": 6.719198962826234e-06, + "loss": 0.4508, + "step": 10424 + }, + { + "epoch": 1.7120686469730872, + "grad_norm": 0.40669330323048297, + "learning_rate": 6.718790972501909e-06, + "loss": 0.4462, + "step": 10425 + }, + { + "epoch": 1.71223287418143, + "grad_norm": 0.31788264969946245, + "learning_rate": 6.718382958079332e-06, + "loss": 0.4448, + "step": 10426 + }, + { + "epoch": 1.7123971013897727, + "grad_norm": 0.2972005514734704, + "learning_rate": 6.717974919562932e-06, + "loss": 0.4546, + "step": 10427 + }, + { + "epoch": 1.7125613285981154, + "grad_norm": 0.3405902179433068, + "learning_rate": 6.717566856957143e-06, + "loss": 0.4451, + "step": 10428 + }, + { + "epoch": 1.7127255558064582, + "grad_norm": 0.27053272243654325, + "learning_rate": 6.717158770266396e-06, + "loss": 0.4484, + "step": 10429 + }, + { + "epoch": 1.712889783014801, + "grad_norm": 0.2926589561710632, + "learning_rate": 6.716750659495123e-06, + "loss": 0.4565, + "step": 10430 + }, + { + "epoch": 1.7130540102231437, + "grad_norm": 0.3146258821459904, + "learning_rate": 6.716342524647757e-06, + "loss": 0.4634, + "step": 10431 + }, + { + "epoch": 1.7132182374314864, + "grad_norm": 0.45155855350560037, + "learning_rate": 6.71593436572873e-06, + "loss": 0.456, + "step": 10432 + }, + { + "epoch": 1.7133824646398292, + "grad_norm": 0.31378119401303933, + "learning_rate": 6.715526182742477e-06, + "loss": 0.4758, + "step": 10433 + }, + { + "epoch": 1.713546691848172, + "grad_norm": 0.26711786147597105, + "learning_rate": 6.715117975693429e-06, + "loss": 0.4353, + "step": 10434 + }, + { + "epoch": 1.7137109190565147, + "grad_norm": 0.28845664173005386, + "learning_rate": 6.71470974458602e-06, + "loss": 0.4627, + "step": 10435 + }, + { + "epoch": 1.7138751462648574, + "grad_norm": 0.4823107200435918, + "learning_rate": 6.7143014894246846e-06, + "loss": 0.4466, + "step": 10436 + }, + { + "epoch": 1.7140393734732, + "grad_norm": 0.2786151787980394, + "learning_rate": 6.713893210213857e-06, + "loss": 0.45, + "step": 10437 + }, + { + "epoch": 1.7142036006815429, + "grad_norm": 0.2935876494132793, + "learning_rate": 6.713484906957971e-06, + "loss": 0.492, + "step": 10438 + }, + { + "epoch": 1.7143678278898857, + "grad_norm": 0.28236963636794876, + "learning_rate": 6.71307657966146e-06, + "loss": 0.4516, + "step": 10439 + }, + { + "epoch": 1.7145320550982284, + "grad_norm": 0.6496068241742552, + "learning_rate": 6.712668228328761e-06, + "loss": 0.4313, + "step": 10440 + }, + { + "epoch": 1.714696282306571, + "grad_norm": 0.6204538490273221, + "learning_rate": 6.712259852964308e-06, + "loss": 0.4574, + "step": 10441 + }, + { + "epoch": 1.7148605095149139, + "grad_norm": 0.2793118226529691, + "learning_rate": 6.711851453572535e-06, + "loss": 0.4689, + "step": 10442 + }, + { + "epoch": 1.7150247367232567, + "grad_norm": 0.2744098757190267, + "learning_rate": 6.71144303015788e-06, + "loss": 0.4442, + "step": 10443 + }, + { + "epoch": 1.7151889639315994, + "grad_norm": 0.34174424481501375, + "learning_rate": 6.711034582724778e-06, + "loss": 0.4657, + "step": 10444 + }, + { + "epoch": 1.715353191139942, + "grad_norm": 0.2955658019877106, + "learning_rate": 6.710626111277666e-06, + "loss": 0.4418, + "step": 10445 + }, + { + "epoch": 1.7155174183482849, + "grad_norm": 0.2634439956111268, + "learning_rate": 6.710217615820979e-06, + "loss": 0.4597, + "step": 10446 + }, + { + "epoch": 1.7156816455566277, + "grad_norm": 0.3298989104289385, + "learning_rate": 6.709809096359153e-06, + "loss": 0.4435, + "step": 10447 + }, + { + "epoch": 1.7158458727649704, + "grad_norm": 0.314188393745869, + "learning_rate": 6.709400552896627e-06, + "loss": 0.4441, + "step": 10448 + }, + { + "epoch": 1.716010099973313, + "grad_norm": 0.2666842409720632, + "learning_rate": 6.708991985437836e-06, + "loss": 0.4552, + "step": 10449 + }, + { + "epoch": 1.7161743271816559, + "grad_norm": 0.3231120636371941, + "learning_rate": 6.70858339398722e-06, + "loss": 0.4706, + "step": 10450 + }, + { + "epoch": 1.7163385543899987, + "grad_norm": 0.2752932839799465, + "learning_rate": 6.708174778549216e-06, + "loss": 0.4474, + "step": 10451 + }, + { + "epoch": 1.7165027815983414, + "grad_norm": 0.30247348745969427, + "learning_rate": 6.70776613912826e-06, + "loss": 0.4403, + "step": 10452 + }, + { + "epoch": 1.716667008806684, + "grad_norm": 0.37340998671693054, + "learning_rate": 6.707357475728792e-06, + "loss": 0.4617, + "step": 10453 + }, + { + "epoch": 1.7168312360150266, + "grad_norm": 0.2745424306375286, + "learning_rate": 6.70694878835525e-06, + "loss": 0.4498, + "step": 10454 + }, + { + "epoch": 1.7169954632233695, + "grad_norm": 0.43213806007948663, + "learning_rate": 6.706540077012074e-06, + "loss": 0.4333, + "step": 10455 + }, + { + "epoch": 1.7171596904317123, + "grad_norm": 0.31565939323718656, + "learning_rate": 6.7061313417037e-06, + "loss": 0.4752, + "step": 10456 + }, + { + "epoch": 1.717323917640055, + "grad_norm": 0.5251729427395571, + "learning_rate": 6.705722582434569e-06, + "loss": 0.4505, + "step": 10457 + }, + { + "epoch": 1.7174881448483976, + "grad_norm": 0.3968038722142534, + "learning_rate": 6.705313799209123e-06, + "loss": 0.46, + "step": 10458 + }, + { + "epoch": 1.7176523720567405, + "grad_norm": 0.36237150280496244, + "learning_rate": 6.704904992031796e-06, + "loss": 0.4517, + "step": 10459 + }, + { + "epoch": 1.7178165992650833, + "grad_norm": 0.3266331407754335, + "learning_rate": 6.704496160907035e-06, + "loss": 0.4553, + "step": 10460 + }, + { + "epoch": 1.717980826473426, + "grad_norm": 0.35061228607963685, + "learning_rate": 6.704087305839275e-06, + "loss": 0.449, + "step": 10461 + }, + { + "epoch": 1.7181450536817686, + "grad_norm": 0.3011777257227322, + "learning_rate": 6.703678426832958e-06, + "loss": 0.4715, + "step": 10462 + }, + { + "epoch": 1.7183092808901115, + "grad_norm": 0.3016498684711106, + "learning_rate": 6.703269523892526e-06, + "loss": 0.4533, + "step": 10463 + }, + { + "epoch": 1.7184735080984543, + "grad_norm": 0.3188339916151969, + "learning_rate": 6.7028605970224175e-06, + "loss": 0.4638, + "step": 10464 + }, + { + "epoch": 1.718637735306797, + "grad_norm": 0.2868947673367226, + "learning_rate": 6.702451646227077e-06, + "loss": 0.4535, + "step": 10465 + }, + { + "epoch": 1.7188019625151396, + "grad_norm": 0.3513689796636359, + "learning_rate": 6.702042671510945e-06, + "loss": 0.4534, + "step": 10466 + }, + { + "epoch": 1.7189661897234825, + "grad_norm": 0.2835644141881405, + "learning_rate": 6.701633672878463e-06, + "loss": 0.4514, + "step": 10467 + }, + { + "epoch": 1.7191304169318253, + "grad_norm": 0.4246394600668017, + "learning_rate": 6.701224650334072e-06, + "loss": 0.4525, + "step": 10468 + }, + { + "epoch": 1.719294644140168, + "grad_norm": 0.4832348262491999, + "learning_rate": 6.700815603882218e-06, + "loss": 0.467, + "step": 10469 + }, + { + "epoch": 1.7194588713485106, + "grad_norm": 0.2393193885818513, + "learning_rate": 6.700406533527338e-06, + "loss": 0.4606, + "step": 10470 + }, + { + "epoch": 1.7196230985568532, + "grad_norm": 0.28340704067344497, + "learning_rate": 6.699997439273881e-06, + "loss": 0.473, + "step": 10471 + }, + { + "epoch": 1.719787325765196, + "grad_norm": 0.29619829364482575, + "learning_rate": 6.6995883211262855e-06, + "loss": 0.467, + "step": 10472 + }, + { + "epoch": 1.719951552973539, + "grad_norm": 0.2680896560474138, + "learning_rate": 6.699179179088998e-06, + "loss": 0.4677, + "step": 10473 + }, + { + "epoch": 1.7201157801818816, + "grad_norm": 0.29926212745864955, + "learning_rate": 6.69877001316646e-06, + "loss": 0.4517, + "step": 10474 + }, + { + "epoch": 1.7202800073902242, + "grad_norm": 0.30751652357242387, + "learning_rate": 6.698360823363117e-06, + "loss": 0.4574, + "step": 10475 + }, + { + "epoch": 1.720444234598567, + "grad_norm": 0.3328073603049628, + "learning_rate": 6.697951609683412e-06, + "loss": 0.4514, + "step": 10476 + }, + { + "epoch": 1.72060846180691, + "grad_norm": 0.24904973625178245, + "learning_rate": 6.697542372131789e-06, + "loss": 0.4691, + "step": 10477 + }, + { + "epoch": 1.7207726890152526, + "grad_norm": 0.3055567768298029, + "learning_rate": 6.697133110712695e-06, + "loss": 0.4415, + "step": 10478 + }, + { + "epoch": 1.7209369162235952, + "grad_norm": 0.25915721154981863, + "learning_rate": 6.696723825430574e-06, + "loss": 0.4787, + "step": 10479 + }, + { + "epoch": 1.721101143431938, + "grad_norm": 0.2802197692823335, + "learning_rate": 6.696314516289872e-06, + "loss": 0.4601, + "step": 10480 + }, + { + "epoch": 1.721265370640281, + "grad_norm": 0.3525676666157653, + "learning_rate": 6.695905183295031e-06, + "loss": 0.4588, + "step": 10481 + }, + { + "epoch": 1.7214295978486236, + "grad_norm": 0.2727601150327771, + "learning_rate": 6.695495826450501e-06, + "loss": 0.4584, + "step": 10482 + }, + { + "epoch": 1.7215938250569662, + "grad_norm": 0.32149351547771154, + "learning_rate": 6.695086445760725e-06, + "loss": 0.456, + "step": 10483 + }, + { + "epoch": 1.721758052265309, + "grad_norm": 0.39117096357713665, + "learning_rate": 6.694677041230152e-06, + "loss": 0.4586, + "step": 10484 + }, + { + "epoch": 1.721922279473652, + "grad_norm": 0.3322750991270649, + "learning_rate": 6.694267612863227e-06, + "loss": 0.4635, + "step": 10485 + }, + { + "epoch": 1.7220865066819946, + "grad_norm": 0.32418802055402957, + "learning_rate": 6.693858160664394e-06, + "loss": 0.447, + "step": 10486 + }, + { + "epoch": 1.7222507338903372, + "grad_norm": 0.2675172834941799, + "learning_rate": 6.693448684638106e-06, + "loss": 0.435, + "step": 10487 + }, + { + "epoch": 1.7224149610986799, + "grad_norm": 0.2947083214764386, + "learning_rate": 6.693039184788806e-06, + "loss": 0.4471, + "step": 10488 + }, + { + "epoch": 1.7225791883070227, + "grad_norm": 0.29546155758230347, + "learning_rate": 6.692629661120944e-06, + "loss": 0.4609, + "step": 10489 + }, + { + "epoch": 1.7227434155153656, + "grad_norm": 0.3072763602251934, + "learning_rate": 6.692220113638965e-06, + "loss": 0.4698, + "step": 10490 + }, + { + "epoch": 1.7229076427237082, + "grad_norm": 0.2612485647618418, + "learning_rate": 6.691810542347319e-06, + "loss": 0.4593, + "step": 10491 + }, + { + "epoch": 1.7230718699320509, + "grad_norm": 0.27610091859399394, + "learning_rate": 6.691400947250454e-06, + "loss": 0.4466, + "step": 10492 + }, + { + "epoch": 1.7232360971403937, + "grad_norm": 0.2994420717725685, + "learning_rate": 6.690991328352819e-06, + "loss": 0.4596, + "step": 10493 + }, + { + "epoch": 1.7234003243487366, + "grad_norm": 0.3337391439613997, + "learning_rate": 6.690581685658863e-06, + "loss": 0.4519, + "step": 10494 + }, + { + "epoch": 1.7235645515570792, + "grad_norm": 0.283817248370631, + "learning_rate": 6.690172019173035e-06, + "loss": 0.4538, + "step": 10495 + }, + { + "epoch": 1.7237287787654219, + "grad_norm": 0.27336394201895664, + "learning_rate": 6.689762328899783e-06, + "loss": 0.4741, + "step": 10496 + }, + { + "epoch": 1.7238930059737647, + "grad_norm": 0.4125973348785593, + "learning_rate": 6.689352614843557e-06, + "loss": 0.476, + "step": 10497 + }, + { + "epoch": 1.7240572331821076, + "grad_norm": 0.35607773234902174, + "learning_rate": 6.688942877008809e-06, + "loss": 0.4361, + "step": 10498 + }, + { + "epoch": 1.7242214603904502, + "grad_norm": 0.3158612111856992, + "learning_rate": 6.688533115399987e-06, + "loss": 0.4657, + "step": 10499 + }, + { + "epoch": 1.7243856875987928, + "grad_norm": 0.30310414980860667, + "learning_rate": 6.688123330021543e-06, + "loss": 0.4622, + "step": 10500 + }, + { + "epoch": 1.7245499148071357, + "grad_norm": 0.29849694832345836, + "learning_rate": 6.687713520877926e-06, + "loss": 0.4584, + "step": 10501 + }, + { + "epoch": 1.7247141420154786, + "grad_norm": 0.2651875391881616, + "learning_rate": 6.687303687973588e-06, + "loss": 0.4753, + "step": 10502 + }, + { + "epoch": 1.7248783692238212, + "grad_norm": 0.31880123609608296, + "learning_rate": 6.68689383131298e-06, + "loss": 0.4422, + "step": 10503 + }, + { + "epoch": 1.7250425964321638, + "grad_norm": 0.30651866368968245, + "learning_rate": 6.6864839509005534e-06, + "loss": 0.4674, + "step": 10504 + }, + { + "epoch": 1.7252068236405065, + "grad_norm": 0.273843576978905, + "learning_rate": 6.6860740467407594e-06, + "loss": 0.4542, + "step": 10505 + }, + { + "epoch": 1.7253710508488493, + "grad_norm": 0.33736322841446215, + "learning_rate": 6.685664118838051e-06, + "loss": 0.4733, + "step": 10506 + }, + { + "epoch": 1.7255352780571922, + "grad_norm": 0.28310879953417917, + "learning_rate": 6.685254167196879e-06, + "loss": 0.4325, + "step": 10507 + }, + { + "epoch": 1.7256995052655348, + "grad_norm": 0.30625720125880435, + "learning_rate": 6.684844191821698e-06, + "loss": 0.4654, + "step": 10508 + }, + { + "epoch": 1.7258637324738775, + "grad_norm": 0.4041210802480253, + "learning_rate": 6.684434192716959e-06, + "loss": 0.4629, + "step": 10509 + }, + { + "epoch": 1.7260279596822203, + "grad_norm": 0.3285621834318125, + "learning_rate": 6.684024169887115e-06, + "loss": 0.4517, + "step": 10510 + }, + { + "epoch": 1.7261921868905632, + "grad_norm": 0.5371314885182225, + "learning_rate": 6.68361412333662e-06, + "loss": 0.4544, + "step": 10511 + }, + { + "epoch": 1.7263564140989058, + "grad_norm": 0.3346484736752237, + "learning_rate": 6.683204053069928e-06, + "loss": 0.4483, + "step": 10512 + }, + { + "epoch": 1.7265206413072485, + "grad_norm": 0.265535426288644, + "learning_rate": 6.68279395909149e-06, + "loss": 0.455, + "step": 10513 + }, + { + "epoch": 1.7266848685155913, + "grad_norm": 0.272151834494228, + "learning_rate": 6.682383841405764e-06, + "loss": 0.4627, + "step": 10514 + }, + { + "epoch": 1.7268490957239342, + "grad_norm": 0.2768667797354831, + "learning_rate": 6.681973700017202e-06, + "loss": 0.4622, + "step": 10515 + }, + { + "epoch": 1.7270133229322768, + "grad_norm": 0.2611288871413042, + "learning_rate": 6.681563534930258e-06, + "loss": 0.4692, + "step": 10516 + }, + { + "epoch": 1.7271775501406195, + "grad_norm": 0.30032015789813, + "learning_rate": 6.681153346149388e-06, + "loss": 0.4425, + "step": 10517 + }, + { + "epoch": 1.7273417773489623, + "grad_norm": 0.2765399919998032, + "learning_rate": 6.680743133679048e-06, + "loss": 0.4418, + "step": 10518 + }, + { + "epoch": 1.7275060045573052, + "grad_norm": 0.2637540347081802, + "learning_rate": 6.6803328975236904e-06, + "loss": 0.4579, + "step": 10519 + }, + { + "epoch": 1.7276702317656478, + "grad_norm": 0.405385672011455, + "learning_rate": 6.679922637687772e-06, + "loss": 0.4557, + "step": 10520 + }, + { + "epoch": 1.7278344589739905, + "grad_norm": 0.2911681189325491, + "learning_rate": 6.679512354175751e-06, + "loss": 0.4586, + "step": 10521 + }, + { + "epoch": 1.727998686182333, + "grad_norm": 0.2675727811944536, + "learning_rate": 6.679102046992079e-06, + "loss": 0.4376, + "step": 10522 + }, + { + "epoch": 1.728162913390676, + "grad_norm": 0.2817047223528335, + "learning_rate": 6.678691716141217e-06, + "loss": 0.4556, + "step": 10523 + }, + { + "epoch": 1.7283271405990188, + "grad_norm": 0.30171073273169, + "learning_rate": 6.678281361627619e-06, + "loss": 0.433, + "step": 10524 + }, + { + "epoch": 1.7284913678073615, + "grad_norm": 0.33379983997579427, + "learning_rate": 6.677870983455741e-06, + "loss": 0.4654, + "step": 10525 + }, + { + "epoch": 1.728655595015704, + "grad_norm": 0.2722756149058063, + "learning_rate": 6.677460581630043e-06, + "loss": 0.4506, + "step": 10526 + }, + { + "epoch": 1.728819822224047, + "grad_norm": 0.8752736675862947, + "learning_rate": 6.677050156154979e-06, + "loss": 0.4607, + "step": 10527 + }, + { + "epoch": 1.7289840494323898, + "grad_norm": 0.8938119846179062, + "learning_rate": 6.67663970703501e-06, + "loss": 0.4628, + "step": 10528 + }, + { + "epoch": 1.7291482766407325, + "grad_norm": 0.29687175309764746, + "learning_rate": 6.676229234274592e-06, + "loss": 0.4473, + "step": 10529 + }, + { + "epoch": 1.729312503849075, + "grad_norm": 0.2983652847761805, + "learning_rate": 6.675818737878183e-06, + "loss": 0.464, + "step": 10530 + }, + { + "epoch": 1.729476731057418, + "grad_norm": 0.2882164982567439, + "learning_rate": 6.67540821785024e-06, + "loss": 0.4656, + "step": 10531 + }, + { + "epoch": 1.7296409582657608, + "grad_norm": 0.3597729555329601, + "learning_rate": 6.674997674195225e-06, + "loss": 0.4854, + "step": 10532 + }, + { + "epoch": 1.7298051854741034, + "grad_norm": 0.23773034919142239, + "learning_rate": 6.674587106917597e-06, + "loss": 0.419, + "step": 10533 + }, + { + "epoch": 1.729969412682446, + "grad_norm": 0.3158190970863376, + "learning_rate": 6.674176516021812e-06, + "loss": 0.457, + "step": 10534 + }, + { + "epoch": 1.730133639890789, + "grad_norm": 0.32428977605563186, + "learning_rate": 6.6737659015123315e-06, + "loss": 0.4549, + "step": 10535 + }, + { + "epoch": 1.7302978670991318, + "grad_norm": 0.2866692455873746, + "learning_rate": 6.673355263393612e-06, + "loss": 0.4351, + "step": 10536 + }, + { + "epoch": 1.7304620943074744, + "grad_norm": 0.38551922492958735, + "learning_rate": 6.6729446016701195e-06, + "loss": 0.475, + "step": 10537 + }, + { + "epoch": 1.730626321515817, + "grad_norm": 0.5243113966027705, + "learning_rate": 6.672533916346309e-06, + "loss": 0.475, + "step": 10538 + }, + { + "epoch": 1.7307905487241597, + "grad_norm": 0.2515301096578204, + "learning_rate": 6.672123207426644e-06, + "loss": 0.4438, + "step": 10539 + }, + { + "epoch": 1.7309547759325026, + "grad_norm": 0.3074191829940396, + "learning_rate": 6.671712474915583e-06, + "loss": 0.4616, + "step": 10540 + }, + { + "epoch": 1.7311190031408454, + "grad_norm": 0.2880000746192107, + "learning_rate": 6.671301718817586e-06, + "loss": 0.4564, + "step": 10541 + }, + { + "epoch": 1.731283230349188, + "grad_norm": 0.2962969244487233, + "learning_rate": 6.67089093913712e-06, + "loss": 0.471, + "step": 10542 + }, + { + "epoch": 1.7314474575575307, + "grad_norm": 0.27218947530068305, + "learning_rate": 6.67048013587864e-06, + "loss": 0.4569, + "step": 10543 + }, + { + "epoch": 1.7316116847658736, + "grad_norm": 0.3374040851327258, + "learning_rate": 6.670069309046611e-06, + "loss": 0.4434, + "step": 10544 + }, + { + "epoch": 1.7317759119742164, + "grad_norm": 0.3706121145690397, + "learning_rate": 6.669658458645493e-06, + "loss": 0.4644, + "step": 10545 + }, + { + "epoch": 1.731940139182559, + "grad_norm": 0.3597240408303545, + "learning_rate": 6.669247584679751e-06, + "loss": 0.4718, + "step": 10546 + }, + { + "epoch": 1.7321043663909017, + "grad_norm": 0.34526245876029543, + "learning_rate": 6.668836687153844e-06, + "loss": 0.4502, + "step": 10547 + }, + { + "epoch": 1.7322685935992446, + "grad_norm": 0.25465094933578114, + "learning_rate": 6.668425766072239e-06, + "loss": 0.4451, + "step": 10548 + }, + { + "epoch": 1.7324328208075874, + "grad_norm": 0.2753508045274311, + "learning_rate": 6.6680148214393965e-06, + "loss": 0.4478, + "step": 10549 + }, + { + "epoch": 1.73259704801593, + "grad_norm": 0.2400898508376296, + "learning_rate": 6.667603853259779e-06, + "loss": 0.4785, + "step": 10550 + }, + { + "epoch": 1.7327612752242727, + "grad_norm": 0.2739499399340202, + "learning_rate": 6.667192861537851e-06, + "loss": 0.4763, + "step": 10551 + }, + { + "epoch": 1.7329255024326156, + "grad_norm": 0.2838378785010479, + "learning_rate": 6.666781846278077e-06, + "loss": 0.4431, + "step": 10552 + }, + { + "epoch": 1.7330897296409584, + "grad_norm": 0.3275742837311978, + "learning_rate": 6.6663708074849195e-06, + "loss": 0.4555, + "step": 10553 + }, + { + "epoch": 1.733253956849301, + "grad_norm": 0.35097480269581555, + "learning_rate": 6.665959745162845e-06, + "loss": 0.4681, + "step": 10554 + }, + { + "epoch": 1.7334181840576437, + "grad_norm": 0.9181025874191894, + "learning_rate": 6.6655486593163155e-06, + "loss": 0.4616, + "step": 10555 + }, + { + "epoch": 1.7335824112659863, + "grad_norm": 0.7894254312466302, + "learning_rate": 6.665137549949797e-06, + "loss": 0.4539, + "step": 10556 + }, + { + "epoch": 1.7337466384743292, + "grad_norm": 0.43038099202683344, + "learning_rate": 6.664726417067755e-06, + "loss": 0.4593, + "step": 10557 + }, + { + "epoch": 1.733910865682672, + "grad_norm": 0.4093328771869033, + "learning_rate": 6.664315260674654e-06, + "loss": 0.4361, + "step": 10558 + }, + { + "epoch": 1.7340750928910147, + "grad_norm": 0.3082230765897813, + "learning_rate": 6.66390408077496e-06, + "loss": 0.4586, + "step": 10559 + }, + { + "epoch": 1.7342393200993573, + "grad_norm": 0.32275708574988343, + "learning_rate": 6.663492877373138e-06, + "loss": 0.4461, + "step": 10560 + }, + { + "epoch": 1.7344035473077002, + "grad_norm": 0.31035788426403466, + "learning_rate": 6.663081650473655e-06, + "loss": 0.4617, + "step": 10561 + }, + { + "epoch": 1.734567774516043, + "grad_norm": 0.31601675827456194, + "learning_rate": 6.662670400080978e-06, + "loss": 0.4706, + "step": 10562 + }, + { + "epoch": 1.7347320017243857, + "grad_norm": 0.33103811875238215, + "learning_rate": 6.662259126199573e-06, + "loss": 0.4919, + "step": 10563 + }, + { + "epoch": 1.7348962289327283, + "grad_norm": 0.41689720596390145, + "learning_rate": 6.661847828833905e-06, + "loss": 0.46, + "step": 10564 + }, + { + "epoch": 1.7350604561410712, + "grad_norm": 0.35258440749097886, + "learning_rate": 6.661436507988442e-06, + "loss": 0.4874, + "step": 10565 + }, + { + "epoch": 1.735224683349414, + "grad_norm": 0.38367876380098803, + "learning_rate": 6.6610251636676536e-06, + "loss": 0.4711, + "step": 10566 + }, + { + "epoch": 1.7353889105577567, + "grad_norm": 0.35911279259469897, + "learning_rate": 6.660613795876007e-06, + "loss": 0.4483, + "step": 10567 + }, + { + "epoch": 1.7355531377660993, + "grad_norm": 0.2888127223228145, + "learning_rate": 6.6602024046179665e-06, + "loss": 0.4533, + "step": 10568 + }, + { + "epoch": 1.7357173649744422, + "grad_norm": 0.2798102991264217, + "learning_rate": 6.659790989898002e-06, + "loss": 0.4543, + "step": 10569 + }, + { + "epoch": 1.735881592182785, + "grad_norm": 0.32600684688107623, + "learning_rate": 6.659379551720584e-06, + "loss": 0.4565, + "step": 10570 + }, + { + "epoch": 1.7360458193911277, + "grad_norm": 0.3049264362217875, + "learning_rate": 6.658968090090179e-06, + "loss": 0.4613, + "step": 10571 + }, + { + "epoch": 1.7362100465994703, + "grad_norm": 0.27547939138967337, + "learning_rate": 6.658556605011257e-06, + "loss": 0.4423, + "step": 10572 + }, + { + "epoch": 1.736374273807813, + "grad_norm": 0.31268202539302475, + "learning_rate": 6.6581450964882865e-06, + "loss": 0.4649, + "step": 10573 + }, + { + "epoch": 1.7365385010161558, + "grad_norm": 0.2947480651912641, + "learning_rate": 6.6577335645257356e-06, + "loss": 0.4667, + "step": 10574 + }, + { + "epoch": 1.7367027282244987, + "grad_norm": 0.3829571158229656, + "learning_rate": 6.657322009128077e-06, + "loss": 0.4471, + "step": 10575 + }, + { + "epoch": 1.7368669554328413, + "grad_norm": 0.9143886610861583, + "learning_rate": 6.656910430299777e-06, + "loss": 0.4641, + "step": 10576 + }, + { + "epoch": 1.737031182641184, + "grad_norm": 0.27884086419625775, + "learning_rate": 6.65649882804531e-06, + "loss": 0.4774, + "step": 10577 + }, + { + "epoch": 1.7371954098495268, + "grad_norm": 0.33456889981474314, + "learning_rate": 6.656087202369142e-06, + "loss": 0.4481, + "step": 10578 + }, + { + "epoch": 1.7373596370578697, + "grad_norm": 0.30338483694912066, + "learning_rate": 6.655675553275747e-06, + "loss": 0.4408, + "step": 10579 + }, + { + "epoch": 1.7375238642662123, + "grad_norm": 0.29204954685142154, + "learning_rate": 6.655263880769593e-06, + "loss": 0.4561, + "step": 10580 + }, + { + "epoch": 1.737688091474555, + "grad_norm": 0.28477453765180205, + "learning_rate": 6.654852184855153e-06, + "loss": 0.473, + "step": 10581 + }, + { + "epoch": 1.7378523186828978, + "grad_norm": 0.3611993605343738, + "learning_rate": 6.654440465536899e-06, + "loss": 0.4816, + "step": 10582 + }, + { + "epoch": 1.7380165458912407, + "grad_norm": 0.2788356026362653, + "learning_rate": 6.654028722819303e-06, + "loss": 0.4878, + "step": 10583 + }, + { + "epoch": 1.7381807730995833, + "grad_norm": 0.33148173481642546, + "learning_rate": 6.653616956706834e-06, + "loss": 0.4545, + "step": 10584 + }, + { + "epoch": 1.738345000307926, + "grad_norm": 0.3035096767740506, + "learning_rate": 6.653205167203966e-06, + "loss": 0.4748, + "step": 10585 + }, + { + "epoch": 1.7385092275162688, + "grad_norm": 0.3493164699571448, + "learning_rate": 6.652793354315173e-06, + "loss": 0.4618, + "step": 10586 + }, + { + "epoch": 1.7386734547246117, + "grad_norm": 0.3404445961452418, + "learning_rate": 6.652381518044924e-06, + "loss": 0.471, + "step": 10587 + }, + { + "epoch": 1.7388376819329543, + "grad_norm": 0.33019863805209754, + "learning_rate": 6.651969658397696e-06, + "loss": 0.4659, + "step": 10588 + }, + { + "epoch": 1.739001909141297, + "grad_norm": 0.26683706560142134, + "learning_rate": 6.651557775377958e-06, + "loss": 0.4395, + "step": 10589 + }, + { + "epoch": 1.7391661363496396, + "grad_norm": 0.3633868009011593, + "learning_rate": 6.651145868990188e-06, + "loss": 0.4582, + "step": 10590 + }, + { + "epoch": 1.7393303635579824, + "grad_norm": 0.35217765494100506, + "learning_rate": 6.650733939238857e-06, + "loss": 0.4377, + "step": 10591 + }, + { + "epoch": 1.7394945907663253, + "grad_norm": 0.36982069809605017, + "learning_rate": 6.650321986128439e-06, + "loss": 0.4503, + "step": 10592 + }, + { + "epoch": 1.739658817974668, + "grad_norm": 0.307761272677984, + "learning_rate": 6.649910009663408e-06, + "loss": 0.4752, + "step": 10593 + }, + { + "epoch": 1.7398230451830106, + "grad_norm": 0.28942145592305923, + "learning_rate": 6.649498009848239e-06, + "loss": 0.4665, + "step": 10594 + }, + { + "epoch": 1.7399872723913534, + "grad_norm": 0.27124401320804664, + "learning_rate": 6.649085986687406e-06, + "loss": 0.4398, + "step": 10595 + }, + { + "epoch": 1.7401514995996963, + "grad_norm": 0.3261971310442718, + "learning_rate": 6.648673940185388e-06, + "loss": 0.4601, + "step": 10596 + }, + { + "epoch": 1.740315726808039, + "grad_norm": 0.2843498854236714, + "learning_rate": 6.6482618703466545e-06, + "loss": 0.4694, + "step": 10597 + }, + { + "epoch": 1.7404799540163816, + "grad_norm": 0.37333399523345706, + "learning_rate": 6.6478497771756845e-06, + "loss": 0.4482, + "step": 10598 + }, + { + "epoch": 1.7406441812247244, + "grad_norm": 0.2788317231506996, + "learning_rate": 6.647437660676951e-06, + "loss": 0.4513, + "step": 10599 + }, + { + "epoch": 1.7408084084330673, + "grad_norm": 0.31375562829679565, + "learning_rate": 6.647025520854934e-06, + "loss": 0.4624, + "step": 10600 + }, + { + "epoch": 1.74097263564141, + "grad_norm": 0.3615619772581296, + "learning_rate": 6.646613357714107e-06, + "loss": 0.4436, + "step": 10601 + }, + { + "epoch": 1.7411368628497526, + "grad_norm": 0.3315910833247448, + "learning_rate": 6.646201171258946e-06, + "loss": 0.4512, + "step": 10602 + }, + { + "epoch": 1.7413010900580954, + "grad_norm": 0.2708299030595272, + "learning_rate": 6.645788961493929e-06, + "loss": 0.4426, + "step": 10603 + }, + { + "epoch": 1.7414653172664383, + "grad_norm": 0.27840725603169686, + "learning_rate": 6.645376728423533e-06, + "loss": 0.4385, + "step": 10604 + }, + { + "epoch": 1.741629544474781, + "grad_norm": 0.2569884855267863, + "learning_rate": 6.644964472052234e-06, + "loss": 0.4575, + "step": 10605 + }, + { + "epoch": 1.7417937716831235, + "grad_norm": 0.28479141295519006, + "learning_rate": 6.644552192384512e-06, + "loss": 0.4424, + "step": 10606 + }, + { + "epoch": 1.7419579988914662, + "grad_norm": 0.2951760227251854, + "learning_rate": 6.644139889424842e-06, + "loss": 0.4657, + "step": 10607 + }, + { + "epoch": 1.742122226099809, + "grad_norm": 0.30186113653350727, + "learning_rate": 6.643727563177704e-06, + "loss": 0.4754, + "step": 10608 + }, + { + "epoch": 1.742286453308152, + "grad_norm": 0.2612584685088557, + "learning_rate": 6.643315213647575e-06, + "loss": 0.4453, + "step": 10609 + }, + { + "epoch": 1.7424506805164945, + "grad_norm": 0.6947221272165853, + "learning_rate": 6.642902840838934e-06, + "loss": 0.4571, + "step": 10610 + }, + { + "epoch": 1.7426149077248372, + "grad_norm": 0.3120014744182741, + "learning_rate": 6.6424904447562615e-06, + "loss": 0.4474, + "step": 10611 + }, + { + "epoch": 1.74277913493318, + "grad_norm": 0.3068805224277791, + "learning_rate": 6.642078025404033e-06, + "loss": 0.4649, + "step": 10612 + }, + { + "epoch": 1.742943362141523, + "grad_norm": 0.37785825035727366, + "learning_rate": 6.641665582786731e-06, + "loss": 0.4277, + "step": 10613 + }, + { + "epoch": 1.7431075893498655, + "grad_norm": 0.32257286709905886, + "learning_rate": 6.6412531169088325e-06, + "loss": 0.4476, + "step": 10614 + }, + { + "epoch": 1.7432718165582082, + "grad_norm": 0.27288879363747537, + "learning_rate": 6.6408406277748176e-06, + "loss": 0.4439, + "step": 10615 + }, + { + "epoch": 1.743436043766551, + "grad_norm": 0.29328108390781554, + "learning_rate": 6.640428115389168e-06, + "loss": 0.4833, + "step": 10616 + }, + { + "epoch": 1.743600270974894, + "grad_norm": 0.4506991914279768, + "learning_rate": 6.640015579756364e-06, + "loss": 0.4652, + "step": 10617 + }, + { + "epoch": 1.7437644981832365, + "grad_norm": 0.3087380589024369, + "learning_rate": 6.639603020880885e-06, + "loss": 0.4793, + "step": 10618 + }, + { + "epoch": 1.7439287253915792, + "grad_norm": 0.34152603811976384, + "learning_rate": 6.639190438767211e-06, + "loss": 0.4486, + "step": 10619 + }, + { + "epoch": 1.744092952599922, + "grad_norm": 0.3100709807557343, + "learning_rate": 6.638777833419825e-06, + "loss": 0.4602, + "step": 10620 + }, + { + "epoch": 1.744257179808265, + "grad_norm": 0.3054017542582119, + "learning_rate": 6.638365204843209e-06, + "loss": 0.4642, + "step": 10621 + }, + { + "epoch": 1.7444214070166075, + "grad_norm": 0.34891463707536224, + "learning_rate": 6.637952553041842e-06, + "loss": 0.4696, + "step": 10622 + }, + { + "epoch": 1.7445856342249502, + "grad_norm": 0.3379641255537083, + "learning_rate": 6.637539878020205e-06, + "loss": 0.46, + "step": 10623 + }, + { + "epoch": 1.7447498614332928, + "grad_norm": 0.4668367355877062, + "learning_rate": 6.637127179782782e-06, + "loss": 0.4542, + "step": 10624 + }, + { + "epoch": 1.7449140886416357, + "grad_norm": 0.8209300021742912, + "learning_rate": 6.636714458334057e-06, + "loss": 0.4623, + "step": 10625 + }, + { + "epoch": 1.7450783158499785, + "grad_norm": 0.28705239054834886, + "learning_rate": 6.636301713678511e-06, + "loss": 0.4507, + "step": 10626 + }, + { + "epoch": 1.7452425430583212, + "grad_norm": 0.3355537786733906, + "learning_rate": 6.635888945820625e-06, + "loss": 0.4302, + "step": 10627 + }, + { + "epoch": 1.7454067702666638, + "grad_norm": 0.37192872092910106, + "learning_rate": 6.635476154764884e-06, + "loss": 0.452, + "step": 10628 + }, + { + "epoch": 1.7455709974750067, + "grad_norm": 0.26525282249639376, + "learning_rate": 6.635063340515772e-06, + "loss": 0.4537, + "step": 10629 + }, + { + "epoch": 1.7457352246833495, + "grad_norm": 0.30958646418527175, + "learning_rate": 6.63465050307777e-06, + "loss": 0.4485, + "step": 10630 + }, + { + "epoch": 1.7458994518916922, + "grad_norm": 0.33668426160861986, + "learning_rate": 6.634237642455365e-06, + "loss": 0.4624, + "step": 10631 + }, + { + "epoch": 1.7460636791000348, + "grad_norm": 0.3366417579129416, + "learning_rate": 6.633824758653038e-06, + "loss": 0.4421, + "step": 10632 + }, + { + "epoch": 1.7462279063083777, + "grad_norm": 0.3456242095455785, + "learning_rate": 6.633411851675275e-06, + "loss": 0.4655, + "step": 10633 + }, + { + "epoch": 1.7463921335167205, + "grad_norm": 0.32458901913886634, + "learning_rate": 6.63299892152656e-06, + "loss": 0.4458, + "step": 10634 + }, + { + "epoch": 1.7465563607250632, + "grad_norm": 0.29732800671455417, + "learning_rate": 6.632585968211379e-06, + "loss": 0.4663, + "step": 10635 + }, + { + "epoch": 1.7467205879334058, + "grad_norm": 0.35012774391383994, + "learning_rate": 6.632172991734216e-06, + "loss": 0.4716, + "step": 10636 + }, + { + "epoch": 1.7468848151417486, + "grad_norm": 0.37338988277876983, + "learning_rate": 6.6317599920995555e-06, + "loss": 0.4591, + "step": 10637 + }, + { + "epoch": 1.7470490423500915, + "grad_norm": 0.3799089211952642, + "learning_rate": 6.631346969311886e-06, + "loss": 0.4604, + "step": 10638 + }, + { + "epoch": 1.7472132695584341, + "grad_norm": 0.2894958279907993, + "learning_rate": 6.6309339233756894e-06, + "loss": 0.4603, + "step": 10639 + }, + { + "epoch": 1.7473774967667768, + "grad_norm": 0.3575252568448904, + "learning_rate": 6.630520854295455e-06, + "loss": 0.4781, + "step": 10640 + }, + { + "epoch": 1.7475417239751194, + "grad_norm": 0.27491260487392993, + "learning_rate": 6.630107762075668e-06, + "loss": 0.4522, + "step": 10641 + }, + { + "epoch": 1.7477059511834623, + "grad_norm": 0.28858432050151406, + "learning_rate": 6.629694646720815e-06, + "loss": 0.4569, + "step": 10642 + }, + { + "epoch": 1.7478701783918051, + "grad_norm": 0.25855245592740905, + "learning_rate": 6.6292815082353825e-06, + "loss": 0.4733, + "step": 10643 + }, + { + "epoch": 1.7480344056001478, + "grad_norm": 0.28350226598092165, + "learning_rate": 6.628868346623858e-06, + "loss": 0.4555, + "step": 10644 + }, + { + "epoch": 1.7481986328084904, + "grad_norm": 0.3637068588878069, + "learning_rate": 6.6284551618907284e-06, + "loss": 0.4548, + "step": 10645 + }, + { + "epoch": 1.7483628600168333, + "grad_norm": 0.3527726874714712, + "learning_rate": 6.628041954040482e-06, + "loss": 0.4544, + "step": 10646 + }, + { + "epoch": 1.7485270872251761, + "grad_norm": 0.26024031603615716, + "learning_rate": 6.627628723077606e-06, + "loss": 0.46, + "step": 10647 + }, + { + "epoch": 1.7486913144335188, + "grad_norm": 0.3288597058710364, + "learning_rate": 6.627215469006589e-06, + "loss": 0.447, + "step": 10648 + }, + { + "epoch": 1.7488555416418614, + "grad_norm": 0.25892417052957034, + "learning_rate": 6.626802191831919e-06, + "loss": 0.4713, + "step": 10649 + }, + { + "epoch": 1.7490197688502043, + "grad_norm": 0.4091246145076616, + "learning_rate": 6.626388891558086e-06, + "loss": 0.4735, + "step": 10650 + }, + { + "epoch": 1.7491839960585471, + "grad_norm": 0.25989391360676, + "learning_rate": 6.625975568189575e-06, + "loss": 0.4459, + "step": 10651 + }, + { + "epoch": 1.7493482232668898, + "grad_norm": 0.34067425289731734, + "learning_rate": 6.62556222173088e-06, + "loss": 0.4395, + "step": 10652 + }, + { + "epoch": 1.7495124504752324, + "grad_norm": 0.36332303622096074, + "learning_rate": 6.625148852186485e-06, + "loss": 0.466, + "step": 10653 + }, + { + "epoch": 1.7496766776835753, + "grad_norm": 0.40636172529249653, + "learning_rate": 6.624735459560886e-06, + "loss": 0.4526, + "step": 10654 + }, + { + "epoch": 1.7498409048919181, + "grad_norm": 0.35763187050169826, + "learning_rate": 6.6243220438585685e-06, + "loss": 0.4465, + "step": 10655 + }, + { + "epoch": 1.7500051321002608, + "grad_norm": 0.2928042263761072, + "learning_rate": 6.623908605084023e-06, + "loss": 0.4452, + "step": 10656 + }, + { + "epoch": 1.7501693593086034, + "grad_norm": 0.32058362491336445, + "learning_rate": 6.623495143241739e-06, + "loss": 0.4424, + "step": 10657 + }, + { + "epoch": 1.750333586516946, + "grad_norm": 0.3051735078726969, + "learning_rate": 6.623081658336211e-06, + "loss": 0.4478, + "step": 10658 + }, + { + "epoch": 1.750497813725289, + "grad_norm": 0.2846557283714331, + "learning_rate": 6.622668150371925e-06, + "loss": 0.4746, + "step": 10659 + }, + { + "epoch": 1.7506620409336318, + "grad_norm": 0.2631461096010346, + "learning_rate": 6.622254619353377e-06, + "loss": 0.4806, + "step": 10660 + }, + { + "epoch": 1.7508262681419744, + "grad_norm": 0.27165081583272227, + "learning_rate": 6.621841065285054e-06, + "loss": 0.4629, + "step": 10661 + }, + { + "epoch": 1.750990495350317, + "grad_norm": 0.34897299039333585, + "learning_rate": 6.62142748817145e-06, + "loss": 0.4733, + "step": 10662 + }, + { + "epoch": 1.75115472255866, + "grad_norm": 0.2967986687922061, + "learning_rate": 6.621013888017057e-06, + "loss": 0.4721, + "step": 10663 + }, + { + "epoch": 1.7513189497670028, + "grad_norm": 0.2696164577840737, + "learning_rate": 6.620600264826365e-06, + "loss": 0.4533, + "step": 10664 + }, + { + "epoch": 1.7514831769753454, + "grad_norm": 0.30955662042741533, + "learning_rate": 6.620186618603869e-06, + "loss": 0.4681, + "step": 10665 + }, + { + "epoch": 1.751647404183688, + "grad_norm": 0.2663600728753576, + "learning_rate": 6.6197729493540595e-06, + "loss": 0.4527, + "step": 10666 + }, + { + "epoch": 1.7518116313920309, + "grad_norm": 0.39339227377818786, + "learning_rate": 6.619359257081431e-06, + "loss": 0.4404, + "step": 10667 + }, + { + "epoch": 1.7519758586003737, + "grad_norm": 0.2943000935613823, + "learning_rate": 6.618945541790474e-06, + "loss": 0.4437, + "step": 10668 + }, + { + "epoch": 1.7521400858087164, + "grad_norm": 0.29804979688660815, + "learning_rate": 6.618531803485686e-06, + "loss": 0.4505, + "step": 10669 + }, + { + "epoch": 1.752304313017059, + "grad_norm": 0.29331311411628713, + "learning_rate": 6.6181180421715574e-06, + "loss": 0.4579, + "step": 10670 + }, + { + "epoch": 1.7524685402254019, + "grad_norm": 0.2996471216969548, + "learning_rate": 6.617704257852583e-06, + "loss": 0.4488, + "step": 10671 + }, + { + "epoch": 1.7526327674337447, + "grad_norm": 0.45540878251191624, + "learning_rate": 6.6172904505332555e-06, + "loss": 0.4634, + "step": 10672 + }, + { + "epoch": 1.7527969946420874, + "grad_norm": 0.3484322554734612, + "learning_rate": 6.616876620218071e-06, + "loss": 0.4566, + "step": 10673 + }, + { + "epoch": 1.75296122185043, + "grad_norm": 0.293755535000031, + "learning_rate": 6.616462766911525e-06, + "loss": 0.4753, + "step": 10674 + }, + { + "epoch": 1.7531254490587727, + "grad_norm": 0.33996324247139664, + "learning_rate": 6.616048890618111e-06, + "loss": 0.4636, + "step": 10675 + }, + { + "epoch": 1.7532896762671155, + "grad_norm": 0.41606836561116783, + "learning_rate": 6.615634991342323e-06, + "loss": 0.4418, + "step": 10676 + }, + { + "epoch": 1.7534539034754584, + "grad_norm": 0.3132030327583419, + "learning_rate": 6.615221069088658e-06, + "loss": 0.4685, + "step": 10677 + }, + { + "epoch": 1.753618130683801, + "grad_norm": 0.2993572772718032, + "learning_rate": 6.614807123861611e-06, + "loss": 0.4639, + "step": 10678 + }, + { + "epoch": 1.7537823578921437, + "grad_norm": 0.4478176444935102, + "learning_rate": 6.614393155665678e-06, + "loss": 0.4475, + "step": 10679 + }, + { + "epoch": 1.7539465851004865, + "grad_norm": 0.30048533535202293, + "learning_rate": 6.613979164505355e-06, + "loss": 0.4403, + "step": 10680 + }, + { + "epoch": 1.7541108123088294, + "grad_norm": 0.2908512457695989, + "learning_rate": 6.613565150385138e-06, + "loss": 0.4559, + "step": 10681 + }, + { + "epoch": 1.754275039517172, + "grad_norm": 0.30416948079747663, + "learning_rate": 6.613151113309524e-06, + "loss": 0.4624, + "step": 10682 + }, + { + "epoch": 1.7544392667255146, + "grad_norm": 0.3456723939038373, + "learning_rate": 6.6127370532830105e-06, + "loss": 0.4458, + "step": 10683 + }, + { + "epoch": 1.7546034939338575, + "grad_norm": 0.2688557993639089, + "learning_rate": 6.612322970310094e-06, + "loss": 0.4395, + "step": 10684 + }, + { + "epoch": 1.7547677211422004, + "grad_norm": 0.2857108442653925, + "learning_rate": 6.6119088643952715e-06, + "loss": 0.4616, + "step": 10685 + }, + { + "epoch": 1.754931948350543, + "grad_norm": 0.2946948925070282, + "learning_rate": 6.611494735543041e-06, + "loss": 0.4428, + "step": 10686 + }, + { + "epoch": 1.7550961755588856, + "grad_norm": 0.2816529774982687, + "learning_rate": 6.611080583757899e-06, + "loss": 0.4602, + "step": 10687 + }, + { + "epoch": 1.7552604027672285, + "grad_norm": 0.290157879388694, + "learning_rate": 6.610666409044347e-06, + "loss": 0.4696, + "step": 10688 + }, + { + "epoch": 1.7554246299755714, + "grad_norm": 0.36955605038324496, + "learning_rate": 6.61025221140688e-06, + "loss": 0.4493, + "step": 10689 + }, + { + "epoch": 1.755588857183914, + "grad_norm": 0.3095960553714322, + "learning_rate": 6.609837990849999e-06, + "loss": 0.4315, + "step": 10690 + }, + { + "epoch": 1.7557530843922566, + "grad_norm": 0.47185918895177503, + "learning_rate": 6.609423747378199e-06, + "loss": 0.4817, + "step": 10691 + }, + { + "epoch": 1.7559173116005993, + "grad_norm": 0.28115335143313835, + "learning_rate": 6.609009480995984e-06, + "loss": 0.4312, + "step": 10692 + }, + { + "epoch": 1.7560815388089421, + "grad_norm": 0.30650451169190884, + "learning_rate": 6.608595191707851e-06, + "loss": 0.4793, + "step": 10693 + }, + { + "epoch": 1.756245766017285, + "grad_norm": 0.34971127443580935, + "learning_rate": 6.608180879518299e-06, + "loss": 0.4604, + "step": 10694 + }, + { + "epoch": 1.7564099932256276, + "grad_norm": 0.3094900938094681, + "learning_rate": 6.607766544431828e-06, + "loss": 0.4473, + "step": 10695 + }, + { + "epoch": 1.7565742204339703, + "grad_norm": 0.3187215929719953, + "learning_rate": 6.6073521864529395e-06, + "loss": 0.4662, + "step": 10696 + }, + { + "epoch": 1.7567384476423131, + "grad_norm": 0.3338168033020335, + "learning_rate": 6.606937805586132e-06, + "loss": 0.4582, + "step": 10697 + }, + { + "epoch": 1.756902674850656, + "grad_norm": 0.28507866277484556, + "learning_rate": 6.606523401835907e-06, + "loss": 0.4632, + "step": 10698 + }, + { + "epoch": 1.7570669020589986, + "grad_norm": 1.3822681586763514, + "learning_rate": 6.606108975206767e-06, + "loss": 0.4564, + "step": 10699 + }, + { + "epoch": 1.7572311292673413, + "grad_norm": 0.2818682573662767, + "learning_rate": 6.605694525703209e-06, + "loss": 0.4744, + "step": 10700 + }, + { + "epoch": 1.7573953564756841, + "grad_norm": 0.3082291536949848, + "learning_rate": 6.605280053329738e-06, + "loss": 0.4434, + "step": 10701 + }, + { + "epoch": 1.757559583684027, + "grad_norm": 0.30796809315309, + "learning_rate": 6.604865558090854e-06, + "loss": 0.4537, + "step": 10702 + }, + { + "epoch": 1.7577238108923696, + "grad_norm": 0.2785442087505172, + "learning_rate": 6.604451039991059e-06, + "loss": 0.4688, + "step": 10703 + }, + { + "epoch": 1.7578880381007123, + "grad_norm": 0.26598099036361045, + "learning_rate": 6.6040364990348556e-06, + "loss": 0.4381, + "step": 10704 + }, + { + "epoch": 1.7580522653090551, + "grad_norm": 0.3066077419739165, + "learning_rate": 6.603621935226746e-06, + "loss": 0.4783, + "step": 10705 + }, + { + "epoch": 1.758216492517398, + "grad_norm": 0.28792843857434686, + "learning_rate": 6.603207348571231e-06, + "loss": 0.4533, + "step": 10706 + }, + { + "epoch": 1.7583807197257406, + "grad_norm": 0.2912059270129017, + "learning_rate": 6.602792739072817e-06, + "loss": 0.4634, + "step": 10707 + }, + { + "epoch": 1.7585449469340833, + "grad_norm": 0.2938487886132544, + "learning_rate": 6.6023781067360035e-06, + "loss": 0.4729, + "step": 10708 + }, + { + "epoch": 1.758709174142426, + "grad_norm": 0.32239400228435916, + "learning_rate": 6.601963451565297e-06, + "loss": 0.461, + "step": 10709 + }, + { + "epoch": 1.7588734013507688, + "grad_norm": 0.37857542453820436, + "learning_rate": 6.601548773565197e-06, + "loss": 0.4429, + "step": 10710 + }, + { + "epoch": 1.7590376285591116, + "grad_norm": 0.3075074168414816, + "learning_rate": 6.601134072740211e-06, + "loss": 0.4403, + "step": 10711 + }, + { + "epoch": 1.7592018557674542, + "grad_norm": 0.7430190716889086, + "learning_rate": 6.600719349094841e-06, + "loss": 0.4448, + "step": 10712 + }, + { + "epoch": 1.7593660829757969, + "grad_norm": 0.44458514655301934, + "learning_rate": 6.600304602633594e-06, + "loss": 0.4563, + "step": 10713 + }, + { + "epoch": 1.7595303101841397, + "grad_norm": 0.5312318676070004, + "learning_rate": 6.5998898333609715e-06, + "loss": 0.4531, + "step": 10714 + }, + { + "epoch": 1.7596945373924826, + "grad_norm": 0.29153224613219314, + "learning_rate": 6.599475041281479e-06, + "loss": 0.4433, + "step": 10715 + }, + { + "epoch": 1.7598587646008252, + "grad_norm": 0.8133344159717972, + "learning_rate": 6.59906022639962e-06, + "loss": 0.4555, + "step": 10716 + }, + { + "epoch": 1.7600229918091679, + "grad_norm": 0.29733696086637973, + "learning_rate": 6.598645388719905e-06, + "loss": 0.448, + "step": 10717 + }, + { + "epoch": 1.7601872190175107, + "grad_norm": 0.3456375401995861, + "learning_rate": 6.598230528246835e-06, + "loss": 0.4571, + "step": 10718 + }, + { + "epoch": 1.7603514462258536, + "grad_norm": 0.3311963527266412, + "learning_rate": 6.597815644984918e-06, + "loss": 0.4506, + "step": 10719 + }, + { + "epoch": 1.7605156734341962, + "grad_norm": 0.38540894382757607, + "learning_rate": 6.597400738938658e-06, + "loss": 0.4657, + "step": 10720 + }, + { + "epoch": 1.7606799006425389, + "grad_norm": 0.3139327570135372, + "learning_rate": 6.596985810112563e-06, + "loss": 0.4371, + "step": 10721 + }, + { + "epoch": 1.7608441278508817, + "grad_norm": 0.2927446497689957, + "learning_rate": 6.596570858511138e-06, + "loss": 0.4377, + "step": 10722 + }, + { + "epoch": 1.7610083550592246, + "grad_norm": 0.3170970048028251, + "learning_rate": 6.5961558841388915e-06, + "loss": 0.4459, + "step": 10723 + }, + { + "epoch": 1.7611725822675672, + "grad_norm": 0.2895232317380216, + "learning_rate": 6.59574088700033e-06, + "loss": 0.4463, + "step": 10724 + }, + { + "epoch": 1.7613368094759099, + "grad_norm": 0.30132577539583216, + "learning_rate": 6.59532586709996e-06, + "loss": 0.4479, + "step": 10725 + }, + { + "epoch": 1.7615010366842525, + "grad_norm": 0.297712209809135, + "learning_rate": 6.59491082444229e-06, + "loss": 0.4812, + "step": 10726 + }, + { + "epoch": 1.7616652638925954, + "grad_norm": 0.30872107113301706, + "learning_rate": 6.594495759031826e-06, + "loss": 0.4532, + "step": 10727 + }, + { + "epoch": 1.7618294911009382, + "grad_norm": 0.28669341623275213, + "learning_rate": 6.594080670873079e-06, + "loss": 0.4717, + "step": 10728 + }, + { + "epoch": 1.7619937183092809, + "grad_norm": 0.3179139571288722, + "learning_rate": 6.593665559970555e-06, + "loss": 0.4736, + "step": 10729 + }, + { + "epoch": 1.7621579455176235, + "grad_norm": 0.32432497751797623, + "learning_rate": 6.5932504263287636e-06, + "loss": 0.4646, + "step": 10730 + }, + { + "epoch": 1.7623221727259664, + "grad_norm": 0.29703929983463007, + "learning_rate": 6.592835269952212e-06, + "loss": 0.4648, + "step": 10731 + }, + { + "epoch": 1.7624863999343092, + "grad_norm": 0.34219272476991125, + "learning_rate": 6.592420090845412e-06, + "loss": 0.4648, + "step": 10732 + }, + { + "epoch": 1.7626506271426519, + "grad_norm": 0.3240563250775775, + "learning_rate": 6.59200488901287e-06, + "loss": 0.4554, + "step": 10733 + }, + { + "epoch": 1.7628148543509945, + "grad_norm": 0.3625809225011669, + "learning_rate": 6.591589664459096e-06, + "loss": 0.4584, + "step": 10734 + }, + { + "epoch": 1.7629790815593374, + "grad_norm": 0.6478536789807493, + "learning_rate": 6.5911744171886016e-06, + "loss": 0.4771, + "step": 10735 + }, + { + "epoch": 1.7631433087676802, + "grad_norm": 0.3228958516773163, + "learning_rate": 6.590759147205895e-06, + "loss": 0.4507, + "step": 10736 + }, + { + "epoch": 1.7633075359760229, + "grad_norm": 0.26105724500638866, + "learning_rate": 6.590343854515487e-06, + "loss": 0.4801, + "step": 10737 + }, + { + "epoch": 1.7634717631843655, + "grad_norm": 0.3364783517266538, + "learning_rate": 6.589928539121889e-06, + "loss": 0.4448, + "step": 10738 + }, + { + "epoch": 1.7636359903927084, + "grad_norm": 0.2677527481328561, + "learning_rate": 6.589513201029609e-06, + "loss": 0.4705, + "step": 10739 + }, + { + "epoch": 1.7638002176010512, + "grad_norm": 0.28157915471787903, + "learning_rate": 6.589097840243162e-06, + "loss": 0.4264, + "step": 10740 + }, + { + "epoch": 1.7639644448093939, + "grad_norm": 0.2877591902737641, + "learning_rate": 6.588682456767055e-06, + "loss": 0.4307, + "step": 10741 + }, + { + "epoch": 1.7641286720177365, + "grad_norm": 0.3628181269741827, + "learning_rate": 6.588267050605803e-06, + "loss": 0.4588, + "step": 10742 + }, + { + "epoch": 1.7642928992260791, + "grad_norm": 0.3245284684820889, + "learning_rate": 6.587851621763916e-06, + "loss": 0.4455, + "step": 10743 + }, + { + "epoch": 1.764457126434422, + "grad_norm": 0.4464116642828582, + "learning_rate": 6.587436170245907e-06, + "loss": 0.446, + "step": 10744 + }, + { + "epoch": 1.7646213536427648, + "grad_norm": 0.3618431017109906, + "learning_rate": 6.587020696056285e-06, + "loss": 0.4663, + "step": 10745 + }, + { + "epoch": 1.7647855808511075, + "grad_norm": 0.7835075103239901, + "learning_rate": 6.586605199199567e-06, + "loss": 0.4709, + "step": 10746 + }, + { + "epoch": 1.7649498080594501, + "grad_norm": 0.30836303357783795, + "learning_rate": 6.586189679680263e-06, + "loss": 0.4497, + "step": 10747 + }, + { + "epoch": 1.765114035267793, + "grad_norm": 0.5197716220523791, + "learning_rate": 6.585774137502887e-06, + "loss": 0.4672, + "step": 10748 + }, + { + "epoch": 1.7652782624761358, + "grad_norm": 0.37370315931183823, + "learning_rate": 6.585358572671951e-06, + "loss": 0.4688, + "step": 10749 + }, + { + "epoch": 1.7654424896844785, + "grad_norm": 0.30915793884894854, + "learning_rate": 6.584942985191969e-06, + "loss": 0.4522, + "step": 10750 + }, + { + "epoch": 1.7656067168928211, + "grad_norm": 0.2850764149367957, + "learning_rate": 6.584527375067456e-06, + "loss": 0.4426, + "step": 10751 + }, + { + "epoch": 1.765770944101164, + "grad_norm": 0.3175691534431449, + "learning_rate": 6.584111742302924e-06, + "loss": 0.4661, + "step": 10752 + }, + { + "epoch": 1.7659351713095068, + "grad_norm": 0.2726331377292973, + "learning_rate": 6.583696086902888e-06, + "loss": 0.4386, + "step": 10753 + }, + { + "epoch": 1.7660993985178495, + "grad_norm": 0.3159980495549621, + "learning_rate": 6.583280408871862e-06, + "loss": 0.4281, + "step": 10754 + }, + { + "epoch": 1.766263625726192, + "grad_norm": 0.30476048593373595, + "learning_rate": 6.5828647082143624e-06, + "loss": 0.4386, + "step": 10755 + }, + { + "epoch": 1.766427852934535, + "grad_norm": 0.5184037524101708, + "learning_rate": 6.582448984934901e-06, + "loss": 0.4596, + "step": 10756 + }, + { + "epoch": 1.7665920801428778, + "grad_norm": 0.3719752223361219, + "learning_rate": 6.582033239037997e-06, + "loss": 0.4586, + "step": 10757 + }, + { + "epoch": 1.7667563073512205, + "grad_norm": 0.42817232060131705, + "learning_rate": 6.581617470528162e-06, + "loss": 0.4301, + "step": 10758 + }, + { + "epoch": 1.766920534559563, + "grad_norm": 0.274656189593246, + "learning_rate": 6.5812016794099144e-06, + "loss": 0.4472, + "step": 10759 + }, + { + "epoch": 1.7670847617679057, + "grad_norm": 0.3350111125117778, + "learning_rate": 6.5807858656877675e-06, + "loss": 0.4384, + "step": 10760 + }, + { + "epoch": 1.7672489889762486, + "grad_norm": 0.3961802579412799, + "learning_rate": 6.580370029366239e-06, + "loss": 0.4514, + "step": 10761 + }, + { + "epoch": 1.7674132161845915, + "grad_norm": 0.2810769194265865, + "learning_rate": 6.579954170449847e-06, + "loss": 0.4389, + "step": 10762 + }, + { + "epoch": 1.767577443392934, + "grad_norm": 0.2939767282116434, + "learning_rate": 6.5795382889431045e-06, + "loss": 0.4733, + "step": 10763 + }, + { + "epoch": 1.7677416706012767, + "grad_norm": 0.35139246676594316, + "learning_rate": 6.5791223848505305e-06, + "loss": 0.46, + "step": 10764 + }, + { + "epoch": 1.7679058978096196, + "grad_norm": 0.2923901618315435, + "learning_rate": 6.578706458176642e-06, + "loss": 0.4569, + "step": 10765 + }, + { + "epoch": 1.7680701250179625, + "grad_norm": 0.3661090744129942, + "learning_rate": 6.578290508925957e-06, + "loss": 0.4572, + "step": 10766 + }, + { + "epoch": 1.768234352226305, + "grad_norm": 0.3365126819034772, + "learning_rate": 6.577874537102991e-06, + "loss": 0.4611, + "step": 10767 + }, + { + "epoch": 1.7683985794346477, + "grad_norm": 0.3998899420071638, + "learning_rate": 6.577458542712263e-06, + "loss": 0.4459, + "step": 10768 + }, + { + "epoch": 1.7685628066429906, + "grad_norm": 0.32391092810763855, + "learning_rate": 6.5770425257582926e-06, + "loss": 0.4644, + "step": 10769 + }, + { + "epoch": 1.7687270338513335, + "grad_norm": 0.3529853865117939, + "learning_rate": 6.576626486245596e-06, + "loss": 0.4698, + "step": 10770 + }, + { + "epoch": 1.768891261059676, + "grad_norm": 0.2915016282853621, + "learning_rate": 6.576210424178693e-06, + "loss": 0.4494, + "step": 10771 + }, + { + "epoch": 1.7690554882680187, + "grad_norm": 0.29888633281109755, + "learning_rate": 6.575794339562103e-06, + "loss": 0.4623, + "step": 10772 + }, + { + "epoch": 1.7692197154763616, + "grad_norm": 0.33782021114077626, + "learning_rate": 6.575378232400343e-06, + "loss": 0.4722, + "step": 10773 + }, + { + "epoch": 1.7693839426847044, + "grad_norm": 0.33197396780011823, + "learning_rate": 6.574962102697932e-06, + "loss": 0.446, + "step": 10774 + }, + { + "epoch": 1.769548169893047, + "grad_norm": 0.3498934678387365, + "learning_rate": 6.574545950459393e-06, + "loss": 0.471, + "step": 10775 + }, + { + "epoch": 1.7697123971013897, + "grad_norm": 0.2838093993191466, + "learning_rate": 6.574129775689244e-06, + "loss": 0.4541, + "step": 10776 + }, + { + "epoch": 1.7698766243097324, + "grad_norm": 0.31636922709474874, + "learning_rate": 6.573713578392005e-06, + "loss": 0.4339, + "step": 10777 + }, + { + "epoch": 1.7700408515180752, + "grad_norm": 0.49559821090645556, + "learning_rate": 6.5732973585721955e-06, + "loss": 0.4695, + "step": 10778 + }, + { + "epoch": 1.770205078726418, + "grad_norm": 0.28710379603213987, + "learning_rate": 6.572881116234337e-06, + "loss": 0.4597, + "step": 10779 + }, + { + "epoch": 1.7703693059347607, + "grad_norm": 0.2641290921404477, + "learning_rate": 6.57246485138295e-06, + "loss": 0.4399, + "step": 10780 + }, + { + "epoch": 1.7705335331431034, + "grad_norm": 0.31354698332702563, + "learning_rate": 6.572048564022557e-06, + "loss": 0.4761, + "step": 10781 + }, + { + "epoch": 1.7706977603514462, + "grad_norm": 0.27293143060893843, + "learning_rate": 6.571632254157676e-06, + "loss": 0.4702, + "step": 10782 + }, + { + "epoch": 1.770861987559789, + "grad_norm": 0.2789244175301241, + "learning_rate": 6.571215921792832e-06, + "loss": 0.4639, + "step": 10783 + }, + { + "epoch": 1.7710262147681317, + "grad_norm": 0.3119449859394376, + "learning_rate": 6.570799566932545e-06, + "loss": 0.4298, + "step": 10784 + }, + { + "epoch": 1.7711904419764744, + "grad_norm": 0.4196702056062995, + "learning_rate": 6.570383189581336e-06, + "loss": 0.4453, + "step": 10785 + }, + { + "epoch": 1.7713546691848172, + "grad_norm": 0.3100662001480286, + "learning_rate": 6.569966789743731e-06, + "loss": 0.4666, + "step": 10786 + }, + { + "epoch": 1.77151889639316, + "grad_norm": 0.3417203555284162, + "learning_rate": 6.569550367424248e-06, + "loss": 0.4489, + "step": 10787 + }, + { + "epoch": 1.7716831236015027, + "grad_norm": 0.2997760203724878, + "learning_rate": 6.569133922627413e-06, + "loss": 0.4672, + "step": 10788 + }, + { + "epoch": 1.7718473508098453, + "grad_norm": 0.3650237571988705, + "learning_rate": 6.5687174553577475e-06, + "loss": 0.4398, + "step": 10789 + }, + { + "epoch": 1.7720115780181882, + "grad_norm": 0.40735904021853114, + "learning_rate": 6.568300965619775e-06, + "loss": 0.4582, + "step": 10790 + }, + { + "epoch": 1.772175805226531, + "grad_norm": 0.41041486738406796, + "learning_rate": 6.56788445341802e-06, + "loss": 0.4596, + "step": 10791 + }, + { + "epoch": 1.7723400324348737, + "grad_norm": 0.34668412608676713, + "learning_rate": 6.567467918757004e-06, + "loss": 0.4445, + "step": 10792 + }, + { + "epoch": 1.7725042596432163, + "grad_norm": 0.31068727442963523, + "learning_rate": 6.5670513616412525e-06, + "loss": 0.4421, + "step": 10793 + }, + { + "epoch": 1.772668486851559, + "grad_norm": 0.3138059771083177, + "learning_rate": 6.5666347820752895e-06, + "loss": 0.4358, + "step": 10794 + }, + { + "epoch": 1.7728327140599018, + "grad_norm": 0.2586739324491047, + "learning_rate": 6.5662181800636395e-06, + "loss": 0.4436, + "step": 10795 + }, + { + "epoch": 1.7729969412682447, + "grad_norm": 0.30324835791213506, + "learning_rate": 6.565801555610827e-06, + "loss": 0.4382, + "step": 10796 + }, + { + "epoch": 1.7731611684765873, + "grad_norm": 0.33744280019832007, + "learning_rate": 6.565384908721379e-06, + "loss": 0.4297, + "step": 10797 + }, + { + "epoch": 1.77332539568493, + "grad_norm": 0.2838435130478732, + "learning_rate": 6.564968239399816e-06, + "loss": 0.4328, + "step": 10798 + }, + { + "epoch": 1.7734896228932728, + "grad_norm": 0.29658021102022636, + "learning_rate": 6.564551547650668e-06, + "loss": 0.4576, + "step": 10799 + }, + { + "epoch": 1.7736538501016157, + "grad_norm": 0.3007907744005072, + "learning_rate": 6.564134833478459e-06, + "loss": 0.4695, + "step": 10800 + }, + { + "epoch": 1.7738386057110012, + "grad_norm": 0.285764971226959, + "learning_rate": 6.563718096887715e-06, + "loss": 0.4597, + "step": 10801 + }, + { + "epoch": 1.774002832919344, + "grad_norm": 0.28967808908806614, + "learning_rate": 6.56330133788296e-06, + "loss": 0.4587, + "step": 10802 + }, + { + "epoch": 1.7741670601276867, + "grad_norm": 0.2703467117021694, + "learning_rate": 6.562884556468725e-06, + "loss": 0.445, + "step": 10803 + }, + { + "epoch": 1.7743312873360293, + "grad_norm": 0.2627977347482234, + "learning_rate": 6.562467752649532e-06, + "loss": 0.4707, + "step": 10804 + }, + { + "epoch": 1.7744955145443722, + "grad_norm": 0.2966307541960856, + "learning_rate": 6.562050926429912e-06, + "loss": 0.4636, + "step": 10805 + }, + { + "epoch": 1.774659741752715, + "grad_norm": 0.39316975585417724, + "learning_rate": 6.561634077814389e-06, + "loss": 0.4592, + "step": 10806 + }, + { + "epoch": 1.7748239689610577, + "grad_norm": 0.28644623247691015, + "learning_rate": 6.561217206807491e-06, + "loss": 0.4627, + "step": 10807 + }, + { + "epoch": 1.7749881961694003, + "grad_norm": 0.38443342350967297, + "learning_rate": 6.5608003134137465e-06, + "loss": 0.4639, + "step": 10808 + }, + { + "epoch": 1.7751524233777431, + "grad_norm": 0.388144163612288, + "learning_rate": 6.560383397637684e-06, + "loss": 0.4733, + "step": 10809 + }, + { + "epoch": 1.775316650586086, + "grad_norm": 0.32983894347945997, + "learning_rate": 6.55996645948383e-06, + "loss": 0.4477, + "step": 10810 + }, + { + "epoch": 1.7754808777944286, + "grad_norm": 0.4331970351898739, + "learning_rate": 6.559549498956715e-06, + "loss": 0.4576, + "step": 10811 + }, + { + "epoch": 1.7756451050027713, + "grad_norm": 0.298687059323067, + "learning_rate": 6.559132516060865e-06, + "loss": 0.4534, + "step": 10812 + }, + { + "epoch": 1.775809332211114, + "grad_norm": 0.2718148542045194, + "learning_rate": 6.55871551080081e-06, + "loss": 0.4431, + "step": 10813 + }, + { + "epoch": 1.7759735594194568, + "grad_norm": 0.3192924286136304, + "learning_rate": 6.558298483181078e-06, + "loss": 0.4637, + "step": 10814 + }, + { + "epoch": 1.7761377866277996, + "grad_norm": 0.3229275703672246, + "learning_rate": 6.5578814332062e-06, + "loss": 0.4517, + "step": 10815 + }, + { + "epoch": 1.7763020138361423, + "grad_norm": 0.3074386241153964, + "learning_rate": 6.557464360880704e-06, + "loss": 0.4425, + "step": 10816 + }, + { + "epoch": 1.776466241044485, + "grad_norm": 0.2904906686506346, + "learning_rate": 6.557047266209123e-06, + "loss": 0.4571, + "step": 10817 + }, + { + "epoch": 1.7766304682528278, + "grad_norm": 0.27502918069961163, + "learning_rate": 6.556630149195984e-06, + "loss": 0.4583, + "step": 10818 + }, + { + "epoch": 1.7767946954611706, + "grad_norm": 0.34980850402681873, + "learning_rate": 6.5562130098458175e-06, + "loss": 0.4665, + "step": 10819 + }, + { + "epoch": 1.7769589226695133, + "grad_norm": 0.32905826580936176, + "learning_rate": 6.555795848163155e-06, + "loss": 0.4326, + "step": 10820 + }, + { + "epoch": 1.777123149877856, + "grad_norm": 0.31399784018566634, + "learning_rate": 6.5553786641525266e-06, + "loss": 0.4728, + "step": 10821 + }, + { + "epoch": 1.7772873770861988, + "grad_norm": 0.28220727942929263, + "learning_rate": 6.554961457818464e-06, + "loss": 0.4689, + "step": 10822 + }, + { + "epoch": 1.7774516042945416, + "grad_norm": 0.350203078583294, + "learning_rate": 6.554544229165498e-06, + "loss": 0.4778, + "step": 10823 + }, + { + "epoch": 1.7776158315028843, + "grad_norm": 0.43535540364275027, + "learning_rate": 6.55412697819816e-06, + "loss": 0.4483, + "step": 10824 + }, + { + "epoch": 1.777780058711227, + "grad_norm": 0.30355242827628565, + "learning_rate": 6.553709704920984e-06, + "loss": 0.4592, + "step": 10825 + }, + { + "epoch": 1.7779442859195698, + "grad_norm": 0.3460292247959962, + "learning_rate": 6.553292409338499e-06, + "loss": 0.4647, + "step": 10826 + }, + { + "epoch": 1.7781085131279126, + "grad_norm": 0.3144679395520833, + "learning_rate": 6.552875091455237e-06, + "loss": 0.4783, + "step": 10827 + }, + { + "epoch": 1.7782727403362553, + "grad_norm": 0.2956641392050019, + "learning_rate": 6.552457751275732e-06, + "loss": 0.463, + "step": 10828 + }, + { + "epoch": 1.778436967544598, + "grad_norm": 1.2627729567562462, + "learning_rate": 6.5520403888045175e-06, + "loss": 0.4429, + "step": 10829 + }, + { + "epoch": 1.7786011947529405, + "grad_norm": 0.2850528949825912, + "learning_rate": 6.551623004046125e-06, + "loss": 0.4428, + "step": 10830 + }, + { + "epoch": 1.7787654219612834, + "grad_norm": 0.3196432030904305, + "learning_rate": 6.551205597005088e-06, + "loss": 0.4663, + "step": 10831 + }, + { + "epoch": 1.7789296491696263, + "grad_norm": 0.2575516148408011, + "learning_rate": 6.550788167685941e-06, + "loss": 0.4293, + "step": 10832 + }, + { + "epoch": 1.779093876377969, + "grad_norm": 0.25567605917187286, + "learning_rate": 6.550370716093215e-06, + "loss": 0.4511, + "step": 10833 + }, + { + "epoch": 1.7792581035863115, + "grad_norm": 0.3174637713246571, + "learning_rate": 6.549953242231447e-06, + "loss": 0.4611, + "step": 10834 + }, + { + "epoch": 1.7794223307946544, + "grad_norm": 0.5439191573102424, + "learning_rate": 6.549535746105171e-06, + "loss": 0.4454, + "step": 10835 + }, + { + "epoch": 1.7795865580029973, + "grad_norm": 0.3170447961287215, + "learning_rate": 6.549118227718918e-06, + "loss": 0.4606, + "step": 10836 + }, + { + "epoch": 1.77975078521134, + "grad_norm": 0.36447664378677996, + "learning_rate": 6.548700687077226e-06, + "loss": 0.4711, + "step": 10837 + }, + { + "epoch": 1.7799150124196825, + "grad_norm": 0.2904792276977439, + "learning_rate": 6.5482831241846284e-06, + "loss": 0.4585, + "step": 10838 + }, + { + "epoch": 1.7800792396280254, + "grad_norm": 0.2719058857151288, + "learning_rate": 6.547865539045661e-06, + "loss": 0.467, + "step": 10839 + }, + { + "epoch": 1.7802434668363682, + "grad_norm": 0.3222765664208403, + "learning_rate": 6.54744793166486e-06, + "loss": 0.4411, + "step": 10840 + }, + { + "epoch": 1.7804076940447109, + "grad_norm": 0.3264407930977664, + "learning_rate": 6.547030302046759e-06, + "loss": 0.4493, + "step": 10841 + }, + { + "epoch": 1.7805719212530535, + "grad_norm": 0.3601154985976864, + "learning_rate": 6.546612650195897e-06, + "loss": 0.4593, + "step": 10842 + }, + { + "epoch": 1.7807361484613964, + "grad_norm": 0.29387278390753235, + "learning_rate": 6.546194976116805e-06, + "loss": 0.4364, + "step": 10843 + }, + { + "epoch": 1.7809003756697392, + "grad_norm": 0.32486471922003535, + "learning_rate": 6.545777279814024e-06, + "loss": 0.4613, + "step": 10844 + }, + { + "epoch": 1.7810646028780819, + "grad_norm": 0.3041127864331923, + "learning_rate": 6.5453595612920885e-06, + "loss": 0.4567, + "step": 10845 + }, + { + "epoch": 1.7812288300864245, + "grad_norm": 0.3548441166557751, + "learning_rate": 6.544941820555536e-06, + "loss": 0.4446, + "step": 10846 + }, + { + "epoch": 1.7813930572947672, + "grad_norm": 0.33470252658410204, + "learning_rate": 6.544524057608904e-06, + "loss": 0.4537, + "step": 10847 + }, + { + "epoch": 1.78155728450311, + "grad_norm": 0.2989380276370097, + "learning_rate": 6.544106272456727e-06, + "loss": 0.4557, + "step": 10848 + }, + { + "epoch": 1.7817215117114529, + "grad_norm": 0.2997943871744751, + "learning_rate": 6.543688465103548e-06, + "loss": 0.4559, + "step": 10849 + }, + { + "epoch": 1.7818857389197955, + "grad_norm": 0.2936293590695143, + "learning_rate": 6.5432706355538985e-06, + "loss": 0.4467, + "step": 10850 + }, + { + "epoch": 1.7820499661281382, + "grad_norm": 0.25972399431795706, + "learning_rate": 6.5428527838123215e-06, + "loss": 0.4515, + "step": 10851 + }, + { + "epoch": 1.782214193336481, + "grad_norm": 0.34117830092565526, + "learning_rate": 6.5424349098833534e-06, + "loss": 0.4497, + "step": 10852 + }, + { + "epoch": 1.7823784205448239, + "grad_norm": 0.40292122778145445, + "learning_rate": 6.542017013771531e-06, + "loss": 0.4445, + "step": 10853 + }, + { + "epoch": 1.7825426477531665, + "grad_norm": 0.29530119778119707, + "learning_rate": 6.541599095481396e-06, + "loss": 0.465, + "step": 10854 + }, + { + "epoch": 1.7827068749615091, + "grad_norm": 0.31448321131899243, + "learning_rate": 6.541181155017487e-06, + "loss": 0.4569, + "step": 10855 + }, + { + "epoch": 1.782871102169852, + "grad_norm": 0.28643134276254606, + "learning_rate": 6.540763192384341e-06, + "loss": 0.455, + "step": 10856 + }, + { + "epoch": 1.7830353293781949, + "grad_norm": 0.30659806432999265, + "learning_rate": 6.540345207586498e-06, + "loss": 0.4507, + "step": 10857 + }, + { + "epoch": 1.7831995565865375, + "grad_norm": 0.26947636257084084, + "learning_rate": 6.5399272006285e-06, + "loss": 0.4635, + "step": 10858 + }, + { + "epoch": 1.7833637837948801, + "grad_norm": 0.30243946446801434, + "learning_rate": 6.539509171514888e-06, + "loss": 0.4348, + "step": 10859 + }, + { + "epoch": 1.783528011003223, + "grad_norm": 0.29964948420515214, + "learning_rate": 6.539091120250196e-06, + "loss": 0.4556, + "step": 10860 + }, + { + "epoch": 1.7836922382115659, + "grad_norm": 0.4948757323995327, + "learning_rate": 6.53867304683897e-06, + "loss": 0.4437, + "step": 10861 + }, + { + "epoch": 1.7838564654199085, + "grad_norm": 0.27079970013608673, + "learning_rate": 6.538254951285747e-06, + "loss": 0.4639, + "step": 10862 + }, + { + "epoch": 1.7840206926282511, + "grad_norm": 0.4382147974067117, + "learning_rate": 6.5378368335950716e-06, + "loss": 0.4524, + "step": 10863 + }, + { + "epoch": 1.7841849198365938, + "grad_norm": 0.3276204275339086, + "learning_rate": 6.537418693771484e-06, + "loss": 0.4729, + "step": 10864 + }, + { + "epoch": 1.7843491470449366, + "grad_norm": 0.44798273637267755, + "learning_rate": 6.537000531819523e-06, + "loss": 0.4515, + "step": 10865 + }, + { + "epoch": 1.7845133742532795, + "grad_norm": 0.2958121807443287, + "learning_rate": 6.536582347743732e-06, + "loss": 0.4713, + "step": 10866 + }, + { + "epoch": 1.7846776014616221, + "grad_norm": 0.3204812397349806, + "learning_rate": 6.536164141548654e-06, + "loss": 0.4734, + "step": 10867 + }, + { + "epoch": 1.7848418286699648, + "grad_norm": 0.27522351558354125, + "learning_rate": 6.535745913238831e-06, + "loss": 0.4507, + "step": 10868 + }, + { + "epoch": 1.7850060558783076, + "grad_norm": 0.31930333118326126, + "learning_rate": 6.535327662818804e-06, + "loss": 0.4608, + "step": 10869 + }, + { + "epoch": 1.7851702830866505, + "grad_norm": 0.3058979140390943, + "learning_rate": 6.534909390293115e-06, + "loss": 0.4617, + "step": 10870 + }, + { + "epoch": 1.7853345102949931, + "grad_norm": 0.29387217939437715, + "learning_rate": 6.534491095666308e-06, + "loss": 0.4646, + "step": 10871 + }, + { + "epoch": 1.7854987375033358, + "grad_norm": 0.2930336057878312, + "learning_rate": 6.534072778942927e-06, + "loss": 0.4641, + "step": 10872 + }, + { + "epoch": 1.7856629647116786, + "grad_norm": 0.2594508867244782, + "learning_rate": 6.533654440127514e-06, + "loss": 0.4437, + "step": 10873 + }, + { + "epoch": 1.7858271919200215, + "grad_norm": 0.2716593847223371, + "learning_rate": 6.5332360792246125e-06, + "loss": 0.4558, + "step": 10874 + }, + { + "epoch": 1.7859914191283641, + "grad_norm": 0.28558707454108323, + "learning_rate": 6.532817696238766e-06, + "loss": 0.4584, + "step": 10875 + }, + { + "epoch": 1.7861556463367068, + "grad_norm": 0.35979502883415454, + "learning_rate": 6.532399291174521e-06, + "loss": 0.4617, + "step": 10876 + }, + { + "epoch": 1.7863198735450496, + "grad_norm": 0.28513575240988615, + "learning_rate": 6.531980864036419e-06, + "loss": 0.4546, + "step": 10877 + }, + { + "epoch": 1.7864841007533925, + "grad_norm": 0.3096452283539566, + "learning_rate": 6.531562414829007e-06, + "loss": 0.4776, + "step": 10878 + }, + { + "epoch": 1.7866483279617351, + "grad_norm": 0.3335856723002851, + "learning_rate": 6.5311439435568275e-06, + "loss": 0.4468, + "step": 10879 + }, + { + "epoch": 1.7868125551700778, + "grad_norm": 0.2934169575309989, + "learning_rate": 6.530725450224426e-06, + "loss": 0.4379, + "step": 10880 + }, + { + "epoch": 1.7869767823784204, + "grad_norm": 0.27505220382016676, + "learning_rate": 6.530306934836349e-06, + "loss": 0.4481, + "step": 10881 + }, + { + "epoch": 1.7871410095867633, + "grad_norm": 0.30139640369351783, + "learning_rate": 6.52988839739714e-06, + "loss": 0.4537, + "step": 10882 + }, + { + "epoch": 1.7873052367951061, + "grad_norm": 0.2989131874209402, + "learning_rate": 6.529469837911347e-06, + "loss": 0.427, + "step": 10883 + }, + { + "epoch": 1.7874694640034487, + "grad_norm": 0.5210680923032412, + "learning_rate": 6.529051256383515e-06, + "loss": 0.4633, + "step": 10884 + }, + { + "epoch": 1.7876336912117914, + "grad_norm": 0.29927676663364566, + "learning_rate": 6.528632652818189e-06, + "loss": 0.482, + "step": 10885 + }, + { + "epoch": 1.7877979184201342, + "grad_norm": 0.30191373230381446, + "learning_rate": 6.528214027219916e-06, + "loss": 0.4607, + "step": 10886 + }, + { + "epoch": 1.787962145628477, + "grad_norm": 0.293916130122298, + "learning_rate": 6.527795379593244e-06, + "loss": 0.4591, + "step": 10887 + }, + { + "epoch": 1.7881263728368197, + "grad_norm": 0.4352563222467592, + "learning_rate": 6.52737670994272e-06, + "loss": 0.4788, + "step": 10888 + }, + { + "epoch": 1.7882906000451624, + "grad_norm": 0.9674724434130556, + "learning_rate": 6.52695801827289e-06, + "loss": 0.4533, + "step": 10889 + }, + { + "epoch": 1.7884548272535052, + "grad_norm": 0.2905113824899683, + "learning_rate": 6.5265393045882995e-06, + "loss": 0.442, + "step": 10890 + }, + { + "epoch": 1.788619054461848, + "grad_norm": 0.2708018683006927, + "learning_rate": 6.5261205688935e-06, + "loss": 0.4559, + "step": 10891 + }, + { + "epoch": 1.7887832816701907, + "grad_norm": 0.32087345113081384, + "learning_rate": 6.525701811193037e-06, + "loss": 0.4751, + "step": 10892 + }, + { + "epoch": 1.7889475088785334, + "grad_norm": 0.37785896852038653, + "learning_rate": 6.52528303149146e-06, + "loss": 0.4552, + "step": 10893 + }, + { + "epoch": 1.7891117360868762, + "grad_norm": 0.430886228611665, + "learning_rate": 6.524864229793317e-06, + "loss": 0.4806, + "step": 10894 + }, + { + "epoch": 1.789275963295219, + "grad_norm": 0.290804423309023, + "learning_rate": 6.524445406103155e-06, + "loss": 0.4715, + "step": 10895 + }, + { + "epoch": 1.7894401905035617, + "grad_norm": 0.503444044730968, + "learning_rate": 6.524026560425525e-06, + "loss": 0.4692, + "step": 10896 + }, + { + "epoch": 1.7896044177119044, + "grad_norm": 0.31224124708725315, + "learning_rate": 6.523607692764976e-06, + "loss": 0.4474, + "step": 10897 + }, + { + "epoch": 1.789768644920247, + "grad_norm": 0.2946202941953378, + "learning_rate": 6.523188803126056e-06, + "loss": 0.4633, + "step": 10898 + }, + { + "epoch": 1.7899328721285899, + "grad_norm": 0.2758037376117042, + "learning_rate": 6.522769891513314e-06, + "loss": 0.4693, + "step": 10899 + }, + { + "epoch": 1.7900970993369327, + "grad_norm": 0.32328162302086266, + "learning_rate": 6.522350957931301e-06, + "loss": 0.4612, + "step": 10900 + }, + { + "epoch": 1.7902613265452754, + "grad_norm": 0.3282936126324164, + "learning_rate": 6.521932002384568e-06, + "loss": 0.4419, + "step": 10901 + }, + { + "epoch": 1.790425553753618, + "grad_norm": 0.389470936266045, + "learning_rate": 6.5215130248776625e-06, + "loss": 0.4575, + "step": 10902 + }, + { + "epoch": 1.7905897809619609, + "grad_norm": 0.35674404043827046, + "learning_rate": 6.521094025415138e-06, + "loss": 0.4636, + "step": 10903 + }, + { + "epoch": 1.7907540081703037, + "grad_norm": 0.45021462450787103, + "learning_rate": 6.520675004001544e-06, + "loss": 0.4805, + "step": 10904 + }, + { + "epoch": 1.7909182353786464, + "grad_norm": 0.3029242489250928, + "learning_rate": 6.520255960641431e-06, + "loss": 0.4538, + "step": 10905 + }, + { + "epoch": 1.791082462586989, + "grad_norm": 0.35349088557231206, + "learning_rate": 6.5198368953393505e-06, + "loss": 0.47, + "step": 10906 + }, + { + "epoch": 1.7912466897953319, + "grad_norm": 0.2986364223054352, + "learning_rate": 6.519417808099853e-06, + "loss": 0.4348, + "step": 10907 + }, + { + "epoch": 1.7914109170036747, + "grad_norm": 0.2913259353520882, + "learning_rate": 6.5189986989274925e-06, + "loss": 0.4525, + "step": 10908 + }, + { + "epoch": 1.7915751442120174, + "grad_norm": 0.3301936968659294, + "learning_rate": 6.518579567826821e-06, + "loss": 0.4729, + "step": 10909 + }, + { + "epoch": 1.79173937142036, + "grad_norm": 0.38417340767056996, + "learning_rate": 6.518160414802386e-06, + "loss": 0.4485, + "step": 10910 + }, + { + "epoch": 1.7919035986287029, + "grad_norm": 0.344684065145805, + "learning_rate": 6.517741239858746e-06, + "loss": 0.4345, + "step": 10911 + }, + { + "epoch": 1.7920678258370457, + "grad_norm": 0.2605300283061335, + "learning_rate": 6.5173220430004505e-06, + "loss": 0.4607, + "step": 10912 + }, + { + "epoch": 1.7922320530453884, + "grad_norm": 0.2659233305573, + "learning_rate": 6.5169028242320535e-06, + "loss": 0.4444, + "step": 10913 + }, + { + "epoch": 1.792396280253731, + "grad_norm": 0.4184644221606819, + "learning_rate": 6.516483583558105e-06, + "loss": 0.4402, + "step": 10914 + }, + { + "epoch": 1.7925605074620736, + "grad_norm": 0.3084055212966688, + "learning_rate": 6.5160643209831625e-06, + "loss": 0.4519, + "step": 10915 + }, + { + "epoch": 1.7927247346704165, + "grad_norm": 0.31404721713067635, + "learning_rate": 6.5156450365117775e-06, + "loss": 0.4551, + "step": 10916 + }, + { + "epoch": 1.7928889618787593, + "grad_norm": 0.3102731776923771, + "learning_rate": 6.515225730148504e-06, + "loss": 0.4377, + "step": 10917 + }, + { + "epoch": 1.793053189087102, + "grad_norm": 0.2839548794420706, + "learning_rate": 6.514806401897898e-06, + "loss": 0.457, + "step": 10918 + }, + { + "epoch": 1.7932174162954446, + "grad_norm": 0.3763346060238748, + "learning_rate": 6.51438705176451e-06, + "loss": 0.4337, + "step": 10919 + }, + { + "epoch": 1.7933816435037875, + "grad_norm": 0.3949426628078843, + "learning_rate": 6.513967679752898e-06, + "loss": 0.4731, + "step": 10920 + }, + { + "epoch": 1.7935458707121303, + "grad_norm": 0.3935792482843286, + "learning_rate": 6.513548285867615e-06, + "loss": 0.4436, + "step": 10921 + }, + { + "epoch": 1.793710097920473, + "grad_norm": 0.315815220534042, + "learning_rate": 6.513128870113217e-06, + "loss": 0.4442, + "step": 10922 + }, + { + "epoch": 1.7938743251288156, + "grad_norm": 0.34375092404008345, + "learning_rate": 6.51270943249426e-06, + "loss": 0.4646, + "step": 10923 + }, + { + "epoch": 1.7940385523371585, + "grad_norm": 0.31256531427721074, + "learning_rate": 6.512289973015296e-06, + "loss": 0.4668, + "step": 10924 + }, + { + "epoch": 1.7942027795455013, + "grad_norm": 0.37261118367690316, + "learning_rate": 6.511870491680884e-06, + "loss": 0.449, + "step": 10925 + }, + { + "epoch": 1.794367006753844, + "grad_norm": 0.3237806057691401, + "learning_rate": 6.511450988495579e-06, + "loss": 0.4706, + "step": 10926 + }, + { + "epoch": 1.7945312339621866, + "grad_norm": 0.3194253833424651, + "learning_rate": 6.511031463463938e-06, + "loss": 0.4614, + "step": 10927 + }, + { + "epoch": 1.7946954611705295, + "grad_norm": 0.32585747743809507, + "learning_rate": 6.510611916590516e-06, + "loss": 0.4661, + "step": 10928 + }, + { + "epoch": 1.7948596883788723, + "grad_norm": 0.3586057403191899, + "learning_rate": 6.51019234787987e-06, + "loss": 0.4613, + "step": 10929 + }, + { + "epoch": 1.795023915587215, + "grad_norm": 0.346056732069319, + "learning_rate": 6.5097727573365585e-06, + "loss": 0.4798, + "step": 10930 + }, + { + "epoch": 1.7951881427955576, + "grad_norm": 0.3126585681021275, + "learning_rate": 6.509353144965137e-06, + "loss": 0.4404, + "step": 10931 + }, + { + "epoch": 1.7953523700039002, + "grad_norm": 0.3535289870550478, + "learning_rate": 6.508933510770163e-06, + "loss": 0.4505, + "step": 10932 + }, + { + "epoch": 1.795516597212243, + "grad_norm": 0.3555341152205804, + "learning_rate": 6.508513854756194e-06, + "loss": 0.4561, + "step": 10933 + }, + { + "epoch": 1.795680824420586, + "grad_norm": 0.29507904491048875, + "learning_rate": 6.5080941769277895e-06, + "loss": 0.4435, + "step": 10934 + }, + { + "epoch": 1.7958450516289286, + "grad_norm": 0.5882288729640641, + "learning_rate": 6.5076744772895066e-06, + "loss": 0.456, + "step": 10935 + }, + { + "epoch": 1.7960092788372712, + "grad_norm": 0.45412600871907816, + "learning_rate": 6.507254755845903e-06, + "loss": 0.4564, + "step": 10936 + }, + { + "epoch": 1.796173506045614, + "grad_norm": 0.39575488914845575, + "learning_rate": 6.506835012601538e-06, + "loss": 0.4669, + "step": 10937 + }, + { + "epoch": 1.796337733253957, + "grad_norm": 0.48392008032248435, + "learning_rate": 6.50641524756097e-06, + "loss": 0.4518, + "step": 10938 + }, + { + "epoch": 1.7965019604622996, + "grad_norm": 0.5955498416084375, + "learning_rate": 6.505995460728759e-06, + "loss": 0.4419, + "step": 10939 + }, + { + "epoch": 1.7966661876706422, + "grad_norm": 0.32689911038398983, + "learning_rate": 6.505575652109464e-06, + "loss": 0.4531, + "step": 10940 + }, + { + "epoch": 1.796830414878985, + "grad_norm": 0.34195713441440434, + "learning_rate": 6.505155821707642e-06, + "loss": 0.4402, + "step": 10941 + }, + { + "epoch": 1.796994642087328, + "grad_norm": 0.2566054657671745, + "learning_rate": 6.504735969527858e-06, + "loss": 0.4404, + "step": 10942 + }, + { + "epoch": 1.7971588692956706, + "grad_norm": 0.32830307058291525, + "learning_rate": 6.504316095574668e-06, + "loss": 0.4713, + "step": 10943 + }, + { + "epoch": 1.7973230965040132, + "grad_norm": 0.2556867194390802, + "learning_rate": 6.503896199852632e-06, + "loss": 0.447, + "step": 10944 + }, + { + "epoch": 1.797487323712356, + "grad_norm": 0.33439323742377086, + "learning_rate": 6.503476282366313e-06, + "loss": 0.4523, + "step": 10945 + }, + { + "epoch": 1.797651550920699, + "grad_norm": 0.369790100012626, + "learning_rate": 6.50305634312027e-06, + "loss": 0.4555, + "step": 10946 + }, + { + "epoch": 1.7978157781290416, + "grad_norm": 0.31309996544936336, + "learning_rate": 6.502636382119064e-06, + "loss": 0.4618, + "step": 10947 + }, + { + "epoch": 1.7979800053373842, + "grad_norm": 0.28018315473575545, + "learning_rate": 6.5022163993672575e-06, + "loss": 0.47, + "step": 10948 + }, + { + "epoch": 1.7981442325457269, + "grad_norm": 0.33690209073758887, + "learning_rate": 6.5017963948694094e-06, + "loss": 0.4501, + "step": 10949 + }, + { + "epoch": 1.7983084597540697, + "grad_norm": 0.4252119518199385, + "learning_rate": 6.501376368630083e-06, + "loss": 0.4494, + "step": 10950 + }, + { + "epoch": 1.7984726869624126, + "grad_norm": 0.3752736500074051, + "learning_rate": 6.5009563206538426e-06, + "loss": 0.4607, + "step": 10951 + }, + { + "epoch": 1.7986369141707552, + "grad_norm": 0.35097152026242395, + "learning_rate": 6.500536250945247e-06, + "loss": 0.4517, + "step": 10952 + }, + { + "epoch": 1.7988011413790979, + "grad_norm": 0.3815338566089623, + "learning_rate": 6.500116159508858e-06, + "loss": 0.4392, + "step": 10953 + }, + { + "epoch": 1.7989653685874407, + "grad_norm": 0.47223690095875387, + "learning_rate": 6.49969604634924e-06, + "loss": 0.4736, + "step": 10954 + }, + { + "epoch": 1.7991295957957836, + "grad_norm": 0.35101255145850957, + "learning_rate": 6.499275911470957e-06, + "loss": 0.4562, + "step": 10955 + }, + { + "epoch": 1.7992938230041262, + "grad_norm": 0.6402324900212448, + "learning_rate": 6.498855754878569e-06, + "loss": 0.4366, + "step": 10956 + }, + { + "epoch": 1.7994580502124689, + "grad_norm": 0.3048648173075826, + "learning_rate": 6.498435576576641e-06, + "loss": 0.4421, + "step": 10957 + }, + { + "epoch": 1.7996222774208117, + "grad_norm": 0.34111976311589354, + "learning_rate": 6.498015376569737e-06, + "loss": 0.4375, + "step": 10958 + }, + { + "epoch": 1.7997865046291546, + "grad_norm": 0.30784589349110625, + "learning_rate": 6.49759515486242e-06, + "loss": 0.4567, + "step": 10959 + }, + { + "epoch": 1.7999507318374972, + "grad_norm": 0.31297748416124943, + "learning_rate": 6.497174911459255e-06, + "loss": 0.4631, + "step": 10960 + }, + { + "epoch": 1.8001149590458398, + "grad_norm": 0.3201823680748511, + "learning_rate": 6.496754646364805e-06, + "loss": 0.4813, + "step": 10961 + }, + { + "epoch": 1.8002791862541827, + "grad_norm": 0.2898450135675172, + "learning_rate": 6.496334359583635e-06, + "loss": 0.4512, + "step": 10962 + }, + { + "epoch": 1.8004434134625256, + "grad_norm": 0.3354166298240835, + "learning_rate": 6.4959140511203085e-06, + "loss": 0.4553, + "step": 10963 + }, + { + "epoch": 1.8006076406708682, + "grad_norm": 0.3826613416674712, + "learning_rate": 6.495493720979394e-06, + "loss": 0.4511, + "step": 10964 + }, + { + "epoch": 1.8007718678792108, + "grad_norm": 0.31587237057764445, + "learning_rate": 6.495073369165452e-06, + "loss": 0.4801, + "step": 10965 + }, + { + "epoch": 1.8009360950875535, + "grad_norm": 0.3074217908036942, + "learning_rate": 6.494652995683053e-06, + "loss": 0.4202, + "step": 10966 + }, + { + "epoch": 1.8011003222958963, + "grad_norm": 0.3223590332544111, + "learning_rate": 6.494232600536757e-06, + "loss": 0.4732, + "step": 10967 + }, + { + "epoch": 1.8012645495042392, + "grad_norm": 0.34342528444048104, + "learning_rate": 6.493812183731135e-06, + "loss": 0.4386, + "step": 10968 + }, + { + "epoch": 1.8014287767125818, + "grad_norm": 0.28160135914564627, + "learning_rate": 6.49339174527075e-06, + "loss": 0.428, + "step": 10969 + }, + { + "epoch": 1.8015930039209245, + "grad_norm": 0.32430009941633803, + "learning_rate": 6.492971285160169e-06, + "loss": 0.4391, + "step": 10970 + }, + { + "epoch": 1.8017572311292673, + "grad_norm": 0.364125940598261, + "learning_rate": 6.492550803403962e-06, + "loss": 0.4454, + "step": 10971 + }, + { + "epoch": 1.8019214583376102, + "grad_norm": 0.27861356365010376, + "learning_rate": 6.492130300006691e-06, + "loss": 0.4506, + "step": 10972 + }, + { + "epoch": 1.8020856855459528, + "grad_norm": 0.29793446395521767, + "learning_rate": 6.491709774972923e-06, + "loss": 0.4528, + "step": 10973 + }, + { + "epoch": 1.8022499127542955, + "grad_norm": 0.30617337367580927, + "learning_rate": 6.491289228307229e-06, + "loss": 0.4604, + "step": 10974 + }, + { + "epoch": 1.8024141399626383, + "grad_norm": 0.35553986727454073, + "learning_rate": 6.490868660014175e-06, + "loss": 0.4618, + "step": 10975 + }, + { + "epoch": 1.8025783671709812, + "grad_norm": 0.43792944372602643, + "learning_rate": 6.49044807009833e-06, + "loss": 0.444, + "step": 10976 + }, + { + "epoch": 1.8027425943793238, + "grad_norm": 0.309443097946476, + "learning_rate": 6.490027458564258e-06, + "loss": 0.4633, + "step": 10977 + }, + { + "epoch": 1.8029068215876665, + "grad_norm": 0.29035938841687275, + "learning_rate": 6.489606825416531e-06, + "loss": 0.4657, + "step": 10978 + }, + { + "epoch": 1.8030710487960093, + "grad_norm": 0.38280437482755186, + "learning_rate": 6.489186170659715e-06, + "loss": 0.4412, + "step": 10979 + }, + { + "epoch": 1.8032352760043522, + "grad_norm": 0.3268712043393675, + "learning_rate": 6.488765494298382e-06, + "loss": 0.461, + "step": 10980 + }, + { + "epoch": 1.8033995032126948, + "grad_norm": 0.5975993158922271, + "learning_rate": 6.488344796337099e-06, + "loss": 0.4575, + "step": 10981 + }, + { + "epoch": 1.8035637304210375, + "grad_norm": 0.28730055670229004, + "learning_rate": 6.487924076780434e-06, + "loss": 0.4605, + "step": 10982 + }, + { + "epoch": 1.80372795762938, + "grad_norm": 0.3067576354459948, + "learning_rate": 6.487503335632958e-06, + "loss": 0.4579, + "step": 10983 + }, + { + "epoch": 1.803892184837723, + "grad_norm": 0.3178578324928383, + "learning_rate": 6.48708257289924e-06, + "loss": 0.4553, + "step": 10984 + }, + { + "epoch": 1.8040564120460658, + "grad_norm": 0.31296466092563197, + "learning_rate": 6.486661788583851e-06, + "loss": 0.4701, + "step": 10985 + }, + { + "epoch": 1.8042206392544085, + "grad_norm": 0.27868046588972933, + "learning_rate": 6.4862409826913615e-06, + "loss": 0.4234, + "step": 10986 + }, + { + "epoch": 1.804384866462751, + "grad_norm": 0.2894081748376271, + "learning_rate": 6.485820155226339e-06, + "loss": 0.4575, + "step": 10987 + }, + { + "epoch": 1.804549093671094, + "grad_norm": 0.3840278851261531, + "learning_rate": 6.485399306193356e-06, + "loss": 0.4688, + "step": 10988 + }, + { + "epoch": 1.8047133208794368, + "grad_norm": 0.35329064565706103, + "learning_rate": 6.484978435596983e-06, + "loss": 0.4492, + "step": 10989 + }, + { + "epoch": 1.8048775480877794, + "grad_norm": 0.2666745016209278, + "learning_rate": 6.484557543441792e-06, + "loss": 0.451, + "step": 10990 + }, + { + "epoch": 1.805041775296122, + "grad_norm": 0.2727701044292131, + "learning_rate": 6.484136629732354e-06, + "loss": 0.4699, + "step": 10991 + }, + { + "epoch": 1.805206002504465, + "grad_norm": 0.2876975201051365, + "learning_rate": 6.483715694473239e-06, + "loss": 0.452, + "step": 10992 + }, + { + "epoch": 1.8053702297128078, + "grad_norm": 0.30486164251675607, + "learning_rate": 6.483294737669021e-06, + "loss": 0.4638, + "step": 10993 + }, + { + "epoch": 1.8055344569211504, + "grad_norm": 0.3242566132554888, + "learning_rate": 6.482873759324268e-06, + "loss": 0.4656, + "step": 10994 + }, + { + "epoch": 1.805698684129493, + "grad_norm": 0.27753825968593376, + "learning_rate": 6.4824527594435586e-06, + "loss": 0.4425, + "step": 10995 + }, + { + "epoch": 1.805862911337836, + "grad_norm": 0.32392349033908413, + "learning_rate": 6.48203173803146e-06, + "loss": 0.4592, + "step": 10996 + }, + { + "epoch": 1.8060271385461788, + "grad_norm": 0.2700411488707498, + "learning_rate": 6.481610695092547e-06, + "loss": 0.4622, + "step": 10997 + }, + { + "epoch": 1.8061913657545214, + "grad_norm": 0.3150824644921173, + "learning_rate": 6.481189630631392e-06, + "loss": 0.4466, + "step": 10998 + }, + { + "epoch": 1.806355592962864, + "grad_norm": 0.4169412074763378, + "learning_rate": 6.480768544652569e-06, + "loss": 0.4512, + "step": 10999 + }, + { + "epoch": 1.8065198201712067, + "grad_norm": 0.33008365407804763, + "learning_rate": 6.48034743716065e-06, + "loss": 0.4525, + "step": 11000 + }, + { + "epoch": 1.8066840473795496, + "grad_norm": 0.3184161468360843, + "learning_rate": 6.479926308160211e-06, + "loss": 0.4638, + "step": 11001 + }, + { + "epoch": 1.8068482745878924, + "grad_norm": 0.2894848075749476, + "learning_rate": 6.479505157655822e-06, + "loss": 0.443, + "step": 11002 + }, + { + "epoch": 1.807012501796235, + "grad_norm": 0.319981318393346, + "learning_rate": 6.4790839856520605e-06, + "loss": 0.4473, + "step": 11003 + }, + { + "epoch": 1.8071767290045777, + "grad_norm": 0.3540499062083888, + "learning_rate": 6.4786627921534985e-06, + "loss": 0.4304, + "step": 11004 + }, + { + "epoch": 1.8073409562129206, + "grad_norm": 0.294134878699911, + "learning_rate": 6.4782415771647145e-06, + "loss": 0.4544, + "step": 11005 + }, + { + "epoch": 1.8075051834212634, + "grad_norm": 0.4010107953685391, + "learning_rate": 6.47782034069028e-06, + "loss": 0.4226, + "step": 11006 + }, + { + "epoch": 1.807669410629606, + "grad_norm": 0.3958453380626719, + "learning_rate": 6.477399082734769e-06, + "loss": 0.4563, + "step": 11007 + }, + { + "epoch": 1.8078336378379487, + "grad_norm": 0.49181283823773797, + "learning_rate": 6.476977803302758e-06, + "loss": 0.4549, + "step": 11008 + }, + { + "epoch": 1.8079978650462916, + "grad_norm": 0.32707082518126557, + "learning_rate": 6.476556502398825e-06, + "loss": 0.437, + "step": 11009 + }, + { + "epoch": 1.8081620922546344, + "grad_norm": 0.32247170989748514, + "learning_rate": 6.476135180027544e-06, + "loss": 0.4545, + "step": 11010 + }, + { + "epoch": 1.808326319462977, + "grad_norm": 0.2571814591641584, + "learning_rate": 6.47571383619349e-06, + "loss": 0.4704, + "step": 11011 + }, + { + "epoch": 1.8084905466713197, + "grad_norm": 0.27433283480124443, + "learning_rate": 6.4752924709012385e-06, + "loss": 0.4397, + "step": 11012 + }, + { + "epoch": 1.8086547738796626, + "grad_norm": 0.30296470989492685, + "learning_rate": 6.474871084155368e-06, + "loss": 0.4827, + "step": 11013 + }, + { + "epoch": 1.8088190010880054, + "grad_norm": 0.329768978817267, + "learning_rate": 6.474449675960455e-06, + "loss": 0.4383, + "step": 11014 + }, + { + "epoch": 1.808983228296348, + "grad_norm": 0.3012850081596831, + "learning_rate": 6.474028246321077e-06, + "loss": 0.4354, + "step": 11015 + }, + { + "epoch": 1.8091474555046907, + "grad_norm": 0.3940716281617523, + "learning_rate": 6.47360679524181e-06, + "loss": 0.4473, + "step": 11016 + }, + { + "epoch": 1.8093116827130333, + "grad_norm": 0.29057796041348144, + "learning_rate": 6.473185322727228e-06, + "loss": 0.456, + "step": 11017 + }, + { + "epoch": 1.8094759099213762, + "grad_norm": 0.3647116420745295, + "learning_rate": 6.472763828781916e-06, + "loss": 0.4429, + "step": 11018 + }, + { + "epoch": 1.809640137129719, + "grad_norm": 0.4227525053223582, + "learning_rate": 6.472342313410446e-06, + "loss": 0.4582, + "step": 11019 + }, + { + "epoch": 1.8098043643380617, + "grad_norm": 0.3717185423227278, + "learning_rate": 6.471920776617399e-06, + "loss": 0.4441, + "step": 11020 + }, + { + "epoch": 1.8099685915464043, + "grad_norm": 0.27670914914335104, + "learning_rate": 6.471499218407351e-06, + "loss": 0.4658, + "step": 11021 + }, + { + "epoch": 1.8101328187547472, + "grad_norm": 0.34867878518606726, + "learning_rate": 6.471077638784882e-06, + "loss": 0.4565, + "step": 11022 + }, + { + "epoch": 1.81029704596309, + "grad_norm": 0.27587572155017304, + "learning_rate": 6.470656037754571e-06, + "loss": 0.4582, + "step": 11023 + }, + { + "epoch": 1.8104612731714327, + "grad_norm": 0.7304768832794305, + "learning_rate": 6.470234415320997e-06, + "loss": 0.4771, + "step": 11024 + }, + { + "epoch": 1.8106255003797753, + "grad_norm": 0.3823763993220649, + "learning_rate": 6.469812771488737e-06, + "loss": 0.4675, + "step": 11025 + }, + { + "epoch": 1.8107897275881182, + "grad_norm": 0.333588611062913, + "learning_rate": 6.469391106262375e-06, + "loss": 0.4481, + "step": 11026 + }, + { + "epoch": 1.810953954796461, + "grad_norm": 0.3375432724883843, + "learning_rate": 6.468969419646486e-06, + "loss": 0.4603, + "step": 11027 + }, + { + "epoch": 1.8111181820048037, + "grad_norm": 0.3195629064261066, + "learning_rate": 6.468547711645652e-06, + "loss": 0.4542, + "step": 11028 + }, + { + "epoch": 1.8112824092131463, + "grad_norm": 0.28394558610822923, + "learning_rate": 6.468125982264454e-06, + "loss": 0.4513, + "step": 11029 + }, + { + "epoch": 1.8114466364214892, + "grad_norm": 0.33674311931385514, + "learning_rate": 6.4677042315074715e-06, + "loss": 0.4662, + "step": 11030 + }, + { + "epoch": 1.811610863629832, + "grad_norm": 0.38153871812642226, + "learning_rate": 6.4672824593792835e-06, + "loss": 0.4618, + "step": 11031 + }, + { + "epoch": 1.8117750908381747, + "grad_norm": 0.30354053866531616, + "learning_rate": 6.466860665884473e-06, + "loss": 0.4538, + "step": 11032 + }, + { + "epoch": 1.8119393180465173, + "grad_norm": 0.3405080385317878, + "learning_rate": 6.466438851027622e-06, + "loss": 0.4433, + "step": 11033 + }, + { + "epoch": 1.81210354525486, + "grad_norm": 0.29623350679986754, + "learning_rate": 6.46601701481331e-06, + "loss": 0.4314, + "step": 11034 + }, + { + "epoch": 1.8122677724632028, + "grad_norm": 0.2935779674611224, + "learning_rate": 6.465595157246118e-06, + "loss": 0.4527, + "step": 11035 + }, + { + "epoch": 1.8124319996715457, + "grad_norm": 0.3068991690315416, + "learning_rate": 6.4651732783306285e-06, + "loss": 0.4497, + "step": 11036 + }, + { + "epoch": 1.8125962268798883, + "grad_norm": 0.40453061082958175, + "learning_rate": 6.464751378071424e-06, + "loss": 0.4602, + "step": 11037 + }, + { + "epoch": 1.812760454088231, + "grad_norm": 0.31945911053068904, + "learning_rate": 6.464329456473086e-06, + "loss": 0.464, + "step": 11038 + }, + { + "epoch": 1.8129246812965738, + "grad_norm": 0.311919813694301, + "learning_rate": 6.4639075135402e-06, + "loss": 0.4706, + "step": 11039 + }, + { + "epoch": 1.8130889085049167, + "grad_norm": 0.37696971232973997, + "learning_rate": 6.463485549277343e-06, + "loss": 0.4397, + "step": 11040 + }, + { + "epoch": 1.8132531357132593, + "grad_norm": 0.3312339585677867, + "learning_rate": 6.463063563689103e-06, + "loss": 0.4468, + "step": 11041 + }, + { + "epoch": 1.813417362921602, + "grad_norm": 1.0677719619640302, + "learning_rate": 6.46264155678006e-06, + "loss": 0.4389, + "step": 11042 + }, + { + "epoch": 1.8135815901299448, + "grad_norm": 0.3297430212323639, + "learning_rate": 6.4622195285548e-06, + "loss": 0.4531, + "step": 11043 + }, + { + "epoch": 1.8137458173382877, + "grad_norm": 0.2958232682654792, + "learning_rate": 6.461797479017906e-06, + "loss": 0.4471, + "step": 11044 + }, + { + "epoch": 1.8139100445466303, + "grad_norm": 0.327250642966852, + "learning_rate": 6.46137540817396e-06, + "loss": 0.4401, + "step": 11045 + }, + { + "epoch": 1.814074271754973, + "grad_norm": 0.4175480266474759, + "learning_rate": 6.4609533160275465e-06, + "loss": 0.4378, + "step": 11046 + }, + { + "epoch": 1.8142384989633158, + "grad_norm": 0.38417408251346336, + "learning_rate": 6.460531202583252e-06, + "loss": 0.4641, + "step": 11047 + }, + { + "epoch": 1.8144027261716587, + "grad_norm": 1.088990431170157, + "learning_rate": 6.460109067845658e-06, + "loss": 0.4307, + "step": 11048 + }, + { + "epoch": 1.8145669533800013, + "grad_norm": 0.4952726764230832, + "learning_rate": 6.459686911819353e-06, + "loss": 0.4457, + "step": 11049 + }, + { + "epoch": 1.814731180588344, + "grad_norm": 0.31313614766898384, + "learning_rate": 6.45926473450892e-06, + "loss": 0.4671, + "step": 11050 + }, + { + "epoch": 1.8148954077966866, + "grad_norm": 0.3303499375852876, + "learning_rate": 6.458842535918944e-06, + "loss": 0.4383, + "step": 11051 + }, + { + "epoch": 1.8150596350050294, + "grad_norm": 0.29281553354836926, + "learning_rate": 6.4584203160540105e-06, + "loss": 0.4327, + "step": 11052 + }, + { + "epoch": 1.8152238622133723, + "grad_norm": 0.38270445224557276, + "learning_rate": 6.457998074918705e-06, + "loss": 0.4503, + "step": 11053 + }, + { + "epoch": 1.815388089421715, + "grad_norm": 0.4228231426560049, + "learning_rate": 6.457575812517615e-06, + "loss": 0.4482, + "step": 11054 + }, + { + "epoch": 1.8155523166300576, + "grad_norm": 0.34167907967027655, + "learning_rate": 6.457153528855325e-06, + "loss": 0.4512, + "step": 11055 + }, + { + "epoch": 1.8157165438384004, + "grad_norm": 0.41697598086361165, + "learning_rate": 6.456731223936423e-06, + "loss": 0.4805, + "step": 11056 + }, + { + "epoch": 1.8158807710467433, + "grad_norm": 0.3569020023794603, + "learning_rate": 6.456308897765494e-06, + "loss": 0.4532, + "step": 11057 + }, + { + "epoch": 1.816044998255086, + "grad_norm": 0.3618170825730565, + "learning_rate": 6.455886550347127e-06, + "loss": 0.4494, + "step": 11058 + }, + { + "epoch": 1.8162092254634286, + "grad_norm": 0.3233994158463805, + "learning_rate": 6.455464181685904e-06, + "loss": 0.4636, + "step": 11059 + }, + { + "epoch": 1.8163734526717714, + "grad_norm": 0.2949245076583048, + "learning_rate": 6.45504179178642e-06, + "loss": 0.4542, + "step": 11060 + }, + { + "epoch": 1.8165376798801143, + "grad_norm": 0.3139588667284323, + "learning_rate": 6.454619380653257e-06, + "loss": 0.4656, + "step": 11061 + }, + { + "epoch": 1.816701907088457, + "grad_norm": 0.29795405508359124, + "learning_rate": 6.4541969482910044e-06, + "loss": 0.4467, + "step": 11062 + }, + { + "epoch": 1.8168661342967996, + "grad_norm": 0.3168979315435727, + "learning_rate": 6.453774494704251e-06, + "loss": 0.4384, + "step": 11063 + }, + { + "epoch": 1.8170303615051424, + "grad_norm": 0.32749872421298704, + "learning_rate": 6.453352019897584e-06, + "loss": 0.4554, + "step": 11064 + }, + { + "epoch": 1.8171945887134853, + "grad_norm": 0.42556222068988175, + "learning_rate": 6.452929523875592e-06, + "loss": 0.4552, + "step": 11065 + }, + { + "epoch": 1.817358815921828, + "grad_norm": 0.44368552613487633, + "learning_rate": 6.452507006642863e-06, + "loss": 0.4534, + "step": 11066 + }, + { + "epoch": 1.8175230431301705, + "grad_norm": 0.3223174577268706, + "learning_rate": 6.452084468203988e-06, + "loss": 0.4479, + "step": 11067 + }, + { + "epoch": 1.8176872703385132, + "grad_norm": 0.33800164745732303, + "learning_rate": 6.4516619085635555e-06, + "loss": 0.4734, + "step": 11068 + }, + { + "epoch": 1.817851497546856, + "grad_norm": 0.3452294395819774, + "learning_rate": 6.451239327726155e-06, + "loss": 0.4517, + "step": 11069 + }, + { + "epoch": 1.818015724755199, + "grad_norm": 0.4589685077156546, + "learning_rate": 6.4508167256963735e-06, + "loss": 0.4278, + "step": 11070 + }, + { + "epoch": 1.8181799519635415, + "grad_norm": 0.4040673809335704, + "learning_rate": 6.450394102478804e-06, + "loss": 0.4348, + "step": 11071 + }, + { + "epoch": 1.8183441791718842, + "grad_norm": 0.3518608635195822, + "learning_rate": 6.449971458078036e-06, + "loss": 0.4628, + "step": 11072 + }, + { + "epoch": 1.818508406380227, + "grad_norm": 0.40018200265748566, + "learning_rate": 6.44954879249866e-06, + "loss": 0.432, + "step": 11073 + }, + { + "epoch": 1.81867263358857, + "grad_norm": 0.3737532515019579, + "learning_rate": 6.4491261057452644e-06, + "loss": 0.4505, + "step": 11074 + }, + { + "epoch": 1.8188368607969125, + "grad_norm": 0.31055544483008213, + "learning_rate": 6.448703397822442e-06, + "loss": 0.4481, + "step": 11075 + }, + { + "epoch": 1.8190010880052552, + "grad_norm": 0.3225928339655794, + "learning_rate": 6.448280668734785e-06, + "loss": 0.463, + "step": 11076 + }, + { + "epoch": 1.819165315213598, + "grad_norm": 0.3259872807174308, + "learning_rate": 6.447857918486881e-06, + "loss": 0.4619, + "step": 11077 + }, + { + "epoch": 1.819329542421941, + "grad_norm": 0.3702360876886552, + "learning_rate": 6.447435147083326e-06, + "loss": 0.4555, + "step": 11078 + }, + { + "epoch": 1.8194937696302835, + "grad_norm": 0.3691408949553295, + "learning_rate": 6.447012354528708e-06, + "loss": 0.4432, + "step": 11079 + }, + { + "epoch": 1.8196579968386262, + "grad_norm": 0.3041321344097473, + "learning_rate": 6.446589540827619e-06, + "loss": 0.4201, + "step": 11080 + }, + { + "epoch": 1.819822224046969, + "grad_norm": 0.30894935051624056, + "learning_rate": 6.446166705984654e-06, + "loss": 0.4367, + "step": 11081 + }, + { + "epoch": 1.8199864512553119, + "grad_norm": 0.3979234912372484, + "learning_rate": 6.4457438500044025e-06, + "loss": 0.4442, + "step": 11082 + }, + { + "epoch": 1.8201506784636545, + "grad_norm": 0.35321748362975847, + "learning_rate": 6.44532097289146e-06, + "loss": 0.4585, + "step": 11083 + }, + { + "epoch": 1.8203149056719972, + "grad_norm": 0.32382149930434995, + "learning_rate": 6.444898074650416e-06, + "loss": 0.4504, + "step": 11084 + }, + { + "epoch": 1.8204791328803398, + "grad_norm": 0.3905287570407914, + "learning_rate": 6.444475155285867e-06, + "loss": 0.4671, + "step": 11085 + }, + { + "epoch": 1.8206433600886827, + "grad_norm": 0.3364174209242473, + "learning_rate": 6.444052214802404e-06, + "loss": 0.4514, + "step": 11086 + }, + { + "epoch": 1.8208075872970255, + "grad_norm": 0.3132902251492512, + "learning_rate": 6.443629253204621e-06, + "loss": 0.4593, + "step": 11087 + }, + { + "epoch": 1.8209718145053682, + "grad_norm": 0.31611007055962465, + "learning_rate": 6.443206270497113e-06, + "loss": 0.4289, + "step": 11088 + }, + { + "epoch": 1.8211360417137108, + "grad_norm": 0.3183713282494217, + "learning_rate": 6.4427832666844725e-06, + "loss": 0.4557, + "step": 11089 + }, + { + "epoch": 1.8213002689220537, + "grad_norm": 0.3550373150593315, + "learning_rate": 6.442360241771294e-06, + "loss": 0.4428, + "step": 11090 + }, + { + "epoch": 1.8214644961303965, + "grad_norm": 0.32252481795074844, + "learning_rate": 6.4419371957621726e-06, + "loss": 0.4485, + "step": 11091 + }, + { + "epoch": 1.8216287233387392, + "grad_norm": 0.31700873209102415, + "learning_rate": 6.441514128661702e-06, + "loss": 0.4577, + "step": 11092 + }, + { + "epoch": 1.8217929505470818, + "grad_norm": 0.3785064320067957, + "learning_rate": 6.44109104047448e-06, + "loss": 0.4385, + "step": 11093 + }, + { + "epoch": 1.8219571777554247, + "grad_norm": 0.29377408501971125, + "learning_rate": 6.440667931205097e-06, + "loss": 0.4443, + "step": 11094 + }, + { + "epoch": 1.8221214049637675, + "grad_norm": 0.30032625362802456, + "learning_rate": 6.440244800858152e-06, + "loss": 0.4331, + "step": 11095 + }, + { + "epoch": 1.8222856321721101, + "grad_norm": 0.3718303109112087, + "learning_rate": 6.4398216494382386e-06, + "loss": 0.4572, + "step": 11096 + }, + { + "epoch": 1.8224498593804528, + "grad_norm": 0.2960251316220687, + "learning_rate": 6.439398476949954e-06, + "loss": 0.4621, + "step": 11097 + }, + { + "epoch": 1.8226140865887956, + "grad_norm": 0.33123020320172913, + "learning_rate": 6.438975283397895e-06, + "loss": 0.4432, + "step": 11098 + }, + { + "epoch": 1.8227783137971385, + "grad_norm": 0.2742503490676697, + "learning_rate": 6.4385520687866554e-06, + "loss": 0.4535, + "step": 11099 + }, + { + "epoch": 1.8229425410054811, + "grad_norm": 0.2836794155117891, + "learning_rate": 6.438128833120833e-06, + "loss": 0.4517, + "step": 11100 + }, + { + "epoch": 1.8231067682138238, + "grad_norm": 0.30579928885691016, + "learning_rate": 6.437705576405025e-06, + "loss": 0.4649, + "step": 11101 + }, + { + "epoch": 1.8232709954221664, + "grad_norm": 0.3615411845518621, + "learning_rate": 6.437282298643828e-06, + "loss": 0.4617, + "step": 11102 + }, + { + "epoch": 1.8234352226305093, + "grad_norm": 0.30059915648933905, + "learning_rate": 6.43685899984184e-06, + "loss": 0.4397, + "step": 11103 + }, + { + "epoch": 1.8235994498388521, + "grad_norm": 0.9129523653870982, + "learning_rate": 6.4364356800036555e-06, + "loss": 0.4795, + "step": 11104 + }, + { + "epoch": 1.8237636770471948, + "grad_norm": 0.4191096623708467, + "learning_rate": 6.436012339133876e-06, + "loss": 0.4281, + "step": 11105 + }, + { + "epoch": 1.8239279042555374, + "grad_norm": 0.3097106124164847, + "learning_rate": 6.435588977237098e-06, + "loss": 0.4599, + "step": 11106 + }, + { + "epoch": 1.8240921314638803, + "grad_norm": 0.25366945090881754, + "learning_rate": 6.435165594317919e-06, + "loss": 0.452, + "step": 11107 + }, + { + "epoch": 1.8242563586722231, + "grad_norm": 0.42724123488796834, + "learning_rate": 6.434742190380938e-06, + "loss": 0.4489, + "step": 11108 + }, + { + "epoch": 1.8244205858805658, + "grad_norm": 0.4470158326079735, + "learning_rate": 6.4343187654307516e-06, + "loss": 0.4586, + "step": 11109 + }, + { + "epoch": 1.8245848130889084, + "grad_norm": 0.2843995188282995, + "learning_rate": 6.4338953194719625e-06, + "loss": 0.4499, + "step": 11110 + }, + { + "epoch": 1.8247490402972513, + "grad_norm": 0.3535728345730193, + "learning_rate": 6.433471852509166e-06, + "loss": 0.4527, + "step": 11111 + }, + { + "epoch": 1.8249132675055941, + "grad_norm": 0.390488775163116, + "learning_rate": 6.433048364546963e-06, + "loss": 0.4499, + "step": 11112 + }, + { + "epoch": 1.8250774947139368, + "grad_norm": 0.2856644175201384, + "learning_rate": 6.4326248555899535e-06, + "loss": 0.4562, + "step": 11113 + }, + { + "epoch": 1.8252417219222794, + "grad_norm": 0.25416212783117076, + "learning_rate": 6.432201325642737e-06, + "loss": 0.4399, + "step": 11114 + }, + { + "epoch": 1.8254059491306223, + "grad_norm": 0.41194388605844706, + "learning_rate": 6.431777774709912e-06, + "loss": 0.4593, + "step": 11115 + }, + { + "epoch": 1.8255701763389651, + "grad_norm": 0.3144676904089028, + "learning_rate": 6.43135420279608e-06, + "loss": 0.443, + "step": 11116 + }, + { + "epoch": 1.8257344035473078, + "grad_norm": 0.393080464105788, + "learning_rate": 6.430930609905842e-06, + "loss": 0.4569, + "step": 11117 + }, + { + "epoch": 1.8258986307556504, + "grad_norm": 0.33847400830923585, + "learning_rate": 6.430506996043798e-06, + "loss": 0.4589, + "step": 11118 + }, + { + "epoch": 1.826062857963993, + "grad_norm": 0.36415414734893137, + "learning_rate": 6.430083361214547e-06, + "loss": 0.468, + "step": 11119 + }, + { + "epoch": 1.826227085172336, + "grad_norm": 0.37557718876286633, + "learning_rate": 6.429659705422693e-06, + "loss": 0.4628, + "step": 11120 + }, + { + "epoch": 1.8263913123806788, + "grad_norm": 0.32069328835487537, + "learning_rate": 6.429236028672834e-06, + "loss": 0.4346, + "step": 11121 + }, + { + "epoch": 1.8265555395890214, + "grad_norm": 0.31544843970795344, + "learning_rate": 6.428812330969576e-06, + "loss": 0.4485, + "step": 11122 + }, + { + "epoch": 1.826719766797364, + "grad_norm": 0.29920140008668483, + "learning_rate": 6.428388612317519e-06, + "loss": 0.4633, + "step": 11123 + }, + { + "epoch": 1.826883994005707, + "grad_norm": 0.26314160528688985, + "learning_rate": 6.427964872721262e-06, + "loss": 0.4446, + "step": 11124 + }, + { + "epoch": 1.8270482212140498, + "grad_norm": 0.3417200243928961, + "learning_rate": 6.4275411121854095e-06, + "loss": 0.447, + "step": 11125 + }, + { + "epoch": 1.8272124484223924, + "grad_norm": 0.4288052637609021, + "learning_rate": 6.427117330714566e-06, + "loss": 0.4726, + "step": 11126 + }, + { + "epoch": 1.827376675630735, + "grad_norm": 0.3291954670980573, + "learning_rate": 6.426693528313333e-06, + "loss": 0.4301, + "step": 11127 + }, + { + "epoch": 1.8275409028390779, + "grad_norm": 0.3432703402034783, + "learning_rate": 6.4262697049863106e-06, + "loss": 0.4617, + "step": 11128 + }, + { + "epoch": 1.8277051300474207, + "grad_norm": 0.31460182638244183, + "learning_rate": 6.425845860738104e-06, + "loss": 0.4361, + "step": 11129 + }, + { + "epoch": 1.8278693572557634, + "grad_norm": 0.3356042150563821, + "learning_rate": 6.4254219955733166e-06, + "loss": 0.4502, + "step": 11130 + }, + { + "epoch": 1.828033584464106, + "grad_norm": 0.3417134062143049, + "learning_rate": 6.424998109496554e-06, + "loss": 0.4591, + "step": 11131 + }, + { + "epoch": 1.8281978116724489, + "grad_norm": 0.29443259361596996, + "learning_rate": 6.4245742025124165e-06, + "loss": 0.4615, + "step": 11132 + }, + { + "epoch": 1.8283620388807917, + "grad_norm": 0.3027837712996636, + "learning_rate": 6.424150274625509e-06, + "loss": 0.4432, + "step": 11133 + }, + { + "epoch": 1.8285262660891344, + "grad_norm": 0.3859255584228414, + "learning_rate": 6.423726325840437e-06, + "loss": 0.4525, + "step": 11134 + }, + { + "epoch": 1.828690493297477, + "grad_norm": 0.8084532470537148, + "learning_rate": 6.423302356161805e-06, + "loss": 0.4585, + "step": 11135 + }, + { + "epoch": 1.8288547205058197, + "grad_norm": 0.3222083942212579, + "learning_rate": 6.422878365594217e-06, + "loss": 0.4348, + "step": 11136 + }, + { + "epoch": 1.8290189477141625, + "grad_norm": 0.5974350173854517, + "learning_rate": 6.422454354142277e-06, + "loss": 0.445, + "step": 11137 + }, + { + "epoch": 1.8291831749225054, + "grad_norm": 0.48381322250594383, + "learning_rate": 6.422030321810592e-06, + "loss": 0.4752, + "step": 11138 + }, + { + "epoch": 1.829347402130848, + "grad_norm": 0.43774133544125177, + "learning_rate": 6.421606268603767e-06, + "loss": 0.4676, + "step": 11139 + }, + { + "epoch": 1.8295116293391906, + "grad_norm": 0.5109065743607105, + "learning_rate": 6.421182194526407e-06, + "loss": 0.4339, + "step": 11140 + }, + { + "epoch": 1.8296758565475335, + "grad_norm": 0.2663502067197082, + "learning_rate": 6.420758099583119e-06, + "loss": 0.4548, + "step": 11141 + }, + { + "epoch": 1.8298400837558764, + "grad_norm": 0.35162665093102086, + "learning_rate": 6.420333983778507e-06, + "loss": 0.4549, + "step": 11142 + }, + { + "epoch": 1.830004310964219, + "grad_norm": 0.3582562471813268, + "learning_rate": 6.419909847117179e-06, + "loss": 0.4602, + "step": 11143 + }, + { + "epoch": 1.8301685381725616, + "grad_norm": 0.3064404860661055, + "learning_rate": 6.4194856896037416e-06, + "loss": 0.4517, + "step": 11144 + }, + { + "epoch": 1.8303327653809045, + "grad_norm": 0.3968100953955666, + "learning_rate": 6.419061511242799e-06, + "loss": 0.4455, + "step": 11145 + }, + { + "epoch": 1.8304969925892474, + "grad_norm": 0.3523584283490642, + "learning_rate": 6.418637312038963e-06, + "loss": 0.4545, + "step": 11146 + }, + { + "epoch": 1.83066121979759, + "grad_norm": 0.335856440378906, + "learning_rate": 6.4182130919968375e-06, + "loss": 0.4475, + "step": 11147 + }, + { + "epoch": 1.8308254470059326, + "grad_norm": 0.3146367836178673, + "learning_rate": 6.41778885112103e-06, + "loss": 0.4426, + "step": 11148 + }, + { + "epoch": 1.8309896742142755, + "grad_norm": 0.3653371520959425, + "learning_rate": 6.417364589416148e-06, + "loss": 0.45, + "step": 11149 + }, + { + "epoch": 1.8311539014226184, + "grad_norm": 0.3708603837292037, + "learning_rate": 6.4169403068868e-06, + "loss": 0.4656, + "step": 11150 + }, + { + "epoch": 1.831318128630961, + "grad_norm": 0.29649438936926636, + "learning_rate": 6.416516003537597e-06, + "loss": 0.4495, + "step": 11151 + }, + { + "epoch": 1.8314823558393036, + "grad_norm": 0.3521235978642944, + "learning_rate": 6.416091679373144e-06, + "loss": 0.4578, + "step": 11152 + }, + { + "epoch": 1.8316465830476463, + "grad_norm": 0.39789361916375526, + "learning_rate": 6.415667334398047e-06, + "loss": 0.4686, + "step": 11153 + }, + { + "epoch": 1.8318108102559891, + "grad_norm": 3.7291295388929075, + "learning_rate": 6.4152429686169195e-06, + "loss": 0.4464, + "step": 11154 + }, + { + "epoch": 1.831975037464332, + "grad_norm": 0.3062939593911645, + "learning_rate": 6.414818582034371e-06, + "loss": 0.4477, + "step": 11155 + }, + { + "epoch": 1.8321392646726746, + "grad_norm": 0.42487906457102476, + "learning_rate": 6.414394174655007e-06, + "loss": 0.4378, + "step": 11156 + }, + { + "epoch": 1.8323034918810173, + "grad_norm": 0.4173646928078844, + "learning_rate": 6.413969746483439e-06, + "loss": 0.4466, + "step": 11157 + }, + { + "epoch": 1.8324677190893601, + "grad_norm": 0.29528763334595215, + "learning_rate": 6.413545297524276e-06, + "loss": 0.4338, + "step": 11158 + }, + { + "epoch": 1.832631946297703, + "grad_norm": 0.3550556187269253, + "learning_rate": 6.413120827782128e-06, + "loss": 0.4294, + "step": 11159 + }, + { + "epoch": 1.8327961735060456, + "grad_norm": 0.35767683689815905, + "learning_rate": 6.412696337261608e-06, + "loss": 0.458, + "step": 11160 + }, + { + "epoch": 1.8329604007143883, + "grad_norm": 0.28119233715005026, + "learning_rate": 6.412271825967322e-06, + "loss": 0.4302, + "step": 11161 + }, + { + "epoch": 1.8331246279227311, + "grad_norm": 0.4170109914341217, + "learning_rate": 6.411847293903883e-06, + "loss": 0.4465, + "step": 11162 + }, + { + "epoch": 1.833288855131074, + "grad_norm": 0.34109494612960684, + "learning_rate": 6.4114227410759004e-06, + "loss": 0.4252, + "step": 11163 + }, + { + "epoch": 1.8334530823394166, + "grad_norm": 0.3907746165563666, + "learning_rate": 6.410998167487988e-06, + "loss": 0.4729, + "step": 11164 + }, + { + "epoch": 1.8336173095477593, + "grad_norm": 0.2953268045869606, + "learning_rate": 6.410573573144754e-06, + "loss": 0.4539, + "step": 11165 + }, + { + "epoch": 1.8337815367561021, + "grad_norm": 0.3687647764909497, + "learning_rate": 6.410148958050813e-06, + "loss": 0.4563, + "step": 11166 + }, + { + "epoch": 1.833945763964445, + "grad_norm": 0.49949614089435895, + "learning_rate": 6.409724322210772e-06, + "loss": 0.4785, + "step": 11167 + }, + { + "epoch": 1.8341099911727876, + "grad_norm": 0.35319171075997546, + "learning_rate": 6.4092996656292495e-06, + "loss": 0.4475, + "step": 11168 + }, + { + "epoch": 1.8342742183811303, + "grad_norm": 0.3578914249392386, + "learning_rate": 6.408874988310852e-06, + "loss": 0.4555, + "step": 11169 + }, + { + "epoch": 1.8344384455894729, + "grad_norm": 0.5233504521881581, + "learning_rate": 6.4084502902601946e-06, + "loss": 0.4535, + "step": 11170 + }, + { + "epoch": 1.8346026727978157, + "grad_norm": 0.3695067674128277, + "learning_rate": 6.408025571481889e-06, + "loss": 0.4489, + "step": 11171 + }, + { + "epoch": 1.8347669000061586, + "grad_norm": 0.3542537195814928, + "learning_rate": 6.407600831980548e-06, + "loss": 0.444, + "step": 11172 + }, + { + "epoch": 1.8349311272145012, + "grad_norm": 0.433248517239817, + "learning_rate": 6.407176071760787e-06, + "loss": 0.4562, + "step": 11173 + }, + { + "epoch": 1.8350953544228439, + "grad_norm": 0.43227636924692336, + "learning_rate": 6.406751290827214e-06, + "loss": 0.4399, + "step": 11174 + }, + { + "epoch": 1.8352595816311867, + "grad_norm": 0.43145471772441574, + "learning_rate": 6.40632648918445e-06, + "loss": 0.4457, + "step": 11175 + }, + { + "epoch": 1.8354238088395296, + "grad_norm": 0.3710357940741841, + "learning_rate": 6.405901666837102e-06, + "loss": 0.4319, + "step": 11176 + }, + { + "epoch": 1.8355880360478722, + "grad_norm": 0.3588178442046596, + "learning_rate": 6.405476823789788e-06, + "loss": 0.4713, + "step": 11177 + }, + { + "epoch": 1.8357522632562149, + "grad_norm": 0.3383423727372397, + "learning_rate": 6.4050519600471205e-06, + "loss": 0.45, + "step": 11178 + }, + { + "epoch": 1.8359164904645577, + "grad_norm": 0.3492514766969178, + "learning_rate": 6.404627075613715e-06, + "loss": 0.471, + "step": 11179 + }, + { + "epoch": 1.8360807176729006, + "grad_norm": 0.41424669052303414, + "learning_rate": 6.404202170494184e-06, + "loss": 0.4609, + "step": 11180 + }, + { + "epoch": 1.8362449448812432, + "grad_norm": 0.36182965770449854, + "learning_rate": 6.403777244693146e-06, + "loss": 0.4654, + "step": 11181 + }, + { + "epoch": 1.8364091720895859, + "grad_norm": 0.3327741789191045, + "learning_rate": 6.403352298215212e-06, + "loss": 0.4618, + "step": 11182 + }, + { + "epoch": 1.8365733992979287, + "grad_norm": 0.41298626199924565, + "learning_rate": 6.402927331065001e-06, + "loss": 0.4606, + "step": 11183 + }, + { + "epoch": 1.8367376265062716, + "grad_norm": 0.2869633366868695, + "learning_rate": 6.402502343247126e-06, + "loss": 0.4676, + "step": 11184 + }, + { + "epoch": 1.8369018537146142, + "grad_norm": 0.37976823478720945, + "learning_rate": 6.402077334766204e-06, + "loss": 0.4525, + "step": 11185 + }, + { + "epoch": 1.8370660809229569, + "grad_norm": 0.41990835423104483, + "learning_rate": 6.401652305626852e-06, + "loss": 0.4548, + "step": 11186 + }, + { + "epoch": 1.8372303081312995, + "grad_norm": 0.3411060115422886, + "learning_rate": 6.401227255833683e-06, + "loss": 0.4496, + "step": 11187 + }, + { + "epoch": 1.8373945353396424, + "grad_norm": 1.1713248336301512, + "learning_rate": 6.400802185391317e-06, + "loss": 0.4754, + "step": 11188 + }, + { + "epoch": 1.8375587625479852, + "grad_norm": 1.1536385664772004, + "learning_rate": 6.4003770943043685e-06, + "loss": 0.4581, + "step": 11189 + }, + { + "epoch": 1.8377229897563279, + "grad_norm": 0.32862778514012486, + "learning_rate": 6.399951982577456e-06, + "loss": 0.4752, + "step": 11190 + }, + { + "epoch": 1.8378872169646705, + "grad_norm": 0.47493385870865834, + "learning_rate": 6.399526850215195e-06, + "loss": 0.4555, + "step": 11191 + }, + { + "epoch": 1.8380514441730134, + "grad_norm": 0.31223427158800315, + "learning_rate": 6.399101697222202e-06, + "loss": 0.4529, + "step": 11192 + }, + { + "epoch": 1.8382156713813562, + "grad_norm": 0.31446797620165784, + "learning_rate": 6.3986765236030975e-06, + "loss": 0.4471, + "step": 11193 + }, + { + "epoch": 1.8383798985896989, + "grad_norm": 0.44653437318643374, + "learning_rate": 6.398251329362498e-06, + "loss": 0.4879, + "step": 11194 + }, + { + "epoch": 1.8385441257980415, + "grad_norm": 0.3569741883200403, + "learning_rate": 6.397826114505022e-06, + "loss": 0.4637, + "step": 11195 + }, + { + "epoch": 1.8387083530063844, + "grad_norm": 0.2930468326068091, + "learning_rate": 6.397400879035285e-06, + "loss": 0.4349, + "step": 11196 + }, + { + "epoch": 1.8388725802147272, + "grad_norm": 0.3441165187924981, + "learning_rate": 6.3969756229579085e-06, + "loss": 0.4466, + "step": 11197 + }, + { + "epoch": 1.8390368074230699, + "grad_norm": 0.3680111685540521, + "learning_rate": 6.396550346277512e-06, + "loss": 0.4433, + "step": 11198 + }, + { + "epoch": 1.8392010346314125, + "grad_norm": 0.4387326278721702, + "learning_rate": 6.396125048998711e-06, + "loss": 0.4499, + "step": 11199 + }, + { + "epoch": 1.8393652618397553, + "grad_norm": 0.29303807948267785, + "learning_rate": 6.395699731126128e-06, + "loss": 0.4655, + "step": 11200 + }, + { + "epoch": 1.8395294890480982, + "grad_norm": 0.535697689652598, + "learning_rate": 6.3952743926643795e-06, + "loss": 0.4374, + "step": 11201 + }, + { + "epoch": 1.8396937162564408, + "grad_norm": 0.3118572177188081, + "learning_rate": 6.394849033618087e-06, + "loss": 0.4725, + "step": 11202 + }, + { + "epoch": 1.8398579434647835, + "grad_norm": 0.3140766818191945, + "learning_rate": 6.394423653991869e-06, + "loss": 0.4267, + "step": 11203 + }, + { + "epoch": 1.8400221706731261, + "grad_norm": 0.37368035891343115, + "learning_rate": 6.393998253790347e-06, + "loss": 0.4639, + "step": 11204 + }, + { + "epoch": 1.840186397881469, + "grad_norm": 0.2971672289149953, + "learning_rate": 6.39357283301814e-06, + "loss": 0.4433, + "step": 11205 + }, + { + "epoch": 1.8403506250898118, + "grad_norm": 0.3390585276018315, + "learning_rate": 6.3931473916798705e-06, + "loss": 0.4781, + "step": 11206 + }, + { + "epoch": 1.8405148522981545, + "grad_norm": 0.38748554649335143, + "learning_rate": 6.3927219297801555e-06, + "loss": 0.4494, + "step": 11207 + }, + { + "epoch": 1.8406790795064971, + "grad_norm": 0.49663527474275104, + "learning_rate": 6.39229644732362e-06, + "loss": 0.4611, + "step": 11208 + }, + { + "epoch": 1.84084330671484, + "grad_norm": 0.29911888732308106, + "learning_rate": 6.391870944314882e-06, + "loss": 0.4591, + "step": 11209 + }, + { + "epoch": 1.8410075339231828, + "grad_norm": 0.40218304374728053, + "learning_rate": 6.391445420758565e-06, + "loss": 0.4715, + "step": 11210 + }, + { + "epoch": 1.8411717611315255, + "grad_norm": 0.3619310185273555, + "learning_rate": 6.39101987665929e-06, + "loss": 0.4516, + "step": 11211 + }, + { + "epoch": 1.8413359883398681, + "grad_norm": 0.4285679890613997, + "learning_rate": 6.390594312021677e-06, + "loss": 0.4587, + "step": 11212 + }, + { + "epoch": 1.841500215548211, + "grad_norm": 0.2965380574492834, + "learning_rate": 6.390168726850351e-06, + "loss": 0.4338, + "step": 11213 + }, + { + "epoch": 1.8416644427565538, + "grad_norm": 0.3859482067158745, + "learning_rate": 6.3897431211499325e-06, + "loss": 0.4529, + "step": 11214 + }, + { + "epoch": 1.8418286699648965, + "grad_norm": 0.41682659190036797, + "learning_rate": 6.389317494925046e-06, + "loss": 0.4431, + "step": 11215 + }, + { + "epoch": 1.841992897173239, + "grad_norm": 0.3187290191100235, + "learning_rate": 6.388891848180311e-06, + "loss": 0.4507, + "step": 11216 + }, + { + "epoch": 1.842157124381582, + "grad_norm": 0.3103138081670325, + "learning_rate": 6.388466180920351e-06, + "loss": 0.4699, + "step": 11217 + }, + { + "epoch": 1.8423213515899248, + "grad_norm": 0.4371948820010515, + "learning_rate": 6.388040493149793e-06, + "loss": 0.4592, + "step": 11218 + }, + { + "epoch": 1.8424855787982675, + "grad_norm": 0.3792863854932938, + "learning_rate": 6.387614784873257e-06, + "loss": 0.4655, + "step": 11219 + }, + { + "epoch": 1.84264980600661, + "grad_norm": 0.454009914635368, + "learning_rate": 6.387189056095367e-06, + "loss": 0.4589, + "step": 11220 + }, + { + "epoch": 1.8428140332149527, + "grad_norm": 0.36236305614927683, + "learning_rate": 6.386763306820746e-06, + "loss": 0.4805, + "step": 11221 + }, + { + "epoch": 1.8429782604232956, + "grad_norm": 0.32699570043919124, + "learning_rate": 6.38633753705402e-06, + "loss": 0.4348, + "step": 11222 + }, + { + "epoch": 1.8431424876316385, + "grad_norm": 0.27557523057430056, + "learning_rate": 6.385911746799812e-06, + "loss": 0.4529, + "step": 11223 + }, + { + "epoch": 1.843306714839981, + "grad_norm": 0.3092554373958321, + "learning_rate": 6.385485936062749e-06, + "loss": 0.4562, + "step": 11224 + }, + { + "epoch": 1.8434709420483237, + "grad_norm": 0.36892198796982645, + "learning_rate": 6.3850601048474516e-06, + "loss": 0.4672, + "step": 11225 + }, + { + "epoch": 1.8436351692566666, + "grad_norm": 0.2739936180230769, + "learning_rate": 6.384634253158546e-06, + "loss": 0.4637, + "step": 11226 + }, + { + "epoch": 1.8437993964650095, + "grad_norm": 0.7730659708755419, + "learning_rate": 6.38420838100066e-06, + "loss": 0.4554, + "step": 11227 + }, + { + "epoch": 1.843963623673352, + "grad_norm": 0.691066161259853, + "learning_rate": 6.383782488378416e-06, + "loss": 0.456, + "step": 11228 + }, + { + "epoch": 1.8441278508816947, + "grad_norm": 0.3885199433170536, + "learning_rate": 6.3833565752964415e-06, + "loss": 0.4622, + "step": 11229 + }, + { + "epoch": 1.8442920780900376, + "grad_norm": 0.3272844786118214, + "learning_rate": 6.382930641759361e-06, + "loss": 0.4454, + "step": 11230 + }, + { + "epoch": 1.8444563052983804, + "grad_norm": 0.2924812894819204, + "learning_rate": 6.382504687771804e-06, + "loss": 0.4606, + "step": 11231 + }, + { + "epoch": 1.844620532506723, + "grad_norm": 0.30384658501138234, + "learning_rate": 6.382078713338391e-06, + "loss": 0.4614, + "step": 11232 + }, + { + "epoch": 1.8447847597150657, + "grad_norm": 0.2959473309078626, + "learning_rate": 6.3816527184637514e-06, + "loss": 0.4711, + "step": 11233 + }, + { + "epoch": 1.8449489869234086, + "grad_norm": 0.3025842567454638, + "learning_rate": 6.3812267031525125e-06, + "loss": 0.4502, + "step": 11234 + }, + { + "epoch": 1.8451132141317512, + "grad_norm": 0.2605784588380211, + "learning_rate": 6.3808006674093015e-06, + "loss": 0.4534, + "step": 11235 + }, + { + "epoch": 1.845277441340094, + "grad_norm": 0.48953924584015085, + "learning_rate": 6.380374611238743e-06, + "loss": 0.4455, + "step": 11236 + }, + { + "epoch": 1.8454416685484367, + "grad_norm": 0.629228985946868, + "learning_rate": 6.3799485346454685e-06, + "loss": 0.4351, + "step": 11237 + }, + { + "epoch": 1.8456058957567794, + "grad_norm": 0.3061764968428075, + "learning_rate": 6.379522437634102e-06, + "loss": 0.4563, + "step": 11238 + }, + { + "epoch": 1.8457701229651222, + "grad_norm": 1.4709802223407416, + "learning_rate": 6.379096320209273e-06, + "loss": 0.4539, + "step": 11239 + }, + { + "epoch": 1.845934350173465, + "grad_norm": 0.323637509736383, + "learning_rate": 6.378670182375609e-06, + "loss": 0.451, + "step": 11240 + }, + { + "epoch": 1.8460985773818077, + "grad_norm": 0.3057557907583729, + "learning_rate": 6.3782440241377375e-06, + "loss": 0.4581, + "step": 11241 + }, + { + "epoch": 1.8462628045901504, + "grad_norm": 0.304889386260149, + "learning_rate": 6.377817845500289e-06, + "loss": 0.4325, + "step": 11242 + }, + { + "epoch": 1.8464270317984932, + "grad_norm": 0.33035100993678235, + "learning_rate": 6.377391646467891e-06, + "loss": 0.4502, + "step": 11243 + }, + { + "epoch": 1.846591259006836, + "grad_norm": 0.29746641796158035, + "learning_rate": 6.376965427045173e-06, + "loss": 0.4503, + "step": 11244 + }, + { + "epoch": 1.8467554862151787, + "grad_norm": 0.32982674950063423, + "learning_rate": 6.376539187236764e-06, + "loss": 0.4692, + "step": 11245 + }, + { + "epoch": 1.8469197134235213, + "grad_norm": 0.3033746048395144, + "learning_rate": 6.376112927047292e-06, + "loss": 0.456, + "step": 11246 + }, + { + "epoch": 1.8470839406318642, + "grad_norm": 0.34344852873284026, + "learning_rate": 6.375686646481388e-06, + "loss": 0.4693, + "step": 11247 + }, + { + "epoch": 1.847248167840207, + "grad_norm": 0.3152472718518401, + "learning_rate": 6.375260345543683e-06, + "loss": 0.4738, + "step": 11248 + }, + { + "epoch": 1.8474123950485497, + "grad_norm": 0.34465837766764895, + "learning_rate": 6.374834024238805e-06, + "loss": 0.4589, + "step": 11249 + }, + { + "epoch": 1.8475766222568923, + "grad_norm": 0.33303273260166083, + "learning_rate": 6.374407682571384e-06, + "loss": 0.481, + "step": 11250 + }, + { + "epoch": 1.8477408494652352, + "grad_norm": 0.2974928118083777, + "learning_rate": 6.373981320546051e-06, + "loss": 0.4489, + "step": 11251 + }, + { + "epoch": 1.8479050766735778, + "grad_norm": 0.3249645450729173, + "learning_rate": 6.373554938167439e-06, + "loss": 0.4445, + "step": 11252 + }, + { + "epoch": 1.8480693038819207, + "grad_norm": 0.3307822690469634, + "learning_rate": 6.373128535440177e-06, + "loss": 0.471, + "step": 11253 + }, + { + "epoch": 1.8482335310902633, + "grad_norm": 0.3783593787704891, + "learning_rate": 6.372702112368894e-06, + "loss": 0.4261, + "step": 11254 + }, + { + "epoch": 1.848397758298606, + "grad_norm": 0.3394670412812821, + "learning_rate": 6.372275668958225e-06, + "loss": 0.472, + "step": 11255 + }, + { + "epoch": 1.8485619855069488, + "grad_norm": 0.33822575181629483, + "learning_rate": 6.371849205212801e-06, + "loss": 0.4439, + "step": 11256 + }, + { + "epoch": 1.8487262127152917, + "grad_norm": 0.3142075629376298, + "learning_rate": 6.371422721137252e-06, + "loss": 0.4464, + "step": 11257 + }, + { + "epoch": 1.8488904399236343, + "grad_norm": 0.3342951923344951, + "learning_rate": 6.370996216736211e-06, + "loss": 0.4779, + "step": 11258 + }, + { + "epoch": 1.849054667131977, + "grad_norm": 0.349350432116345, + "learning_rate": 6.370569692014309e-06, + "loss": 0.4233, + "step": 11259 + }, + { + "epoch": 1.8492188943403198, + "grad_norm": 0.29426312829915247, + "learning_rate": 6.370143146976182e-06, + "loss": 0.4605, + "step": 11260 + }, + { + "epoch": 1.8493831215486627, + "grad_norm": 0.6891572867343618, + "learning_rate": 6.369716581626459e-06, + "loss": 0.4539, + "step": 11261 + }, + { + "epoch": 1.8495473487570053, + "grad_norm": 0.3566966922620718, + "learning_rate": 6.3692899959697735e-06, + "loss": 0.4473, + "step": 11262 + }, + { + "epoch": 1.849711575965348, + "grad_norm": 0.3671286591601414, + "learning_rate": 6.368863390010762e-06, + "loss": 0.4626, + "step": 11263 + }, + { + "epoch": 1.8498758031736908, + "grad_norm": 0.4516765235822067, + "learning_rate": 6.368436763754055e-06, + "loss": 0.4572, + "step": 11264 + }, + { + "epoch": 1.8500400303820337, + "grad_norm": 0.2954919181729513, + "learning_rate": 6.368010117204286e-06, + "loss": 0.4393, + "step": 11265 + }, + { + "epoch": 1.8502042575903763, + "grad_norm": 0.3493134392224947, + "learning_rate": 6.367583450366087e-06, + "loss": 0.4567, + "step": 11266 + }, + { + "epoch": 1.850368484798719, + "grad_norm": 0.35872074525718806, + "learning_rate": 6.367156763244097e-06, + "loss": 0.4561, + "step": 11267 + }, + { + "epoch": 1.8505327120070618, + "grad_norm": 0.42804427248337146, + "learning_rate": 6.3667300558429475e-06, + "loss": 0.4678, + "step": 11268 + }, + { + "epoch": 1.8506969392154045, + "grad_norm": 0.39557124706471175, + "learning_rate": 6.366303328167273e-06, + "loss": 0.4527, + "step": 11269 + }, + { + "epoch": 1.8508611664237473, + "grad_norm": 0.3645519450986868, + "learning_rate": 6.365876580221706e-06, + "loss": 0.4439, + "step": 11270 + }, + { + "epoch": 1.85102539363209, + "grad_norm": 0.2611586402914713, + "learning_rate": 6.365449812010884e-06, + "loss": 0.4624, + "step": 11271 + }, + { + "epoch": 1.8511896208404326, + "grad_norm": 0.4224856050672986, + "learning_rate": 6.365023023539444e-06, + "loss": 0.4262, + "step": 11272 + }, + { + "epoch": 1.8513538480487755, + "grad_norm": 0.34426711144349553, + "learning_rate": 6.364596214812018e-06, + "loss": 0.4635, + "step": 11273 + }, + { + "epoch": 1.8515180752571183, + "grad_norm": 0.33044232606364654, + "learning_rate": 6.364169385833242e-06, + "loss": 0.4413, + "step": 11274 + }, + { + "epoch": 1.851682302465461, + "grad_norm": 0.3566030564790324, + "learning_rate": 6.363742536607753e-06, + "loss": 0.4657, + "step": 11275 + }, + { + "epoch": 1.8518465296738036, + "grad_norm": 0.32664316500962876, + "learning_rate": 6.363315667140185e-06, + "loss": 0.4231, + "step": 11276 + }, + { + "epoch": 1.8520107568821464, + "grad_norm": 0.3192712622806749, + "learning_rate": 6.362888777435177e-06, + "loss": 0.4429, + "step": 11277 + }, + { + "epoch": 1.8521749840904893, + "grad_norm": 0.3483004909516952, + "learning_rate": 6.362461867497364e-06, + "loss": 0.4459, + "step": 11278 + }, + { + "epoch": 1.852339211298832, + "grad_norm": 0.3238437276102331, + "learning_rate": 6.362034937331382e-06, + "loss": 0.4658, + "step": 11279 + }, + { + "epoch": 1.8525034385071746, + "grad_norm": 0.4155190721399048, + "learning_rate": 6.361607986941869e-06, + "loss": 0.4594, + "step": 11280 + }, + { + "epoch": 1.8526676657155174, + "grad_norm": 0.4259720186833352, + "learning_rate": 6.361181016333462e-06, + "loss": 0.4427, + "step": 11281 + }, + { + "epoch": 1.8528318929238603, + "grad_norm": 0.39788743048437325, + "learning_rate": 6.360754025510797e-06, + "loss": 0.4389, + "step": 11282 + }, + { + "epoch": 1.852996120132203, + "grad_norm": 0.4959653416913325, + "learning_rate": 6.360327014478513e-06, + "loss": 0.4683, + "step": 11283 + }, + { + "epoch": 1.8531603473405456, + "grad_norm": 0.9145142089162646, + "learning_rate": 6.359899983241248e-06, + "loss": 0.4678, + "step": 11284 + }, + { + "epoch": 1.8533245745488884, + "grad_norm": 0.34958711678494037, + "learning_rate": 6.3594729318036395e-06, + "loss": 0.4392, + "step": 11285 + }, + { + "epoch": 1.853488801757231, + "grad_norm": 0.2915456832169805, + "learning_rate": 6.3590458601703234e-06, + "loss": 0.4396, + "step": 11286 + }, + { + "epoch": 1.853653028965574, + "grad_norm": 0.3523839879611003, + "learning_rate": 6.358618768345943e-06, + "loss": 0.4452, + "step": 11287 + }, + { + "epoch": 1.8538172561739166, + "grad_norm": 0.39252336971886814, + "learning_rate": 6.358191656335133e-06, + "loss": 0.4503, + "step": 11288 + }, + { + "epoch": 1.8539814833822592, + "grad_norm": 0.29758768939411184, + "learning_rate": 6.357764524142533e-06, + "loss": 0.4371, + "step": 11289 + }, + { + "epoch": 1.854145710590602, + "grad_norm": 0.2923988521358031, + "learning_rate": 6.3573373717727814e-06, + "loss": 0.4435, + "step": 11290 + }, + { + "epoch": 1.854309937798945, + "grad_norm": 0.3797175874288087, + "learning_rate": 6.35691019923052e-06, + "loss": 0.4735, + "step": 11291 + }, + { + "epoch": 1.8544741650072876, + "grad_norm": 0.3943571084660884, + "learning_rate": 6.356483006520387e-06, + "loss": 0.4385, + "step": 11292 + }, + { + "epoch": 1.8546383922156302, + "grad_norm": 0.422371885454595, + "learning_rate": 6.356055793647021e-06, + "loss": 0.4549, + "step": 11293 + }, + { + "epoch": 1.854802619423973, + "grad_norm": 0.2881492886554232, + "learning_rate": 6.3556285606150645e-06, + "loss": 0.4354, + "step": 11294 + }, + { + "epoch": 1.854966846632316, + "grad_norm": 0.2895915499671857, + "learning_rate": 6.355201307429155e-06, + "loss": 0.4554, + "step": 11295 + }, + { + "epoch": 1.8551310738406586, + "grad_norm": 0.27380290883797564, + "learning_rate": 6.354774034093934e-06, + "loss": 0.4457, + "step": 11296 + }, + { + "epoch": 1.8552953010490012, + "grad_norm": 0.3123756640040171, + "learning_rate": 6.354346740614043e-06, + "loss": 0.4317, + "step": 11297 + }, + { + "epoch": 1.855459528257344, + "grad_norm": 0.3097783710970955, + "learning_rate": 6.353919426994121e-06, + "loss": 0.4458, + "step": 11298 + }, + { + "epoch": 1.855623755465687, + "grad_norm": 0.31806916318986367, + "learning_rate": 6.353492093238811e-06, + "loss": 0.4459, + "step": 11299 + }, + { + "epoch": 1.8557879826740296, + "grad_norm": 0.42610233945463255, + "learning_rate": 6.353064739352752e-06, + "loss": 0.452, + "step": 11300 + }, + { + "epoch": 1.8559522098823722, + "grad_norm": 0.3118649517740116, + "learning_rate": 6.352637365340588e-06, + "loss": 0.4414, + "step": 11301 + }, + { + "epoch": 1.856116437090715, + "grad_norm": 0.2883834157653145, + "learning_rate": 6.352209971206959e-06, + "loss": 0.4447, + "step": 11302 + }, + { + "epoch": 1.8562806642990577, + "grad_norm": 0.30425811238812217, + "learning_rate": 6.3517825569565074e-06, + "loss": 0.4677, + "step": 11303 + }, + { + "epoch": 1.8564448915074006, + "grad_norm": 0.3157632228032478, + "learning_rate": 6.351355122593875e-06, + "loss": 0.451, + "step": 11304 + }, + { + "epoch": 1.8566091187157432, + "grad_norm": 0.31606115308966193, + "learning_rate": 6.350927668123704e-06, + "loss": 0.4615, + "step": 11305 + }, + { + "epoch": 1.8567733459240858, + "grad_norm": 0.2724015541206075, + "learning_rate": 6.350500193550638e-06, + "loss": 0.4353, + "step": 11306 + }, + { + "epoch": 1.8569375731324287, + "grad_norm": 0.2767441122373085, + "learning_rate": 6.35007269887932e-06, + "loss": 0.4567, + "step": 11307 + }, + { + "epoch": 1.8571018003407715, + "grad_norm": 0.29273901495114507, + "learning_rate": 6.349645184114392e-06, + "loss": 0.4567, + "step": 11308 + }, + { + "epoch": 1.8572660275491142, + "grad_norm": 0.2969139208502846, + "learning_rate": 6.349217649260497e-06, + "loss": 0.4342, + "step": 11309 + }, + { + "epoch": 1.8574302547574568, + "grad_norm": 0.35579388252131244, + "learning_rate": 6.34879009432228e-06, + "loss": 0.4611, + "step": 11310 + }, + { + "epoch": 1.8575944819657997, + "grad_norm": 0.34158770325629234, + "learning_rate": 6.348362519304382e-06, + "loss": 0.4616, + "step": 11311 + }, + { + "epoch": 1.8577587091741425, + "grad_norm": 0.28067291327273947, + "learning_rate": 6.34793492421145e-06, + "loss": 0.4455, + "step": 11312 + }, + { + "epoch": 1.8579229363824852, + "grad_norm": 0.30837793967609967, + "learning_rate": 6.347507309048125e-06, + "loss": 0.4578, + "step": 11313 + }, + { + "epoch": 1.8580871635908278, + "grad_norm": 0.3125025367261328, + "learning_rate": 6.347079673819053e-06, + "loss": 0.4472, + "step": 11314 + }, + { + "epoch": 1.8582513907991707, + "grad_norm": 0.5052978416465356, + "learning_rate": 6.34665201852888e-06, + "loss": 0.4471, + "step": 11315 + }, + { + "epoch": 1.8584156180075135, + "grad_norm": 0.31536032442555645, + "learning_rate": 6.346224343182248e-06, + "loss": 0.483, + "step": 11316 + }, + { + "epoch": 1.8585798452158562, + "grad_norm": 0.4470949513656592, + "learning_rate": 6.345796647783804e-06, + "loss": 0.4606, + "step": 11317 + }, + { + "epoch": 1.8587440724241988, + "grad_norm": 0.3170649277211733, + "learning_rate": 6.345368932338192e-06, + "loss": 0.4342, + "step": 11318 + }, + { + "epoch": 1.8589082996325417, + "grad_norm": 0.3424326385751127, + "learning_rate": 6.344941196850058e-06, + "loss": 0.463, + "step": 11319 + }, + { + "epoch": 1.8590725268408843, + "grad_norm": 0.3128336163128946, + "learning_rate": 6.344513441324048e-06, + "loss": 0.4494, + "step": 11320 + }, + { + "epoch": 1.8592367540492272, + "grad_norm": 0.4693741408208788, + "learning_rate": 6.344085665764806e-06, + "loss": 0.4228, + "step": 11321 + }, + { + "epoch": 1.8594009812575698, + "grad_norm": 0.3938432577957824, + "learning_rate": 6.343657870176979e-06, + "loss": 0.4579, + "step": 11322 + }, + { + "epoch": 1.8595652084659124, + "grad_norm": 0.2665157963971842, + "learning_rate": 6.343230054565215e-06, + "loss": 0.4559, + "step": 11323 + }, + { + "epoch": 1.8597294356742553, + "grad_norm": 0.5765119530939804, + "learning_rate": 6.342802218934159e-06, + "loss": 0.4366, + "step": 11324 + }, + { + "epoch": 1.8598936628825982, + "grad_norm": 0.6677510627653236, + "learning_rate": 6.342374363288456e-06, + "loss": 0.4634, + "step": 11325 + }, + { + "epoch": 1.8600578900909408, + "grad_norm": 0.3405761435040425, + "learning_rate": 6.341946487632758e-06, + "loss": 0.4518, + "step": 11326 + }, + { + "epoch": 1.8602221172992834, + "grad_norm": 0.28552553151580606, + "learning_rate": 6.341518591971707e-06, + "loss": 0.4468, + "step": 11327 + }, + { + "epoch": 1.8603863445076263, + "grad_norm": 0.3218955469025756, + "learning_rate": 6.341090676309951e-06, + "loss": 0.4628, + "step": 11328 + }, + { + "epoch": 1.8605505717159692, + "grad_norm": 0.3214289953278923, + "learning_rate": 6.340662740652141e-06, + "loss": 0.4289, + "step": 11329 + }, + { + "epoch": 1.8607147989243118, + "grad_norm": 0.2527221904206092, + "learning_rate": 6.340234785002922e-06, + "loss": 0.4347, + "step": 11330 + }, + { + "epoch": 1.8608790261326544, + "grad_norm": 0.39361376413646876, + "learning_rate": 6.339806809366942e-06, + "loss": 0.4583, + "step": 11331 + }, + { + "epoch": 1.8610432533409973, + "grad_norm": 0.36043320353855923, + "learning_rate": 6.339378813748852e-06, + "loss": 0.4427, + "step": 11332 + }, + { + "epoch": 1.8612074805493402, + "grad_norm": 0.8114253982192287, + "learning_rate": 6.338950798153295e-06, + "loss": 0.4365, + "step": 11333 + }, + { + "epoch": 1.8613717077576828, + "grad_norm": 0.2778367874294855, + "learning_rate": 6.338522762584925e-06, + "loss": 0.447, + "step": 11334 + }, + { + "epoch": 1.8615359349660254, + "grad_norm": 0.3560822146400267, + "learning_rate": 6.338094707048389e-06, + "loss": 0.4356, + "step": 11335 + }, + { + "epoch": 1.8617001621743683, + "grad_norm": 0.27192416670571484, + "learning_rate": 6.337666631548337e-06, + "loss": 0.4498, + "step": 11336 + }, + { + "epoch": 1.861864389382711, + "grad_norm": 0.4327915472121516, + "learning_rate": 6.337238536089416e-06, + "loss": 0.4366, + "step": 11337 + }, + { + "epoch": 1.8620286165910538, + "grad_norm": 0.4133339322356034, + "learning_rate": 6.336810420676277e-06, + "loss": 0.4509, + "step": 11338 + }, + { + "epoch": 1.8621928437993964, + "grad_norm": 0.2788288955428656, + "learning_rate": 6.336382285313569e-06, + "loss": 0.4571, + "step": 11339 + }, + { + "epoch": 1.862357071007739, + "grad_norm": 0.3527929954753714, + "learning_rate": 6.335954130005945e-06, + "loss": 0.4516, + "step": 11340 + }, + { + "epoch": 1.862521298216082, + "grad_norm": 0.4160277813715837, + "learning_rate": 6.335525954758051e-06, + "loss": 0.4483, + "step": 11341 + }, + { + "epoch": 1.8626855254244248, + "grad_norm": 0.41947127135248397, + "learning_rate": 6.335097759574539e-06, + "loss": 0.4715, + "step": 11342 + }, + { + "epoch": 1.8628497526327674, + "grad_norm": 0.30891972256795924, + "learning_rate": 6.33466954446006e-06, + "loss": 0.4757, + "step": 11343 + }, + { + "epoch": 1.86301397984111, + "grad_norm": 0.32281669512362693, + "learning_rate": 6.334241309419265e-06, + "loss": 0.4487, + "step": 11344 + }, + { + "epoch": 1.863178207049453, + "grad_norm": 0.3710096514926429, + "learning_rate": 6.333813054456805e-06, + "loss": 0.4512, + "step": 11345 + }, + { + "epoch": 1.8633424342577958, + "grad_norm": 0.297366741235457, + "learning_rate": 6.33338477957733e-06, + "loss": 0.4876, + "step": 11346 + }, + { + "epoch": 1.8635066614661384, + "grad_norm": 0.287759306786484, + "learning_rate": 6.332956484785495e-06, + "loss": 0.4574, + "step": 11347 + }, + { + "epoch": 1.863670888674481, + "grad_norm": 0.5127446230381244, + "learning_rate": 6.332528170085947e-06, + "loss": 0.4564, + "step": 11348 + }, + { + "epoch": 1.863835115882824, + "grad_norm": 0.3181484291823922, + "learning_rate": 6.33209983548334e-06, + "loss": 0.4511, + "step": 11349 + }, + { + "epoch": 1.8639993430911668, + "grad_norm": 0.36182292779200886, + "learning_rate": 6.331671480982328e-06, + "loss": 0.4509, + "step": 11350 + }, + { + "epoch": 1.8641635702995094, + "grad_norm": 0.3005556130697668, + "learning_rate": 6.3312431065875596e-06, + "loss": 0.4439, + "step": 11351 + }, + { + "epoch": 1.864327797507852, + "grad_norm": 0.31933998986102374, + "learning_rate": 6.33081471230369e-06, + "loss": 0.4796, + "step": 11352 + }, + { + "epoch": 1.864492024716195, + "grad_norm": 0.43163246367148256, + "learning_rate": 6.330386298135372e-06, + "loss": 0.4495, + "step": 11353 + }, + { + "epoch": 1.8646562519245375, + "grad_norm": 0.37365493661552046, + "learning_rate": 6.329957864087256e-06, + "loss": 0.4458, + "step": 11354 + }, + { + "epoch": 1.8648204791328804, + "grad_norm": 0.28485223530004894, + "learning_rate": 6.329529410163999e-06, + "loss": 0.4384, + "step": 11355 + }, + { + "epoch": 1.864984706341223, + "grad_norm": 0.3407623561451519, + "learning_rate": 6.329100936370253e-06, + "loss": 0.4628, + "step": 11356 + }, + { + "epoch": 1.8651489335495657, + "grad_norm": 0.3862070798121785, + "learning_rate": 6.328672442710671e-06, + "loss": 0.4508, + "step": 11357 + }, + { + "epoch": 1.8653131607579085, + "grad_norm": 0.3748507949476453, + "learning_rate": 6.328243929189905e-06, + "loss": 0.4458, + "step": 11358 + }, + { + "epoch": 1.8654773879662514, + "grad_norm": 0.38990948562174965, + "learning_rate": 6.327815395812613e-06, + "loss": 0.4455, + "step": 11359 + }, + { + "epoch": 1.865641615174594, + "grad_norm": 0.37073709138352673, + "learning_rate": 6.327386842583447e-06, + "loss": 0.4339, + "step": 11360 + }, + { + "epoch": 1.8658058423829367, + "grad_norm": 0.3471538957326833, + "learning_rate": 6.326958269507063e-06, + "loss": 0.4536, + "step": 11361 + }, + { + "epoch": 1.8659700695912795, + "grad_norm": 0.3642542342861628, + "learning_rate": 6.326529676588114e-06, + "loss": 0.4466, + "step": 11362 + }, + { + "epoch": 1.8661342967996224, + "grad_norm": 0.3782161620612055, + "learning_rate": 6.326101063831254e-06, + "loss": 0.4516, + "step": 11363 + }, + { + "epoch": 1.866298524007965, + "grad_norm": 0.3331236609817597, + "learning_rate": 6.325672431241142e-06, + "loss": 0.4546, + "step": 11364 + }, + { + "epoch": 1.8664627512163077, + "grad_norm": 1.0242190816400079, + "learning_rate": 6.325243778822431e-06, + "loss": 0.4514, + "step": 11365 + }, + { + "epoch": 1.8666269784246505, + "grad_norm": 0.27978995786744143, + "learning_rate": 6.324815106579777e-06, + "loss": 0.459, + "step": 11366 + }, + { + "epoch": 1.8667912056329934, + "grad_norm": 0.3215904181313786, + "learning_rate": 6.324386414517834e-06, + "loss": 0.452, + "step": 11367 + }, + { + "epoch": 1.866955432841336, + "grad_norm": 0.36826949228960465, + "learning_rate": 6.32395770264126e-06, + "loss": 0.4561, + "step": 11368 + }, + { + "epoch": 1.8671196600496787, + "grad_norm": 0.3002113463960166, + "learning_rate": 6.323528970954711e-06, + "loss": 0.4545, + "step": 11369 + }, + { + "epoch": 1.8672838872580215, + "grad_norm": 0.327024602938653, + "learning_rate": 6.323100219462844e-06, + "loss": 0.4568, + "step": 11370 + }, + { + "epoch": 1.8674481144663642, + "grad_norm": 0.44713301545973794, + "learning_rate": 6.322671448170314e-06, + "loss": 0.4622, + "step": 11371 + }, + { + "epoch": 1.867612341674707, + "grad_norm": 0.37617685259242234, + "learning_rate": 6.322242657081779e-06, + "loss": 0.451, + "step": 11372 + }, + { + "epoch": 1.8677765688830497, + "grad_norm": 0.4999178729558093, + "learning_rate": 6.321813846201897e-06, + "loss": 0.4676, + "step": 11373 + }, + { + "epoch": 1.8679407960913923, + "grad_norm": 0.3485035319342385, + "learning_rate": 6.321385015535323e-06, + "loss": 0.4607, + "step": 11374 + }, + { + "epoch": 1.8681050232997352, + "grad_norm": 0.3385822781097892, + "learning_rate": 6.320956165086716e-06, + "loss": 0.4748, + "step": 11375 + }, + { + "epoch": 1.868269250508078, + "grad_norm": 0.3730575244192388, + "learning_rate": 6.320527294860734e-06, + "loss": 0.4713, + "step": 11376 + }, + { + "epoch": 1.8684334777164207, + "grad_norm": 0.3223886623461952, + "learning_rate": 6.3200984048620335e-06, + "loss": 0.4469, + "step": 11377 + }, + { + "epoch": 1.8685977049247633, + "grad_norm": 0.31862136491199516, + "learning_rate": 6.319669495095275e-06, + "loss": 0.4537, + "step": 11378 + }, + { + "epoch": 1.8687619321331062, + "grad_norm": 0.29791203455607024, + "learning_rate": 6.3192405655651125e-06, + "loss": 0.4598, + "step": 11379 + }, + { + "epoch": 1.868926159341449, + "grad_norm": 0.32176269966805215, + "learning_rate": 6.318811616276211e-06, + "loss": 0.4483, + "step": 11380 + }, + { + "epoch": 1.8690903865497916, + "grad_norm": 0.43102468936947047, + "learning_rate": 6.318382647233225e-06, + "loss": 0.4427, + "step": 11381 + }, + { + "epoch": 1.8692546137581343, + "grad_norm": 0.33077619486864357, + "learning_rate": 6.3179536584408135e-06, + "loss": 0.4667, + "step": 11382 + }, + { + "epoch": 1.8694188409664771, + "grad_norm": 0.34418501391086825, + "learning_rate": 6.317524649903637e-06, + "loss": 0.4298, + "step": 11383 + }, + { + "epoch": 1.86958306817482, + "grad_norm": 0.29682961680933806, + "learning_rate": 6.317095621626354e-06, + "loss": 0.4531, + "step": 11384 + }, + { + "epoch": 1.8697472953831626, + "grad_norm": 0.3043037241928832, + "learning_rate": 6.316666573613625e-06, + "loss": 0.4474, + "step": 11385 + }, + { + "epoch": 1.8699115225915053, + "grad_norm": 0.39672356587770125, + "learning_rate": 6.316237505870111e-06, + "loss": 0.4473, + "step": 11386 + }, + { + "epoch": 1.8700757497998481, + "grad_norm": 0.32709641612353896, + "learning_rate": 6.315808418400469e-06, + "loss": 0.4533, + "step": 11387 + }, + { + "epoch": 1.8702399770081908, + "grad_norm": 0.571241424685496, + "learning_rate": 6.315379311209362e-06, + "loss": 0.4535, + "step": 11388 + }, + { + "epoch": 1.8704042042165336, + "grad_norm": 0.37875251809804106, + "learning_rate": 6.31495018430145e-06, + "loss": 0.452, + "step": 11389 + }, + { + "epoch": 1.8705684314248763, + "grad_norm": 0.35808189075596303, + "learning_rate": 6.3145210376813925e-06, + "loss": 0.4142, + "step": 11390 + }, + { + "epoch": 1.870732658633219, + "grad_norm": 0.33499970260438333, + "learning_rate": 6.314091871353852e-06, + "loss": 0.4386, + "step": 11391 + }, + { + "epoch": 1.8708968858415618, + "grad_norm": 0.3271825235316504, + "learning_rate": 6.313662685323488e-06, + "loss": 0.4348, + "step": 11392 + }, + { + "epoch": 1.8710611130499046, + "grad_norm": 0.3161373090038315, + "learning_rate": 6.313233479594963e-06, + "loss": 0.4542, + "step": 11393 + }, + { + "epoch": 1.8712253402582473, + "grad_norm": 0.3103016233161706, + "learning_rate": 6.312804254172938e-06, + "loss": 0.4525, + "step": 11394 + }, + { + "epoch": 1.87138956746659, + "grad_norm": 0.4571370996671186, + "learning_rate": 6.312375009062078e-06, + "loss": 0.4334, + "step": 11395 + }, + { + "epoch": 1.8715537946749328, + "grad_norm": 0.31005290994916324, + "learning_rate": 6.311945744267039e-06, + "loss": 0.4298, + "step": 11396 + }, + { + "epoch": 1.8717180218832756, + "grad_norm": 0.32531698774417556, + "learning_rate": 6.311516459792488e-06, + "loss": 0.4134, + "step": 11397 + }, + { + "epoch": 1.8718822490916183, + "grad_norm": 0.2637088668005535, + "learning_rate": 6.311087155643087e-06, + "loss": 0.4277, + "step": 11398 + }, + { + "epoch": 1.872046476299961, + "grad_norm": 0.352195618024151, + "learning_rate": 6.310657831823495e-06, + "loss": 0.4655, + "step": 11399 + }, + { + "epoch": 1.8722107035083038, + "grad_norm": 0.36961737788626986, + "learning_rate": 6.310228488338379e-06, + "loss": 0.4602, + "step": 11400 + }, + { + "epoch": 1.8723749307166466, + "grad_norm": 0.29607974177504476, + "learning_rate": 6.3097991251923995e-06, + "loss": 0.4531, + "step": 11401 + }, + { + "epoch": 1.8725391579249893, + "grad_norm": 0.3992638430794492, + "learning_rate": 6.309369742390224e-06, + "loss": 0.4544, + "step": 11402 + }, + { + "epoch": 1.872703385133332, + "grad_norm": 0.2969144434983537, + "learning_rate": 6.308940339936509e-06, + "loss": 0.4434, + "step": 11403 + }, + { + "epoch": 1.8728676123416748, + "grad_norm": 0.2615427443565544, + "learning_rate": 6.3085109178359245e-06, + "loss": 0.4419, + "step": 11404 + }, + { + "epoch": 1.8730318395500174, + "grad_norm": 0.32425811970912993, + "learning_rate": 6.308081476093131e-06, + "loss": 0.4489, + "step": 11405 + }, + { + "epoch": 1.8731960667583603, + "grad_norm": 0.47585657206487836, + "learning_rate": 6.3076520147127956e-06, + "loss": 0.4424, + "step": 11406 + }, + { + "epoch": 1.873360293966703, + "grad_norm": 0.30516338170610596, + "learning_rate": 6.30722253369958e-06, + "loss": 0.4413, + "step": 11407 + }, + { + "epoch": 1.8735245211750455, + "grad_norm": 0.39396978310804875, + "learning_rate": 6.306793033058147e-06, + "loss": 0.4667, + "step": 11408 + }, + { + "epoch": 1.8736887483833884, + "grad_norm": 0.3038677691494423, + "learning_rate": 6.306363512793167e-06, + "loss": 0.4502, + "step": 11409 + }, + { + "epoch": 1.8738529755917313, + "grad_norm": 0.31531753590896405, + "learning_rate": 6.305933972909301e-06, + "loss": 0.4226, + "step": 11410 + }, + { + "epoch": 1.874017202800074, + "grad_norm": 0.3287934612429399, + "learning_rate": 6.3055044134112165e-06, + "loss": 0.4417, + "step": 11411 + }, + { + "epoch": 1.8741814300084165, + "grad_norm": 0.27368395270120466, + "learning_rate": 6.305074834303576e-06, + "loss": 0.44, + "step": 11412 + }, + { + "epoch": 1.8743456572167594, + "grad_norm": 0.33455145144424686, + "learning_rate": 6.3046452355910465e-06, + "loss": 0.4573, + "step": 11413 + }, + { + "epoch": 1.8745098844251022, + "grad_norm": 0.3506306212009912, + "learning_rate": 6.304215617278296e-06, + "loss": 0.4549, + "step": 11414 + }, + { + "epoch": 1.8746741116334449, + "grad_norm": 0.3501676831247077, + "learning_rate": 6.303785979369988e-06, + "loss": 0.4774, + "step": 11415 + }, + { + "epoch": 1.8748383388417875, + "grad_norm": 0.2696692929128657, + "learning_rate": 6.30335632187079e-06, + "loss": 0.4582, + "step": 11416 + }, + { + "epoch": 1.8750025660501304, + "grad_norm": 0.2685861402212932, + "learning_rate": 6.302926644785367e-06, + "loss": 0.4552, + "step": 11417 + }, + { + "epoch": 1.8751667932584732, + "grad_norm": 0.5705129421412283, + "learning_rate": 6.302496948118388e-06, + "loss": 0.4575, + "step": 11418 + }, + { + "epoch": 1.8753310204668159, + "grad_norm": 0.27189216908107755, + "learning_rate": 6.302067231874519e-06, + "loss": 0.4584, + "step": 11419 + }, + { + "epoch": 1.8754952476751585, + "grad_norm": 0.30435635762114543, + "learning_rate": 6.3016374960584245e-06, + "loss": 0.4211, + "step": 11420 + }, + { + "epoch": 1.8756594748835014, + "grad_norm": 0.35066223874902763, + "learning_rate": 6.301207740674776e-06, + "loss": 0.4382, + "step": 11421 + }, + { + "epoch": 1.875823702091844, + "grad_norm": 0.2863261329375365, + "learning_rate": 6.300777965728238e-06, + "loss": 0.4595, + "step": 11422 + }, + { + "epoch": 1.8759879293001869, + "grad_norm": 0.3336400798811265, + "learning_rate": 6.300348171223482e-06, + "loss": 0.4607, + "step": 11423 + }, + { + "epoch": 1.8761521565085295, + "grad_norm": 0.265241509790696, + "learning_rate": 6.299918357165172e-06, + "loss": 0.4402, + "step": 11424 + }, + { + "epoch": 1.8763163837168721, + "grad_norm": 0.3908903746403792, + "learning_rate": 6.299488523557977e-06, + "loss": 0.4574, + "step": 11425 + }, + { + "epoch": 1.876480610925215, + "grad_norm": 0.3081261018458057, + "learning_rate": 6.299058670406567e-06, + "loss": 0.4411, + "step": 11426 + }, + { + "epoch": 1.8766448381335579, + "grad_norm": 0.36235121919155894, + "learning_rate": 6.298628797715611e-06, + "loss": 0.4471, + "step": 11427 + }, + { + "epoch": 1.8768090653419005, + "grad_norm": 0.3715659252716504, + "learning_rate": 6.298198905489775e-06, + "loss": 0.4622, + "step": 11428 + }, + { + "epoch": 1.8769732925502431, + "grad_norm": 0.2812323777900102, + "learning_rate": 6.297768993733731e-06, + "loss": 0.455, + "step": 11429 + }, + { + "epoch": 1.877137519758586, + "grad_norm": 0.5437456561571391, + "learning_rate": 6.297339062452145e-06, + "loss": 0.4685, + "step": 11430 + }, + { + "epoch": 1.8773017469669289, + "grad_norm": 0.3013365443414987, + "learning_rate": 6.296909111649689e-06, + "loss": 0.4403, + "step": 11431 + }, + { + "epoch": 1.8774659741752715, + "grad_norm": 0.3600951978257436, + "learning_rate": 6.296479141331033e-06, + "loss": 0.4688, + "step": 11432 + }, + { + "epoch": 1.8776302013836141, + "grad_norm": 0.32973648411913603, + "learning_rate": 6.296049151500847e-06, + "loss": 0.4671, + "step": 11433 + }, + { + "epoch": 1.877794428591957, + "grad_norm": 0.77516515033505, + "learning_rate": 6.295619142163799e-06, + "loss": 0.448, + "step": 11434 + }, + { + "epoch": 1.8779586558002999, + "grad_norm": 0.2813318600336481, + "learning_rate": 6.295189113324559e-06, + "loss": 0.4392, + "step": 11435 + }, + { + "epoch": 1.8781228830086425, + "grad_norm": 0.38431140428798827, + "learning_rate": 6.294759064987801e-06, + "loss": 0.4476, + "step": 11436 + }, + { + "epoch": 1.8782871102169851, + "grad_norm": 0.35869495335306845, + "learning_rate": 6.294328997158193e-06, + "loss": 0.428, + "step": 11437 + }, + { + "epoch": 1.878451337425328, + "grad_norm": 0.34393025001890787, + "learning_rate": 6.293898909840407e-06, + "loss": 0.443, + "step": 11438 + }, + { + "epoch": 1.8786155646336706, + "grad_norm": 0.30989139809827226, + "learning_rate": 6.293468803039114e-06, + "loss": 0.4453, + "step": 11439 + }, + { + "epoch": 1.8787797918420135, + "grad_norm": 0.3166916712968177, + "learning_rate": 6.293038676758985e-06, + "loss": 0.451, + "step": 11440 + }, + { + "epoch": 1.8789440190503561, + "grad_norm": 0.27680712337488805, + "learning_rate": 6.292608531004692e-06, + "loss": 0.4412, + "step": 11441 + }, + { + "epoch": 1.8791082462586988, + "grad_norm": 0.2942760765393107, + "learning_rate": 6.292178365780906e-06, + "loss": 0.4561, + "step": 11442 + }, + { + "epoch": 1.8792724734670416, + "grad_norm": 0.39917989420901784, + "learning_rate": 6.2917481810923e-06, + "loss": 0.4643, + "step": 11443 + }, + { + "epoch": 1.8794367006753845, + "grad_norm": 0.6947523304053774, + "learning_rate": 6.291317976943547e-06, + "loss": 0.4524, + "step": 11444 + }, + { + "epoch": 1.8796009278837271, + "grad_norm": 0.41434515474515987, + "learning_rate": 6.2908877533393164e-06, + "loss": 0.4302, + "step": 11445 + }, + { + "epoch": 1.8797651550920698, + "grad_norm": 0.45832914292702964, + "learning_rate": 6.290457510284283e-06, + "loss": 0.4517, + "step": 11446 + }, + { + "epoch": 1.8799293823004126, + "grad_norm": 0.32247693907639524, + "learning_rate": 6.290027247783121e-06, + "loss": 0.4395, + "step": 11447 + }, + { + "epoch": 1.8800936095087555, + "grad_norm": 0.2892060133011612, + "learning_rate": 6.289596965840503e-06, + "loss": 0.4655, + "step": 11448 + }, + { + "epoch": 1.8802578367170981, + "grad_norm": 0.3216050782026567, + "learning_rate": 6.2891666644610985e-06, + "loss": 0.4504, + "step": 11449 + }, + { + "epoch": 1.8804220639254408, + "grad_norm": 0.38605541235934754, + "learning_rate": 6.288736343649584e-06, + "loss": 0.4442, + "step": 11450 + }, + { + "epoch": 1.8805862911337836, + "grad_norm": 0.6160848873275179, + "learning_rate": 6.288306003410633e-06, + "loss": 0.468, + "step": 11451 + }, + { + "epoch": 1.8807505183421265, + "grad_norm": 0.3142613591082338, + "learning_rate": 6.287875643748921e-06, + "loss": 0.4312, + "step": 11452 + }, + { + "epoch": 1.8809147455504691, + "grad_norm": 0.3910821269458435, + "learning_rate": 6.2874452646691205e-06, + "loss": 0.4533, + "step": 11453 + }, + { + "epoch": 1.8810789727588118, + "grad_norm": 0.3190445080531141, + "learning_rate": 6.287014866175905e-06, + "loss": 0.4482, + "step": 11454 + }, + { + "epoch": 1.8812431999671546, + "grad_norm": 0.3013278803822531, + "learning_rate": 6.286584448273949e-06, + "loss": 0.4392, + "step": 11455 + }, + { + "epoch": 1.8814074271754972, + "grad_norm": 0.33607970980024, + "learning_rate": 6.286154010967928e-06, + "loss": 0.4596, + "step": 11456 + }, + { + "epoch": 1.88157165438384, + "grad_norm": 0.28505845262642243, + "learning_rate": 6.285723554262519e-06, + "loss": 0.4511, + "step": 11457 + }, + { + "epoch": 1.8817358815921827, + "grad_norm": 0.34519722662923064, + "learning_rate": 6.285293078162394e-06, + "loss": 0.4544, + "step": 11458 + }, + { + "epoch": 1.8819001088005254, + "grad_norm": 0.26225405676276736, + "learning_rate": 6.28486258267223e-06, + "loss": 0.4177, + "step": 11459 + }, + { + "epoch": 1.8820643360088682, + "grad_norm": 0.40104514826593735, + "learning_rate": 6.284432067796701e-06, + "loss": 0.4517, + "step": 11460 + }, + { + "epoch": 1.882228563217211, + "grad_norm": 0.29963155183195445, + "learning_rate": 6.284001533540486e-06, + "loss": 0.45, + "step": 11461 + }, + { + "epoch": 1.8823927904255537, + "grad_norm": 0.30673249122938356, + "learning_rate": 6.283570979908258e-06, + "loss": 0.4439, + "step": 11462 + }, + { + "epoch": 1.8825570176338964, + "grad_norm": 0.3151831837335785, + "learning_rate": 6.283140406904695e-06, + "loss": 0.4615, + "step": 11463 + }, + { + "epoch": 1.8827212448422392, + "grad_norm": 0.2774161243114612, + "learning_rate": 6.282709814534472e-06, + "loss": 0.448, + "step": 11464 + }, + { + "epoch": 1.882885472050582, + "grad_norm": 0.32876418823333714, + "learning_rate": 6.282279202802268e-06, + "loss": 0.4354, + "step": 11465 + }, + { + "epoch": 1.8830496992589247, + "grad_norm": 0.36150603135257353, + "learning_rate": 6.281848571712756e-06, + "loss": 0.4361, + "step": 11466 + }, + { + "epoch": 1.8832139264672674, + "grad_norm": 0.36968781944725243, + "learning_rate": 6.281417921270618e-06, + "loss": 0.4453, + "step": 11467 + }, + { + "epoch": 1.8833781536756102, + "grad_norm": 0.37179395407086213, + "learning_rate": 6.280987251480527e-06, + "loss": 0.4458, + "step": 11468 + }, + { + "epoch": 1.883542380883953, + "grad_norm": 0.4567677928565284, + "learning_rate": 6.280556562347163e-06, + "loss": 0.4602, + "step": 11469 + }, + { + "epoch": 1.8837066080922957, + "grad_norm": 0.3907571110244243, + "learning_rate": 6.280125853875202e-06, + "loss": 0.4568, + "step": 11470 + }, + { + "epoch": 1.8838708353006384, + "grad_norm": 0.3045018771953139, + "learning_rate": 6.279695126069323e-06, + "loss": 0.4285, + "step": 11471 + }, + { + "epoch": 1.8840350625089812, + "grad_norm": 0.38405524916279843, + "learning_rate": 6.279264378934205e-06, + "loss": 0.4835, + "step": 11472 + }, + { + "epoch": 1.8841992897173239, + "grad_norm": 0.2940885242084993, + "learning_rate": 6.278833612474525e-06, + "loss": 0.4656, + "step": 11473 + }, + { + "epoch": 1.8843635169256667, + "grad_norm": 0.34809134178677464, + "learning_rate": 6.2784028266949615e-06, + "loss": 0.4426, + "step": 11474 + }, + { + "epoch": 1.8845277441340094, + "grad_norm": 0.27515836675655314, + "learning_rate": 6.277972021600192e-06, + "loss": 0.4433, + "step": 11475 + }, + { + "epoch": 1.884691971342352, + "grad_norm": 0.33595930419622255, + "learning_rate": 6.277541197194899e-06, + "loss": 0.4445, + "step": 11476 + }, + { + "epoch": 1.8848561985506949, + "grad_norm": 0.32328908953131097, + "learning_rate": 6.27711035348376e-06, + "loss": 0.4423, + "step": 11477 + }, + { + "epoch": 1.8850204257590377, + "grad_norm": 0.2819189057919243, + "learning_rate": 6.276679490471454e-06, + "loss": 0.4349, + "step": 11478 + }, + { + "epoch": 1.8851846529673804, + "grad_norm": 0.3256501015452309, + "learning_rate": 6.276248608162659e-06, + "loss": 0.4511, + "step": 11479 + }, + { + "epoch": 1.885348880175723, + "grad_norm": 0.3552499408145123, + "learning_rate": 6.275817706562058e-06, + "loss": 0.4528, + "step": 11480 + }, + { + "epoch": 1.8855131073840659, + "grad_norm": 0.2841417854307412, + "learning_rate": 6.275386785674329e-06, + "loss": 0.4485, + "step": 11481 + }, + { + "epoch": 1.8856773345924087, + "grad_norm": 0.31758879872950563, + "learning_rate": 6.274955845504154e-06, + "loss": 0.4472, + "step": 11482 + }, + { + "epoch": 1.8858415618007514, + "grad_norm": 0.43039374723598556, + "learning_rate": 6.2745248860562105e-06, + "loss": 0.4618, + "step": 11483 + }, + { + "epoch": 1.886005789009094, + "grad_norm": 0.3325004206289839, + "learning_rate": 6.274093907335181e-06, + "loss": 0.4484, + "step": 11484 + }, + { + "epoch": 1.8861700162174369, + "grad_norm": 0.3247843329476943, + "learning_rate": 6.273662909345747e-06, + "loss": 0.4326, + "step": 11485 + }, + { + "epoch": 1.8863342434257797, + "grad_norm": 0.3280630386694321, + "learning_rate": 6.273231892092589e-06, + "loss": 0.4396, + "step": 11486 + }, + { + "epoch": 1.8864984706341223, + "grad_norm": 0.3923324865040793, + "learning_rate": 6.272800855580388e-06, + "loss": 0.4365, + "step": 11487 + }, + { + "epoch": 1.886662697842465, + "grad_norm": 0.33057770575928486, + "learning_rate": 6.272369799813824e-06, + "loss": 0.443, + "step": 11488 + }, + { + "epoch": 1.8868269250508078, + "grad_norm": 0.7173285870423338, + "learning_rate": 6.271938724797581e-06, + "loss": 0.4398, + "step": 11489 + }, + { + "epoch": 1.8869911522591505, + "grad_norm": 0.4121293372702511, + "learning_rate": 6.27150763053634e-06, + "loss": 0.4651, + "step": 11490 + }, + { + "epoch": 1.8871553794674933, + "grad_norm": 0.29936010036350014, + "learning_rate": 6.271076517034784e-06, + "loss": 0.4499, + "step": 11491 + }, + { + "epoch": 1.887319606675836, + "grad_norm": 0.3528959255900349, + "learning_rate": 6.270645384297594e-06, + "loss": 0.4273, + "step": 11492 + }, + { + "epoch": 1.8874838338841786, + "grad_norm": 0.39731600061297867, + "learning_rate": 6.270214232329453e-06, + "loss": 0.4497, + "step": 11493 + }, + { + "epoch": 1.8876480610925215, + "grad_norm": 0.277693424780527, + "learning_rate": 6.269783061135044e-06, + "loss": 0.444, + "step": 11494 + }, + { + "epoch": 1.8878122883008643, + "grad_norm": 0.40844379179595136, + "learning_rate": 6.269351870719049e-06, + "loss": 0.4735, + "step": 11495 + }, + { + "epoch": 1.887976515509207, + "grad_norm": 0.31135073552625936, + "learning_rate": 6.268920661086153e-06, + "loss": 0.437, + "step": 11496 + }, + { + "epoch": 1.8881407427175496, + "grad_norm": 0.32411155738375347, + "learning_rate": 6.268489432241038e-06, + "loss": 0.4472, + "step": 11497 + }, + { + "epoch": 1.8883049699258925, + "grad_norm": 0.28558926169895243, + "learning_rate": 6.268058184188387e-06, + "loss": 0.4326, + "step": 11498 + }, + { + "epoch": 1.8884691971342353, + "grad_norm": 0.3449194860672656, + "learning_rate": 6.267626916932886e-06, + "loss": 0.4565, + "step": 11499 + }, + { + "epoch": 1.888633424342578, + "grad_norm": 0.34609846937748573, + "learning_rate": 6.267195630479215e-06, + "loss": 0.45, + "step": 11500 + }, + { + "epoch": 1.8887976515509206, + "grad_norm": 0.30123922815432147, + "learning_rate": 6.266764324832063e-06, + "loss": 0.4421, + "step": 11501 + }, + { + "epoch": 1.8889618787592635, + "grad_norm": 0.33185310581070543, + "learning_rate": 6.266332999996111e-06, + "loss": 0.4628, + "step": 11502 + }, + { + "epoch": 1.8891261059676063, + "grad_norm": 0.45745920406436913, + "learning_rate": 6.265901655976046e-06, + "loss": 0.4583, + "step": 11503 + }, + { + "epoch": 1.889290333175949, + "grad_norm": 0.2995880781951445, + "learning_rate": 6.265470292776551e-06, + "loss": 0.4449, + "step": 11504 + }, + { + "epoch": 1.8894545603842916, + "grad_norm": 0.45810579993366735, + "learning_rate": 6.265038910402311e-06, + "loss": 0.4526, + "step": 11505 + }, + { + "epoch": 1.8896187875926345, + "grad_norm": 0.28333685775743284, + "learning_rate": 6.264607508858013e-06, + "loss": 0.4512, + "step": 11506 + }, + { + "epoch": 1.889783014800977, + "grad_norm": 0.35274460640803634, + "learning_rate": 6.2641760881483415e-06, + "loss": 0.4415, + "step": 11507 + }, + { + "epoch": 1.88994724200932, + "grad_norm": 0.32639796842673624, + "learning_rate": 6.26374464827798e-06, + "loss": 0.4678, + "step": 11508 + }, + { + "epoch": 1.8901114692176626, + "grad_norm": 0.35721090968404956, + "learning_rate": 6.263313189251618e-06, + "loss": 0.4635, + "step": 11509 + }, + { + "epoch": 1.8902756964260052, + "grad_norm": 0.3324618961782957, + "learning_rate": 6.262881711073939e-06, + "loss": 0.4408, + "step": 11510 + }, + { + "epoch": 1.890439923634348, + "grad_norm": 0.3248140728701892, + "learning_rate": 6.262450213749631e-06, + "loss": 0.4484, + "step": 11511 + }, + { + "epoch": 1.890604150842691, + "grad_norm": 0.30240343332479325, + "learning_rate": 6.26201869728338e-06, + "loss": 0.4579, + "step": 11512 + }, + { + "epoch": 1.8907683780510336, + "grad_norm": 0.3734330252729453, + "learning_rate": 6.261587161679871e-06, + "loss": 0.4604, + "step": 11513 + }, + { + "epoch": 1.8909326052593762, + "grad_norm": 0.31340663492426546, + "learning_rate": 6.261155606943793e-06, + "loss": 0.4493, + "step": 11514 + }, + { + "epoch": 1.891096832467719, + "grad_norm": 0.3291571447089467, + "learning_rate": 6.260724033079832e-06, + "loss": 0.4614, + "step": 11515 + }, + { + "epoch": 1.891261059676062, + "grad_norm": 0.29024380784731046, + "learning_rate": 6.260292440092677e-06, + "loss": 0.4427, + "step": 11516 + }, + { + "epoch": 1.8914252868844046, + "grad_norm": 0.9818095952183078, + "learning_rate": 6.259860827987014e-06, + "loss": 0.4554, + "step": 11517 + }, + { + "epoch": 1.8915895140927472, + "grad_norm": 0.59070892702716, + "learning_rate": 6.259429196767529e-06, + "loss": 0.4225, + "step": 11518 + }, + { + "epoch": 1.89175374130109, + "grad_norm": 0.35026140088051805, + "learning_rate": 6.258997546438914e-06, + "loss": 0.4421, + "step": 11519 + }, + { + "epoch": 1.891917968509433, + "grad_norm": 0.455132303164521, + "learning_rate": 6.258565877005853e-06, + "loss": 0.4801, + "step": 11520 + }, + { + "epoch": 1.8920821957177756, + "grad_norm": 0.3062190096578685, + "learning_rate": 6.258134188473038e-06, + "loss": 0.4525, + "step": 11521 + }, + { + "epoch": 1.8922464229261182, + "grad_norm": 0.33415070443741085, + "learning_rate": 6.257702480845155e-06, + "loss": 0.4534, + "step": 11522 + }, + { + "epoch": 1.892410650134461, + "grad_norm": 0.30454079751989627, + "learning_rate": 6.257270754126895e-06, + "loss": 0.4444, + "step": 11523 + }, + { + "epoch": 1.8925748773428037, + "grad_norm": 0.28195699746422137, + "learning_rate": 6.256839008322944e-06, + "loss": 0.4761, + "step": 11524 + }, + { + "epoch": 1.8927391045511466, + "grad_norm": 0.32220012186937447, + "learning_rate": 6.256407243437993e-06, + "loss": 0.4656, + "step": 11525 + }, + { + "epoch": 1.8929033317594892, + "grad_norm": 0.29079011686059164, + "learning_rate": 6.255975459476733e-06, + "loss": 0.4481, + "step": 11526 + }, + { + "epoch": 1.8930675589678319, + "grad_norm": 0.27533179125606577, + "learning_rate": 6.25554365644385e-06, + "loss": 0.4347, + "step": 11527 + }, + { + "epoch": 1.8932317861761747, + "grad_norm": 0.2722721620887308, + "learning_rate": 6.255111834344037e-06, + "loss": 0.447, + "step": 11528 + }, + { + "epoch": 1.8933960133845176, + "grad_norm": 0.40678158182691243, + "learning_rate": 6.254679993181982e-06, + "loss": 0.4652, + "step": 11529 + }, + { + "epoch": 1.8935602405928602, + "grad_norm": 0.32142588418213813, + "learning_rate": 6.254248132962377e-06, + "loss": 0.4635, + "step": 11530 + }, + { + "epoch": 1.8937244678012028, + "grad_norm": 0.29323029764232855, + "learning_rate": 6.253816253689909e-06, + "loss": 0.4447, + "step": 11531 + }, + { + "epoch": 1.8938886950095457, + "grad_norm": 0.29601891468051994, + "learning_rate": 6.2533843553692736e-06, + "loss": 0.4559, + "step": 11532 + }, + { + "epoch": 1.8940529222178886, + "grad_norm": 0.286254401704754, + "learning_rate": 6.252952438005157e-06, + "loss": 0.4337, + "step": 11533 + }, + { + "epoch": 1.8942171494262312, + "grad_norm": 0.3827539442976956, + "learning_rate": 6.252520501602252e-06, + "loss": 0.4571, + "step": 11534 + }, + { + "epoch": 1.8943813766345738, + "grad_norm": 0.2871685184088086, + "learning_rate": 6.2520885461652515e-06, + "loss": 0.4257, + "step": 11535 + }, + { + "epoch": 1.8945456038429167, + "grad_norm": 0.8404977702623102, + "learning_rate": 6.251656571698846e-06, + "loss": 0.4478, + "step": 11536 + }, + { + "epoch": 1.8947098310512596, + "grad_norm": 0.32025923391449485, + "learning_rate": 6.251224578207725e-06, + "loss": 0.4484, + "step": 11537 + }, + { + "epoch": 1.8948740582596022, + "grad_norm": 0.286052981589957, + "learning_rate": 6.2507925656965825e-06, + "loss": 0.4484, + "step": 11538 + }, + { + "epoch": 1.8950382854679448, + "grad_norm": 0.2856432354318945, + "learning_rate": 6.25036053417011e-06, + "loss": 0.441, + "step": 11539 + }, + { + "epoch": 1.8952025126762877, + "grad_norm": 0.3021535240566939, + "learning_rate": 6.2499284836330014e-06, + "loss": 0.4523, + "step": 11540 + }, + { + "epoch": 1.8953667398846303, + "grad_norm": 0.4005247080570124, + "learning_rate": 6.249496414089948e-06, + "loss": 0.463, + "step": 11541 + }, + { + "epoch": 1.8955309670929732, + "grad_norm": 0.2994150789421531, + "learning_rate": 6.24906432554564e-06, + "loss": 0.4448, + "step": 11542 + }, + { + "epoch": 1.8956951943013158, + "grad_norm": 0.3034711094777697, + "learning_rate": 6.248632218004773e-06, + "loss": 0.4422, + "step": 11543 + }, + { + "epoch": 1.8958594215096585, + "grad_norm": 0.3241176095587336, + "learning_rate": 6.248200091472042e-06, + "loss": 0.4704, + "step": 11544 + }, + { + "epoch": 1.8960236487180013, + "grad_norm": 0.31980159348254694, + "learning_rate": 6.247767945952138e-06, + "loss": 0.4389, + "step": 11545 + }, + { + "epoch": 1.8961878759263442, + "grad_norm": 0.37791081930961246, + "learning_rate": 6.247335781449751e-06, + "loss": 0.4415, + "step": 11546 + }, + { + "epoch": 1.8963521031346868, + "grad_norm": 0.271887232267887, + "learning_rate": 6.2469035979695805e-06, + "loss": 0.449, + "step": 11547 + }, + { + "epoch": 1.8965163303430295, + "grad_norm": 0.33458075064495246, + "learning_rate": 6.246471395516319e-06, + "loss": 0.4447, + "step": 11548 + }, + { + "epoch": 1.8966805575513723, + "grad_norm": 0.8032382841821906, + "learning_rate": 6.2460391740946585e-06, + "loss": 0.4596, + "step": 11549 + }, + { + "epoch": 1.8968447847597152, + "grad_norm": 0.39747951496956996, + "learning_rate": 6.245606933709296e-06, + "loss": 0.4669, + "step": 11550 + }, + { + "epoch": 1.8970090119680578, + "grad_norm": 0.29049470000836963, + "learning_rate": 6.245174674364923e-06, + "loss": 0.4586, + "step": 11551 + }, + { + "epoch": 1.8971732391764005, + "grad_norm": 0.3086255776162428, + "learning_rate": 6.244742396066237e-06, + "loss": 0.4448, + "step": 11552 + }, + { + "epoch": 1.8973374663847433, + "grad_norm": 0.3656060035316904, + "learning_rate": 6.244310098817933e-06, + "loss": 0.4494, + "step": 11553 + }, + { + "epoch": 1.8975016935930862, + "grad_norm": 0.2870249267218026, + "learning_rate": 6.243877782624703e-06, + "loss": 0.4474, + "step": 11554 + }, + { + "epoch": 1.8976659208014288, + "grad_norm": 0.39457473479831795, + "learning_rate": 6.243445447491246e-06, + "loss": 0.4487, + "step": 11555 + }, + { + "epoch": 1.8978301480097715, + "grad_norm": 0.35379457137991177, + "learning_rate": 6.2430130934222545e-06, + "loss": 0.4588, + "step": 11556 + }, + { + "epoch": 1.8979943752181143, + "grad_norm": 0.39416032087160907, + "learning_rate": 6.242580720422428e-06, + "loss": 0.4667, + "step": 11557 + }, + { + "epoch": 1.898158602426457, + "grad_norm": 0.3312029003928857, + "learning_rate": 6.242148328496459e-06, + "loss": 0.4657, + "step": 11558 + }, + { + "epoch": 1.8983228296347998, + "grad_norm": 0.3977817496035618, + "learning_rate": 6.241715917649046e-06, + "loss": 0.4449, + "step": 11559 + }, + { + "epoch": 1.8984870568431425, + "grad_norm": 0.2861507297462149, + "learning_rate": 6.241283487884884e-06, + "loss": 0.4449, + "step": 11560 + }, + { + "epoch": 1.898651284051485, + "grad_norm": 0.2867583166903834, + "learning_rate": 6.2408510392086714e-06, + "loss": 0.4653, + "step": 11561 + }, + { + "epoch": 1.898815511259828, + "grad_norm": 0.3138206192945438, + "learning_rate": 6.240418571625102e-06, + "loss": 0.4344, + "step": 11562 + }, + { + "epoch": 1.8989797384681708, + "grad_norm": 0.33426289559226574, + "learning_rate": 6.239986085138875e-06, + "loss": 0.4586, + "step": 11563 + }, + { + "epoch": 1.8991439656765134, + "grad_norm": 0.3225829509226065, + "learning_rate": 6.2395535797546875e-06, + "loss": 0.4533, + "step": 11564 + }, + { + "epoch": 1.899308192884856, + "grad_norm": 0.32700753558233936, + "learning_rate": 6.239121055477237e-06, + "loss": 0.4444, + "step": 11565 + }, + { + "epoch": 1.899472420093199, + "grad_norm": 0.7395817042078737, + "learning_rate": 6.238688512311219e-06, + "loss": 0.4559, + "step": 11566 + }, + { + "epoch": 1.8996366473015418, + "grad_norm": 0.3176473441311553, + "learning_rate": 6.238255950261335e-06, + "loss": 0.4468, + "step": 11567 + }, + { + "epoch": 1.8998008745098844, + "grad_norm": 0.3308084452657348, + "learning_rate": 6.23782336933228e-06, + "loss": 0.4732, + "step": 11568 + }, + { + "epoch": 1.899965101718227, + "grad_norm": 0.3424847343425013, + "learning_rate": 6.237390769528754e-06, + "loss": 0.459, + "step": 11569 + }, + { + "epoch": 1.90012932892657, + "grad_norm": 0.3329594799384342, + "learning_rate": 6.236958150855456e-06, + "loss": 0.4605, + "step": 11570 + }, + { + "epoch": 1.9002935561349128, + "grad_norm": 0.5164135512378328, + "learning_rate": 6.236525513317083e-06, + "loss": 0.4517, + "step": 11571 + }, + { + "epoch": 1.9004577833432554, + "grad_norm": 0.2962729195900784, + "learning_rate": 6.236092856918333e-06, + "loss": 0.454, + "step": 11572 + }, + { + "epoch": 1.900622010551598, + "grad_norm": 0.4457999560545923, + "learning_rate": 6.235660181663906e-06, + "loss": 0.4499, + "step": 11573 + }, + { + "epoch": 1.900786237759941, + "grad_norm": 0.29085184015101817, + "learning_rate": 6.235227487558504e-06, + "loss": 0.4487, + "step": 11574 + }, + { + "epoch": 1.9009504649682836, + "grad_norm": 0.2955717712662332, + "learning_rate": 6.2347947746068245e-06, + "loss": 0.4578, + "step": 11575 + }, + { + "epoch": 1.9011146921766264, + "grad_norm": 0.3152314118744751, + "learning_rate": 6.234362042813565e-06, + "loss": 0.4325, + "step": 11576 + }, + { + "epoch": 1.901278919384969, + "grad_norm": 0.26407825448367234, + "learning_rate": 6.233929292183427e-06, + "loss": 0.4503, + "step": 11577 + }, + { + "epoch": 1.9014431465933117, + "grad_norm": 0.3915379290000229, + "learning_rate": 6.233496522721113e-06, + "loss": 0.4504, + "step": 11578 + }, + { + "epoch": 1.9016073738016546, + "grad_norm": 0.3412610510019888, + "learning_rate": 6.233063734431321e-06, + "loss": 0.4463, + "step": 11579 + }, + { + "epoch": 1.9017716010099974, + "grad_norm": 0.32050195639366946, + "learning_rate": 6.23263092731875e-06, + "loss": 0.4497, + "step": 11580 + }, + { + "epoch": 1.90193582821834, + "grad_norm": 0.38273878233161573, + "learning_rate": 6.232198101388104e-06, + "loss": 0.4403, + "step": 11581 + }, + { + "epoch": 1.9021000554266827, + "grad_norm": 0.27806982974287925, + "learning_rate": 6.2317652566440825e-06, + "loss": 0.4451, + "step": 11582 + }, + { + "epoch": 1.9022642826350256, + "grad_norm": 0.32856501937660954, + "learning_rate": 6.231332393091385e-06, + "loss": 0.4551, + "step": 11583 + }, + { + "epoch": 1.9024285098433684, + "grad_norm": 0.2682827626213739, + "learning_rate": 6.230899510734716e-06, + "loss": 0.44, + "step": 11584 + }, + { + "epoch": 1.902592737051711, + "grad_norm": 0.49272255538438214, + "learning_rate": 6.230466609578773e-06, + "loss": 0.4414, + "step": 11585 + }, + { + "epoch": 1.9027569642600537, + "grad_norm": 0.32260483225705006, + "learning_rate": 6.230033689628262e-06, + "loss": 0.4271, + "step": 11586 + }, + { + "epoch": 1.9029211914683966, + "grad_norm": 0.29093147267619285, + "learning_rate": 6.229600750887883e-06, + "loss": 0.4579, + "step": 11587 + }, + { + "epoch": 1.9030854186767394, + "grad_norm": 0.35265332639510805, + "learning_rate": 6.229167793362337e-06, + "loss": 0.4413, + "step": 11588 + }, + { + "epoch": 1.903249645885082, + "grad_norm": 0.3142574353716658, + "learning_rate": 6.228734817056328e-06, + "loss": 0.4415, + "step": 11589 + }, + { + "epoch": 1.9034138730934247, + "grad_norm": 0.5622430879448479, + "learning_rate": 6.228301821974559e-06, + "loss": 0.4357, + "step": 11590 + }, + { + "epoch": 1.9035781003017676, + "grad_norm": 0.3871498522685965, + "learning_rate": 6.227868808121731e-06, + "loss": 0.4503, + "step": 11591 + }, + { + "epoch": 1.9037423275101102, + "grad_norm": 0.3141566277760273, + "learning_rate": 6.227435775502547e-06, + "loss": 0.4639, + "step": 11592 + }, + { + "epoch": 1.903906554718453, + "grad_norm": 0.32246353834594804, + "learning_rate": 6.227002724121711e-06, + "loss": 0.4344, + "step": 11593 + }, + { + "epoch": 1.9040707819267957, + "grad_norm": 0.29801343541053976, + "learning_rate": 6.226569653983929e-06, + "loss": 0.4561, + "step": 11594 + }, + { + "epoch": 1.9042350091351383, + "grad_norm": 0.33855218720060254, + "learning_rate": 6.2261365650939e-06, + "loss": 0.445, + "step": 11595 + }, + { + "epoch": 1.9043992363434812, + "grad_norm": 0.3087953002873901, + "learning_rate": 6.2257034574563285e-06, + "loss": 0.4356, + "step": 11596 + }, + { + "epoch": 1.904563463551824, + "grad_norm": 0.448227658275254, + "learning_rate": 6.225270331075921e-06, + "loss": 0.4669, + "step": 11597 + }, + { + "epoch": 1.9047276907601667, + "grad_norm": 0.325155177303091, + "learning_rate": 6.22483718595738e-06, + "loss": 0.4615, + "step": 11598 + }, + { + "epoch": 1.9048919179685093, + "grad_norm": 0.4675952573516096, + "learning_rate": 6.2244040221054095e-06, + "loss": 0.4613, + "step": 11599 + }, + { + "epoch": 1.9050561451768522, + "grad_norm": 0.32235353184835497, + "learning_rate": 6.223970839524715e-06, + "loss": 0.4601, + "step": 11600 + }, + { + "epoch": 1.905220372385195, + "grad_norm": 0.33233067198397287, + "learning_rate": 6.223537638220001e-06, + "loss": 0.4715, + "step": 11601 + }, + { + "epoch": 1.9053845995935377, + "grad_norm": 0.2982512809301434, + "learning_rate": 6.223104418195972e-06, + "loss": 0.4555, + "step": 11602 + }, + { + "epoch": 1.9055488268018803, + "grad_norm": 0.3801442216186049, + "learning_rate": 6.2226711794573354e-06, + "loss": 0.4318, + "step": 11603 + }, + { + "epoch": 1.9057130540102232, + "grad_norm": 0.2957424808790963, + "learning_rate": 6.222237922008795e-06, + "loss": 0.4574, + "step": 11604 + }, + { + "epoch": 1.905877281218566, + "grad_norm": 0.3087230953401648, + "learning_rate": 6.221804645855054e-06, + "loss": 0.4411, + "step": 11605 + }, + { + "epoch": 1.9060415084269087, + "grad_norm": 0.3542054566112007, + "learning_rate": 6.221371351000822e-06, + "loss": 0.4476, + "step": 11606 + }, + { + "epoch": 1.9062057356352513, + "grad_norm": 0.3553897290376785, + "learning_rate": 6.2209380374508035e-06, + "loss": 0.4707, + "step": 11607 + }, + { + "epoch": 1.906369962843594, + "grad_norm": 0.3564058255833194, + "learning_rate": 6.220504705209705e-06, + "loss": 0.4495, + "step": 11608 + }, + { + "epoch": 1.9065341900519368, + "grad_norm": 0.3832925455459929, + "learning_rate": 6.220071354282232e-06, + "loss": 0.4625, + "step": 11609 + }, + { + "epoch": 1.9066984172602797, + "grad_norm": 0.2817875411638949, + "learning_rate": 6.219637984673092e-06, + "loss": 0.4453, + "step": 11610 + }, + { + "epoch": 1.9068626444686223, + "grad_norm": 0.4776547443948289, + "learning_rate": 6.219204596386991e-06, + "loss": 0.4558, + "step": 11611 + }, + { + "epoch": 1.907026871676965, + "grad_norm": 0.34962942789150825, + "learning_rate": 6.218771189428637e-06, + "loss": 0.455, + "step": 11612 + }, + { + "epoch": 1.9071910988853078, + "grad_norm": 0.30962434569385655, + "learning_rate": 6.218337763802738e-06, + "loss": 0.4462, + "step": 11613 + }, + { + "epoch": 1.9073553260936507, + "grad_norm": 0.3391862780256781, + "learning_rate": 6.2179043195139985e-06, + "loss": 0.4351, + "step": 11614 + }, + { + "epoch": 1.9075195533019933, + "grad_norm": 0.30719116774597893, + "learning_rate": 6.2174708565671296e-06, + "loss": 0.4492, + "step": 11615 + }, + { + "epoch": 1.907683780510336, + "grad_norm": 0.30054149364958066, + "learning_rate": 6.217037374966836e-06, + "loss": 0.4628, + "step": 11616 + }, + { + "epoch": 1.9078480077186788, + "grad_norm": 0.386653014036295, + "learning_rate": 6.216603874717828e-06, + "loss": 0.4479, + "step": 11617 + }, + { + "epoch": 1.9080122349270217, + "grad_norm": 0.31127433847704317, + "learning_rate": 6.216170355824812e-06, + "loss": 0.4377, + "step": 11618 + }, + { + "epoch": 1.9081764621353643, + "grad_norm": 0.3605960243523264, + "learning_rate": 6.215736818292499e-06, + "loss": 0.459, + "step": 11619 + }, + { + "epoch": 1.908340689343707, + "grad_norm": 0.33757270101982667, + "learning_rate": 6.215303262125595e-06, + "loss": 0.4627, + "step": 11620 + }, + { + "epoch": 1.9085049165520498, + "grad_norm": 0.38173723225560446, + "learning_rate": 6.21486968732881e-06, + "loss": 0.4569, + "step": 11621 + }, + { + "epoch": 1.9086691437603927, + "grad_norm": 0.32491942608170243, + "learning_rate": 6.214436093906852e-06, + "loss": 0.4513, + "step": 11622 + }, + { + "epoch": 1.9088333709687353, + "grad_norm": 0.38253325439429253, + "learning_rate": 6.214002481864434e-06, + "loss": 0.429, + "step": 11623 + }, + { + "epoch": 1.908997598177078, + "grad_norm": 0.5169229832581916, + "learning_rate": 6.213568851206261e-06, + "loss": 0.4398, + "step": 11624 + }, + { + "epoch": 1.9091618253854206, + "grad_norm": 0.36052604572634506, + "learning_rate": 6.2131352019370446e-06, + "loss": 0.4403, + "step": 11625 + }, + { + "epoch": 1.9093260525937634, + "grad_norm": 0.2924912097276931, + "learning_rate": 6.212701534061493e-06, + "loss": 0.4611, + "step": 11626 + }, + { + "epoch": 1.9094902798021063, + "grad_norm": 0.3404899101670224, + "learning_rate": 6.212267847584319e-06, + "loss": 0.4447, + "step": 11627 + }, + { + "epoch": 1.909654507010449, + "grad_norm": 0.36381309075662677, + "learning_rate": 6.211834142510232e-06, + "loss": 0.4577, + "step": 11628 + }, + { + "epoch": 1.9098187342187916, + "grad_norm": 0.38424637201933903, + "learning_rate": 6.211400418843942e-06, + "loss": 0.4495, + "step": 11629 + }, + { + "epoch": 1.9099829614271344, + "grad_norm": 0.9281977678260263, + "learning_rate": 6.2109666765901585e-06, + "loss": 0.4659, + "step": 11630 + }, + { + "epoch": 1.9101471886354773, + "grad_norm": 0.35161232363455713, + "learning_rate": 6.2105329157535935e-06, + "loss": 0.455, + "step": 11631 + }, + { + "epoch": 1.91031141584382, + "grad_norm": 1.1585271182741959, + "learning_rate": 6.21009913633896e-06, + "loss": 0.4312, + "step": 11632 + }, + { + "epoch": 1.9104756430521626, + "grad_norm": 0.3674122396524195, + "learning_rate": 6.209665338350967e-06, + "loss": 0.4796, + "step": 11633 + }, + { + "epoch": 1.9106398702605054, + "grad_norm": 0.36718309874128846, + "learning_rate": 6.209231521794324e-06, + "loss": 0.4304, + "step": 11634 + }, + { + "epoch": 1.9108040974688483, + "grad_norm": 0.6703568626303426, + "learning_rate": 6.208797686673746e-06, + "loss": 0.4637, + "step": 11635 + }, + { + "epoch": 1.910968324677191, + "grad_norm": 0.40632340164699454, + "learning_rate": 6.2083638329939455e-06, + "loss": 0.46, + "step": 11636 + }, + { + "epoch": 1.9111325518855335, + "grad_norm": 0.35780222483159585, + "learning_rate": 6.207929960759631e-06, + "loss": 0.4555, + "step": 11637 + }, + { + "epoch": 1.9112967790938764, + "grad_norm": 0.34875469521533164, + "learning_rate": 6.207496069975519e-06, + "loss": 0.4445, + "step": 11638 + }, + { + "epoch": 1.9114610063022193, + "grad_norm": 0.37116131198879804, + "learning_rate": 6.207062160646318e-06, + "loss": 0.4627, + "step": 11639 + }, + { + "epoch": 1.911625233510562, + "grad_norm": 0.3261872497806788, + "learning_rate": 6.206628232776743e-06, + "loss": 0.436, + "step": 11640 + }, + { + "epoch": 1.9117894607189045, + "grad_norm": 0.2917346097333719, + "learning_rate": 6.206194286371505e-06, + "loss": 0.438, + "step": 11641 + }, + { + "epoch": 1.9119536879272472, + "grad_norm": 0.318715515401316, + "learning_rate": 6.205760321435319e-06, + "loss": 0.4716, + "step": 11642 + }, + { + "epoch": 1.91211791513559, + "grad_norm": 0.3822885736630288, + "learning_rate": 6.205326337972899e-06, + "loss": 0.4612, + "step": 11643 + }, + { + "epoch": 1.912282142343933, + "grad_norm": 0.39512948614808435, + "learning_rate": 6.204892335988956e-06, + "loss": 0.4546, + "step": 11644 + }, + { + "epoch": 1.9124463695522755, + "grad_norm": 0.2906356956083419, + "learning_rate": 6.204458315488205e-06, + "loss": 0.4323, + "step": 11645 + }, + { + "epoch": 1.9126105967606182, + "grad_norm": 0.3151679721101704, + "learning_rate": 6.204024276475361e-06, + "loss": 0.4274, + "step": 11646 + }, + { + "epoch": 1.912774823968961, + "grad_norm": 0.280920093275785, + "learning_rate": 6.203590218955136e-06, + "loss": 0.4345, + "step": 11647 + }, + { + "epoch": 1.912939051177304, + "grad_norm": 0.3177305722530813, + "learning_rate": 6.203156142932243e-06, + "loss": 0.4561, + "step": 11648 + }, + { + "epoch": 1.9131032783856465, + "grad_norm": 0.35949250252352477, + "learning_rate": 6.202722048411402e-06, + "loss": 0.4366, + "step": 11649 + }, + { + "epoch": 1.9132675055939892, + "grad_norm": 0.37584685311355087, + "learning_rate": 6.202287935397321e-06, + "loss": 0.4478, + "step": 11650 + }, + { + "epoch": 1.913431732802332, + "grad_norm": 0.33655101627969175, + "learning_rate": 6.20185380389472e-06, + "loss": 0.4548, + "step": 11651 + }, + { + "epoch": 1.913595960010675, + "grad_norm": 0.30964429163933455, + "learning_rate": 6.201419653908313e-06, + "loss": 0.4538, + "step": 11652 + }, + { + "epoch": 1.9137601872190175, + "grad_norm": 0.29372286656988195, + "learning_rate": 6.200985485442815e-06, + "loss": 0.4318, + "step": 11653 + }, + { + "epoch": 1.9139244144273602, + "grad_norm": 1.6537874331386198, + "learning_rate": 6.200551298502939e-06, + "loss": 0.4327, + "step": 11654 + }, + { + "epoch": 1.914088641635703, + "grad_norm": 0.414880176031993, + "learning_rate": 6.2001170930934025e-06, + "loss": 0.4639, + "step": 11655 + }, + { + "epoch": 1.9142528688440459, + "grad_norm": 0.381572536194753, + "learning_rate": 6.199682869218922e-06, + "loss": 0.454, + "step": 11656 + }, + { + "epoch": 1.9144170960523885, + "grad_norm": 0.3618841751303404, + "learning_rate": 6.199248626884215e-06, + "loss": 0.4415, + "step": 11657 + }, + { + "epoch": 1.9145813232607312, + "grad_norm": 0.3091815517830806, + "learning_rate": 6.198814366093996e-06, + "loss": 0.4635, + "step": 11658 + }, + { + "epoch": 1.9147455504690738, + "grad_norm": 0.3461588940465848, + "learning_rate": 6.198380086852981e-06, + "loss": 0.4545, + "step": 11659 + }, + { + "epoch": 1.9149097776774167, + "grad_norm": 0.32914313950790325, + "learning_rate": 6.197945789165885e-06, + "loss": 0.4288, + "step": 11660 + }, + { + "epoch": 1.9150740048857595, + "grad_norm": 0.30501120382983676, + "learning_rate": 6.197511473037431e-06, + "loss": 0.4331, + "step": 11661 + }, + { + "epoch": 1.9152382320941022, + "grad_norm": 0.31219879028545594, + "learning_rate": 6.19707713847233e-06, + "loss": 0.4464, + "step": 11662 + }, + { + "epoch": 1.9154024593024448, + "grad_norm": 0.31460550974100093, + "learning_rate": 6.196642785475302e-06, + "loss": 0.4562, + "step": 11663 + }, + { + "epoch": 1.9155666865107877, + "grad_norm": 0.36270468986808635, + "learning_rate": 6.196208414051064e-06, + "loss": 0.4501, + "step": 11664 + }, + { + "epoch": 1.9157309137191305, + "grad_norm": 0.4112853774633795, + "learning_rate": 6.195774024204334e-06, + "loss": 0.4611, + "step": 11665 + }, + { + "epoch": 1.9158951409274732, + "grad_norm": 0.33448305601115735, + "learning_rate": 6.19533961593983e-06, + "loss": 0.4738, + "step": 11666 + }, + { + "epoch": 1.9160593681358158, + "grad_norm": 0.5320652524792794, + "learning_rate": 6.194905189262269e-06, + "loss": 0.4803, + "step": 11667 + }, + { + "epoch": 1.9162235953441586, + "grad_norm": 0.3020273231113173, + "learning_rate": 6.19447074417637e-06, + "loss": 0.4522, + "step": 11668 + }, + { + "epoch": 1.9163878225525015, + "grad_norm": 0.3270778515804615, + "learning_rate": 6.194036280686851e-06, + "loss": 0.4652, + "step": 11669 + }, + { + "epoch": 1.9165520497608441, + "grad_norm": 0.30776326745206567, + "learning_rate": 6.193601798798435e-06, + "loss": 0.4489, + "step": 11670 + }, + { + "epoch": 1.9167162769691868, + "grad_norm": 0.30728786187995366, + "learning_rate": 6.193167298515833e-06, + "loss": 0.4427, + "step": 11671 + }, + { + "epoch": 1.9168805041775296, + "grad_norm": 0.3927785447223489, + "learning_rate": 6.192732779843771e-06, + "loss": 0.4553, + "step": 11672 + }, + { + "epoch": 1.9170447313858725, + "grad_norm": 0.29282886384148993, + "learning_rate": 6.192298242786963e-06, + "loss": 0.4499, + "step": 11673 + }, + { + "epoch": 1.9172089585942151, + "grad_norm": 0.31577968887587804, + "learning_rate": 6.191863687350133e-06, + "loss": 0.4792, + "step": 11674 + }, + { + "epoch": 1.9173731858025578, + "grad_norm": 0.3761942499553469, + "learning_rate": 6.191429113537998e-06, + "loss": 0.4498, + "step": 11675 + }, + { + "epoch": 1.9175374130109004, + "grad_norm": 0.317601854958605, + "learning_rate": 6.190994521355279e-06, + "loss": 0.4429, + "step": 11676 + }, + { + "epoch": 1.9177016402192433, + "grad_norm": 0.3517208242879503, + "learning_rate": 6.190559910806696e-06, + "loss": 0.4644, + "step": 11677 + }, + { + "epoch": 1.9178658674275861, + "grad_norm": 0.3241901907564984, + "learning_rate": 6.190125281896969e-06, + "loss": 0.456, + "step": 11678 + }, + { + "epoch": 1.9180300946359288, + "grad_norm": 0.3025139517641792, + "learning_rate": 6.189690634630818e-06, + "loss": 0.4564, + "step": 11679 + }, + { + "epoch": 1.9181943218442714, + "grad_norm": 0.35524817549149657, + "learning_rate": 6.189255969012965e-06, + "loss": 0.4399, + "step": 11680 + }, + { + "epoch": 1.9183585490526143, + "grad_norm": 0.4244953064809638, + "learning_rate": 6.18882128504813e-06, + "loss": 0.4292, + "step": 11681 + }, + { + "epoch": 1.9185227762609571, + "grad_norm": 0.30648349992963597, + "learning_rate": 6.188386582741034e-06, + "loss": 0.4547, + "step": 11682 + }, + { + "epoch": 1.9186870034692998, + "grad_norm": 0.30163722557002615, + "learning_rate": 6.187951862096398e-06, + "loss": 0.4584, + "step": 11683 + }, + { + "epoch": 1.9188512306776424, + "grad_norm": 0.33493401319101296, + "learning_rate": 6.187517123118945e-06, + "loss": 0.4509, + "step": 11684 + }, + { + "epoch": 1.9190154578859853, + "grad_norm": 0.313911034591551, + "learning_rate": 6.187082365813395e-06, + "loss": 0.4539, + "step": 11685 + }, + { + "epoch": 1.9191796850943281, + "grad_norm": 0.5202964654233119, + "learning_rate": 6.186647590184471e-06, + "loss": 0.4664, + "step": 11686 + }, + { + "epoch": 1.9193439123026708, + "grad_norm": 0.2911861487320873, + "learning_rate": 6.186212796236896e-06, + "loss": 0.4493, + "step": 11687 + }, + { + "epoch": 1.9195081395110134, + "grad_norm": 0.3894019111871137, + "learning_rate": 6.185777983975389e-06, + "loss": 0.4719, + "step": 11688 + }, + { + "epoch": 1.9196723667193563, + "grad_norm": 0.3263915765382676, + "learning_rate": 6.185343153404675e-06, + "loss": 0.4603, + "step": 11689 + }, + { + "epoch": 1.9198365939276991, + "grad_norm": 0.2812099109496593, + "learning_rate": 6.184908304529477e-06, + "loss": 0.4397, + "step": 11690 + }, + { + "epoch": 1.9200008211360418, + "grad_norm": 0.3316284604588048, + "learning_rate": 6.184473437354517e-06, + "loss": 0.4634, + "step": 11691 + }, + { + "epoch": 1.9201650483443844, + "grad_norm": 0.3214264446614346, + "learning_rate": 6.184038551884518e-06, + "loss": 0.4567, + "step": 11692 + }, + { + "epoch": 1.920329275552727, + "grad_norm": 0.29348354172249397, + "learning_rate": 6.183603648124203e-06, + "loss": 0.4354, + "step": 11693 + }, + { + "epoch": 1.92049350276107, + "grad_norm": 0.42769052741256725, + "learning_rate": 6.183168726078295e-06, + "loss": 0.4522, + "step": 11694 + }, + { + "epoch": 1.9206577299694128, + "grad_norm": 0.3172711368471013, + "learning_rate": 6.182733785751521e-06, + "loss": 0.4598, + "step": 11695 + }, + { + "epoch": 1.9208219571777554, + "grad_norm": 0.3141973931650255, + "learning_rate": 6.182298827148602e-06, + "loss": 0.4464, + "step": 11696 + }, + { + "epoch": 1.920986184386098, + "grad_norm": 0.3561540020725479, + "learning_rate": 6.181863850274262e-06, + "loss": 0.4455, + "step": 11697 + }, + { + "epoch": 1.9211504115944409, + "grad_norm": 0.32393565694758736, + "learning_rate": 6.181428855133225e-06, + "loss": 0.4564, + "step": 11698 + }, + { + "epoch": 1.9213146388027837, + "grad_norm": 0.298550373846303, + "learning_rate": 6.1809938417302176e-06, + "loss": 0.4506, + "step": 11699 + }, + { + "epoch": 1.9214788660111264, + "grad_norm": 0.3943979027923778, + "learning_rate": 6.180558810069962e-06, + "loss": 0.4397, + "step": 11700 + }, + { + "epoch": 1.921643093219469, + "grad_norm": 0.33128113794288905, + "learning_rate": 6.180123760157187e-06, + "loss": 0.4649, + "step": 11701 + }, + { + "epoch": 1.9218073204278119, + "grad_norm": 0.33788593807856176, + "learning_rate": 6.179688691996611e-06, + "loss": 0.4505, + "step": 11702 + }, + { + "epoch": 1.9219715476361547, + "grad_norm": 0.29197053226973274, + "learning_rate": 6.179253605592966e-06, + "loss": 0.4484, + "step": 11703 + }, + { + "epoch": 1.9221357748444974, + "grad_norm": 0.4002773092580003, + "learning_rate": 6.178818500950975e-06, + "loss": 0.4368, + "step": 11704 + }, + { + "epoch": 1.92230000205284, + "grad_norm": 0.3225017152426909, + "learning_rate": 6.178383378075361e-06, + "loss": 0.4492, + "step": 11705 + }, + { + "epoch": 1.9224642292611829, + "grad_norm": 0.3292097446594349, + "learning_rate": 6.177948236970854e-06, + "loss": 0.4524, + "step": 11706 + }, + { + "epoch": 1.9226284564695257, + "grad_norm": 0.351995173082209, + "learning_rate": 6.177513077642178e-06, + "loss": 0.4453, + "step": 11707 + }, + { + "epoch": 1.9227926836778684, + "grad_norm": 0.31252517151783626, + "learning_rate": 6.177077900094058e-06, + "loss": 0.453, + "step": 11708 + }, + { + "epoch": 1.922956910886211, + "grad_norm": 0.2861624043707913, + "learning_rate": 6.176642704331224e-06, + "loss": 0.4326, + "step": 11709 + }, + { + "epoch": 1.9231211380945537, + "grad_norm": 0.3842211082952547, + "learning_rate": 6.176207490358399e-06, + "loss": 0.4414, + "step": 11710 + }, + { + "epoch": 1.9232853653028965, + "grad_norm": 0.7917893981486481, + "learning_rate": 6.175772258180314e-06, + "loss": 0.445, + "step": 11711 + }, + { + "epoch": 1.9234495925112394, + "grad_norm": 0.4790207371880203, + "learning_rate": 6.175337007801691e-06, + "loss": 0.4307, + "step": 11712 + }, + { + "epoch": 1.923613819719582, + "grad_norm": 0.40561312865159216, + "learning_rate": 6.174901739227259e-06, + "loss": 0.4498, + "step": 11713 + }, + { + "epoch": 1.9237780469279246, + "grad_norm": 0.47192462934888785, + "learning_rate": 6.174466452461749e-06, + "loss": 0.4539, + "step": 11714 + }, + { + "epoch": 1.9239422741362675, + "grad_norm": 0.3453248892639385, + "learning_rate": 6.174031147509885e-06, + "loss": 0.452, + "step": 11715 + }, + { + "epoch": 1.9241065013446104, + "grad_norm": 0.46993171084266266, + "learning_rate": 6.173595824376396e-06, + "loss": 0.441, + "step": 11716 + }, + { + "epoch": 1.924270728552953, + "grad_norm": 0.3132400675291704, + "learning_rate": 6.17316048306601e-06, + "loss": 0.4474, + "step": 11717 + }, + { + "epoch": 1.9244349557612956, + "grad_norm": 0.5141692658873547, + "learning_rate": 6.172725123583452e-06, + "loss": 0.4445, + "step": 11718 + }, + { + "epoch": 1.9245991829696385, + "grad_norm": 0.3123892509641973, + "learning_rate": 6.1722897459334554e-06, + "loss": 0.4195, + "step": 11719 + }, + { + "epoch": 1.9247634101779814, + "grad_norm": 0.32517861042004026, + "learning_rate": 6.171854350120748e-06, + "loss": 0.4694, + "step": 11720 + }, + { + "epoch": 1.924927637386324, + "grad_norm": 0.43041962733396594, + "learning_rate": 6.171418936150057e-06, + "loss": 0.4448, + "step": 11721 + }, + { + "epoch": 1.9250918645946666, + "grad_norm": 0.32581878191201175, + "learning_rate": 6.170983504026111e-06, + "loss": 0.4353, + "step": 11722 + }, + { + "epoch": 1.9252560918030095, + "grad_norm": 0.3221459896375791, + "learning_rate": 6.17054805375364e-06, + "loss": 0.4501, + "step": 11723 + }, + { + "epoch": 1.9254203190113524, + "grad_norm": 0.4751198372362801, + "learning_rate": 6.170112585337375e-06, + "loss": 0.466, + "step": 11724 + }, + { + "epoch": 1.925584546219695, + "grad_norm": 0.4111261993043956, + "learning_rate": 6.169677098782044e-06, + "loss": 0.4415, + "step": 11725 + }, + { + "epoch": 1.9257487734280376, + "grad_norm": 0.4096292396716481, + "learning_rate": 6.169241594092376e-06, + "loss": 0.4322, + "step": 11726 + }, + { + "epoch": 1.9259130006363803, + "grad_norm": 0.3758588386747688, + "learning_rate": 6.168806071273102e-06, + "loss": 0.4531, + "step": 11727 + }, + { + "epoch": 1.9260772278447231, + "grad_norm": 0.32238900503219686, + "learning_rate": 6.168370530328952e-06, + "loss": 0.4496, + "step": 11728 + }, + { + "epoch": 1.926241455053066, + "grad_norm": 0.3738260566944197, + "learning_rate": 6.167934971264657e-06, + "loss": 0.4533, + "step": 11729 + }, + { + "epoch": 1.9264056822614086, + "grad_norm": 0.3247033594038588, + "learning_rate": 6.167499394084947e-06, + "loss": 0.4448, + "step": 11730 + }, + { + "epoch": 1.9265699094697513, + "grad_norm": 0.3613742112719459, + "learning_rate": 6.167063798794553e-06, + "loss": 0.452, + "step": 11731 + }, + { + "epoch": 1.9267341366780941, + "grad_norm": 0.30402605193304383, + "learning_rate": 6.166628185398207e-06, + "loss": 0.4418, + "step": 11732 + }, + { + "epoch": 1.926898363886437, + "grad_norm": 0.36077690801634543, + "learning_rate": 6.166192553900637e-06, + "loss": 0.4634, + "step": 11733 + }, + { + "epoch": 1.9270625910947796, + "grad_norm": 0.3157741710943167, + "learning_rate": 6.165756904306578e-06, + "loss": 0.437, + "step": 11734 + }, + { + "epoch": 1.9272268183031223, + "grad_norm": 0.36396243689784985, + "learning_rate": 6.16532123662076e-06, + "loss": 0.4715, + "step": 11735 + }, + { + "epoch": 1.9273910455114651, + "grad_norm": 0.32922576893702915, + "learning_rate": 6.164885550847916e-06, + "loss": 0.4401, + "step": 11736 + }, + { + "epoch": 1.927555272719808, + "grad_norm": 0.40557599637326114, + "learning_rate": 6.1644498469927755e-06, + "loss": 0.4527, + "step": 11737 + }, + { + "epoch": 1.9277194999281506, + "grad_norm": 0.32617286436421994, + "learning_rate": 6.164014125060072e-06, + "loss": 0.4699, + "step": 11738 + }, + { + "epoch": 1.9278837271364933, + "grad_norm": 0.3296499446537119, + "learning_rate": 6.163578385054538e-06, + "loss": 0.4516, + "step": 11739 + }, + { + "epoch": 1.9280479543448361, + "grad_norm": 0.5284605915318895, + "learning_rate": 6.163142626980906e-06, + "loss": 0.4438, + "step": 11740 + }, + { + "epoch": 1.928212181553179, + "grad_norm": 0.304222483380602, + "learning_rate": 6.1627068508439095e-06, + "loss": 0.4493, + "step": 11741 + }, + { + "epoch": 1.9283764087615216, + "grad_norm": 0.37578326123625305, + "learning_rate": 6.1622710566482795e-06, + "loss": 0.4453, + "step": 11742 + }, + { + "epoch": 1.9285406359698642, + "grad_norm": 0.3316740779606767, + "learning_rate": 6.161835244398751e-06, + "loss": 0.4475, + "step": 11743 + }, + { + "epoch": 1.9287048631782069, + "grad_norm": 0.4724230151433168, + "learning_rate": 6.161399414100057e-06, + "loss": 0.464, + "step": 11744 + }, + { + "epoch": 1.9288690903865497, + "grad_norm": 0.33102520210764236, + "learning_rate": 6.160963565756932e-06, + "loss": 0.4482, + "step": 11745 + }, + { + "epoch": 1.9290333175948926, + "grad_norm": 0.3039412054856362, + "learning_rate": 6.160527699374107e-06, + "loss": 0.4579, + "step": 11746 + }, + { + "epoch": 1.9291975448032352, + "grad_norm": 0.4759493387711221, + "learning_rate": 6.160091814956317e-06, + "loss": 0.4543, + "step": 11747 + }, + { + "epoch": 1.9293617720115779, + "grad_norm": 0.30295570419838147, + "learning_rate": 6.159655912508297e-06, + "loss": 0.4502, + "step": 11748 + }, + { + "epoch": 1.9295259992199207, + "grad_norm": 0.3095851008676612, + "learning_rate": 6.159219992034782e-06, + "loss": 0.4476, + "step": 11749 + }, + { + "epoch": 1.9296902264282636, + "grad_norm": 0.30467728546486345, + "learning_rate": 6.158784053540504e-06, + "loss": 0.4536, + "step": 11750 + }, + { + "epoch": 1.9298544536366062, + "grad_norm": 0.3967570517620851, + "learning_rate": 6.1583480970301995e-06, + "loss": 0.436, + "step": 11751 + }, + { + "epoch": 1.9300186808449489, + "grad_norm": 0.3002495362111364, + "learning_rate": 6.157912122508603e-06, + "loss": 0.4264, + "step": 11752 + }, + { + "epoch": 1.9301829080532917, + "grad_norm": 0.3828269006370637, + "learning_rate": 6.157476129980451e-06, + "loss": 0.4421, + "step": 11753 + }, + { + "epoch": 1.9303471352616346, + "grad_norm": 0.30484222608414635, + "learning_rate": 6.157040119450475e-06, + "loss": 0.4594, + "step": 11754 + }, + { + "epoch": 1.9305113624699772, + "grad_norm": 0.4559966835142827, + "learning_rate": 6.156604090923415e-06, + "loss": 0.4618, + "step": 11755 + }, + { + "epoch": 1.9306755896783199, + "grad_norm": 0.2895503351697011, + "learning_rate": 6.1561680444040035e-06, + "loss": 0.4558, + "step": 11756 + }, + { + "epoch": 1.9308398168866627, + "grad_norm": 0.30856588658860873, + "learning_rate": 6.1557319798969785e-06, + "loss": 0.4452, + "step": 11757 + }, + { + "epoch": 1.9310040440950056, + "grad_norm": 0.49077377646148856, + "learning_rate": 6.155295897407075e-06, + "loss": 0.4692, + "step": 11758 + }, + { + "epoch": 1.9311682713033482, + "grad_norm": 0.2968529348737747, + "learning_rate": 6.154859796939029e-06, + "loss": 0.4615, + "step": 11759 + }, + { + "epoch": 1.9313324985116909, + "grad_norm": 0.3808550571539235, + "learning_rate": 6.154423678497578e-06, + "loss": 0.4438, + "step": 11760 + }, + { + "epoch": 1.9314967257200335, + "grad_norm": 0.33257583457981604, + "learning_rate": 6.153987542087457e-06, + "loss": 0.4715, + "step": 11761 + }, + { + "epoch": 1.9316609529283764, + "grad_norm": 0.3034797701744059, + "learning_rate": 6.153551387713406e-06, + "loss": 0.4335, + "step": 11762 + }, + { + "epoch": 1.9318251801367192, + "grad_norm": 0.3366439268492937, + "learning_rate": 6.153115215380159e-06, + "loss": 0.4536, + "step": 11763 + }, + { + "epoch": 1.9319894073450619, + "grad_norm": 0.333839910373322, + "learning_rate": 6.1526790250924545e-06, + "loss": 0.4588, + "step": 11764 + }, + { + "epoch": 1.9321536345534045, + "grad_norm": 0.29555048190918864, + "learning_rate": 6.1522428168550286e-06, + "loss": 0.441, + "step": 11765 + }, + { + "epoch": 1.9323178617617474, + "grad_norm": 0.3160434566532558, + "learning_rate": 6.151806590672622e-06, + "loss": 0.4259, + "step": 11766 + }, + { + "epoch": 1.9324820889700902, + "grad_norm": 0.3977836618478647, + "learning_rate": 6.151370346549969e-06, + "loss": 0.4411, + "step": 11767 + }, + { + "epoch": 1.9326463161784329, + "grad_norm": 0.2852990076026332, + "learning_rate": 6.150934084491809e-06, + "loss": 0.4394, + "step": 11768 + }, + { + "epoch": 1.9328105433867755, + "grad_norm": 0.34967502718124766, + "learning_rate": 6.1504978045028825e-06, + "loss": 0.4371, + "step": 11769 + }, + { + "epoch": 1.9329747705951184, + "grad_norm": 0.2949264653113903, + "learning_rate": 6.150061506587925e-06, + "loss": 0.4506, + "step": 11770 + }, + { + "epoch": 1.9331389978034612, + "grad_norm": 0.32292941521963847, + "learning_rate": 6.149625190751676e-06, + "loss": 0.4514, + "step": 11771 + }, + { + "epoch": 1.9333032250118038, + "grad_norm": 0.3901095676284324, + "learning_rate": 6.149188856998874e-06, + "loss": 0.4461, + "step": 11772 + }, + { + "epoch": 1.9334674522201465, + "grad_norm": 0.3148958430118748, + "learning_rate": 6.148752505334259e-06, + "loss": 0.439, + "step": 11773 + }, + { + "epoch": 1.9336316794284893, + "grad_norm": 0.4645426254687232, + "learning_rate": 6.148316135762571e-06, + "loss": 0.4662, + "step": 11774 + }, + { + "epoch": 1.9337959066368322, + "grad_norm": 0.3381456693550708, + "learning_rate": 6.147879748288546e-06, + "loss": 0.4397, + "step": 11775 + }, + { + "epoch": 1.9339601338451748, + "grad_norm": 0.2965471428211383, + "learning_rate": 6.1474433429169255e-06, + "loss": 0.4574, + "step": 11776 + }, + { + "epoch": 1.9341243610535175, + "grad_norm": 0.3603201346209414, + "learning_rate": 6.14700691965245e-06, + "loss": 0.4469, + "step": 11777 + }, + { + "epoch": 1.9342885882618601, + "grad_norm": 0.32054268239319894, + "learning_rate": 6.146570478499859e-06, + "loss": 0.437, + "step": 11778 + }, + { + "epoch": 1.934452815470203, + "grad_norm": 0.48292569009156594, + "learning_rate": 6.146134019463895e-06, + "loss": 0.4561, + "step": 11779 + }, + { + "epoch": 1.9346170426785458, + "grad_norm": 0.3414445379177717, + "learning_rate": 6.1456975425492925e-06, + "loss": 0.4601, + "step": 11780 + }, + { + "epoch": 1.9347812698868885, + "grad_norm": 0.32659418586761957, + "learning_rate": 6.145261047760797e-06, + "loss": 0.4548, + "step": 11781 + }, + { + "epoch": 1.9349454970952311, + "grad_norm": 0.35652097068907895, + "learning_rate": 6.144824535103147e-06, + "loss": 0.4782, + "step": 11782 + }, + { + "epoch": 1.935109724303574, + "grad_norm": 0.2759711761110947, + "learning_rate": 6.144388004581084e-06, + "loss": 0.4528, + "step": 11783 + }, + { + "epoch": 1.9352739515119168, + "grad_norm": 0.34655152524113225, + "learning_rate": 6.143951456199352e-06, + "loss": 0.4426, + "step": 11784 + }, + { + "epoch": 1.9354381787202595, + "grad_norm": 0.30678758926269073, + "learning_rate": 6.143514889962687e-06, + "loss": 0.4533, + "step": 11785 + }, + { + "epoch": 1.935602405928602, + "grad_norm": 0.3465024740320573, + "learning_rate": 6.143078305875834e-06, + "loss": 0.4457, + "step": 11786 + }, + { + "epoch": 1.935766633136945, + "grad_norm": 0.2944811158711843, + "learning_rate": 6.142641703943534e-06, + "loss": 0.4469, + "step": 11787 + }, + { + "epoch": 1.9359308603452878, + "grad_norm": 0.4588778909977551, + "learning_rate": 6.142205084170529e-06, + "loss": 0.4569, + "step": 11788 + }, + { + "epoch": 1.9360950875536305, + "grad_norm": 0.38001254582850585, + "learning_rate": 6.141768446561563e-06, + "loss": 0.4391, + "step": 11789 + }, + { + "epoch": 1.936259314761973, + "grad_norm": 0.36647792906297894, + "learning_rate": 6.141331791121374e-06, + "loss": 0.4675, + "step": 11790 + }, + { + "epoch": 1.936423541970316, + "grad_norm": 0.3147751631851471, + "learning_rate": 6.140895117854708e-06, + "loss": 0.4451, + "step": 11791 + }, + { + "epoch": 1.9365877691786588, + "grad_norm": 0.37708723383068626, + "learning_rate": 6.140458426766305e-06, + "loss": 0.4363, + "step": 11792 + }, + { + "epoch": 1.9367519963870015, + "grad_norm": 0.314419687793131, + "learning_rate": 6.140021717860911e-06, + "loss": 0.4344, + "step": 11793 + }, + { + "epoch": 1.936916223595344, + "grad_norm": 0.2978678788734563, + "learning_rate": 6.139584991143268e-06, + "loss": 0.4354, + "step": 11794 + }, + { + "epoch": 1.9370804508036867, + "grad_norm": 0.6290390429035364, + "learning_rate": 6.139148246618118e-06, + "loss": 0.4435, + "step": 11795 + }, + { + "epoch": 1.9372446780120296, + "grad_norm": 0.3930535583884199, + "learning_rate": 6.138711484290205e-06, + "loss": 0.4452, + "step": 11796 + }, + { + "epoch": 1.9374089052203725, + "grad_norm": 0.4058397974826341, + "learning_rate": 6.1382747041642735e-06, + "loss": 0.4424, + "step": 11797 + }, + { + "epoch": 1.937573132428715, + "grad_norm": 0.28754785524275933, + "learning_rate": 6.137837906245067e-06, + "loss": 0.4626, + "step": 11798 + }, + { + "epoch": 1.9377373596370577, + "grad_norm": 0.37145994996852144, + "learning_rate": 6.13740109053733e-06, + "loss": 0.4296, + "step": 11799 + }, + { + "epoch": 1.9379015868454006, + "grad_norm": 0.31987207942764523, + "learning_rate": 6.136964257045804e-06, + "loss": 0.4407, + "step": 11800 + }, + { + "epoch": 1.9380658140537435, + "grad_norm": 0.3120597556496171, + "learning_rate": 6.136527405775238e-06, + "loss": 0.4154, + "step": 11801 + }, + { + "epoch": 1.938230041262086, + "grad_norm": 0.32184214275987144, + "learning_rate": 6.136090536730372e-06, + "loss": 0.4416, + "step": 11802 + }, + { + "epoch": 1.9383942684704287, + "grad_norm": 0.30422790837511227, + "learning_rate": 6.1356536499159555e-06, + "loss": 0.4775, + "step": 11803 + }, + { + "epoch": 1.9385584956787716, + "grad_norm": 0.34677800611122794, + "learning_rate": 6.1352167453367305e-06, + "loss": 0.4449, + "step": 11804 + }, + { + "epoch": 1.9387227228871144, + "grad_norm": 0.2992127868849352, + "learning_rate": 6.134779822997442e-06, + "loss": 0.4461, + "step": 11805 + }, + { + "epoch": 1.938886950095457, + "grad_norm": 0.3231711502256461, + "learning_rate": 6.134342882902836e-06, + "loss": 0.4586, + "step": 11806 + }, + { + "epoch": 1.9390511773037997, + "grad_norm": 0.3037909014573711, + "learning_rate": 6.133905925057659e-06, + "loss": 0.4529, + "step": 11807 + }, + { + "epoch": 1.9392154045121426, + "grad_norm": 0.3337539556315541, + "learning_rate": 6.1334689494666564e-06, + "loss": 0.4604, + "step": 11808 + }, + { + "epoch": 1.9393796317204854, + "grad_norm": 0.33305602986601746, + "learning_rate": 6.133031956134573e-06, + "loss": 0.4361, + "step": 11809 + }, + { + "epoch": 1.939543858928828, + "grad_norm": 0.8091816550177268, + "learning_rate": 6.132594945066157e-06, + "loss": 0.4646, + "step": 11810 + }, + { + "epoch": 1.9397080861371707, + "grad_norm": 0.32246244156970166, + "learning_rate": 6.132157916266152e-06, + "loss": 0.4639, + "step": 11811 + }, + { + "epoch": 1.9398723133455134, + "grad_norm": 0.44159454100120704, + "learning_rate": 6.131720869739307e-06, + "loss": 0.4508, + "step": 11812 + }, + { + "epoch": 1.9400365405538562, + "grad_norm": 0.2880429219605012, + "learning_rate": 6.131283805490368e-06, + "loss": 0.4475, + "step": 11813 + }, + { + "epoch": 1.940200767762199, + "grad_norm": 0.2735895117494609, + "learning_rate": 6.130846723524082e-06, + "loss": 0.4595, + "step": 11814 + }, + { + "epoch": 1.9403649949705417, + "grad_norm": 0.3123242230326185, + "learning_rate": 6.130409623845196e-06, + "loss": 0.4765, + "step": 11815 + }, + { + "epoch": 1.9405292221788843, + "grad_norm": 0.29578411751014344, + "learning_rate": 6.129972506458458e-06, + "loss": 0.4576, + "step": 11816 + }, + { + "epoch": 1.9406934493872272, + "grad_norm": 0.4095465840591172, + "learning_rate": 6.129535371368614e-06, + "loss": 0.4277, + "step": 11817 + }, + { + "epoch": 1.94085767659557, + "grad_norm": 0.27967699795557444, + "learning_rate": 6.129098218580414e-06, + "loss": 0.44, + "step": 11818 + }, + { + "epoch": 1.9410219038039127, + "grad_norm": 0.3700003102868501, + "learning_rate": 6.128661048098602e-06, + "loss": 0.4289, + "step": 11819 + }, + { + "epoch": 1.9411861310122553, + "grad_norm": 0.4692973743249203, + "learning_rate": 6.128223859927931e-06, + "loss": 0.4652, + "step": 11820 + }, + { + "epoch": 1.9413503582205982, + "grad_norm": 0.3164140915929167, + "learning_rate": 6.1277866540731465e-06, + "loss": 0.4383, + "step": 11821 + }, + { + "epoch": 1.941514585428941, + "grad_norm": 0.4703336243325058, + "learning_rate": 6.1273494305389956e-06, + "loss": 0.4446, + "step": 11822 + }, + { + "epoch": 1.9416788126372837, + "grad_norm": 0.35750823869899867, + "learning_rate": 6.126912189330231e-06, + "loss": 0.4488, + "step": 11823 + }, + { + "epoch": 1.9418430398456263, + "grad_norm": 0.3118945660735331, + "learning_rate": 6.126474930451599e-06, + "loss": 0.4787, + "step": 11824 + }, + { + "epoch": 1.9420072670539692, + "grad_norm": 0.2687496340376281, + "learning_rate": 6.126037653907848e-06, + "loss": 0.4405, + "step": 11825 + }, + { + "epoch": 1.942171494262312, + "grad_norm": 0.591042238910758, + "learning_rate": 6.125600359703728e-06, + "loss": 0.4355, + "step": 11826 + }, + { + "epoch": 1.9423357214706547, + "grad_norm": 0.32423012638815046, + "learning_rate": 6.125163047843991e-06, + "loss": 0.4647, + "step": 11827 + }, + { + "epoch": 1.9424999486789973, + "grad_norm": 0.448756546678214, + "learning_rate": 6.124725718333383e-06, + "loss": 0.4245, + "step": 11828 + }, + { + "epoch": 1.94266417588734, + "grad_norm": 0.30948247325201406, + "learning_rate": 6.124288371176655e-06, + "loss": 0.4461, + "step": 11829 + }, + { + "epoch": 1.9428284030956828, + "grad_norm": 0.2882913994343183, + "learning_rate": 6.123851006378556e-06, + "loss": 0.4506, + "step": 11830 + }, + { + "epoch": 1.9429926303040257, + "grad_norm": 0.3698476194070314, + "learning_rate": 6.123413623943839e-06, + "loss": 0.4417, + "step": 11831 + }, + { + "epoch": 1.9431568575123683, + "grad_norm": 0.39572904633456146, + "learning_rate": 6.122976223877253e-06, + "loss": 0.4455, + "step": 11832 + }, + { + "epoch": 1.943321084720711, + "grad_norm": 0.31198435687290366, + "learning_rate": 6.122538806183548e-06, + "loss": 0.4525, + "step": 11833 + }, + { + "epoch": 1.9434853119290538, + "grad_norm": 0.3316692403626222, + "learning_rate": 6.122101370867475e-06, + "loss": 0.462, + "step": 11834 + }, + { + "epoch": 1.9436495391373967, + "grad_norm": 0.33372374349452755, + "learning_rate": 6.121663917933784e-06, + "loss": 0.4522, + "step": 11835 + }, + { + "epoch": 1.9438137663457393, + "grad_norm": 0.3229007237330423, + "learning_rate": 6.121226447387229e-06, + "loss": 0.4402, + "step": 11836 + }, + { + "epoch": 1.943977993554082, + "grad_norm": 0.34876032942863944, + "learning_rate": 6.12078895923256e-06, + "loss": 0.4379, + "step": 11837 + }, + { + "epoch": 1.9441422207624248, + "grad_norm": 0.3025199224775647, + "learning_rate": 6.120351453474528e-06, + "loss": 0.4424, + "step": 11838 + }, + { + "epoch": 1.9443064479707677, + "grad_norm": 0.3042076390572115, + "learning_rate": 6.119913930117884e-06, + "loss": 0.422, + "step": 11839 + }, + { + "epoch": 1.9444706751791103, + "grad_norm": 0.3464106773619736, + "learning_rate": 6.119476389167382e-06, + "loss": 0.4286, + "step": 11840 + }, + { + "epoch": 1.944634902387453, + "grad_norm": 0.31756238332376324, + "learning_rate": 6.119038830627772e-06, + "loss": 0.4447, + "step": 11841 + }, + { + "epoch": 1.9447991295957958, + "grad_norm": 0.3361628110899518, + "learning_rate": 6.118601254503809e-06, + "loss": 0.4364, + "step": 11842 + }, + { + "epoch": 1.9449633568041387, + "grad_norm": 0.4516933395464814, + "learning_rate": 6.118163660800243e-06, + "loss": 0.4539, + "step": 11843 + }, + { + "epoch": 1.9451275840124813, + "grad_norm": 0.3930468527746109, + "learning_rate": 6.117726049521826e-06, + "loss": 0.47, + "step": 11844 + }, + { + "epoch": 1.945291811220824, + "grad_norm": 0.35826140019934466, + "learning_rate": 6.117288420673315e-06, + "loss": 0.4543, + "step": 11845 + }, + { + "epoch": 1.9454560384291666, + "grad_norm": 0.32262749277511915, + "learning_rate": 6.116850774259458e-06, + "loss": 0.4575, + "step": 11846 + }, + { + "epoch": 1.9456202656375094, + "grad_norm": 0.42017179470753757, + "learning_rate": 6.116413110285014e-06, + "loss": 0.4259, + "step": 11847 + }, + { + "epoch": 1.9457844928458523, + "grad_norm": 0.3091060953354457, + "learning_rate": 6.115975428754731e-06, + "loss": 0.4354, + "step": 11848 + }, + { + "epoch": 1.945948720054195, + "grad_norm": 0.449447054482981, + "learning_rate": 6.115537729673366e-06, + "loss": 0.4313, + "step": 11849 + }, + { + "epoch": 1.9461129472625376, + "grad_norm": 0.3242192526893855, + "learning_rate": 6.115100013045671e-06, + "loss": 0.4504, + "step": 11850 + }, + { + "epoch": 1.9462771744708804, + "grad_norm": 0.33839339236288085, + "learning_rate": 6.1146622788764e-06, + "loss": 0.4586, + "step": 11851 + }, + { + "epoch": 1.9464414016792233, + "grad_norm": 0.41077135290200856, + "learning_rate": 6.11422452717031e-06, + "loss": 0.45, + "step": 11852 + }, + { + "epoch": 1.946605628887566, + "grad_norm": 0.46154500843898666, + "learning_rate": 6.113786757932154e-06, + "loss": 0.4687, + "step": 11853 + }, + { + "epoch": 1.9467698560959086, + "grad_norm": 0.38772725882298875, + "learning_rate": 6.113348971166684e-06, + "loss": 0.4565, + "step": 11854 + }, + { + "epoch": 1.9469340833042514, + "grad_norm": 0.34951381136472154, + "learning_rate": 6.1129111668786565e-06, + "loss": 0.433, + "step": 11855 + }, + { + "epoch": 1.9470983105125943, + "grad_norm": 0.3058343506675485, + "learning_rate": 6.112473345072829e-06, + "loss": 0.4443, + "step": 11856 + }, + { + "epoch": 1.947262537720937, + "grad_norm": 0.458504542424256, + "learning_rate": 6.112035505753952e-06, + "loss": 0.4228, + "step": 11857 + }, + { + "epoch": 1.9474267649292796, + "grad_norm": 0.3648480272272111, + "learning_rate": 6.111597648926786e-06, + "loss": 0.4657, + "step": 11858 + }, + { + "epoch": 1.9475909921376224, + "grad_norm": 0.3315564115633353, + "learning_rate": 6.1111597745960825e-06, + "loss": 0.4344, + "step": 11859 + }, + { + "epoch": 1.9477552193459653, + "grad_norm": 0.3203259202134563, + "learning_rate": 6.110721882766598e-06, + "loss": 0.4507, + "step": 11860 + }, + { + "epoch": 1.947919446554308, + "grad_norm": 0.32526518490626355, + "learning_rate": 6.1102839734430905e-06, + "loss": 0.4483, + "step": 11861 + }, + { + "epoch": 1.9480836737626506, + "grad_norm": 0.30568500186568304, + "learning_rate": 6.109846046630315e-06, + "loss": 0.4251, + "step": 11862 + }, + { + "epoch": 1.9482479009709932, + "grad_norm": 0.34335284427598955, + "learning_rate": 6.1094081023330265e-06, + "loss": 0.442, + "step": 11863 + }, + { + "epoch": 1.948412128179336, + "grad_norm": 0.3000840673867567, + "learning_rate": 6.108970140555982e-06, + "loss": 0.4462, + "step": 11864 + }, + { + "epoch": 1.948576355387679, + "grad_norm": 0.3536142230185942, + "learning_rate": 6.10853216130394e-06, + "loss": 0.4583, + "step": 11865 + }, + { + "epoch": 1.9487405825960216, + "grad_norm": 0.31933514094763754, + "learning_rate": 6.108094164581656e-06, + "loss": 0.4355, + "step": 11866 + }, + { + "epoch": 1.9489048098043642, + "grad_norm": 0.35391874407088647, + "learning_rate": 6.107656150393888e-06, + "loss": 0.4603, + "step": 11867 + }, + { + "epoch": 1.949069037012707, + "grad_norm": 0.37510286288830225, + "learning_rate": 6.10721811874539e-06, + "loss": 0.4357, + "step": 11868 + }, + { + "epoch": 1.94923326422105, + "grad_norm": 0.3732690817187727, + "learning_rate": 6.106780069640924e-06, + "loss": 0.4551, + "step": 11869 + }, + { + "epoch": 1.9493974914293926, + "grad_norm": 0.3714339876825457, + "learning_rate": 6.106342003085246e-06, + "loss": 0.4611, + "step": 11870 + }, + { + "epoch": 1.9495617186377352, + "grad_norm": 0.2837021911002435, + "learning_rate": 6.1059039190831115e-06, + "loss": 0.4401, + "step": 11871 + }, + { + "epoch": 1.949725945846078, + "grad_norm": 0.4639579970164344, + "learning_rate": 6.105465817639281e-06, + "loss": 0.4399, + "step": 11872 + }, + { + "epoch": 1.949890173054421, + "grad_norm": 0.3320134698090167, + "learning_rate": 6.105027698758512e-06, + "loss": 0.4232, + "step": 11873 + }, + { + "epoch": 1.9500544002627636, + "grad_norm": 0.28217258687959057, + "learning_rate": 6.104589562445565e-06, + "loss": 0.4613, + "step": 11874 + }, + { + "epoch": 1.9502186274711062, + "grad_norm": 0.3642163892311067, + "learning_rate": 6.104151408705195e-06, + "loss": 0.4611, + "step": 11875 + }, + { + "epoch": 1.950382854679449, + "grad_norm": 0.7508595149691321, + "learning_rate": 6.103713237542163e-06, + "loss": 0.4404, + "step": 11876 + }, + { + "epoch": 1.950547081887792, + "grad_norm": 0.33888086777812015, + "learning_rate": 6.103275048961227e-06, + "loss": 0.4326, + "step": 11877 + }, + { + "epoch": 1.9507113090961345, + "grad_norm": 0.29969210211511377, + "learning_rate": 6.102836842967146e-06, + "loss": 0.4595, + "step": 11878 + }, + { + "epoch": 1.9508755363044772, + "grad_norm": 0.3898798633549321, + "learning_rate": 6.102398619564681e-06, + "loss": 0.4445, + "step": 11879 + }, + { + "epoch": 1.9510397635128198, + "grad_norm": 0.33948844287237157, + "learning_rate": 6.101960378758589e-06, + "loss": 0.4441, + "step": 11880 + }, + { + "epoch": 1.9512039907211627, + "grad_norm": 0.3009932679535096, + "learning_rate": 6.101522120553633e-06, + "loss": 0.4689, + "step": 11881 + }, + { + "epoch": 1.9513682179295055, + "grad_norm": 0.40499756454781155, + "learning_rate": 6.10108384495457e-06, + "loss": 0.443, + "step": 11882 + }, + { + "epoch": 1.9515324451378482, + "grad_norm": 0.4204322798597718, + "learning_rate": 6.100645551966162e-06, + "loss": 0.4286, + "step": 11883 + }, + { + "epoch": 1.9516966723461908, + "grad_norm": 0.31539927216386543, + "learning_rate": 6.100207241593167e-06, + "loss": 0.4402, + "step": 11884 + }, + { + "epoch": 1.9518608995545337, + "grad_norm": 0.295725731524103, + "learning_rate": 6.099768913840348e-06, + "loss": 0.4323, + "step": 11885 + }, + { + "epoch": 1.9520251267628765, + "grad_norm": 0.5328706680390949, + "learning_rate": 6.099330568712465e-06, + "loss": 0.4273, + "step": 11886 + }, + { + "epoch": 1.9521893539712192, + "grad_norm": 0.42119041668735635, + "learning_rate": 6.098892206214278e-06, + "loss": 0.447, + "step": 11887 + }, + { + "epoch": 1.9523535811795618, + "grad_norm": 0.3552233220762019, + "learning_rate": 6.098453826350549e-06, + "loss": 0.4624, + "step": 11888 + }, + { + "epoch": 1.9525178083879047, + "grad_norm": 0.35988767125876403, + "learning_rate": 6.0980154291260375e-06, + "loss": 0.4427, + "step": 11889 + }, + { + "epoch": 1.9526820355962475, + "grad_norm": 0.3725865342348446, + "learning_rate": 6.097577014545507e-06, + "loss": 0.4391, + "step": 11890 + }, + { + "epoch": 1.9528462628045902, + "grad_norm": 0.31357186920814756, + "learning_rate": 6.0971385826137194e-06, + "loss": 0.4449, + "step": 11891 + }, + { + "epoch": 1.9530104900129328, + "grad_norm": 0.38542678167037026, + "learning_rate": 6.0967001333354335e-06, + "loss": 0.4367, + "step": 11892 + }, + { + "epoch": 1.9531747172212757, + "grad_norm": 0.37312227942542797, + "learning_rate": 6.096261666715413e-06, + "loss": 0.4501, + "step": 11893 + }, + { + "epoch": 1.9533389444296185, + "grad_norm": 0.34996403105268287, + "learning_rate": 6.095823182758422e-06, + "loss": 0.4327, + "step": 11894 + }, + { + "epoch": 1.9535031716379612, + "grad_norm": 0.4262377392819523, + "learning_rate": 6.0953846814692214e-06, + "loss": 0.4317, + "step": 11895 + }, + { + "epoch": 1.9536673988463038, + "grad_norm": 0.3406779984733831, + "learning_rate": 6.094946162852573e-06, + "loss": 0.445, + "step": 11896 + }, + { + "epoch": 1.9538316260546464, + "grad_norm": 0.32443708298506085, + "learning_rate": 6.09450762691324e-06, + "loss": 0.4404, + "step": 11897 + }, + { + "epoch": 1.9539958532629893, + "grad_norm": 0.3150022109795289, + "learning_rate": 6.094069073655984e-06, + "loss": 0.4403, + "step": 11898 + }, + { + "epoch": 1.9541600804713322, + "grad_norm": 0.3574477988232649, + "learning_rate": 6.093630503085571e-06, + "loss": 0.4603, + "step": 11899 + }, + { + "epoch": 1.9543243076796748, + "grad_norm": 0.2974068481729623, + "learning_rate": 6.093191915206762e-06, + "loss": 0.4318, + "step": 11900 + }, + { + "epoch": 1.9544885348880174, + "grad_norm": 0.3768435152698072, + "learning_rate": 6.092753310024322e-06, + "loss": 0.4547, + "step": 11901 + }, + { + "epoch": 1.9546527620963603, + "grad_norm": 0.38455005690972244, + "learning_rate": 6.092314687543014e-06, + "loss": 0.4469, + "step": 11902 + }, + { + "epoch": 1.9548169893047032, + "grad_norm": 0.31822901379518637, + "learning_rate": 6.091876047767601e-06, + "loss": 0.4425, + "step": 11903 + }, + { + "epoch": 1.9549812165130458, + "grad_norm": 0.28402660304659677, + "learning_rate": 6.091437390702849e-06, + "loss": 0.4461, + "step": 11904 + }, + { + "epoch": 1.9551454437213884, + "grad_norm": 0.3521752474751283, + "learning_rate": 6.090998716353522e-06, + "loss": 0.4318, + "step": 11905 + }, + { + "epoch": 1.9553096709297313, + "grad_norm": 0.29060245833153847, + "learning_rate": 6.090560024724381e-06, + "loss": 0.4418, + "step": 11906 + }, + { + "epoch": 1.9554738981380742, + "grad_norm": 0.3912999714229161, + "learning_rate": 6.0901213158201946e-06, + "loss": 0.4295, + "step": 11907 + }, + { + "epoch": 1.9556381253464168, + "grad_norm": 0.3728285545096288, + "learning_rate": 6.089682589645727e-06, + "loss": 0.4326, + "step": 11908 + }, + { + "epoch": 1.9558023525547594, + "grad_norm": 0.3202203110579774, + "learning_rate": 6.08924384620574e-06, + "loss": 0.4463, + "step": 11909 + }, + { + "epoch": 1.9559665797631023, + "grad_norm": 0.536561797356937, + "learning_rate": 6.088805085505004e-06, + "loss": 0.4376, + "step": 11910 + }, + { + "epoch": 1.9561308069714451, + "grad_norm": 0.4067293800969105, + "learning_rate": 6.08836630754828e-06, + "loss": 0.4476, + "step": 11911 + }, + { + "epoch": 1.9562950341797878, + "grad_norm": 0.3378217956415752, + "learning_rate": 6.087927512340336e-06, + "loss": 0.4475, + "step": 11912 + }, + { + "epoch": 1.9564592613881304, + "grad_norm": 0.29079037548101955, + "learning_rate": 6.087488699885936e-06, + "loss": 0.4429, + "step": 11913 + }, + { + "epoch": 1.956623488596473, + "grad_norm": 0.37519263681592546, + "learning_rate": 6.0870498701898465e-06, + "loss": 0.4377, + "step": 11914 + }, + { + "epoch": 1.956787715804816, + "grad_norm": 0.35455480132931083, + "learning_rate": 6.086611023256836e-06, + "loss": 0.4667, + "step": 11915 + }, + { + "epoch": 1.9569519430131588, + "grad_norm": 0.4006295729770789, + "learning_rate": 6.086172159091667e-06, + "loss": 0.4507, + "step": 11916 + }, + { + "epoch": 1.9571161702215014, + "grad_norm": 0.5609335201389217, + "learning_rate": 6.085733277699109e-06, + "loss": 0.4528, + "step": 11917 + }, + { + "epoch": 1.957280397429844, + "grad_norm": 0.3352582817058797, + "learning_rate": 6.085294379083927e-06, + "loss": 0.4575, + "step": 11918 + }, + { + "epoch": 1.957444624638187, + "grad_norm": 0.34092400470920575, + "learning_rate": 6.084855463250887e-06, + "loss": 0.4442, + "step": 11919 + }, + { + "epoch": 1.9576088518465298, + "grad_norm": 0.4316159655034461, + "learning_rate": 6.08441653020476e-06, + "loss": 0.448, + "step": 11920 + }, + { + "epoch": 1.9577730790548724, + "grad_norm": 0.28558420396013084, + "learning_rate": 6.083977579950309e-06, + "loss": 0.4419, + "step": 11921 + }, + { + "epoch": 1.957937306263215, + "grad_norm": 0.40953524784616846, + "learning_rate": 6.083538612492302e-06, + "loss": 0.4469, + "step": 11922 + }, + { + "epoch": 1.958101533471558, + "grad_norm": 0.29969065319404764, + "learning_rate": 6.083099627835508e-06, + "loss": 0.4432, + "step": 11923 + }, + { + "epoch": 1.9582657606799008, + "grad_norm": 0.46090963146047326, + "learning_rate": 6.082660625984697e-06, + "loss": 0.4466, + "step": 11924 + }, + { + "epoch": 1.9584299878882434, + "grad_norm": 0.29142333619321953, + "learning_rate": 6.082221606944633e-06, + "loss": 0.4449, + "step": 11925 + }, + { + "epoch": 1.958594215096586, + "grad_norm": 0.38445693912408685, + "learning_rate": 6.081782570720085e-06, + "loss": 0.4441, + "step": 11926 + }, + { + "epoch": 1.958758442304929, + "grad_norm": 0.2926899702163581, + "learning_rate": 6.081343517315823e-06, + "loss": 0.4741, + "step": 11927 + }, + { + "epoch": 1.9589226695132718, + "grad_norm": 0.3185654549866373, + "learning_rate": 6.080904446736613e-06, + "loss": 0.4683, + "step": 11928 + }, + { + "epoch": 1.9590868967216144, + "grad_norm": 0.3257148447442194, + "learning_rate": 6.080465358987227e-06, + "loss": 0.4369, + "step": 11929 + }, + { + "epoch": 1.959251123929957, + "grad_norm": 0.34144365741472404, + "learning_rate": 6.0800262540724314e-06, + "loss": 0.4504, + "step": 11930 + }, + { + "epoch": 1.9594153511382997, + "grad_norm": 0.40757690940344427, + "learning_rate": 6.079587131996997e-06, + "loss": 0.4606, + "step": 11931 + }, + { + "epoch": 1.9595795783466425, + "grad_norm": 0.30314743296026664, + "learning_rate": 6.079147992765691e-06, + "loss": 0.4745, + "step": 11932 + }, + { + "epoch": 1.9597438055549854, + "grad_norm": 0.3476291582985087, + "learning_rate": 6.078708836383285e-06, + "loss": 0.4584, + "step": 11933 + }, + { + "epoch": 1.959908032763328, + "grad_norm": 0.38181630953661133, + "learning_rate": 6.078269662854546e-06, + "loss": 0.4507, + "step": 11934 + }, + { + "epoch": 1.9600722599716707, + "grad_norm": 0.3039837980681577, + "learning_rate": 6.077830472184249e-06, + "loss": 0.4496, + "step": 11935 + }, + { + "epoch": 1.9602364871800135, + "grad_norm": 0.3243794082669827, + "learning_rate": 6.0773912643771585e-06, + "loss": 0.4291, + "step": 11936 + }, + { + "epoch": 1.9604007143883564, + "grad_norm": 0.4193973744888972, + "learning_rate": 6.076952039438048e-06, + "loss": 0.4761, + "step": 11937 + }, + { + "epoch": 1.960564941596699, + "grad_norm": 0.2894100680354276, + "learning_rate": 6.076512797371685e-06, + "loss": 0.4309, + "step": 11938 + }, + { + "epoch": 1.9607291688050417, + "grad_norm": 0.47924954345632687, + "learning_rate": 6.0760735381828444e-06, + "loss": 0.439, + "step": 11939 + }, + { + "epoch": 1.9608933960133845, + "grad_norm": 0.3215352880588443, + "learning_rate": 6.075634261876292e-06, + "loss": 0.4399, + "step": 11940 + }, + { + "epoch": 1.9610576232217274, + "grad_norm": 0.3105394115840711, + "learning_rate": 6.0751949684568034e-06, + "loss": 0.4423, + "step": 11941 + }, + { + "epoch": 1.96122185043007, + "grad_norm": 0.3256268730700029, + "learning_rate": 6.074755657929146e-06, + "loss": 0.4395, + "step": 11942 + }, + { + "epoch": 1.9613860776384127, + "grad_norm": 0.4212632534343356, + "learning_rate": 6.074316330298094e-06, + "loss": 0.4591, + "step": 11943 + }, + { + "epoch": 1.9615503048467555, + "grad_norm": 0.33236517435930046, + "learning_rate": 6.073876985568417e-06, + "loss": 0.4351, + "step": 11944 + }, + { + "epoch": 1.9617145320550984, + "grad_norm": 0.3231407977837, + "learning_rate": 6.073437623744888e-06, + "loss": 0.4242, + "step": 11945 + }, + { + "epoch": 1.961878759263441, + "grad_norm": 0.30963714146020055, + "learning_rate": 6.072998244832279e-06, + "loss": 0.4376, + "step": 11946 + }, + { + "epoch": 1.9620429864717837, + "grad_norm": 0.29128255878835513, + "learning_rate": 6.072558848835359e-06, + "loss": 0.4531, + "step": 11947 + }, + { + "epoch": 1.9622072136801263, + "grad_norm": 0.2906945038194113, + "learning_rate": 6.0721194357589036e-06, + "loss": 0.4589, + "step": 11948 + }, + { + "epoch": 1.9623714408884692, + "grad_norm": 0.28236743251721147, + "learning_rate": 6.071680005607686e-06, + "loss": 0.4651, + "step": 11949 + }, + { + "epoch": 1.962535668096812, + "grad_norm": 0.3685667178520881, + "learning_rate": 6.071240558386477e-06, + "loss": 0.4517, + "step": 11950 + }, + { + "epoch": 1.9626998953051547, + "grad_norm": 0.2718554589683119, + "learning_rate": 6.0708010941000485e-06, + "loss": 0.4562, + "step": 11951 + }, + { + "epoch": 1.9628641225134973, + "grad_norm": 0.3077384180029517, + "learning_rate": 6.070361612753175e-06, + "loss": 0.4375, + "step": 11952 + }, + { + "epoch": 1.9630283497218401, + "grad_norm": 0.32225790357731776, + "learning_rate": 6.069922114350629e-06, + "loss": 0.4628, + "step": 11953 + }, + { + "epoch": 1.963192576930183, + "grad_norm": 0.4871176270330463, + "learning_rate": 6.069482598897186e-06, + "loss": 0.4521, + "step": 11954 + }, + { + "epoch": 1.9633568041385256, + "grad_norm": 0.4126252515018033, + "learning_rate": 6.069043066397615e-06, + "loss": 0.4402, + "step": 11955 + }, + { + "epoch": 1.9635210313468683, + "grad_norm": 0.3037151550263849, + "learning_rate": 6.0686035168566945e-06, + "loss": 0.4552, + "step": 11956 + }, + { + "epoch": 1.9636852585552111, + "grad_norm": 0.3326743916440129, + "learning_rate": 6.068163950279195e-06, + "loss": 0.437, + "step": 11957 + }, + { + "epoch": 1.963849485763554, + "grad_norm": 0.3131914650921411, + "learning_rate": 6.067724366669895e-06, + "loss": 0.4454, + "step": 11958 + }, + { + "epoch": 1.9640137129718966, + "grad_norm": 0.33370325669191686, + "learning_rate": 6.067284766033564e-06, + "loss": 0.4556, + "step": 11959 + }, + { + "epoch": 1.9641779401802393, + "grad_norm": 0.2634975212310298, + "learning_rate": 6.066845148374978e-06, + "loss": 0.4439, + "step": 11960 + }, + { + "epoch": 1.9643421673885821, + "grad_norm": 0.297116985250537, + "learning_rate": 6.066405513698912e-06, + "loss": 0.4372, + "step": 11961 + }, + { + "epoch": 1.964506394596925, + "grad_norm": 0.3084785855406754, + "learning_rate": 6.0659658620101424e-06, + "loss": 0.4395, + "step": 11962 + }, + { + "epoch": 1.9646706218052676, + "grad_norm": 0.30382296858647195, + "learning_rate": 6.065526193313442e-06, + "loss": 0.4436, + "step": 11963 + }, + { + "epoch": 1.9648348490136103, + "grad_norm": 0.47659789553897186, + "learning_rate": 6.065086507613587e-06, + "loss": 0.4531, + "step": 11964 + }, + { + "epoch": 1.964999076221953, + "grad_norm": 0.32839380600621654, + "learning_rate": 6.064646804915353e-06, + "loss": 0.4427, + "step": 11965 + }, + { + "epoch": 1.9651633034302958, + "grad_norm": 0.28526974160350493, + "learning_rate": 6.0642070852235156e-06, + "loss": 0.4546, + "step": 11966 + }, + { + "epoch": 1.9653275306386386, + "grad_norm": 0.2921253976160405, + "learning_rate": 6.063767348542849e-06, + "loss": 0.4477, + "step": 11967 + }, + { + "epoch": 1.9654917578469813, + "grad_norm": 0.32245870607633376, + "learning_rate": 6.06332759487813e-06, + "loss": 0.4463, + "step": 11968 + }, + { + "epoch": 1.965655985055324, + "grad_norm": 0.265974759218518, + "learning_rate": 6.062887824234138e-06, + "loss": 0.4475, + "step": 11969 + }, + { + "epoch": 1.9658202122636668, + "grad_norm": 0.3420749303053669, + "learning_rate": 6.0624480366156455e-06, + "loss": 0.4371, + "step": 11970 + }, + { + "epoch": 1.9659844394720096, + "grad_norm": 0.5955902754876532, + "learning_rate": 6.062008232027429e-06, + "loss": 0.4318, + "step": 11971 + }, + { + "epoch": 1.9661486666803523, + "grad_norm": 0.2654119467227501, + "learning_rate": 6.061568410474266e-06, + "loss": 0.4618, + "step": 11972 + }, + { + "epoch": 1.966312893888695, + "grad_norm": 0.3256763544307515, + "learning_rate": 6.061128571960935e-06, + "loss": 0.4478, + "step": 11973 + }, + { + "epoch": 1.9664771210970378, + "grad_norm": 0.38804030892894026, + "learning_rate": 6.06068871649221e-06, + "loss": 0.4599, + "step": 11974 + }, + { + "epoch": 1.9666413483053806, + "grad_norm": 0.3603884815671852, + "learning_rate": 6.060248844072872e-06, + "loss": 0.4407, + "step": 11975 + }, + { + "epoch": 1.9668055755137233, + "grad_norm": 0.29521900289850656, + "learning_rate": 6.059808954707696e-06, + "loss": 0.4553, + "step": 11976 + }, + { + "epoch": 1.966969802722066, + "grad_norm": 0.31907625861012234, + "learning_rate": 6.059369048401459e-06, + "loss": 0.4485, + "step": 11977 + }, + { + "epoch": 1.9671340299304088, + "grad_norm": 0.2827112303883396, + "learning_rate": 6.058929125158942e-06, + "loss": 0.4542, + "step": 11978 + }, + { + "epoch": 1.9672982571387516, + "grad_norm": 0.4095160662478101, + "learning_rate": 6.05848918498492e-06, + "loss": 0.4447, + "step": 11979 + }, + { + "epoch": 1.9674624843470943, + "grad_norm": 0.30726510403041185, + "learning_rate": 6.058049227884171e-06, + "loss": 0.4373, + "step": 11980 + }, + { + "epoch": 1.967626711555437, + "grad_norm": 0.3706977126954439, + "learning_rate": 6.057609253861475e-06, + "loss": 0.4433, + "step": 11981 + }, + { + "epoch": 1.9677909387637795, + "grad_norm": 0.27819231084749946, + "learning_rate": 6.057169262921609e-06, + "loss": 0.4415, + "step": 11982 + }, + { + "epoch": 1.9679551659721224, + "grad_norm": 0.2774976321074535, + "learning_rate": 6.056729255069356e-06, + "loss": 0.4415, + "step": 11983 + }, + { + "epoch": 1.9681193931804652, + "grad_norm": 0.3394973626327754, + "learning_rate": 6.05628923030949e-06, + "loss": 0.4256, + "step": 11984 + }, + { + "epoch": 1.9682836203888079, + "grad_norm": 0.270213655988594, + "learning_rate": 6.055849188646791e-06, + "loss": 0.4301, + "step": 11985 + }, + { + "epoch": 1.9684478475971505, + "grad_norm": 0.44524049804233634, + "learning_rate": 6.055409130086039e-06, + "loss": 0.4662, + "step": 11986 + }, + { + "epoch": 1.9686120748054934, + "grad_norm": 0.32487464715932335, + "learning_rate": 6.054969054632015e-06, + "loss": 0.4474, + "step": 11987 + }, + { + "epoch": 1.9687763020138362, + "grad_norm": 0.36859889874615925, + "learning_rate": 6.0545289622894956e-06, + "loss": 0.4406, + "step": 11988 + }, + { + "epoch": 1.9689405292221789, + "grad_norm": 0.29552264416651164, + "learning_rate": 6.054088853063263e-06, + "loss": 0.4396, + "step": 11989 + }, + { + "epoch": 1.9691047564305215, + "grad_norm": 0.2740422492527266, + "learning_rate": 6.053648726958096e-06, + "loss": 0.4517, + "step": 11990 + }, + { + "epoch": 1.9692689836388644, + "grad_norm": 0.32702865970003103, + "learning_rate": 6.053208583978776e-06, + "loss": 0.4542, + "step": 11991 + }, + { + "epoch": 1.9694332108472072, + "grad_norm": 0.30746129718076604, + "learning_rate": 6.052768424130081e-06, + "loss": 0.436, + "step": 11992 + }, + { + "epoch": 1.9695974380555499, + "grad_norm": 0.3372313936554249, + "learning_rate": 6.052328247416795e-06, + "loss": 0.4574, + "step": 11993 + }, + { + "epoch": 1.9697616652638925, + "grad_norm": 0.31184738193789135, + "learning_rate": 6.051888053843697e-06, + "loss": 0.4519, + "step": 11994 + }, + { + "epoch": 1.9699258924722354, + "grad_norm": 0.2956821322627038, + "learning_rate": 6.051447843415567e-06, + "loss": 0.4409, + "step": 11995 + }, + { + "epoch": 1.9700901196805782, + "grad_norm": 0.40843040125952373, + "learning_rate": 6.051007616137187e-06, + "loss": 0.4558, + "step": 11996 + }, + { + "epoch": 1.9702543468889209, + "grad_norm": 0.32893246578214913, + "learning_rate": 6.050567372013338e-06, + "loss": 0.4265, + "step": 11997 + }, + { + "epoch": 1.9704185740972635, + "grad_norm": 0.26744505739084257, + "learning_rate": 6.050127111048803e-06, + "loss": 0.4546, + "step": 11998 + }, + { + "epoch": 1.9705828013056061, + "grad_norm": 0.2781686616464339, + "learning_rate": 6.049686833248362e-06, + "loss": 0.4448, + "step": 11999 + }, + { + "epoch": 1.970747028513949, + "grad_norm": 0.27749506883908925, + "learning_rate": 6.049246538616796e-06, + "loss": 0.4388, + "step": 12000 + }, + { + "epoch": 1.9709112557222919, + "grad_norm": 0.39291413716423657, + "learning_rate": 6.048806227158889e-06, + "loss": 0.4289, + "step": 12001 + }, + { + "epoch": 1.9710754829306345, + "grad_norm": 0.2988992716204715, + "learning_rate": 6.048365898879423e-06, + "loss": 0.4484, + "step": 12002 + }, + { + "epoch": 1.9712397101389771, + "grad_norm": 0.30423634070698163, + "learning_rate": 6.04792555378318e-06, + "loss": 0.4284, + "step": 12003 + }, + { + "epoch": 1.97140393734732, + "grad_norm": 0.3669559441068953, + "learning_rate": 6.047485191874944e-06, + "loss": 0.4717, + "step": 12004 + }, + { + "epoch": 1.9715681645556629, + "grad_norm": 0.33458514236368003, + "learning_rate": 6.047044813159494e-06, + "loss": 0.4448, + "step": 12005 + }, + { + "epoch": 1.9717323917640055, + "grad_norm": 0.29073195184717887, + "learning_rate": 6.046604417641616e-06, + "loss": 0.4541, + "step": 12006 + }, + { + "epoch": 1.9718966189723481, + "grad_norm": 0.3558324342641227, + "learning_rate": 6.046164005326092e-06, + "loss": 0.44, + "step": 12007 + }, + { + "epoch": 1.972060846180691, + "grad_norm": 0.28499414277142565, + "learning_rate": 6.0457235762177065e-06, + "loss": 0.4573, + "step": 12008 + }, + { + "epoch": 1.9722250733890339, + "grad_norm": 0.32143312180520023, + "learning_rate": 6.045283130321242e-06, + "loss": 0.4505, + "step": 12009 + }, + { + "epoch": 1.9723893005973765, + "grad_norm": 0.3139482439274381, + "learning_rate": 6.044842667641482e-06, + "loss": 0.4546, + "step": 12010 + }, + { + "epoch": 1.9725535278057191, + "grad_norm": 0.36164038349410554, + "learning_rate": 6.04440218818321e-06, + "loss": 0.4533, + "step": 12011 + }, + { + "epoch": 1.972717755014062, + "grad_norm": 0.34017728293412075, + "learning_rate": 6.043961691951212e-06, + "loss": 0.4747, + "step": 12012 + }, + { + "epoch": 1.9728819822224049, + "grad_norm": 0.28030323013934577, + "learning_rate": 6.04352117895027e-06, + "loss": 0.4353, + "step": 12013 + }, + { + "epoch": 1.9730462094307475, + "grad_norm": 0.36739679215647425, + "learning_rate": 6.043080649185171e-06, + "loss": 0.4448, + "step": 12014 + }, + { + "epoch": 1.9732104366390901, + "grad_norm": 0.3699289824111152, + "learning_rate": 6.042640102660695e-06, + "loss": 0.442, + "step": 12015 + }, + { + "epoch": 1.9733746638474328, + "grad_norm": 0.29453155207049314, + "learning_rate": 6.042199539381633e-06, + "loss": 0.4512, + "step": 12016 + }, + { + "epoch": 1.9735388910557756, + "grad_norm": 0.3369125096658498, + "learning_rate": 6.041758959352764e-06, + "loss": 0.4753, + "step": 12017 + }, + { + "epoch": 1.9737031182641185, + "grad_norm": 0.359560199304719, + "learning_rate": 6.041318362578878e-06, + "loss": 0.4441, + "step": 12018 + }, + { + "epoch": 1.9738673454724611, + "grad_norm": 0.4074514051910816, + "learning_rate": 6.040877749064757e-06, + "loss": 0.4534, + "step": 12019 + }, + { + "epoch": 1.9740315726808038, + "grad_norm": 0.3280832276122813, + "learning_rate": 6.040437118815187e-06, + "loss": 0.4385, + "step": 12020 + }, + { + "epoch": 1.9741957998891466, + "grad_norm": 0.29592394541807415, + "learning_rate": 6.039996471834956e-06, + "loss": 0.4272, + "step": 12021 + }, + { + "epoch": 1.9743600270974895, + "grad_norm": 0.4068099660831585, + "learning_rate": 6.039555808128848e-06, + "loss": 0.4497, + "step": 12022 + }, + { + "epoch": 1.9745242543058321, + "grad_norm": 0.40547477133364795, + "learning_rate": 6.039115127701649e-06, + "loss": 0.4596, + "step": 12023 + }, + { + "epoch": 1.9746884815141748, + "grad_norm": 0.33863995062257035, + "learning_rate": 6.038674430558144e-06, + "loss": 0.4527, + "step": 12024 + }, + { + "epoch": 1.9748527087225176, + "grad_norm": 0.2859546787151384, + "learning_rate": 6.038233716703122e-06, + "loss": 0.4227, + "step": 12025 + }, + { + "epoch": 1.9750169359308605, + "grad_norm": 0.43978961671594363, + "learning_rate": 6.037792986141368e-06, + "loss": 0.4293, + "step": 12026 + }, + { + "epoch": 1.975181163139203, + "grad_norm": 0.32042395347082325, + "learning_rate": 6.037352238877669e-06, + "loss": 0.4436, + "step": 12027 + }, + { + "epoch": 1.9753453903475457, + "grad_norm": 0.45653170556439265, + "learning_rate": 6.036911474916813e-06, + "loss": 0.4355, + "step": 12028 + }, + { + "epoch": 1.9755096175558886, + "grad_norm": 0.31606632423937275, + "learning_rate": 6.036470694263585e-06, + "loss": 0.4571, + "step": 12029 + }, + { + "epoch": 1.9756738447642315, + "grad_norm": 0.36776567749481, + "learning_rate": 6.036029896922774e-06, + "loss": 0.4325, + "step": 12030 + }, + { + "epoch": 1.975838071972574, + "grad_norm": 0.2897838666344959, + "learning_rate": 6.035589082899168e-06, + "loss": 0.4512, + "step": 12031 + }, + { + "epoch": 1.9760022991809167, + "grad_norm": 0.34939504492231116, + "learning_rate": 6.035148252197554e-06, + "loss": 0.4403, + "step": 12032 + }, + { + "epoch": 1.9761665263892594, + "grad_norm": 0.37178263437531295, + "learning_rate": 6.034707404822718e-06, + "loss": 0.4355, + "step": 12033 + }, + { + "epoch": 1.9763307535976022, + "grad_norm": 0.3404844380988809, + "learning_rate": 6.034266540779451e-06, + "loss": 0.465, + "step": 12034 + }, + { + "epoch": 1.976494980805945, + "grad_norm": 0.35425656017271817, + "learning_rate": 6.033825660072538e-06, + "loss": 0.4466, + "step": 12035 + }, + { + "epoch": 1.9766592080142877, + "grad_norm": 0.3120757128379207, + "learning_rate": 6.033384762706772e-06, + "loss": 0.422, + "step": 12036 + }, + { + "epoch": 1.9768234352226304, + "grad_norm": 0.35814186378850754, + "learning_rate": 6.032943848686938e-06, + "loss": 0.4438, + "step": 12037 + }, + { + "epoch": 1.9769876624309732, + "grad_norm": 0.31766052787014015, + "learning_rate": 6.032502918017823e-06, + "loss": 0.4316, + "step": 12038 + }, + { + "epoch": 1.977151889639316, + "grad_norm": 0.3316287841188254, + "learning_rate": 6.032061970704221e-06, + "loss": 0.4413, + "step": 12039 + }, + { + "epoch": 1.9773161168476587, + "grad_norm": 0.29366943908554144, + "learning_rate": 6.031621006750918e-06, + "loss": 0.4524, + "step": 12040 + }, + { + "epoch": 1.9774803440560014, + "grad_norm": 0.3365778223058008, + "learning_rate": 6.031180026162704e-06, + "loss": 0.4469, + "step": 12041 + }, + { + "epoch": 1.9776445712643442, + "grad_norm": 0.273328767172499, + "learning_rate": 6.030739028944369e-06, + "loss": 0.4376, + "step": 12042 + }, + { + "epoch": 1.977808798472687, + "grad_norm": 0.34032479412588085, + "learning_rate": 6.030298015100702e-06, + "loss": 0.4383, + "step": 12043 + }, + { + "epoch": 1.9779730256810297, + "grad_norm": 0.29204890209099327, + "learning_rate": 6.029856984636491e-06, + "loss": 0.4493, + "step": 12044 + }, + { + "epoch": 1.9781372528893724, + "grad_norm": 0.31070441585017955, + "learning_rate": 6.02941593755653e-06, + "loss": 0.4339, + "step": 12045 + }, + { + "epoch": 1.9783014800977152, + "grad_norm": 0.34355821495186206, + "learning_rate": 6.028974873865607e-06, + "loss": 0.4497, + "step": 12046 + }, + { + "epoch": 1.978465707306058, + "grad_norm": 0.37841270289295065, + "learning_rate": 6.0285337935685125e-06, + "loss": 0.4667, + "step": 12047 + }, + { + "epoch": 1.9786299345144007, + "grad_norm": 0.2880315830387975, + "learning_rate": 6.028092696670037e-06, + "loss": 0.446, + "step": 12048 + }, + { + "epoch": 1.9787941617227434, + "grad_norm": 0.2860235218704819, + "learning_rate": 6.02765158317497e-06, + "loss": 0.4548, + "step": 12049 + }, + { + "epoch": 1.978958388931086, + "grad_norm": 0.6820882451984894, + "learning_rate": 6.027210453088106e-06, + "loss": 0.4264, + "step": 12050 + }, + { + "epoch": 1.9791226161394289, + "grad_norm": 0.3449763478453752, + "learning_rate": 6.026769306414234e-06, + "loss": 0.4534, + "step": 12051 + }, + { + "epoch": 1.9792868433477717, + "grad_norm": 0.31103476736921737, + "learning_rate": 6.026328143158143e-06, + "loss": 0.4565, + "step": 12052 + }, + { + "epoch": 1.9794510705561144, + "grad_norm": 0.297356778106612, + "learning_rate": 6.0258869633246275e-06, + "loss": 0.4705, + "step": 12053 + }, + { + "epoch": 1.979615297764457, + "grad_norm": 0.2549611004267377, + "learning_rate": 6.0254457669184795e-06, + "loss": 0.4406, + "step": 12054 + }, + { + "epoch": 1.9797795249727999, + "grad_norm": 0.3368960762751481, + "learning_rate": 6.025004553944488e-06, + "loss": 0.4179, + "step": 12055 + }, + { + "epoch": 1.9799437521811427, + "grad_norm": 0.28991698335346405, + "learning_rate": 6.0245633244074485e-06, + "loss": 0.4597, + "step": 12056 + }, + { + "epoch": 1.9801079793894854, + "grad_norm": 0.3258666563489521, + "learning_rate": 6.02412207831215e-06, + "loss": 0.4308, + "step": 12057 + }, + { + "epoch": 1.980272206597828, + "grad_norm": 0.3596563049725067, + "learning_rate": 6.023680815663386e-06, + "loss": 0.467, + "step": 12058 + }, + { + "epoch": 1.9804364338061708, + "grad_norm": 0.3375615413381802, + "learning_rate": 6.02323953646595e-06, + "loss": 0.4426, + "step": 12059 + }, + { + "epoch": 1.9806006610145137, + "grad_norm": 0.2701265231327295, + "learning_rate": 6.022798240724633e-06, + "loss": 0.4414, + "step": 12060 + }, + { + "epoch": 1.9807648882228563, + "grad_norm": 0.327944132720733, + "learning_rate": 6.0223569284442296e-06, + "loss": 0.46, + "step": 12061 + }, + { + "epoch": 1.980929115431199, + "grad_norm": 0.28050418382709885, + "learning_rate": 6.021915599629533e-06, + "loss": 0.4757, + "step": 12062 + }, + { + "epoch": 1.9810933426395418, + "grad_norm": 0.3169619359243772, + "learning_rate": 6.021474254285334e-06, + "loss": 0.4506, + "step": 12063 + }, + { + "epoch": 1.9812575698478847, + "grad_norm": 0.2981630396884761, + "learning_rate": 6.021032892416428e-06, + "loss": 0.4647, + "step": 12064 + }, + { + "epoch": 1.9814217970562273, + "grad_norm": 0.32351176269873505, + "learning_rate": 6.020591514027608e-06, + "loss": 0.4581, + "step": 12065 + }, + { + "epoch": 1.98158602426457, + "grad_norm": 0.4231399008420313, + "learning_rate": 6.02015011912367e-06, + "loss": 0.4458, + "step": 12066 + }, + { + "epoch": 1.9817502514729126, + "grad_norm": 0.3037909970679311, + "learning_rate": 6.019708707709406e-06, + "loss": 0.4493, + "step": 12067 + }, + { + "epoch": 1.9819144786812555, + "grad_norm": 0.27711451234107276, + "learning_rate": 6.019267279789607e-06, + "loss": 0.4367, + "step": 12068 + }, + { + "epoch": 1.9820787058895983, + "grad_norm": 0.26567383056133026, + "learning_rate": 6.018825835369073e-06, + "loss": 0.4137, + "step": 12069 + }, + { + "epoch": 1.982242933097941, + "grad_norm": 0.3748808611920089, + "learning_rate": 6.018384374452596e-06, + "loss": 0.4715, + "step": 12070 + }, + { + "epoch": 1.9824071603062836, + "grad_norm": 0.381151333801941, + "learning_rate": 6.017942897044971e-06, + "loss": 0.4562, + "step": 12071 + }, + { + "epoch": 1.9825713875146265, + "grad_norm": 0.4471531506882593, + "learning_rate": 6.017501403150992e-06, + "loss": 0.4493, + "step": 12072 + }, + { + "epoch": 1.9827356147229693, + "grad_norm": 0.3843248058077205, + "learning_rate": 6.017059892775455e-06, + "loss": 0.4458, + "step": 12073 + }, + { + "epoch": 1.982899841931312, + "grad_norm": 0.4006785253213048, + "learning_rate": 6.016618365923154e-06, + "loss": 0.4549, + "step": 12074 + }, + { + "epoch": 1.9830640691396546, + "grad_norm": 0.40463613359960876, + "learning_rate": 6.016176822598886e-06, + "loss": 0.4634, + "step": 12075 + }, + { + "epoch": 1.9832282963479975, + "grad_norm": 0.2790785754162893, + "learning_rate": 6.015735262807448e-06, + "loss": 0.4595, + "step": 12076 + }, + { + "epoch": 1.9833925235563403, + "grad_norm": 0.3065794759268286, + "learning_rate": 6.015293686553632e-06, + "loss": 0.4687, + "step": 12077 + }, + { + "epoch": 1.983556750764683, + "grad_norm": 0.4796119205886854, + "learning_rate": 6.014852093842236e-06, + "loss": 0.449, + "step": 12078 + }, + { + "epoch": 1.9837209779730256, + "grad_norm": 0.32901033246316697, + "learning_rate": 6.014410484678055e-06, + "loss": 0.444, + "step": 12079 + }, + { + "epoch": 1.9838852051813685, + "grad_norm": 0.31620813737993725, + "learning_rate": 6.0139688590658875e-06, + "loss": 0.4543, + "step": 12080 + }, + { + "epoch": 1.9840494323897113, + "grad_norm": 0.5304261684883916, + "learning_rate": 6.013527217010528e-06, + "loss": 0.4352, + "step": 12081 + }, + { + "epoch": 1.984213659598054, + "grad_norm": 0.33483212395030826, + "learning_rate": 6.0130855585167735e-06, + "loss": 0.4287, + "step": 12082 + }, + { + "epoch": 1.9843778868063966, + "grad_norm": 0.31516123536811785, + "learning_rate": 6.012643883589422e-06, + "loss": 0.4534, + "step": 12083 + }, + { + "epoch": 1.9845421140147392, + "grad_norm": 0.36557989364427224, + "learning_rate": 6.012202192233269e-06, + "loss": 0.4396, + "step": 12084 + }, + { + "epoch": 1.984706341223082, + "grad_norm": 0.3248087899223682, + "learning_rate": 6.011760484453113e-06, + "loss": 0.4308, + "step": 12085 + }, + { + "epoch": 1.984870568431425, + "grad_norm": 0.33659809991010253, + "learning_rate": 6.01131876025375e-06, + "loss": 0.4239, + "step": 12086 + }, + { + "epoch": 1.9850347956397676, + "grad_norm": 0.4750655674070637, + "learning_rate": 6.010877019639978e-06, + "loss": 0.4417, + "step": 12087 + }, + { + "epoch": 1.9851990228481102, + "grad_norm": 0.7893302234320693, + "learning_rate": 6.010435262616595e-06, + "loss": 0.4511, + "step": 12088 + }, + { + "epoch": 1.985363250056453, + "grad_norm": 0.2919750160730782, + "learning_rate": 6.009993489188401e-06, + "loss": 0.4235, + "step": 12089 + }, + { + "epoch": 1.985527477264796, + "grad_norm": 0.269083122192506, + "learning_rate": 6.00955169936019e-06, + "loss": 0.4384, + "step": 12090 + }, + { + "epoch": 1.9856917044731386, + "grad_norm": 0.3046254431436656, + "learning_rate": 6.009109893136764e-06, + "loss": 0.4439, + "step": 12091 + }, + { + "epoch": 1.9858559316814812, + "grad_norm": 0.34705997935606975, + "learning_rate": 6.0086680705229185e-06, + "loss": 0.4656, + "step": 12092 + }, + { + "epoch": 1.986020158889824, + "grad_norm": 0.3014169128468884, + "learning_rate": 6.008226231523454e-06, + "loss": 0.4425, + "step": 12093 + }, + { + "epoch": 1.986184386098167, + "grad_norm": 0.3203316019921863, + "learning_rate": 6.007784376143168e-06, + "loss": 0.4604, + "step": 12094 + }, + { + "epoch": 1.9863486133065096, + "grad_norm": 0.28211784166140474, + "learning_rate": 6.007342504386861e-06, + "loss": 0.4461, + "step": 12095 + }, + { + "epoch": 1.9865128405148522, + "grad_norm": 0.713397223611457, + "learning_rate": 6.0069006162593316e-06, + "loss": 0.4712, + "step": 12096 + }, + { + "epoch": 1.986677067723195, + "grad_norm": 0.32804549480165146, + "learning_rate": 6.006458711765378e-06, + "loss": 0.4464, + "step": 12097 + }, + { + "epoch": 1.986841294931538, + "grad_norm": 0.4390495685818857, + "learning_rate": 6.0060167909098005e-06, + "loss": 0.4532, + "step": 12098 + }, + { + "epoch": 1.9870055221398806, + "grad_norm": 0.29066432280196225, + "learning_rate": 6.005574853697399e-06, + "loss": 0.4299, + "step": 12099 + }, + { + "epoch": 1.9871697493482232, + "grad_norm": 0.27970517847861814, + "learning_rate": 6.005132900132976e-06, + "loss": 0.451, + "step": 12100 + }, + { + "epoch": 1.9873339765565659, + "grad_norm": 0.2993981409645384, + "learning_rate": 6.004690930221326e-06, + "loss": 0.449, + "step": 12101 + }, + { + "epoch": 1.9874982037649087, + "grad_norm": 0.4881082910339583, + "learning_rate": 6.0042489439672526e-06, + "loss": 0.4428, + "step": 12102 + }, + { + "epoch": 1.9876624309732516, + "grad_norm": 0.35593125919821117, + "learning_rate": 6.0038069413755554e-06, + "loss": 0.4737, + "step": 12103 + }, + { + "epoch": 1.9878266581815942, + "grad_norm": 0.3404574977066621, + "learning_rate": 6.003364922451035e-06, + "loss": 0.4371, + "step": 12104 + }, + { + "epoch": 1.9879908853899368, + "grad_norm": 0.35843795271388007, + "learning_rate": 6.002922887198494e-06, + "loss": 0.4464, + "step": 12105 + }, + { + "epoch": 1.9881551125982797, + "grad_norm": 0.278424406244817, + "learning_rate": 6.002480835622731e-06, + "loss": 0.4358, + "step": 12106 + }, + { + "epoch": 1.9883193398066226, + "grad_norm": 0.3792249206543696, + "learning_rate": 6.0020387677285474e-06, + "loss": 0.4479, + "step": 12107 + }, + { + "epoch": 1.9884835670149652, + "grad_norm": 0.3097187849705982, + "learning_rate": 6.001596683520746e-06, + "loss": 0.4455, + "step": 12108 + }, + { + "epoch": 1.9886477942233078, + "grad_norm": 0.3912751913987081, + "learning_rate": 6.001154583004126e-06, + "loss": 0.4589, + "step": 12109 + }, + { + "epoch": 1.9888120214316507, + "grad_norm": 0.36773299409138227, + "learning_rate": 6.000712466183492e-06, + "loss": 0.4546, + "step": 12110 + }, + { + "epoch": 1.9889762486399936, + "grad_norm": 0.25200881130078995, + "learning_rate": 6.000270333063643e-06, + "loss": 0.4221, + "step": 12111 + }, + { + "epoch": 1.9891404758483362, + "grad_norm": 0.41745494296022206, + "learning_rate": 5.999828183649382e-06, + "loss": 0.4375, + "step": 12112 + }, + { + "epoch": 1.9893047030566788, + "grad_norm": 0.41119193520709213, + "learning_rate": 5.999386017945512e-06, + "loss": 0.4527, + "step": 12113 + }, + { + "epoch": 1.9894689302650217, + "grad_norm": 0.3524347997142921, + "learning_rate": 5.998943835956833e-06, + "loss": 0.4477, + "step": 12114 + }, + { + "epoch": 1.9896331574733646, + "grad_norm": 0.6992056215375053, + "learning_rate": 5.998501637688151e-06, + "loss": 0.4373, + "step": 12115 + }, + { + "epoch": 1.9897973846817072, + "grad_norm": 0.35753146495246646, + "learning_rate": 5.998059423144266e-06, + "loss": 0.4239, + "step": 12116 + }, + { + "epoch": 1.9899616118900498, + "grad_norm": 0.305443312300451, + "learning_rate": 5.9976171923299825e-06, + "loss": 0.4599, + "step": 12117 + }, + { + "epoch": 1.9901258390983925, + "grad_norm": 0.3468952965866211, + "learning_rate": 5.997174945250102e-06, + "loss": 0.4367, + "step": 12118 + }, + { + "epoch": 1.9902900663067353, + "grad_norm": 0.30992685733369446, + "learning_rate": 5.996732681909429e-06, + "loss": 0.4238, + "step": 12119 + }, + { + "epoch": 1.9904542935150782, + "grad_norm": 0.3089277099150746, + "learning_rate": 5.9962904023127654e-06, + "loss": 0.4668, + "step": 12120 + }, + { + "epoch": 1.9906185207234208, + "grad_norm": 0.4060973659557495, + "learning_rate": 5.995848106464918e-06, + "loss": 0.4341, + "step": 12121 + }, + { + "epoch": 1.9907827479317635, + "grad_norm": 0.3343300894937679, + "learning_rate": 5.995405794370687e-06, + "loss": 0.4388, + "step": 12122 + }, + { + "epoch": 1.9909469751401063, + "grad_norm": 0.4022894904900554, + "learning_rate": 5.994963466034877e-06, + "loss": 0.4604, + "step": 12123 + }, + { + "epoch": 1.9911112023484492, + "grad_norm": 0.2809421394977338, + "learning_rate": 5.994521121462295e-06, + "loss": 0.4306, + "step": 12124 + }, + { + "epoch": 1.9912754295567918, + "grad_norm": 0.2999392110190069, + "learning_rate": 5.994078760657742e-06, + "loss": 0.4521, + "step": 12125 + }, + { + "epoch": 1.9914396567651345, + "grad_norm": 0.3439605067320684, + "learning_rate": 5.9936363836260235e-06, + "loss": 0.4559, + "step": 12126 + }, + { + "epoch": 1.9916038839734773, + "grad_norm": 0.35520361022230595, + "learning_rate": 5.993193990371945e-06, + "loss": 0.448, + "step": 12127 + }, + { + "epoch": 1.9917681111818202, + "grad_norm": 0.3873609826678473, + "learning_rate": 5.99275158090031e-06, + "loss": 0.4431, + "step": 12128 + }, + { + "epoch": 1.9919323383901628, + "grad_norm": 0.623525798670634, + "learning_rate": 5.9923091552159244e-06, + "loss": 0.4399, + "step": 12129 + }, + { + "epoch": 1.9920965655985055, + "grad_norm": 0.375059399085205, + "learning_rate": 5.9918667133235946e-06, + "loss": 0.4486, + "step": 12130 + }, + { + "epoch": 1.9922607928068483, + "grad_norm": 0.4259468484416419, + "learning_rate": 5.991424255228122e-06, + "loss": 0.4604, + "step": 12131 + }, + { + "epoch": 1.9924250200151912, + "grad_norm": 0.48872792746183463, + "learning_rate": 5.990981780934316e-06, + "loss": 0.4451, + "step": 12132 + }, + { + "epoch": 1.9925892472235338, + "grad_norm": 0.3400954850191665, + "learning_rate": 5.990539290446981e-06, + "loss": 0.4511, + "step": 12133 + }, + { + "epoch": 1.9927534744318764, + "grad_norm": 0.3584423281926243, + "learning_rate": 5.990096783770923e-06, + "loss": 0.4383, + "step": 12134 + }, + { + "epoch": 1.992917701640219, + "grad_norm": 0.34324978031707004, + "learning_rate": 5.989654260910947e-06, + "loss": 0.442, + "step": 12135 + }, + { + "epoch": 1.993081928848562, + "grad_norm": 0.32954134231022136, + "learning_rate": 5.98921172187186e-06, + "loss": 0.4543, + "step": 12136 + }, + { + "epoch": 1.9932461560569048, + "grad_norm": 0.392536390634084, + "learning_rate": 5.9887691666584685e-06, + "loss": 0.4678, + "step": 12137 + }, + { + "epoch": 1.9934103832652474, + "grad_norm": 0.2785571757259487, + "learning_rate": 5.9883265952755796e-06, + "loss": 0.4437, + "step": 12138 + }, + { + "epoch": 1.99357461047359, + "grad_norm": 0.3650211841002478, + "learning_rate": 5.987884007728001e-06, + "loss": 0.4457, + "step": 12139 + }, + { + "epoch": 1.993738837681933, + "grad_norm": 0.5938490366217879, + "learning_rate": 5.987441404020537e-06, + "loss": 0.4364, + "step": 12140 + }, + { + "epoch": 1.9939030648902758, + "grad_norm": 0.3330731764741555, + "learning_rate": 5.986998784157995e-06, + "loss": 0.4222, + "step": 12141 + }, + { + "epoch": 1.9940672920986184, + "grad_norm": 0.3502325402671695, + "learning_rate": 5.986556148145183e-06, + "loss": 0.435, + "step": 12142 + }, + { + "epoch": 1.994231519306961, + "grad_norm": 0.3442202918445196, + "learning_rate": 5.98611349598691e-06, + "loss": 0.4507, + "step": 12143 + }, + { + "epoch": 1.994395746515304, + "grad_norm": 0.4005337023377969, + "learning_rate": 5.985670827687983e-06, + "loss": 0.4468, + "step": 12144 + }, + { + "epoch": 1.9945599737236468, + "grad_norm": 0.29403675952349245, + "learning_rate": 5.985228143253207e-06, + "loss": 0.4617, + "step": 12145 + }, + { + "epoch": 1.9947242009319894, + "grad_norm": 0.430171744081949, + "learning_rate": 5.984785442687394e-06, + "loss": 0.4428, + "step": 12146 + }, + { + "epoch": 1.994888428140332, + "grad_norm": 0.7144987121760876, + "learning_rate": 5.984342725995349e-06, + "loss": 0.4536, + "step": 12147 + }, + { + "epoch": 1.995052655348675, + "grad_norm": 0.5575791795273514, + "learning_rate": 5.9838999931818816e-06, + "loss": 0.4447, + "step": 12148 + }, + { + "epoch": 1.9952168825570178, + "grad_norm": 0.3103253306063338, + "learning_rate": 5.983457244251801e-06, + "loss": 0.4482, + "step": 12149 + }, + { + "epoch": 1.9953811097653604, + "grad_norm": 0.306105882253974, + "learning_rate": 5.983014479209914e-06, + "loss": 0.4603, + "step": 12150 + }, + { + "epoch": 1.995545336973703, + "grad_norm": 0.3256489586607816, + "learning_rate": 5.982571698061033e-06, + "loss": 0.4502, + "step": 12151 + }, + { + "epoch": 1.9957095641820457, + "grad_norm": 0.30675263299161515, + "learning_rate": 5.982128900809962e-06, + "loss": 0.4408, + "step": 12152 + }, + { + "epoch": 1.9958737913903886, + "grad_norm": 0.3979035965681152, + "learning_rate": 5.981686087461514e-06, + "loss": 0.4338, + "step": 12153 + }, + { + "epoch": 1.9960380185987314, + "grad_norm": 0.5714962273931158, + "learning_rate": 5.981243258020498e-06, + "loss": 0.473, + "step": 12154 + }, + { + "epoch": 1.996202245807074, + "grad_norm": 0.5233640080963108, + "learning_rate": 5.980800412491722e-06, + "loss": 0.4452, + "step": 12155 + }, + { + "epoch": 1.9963664730154167, + "grad_norm": 0.5095439866164315, + "learning_rate": 5.980357550879997e-06, + "loss": 0.4404, + "step": 12156 + }, + { + "epoch": 1.9965307002237596, + "grad_norm": 0.34135588876267836, + "learning_rate": 5.979914673190132e-06, + "loss": 0.4641, + "step": 12157 + }, + { + "epoch": 1.9966949274321024, + "grad_norm": 0.3166966051628447, + "learning_rate": 5.979471779426938e-06, + "loss": 0.4463, + "step": 12158 + }, + { + "epoch": 1.996859154640445, + "grad_norm": 0.3534752492741154, + "learning_rate": 5.9790288695952256e-06, + "loss": 0.4179, + "step": 12159 + }, + { + "epoch": 1.9970233818487877, + "grad_norm": 0.2803029304863608, + "learning_rate": 5.9785859436998035e-06, + "loss": 0.438, + "step": 12160 + }, + { + "epoch": 1.9971876090571306, + "grad_norm": 0.3739910815165364, + "learning_rate": 5.978143001745484e-06, + "loss": 0.4254, + "step": 12161 + }, + { + "epoch": 1.9973518362654734, + "grad_norm": 0.33129160415655884, + "learning_rate": 5.977700043737075e-06, + "loss": 0.4323, + "step": 12162 + }, + { + "epoch": 1.997516063473816, + "grad_norm": 0.31774150789673616, + "learning_rate": 5.977257069679393e-06, + "loss": 0.443, + "step": 12163 + }, + { + "epoch": 1.9976802906821587, + "grad_norm": 0.3477407627974902, + "learning_rate": 5.9768140795772445e-06, + "loss": 0.4463, + "step": 12164 + }, + { + "epoch": 1.9978445178905015, + "grad_norm": 0.3209257252688937, + "learning_rate": 5.9763710734354415e-06, + "loss": 0.4492, + "step": 12165 + }, + { + "epoch": 1.9980087450988444, + "grad_norm": 0.32082689954088633, + "learning_rate": 5.9759280512587966e-06, + "loss": 0.4436, + "step": 12166 + }, + { + "epoch": 1.998172972307187, + "grad_norm": 0.35698917095820754, + "learning_rate": 5.975485013052122e-06, + "loss": 0.4476, + "step": 12167 + }, + { + "epoch": 1.9983371995155297, + "grad_norm": 0.3286716694844807, + "learning_rate": 5.975041958820227e-06, + "loss": 0.4326, + "step": 12168 + }, + { + "epoch": 1.9985014267238723, + "grad_norm": 0.42593190197587716, + "learning_rate": 5.974598888567925e-06, + "loss": 0.4603, + "step": 12169 + }, + { + "epoch": 1.9986656539322152, + "grad_norm": 0.3397923984316032, + "learning_rate": 5.974155802300027e-06, + "loss": 0.4391, + "step": 12170 + }, + { + "epoch": 1.998829881140558, + "grad_norm": 0.4923490616754908, + "learning_rate": 5.97371270002135e-06, + "loss": 0.464, + "step": 12171 + }, + { + "epoch": 1.9989941083489007, + "grad_norm": 0.3340119955565463, + "learning_rate": 5.973269581736701e-06, + "loss": 0.4498, + "step": 12172 + }, + { + "epoch": 1.9991583355572433, + "grad_norm": 0.4640454105520234, + "learning_rate": 5.972826447450896e-06, + "loss": 0.4515, + "step": 12173 + }, + { + "epoch": 1.9993225627655862, + "grad_norm": 0.6088139999913015, + "learning_rate": 5.9723832971687455e-06, + "loss": 0.445, + "step": 12174 + }, + { + "epoch": 1.999486789973929, + "grad_norm": 0.27415029930250745, + "learning_rate": 5.971940130895065e-06, + "loss": 0.4335, + "step": 12175 + }, + { + "epoch": 1.9996510171822717, + "grad_norm": 0.30806362684880456, + "learning_rate": 5.971496948634665e-06, + "loss": 0.4513, + "step": 12176 + }, + { + "epoch": 1.9998152443906143, + "grad_norm": 0.4787944606540653, + "learning_rate": 5.9710537503923605e-06, + "loss": 0.4689, + "step": 12177 + }, + { + "epoch": 1.9999794715989572, + "grad_norm": 0.2904555426356073, + "learning_rate": 5.970610536172966e-06, + "loss": 0.4574, + "step": 12178 + }, + { + "epoch": 2.0001436988073, + "grad_norm": 0.42320407092102436, + "learning_rate": 5.970167305981294e-06, + "loss": 0.4538, + "step": 12179 + }, + { + "epoch": 2.0003079260156427, + "grad_norm": 0.3083798483536343, + "learning_rate": 5.969724059822159e-06, + "loss": 0.4534, + "step": 12180 + }, + { + "epoch": 2.0004721532239853, + "grad_norm": 0.4283547328237905, + "learning_rate": 5.969280797700373e-06, + "loss": 0.4586, + "step": 12181 + }, + { + "epoch": 2.000636380432328, + "grad_norm": 0.4112612822032779, + "learning_rate": 5.968837519620753e-06, + "loss": 0.4597, + "step": 12182 + }, + { + "epoch": 2.000800607640671, + "grad_norm": 0.3368213650434348, + "learning_rate": 5.968394225588113e-06, + "loss": 0.4564, + "step": 12183 + }, + { + "epoch": 2.0009648348490137, + "grad_norm": 0.3492692409648053, + "learning_rate": 5.967950915607267e-06, + "loss": 0.4603, + "step": 12184 + }, + { + "epoch": 2.0011290620573563, + "grad_norm": 0.327969557551925, + "learning_rate": 5.967507589683027e-06, + "loss": 0.4592, + "step": 12185 + }, + { + "epoch": 2.001293289265699, + "grad_norm": 1.0052464626999962, + "learning_rate": 5.967064247820214e-06, + "loss": 0.447, + "step": 12186 + }, + { + "epoch": 2.001457516474042, + "grad_norm": 0.3323420980370512, + "learning_rate": 5.966620890023639e-06, + "loss": 0.4497, + "step": 12187 + }, + { + "epoch": 2.0016217436823847, + "grad_norm": 0.30221353880927326, + "learning_rate": 5.966177516298119e-06, + "loss": 0.4381, + "step": 12188 + }, + { + "epoch": 2.0017859708907273, + "grad_norm": 0.34728416679258106, + "learning_rate": 5.965734126648467e-06, + "loss": 0.4511, + "step": 12189 + }, + { + "epoch": 2.00195019809907, + "grad_norm": 0.34886261634920057, + "learning_rate": 5.9652907210795e-06, + "loss": 0.4818, + "step": 12190 + }, + { + "epoch": 2.002114425307413, + "grad_norm": 0.3436162370687873, + "learning_rate": 5.964847299596035e-06, + "loss": 0.448, + "step": 12191 + }, + { + "epoch": 2.0022786525157557, + "grad_norm": 0.4215227813011232, + "learning_rate": 5.964403862202888e-06, + "loss": 0.4549, + "step": 12192 + }, + { + "epoch": 2.0024428797240983, + "grad_norm": 0.3366921499133086, + "learning_rate": 5.963960408904874e-06, + "loss": 0.4392, + "step": 12193 + }, + { + "epoch": 2.002607106932441, + "grad_norm": 0.3245529569332661, + "learning_rate": 5.963516939706809e-06, + "loss": 0.4551, + "step": 12194 + }, + { + "epoch": 2.0027713341407836, + "grad_norm": 0.33977378407262626, + "learning_rate": 5.963073454613509e-06, + "loss": 0.4695, + "step": 12195 + }, + { + "epoch": 2.0029355613491266, + "grad_norm": 0.3069779515187422, + "learning_rate": 5.962629953629794e-06, + "loss": 0.4412, + "step": 12196 + }, + { + "epoch": 2.0030997885574693, + "grad_norm": 0.3507049844560384, + "learning_rate": 5.962186436760476e-06, + "loss": 0.4761, + "step": 12197 + }, + { + "epoch": 2.003264015765812, + "grad_norm": 0.3491589362302453, + "learning_rate": 5.9617429040103785e-06, + "loss": 0.441, + "step": 12198 + }, + { + "epoch": 2.0034282429741546, + "grad_norm": 0.3742547465374901, + "learning_rate": 5.961299355384311e-06, + "loss": 0.4281, + "step": 12199 + }, + { + "epoch": 2.0035924701824976, + "grad_norm": 0.4053926473537887, + "learning_rate": 5.960855790887098e-06, + "loss": 0.4504, + "step": 12200 + }, + { + "epoch": 2.0037566973908403, + "grad_norm": 0.35957952049354897, + "learning_rate": 5.960412210523552e-06, + "loss": 0.4252, + "step": 12201 + }, + { + "epoch": 2.003920924599183, + "grad_norm": 0.32657403088533404, + "learning_rate": 5.959968614298493e-06, + "loss": 0.4472, + "step": 12202 + }, + { + "epoch": 2.0040851518075256, + "grad_norm": 0.3244325404399533, + "learning_rate": 5.959525002216738e-06, + "loss": 0.4618, + "step": 12203 + }, + { + "epoch": 2.0042493790158686, + "grad_norm": 0.6620676419938378, + "learning_rate": 5.959081374283106e-06, + "loss": 0.4516, + "step": 12204 + }, + { + "epoch": 2.0044136062242113, + "grad_norm": 0.31677753319473856, + "learning_rate": 5.9586377305024145e-06, + "loss": 0.4485, + "step": 12205 + }, + { + "epoch": 2.004577833432554, + "grad_norm": 0.4121208836205025, + "learning_rate": 5.958194070879482e-06, + "loss": 0.4532, + "step": 12206 + }, + { + "epoch": 2.0047420606408966, + "grad_norm": 0.3116933045975636, + "learning_rate": 5.957750395419127e-06, + "loss": 0.4567, + "step": 12207 + }, + { + "epoch": 2.0049062878492396, + "grad_norm": 0.33302922228958487, + "learning_rate": 5.957306704126169e-06, + "loss": 0.4518, + "step": 12208 + }, + { + "epoch": 2.0050705150575823, + "grad_norm": 0.4198718931149218, + "learning_rate": 5.956862997005428e-06, + "loss": 0.4647, + "step": 12209 + }, + { + "epoch": 2.005234742265925, + "grad_norm": 0.34251455165151395, + "learning_rate": 5.956419274061719e-06, + "loss": 0.4512, + "step": 12210 + }, + { + "epoch": 2.0053989694742675, + "grad_norm": 0.3519346429865396, + "learning_rate": 5.955975535299863e-06, + "loss": 0.4402, + "step": 12211 + }, + { + "epoch": 2.00556319668261, + "grad_norm": 0.36308105461875656, + "learning_rate": 5.955531780724683e-06, + "loss": 0.4598, + "step": 12212 + }, + { + "epoch": 2.0057274238909533, + "grad_norm": 0.3741093286692219, + "learning_rate": 5.955088010340995e-06, + "loss": 0.4443, + "step": 12213 + }, + { + "epoch": 2.005891651099296, + "grad_norm": 0.328443035688512, + "learning_rate": 5.954644224153618e-06, + "loss": 0.4192, + "step": 12214 + }, + { + "epoch": 2.0060558783076385, + "grad_norm": 0.3324771672414727, + "learning_rate": 5.954200422167376e-06, + "loss": 0.4319, + "step": 12215 + }, + { + "epoch": 2.006220105515981, + "grad_norm": 0.29254376840142776, + "learning_rate": 5.953756604387085e-06, + "loss": 0.4662, + "step": 12216 + }, + { + "epoch": 2.0063843327243243, + "grad_norm": 0.26014344535423545, + "learning_rate": 5.953312770817568e-06, + "loss": 0.432, + "step": 12217 + }, + { + "epoch": 2.006548559932667, + "grad_norm": 0.3568735894151926, + "learning_rate": 5.952868921463643e-06, + "loss": 0.4651, + "step": 12218 + }, + { + "epoch": 2.0067127871410095, + "grad_norm": 0.3141590794217856, + "learning_rate": 5.952425056330134e-06, + "loss": 0.4567, + "step": 12219 + }, + { + "epoch": 2.006877014349352, + "grad_norm": 0.3884315355667573, + "learning_rate": 5.951981175421858e-06, + "loss": 0.4439, + "step": 12220 + }, + { + "epoch": 2.0070412415576953, + "grad_norm": 0.30058138999138845, + "learning_rate": 5.951537278743639e-06, + "loss": 0.432, + "step": 12221 + }, + { + "epoch": 2.007205468766038, + "grad_norm": 0.44575480874833284, + "learning_rate": 5.951093366300296e-06, + "loss": 0.4355, + "step": 12222 + }, + { + "epoch": 2.0073696959743805, + "grad_norm": 0.366245952203828, + "learning_rate": 5.950649438096653e-06, + "loss": 0.4535, + "step": 12223 + }, + { + "epoch": 2.007533923182723, + "grad_norm": 0.2867930542655121, + "learning_rate": 5.9502054941375285e-06, + "loss": 0.4378, + "step": 12224 + }, + { + "epoch": 2.0076981503910662, + "grad_norm": 0.47746484419606905, + "learning_rate": 5.949761534427746e-06, + "loss": 0.4115, + "step": 12225 + }, + { + "epoch": 2.007862377599409, + "grad_norm": 0.37527877564632606, + "learning_rate": 5.9493175589721265e-06, + "loss": 0.4338, + "step": 12226 + }, + { + "epoch": 2.0080266048077515, + "grad_norm": 0.3297901763313656, + "learning_rate": 5.948873567775493e-06, + "loss": 0.4139, + "step": 12227 + }, + { + "epoch": 2.008190832016094, + "grad_norm": 0.2561263798325084, + "learning_rate": 5.948429560842666e-06, + "loss": 0.447, + "step": 12228 + }, + { + "epoch": 2.008355059224437, + "grad_norm": 0.4365444130306358, + "learning_rate": 5.94798553817847e-06, + "loss": 0.458, + "step": 12229 + }, + { + "epoch": 2.00851928643278, + "grad_norm": 0.5232889294616114, + "learning_rate": 5.9475414997877255e-06, + "loss": 0.447, + "step": 12230 + }, + { + "epoch": 2.0086835136411225, + "grad_norm": 0.3411242576654733, + "learning_rate": 5.947097445675258e-06, + "loss": 0.4541, + "step": 12231 + }, + { + "epoch": 2.008847740849465, + "grad_norm": 0.3540437876499908, + "learning_rate": 5.946653375845887e-06, + "loss": 0.4528, + "step": 12232 + }, + { + "epoch": 2.009011968057808, + "grad_norm": 0.41463655700949253, + "learning_rate": 5.946209290304437e-06, + "loss": 0.4484, + "step": 12233 + }, + { + "epoch": 2.009176195266151, + "grad_norm": 0.3590231272906019, + "learning_rate": 5.945765189055731e-06, + "loss": 0.4646, + "step": 12234 + }, + { + "epoch": 2.0093404224744935, + "grad_norm": 0.5948107860227744, + "learning_rate": 5.945321072104593e-06, + "loss": 0.4482, + "step": 12235 + }, + { + "epoch": 2.009504649682836, + "grad_norm": 0.309417465568033, + "learning_rate": 5.944876939455848e-06, + "loss": 0.4331, + "step": 12236 + }, + { + "epoch": 2.009668876891179, + "grad_norm": 0.3306098908330728, + "learning_rate": 5.944432791114314e-06, + "loss": 0.4499, + "step": 12237 + }, + { + "epoch": 2.009833104099522, + "grad_norm": 0.31621060772673043, + "learning_rate": 5.943988627084822e-06, + "loss": 0.4568, + "step": 12238 + }, + { + "epoch": 2.0099973313078645, + "grad_norm": 0.2856713698078628, + "learning_rate": 5.943544447372191e-06, + "loss": 0.4255, + "step": 12239 + }, + { + "epoch": 2.010161558516207, + "grad_norm": 0.28032980432657734, + "learning_rate": 5.943100251981248e-06, + "loss": 0.45, + "step": 12240 + }, + { + "epoch": 2.01032578572455, + "grad_norm": 0.3390743841064049, + "learning_rate": 5.942656040916815e-06, + "loss": 0.464, + "step": 12241 + }, + { + "epoch": 2.010490012932893, + "grad_norm": 0.3187089522691845, + "learning_rate": 5.942211814183721e-06, + "loss": 0.4534, + "step": 12242 + }, + { + "epoch": 2.0106542401412355, + "grad_norm": 0.42615693774868557, + "learning_rate": 5.941767571786786e-06, + "loss": 0.4631, + "step": 12243 + }, + { + "epoch": 2.010818467349578, + "grad_norm": 0.36667170180078384, + "learning_rate": 5.941323313730836e-06, + "loss": 0.4236, + "step": 12244 + }, + { + "epoch": 2.010982694557921, + "grad_norm": 0.43862327105157484, + "learning_rate": 5.940879040020696e-06, + "loss": 0.4519, + "step": 12245 + }, + { + "epoch": 2.0111469217662634, + "grad_norm": 0.33579804262100527, + "learning_rate": 5.940434750661194e-06, + "loss": 0.4137, + "step": 12246 + }, + { + "epoch": 2.0113111489746065, + "grad_norm": 0.30978789940948254, + "learning_rate": 5.939990445657153e-06, + "loss": 0.4343, + "step": 12247 + }, + { + "epoch": 2.011475376182949, + "grad_norm": 0.3728929087707586, + "learning_rate": 5.939546125013399e-06, + "loss": 0.4539, + "step": 12248 + }, + { + "epoch": 2.0116396033912918, + "grad_norm": 0.3256492544812732, + "learning_rate": 5.939101788734757e-06, + "loss": 0.4551, + "step": 12249 + }, + { + "epoch": 2.0118038305996344, + "grad_norm": 0.30868824642963294, + "learning_rate": 5.938657436826054e-06, + "loss": 0.4391, + "step": 12250 + }, + { + "epoch": 2.0119680578079775, + "grad_norm": 0.28528064429757055, + "learning_rate": 5.938213069292117e-06, + "loss": 0.4352, + "step": 12251 + }, + { + "epoch": 2.01213228501632, + "grad_norm": 0.720505405909572, + "learning_rate": 5.93776868613777e-06, + "loss": 0.4582, + "step": 12252 + }, + { + "epoch": 2.0122965122246628, + "grad_norm": 0.40996297358279415, + "learning_rate": 5.93732428736784e-06, + "loss": 0.4366, + "step": 12253 + }, + { + "epoch": 2.0124607394330054, + "grad_norm": 0.36966590337145583, + "learning_rate": 5.936879872987155e-06, + "loss": 0.4422, + "step": 12254 + }, + { + "epoch": 2.0126249666413485, + "grad_norm": 0.40644461183977937, + "learning_rate": 5.93643544300054e-06, + "loss": 0.4352, + "step": 12255 + }, + { + "epoch": 2.012789193849691, + "grad_norm": 0.348232310385563, + "learning_rate": 5.935990997412823e-06, + "loss": 0.4499, + "step": 12256 + }, + { + "epoch": 2.0129534210580338, + "grad_norm": 0.30635919875987044, + "learning_rate": 5.9355465362288315e-06, + "loss": 0.4597, + "step": 12257 + }, + { + "epoch": 2.0131176482663764, + "grad_norm": 0.38023284124533757, + "learning_rate": 5.9351020594533914e-06, + "loss": 0.4341, + "step": 12258 + }, + { + "epoch": 2.0132818754747195, + "grad_norm": 0.3209364969044947, + "learning_rate": 5.934657567091332e-06, + "loss": 0.4462, + "step": 12259 + }, + { + "epoch": 2.013446102683062, + "grad_norm": 0.4138574131679595, + "learning_rate": 5.9342130591474785e-06, + "loss": 0.4343, + "step": 12260 + }, + { + "epoch": 2.0136103298914048, + "grad_norm": 0.3691427016956709, + "learning_rate": 5.933768535626662e-06, + "loss": 0.4557, + "step": 12261 + }, + { + "epoch": 2.0137745570997474, + "grad_norm": 0.3137057625519252, + "learning_rate": 5.933323996533708e-06, + "loss": 0.4639, + "step": 12262 + }, + { + "epoch": 2.01393878430809, + "grad_norm": 0.4452247347531062, + "learning_rate": 5.932879441873445e-06, + "loss": 0.4478, + "step": 12263 + }, + { + "epoch": 2.014103011516433, + "grad_norm": 0.32534561106326604, + "learning_rate": 5.932434871650701e-06, + "loss": 0.4222, + "step": 12264 + }, + { + "epoch": 2.0142672387247758, + "grad_norm": 0.4228276902347355, + "learning_rate": 5.931990285870306e-06, + "loss": 0.4505, + "step": 12265 + }, + { + "epoch": 2.0144314659331184, + "grad_norm": 0.4876125453560543, + "learning_rate": 5.931545684537086e-06, + "loss": 0.4232, + "step": 12266 + }, + { + "epoch": 2.014595693141461, + "grad_norm": 0.4172990521894343, + "learning_rate": 5.9311010676558724e-06, + "loss": 0.4313, + "step": 12267 + }, + { + "epoch": 2.014759920349804, + "grad_norm": 0.4012043012163585, + "learning_rate": 5.9306564352314935e-06, + "loss": 0.4425, + "step": 12268 + }, + { + "epoch": 2.0149241475581467, + "grad_norm": 0.31821251020135133, + "learning_rate": 5.930211787268777e-06, + "loss": 0.455, + "step": 12269 + }, + { + "epoch": 2.0150883747664894, + "grad_norm": 0.3358598549375324, + "learning_rate": 5.929767123772555e-06, + "loss": 0.4501, + "step": 12270 + }, + { + "epoch": 2.015252601974832, + "grad_norm": 0.34992544409089366, + "learning_rate": 5.929322444747655e-06, + "loss": 0.4398, + "step": 12271 + }, + { + "epoch": 2.015416829183175, + "grad_norm": 0.46600892538976924, + "learning_rate": 5.928877750198906e-06, + "loss": 0.4376, + "step": 12272 + }, + { + "epoch": 2.0155810563915177, + "grad_norm": 0.4421023504524407, + "learning_rate": 5.928433040131139e-06, + "loss": 0.4627, + "step": 12273 + }, + { + "epoch": 2.0157452835998604, + "grad_norm": 0.3466636254391006, + "learning_rate": 5.9279883145491835e-06, + "loss": 0.4657, + "step": 12274 + }, + { + "epoch": 2.015909510808203, + "grad_norm": 0.41056134961088725, + "learning_rate": 5.927543573457871e-06, + "loss": 0.4482, + "step": 12275 + }, + { + "epoch": 2.016073738016546, + "grad_norm": 0.3998987345596671, + "learning_rate": 5.927098816862031e-06, + "loss": 0.4558, + "step": 12276 + }, + { + "epoch": 2.0162379652248887, + "grad_norm": 0.3411150872881389, + "learning_rate": 5.926654044766493e-06, + "loss": 0.4253, + "step": 12277 + }, + { + "epoch": 2.0164021924332314, + "grad_norm": 0.44622183591825637, + "learning_rate": 5.926209257176087e-06, + "loss": 0.4414, + "step": 12278 + }, + { + "epoch": 2.016566419641574, + "grad_norm": 0.4301937123349158, + "learning_rate": 5.925764454095646e-06, + "loss": 0.4332, + "step": 12279 + }, + { + "epoch": 2.0167306468499167, + "grad_norm": 0.518469057403512, + "learning_rate": 5.925319635530003e-06, + "loss": 0.4605, + "step": 12280 + }, + { + "epoch": 2.0168948740582597, + "grad_norm": 0.32242423510941276, + "learning_rate": 5.924874801483985e-06, + "loss": 0.4433, + "step": 12281 + }, + { + "epoch": 2.0170591012666024, + "grad_norm": 0.3939941753019156, + "learning_rate": 5.924429951962424e-06, + "loss": 0.4509, + "step": 12282 + }, + { + "epoch": 2.017223328474945, + "grad_norm": 0.330619113684146, + "learning_rate": 5.9239850869701516e-06, + "loss": 0.4334, + "step": 12283 + }, + { + "epoch": 2.0173875556832876, + "grad_norm": 0.4240219166481088, + "learning_rate": 5.923540206512001e-06, + "loss": 0.4517, + "step": 12284 + }, + { + "epoch": 2.0175517828916307, + "grad_norm": 0.37369167937967235, + "learning_rate": 5.923095310592804e-06, + "loss": 0.4418, + "step": 12285 + }, + { + "epoch": 2.0177160100999734, + "grad_norm": 0.33052399671771765, + "learning_rate": 5.922650399217391e-06, + "loss": 0.4549, + "step": 12286 + }, + { + "epoch": 2.017880237308316, + "grad_norm": 0.3318820800605246, + "learning_rate": 5.922205472390594e-06, + "loss": 0.4641, + "step": 12287 + }, + { + "epoch": 2.0180444645166586, + "grad_norm": 0.25965853886842816, + "learning_rate": 5.9217605301172475e-06, + "loss": 0.4327, + "step": 12288 + }, + { + "epoch": 2.0182086917250017, + "grad_norm": 0.39578260130111304, + "learning_rate": 5.921315572402183e-06, + "loss": 0.4443, + "step": 12289 + }, + { + "epoch": 2.0183729189333444, + "grad_norm": 0.3019127278366922, + "learning_rate": 5.920870599250232e-06, + "loss": 0.4526, + "step": 12290 + }, + { + "epoch": 2.018537146141687, + "grad_norm": 0.31515638588112566, + "learning_rate": 5.92042561066623e-06, + "loss": 0.4341, + "step": 12291 + }, + { + "epoch": 2.0187013733500296, + "grad_norm": 0.3347035481218894, + "learning_rate": 5.9199806066550074e-06, + "loss": 0.4211, + "step": 12292 + }, + { + "epoch": 2.0188656005583727, + "grad_norm": 0.342636954069315, + "learning_rate": 5.919535587221398e-06, + "loss": 0.4528, + "step": 12293 + }, + { + "epoch": 2.0190298277667154, + "grad_norm": 0.31264338761398147, + "learning_rate": 5.919090552370235e-06, + "loss": 0.4433, + "step": 12294 + }, + { + "epoch": 2.019194054975058, + "grad_norm": 0.30817308473653016, + "learning_rate": 5.918645502106354e-06, + "loss": 0.4533, + "step": 12295 + }, + { + "epoch": 2.0193582821834006, + "grad_norm": 0.4982108932983128, + "learning_rate": 5.918200436434586e-06, + "loss": 0.4467, + "step": 12296 + }, + { + "epoch": 2.0195225093917433, + "grad_norm": 0.3278038373993306, + "learning_rate": 5.9177553553597665e-06, + "loss": 0.4554, + "step": 12297 + }, + { + "epoch": 2.0196867366000864, + "grad_norm": 0.3155972149201842, + "learning_rate": 5.917310258886728e-06, + "loss": 0.4544, + "step": 12298 + }, + { + "epoch": 2.019850963808429, + "grad_norm": 0.36100394939817493, + "learning_rate": 5.916865147020307e-06, + "loss": 0.4359, + "step": 12299 + }, + { + "epoch": 2.0200151910167716, + "grad_norm": 0.474739217853946, + "learning_rate": 5.916420019765336e-06, + "loss": 0.4442, + "step": 12300 + }, + { + "epoch": 2.0201794182251143, + "grad_norm": 0.3725941176745618, + "learning_rate": 5.915974877126649e-06, + "loss": 0.4448, + "step": 12301 + }, + { + "epoch": 2.0203436454334573, + "grad_norm": 0.3394057147474312, + "learning_rate": 5.915529719109083e-06, + "loss": 0.4326, + "step": 12302 + }, + { + "epoch": 2.0205078726418, + "grad_norm": 0.3333744225766765, + "learning_rate": 5.9150845457174704e-06, + "loss": 0.444, + "step": 12303 + }, + { + "epoch": 2.0206720998501426, + "grad_norm": 0.3984379295038264, + "learning_rate": 5.9146393569566485e-06, + "loss": 0.4323, + "step": 12304 + }, + { + "epoch": 2.0208363270584853, + "grad_norm": 0.4014873188536819, + "learning_rate": 5.914194152831451e-06, + "loss": 0.4556, + "step": 12305 + }, + { + "epoch": 2.0210005542668283, + "grad_norm": 0.38413574125456695, + "learning_rate": 5.913748933346714e-06, + "loss": 0.4335, + "step": 12306 + }, + { + "epoch": 2.021164781475171, + "grad_norm": 0.33331851521554756, + "learning_rate": 5.9133036985072705e-06, + "loss": 0.4507, + "step": 12307 + }, + { + "epoch": 2.0213290086835136, + "grad_norm": 0.3326758862247054, + "learning_rate": 5.91285844831796e-06, + "loss": 0.4528, + "step": 12308 + }, + { + "epoch": 2.0214932358918563, + "grad_norm": 0.29505514100770924, + "learning_rate": 5.912413182783617e-06, + "loss": 0.4401, + "step": 12309 + }, + { + "epoch": 2.0216574631001993, + "grad_norm": 0.34154494537786756, + "learning_rate": 5.911967901909078e-06, + "loss": 0.4341, + "step": 12310 + }, + { + "epoch": 2.021821690308542, + "grad_norm": 0.4049103391456977, + "learning_rate": 5.911522605699176e-06, + "loss": 0.4444, + "step": 12311 + }, + { + "epoch": 2.0219859175168846, + "grad_norm": 0.37915910316572177, + "learning_rate": 5.911077294158751e-06, + "loss": 0.4498, + "step": 12312 + }, + { + "epoch": 2.0221501447252272, + "grad_norm": 0.42567405204291847, + "learning_rate": 5.910631967292638e-06, + "loss": 0.4521, + "step": 12313 + }, + { + "epoch": 2.02231437193357, + "grad_norm": 0.303818370149242, + "learning_rate": 5.910186625105676e-06, + "loss": 0.4415, + "step": 12314 + }, + { + "epoch": 2.022478599141913, + "grad_norm": 0.3723931157545376, + "learning_rate": 5.909741267602698e-06, + "loss": 0.4303, + "step": 12315 + }, + { + "epoch": 2.0226428263502556, + "grad_norm": 0.34244289280435986, + "learning_rate": 5.909295894788541e-06, + "loss": 0.4294, + "step": 12316 + }, + { + "epoch": 2.0228070535585982, + "grad_norm": 0.45940405346671165, + "learning_rate": 5.9088505066680465e-06, + "loss": 0.4461, + "step": 12317 + }, + { + "epoch": 2.022971280766941, + "grad_norm": 0.5371059400275238, + "learning_rate": 5.908405103246049e-06, + "loss": 0.4659, + "step": 12318 + }, + { + "epoch": 2.023135507975284, + "grad_norm": 0.37195619324867074, + "learning_rate": 5.907959684527387e-06, + "loss": 0.4438, + "step": 12319 + }, + { + "epoch": 2.0232997351836266, + "grad_norm": 0.4444748904870266, + "learning_rate": 5.907514250516897e-06, + "loss": 0.4413, + "step": 12320 + }, + { + "epoch": 2.0234639623919692, + "grad_norm": 0.4461299536447834, + "learning_rate": 5.907068801219417e-06, + "loss": 0.4335, + "step": 12321 + }, + { + "epoch": 2.023628189600312, + "grad_norm": 0.43994447948385823, + "learning_rate": 5.9066233366397854e-06, + "loss": 0.4451, + "step": 12322 + }, + { + "epoch": 2.023792416808655, + "grad_norm": 0.3247864832152194, + "learning_rate": 5.90617785678284e-06, + "loss": 0.4544, + "step": 12323 + }, + { + "epoch": 2.0239566440169976, + "grad_norm": 0.3456630868813913, + "learning_rate": 5.9057323616534216e-06, + "loss": 0.4539, + "step": 12324 + }, + { + "epoch": 2.0241208712253402, + "grad_norm": 0.4800805237875434, + "learning_rate": 5.905286851256365e-06, + "loss": 0.4624, + "step": 12325 + }, + { + "epoch": 2.024285098433683, + "grad_norm": 0.34959063433120807, + "learning_rate": 5.904841325596511e-06, + "loss": 0.4667, + "step": 12326 + }, + { + "epoch": 2.024449325642026, + "grad_norm": 0.41151557024535956, + "learning_rate": 5.904395784678698e-06, + "loss": 0.4403, + "step": 12327 + }, + { + "epoch": 2.0246135528503686, + "grad_norm": 0.2929240886104455, + "learning_rate": 5.903950228507764e-06, + "loss": 0.4422, + "step": 12328 + }, + { + "epoch": 2.0247777800587112, + "grad_norm": 0.4225784029548028, + "learning_rate": 5.903504657088551e-06, + "loss": 0.4294, + "step": 12329 + }, + { + "epoch": 2.024942007267054, + "grad_norm": 0.3198757505033295, + "learning_rate": 5.903059070425895e-06, + "loss": 0.4427, + "step": 12330 + }, + { + "epoch": 2.0251062344753965, + "grad_norm": 0.35512213130023546, + "learning_rate": 5.902613468524639e-06, + "loss": 0.435, + "step": 12331 + }, + { + "epoch": 2.0252704616837396, + "grad_norm": 0.4014329270546415, + "learning_rate": 5.902167851389619e-06, + "loss": 0.4528, + "step": 12332 + }, + { + "epoch": 2.0254346888920822, + "grad_norm": 0.3685874338668055, + "learning_rate": 5.901722219025678e-06, + "loss": 0.4603, + "step": 12333 + }, + { + "epoch": 2.025598916100425, + "grad_norm": 0.28477749529150725, + "learning_rate": 5.901276571437654e-06, + "loss": 0.4383, + "step": 12334 + }, + { + "epoch": 2.0257631433087675, + "grad_norm": 0.3095326874609088, + "learning_rate": 5.9008309086303875e-06, + "loss": 0.4409, + "step": 12335 + }, + { + "epoch": 2.0259273705171106, + "grad_norm": 0.45871021472994483, + "learning_rate": 5.900385230608718e-06, + "loss": 0.459, + "step": 12336 + }, + { + "epoch": 2.026091597725453, + "grad_norm": 1.0720311778843719, + "learning_rate": 5.8999395373774885e-06, + "loss": 0.4294, + "step": 12337 + }, + { + "epoch": 2.026255824933796, + "grad_norm": 0.2924154206097919, + "learning_rate": 5.899493828941537e-06, + "loss": 0.4704, + "step": 12338 + }, + { + "epoch": 2.0264200521421385, + "grad_norm": 0.327283098845854, + "learning_rate": 5.899048105305709e-06, + "loss": 0.4586, + "step": 12339 + }, + { + "epoch": 2.0265842793504816, + "grad_norm": 0.349614059094532, + "learning_rate": 5.898602366474839e-06, + "loss": 0.4415, + "step": 12340 + }, + { + "epoch": 2.026748506558824, + "grad_norm": 0.34197640336286816, + "learning_rate": 5.898156612453772e-06, + "loss": 0.43, + "step": 12341 + }, + { + "epoch": 2.026912733767167, + "grad_norm": 0.4623929536887179, + "learning_rate": 5.897710843247348e-06, + "loss": 0.4478, + "step": 12342 + }, + { + "epoch": 2.0270769609755095, + "grad_norm": 0.5355379594969214, + "learning_rate": 5.89726505886041e-06, + "loss": 0.4451, + "step": 12343 + }, + { + "epoch": 2.0272411881838526, + "grad_norm": 0.3055222951108194, + "learning_rate": 5.896819259297799e-06, + "loss": 0.4359, + "step": 12344 + }, + { + "epoch": 2.027405415392195, + "grad_norm": 0.3074334568504291, + "learning_rate": 5.896373444564355e-06, + "loss": 0.4468, + "step": 12345 + }, + { + "epoch": 2.027569642600538, + "grad_norm": 0.33119785643769395, + "learning_rate": 5.895927614664923e-06, + "loss": 0.4489, + "step": 12346 + }, + { + "epoch": 2.0277338698088805, + "grad_norm": 0.5005833716890481, + "learning_rate": 5.895481769604343e-06, + "loss": 0.4142, + "step": 12347 + }, + { + "epoch": 2.027898097017223, + "grad_norm": 0.4446782973011459, + "learning_rate": 5.895035909387459e-06, + "loss": 0.4455, + "step": 12348 + }, + { + "epoch": 2.028062324225566, + "grad_norm": 0.3169868306167431, + "learning_rate": 5.894590034019111e-06, + "loss": 0.4518, + "step": 12349 + }, + { + "epoch": 2.028226551433909, + "grad_norm": 0.3679129008792082, + "learning_rate": 5.894144143504144e-06, + "loss": 0.4582, + "step": 12350 + }, + { + "epoch": 2.0283907786422515, + "grad_norm": 0.3356771628200559, + "learning_rate": 5.8936982378474e-06, + "loss": 0.455, + "step": 12351 + }, + { + "epoch": 2.028555005850594, + "grad_norm": 0.37868840952035804, + "learning_rate": 5.893252317053722e-06, + "loss": 0.4564, + "step": 12352 + }, + { + "epoch": 2.028719233058937, + "grad_norm": 0.3660224757041939, + "learning_rate": 5.892806381127953e-06, + "loss": 0.416, + "step": 12353 + }, + { + "epoch": 2.02888346026728, + "grad_norm": 0.35213115451904753, + "learning_rate": 5.892360430074936e-06, + "loss": 0.4313, + "step": 12354 + }, + { + "epoch": 2.0290476874756225, + "grad_norm": 0.3061381002851472, + "learning_rate": 5.891914463899515e-06, + "loss": 0.4464, + "step": 12355 + }, + { + "epoch": 2.029211914683965, + "grad_norm": 0.3198294060410183, + "learning_rate": 5.891468482606532e-06, + "loss": 0.4465, + "step": 12356 + }, + { + "epoch": 2.029376141892308, + "grad_norm": 0.5061425404700527, + "learning_rate": 5.8910224862008345e-06, + "loss": 0.4704, + "step": 12357 + }, + { + "epoch": 2.029540369100651, + "grad_norm": 0.2906027127123769, + "learning_rate": 5.890576474687264e-06, + "loss": 0.4539, + "step": 12358 + }, + { + "epoch": 2.0297045963089935, + "grad_norm": 0.9628300081908078, + "learning_rate": 5.890130448070665e-06, + "loss": 0.4605, + "step": 12359 + }, + { + "epoch": 2.029868823517336, + "grad_norm": 0.339842469867948, + "learning_rate": 5.889684406355879e-06, + "loss": 0.4452, + "step": 12360 + }, + { + "epoch": 2.030033050725679, + "grad_norm": 0.2898973367954881, + "learning_rate": 5.889238349547755e-06, + "loss": 0.4559, + "step": 12361 + }, + { + "epoch": 2.030197277934022, + "grad_norm": 0.32955122544839677, + "learning_rate": 5.8887922776511355e-06, + "loss": 0.4593, + "step": 12362 + }, + { + "epoch": 2.0303615051423645, + "grad_norm": 0.29610395042009074, + "learning_rate": 5.888346190670868e-06, + "loss": 0.46, + "step": 12363 + }, + { + "epoch": 2.030525732350707, + "grad_norm": 0.3076361536809495, + "learning_rate": 5.887900088611792e-06, + "loss": 0.4537, + "step": 12364 + }, + { + "epoch": 2.0306899595590497, + "grad_norm": 0.3806885935501196, + "learning_rate": 5.887453971478756e-06, + "loss": 0.4385, + "step": 12365 + }, + { + "epoch": 2.030854186767393, + "grad_norm": 0.4892710621594915, + "learning_rate": 5.8870078392766036e-06, + "loss": 0.4344, + "step": 12366 + }, + { + "epoch": 2.0310184139757355, + "grad_norm": 0.6621760845336766, + "learning_rate": 5.886561692010184e-06, + "loss": 0.4566, + "step": 12367 + }, + { + "epoch": 2.031182641184078, + "grad_norm": 0.34316954928589083, + "learning_rate": 5.886115529684339e-06, + "loss": 0.4545, + "step": 12368 + }, + { + "epoch": 2.0313468683924207, + "grad_norm": 0.29649605317323635, + "learning_rate": 5.8856693523039155e-06, + "loss": 0.4409, + "step": 12369 + }, + { + "epoch": 2.031511095600764, + "grad_norm": 0.32991559461150566, + "learning_rate": 5.885223159873759e-06, + "loss": 0.4436, + "step": 12370 + }, + { + "epoch": 2.0316753228091065, + "grad_norm": 0.3997830334051244, + "learning_rate": 5.884776952398717e-06, + "loss": 0.4325, + "step": 12371 + }, + { + "epoch": 2.031839550017449, + "grad_norm": 0.2950333162096897, + "learning_rate": 5.884330729883634e-06, + "loss": 0.455, + "step": 12372 + }, + { + "epoch": 2.0320037772257917, + "grad_norm": 0.3424041916207841, + "learning_rate": 5.883884492333359e-06, + "loss": 0.44, + "step": 12373 + }, + { + "epoch": 2.032168004434135, + "grad_norm": 0.27965686096829506, + "learning_rate": 5.883438239752734e-06, + "loss": 0.427, + "step": 12374 + }, + { + "epoch": 2.0323322316424774, + "grad_norm": 0.29382204979502846, + "learning_rate": 5.882991972146611e-06, + "loss": 0.4455, + "step": 12375 + }, + { + "epoch": 2.03249645885082, + "grad_norm": 0.32015433618862266, + "learning_rate": 5.882545689519834e-06, + "loss": 0.4558, + "step": 12376 + }, + { + "epoch": 2.0326606860591627, + "grad_norm": 0.3289590563823037, + "learning_rate": 5.88209939187725e-06, + "loss": 0.4581, + "step": 12377 + }, + { + "epoch": 2.032824913267506, + "grad_norm": 0.3202040927893558, + "learning_rate": 5.881653079223708e-06, + "loss": 0.433, + "step": 12378 + }, + { + "epoch": 2.0329891404758484, + "grad_norm": 0.4237401638627447, + "learning_rate": 5.881206751564053e-06, + "loss": 0.4468, + "step": 12379 + }, + { + "epoch": 2.033153367684191, + "grad_norm": 0.4113867464229899, + "learning_rate": 5.8807604089031345e-06, + "loss": 0.4498, + "step": 12380 + }, + { + "epoch": 2.0333175948925337, + "grad_norm": 0.3713036320098837, + "learning_rate": 5.880314051245799e-06, + "loss": 0.4636, + "step": 12381 + }, + { + "epoch": 2.0334818221008764, + "grad_norm": 0.3487669051771585, + "learning_rate": 5.879867678596896e-06, + "loss": 0.4261, + "step": 12382 + }, + { + "epoch": 2.0336460493092194, + "grad_norm": 0.4297560095357159, + "learning_rate": 5.8794212909612705e-06, + "loss": 0.4632, + "step": 12383 + }, + { + "epoch": 2.033810276517562, + "grad_norm": 0.41595656127910297, + "learning_rate": 5.878974888343773e-06, + "loss": 0.4363, + "step": 12384 + }, + { + "epoch": 2.0339745037259047, + "grad_norm": 0.31941281295240154, + "learning_rate": 5.878528470749252e-06, + "loss": 0.4616, + "step": 12385 + }, + { + "epoch": 2.0341387309342474, + "grad_norm": 0.3736947516535489, + "learning_rate": 5.878082038182555e-06, + "loss": 0.4407, + "step": 12386 + }, + { + "epoch": 2.0343029581425904, + "grad_norm": 0.3648758434688456, + "learning_rate": 5.8776355906485325e-06, + "loss": 0.4615, + "step": 12387 + }, + { + "epoch": 2.034467185350933, + "grad_norm": 0.33317524041004737, + "learning_rate": 5.877189128152032e-06, + "loss": 0.4516, + "step": 12388 + }, + { + "epoch": 2.0346314125592757, + "grad_norm": 0.34434024200356267, + "learning_rate": 5.876742650697902e-06, + "loss": 0.4687, + "step": 12389 + }, + { + "epoch": 2.0347956397676183, + "grad_norm": 0.3873243560043453, + "learning_rate": 5.876296158290991e-06, + "loss": 0.4664, + "step": 12390 + }, + { + "epoch": 2.0349598669759614, + "grad_norm": 0.6587433734412117, + "learning_rate": 5.87584965093615e-06, + "loss": 0.4384, + "step": 12391 + }, + { + "epoch": 2.035124094184304, + "grad_norm": 0.376035639840286, + "learning_rate": 5.87540312863823e-06, + "loss": 0.4448, + "step": 12392 + }, + { + "epoch": 2.0352883213926467, + "grad_norm": 0.3356221661068441, + "learning_rate": 5.874956591402078e-06, + "loss": 0.4504, + "step": 12393 + }, + { + "epoch": 2.0354525486009893, + "grad_norm": 0.3364467718112007, + "learning_rate": 5.874510039232544e-06, + "loss": 0.4287, + "step": 12394 + }, + { + "epoch": 2.0356167758093324, + "grad_norm": 0.3530682596790142, + "learning_rate": 5.874063472134479e-06, + "loss": 0.4323, + "step": 12395 + }, + { + "epoch": 2.035781003017675, + "grad_norm": 0.41389982440021067, + "learning_rate": 5.8736168901127325e-06, + "loss": 0.4536, + "step": 12396 + }, + { + "epoch": 2.0359452302260177, + "grad_norm": 0.317301755237896, + "learning_rate": 5.873170293172156e-06, + "loss": 0.4371, + "step": 12397 + }, + { + "epoch": 2.0361094574343603, + "grad_norm": 0.34272680753358126, + "learning_rate": 5.872723681317599e-06, + "loss": 0.4491, + "step": 12398 + }, + { + "epoch": 2.036273684642703, + "grad_norm": 0.4006309979835223, + "learning_rate": 5.87227705455391e-06, + "loss": 0.441, + "step": 12399 + }, + { + "epoch": 2.036437911851046, + "grad_norm": 0.5542787557823411, + "learning_rate": 5.871830412885944e-06, + "loss": 0.4296, + "step": 12400 + }, + { + "epoch": 2.0366021390593887, + "grad_norm": 0.3431090307672414, + "learning_rate": 5.871383756318551e-06, + "loss": 0.4513, + "step": 12401 + }, + { + "epoch": 2.0367663662677313, + "grad_norm": 0.3586838401808626, + "learning_rate": 5.87093708485658e-06, + "loss": 0.4485, + "step": 12402 + }, + { + "epoch": 2.036930593476074, + "grad_norm": 0.3051512780842894, + "learning_rate": 5.8704903985048825e-06, + "loss": 0.4547, + "step": 12403 + }, + { + "epoch": 2.037094820684417, + "grad_norm": 0.35974186850941714, + "learning_rate": 5.870043697268312e-06, + "loss": 0.4495, + "step": 12404 + }, + { + "epoch": 2.0372590478927597, + "grad_norm": 0.6056306895454063, + "learning_rate": 5.869596981151719e-06, + "loss": 0.467, + "step": 12405 + }, + { + "epoch": 2.0374232751011023, + "grad_norm": 0.2819644783966082, + "learning_rate": 5.869150250159955e-06, + "loss": 0.4314, + "step": 12406 + }, + { + "epoch": 2.037587502309445, + "grad_norm": 0.4286095726381181, + "learning_rate": 5.868703504297873e-06, + "loss": 0.4474, + "step": 12407 + }, + { + "epoch": 2.037751729517788, + "grad_norm": 0.32038506407005746, + "learning_rate": 5.868256743570323e-06, + "loss": 0.4422, + "step": 12408 + }, + { + "epoch": 2.0379159567261307, + "grad_norm": 0.3259096360271549, + "learning_rate": 5.8678099679821595e-06, + "loss": 0.4517, + "step": 12409 + }, + { + "epoch": 2.0380801839344733, + "grad_norm": 0.27644003099357023, + "learning_rate": 5.867363177538234e-06, + "loss": 0.4327, + "step": 12410 + }, + { + "epoch": 2.038244411142816, + "grad_norm": 0.3402320682171515, + "learning_rate": 5.866916372243399e-06, + "loss": 0.4618, + "step": 12411 + }, + { + "epoch": 2.038408638351159, + "grad_norm": 0.284775187844616, + "learning_rate": 5.866469552102506e-06, + "loss": 0.4524, + "step": 12412 + }, + { + "epoch": 2.0385728655595017, + "grad_norm": 0.4978827495915803, + "learning_rate": 5.866022717120411e-06, + "loss": 0.434, + "step": 12413 + }, + { + "epoch": 2.0387370927678443, + "grad_norm": 0.8185449778955562, + "learning_rate": 5.865575867301965e-06, + "loss": 0.4595, + "step": 12414 + }, + { + "epoch": 2.038901319976187, + "grad_norm": 0.3231777306250387, + "learning_rate": 5.8651290026520205e-06, + "loss": 0.4369, + "step": 12415 + }, + { + "epoch": 2.0390655471845296, + "grad_norm": 0.329589659019701, + "learning_rate": 5.864682123175433e-06, + "loss": 0.4475, + "step": 12416 + }, + { + "epoch": 2.0392297743928727, + "grad_norm": 1.1849958099030102, + "learning_rate": 5.864235228877056e-06, + "loss": 0.4255, + "step": 12417 + }, + { + "epoch": 2.0393940016012153, + "grad_norm": 0.33656368419001453, + "learning_rate": 5.86378831976174e-06, + "loss": 0.4348, + "step": 12418 + }, + { + "epoch": 2.039558228809558, + "grad_norm": 0.3167560065962795, + "learning_rate": 5.863341395834341e-06, + "loss": 0.4144, + "step": 12419 + }, + { + "epoch": 2.0397224560179006, + "grad_norm": 0.3695074006748326, + "learning_rate": 5.862894457099714e-06, + "loss": 0.4503, + "step": 12420 + }, + { + "epoch": 2.0398866832262437, + "grad_norm": 0.3667195996387198, + "learning_rate": 5.862447503562713e-06, + "loss": 0.4379, + "step": 12421 + }, + { + "epoch": 2.0400509104345863, + "grad_norm": 0.39032351094667883, + "learning_rate": 5.862000535228191e-06, + "loss": 0.4329, + "step": 12422 + }, + { + "epoch": 2.040215137642929, + "grad_norm": 0.3380516948636253, + "learning_rate": 5.861553552101003e-06, + "loss": 0.4586, + "step": 12423 + }, + { + "epoch": 2.0403793648512716, + "grad_norm": 0.35621909121959594, + "learning_rate": 5.861106554186003e-06, + "loss": 0.4533, + "step": 12424 + }, + { + "epoch": 2.0405435920596147, + "grad_norm": 0.3354530237084805, + "learning_rate": 5.860659541488048e-06, + "loss": 0.4391, + "step": 12425 + }, + { + "epoch": 2.0407078192679573, + "grad_norm": 0.2989633077547237, + "learning_rate": 5.860212514011992e-06, + "loss": 0.4241, + "step": 12426 + }, + { + "epoch": 2.0408720464763, + "grad_norm": 0.35862761603268756, + "learning_rate": 5.859765471762688e-06, + "loss": 0.4446, + "step": 12427 + }, + { + "epoch": 2.0410362736846426, + "grad_norm": 0.30444330966462657, + "learning_rate": 5.859318414744995e-06, + "loss": 0.474, + "step": 12428 + }, + { + "epoch": 2.0412005008929857, + "grad_norm": 0.352540677765662, + "learning_rate": 5.8588713429637655e-06, + "loss": 0.4387, + "step": 12429 + }, + { + "epoch": 2.0413647281013283, + "grad_norm": 0.3778446185553416, + "learning_rate": 5.8584242564238566e-06, + "loss": 0.4591, + "step": 12430 + }, + { + "epoch": 2.041528955309671, + "grad_norm": 0.3491850807666615, + "learning_rate": 5.857977155130124e-06, + "loss": 0.4621, + "step": 12431 + }, + { + "epoch": 2.0416931825180136, + "grad_norm": 0.38612813413715863, + "learning_rate": 5.857530039087423e-06, + "loss": 0.4732, + "step": 12432 + }, + { + "epoch": 2.041857409726356, + "grad_norm": 0.3933909296763072, + "learning_rate": 5.85708290830061e-06, + "loss": 0.4572, + "step": 12433 + }, + { + "epoch": 2.0420216369346993, + "grad_norm": 0.45460547103818244, + "learning_rate": 5.856635762774542e-06, + "loss": 0.4554, + "step": 12434 + }, + { + "epoch": 2.042185864143042, + "grad_norm": 0.4897357475104495, + "learning_rate": 5.856188602514075e-06, + "loss": 0.4549, + "step": 12435 + }, + { + "epoch": 2.0423500913513846, + "grad_norm": 0.37141597461045756, + "learning_rate": 5.855741427524066e-06, + "loss": 0.4341, + "step": 12436 + }, + { + "epoch": 2.042514318559727, + "grad_norm": 0.3152170933745343, + "learning_rate": 5.8552942378093694e-06, + "loss": 0.4383, + "step": 12437 + }, + { + "epoch": 2.0426785457680703, + "grad_norm": 0.3315546517567952, + "learning_rate": 5.854847033374845e-06, + "loss": 0.4422, + "step": 12438 + }, + { + "epoch": 2.042842772976413, + "grad_norm": 0.34849627509879655, + "learning_rate": 5.854399814225349e-06, + "loss": 0.4534, + "step": 12439 + }, + { + "epoch": 2.0430070001847556, + "grad_norm": 0.3415060059087718, + "learning_rate": 5.8539525803657375e-06, + "loss": 0.4244, + "step": 12440 + }, + { + "epoch": 2.043171227393098, + "grad_norm": 0.39313173165872756, + "learning_rate": 5.85350533180087e-06, + "loss": 0.4217, + "step": 12441 + }, + { + "epoch": 2.0433354546014413, + "grad_norm": 0.3429924779240633, + "learning_rate": 5.853058068535603e-06, + "loss": 0.4577, + "step": 12442 + }, + { + "epoch": 2.043499681809784, + "grad_norm": 0.32928596156929474, + "learning_rate": 5.852610790574793e-06, + "loss": 0.4566, + "step": 12443 + }, + { + "epoch": 2.0436639090181266, + "grad_norm": 0.47849256644842075, + "learning_rate": 5.8521634979232995e-06, + "loss": 0.4444, + "step": 12444 + }, + { + "epoch": 2.043828136226469, + "grad_norm": 0.3687608357126545, + "learning_rate": 5.851716190585981e-06, + "loss": 0.4318, + "step": 12445 + }, + { + "epoch": 2.0439923634348123, + "grad_norm": 0.32895085912600486, + "learning_rate": 5.851268868567694e-06, + "loss": 0.4203, + "step": 12446 + }, + { + "epoch": 2.044156590643155, + "grad_norm": 0.37289343076052367, + "learning_rate": 5.850821531873298e-06, + "loss": 0.4433, + "step": 12447 + }, + { + "epoch": 2.0443208178514976, + "grad_norm": 0.31859032614581423, + "learning_rate": 5.8503741805076496e-06, + "loss": 0.4338, + "step": 12448 + }, + { + "epoch": 2.04448504505984, + "grad_norm": 0.2887224225883823, + "learning_rate": 5.8499268144756104e-06, + "loss": 0.4357, + "step": 12449 + }, + { + "epoch": 2.044649272268183, + "grad_norm": 0.4697271025351198, + "learning_rate": 5.8494794337820375e-06, + "loss": 0.452, + "step": 12450 + }, + { + "epoch": 2.044813499476526, + "grad_norm": 0.3282934512860411, + "learning_rate": 5.849032038431792e-06, + "loss": 0.4449, + "step": 12451 + }, + { + "epoch": 2.0449777266848685, + "grad_norm": 0.385047656440147, + "learning_rate": 5.8485846284297285e-06, + "loss": 0.4388, + "step": 12452 + }, + { + "epoch": 2.045141953893211, + "grad_norm": 0.36439150850801005, + "learning_rate": 5.848137203780709e-06, + "loss": 0.4558, + "step": 12453 + }, + { + "epoch": 2.045306181101554, + "grad_norm": 0.40450690234416153, + "learning_rate": 5.847689764489595e-06, + "loss": 0.4337, + "step": 12454 + }, + { + "epoch": 2.045470408309897, + "grad_norm": 0.43383659347572107, + "learning_rate": 5.847242310561243e-06, + "loss": 0.444, + "step": 12455 + }, + { + "epoch": 2.0456346355182395, + "grad_norm": 0.33992573109644364, + "learning_rate": 5.846794842000516e-06, + "loss": 0.4302, + "step": 12456 + }, + { + "epoch": 2.045798862726582, + "grad_norm": 0.3404891501105816, + "learning_rate": 5.84634735881227e-06, + "loss": 0.4315, + "step": 12457 + }, + { + "epoch": 2.045963089934925, + "grad_norm": 0.3948929516683718, + "learning_rate": 5.845899861001367e-06, + "loss": 0.4352, + "step": 12458 + }, + { + "epoch": 2.046127317143268, + "grad_norm": 0.36232433689948934, + "learning_rate": 5.845452348572668e-06, + "loss": 0.4228, + "step": 12459 + }, + { + "epoch": 2.0462915443516105, + "grad_norm": 0.4879837670116479, + "learning_rate": 5.845004821531033e-06, + "loss": 0.4479, + "step": 12460 + }, + { + "epoch": 2.046455771559953, + "grad_norm": 0.43242467602156015, + "learning_rate": 5.844557279881321e-06, + "loss": 0.4338, + "step": 12461 + }, + { + "epoch": 2.046619998768296, + "grad_norm": 0.46972252076723947, + "learning_rate": 5.844109723628395e-06, + "loss": 0.4296, + "step": 12462 + }, + { + "epoch": 2.046784225976639, + "grad_norm": 0.348287060668494, + "learning_rate": 5.843662152777117e-06, + "loss": 0.4462, + "step": 12463 + }, + { + "epoch": 2.0469484531849815, + "grad_norm": 0.3554017413497341, + "learning_rate": 5.843214567332343e-06, + "loss": 0.4494, + "step": 12464 + }, + { + "epoch": 2.047112680393324, + "grad_norm": 0.7426503419751455, + "learning_rate": 5.842766967298939e-06, + "loss": 0.4375, + "step": 12465 + }, + { + "epoch": 2.047276907601667, + "grad_norm": 0.34331608608976466, + "learning_rate": 5.842319352681763e-06, + "loss": 0.4527, + "step": 12466 + }, + { + "epoch": 2.0474411348100094, + "grad_norm": 0.326253689959674, + "learning_rate": 5.8418717234856785e-06, + "loss": 0.4347, + "step": 12467 + }, + { + "epoch": 2.0476053620183525, + "grad_norm": 1.2139001651559362, + "learning_rate": 5.841424079715548e-06, + "loss": 0.4685, + "step": 12468 + }, + { + "epoch": 2.047769589226695, + "grad_norm": 1.809298572630905, + "learning_rate": 5.840976421376231e-06, + "loss": 0.4267, + "step": 12469 + }, + { + "epoch": 2.047933816435038, + "grad_norm": 0.643982392760066, + "learning_rate": 5.840528748472593e-06, + "loss": 0.4511, + "step": 12470 + }, + { + "epoch": 2.0480980436433804, + "grad_norm": 0.4200958520866591, + "learning_rate": 5.84008106100949e-06, + "loss": 0.4756, + "step": 12471 + }, + { + "epoch": 2.0482622708517235, + "grad_norm": 0.3785922765747419, + "learning_rate": 5.839633358991792e-06, + "loss": 0.4455, + "step": 12472 + }, + { + "epoch": 2.048426498060066, + "grad_norm": 0.334283585605747, + "learning_rate": 5.839185642424356e-06, + "loss": 0.4554, + "step": 12473 + }, + { + "epoch": 2.048590725268409, + "grad_norm": 0.2979365135820452, + "learning_rate": 5.838737911312046e-06, + "loss": 0.4394, + "step": 12474 + }, + { + "epoch": 2.0487549524767514, + "grad_norm": 0.3928511038792992, + "learning_rate": 5.838290165659726e-06, + "loss": 0.4309, + "step": 12475 + }, + { + "epoch": 2.0489191796850945, + "grad_norm": 0.42167196945885604, + "learning_rate": 5.837842405472259e-06, + "loss": 0.4202, + "step": 12476 + }, + { + "epoch": 2.049083406893437, + "grad_norm": 0.384941912400613, + "learning_rate": 5.837394630754504e-06, + "loss": 0.4454, + "step": 12477 + }, + { + "epoch": 2.04924763410178, + "grad_norm": 0.6027380809373334, + "learning_rate": 5.83694684151133e-06, + "loss": 0.4436, + "step": 12478 + }, + { + "epoch": 2.0494118613101224, + "grad_norm": 0.5500830911834226, + "learning_rate": 5.836499037747598e-06, + "loss": 0.4446, + "step": 12479 + }, + { + "epoch": 2.0495760885184655, + "grad_norm": 0.49515778776697733, + "learning_rate": 5.836051219468171e-06, + "loss": 0.4372, + "step": 12480 + }, + { + "epoch": 2.049740315726808, + "grad_norm": 0.43075419664571624, + "learning_rate": 5.835603386677913e-06, + "loss": 0.4429, + "step": 12481 + }, + { + "epoch": 2.049904542935151, + "grad_norm": 0.3542392734227228, + "learning_rate": 5.8351555393816885e-06, + "loss": 0.4262, + "step": 12482 + }, + { + "epoch": 2.0500687701434934, + "grad_norm": 0.38268666364407705, + "learning_rate": 5.8347076775843604e-06, + "loss": 0.4303, + "step": 12483 + }, + { + "epoch": 2.050232997351836, + "grad_norm": 0.47765234932548395, + "learning_rate": 5.834259801290795e-06, + "loss": 0.4498, + "step": 12484 + }, + { + "epoch": 2.050397224560179, + "grad_norm": 0.41345370663759506, + "learning_rate": 5.833811910505855e-06, + "loss": 0.4189, + "step": 12485 + }, + { + "epoch": 2.050561451768522, + "grad_norm": 0.4104341797134906, + "learning_rate": 5.833364005234404e-06, + "loss": 0.4538, + "step": 12486 + }, + { + "epoch": 2.0507256789768644, + "grad_norm": 0.31647737223956823, + "learning_rate": 5.83291608548131e-06, + "loss": 0.4407, + "step": 12487 + }, + { + "epoch": 2.050889906185207, + "grad_norm": 0.37302273496194177, + "learning_rate": 5.832468151251435e-06, + "loss": 0.4433, + "step": 12488 + }, + { + "epoch": 2.05105413339355, + "grad_norm": 0.3554326401961096, + "learning_rate": 5.832020202549644e-06, + "loss": 0.4432, + "step": 12489 + }, + { + "epoch": 2.0512183606018928, + "grad_norm": 0.491954272790454, + "learning_rate": 5.831572239380806e-06, + "loss": 0.4564, + "step": 12490 + }, + { + "epoch": 2.0513825878102354, + "grad_norm": 0.3789475522315765, + "learning_rate": 5.831124261749781e-06, + "loss": 0.4495, + "step": 12491 + }, + { + "epoch": 2.051546815018578, + "grad_norm": 0.647424818774864, + "learning_rate": 5.830676269661436e-06, + "loss": 0.4501, + "step": 12492 + }, + { + "epoch": 2.051711042226921, + "grad_norm": 0.5823413883449976, + "learning_rate": 5.830228263120641e-06, + "loss": 0.4213, + "step": 12493 + }, + { + "epoch": 2.0518752694352638, + "grad_norm": 0.27813242543412897, + "learning_rate": 5.829780242132256e-06, + "loss": 0.4397, + "step": 12494 + }, + { + "epoch": 2.0520394966436064, + "grad_norm": 0.3284450588600766, + "learning_rate": 5.829332206701149e-06, + "loss": 0.443, + "step": 12495 + }, + { + "epoch": 2.052203723851949, + "grad_norm": 0.3445192485646923, + "learning_rate": 5.828884156832186e-06, + "loss": 0.4518, + "step": 12496 + }, + { + "epoch": 2.052367951060292, + "grad_norm": 0.549298740102889, + "learning_rate": 5.828436092530235e-06, + "loss": 0.4444, + "step": 12497 + }, + { + "epoch": 2.0525321782686348, + "grad_norm": 0.46014439109326594, + "learning_rate": 5.82798801380016e-06, + "loss": 0.4532, + "step": 12498 + }, + { + "epoch": 2.0526964054769774, + "grad_norm": 0.28368044288710964, + "learning_rate": 5.8275399206468304e-06, + "loss": 0.441, + "step": 12499 + }, + { + "epoch": 2.05286063268532, + "grad_norm": 0.3968388641653914, + "learning_rate": 5.8270918130751085e-06, + "loss": 0.4459, + "step": 12500 + }, + { + "epoch": 2.0530248598936627, + "grad_norm": 0.3710478412314034, + "learning_rate": 5.8266436910898656e-06, + "loss": 0.463, + "step": 12501 + }, + { + "epoch": 2.0531890871020058, + "grad_norm": 0.313924593295772, + "learning_rate": 5.826195554695966e-06, + "loss": 0.4392, + "step": 12502 + }, + { + "epoch": 2.0533533143103484, + "grad_norm": 0.4580347375031826, + "learning_rate": 5.825747403898278e-06, + "loss": 0.4503, + "step": 12503 + }, + { + "epoch": 2.053517541518691, + "grad_norm": 0.4132435728401116, + "learning_rate": 5.825299238701669e-06, + "loss": 0.4559, + "step": 12504 + }, + { + "epoch": 2.0536817687270337, + "grad_norm": 0.36199062801530246, + "learning_rate": 5.824851059111007e-06, + "loss": 0.4483, + "step": 12505 + }, + { + "epoch": 2.0538459959353768, + "grad_norm": 0.4748756143679125, + "learning_rate": 5.824402865131159e-06, + "loss": 0.4575, + "step": 12506 + }, + { + "epoch": 2.0540102231437194, + "grad_norm": 0.3218339347858069, + "learning_rate": 5.823954656766991e-06, + "loss": 0.4504, + "step": 12507 + }, + { + "epoch": 2.054174450352062, + "grad_norm": 0.31580790798058256, + "learning_rate": 5.823506434023374e-06, + "loss": 0.4397, + "step": 12508 + }, + { + "epoch": 2.0543386775604047, + "grad_norm": 0.35353519486956647, + "learning_rate": 5.823058196905177e-06, + "loss": 0.4517, + "step": 12509 + }, + { + "epoch": 2.0545029047687478, + "grad_norm": 0.46529793989703433, + "learning_rate": 5.8226099454172644e-06, + "loss": 0.458, + "step": 12510 + }, + { + "epoch": 2.0546671319770904, + "grad_norm": 0.3205124315397412, + "learning_rate": 5.822161679564506e-06, + "loss": 0.4364, + "step": 12511 + }, + { + "epoch": 2.054831359185433, + "grad_norm": 0.3610849789933956, + "learning_rate": 5.821713399351771e-06, + "loss": 0.4436, + "step": 12512 + }, + { + "epoch": 2.0549955863937757, + "grad_norm": 0.3604424163085528, + "learning_rate": 5.821265104783929e-06, + "loss": 0.4686, + "step": 12513 + }, + { + "epoch": 2.0551598136021187, + "grad_norm": 0.3869786566160952, + "learning_rate": 5.820816795865848e-06, + "loss": 0.4518, + "step": 12514 + }, + { + "epoch": 2.0553240408104614, + "grad_norm": 0.31366177202073153, + "learning_rate": 5.8203684726023965e-06, + "loss": 0.4382, + "step": 12515 + }, + { + "epoch": 2.055488268018804, + "grad_norm": 0.3816416592520499, + "learning_rate": 5.819920134998445e-06, + "loss": 0.4486, + "step": 12516 + }, + { + "epoch": 2.0556524952271467, + "grad_norm": 0.36611323579211497, + "learning_rate": 5.819471783058861e-06, + "loss": 0.4636, + "step": 12517 + }, + { + "epoch": 2.0558167224354893, + "grad_norm": 0.3926812761164695, + "learning_rate": 5.8190234167885164e-06, + "loss": 0.4313, + "step": 12518 + }, + { + "epoch": 2.0559809496438324, + "grad_norm": 0.3325452728082553, + "learning_rate": 5.818575036192279e-06, + "loss": 0.4638, + "step": 12519 + }, + { + "epoch": 2.056145176852175, + "grad_norm": 0.42067798281032054, + "learning_rate": 5.81812664127502e-06, + "loss": 0.4637, + "step": 12520 + }, + { + "epoch": 2.0563094040605177, + "grad_norm": 0.318034404416098, + "learning_rate": 5.817678232041608e-06, + "loss": 0.4283, + "step": 12521 + }, + { + "epoch": 2.0564736312688603, + "grad_norm": 0.48940136510980775, + "learning_rate": 5.817229808496915e-06, + "loss": 0.4436, + "step": 12522 + }, + { + "epoch": 2.0566378584772034, + "grad_norm": 0.2882544661032187, + "learning_rate": 5.816781370645809e-06, + "loss": 0.4309, + "step": 12523 + }, + { + "epoch": 2.056802085685546, + "grad_norm": 0.5042845819900779, + "learning_rate": 5.816332918493164e-06, + "loss": 0.4497, + "step": 12524 + }, + { + "epoch": 2.0569663128938886, + "grad_norm": 0.29557746058877027, + "learning_rate": 5.815884452043846e-06, + "loss": 0.4365, + "step": 12525 + }, + { + "epoch": 2.0571305401022313, + "grad_norm": 0.33012923255531806, + "learning_rate": 5.81543597130273e-06, + "loss": 0.4569, + "step": 12526 + }, + { + "epoch": 2.0572947673105744, + "grad_norm": 0.4815314599818145, + "learning_rate": 5.8149874762746844e-06, + "loss": 0.4455, + "step": 12527 + }, + { + "epoch": 2.057458994518917, + "grad_norm": 0.30529200972710946, + "learning_rate": 5.814538966964581e-06, + "loss": 0.4355, + "step": 12528 + }, + { + "epoch": 2.0576232217272596, + "grad_norm": 0.2781313905407027, + "learning_rate": 5.814090443377291e-06, + "loss": 0.4394, + "step": 12529 + }, + { + "epoch": 2.0577874489356023, + "grad_norm": 0.33271003194262444, + "learning_rate": 5.813641905517687e-06, + "loss": 0.4605, + "step": 12530 + }, + { + "epoch": 2.0579516761439454, + "grad_norm": 0.33716505625152493, + "learning_rate": 5.813193353390637e-06, + "loss": 0.4708, + "step": 12531 + }, + { + "epoch": 2.058115903352288, + "grad_norm": 0.353896302279186, + "learning_rate": 5.812744787001017e-06, + "loss": 0.4373, + "step": 12532 + }, + { + "epoch": 2.0582801305606306, + "grad_norm": 0.58896944714371, + "learning_rate": 5.812296206353696e-06, + "loss": 0.4643, + "step": 12533 + }, + { + "epoch": 2.0584443577689733, + "grad_norm": 0.292337170150739, + "learning_rate": 5.811847611453549e-06, + "loss": 0.4405, + "step": 12534 + }, + { + "epoch": 2.058608584977316, + "grad_norm": 0.29791135402795793, + "learning_rate": 5.811399002305445e-06, + "loss": 0.4583, + "step": 12535 + }, + { + "epoch": 2.058772812185659, + "grad_norm": 0.29900219174218134, + "learning_rate": 5.810950378914256e-06, + "loss": 0.4568, + "step": 12536 + }, + { + "epoch": 2.0589370393940016, + "grad_norm": 0.28608728842480263, + "learning_rate": 5.810501741284858e-06, + "loss": 0.4557, + "step": 12537 + }, + { + "epoch": 2.0591012666023443, + "grad_norm": 0.3254239739716752, + "learning_rate": 5.8100530894221215e-06, + "loss": 0.4335, + "step": 12538 + }, + { + "epoch": 2.059265493810687, + "grad_norm": 1.5228037555250937, + "learning_rate": 5.80960442333092e-06, + "loss": 0.4402, + "step": 12539 + }, + { + "epoch": 2.05942972101903, + "grad_norm": 0.4226075001082411, + "learning_rate": 5.809155743016125e-06, + "loss": 0.4543, + "step": 12540 + }, + { + "epoch": 2.0595939482273726, + "grad_norm": 0.3498474294927714, + "learning_rate": 5.80870704848261e-06, + "loss": 0.4519, + "step": 12541 + }, + { + "epoch": 2.0597581754357153, + "grad_norm": 0.31951437366502333, + "learning_rate": 5.808258339735251e-06, + "loss": 0.4289, + "step": 12542 + }, + { + "epoch": 2.059922402644058, + "grad_norm": 0.3069067592827343, + "learning_rate": 5.807809616778918e-06, + "loss": 0.448, + "step": 12543 + }, + { + "epoch": 2.060086629852401, + "grad_norm": 0.34492891136792414, + "learning_rate": 5.807360879618486e-06, + "loss": 0.4223, + "step": 12544 + }, + { + "epoch": 2.0602508570607436, + "grad_norm": 0.37131805354396374, + "learning_rate": 5.806912128258828e-06, + "loss": 0.4387, + "step": 12545 + }, + { + "epoch": 2.0604150842690863, + "grad_norm": 0.34516089438021247, + "learning_rate": 5.806463362704819e-06, + "loss": 0.4268, + "step": 12546 + }, + { + "epoch": 2.060579311477429, + "grad_norm": 0.34676052883678987, + "learning_rate": 5.806014582961333e-06, + "loss": 0.4409, + "step": 12547 + }, + { + "epoch": 2.060743538685772, + "grad_norm": 0.32879785289901325, + "learning_rate": 5.805565789033244e-06, + "loss": 0.4446, + "step": 12548 + }, + { + "epoch": 2.0609077658941146, + "grad_norm": 0.33290467087799636, + "learning_rate": 5.805116980925425e-06, + "loss": 0.4405, + "step": 12549 + }, + { + "epoch": 2.0610719931024573, + "grad_norm": 0.37078533186888135, + "learning_rate": 5.80466815864275e-06, + "loss": 0.4466, + "step": 12550 + }, + { + "epoch": 2.0612362203108, + "grad_norm": 0.2745242092923762, + "learning_rate": 5.804219322190098e-06, + "loss": 0.4486, + "step": 12551 + }, + { + "epoch": 2.0614004475191425, + "grad_norm": 0.36904796252237204, + "learning_rate": 5.80377047157234e-06, + "loss": 0.4519, + "step": 12552 + }, + { + "epoch": 2.0615646747274856, + "grad_norm": 0.5335812773510453, + "learning_rate": 5.8033216067943515e-06, + "loss": 0.4558, + "step": 12553 + }, + { + "epoch": 2.0617289019358283, + "grad_norm": 0.3053953391869156, + "learning_rate": 5.802872727861009e-06, + "loss": 0.4586, + "step": 12554 + }, + { + "epoch": 2.061893129144171, + "grad_norm": 0.40520225583447983, + "learning_rate": 5.802423834777186e-06, + "loss": 0.4351, + "step": 12555 + }, + { + "epoch": 2.0620573563525135, + "grad_norm": 0.3988187826185149, + "learning_rate": 5.801974927547758e-06, + "loss": 0.4356, + "step": 12556 + }, + { + "epoch": 2.0622215835608566, + "grad_norm": 0.3380322407657435, + "learning_rate": 5.8015260061776024e-06, + "loss": 0.4604, + "step": 12557 + }, + { + "epoch": 2.0623858107691992, + "grad_norm": 0.3348430293607122, + "learning_rate": 5.801077070671595e-06, + "loss": 0.4293, + "step": 12558 + }, + { + "epoch": 2.062550037977542, + "grad_norm": 0.3842683264535958, + "learning_rate": 5.80062812103461e-06, + "loss": 0.4341, + "step": 12559 + }, + { + "epoch": 2.0627142651858845, + "grad_norm": 0.30200025238816425, + "learning_rate": 5.800179157271522e-06, + "loss": 0.4701, + "step": 12560 + }, + { + "epoch": 2.0628784923942276, + "grad_norm": 0.31908377463654947, + "learning_rate": 5.79973017938721e-06, + "loss": 0.4494, + "step": 12561 + }, + { + "epoch": 2.0630427196025702, + "grad_norm": 0.5005507434852071, + "learning_rate": 5.7992811873865496e-06, + "loss": 0.451, + "step": 12562 + }, + { + "epoch": 2.063206946810913, + "grad_norm": 0.366742905796703, + "learning_rate": 5.7988321812744175e-06, + "loss": 0.4317, + "step": 12563 + }, + { + "epoch": 2.0633711740192555, + "grad_norm": 0.5555018777431691, + "learning_rate": 5.798383161055691e-06, + "loss": 0.4484, + "step": 12564 + }, + { + "epoch": 2.0635354012275986, + "grad_norm": 0.3180625989819009, + "learning_rate": 5.797934126735244e-06, + "loss": 0.4381, + "step": 12565 + }, + { + "epoch": 2.0636996284359412, + "grad_norm": 0.32545313266240605, + "learning_rate": 5.797485078317956e-06, + "loss": 0.4591, + "step": 12566 + }, + { + "epoch": 2.063863855644284, + "grad_norm": 0.43023638563585087, + "learning_rate": 5.797036015808704e-06, + "loss": 0.4502, + "step": 12567 + }, + { + "epoch": 2.0640280828526265, + "grad_norm": 0.3400815349047949, + "learning_rate": 5.796586939212365e-06, + "loss": 0.458, + "step": 12568 + }, + { + "epoch": 2.064192310060969, + "grad_norm": 0.34321856939719503, + "learning_rate": 5.796137848533816e-06, + "loss": 0.4243, + "step": 12569 + }, + { + "epoch": 2.0643565372693122, + "grad_norm": 0.3930605011253571, + "learning_rate": 5.795688743777934e-06, + "loss": 0.4503, + "step": 12570 + }, + { + "epoch": 2.064520764477655, + "grad_norm": 0.3441430685584969, + "learning_rate": 5.795239624949597e-06, + "loss": 0.4525, + "step": 12571 + }, + { + "epoch": 2.0646849916859975, + "grad_norm": 0.3092235368191995, + "learning_rate": 5.794790492053685e-06, + "loss": 0.4407, + "step": 12572 + }, + { + "epoch": 2.06484921889434, + "grad_norm": 0.2956042164630155, + "learning_rate": 5.7943413450950745e-06, + "loss": 0.429, + "step": 12573 + }, + { + "epoch": 2.0650134461026832, + "grad_norm": 0.5785693723379479, + "learning_rate": 5.793892184078642e-06, + "loss": 0.4476, + "step": 12574 + }, + { + "epoch": 2.065177673311026, + "grad_norm": 0.4050586734624316, + "learning_rate": 5.793443009009268e-06, + "loss": 0.4385, + "step": 12575 + }, + { + "epoch": 2.0653419005193685, + "grad_norm": 0.44417807478608184, + "learning_rate": 5.792993819891831e-06, + "loss": 0.4655, + "step": 12576 + }, + { + "epoch": 2.065506127727711, + "grad_norm": 0.32094819856464846, + "learning_rate": 5.792544616731208e-06, + "loss": 0.4371, + "step": 12577 + }, + { + "epoch": 2.065670354936054, + "grad_norm": 0.3225322891675443, + "learning_rate": 5.792095399532279e-06, + "loss": 0.4388, + "step": 12578 + }, + { + "epoch": 2.065834582144397, + "grad_norm": 0.3611766104446534, + "learning_rate": 5.791646168299923e-06, + "loss": 0.44, + "step": 12579 + }, + { + "epoch": 2.0659988093527395, + "grad_norm": 0.41265178472084757, + "learning_rate": 5.791196923039019e-06, + "loss": 0.4262, + "step": 12580 + }, + { + "epoch": 2.066163036561082, + "grad_norm": 0.5685535710349041, + "learning_rate": 5.790747663754445e-06, + "loss": 0.4407, + "step": 12581 + }, + { + "epoch": 2.066327263769425, + "grad_norm": 0.4290513029494653, + "learning_rate": 5.790298390451083e-06, + "loss": 0.4375, + "step": 12582 + }, + { + "epoch": 2.066491490977768, + "grad_norm": 0.31193517737081866, + "learning_rate": 5.78984910313381e-06, + "loss": 0.4455, + "step": 12583 + }, + { + "epoch": 2.0666557181861105, + "grad_norm": 0.7311649893085822, + "learning_rate": 5.789399801807506e-06, + "loss": 0.4621, + "step": 12584 + }, + { + "epoch": 2.066819945394453, + "grad_norm": 0.35584981635411755, + "learning_rate": 5.7889504864770525e-06, + "loss": 0.435, + "step": 12585 + }, + { + "epoch": 2.0669841726027958, + "grad_norm": 0.34996905818049884, + "learning_rate": 5.788501157147328e-06, + "loss": 0.4436, + "step": 12586 + }, + { + "epoch": 2.067148399811139, + "grad_norm": 0.2944915229752169, + "learning_rate": 5.788051813823214e-06, + "loss": 0.4583, + "step": 12587 + }, + { + "epoch": 2.0673126270194815, + "grad_norm": 0.34794445160376564, + "learning_rate": 5.78760245650959e-06, + "loss": 0.4608, + "step": 12588 + }, + { + "epoch": 2.067476854227824, + "grad_norm": 0.31060770589110037, + "learning_rate": 5.787153085211336e-06, + "loss": 0.4636, + "step": 12589 + }, + { + "epoch": 2.0676410814361668, + "grad_norm": 0.36268150169000246, + "learning_rate": 5.786703699933333e-06, + "loss": 0.4428, + "step": 12590 + }, + { + "epoch": 2.06780530864451, + "grad_norm": 0.4948601709280069, + "learning_rate": 5.786254300680463e-06, + "loss": 0.4552, + "step": 12591 + }, + { + "epoch": 2.0679695358528525, + "grad_norm": 0.8228867505642475, + "learning_rate": 5.785804887457604e-06, + "loss": 0.4567, + "step": 12592 + }, + { + "epoch": 2.068133763061195, + "grad_norm": 0.33416355670733, + "learning_rate": 5.78535546026964e-06, + "loss": 0.4259, + "step": 12593 + }, + { + "epoch": 2.0682979902695378, + "grad_norm": 0.4689217739481361, + "learning_rate": 5.784906019121451e-06, + "loss": 0.447, + "step": 12594 + }, + { + "epoch": 2.068462217477881, + "grad_norm": 0.32276567150550434, + "learning_rate": 5.784456564017918e-06, + "loss": 0.4617, + "step": 12595 + }, + { + "epoch": 2.0686264446862235, + "grad_norm": 0.3707358256052318, + "learning_rate": 5.784007094963924e-06, + "loss": 0.4451, + "step": 12596 + }, + { + "epoch": 2.068790671894566, + "grad_norm": 0.7082352154662992, + "learning_rate": 5.783557611964349e-06, + "loss": 0.457, + "step": 12597 + }, + { + "epoch": 2.0689548991029088, + "grad_norm": 0.3352628121247371, + "learning_rate": 5.783108115024076e-06, + "loss": 0.4379, + "step": 12598 + }, + { + "epoch": 2.069119126311252, + "grad_norm": 0.33373071703006074, + "learning_rate": 5.782658604147985e-06, + "loss": 0.4269, + "step": 12599 + }, + { + "epoch": 2.0692833535195945, + "grad_norm": 0.3890245550176744, + "learning_rate": 5.78220907934096e-06, + "loss": 0.4238, + "step": 12600 + }, + { + "epoch": 2.069447580727937, + "grad_norm": 0.42551587495552057, + "learning_rate": 5.781759540607884e-06, + "loss": 0.4379, + "step": 12601 + }, + { + "epoch": 2.0696118079362797, + "grad_norm": 0.3848246045704332, + "learning_rate": 5.781309987953638e-06, + "loss": 0.4355, + "step": 12602 + }, + { + "epoch": 2.0697760351446224, + "grad_norm": 0.39510346983773065, + "learning_rate": 5.780860421383105e-06, + "loss": 0.445, + "step": 12603 + }, + { + "epoch": 2.0699402623529655, + "grad_norm": 0.5293818907890222, + "learning_rate": 5.780410840901166e-06, + "loss": 0.4388, + "step": 12604 + }, + { + "epoch": 2.070104489561308, + "grad_norm": 0.43186625564440984, + "learning_rate": 5.779961246512707e-06, + "loss": 0.4424, + "step": 12605 + }, + { + "epoch": 2.0702687167696507, + "grad_norm": 0.35612676312621716, + "learning_rate": 5.77951163822261e-06, + "loss": 0.4324, + "step": 12606 + }, + { + "epoch": 2.0704329439779934, + "grad_norm": 0.44096187888292104, + "learning_rate": 5.779062016035756e-06, + "loss": 0.459, + "step": 12607 + }, + { + "epoch": 2.0705971711863365, + "grad_norm": 0.3641439511112035, + "learning_rate": 5.7786123799570305e-06, + "loss": 0.4621, + "step": 12608 + }, + { + "epoch": 2.070761398394679, + "grad_norm": 0.48210480440260733, + "learning_rate": 5.778162729991317e-06, + "loss": 0.4358, + "step": 12609 + }, + { + "epoch": 2.0709256256030217, + "grad_norm": 0.40744258619042256, + "learning_rate": 5.7777130661435004e-06, + "loss": 0.4298, + "step": 12610 + }, + { + "epoch": 2.0710898528113644, + "grad_norm": 0.2950203895635282, + "learning_rate": 5.777263388418461e-06, + "loss": 0.4276, + "step": 12611 + }, + { + "epoch": 2.0712540800197075, + "grad_norm": 0.3186235372237503, + "learning_rate": 5.776813696821085e-06, + "loss": 0.4577, + "step": 12612 + }, + { + "epoch": 2.07141830722805, + "grad_norm": 0.39173521730263494, + "learning_rate": 5.776363991356255e-06, + "loss": 0.4817, + "step": 12613 + }, + { + "epoch": 2.0715825344363927, + "grad_norm": 0.37345701908829565, + "learning_rate": 5.7759142720288586e-06, + "loss": 0.4396, + "step": 12614 + }, + { + "epoch": 2.0717467616447354, + "grad_norm": 0.6131889982299739, + "learning_rate": 5.775464538843775e-06, + "loss": 0.4282, + "step": 12615 + }, + { + "epoch": 2.0719109888530785, + "grad_norm": 0.4123409725560399, + "learning_rate": 5.775014791805894e-06, + "loss": 0.4483, + "step": 12616 + }, + { + "epoch": 2.072075216061421, + "grad_norm": 0.3181460531036635, + "learning_rate": 5.7745650309200965e-06, + "loss": 0.4546, + "step": 12617 + }, + { + "epoch": 2.0722394432697637, + "grad_norm": 0.34628966447613285, + "learning_rate": 5.77411525619127e-06, + "loss": 0.4405, + "step": 12618 + }, + { + "epoch": 2.0724036704781064, + "grad_norm": 0.32934812441671757, + "learning_rate": 5.773665467624296e-06, + "loss": 0.4236, + "step": 12619 + }, + { + "epoch": 2.072567897686449, + "grad_norm": 0.3320241576727553, + "learning_rate": 5.7732156652240635e-06, + "loss": 0.4419, + "step": 12620 + }, + { + "epoch": 2.072732124894792, + "grad_norm": 0.3751110180170982, + "learning_rate": 5.772765848995457e-06, + "loss": 0.42, + "step": 12621 + }, + { + "epoch": 2.0728963521031347, + "grad_norm": 0.3094726411805009, + "learning_rate": 5.772316018943361e-06, + "loss": 0.452, + "step": 12622 + }, + { + "epoch": 2.0730605793114774, + "grad_norm": 0.38697336585851455, + "learning_rate": 5.771866175072659e-06, + "loss": 0.4264, + "step": 12623 + }, + { + "epoch": 2.07322480651982, + "grad_norm": 0.304288316654996, + "learning_rate": 5.77141631738824e-06, + "loss": 0.4484, + "step": 12624 + }, + { + "epoch": 2.073389033728163, + "grad_norm": 0.482532511700392, + "learning_rate": 5.770966445894991e-06, + "loss": 0.4545, + "step": 12625 + }, + { + "epoch": 2.0735532609365057, + "grad_norm": 0.32262406085428247, + "learning_rate": 5.770516560597794e-06, + "loss": 0.4462, + "step": 12626 + }, + { + "epoch": 2.0737174881448484, + "grad_norm": 0.3325404739646796, + "learning_rate": 5.770066661501538e-06, + "loss": 0.4239, + "step": 12627 + }, + { + "epoch": 2.073881715353191, + "grad_norm": 1.9006644872008762, + "learning_rate": 5.769616748611106e-06, + "loss": 0.4709, + "step": 12628 + }, + { + "epoch": 2.074045942561534, + "grad_norm": 0.3423420126583943, + "learning_rate": 5.769166821931389e-06, + "loss": 0.4424, + "step": 12629 + }, + { + "epoch": 2.0742101697698767, + "grad_norm": 0.29731610057754654, + "learning_rate": 5.7687168814672726e-06, + "loss": 0.4483, + "step": 12630 + }, + { + "epoch": 2.0743743969782193, + "grad_norm": 0.3874440450920287, + "learning_rate": 5.768266927223642e-06, + "loss": 0.4681, + "step": 12631 + }, + { + "epoch": 2.074538624186562, + "grad_norm": 0.3297323496352712, + "learning_rate": 5.767816959205384e-06, + "loss": 0.4278, + "step": 12632 + }, + { + "epoch": 2.074702851394905, + "grad_norm": 0.3816910179675114, + "learning_rate": 5.767366977417386e-06, + "loss": 0.4577, + "step": 12633 + }, + { + "epoch": 2.0748670786032477, + "grad_norm": 0.4387489677520473, + "learning_rate": 5.766916981864536e-06, + "loss": 0.4406, + "step": 12634 + }, + { + "epoch": 2.0750313058115903, + "grad_norm": 0.3323500901032443, + "learning_rate": 5.7664669725517215e-06, + "loss": 0.4474, + "step": 12635 + }, + { + "epoch": 2.075195533019933, + "grad_norm": 0.4094691211689768, + "learning_rate": 5.766016949483831e-06, + "loss": 0.4354, + "step": 12636 + }, + { + "epoch": 2.0753597602282756, + "grad_norm": 0.3247612685603506, + "learning_rate": 5.765566912665748e-06, + "loss": 0.4453, + "step": 12637 + }, + { + "epoch": 2.0755239874366187, + "grad_norm": 0.2992283474155525, + "learning_rate": 5.765116862102365e-06, + "loss": 0.4508, + "step": 12638 + }, + { + "epoch": 2.0756882146449613, + "grad_norm": 0.2833987945361159, + "learning_rate": 5.764666797798569e-06, + "loss": 0.4493, + "step": 12639 + }, + { + "epoch": 2.075852441853304, + "grad_norm": 0.36076810908372076, + "learning_rate": 5.764216719759246e-06, + "loss": 0.4424, + "step": 12640 + }, + { + "epoch": 2.0760166690616466, + "grad_norm": 0.33496405013392627, + "learning_rate": 5.763766627989285e-06, + "loss": 0.4376, + "step": 12641 + }, + { + "epoch": 2.0761808962699897, + "grad_norm": 0.3664945473785977, + "learning_rate": 5.763316522493576e-06, + "loss": 0.4511, + "step": 12642 + }, + { + "epoch": 2.0763451234783323, + "grad_norm": 0.3160831448540927, + "learning_rate": 5.7628664032770066e-06, + "loss": 0.4547, + "step": 12643 + }, + { + "epoch": 2.076509350686675, + "grad_norm": 0.3800779021900539, + "learning_rate": 5.7624162703444655e-06, + "loss": 0.4571, + "step": 12644 + }, + { + "epoch": 2.0766735778950176, + "grad_norm": 0.3096148612282082, + "learning_rate": 5.761966123700843e-06, + "loss": 0.4465, + "step": 12645 + }, + { + "epoch": 2.0768378051033607, + "grad_norm": 0.4700276065412499, + "learning_rate": 5.761515963351024e-06, + "loss": 0.4371, + "step": 12646 + }, + { + "epoch": 2.0770020323117033, + "grad_norm": 0.5717112387339418, + "learning_rate": 5.761065789299902e-06, + "loss": 0.4733, + "step": 12647 + }, + { + "epoch": 2.077166259520046, + "grad_norm": 0.3478262531191078, + "learning_rate": 5.760615601552365e-06, + "loss": 0.4252, + "step": 12648 + }, + { + "epoch": 2.0773304867283886, + "grad_norm": 0.4208026301767626, + "learning_rate": 5.760165400113301e-06, + "loss": 0.4344, + "step": 12649 + }, + { + "epoch": 2.0774947139367317, + "grad_norm": 0.3241596365519559, + "learning_rate": 5.759715184987602e-06, + "loss": 0.4352, + "step": 12650 + }, + { + "epoch": 2.0776589411450743, + "grad_norm": 0.40095490723714017, + "learning_rate": 5.7592649561801576e-06, + "loss": 0.4414, + "step": 12651 + }, + { + "epoch": 2.077823168353417, + "grad_norm": 0.34814818074051035, + "learning_rate": 5.7588147136958555e-06, + "loss": 0.4656, + "step": 12652 + }, + { + "epoch": 2.0779873955617596, + "grad_norm": 0.5480012497939601, + "learning_rate": 5.758364457539587e-06, + "loss": 0.4658, + "step": 12653 + }, + { + "epoch": 2.0781516227701022, + "grad_norm": 0.4022849269106388, + "learning_rate": 5.757914187716242e-06, + "loss": 0.4481, + "step": 12654 + }, + { + "epoch": 2.0783158499784453, + "grad_norm": 0.3296491558960112, + "learning_rate": 5.757463904230713e-06, + "loss": 0.4312, + "step": 12655 + }, + { + "epoch": 2.078480077186788, + "grad_norm": 0.291526171977682, + "learning_rate": 5.757013607087888e-06, + "loss": 0.4658, + "step": 12656 + }, + { + "epoch": 2.0786443043951306, + "grad_norm": 0.3151420850285067, + "learning_rate": 5.756563296292658e-06, + "loss": 0.4162, + "step": 12657 + }, + { + "epoch": 2.0788085316034732, + "grad_norm": 0.7250952171515853, + "learning_rate": 5.756112971849915e-06, + "loss": 0.4579, + "step": 12658 + }, + { + "epoch": 2.0789727588118163, + "grad_norm": 0.323930778535311, + "learning_rate": 5.755662633764549e-06, + "loss": 0.4332, + "step": 12659 + }, + { + "epoch": 2.079136986020159, + "grad_norm": 0.3319897590687082, + "learning_rate": 5.755212282041452e-06, + "loss": 0.4634, + "step": 12660 + }, + { + "epoch": 2.0793012132285016, + "grad_norm": 0.2965645629673636, + "learning_rate": 5.754761916685515e-06, + "loss": 0.4494, + "step": 12661 + }, + { + "epoch": 2.0794654404368442, + "grad_norm": 0.4789465039629421, + "learning_rate": 5.754311537701626e-06, + "loss": 0.4645, + "step": 12662 + }, + { + "epoch": 2.0796296676451873, + "grad_norm": 0.3051466720049543, + "learning_rate": 5.753861145094682e-06, + "loss": 0.4367, + "step": 12663 + }, + { + "epoch": 2.07979389485353, + "grad_norm": 0.34364344780338235, + "learning_rate": 5.753410738869573e-06, + "loss": 0.4287, + "step": 12664 + }, + { + "epoch": 2.0799581220618726, + "grad_norm": 0.2942069240384221, + "learning_rate": 5.75296031903119e-06, + "loss": 0.4331, + "step": 12665 + }, + { + "epoch": 2.080122349270215, + "grad_norm": 0.4048541793023706, + "learning_rate": 5.752509885584423e-06, + "loss": 0.4461, + "step": 12666 + }, + { + "epoch": 2.0802865764785583, + "grad_norm": 0.3473334256844807, + "learning_rate": 5.752059438534168e-06, + "loss": 0.4515, + "step": 12667 + }, + { + "epoch": 2.080450803686901, + "grad_norm": 0.33238636058550425, + "learning_rate": 5.751608977885315e-06, + "loss": 0.4528, + "step": 12668 + }, + { + "epoch": 2.0806150308952436, + "grad_norm": 0.3399610070481964, + "learning_rate": 5.751158503642758e-06, + "loss": 0.4663, + "step": 12669 + }, + { + "epoch": 2.080779258103586, + "grad_norm": 0.44808401171722956, + "learning_rate": 5.750708015811389e-06, + "loss": 0.4313, + "step": 12670 + }, + { + "epoch": 2.080943485311929, + "grad_norm": 0.30559942252118855, + "learning_rate": 5.7502575143960985e-06, + "loss": 0.4576, + "step": 12671 + }, + { + "epoch": 2.081107712520272, + "grad_norm": 0.29963645365849745, + "learning_rate": 5.749806999401783e-06, + "loss": 0.4201, + "step": 12672 + }, + { + "epoch": 2.0812719397286146, + "grad_norm": 0.30932198493791446, + "learning_rate": 5.7493564708333324e-06, + "loss": 0.4507, + "step": 12673 + }, + { + "epoch": 2.081436166936957, + "grad_norm": 0.38662130188894195, + "learning_rate": 5.748905928695643e-06, + "loss": 0.4598, + "step": 12674 + }, + { + "epoch": 2.0816003941453, + "grad_norm": 0.3130379513175994, + "learning_rate": 5.748455372993606e-06, + "loss": 0.4262, + "step": 12675 + }, + { + "epoch": 2.081764621353643, + "grad_norm": 0.28607888291252975, + "learning_rate": 5.748004803732115e-06, + "loss": 0.4517, + "step": 12676 + }, + { + "epoch": 2.0819288485619856, + "grad_norm": 0.2895532108578058, + "learning_rate": 5.747554220916065e-06, + "loss": 0.4261, + "step": 12677 + }, + { + "epoch": 2.082093075770328, + "grad_norm": 0.4363820417166602, + "learning_rate": 5.747103624550347e-06, + "loss": 0.4455, + "step": 12678 + }, + { + "epoch": 2.082257302978671, + "grad_norm": 0.35167816630302146, + "learning_rate": 5.7466530146398576e-06, + "loss": 0.4309, + "step": 12679 + }, + { + "epoch": 2.082421530187014, + "grad_norm": 0.3577392213525, + "learning_rate": 5.746202391189488e-06, + "loss": 0.4519, + "step": 12680 + }, + { + "epoch": 2.0825857573953566, + "grad_norm": 0.33848902980111123, + "learning_rate": 5.745751754204137e-06, + "loss": 0.4325, + "step": 12681 + }, + { + "epoch": 2.082749984603699, + "grad_norm": 0.3605086606372533, + "learning_rate": 5.7453011036886955e-06, + "loss": 0.4356, + "step": 12682 + }, + { + "epoch": 2.082914211812042, + "grad_norm": 0.35835761281442624, + "learning_rate": 5.744850439648058e-06, + "loss": 0.4476, + "step": 12683 + }, + { + "epoch": 2.083078439020385, + "grad_norm": 0.29445262431486496, + "learning_rate": 5.744399762087121e-06, + "loss": 0.4385, + "step": 12684 + }, + { + "epoch": 2.0832426662287276, + "grad_norm": 0.2838943444170344, + "learning_rate": 5.7439490710107785e-06, + "loss": 0.4305, + "step": 12685 + }, + { + "epoch": 2.08340689343707, + "grad_norm": 0.39643038537134845, + "learning_rate": 5.743498366423923e-06, + "loss": 0.4353, + "step": 12686 + }, + { + "epoch": 2.083571120645413, + "grad_norm": 0.26822456161375324, + "learning_rate": 5.743047648331453e-06, + "loss": 0.4378, + "step": 12687 + }, + { + "epoch": 2.0837353478537555, + "grad_norm": 0.3759392038585772, + "learning_rate": 5.742596916738263e-06, + "loss": 0.4605, + "step": 12688 + }, + { + "epoch": 2.0838995750620986, + "grad_norm": 0.39420292932984524, + "learning_rate": 5.742146171649249e-06, + "loss": 0.4342, + "step": 12689 + }, + { + "epoch": 2.084063802270441, + "grad_norm": 0.4549535925788061, + "learning_rate": 5.741695413069304e-06, + "loss": 0.4289, + "step": 12690 + }, + { + "epoch": 2.084228029478784, + "grad_norm": 0.3253546959983401, + "learning_rate": 5.741244641003325e-06, + "loss": 0.439, + "step": 12691 + }, + { + "epoch": 2.0843922566871265, + "grad_norm": 0.2965903307460332, + "learning_rate": 5.740793855456207e-06, + "loss": 0.4389, + "step": 12692 + }, + { + "epoch": 2.0845564838954695, + "grad_norm": 0.32124029350309213, + "learning_rate": 5.74034305643285e-06, + "loss": 0.4189, + "step": 12693 + }, + { + "epoch": 2.084720711103812, + "grad_norm": 0.3386294509403138, + "learning_rate": 5.7398922439381455e-06, + "loss": 0.4544, + "step": 12694 + }, + { + "epoch": 2.084884938312155, + "grad_norm": 0.316581751906967, + "learning_rate": 5.739441417976991e-06, + "loss": 0.4422, + "step": 12695 + }, + { + "epoch": 2.0850491655204975, + "grad_norm": 0.3995425057420119, + "learning_rate": 5.738990578554283e-06, + "loss": 0.435, + "step": 12696 + }, + { + "epoch": 2.0852133927288405, + "grad_norm": 0.41720524884760174, + "learning_rate": 5.73853972567492e-06, + "loss": 0.4638, + "step": 12697 + }, + { + "epoch": 2.085377619937183, + "grad_norm": 0.34690086468609344, + "learning_rate": 5.738088859343795e-06, + "loss": 0.4625, + "step": 12698 + }, + { + "epoch": 2.085541847145526, + "grad_norm": 0.3311064860378474, + "learning_rate": 5.737637979565808e-06, + "loss": 0.4471, + "step": 12699 + }, + { + "epoch": 2.0857060743538685, + "grad_norm": 0.3264983757330153, + "learning_rate": 5.737187086345854e-06, + "loss": 0.4497, + "step": 12700 + }, + { + "epoch": 2.0858703015622115, + "grad_norm": 0.44699065895405937, + "learning_rate": 5.736736179688833e-06, + "loss": 0.4485, + "step": 12701 + }, + { + "epoch": 2.086034528770554, + "grad_norm": 0.34545500806939633, + "learning_rate": 5.736285259599639e-06, + "loss": 0.4479, + "step": 12702 + }, + { + "epoch": 2.086198755978897, + "grad_norm": 0.39482652743473867, + "learning_rate": 5.73583432608317e-06, + "loss": 0.4422, + "step": 12703 + }, + { + "epoch": 2.0863629831872395, + "grad_norm": 0.341075659467438, + "learning_rate": 5.735383379144325e-06, + "loss": 0.4414, + "step": 12704 + }, + { + "epoch": 2.086527210395582, + "grad_norm": 0.3309139894092436, + "learning_rate": 5.734932418788001e-06, + "loss": 0.4444, + "step": 12705 + }, + { + "epoch": 2.086691437603925, + "grad_norm": 0.3247661891625629, + "learning_rate": 5.734481445019097e-06, + "loss": 0.4396, + "step": 12706 + }, + { + "epoch": 2.086855664812268, + "grad_norm": 0.326532593618532, + "learning_rate": 5.734030457842508e-06, + "loss": 0.4546, + "step": 12707 + }, + { + "epoch": 2.0870198920206104, + "grad_norm": 0.3922216807175495, + "learning_rate": 5.733579457263135e-06, + "loss": 0.4629, + "step": 12708 + }, + { + "epoch": 2.087184119228953, + "grad_norm": 0.394354893553275, + "learning_rate": 5.7331284432858755e-06, + "loss": 0.4522, + "step": 12709 + }, + { + "epoch": 2.087348346437296, + "grad_norm": 0.402018882323647, + "learning_rate": 5.7326774159156275e-06, + "loss": 0.4489, + "step": 12710 + }, + { + "epoch": 2.087512573645639, + "grad_norm": 0.33228228472187143, + "learning_rate": 5.73222637515729e-06, + "loss": 0.4263, + "step": 12711 + }, + { + "epoch": 2.0876768008539814, + "grad_norm": 0.4159735965654368, + "learning_rate": 5.731775321015762e-06, + "loss": 0.4548, + "step": 12712 + }, + { + "epoch": 2.087841028062324, + "grad_norm": 0.4843775098777932, + "learning_rate": 5.731324253495942e-06, + "loss": 0.4442, + "step": 12713 + }, + { + "epoch": 2.088005255270667, + "grad_norm": 0.33755449261044895, + "learning_rate": 5.730873172602731e-06, + "loss": 0.4554, + "step": 12714 + }, + { + "epoch": 2.08816948247901, + "grad_norm": 0.4117674329873463, + "learning_rate": 5.730422078341024e-06, + "loss": 0.4238, + "step": 12715 + }, + { + "epoch": 2.0883337096873524, + "grad_norm": 0.3001693197707603, + "learning_rate": 5.729970970715722e-06, + "loss": 0.4425, + "step": 12716 + }, + { + "epoch": 2.088497936895695, + "grad_norm": 0.3338517787117876, + "learning_rate": 5.729519849731726e-06, + "loss": 0.4456, + "step": 12717 + }, + { + "epoch": 2.088662164104038, + "grad_norm": 0.42389755185789696, + "learning_rate": 5.729068715393936e-06, + "loss": 0.4574, + "step": 12718 + }, + { + "epoch": 2.088826391312381, + "grad_norm": 0.44123544041308665, + "learning_rate": 5.72861756770725e-06, + "loss": 0.4652, + "step": 12719 + }, + { + "epoch": 2.0889906185207234, + "grad_norm": 0.35369014153572237, + "learning_rate": 5.7281664066765675e-06, + "loss": 0.4426, + "step": 12720 + }, + { + "epoch": 2.089154845729066, + "grad_norm": 0.3249641567404478, + "learning_rate": 5.727715232306789e-06, + "loss": 0.4383, + "step": 12721 + }, + { + "epoch": 2.0893190729374087, + "grad_norm": 0.32121212235146046, + "learning_rate": 5.727264044602817e-06, + "loss": 0.4364, + "step": 12722 + }, + { + "epoch": 2.089483300145752, + "grad_norm": 0.3503459484904393, + "learning_rate": 5.72681284356955e-06, + "loss": 0.454, + "step": 12723 + }, + { + "epoch": 2.0896475273540944, + "grad_norm": 0.3292701887707263, + "learning_rate": 5.726361629211887e-06, + "loss": 0.4371, + "step": 12724 + }, + { + "epoch": 2.089811754562437, + "grad_norm": 0.29747395088568335, + "learning_rate": 5.7259104015347315e-06, + "loss": 0.4453, + "step": 12725 + }, + { + "epoch": 2.0899759817707797, + "grad_norm": 0.34403589806070617, + "learning_rate": 5.725459160542981e-06, + "loss": 0.433, + "step": 12726 + }, + { + "epoch": 2.090140208979123, + "grad_norm": 0.3846893582906354, + "learning_rate": 5.72500790624154e-06, + "loss": 0.4345, + "step": 12727 + }, + { + "epoch": 2.0903044361874654, + "grad_norm": 0.3406838298169791, + "learning_rate": 5.724556638635308e-06, + "loss": 0.4279, + "step": 12728 + }, + { + "epoch": 2.090468663395808, + "grad_norm": 0.4079214046800453, + "learning_rate": 5.724105357729185e-06, + "loss": 0.4468, + "step": 12729 + }, + { + "epoch": 2.0906328906041507, + "grad_norm": 0.3143000270751584, + "learning_rate": 5.723654063528074e-06, + "loss": 0.4315, + "step": 12730 + }, + { + "epoch": 2.0907971178124938, + "grad_norm": 0.32459926896482116, + "learning_rate": 5.723202756036876e-06, + "loss": 0.4371, + "step": 12731 + }, + { + "epoch": 2.0909613450208364, + "grad_norm": 0.31723359855263167, + "learning_rate": 5.722751435260493e-06, + "loss": 0.4486, + "step": 12732 + }, + { + "epoch": 2.091125572229179, + "grad_norm": 0.3074426863744088, + "learning_rate": 5.7223001012038254e-06, + "loss": 0.4255, + "step": 12733 + }, + { + "epoch": 2.0912897994375217, + "grad_norm": 0.3509154461452414, + "learning_rate": 5.721848753871777e-06, + "loss": 0.4373, + "step": 12734 + }, + { + "epoch": 2.0914540266458648, + "grad_norm": 0.35720034447561816, + "learning_rate": 5.721397393269249e-06, + "loss": 0.4243, + "step": 12735 + }, + { + "epoch": 2.0916182538542074, + "grad_norm": 0.3614472189732083, + "learning_rate": 5.720946019401143e-06, + "loss": 0.4608, + "step": 12736 + }, + { + "epoch": 2.09178248106255, + "grad_norm": 0.332457995357455, + "learning_rate": 5.720494632272363e-06, + "loss": 0.4511, + "step": 12737 + }, + { + "epoch": 2.0919467082708927, + "grad_norm": 0.4137143471345643, + "learning_rate": 5.720043231887808e-06, + "loss": 0.4487, + "step": 12738 + }, + { + "epoch": 2.0921109354792353, + "grad_norm": 0.30603264772356137, + "learning_rate": 5.719591818252387e-06, + "loss": 0.4734, + "step": 12739 + }, + { + "epoch": 2.0922751626875784, + "grad_norm": 0.40363108857332247, + "learning_rate": 5.719140391370996e-06, + "loss": 0.4526, + "step": 12740 + }, + { + "epoch": 2.092439389895921, + "grad_norm": 0.38789927188689594, + "learning_rate": 5.718688951248541e-06, + "loss": 0.4449, + "step": 12741 + }, + { + "epoch": 2.0926036171042637, + "grad_norm": 0.3309222451571833, + "learning_rate": 5.718237497889926e-06, + "loss": 0.4547, + "step": 12742 + }, + { + "epoch": 2.0927678443126063, + "grad_norm": 0.4855079010515181, + "learning_rate": 5.717786031300054e-06, + "loss": 0.4249, + "step": 12743 + }, + { + "epoch": 2.0929320715209494, + "grad_norm": 0.38233922938650783, + "learning_rate": 5.717334551483825e-06, + "loss": 0.4391, + "step": 12744 + }, + { + "epoch": 2.093096298729292, + "grad_norm": 0.3406979729307253, + "learning_rate": 5.716883058446147e-06, + "loss": 0.4369, + "step": 12745 + }, + { + "epoch": 2.0932605259376347, + "grad_norm": 0.30944525643888987, + "learning_rate": 5.716431552191921e-06, + "loss": 0.4251, + "step": 12746 + }, + { + "epoch": 2.0934247531459773, + "grad_norm": 0.31735897928397955, + "learning_rate": 5.7159800327260525e-06, + "loss": 0.4425, + "step": 12747 + }, + { + "epoch": 2.0935889803543204, + "grad_norm": 0.5216526947925395, + "learning_rate": 5.715528500053444e-06, + "loss": 0.442, + "step": 12748 + }, + { + "epoch": 2.093753207562663, + "grad_norm": 0.47185098877306997, + "learning_rate": 5.715076954178999e-06, + "loss": 0.4323, + "step": 12749 + }, + { + "epoch": 2.0939174347710057, + "grad_norm": 0.4193173428927055, + "learning_rate": 5.714625395107623e-06, + "loss": 0.4698, + "step": 12750 + }, + { + "epoch": 2.0940816619793483, + "grad_norm": 0.6124497113365915, + "learning_rate": 5.714173822844221e-06, + "loss": 0.4431, + "step": 12751 + }, + { + "epoch": 2.0942458891876914, + "grad_norm": 0.293266143258921, + "learning_rate": 5.713722237393696e-06, + "loss": 0.4421, + "step": 12752 + }, + { + "epoch": 2.094410116396034, + "grad_norm": 0.6165608268152939, + "learning_rate": 5.713270638760955e-06, + "loss": 0.4347, + "step": 12753 + }, + { + "epoch": 2.0945743436043767, + "grad_norm": 0.3047436508071504, + "learning_rate": 5.7128190269508995e-06, + "loss": 0.457, + "step": 12754 + }, + { + "epoch": 2.0947385708127193, + "grad_norm": 0.3668284014045656, + "learning_rate": 5.712367401968436e-06, + "loss": 0.4328, + "step": 12755 + }, + { + "epoch": 2.094902798021062, + "grad_norm": 0.33986561289705924, + "learning_rate": 5.711915763818472e-06, + "loss": 0.4687, + "step": 12756 + }, + { + "epoch": 2.095067025229405, + "grad_norm": 0.3706800130778407, + "learning_rate": 5.711464112505909e-06, + "loss": 0.4489, + "step": 12757 + }, + { + "epoch": 2.0952312524377477, + "grad_norm": 0.39079281718805936, + "learning_rate": 5.711012448035652e-06, + "loss": 0.4633, + "step": 12758 + }, + { + "epoch": 2.0953954796460903, + "grad_norm": 0.32449038886318793, + "learning_rate": 5.710560770412611e-06, + "loss": 0.4216, + "step": 12759 + }, + { + "epoch": 2.095559706854433, + "grad_norm": 0.3466183057361125, + "learning_rate": 5.710109079641688e-06, + "loss": 0.4291, + "step": 12760 + }, + { + "epoch": 2.095723934062776, + "grad_norm": 0.3446100118332191, + "learning_rate": 5.70965737572779e-06, + "loss": 0.4605, + "step": 12761 + }, + { + "epoch": 2.0958881612711187, + "grad_norm": 0.3495938088505321, + "learning_rate": 5.7092056586758225e-06, + "loss": 0.4442, + "step": 12762 + }, + { + "epoch": 2.0960523884794613, + "grad_norm": 0.36872695332068844, + "learning_rate": 5.708753928490691e-06, + "loss": 0.459, + "step": 12763 + }, + { + "epoch": 2.096216615687804, + "grad_norm": 0.372112203602689, + "learning_rate": 5.708302185177304e-06, + "loss": 0.4351, + "step": 12764 + }, + { + "epoch": 2.096380842896147, + "grad_norm": 0.31392433890046545, + "learning_rate": 5.707850428740565e-06, + "loss": 0.4495, + "step": 12765 + }, + { + "epoch": 2.0965450701044896, + "grad_norm": 0.307439405862085, + "learning_rate": 5.707398659185383e-06, + "loss": 0.4327, + "step": 12766 + }, + { + "epoch": 2.0967092973128323, + "grad_norm": 0.41439290769866416, + "learning_rate": 5.706946876516664e-06, + "loss": 0.4503, + "step": 12767 + }, + { + "epoch": 2.096873524521175, + "grad_norm": 0.3074612197500893, + "learning_rate": 5.706495080739314e-06, + "loss": 0.4367, + "step": 12768 + }, + { + "epoch": 2.097037751729518, + "grad_norm": 0.5222822521040957, + "learning_rate": 5.706043271858241e-06, + "loss": 0.416, + "step": 12769 + }, + { + "epoch": 2.0972019789378606, + "grad_norm": 0.34805519039750665, + "learning_rate": 5.70559144987835e-06, + "loss": 0.4527, + "step": 12770 + }, + { + "epoch": 2.0973662061462033, + "grad_norm": 0.3251807292612414, + "learning_rate": 5.7051396148045514e-06, + "loss": 0.4297, + "step": 12771 + }, + { + "epoch": 2.097530433354546, + "grad_norm": 0.30016213451186896, + "learning_rate": 5.70468776664175e-06, + "loss": 0.4301, + "step": 12772 + }, + { + "epoch": 2.0976946605628886, + "grad_norm": 0.3500767429575023, + "learning_rate": 5.704235905394855e-06, + "loss": 0.4569, + "step": 12773 + }, + { + "epoch": 2.0978588877712316, + "grad_norm": 0.44306698843867504, + "learning_rate": 5.703784031068771e-06, + "loss": 0.4474, + "step": 12774 + }, + { + "epoch": 2.0980231149795743, + "grad_norm": 0.35216126579915613, + "learning_rate": 5.703332143668409e-06, + "loss": 0.4296, + "step": 12775 + }, + { + "epoch": 2.098187342187917, + "grad_norm": 0.3069250697540145, + "learning_rate": 5.702880243198678e-06, + "loss": 0.4239, + "step": 12776 + }, + { + "epoch": 2.0983515693962596, + "grad_norm": 0.3491742333738244, + "learning_rate": 5.7024283296644825e-06, + "loss": 0.4424, + "step": 12777 + }, + { + "epoch": 2.0985157966046026, + "grad_norm": 0.3532745513440168, + "learning_rate": 5.701976403070732e-06, + "loss": 0.4547, + "step": 12778 + }, + { + "epoch": 2.0986800238129453, + "grad_norm": 0.6853319161198199, + "learning_rate": 5.701524463422336e-06, + "loss": 0.442, + "step": 12779 + }, + { + "epoch": 2.098844251021288, + "grad_norm": 0.31164913944888184, + "learning_rate": 5.701072510724201e-06, + "loss": 0.4422, + "step": 12780 + }, + { + "epoch": 2.0990084782296305, + "grad_norm": 0.33442415700253836, + "learning_rate": 5.700620544981238e-06, + "loss": 0.4329, + "step": 12781 + }, + { + "epoch": 2.0991727054379736, + "grad_norm": 0.29691650447931556, + "learning_rate": 5.7001685661983545e-06, + "loss": 0.4464, + "step": 12782 + }, + { + "epoch": 2.0993369326463163, + "grad_norm": 0.4273272208437839, + "learning_rate": 5.699716574380459e-06, + "loss": 0.4387, + "step": 12783 + }, + { + "epoch": 2.099501159854659, + "grad_norm": 0.3664218978422905, + "learning_rate": 5.699264569532461e-06, + "loss": 0.4407, + "step": 12784 + }, + { + "epoch": 2.0996653870630015, + "grad_norm": 0.2972060184677094, + "learning_rate": 5.698812551659271e-06, + "loss": 0.431, + "step": 12785 + }, + { + "epoch": 2.0998296142713446, + "grad_norm": 0.3639133252374928, + "learning_rate": 5.698360520765798e-06, + "loss": 0.4439, + "step": 12786 + }, + { + "epoch": 2.0999938414796873, + "grad_norm": 0.30413518112204113, + "learning_rate": 5.697908476856948e-06, + "loss": 0.4441, + "step": 12787 + }, + { + "epoch": 2.10015806868803, + "grad_norm": 0.34623216367586335, + "learning_rate": 5.697456419937635e-06, + "loss": 0.4718, + "step": 12788 + }, + { + "epoch": 2.1003222958963725, + "grad_norm": 0.4154051288309667, + "learning_rate": 5.697004350012767e-06, + "loss": 0.4382, + "step": 12789 + }, + { + "epoch": 2.100486523104715, + "grad_norm": 0.33461676750805575, + "learning_rate": 5.696552267087253e-06, + "loss": 0.4585, + "step": 12790 + }, + { + "epoch": 2.1006507503130583, + "grad_norm": 0.4078581820722512, + "learning_rate": 5.696100171166006e-06, + "loss": 0.4295, + "step": 12791 + }, + { + "epoch": 2.100814977521401, + "grad_norm": 0.28825390150215113, + "learning_rate": 5.695648062253933e-06, + "loss": 0.4289, + "step": 12792 + }, + { + "epoch": 2.1009792047297435, + "grad_norm": 0.43816165937186574, + "learning_rate": 5.695195940355946e-06, + "loss": 0.4544, + "step": 12793 + }, + { + "epoch": 2.101143431938086, + "grad_norm": 0.5703413364880195, + "learning_rate": 5.694743805476955e-06, + "loss": 0.4567, + "step": 12794 + }, + { + "epoch": 2.1013076591464293, + "grad_norm": 0.35355716963073625, + "learning_rate": 5.69429165762187e-06, + "loss": 0.4656, + "step": 12795 + }, + { + "epoch": 2.101471886354772, + "grad_norm": 1.0919983683018712, + "learning_rate": 5.693839496795605e-06, + "loss": 0.4431, + "step": 12796 + }, + { + "epoch": 2.1016361135631145, + "grad_norm": 0.33052976505924586, + "learning_rate": 5.6933873230030665e-06, + "loss": 0.4374, + "step": 12797 + }, + { + "epoch": 2.101800340771457, + "grad_norm": 1.522094268884421, + "learning_rate": 5.692935136249169e-06, + "loss": 0.4613, + "step": 12798 + }, + { + "epoch": 2.1019645679798002, + "grad_norm": 0.35010018307879287, + "learning_rate": 5.692482936538821e-06, + "loss": 0.4227, + "step": 12799 + }, + { + "epoch": 2.102128795188143, + "grad_norm": 0.3341888208412005, + "learning_rate": 5.692030723876934e-06, + "loss": 0.4534, + "step": 12800 + }, + { + "epoch": 2.1022930223964855, + "grad_norm": 0.4399329112161016, + "learning_rate": 5.691578498268423e-06, + "loss": 0.4605, + "step": 12801 + }, + { + "epoch": 2.102457249604828, + "grad_norm": 0.4055705345962353, + "learning_rate": 5.691126259718197e-06, + "loss": 0.458, + "step": 12802 + }, + { + "epoch": 2.1026214768131712, + "grad_norm": 0.312902913896997, + "learning_rate": 5.690674008231166e-06, + "loss": 0.4495, + "step": 12803 + }, + { + "epoch": 2.102785704021514, + "grad_norm": 0.3318988567653643, + "learning_rate": 5.690221743812244e-06, + "loss": 0.4626, + "step": 12804 + }, + { + "epoch": 2.1029499312298565, + "grad_norm": 0.34101958993527215, + "learning_rate": 5.689769466466344e-06, + "loss": 0.4202, + "step": 12805 + }, + { + "epoch": 2.103114158438199, + "grad_norm": 0.30553517046334017, + "learning_rate": 5.689317176198377e-06, + "loss": 0.448, + "step": 12806 + }, + { + "epoch": 2.103278385646542, + "grad_norm": 0.29574474153044406, + "learning_rate": 5.688864873013256e-06, + "loss": 0.4267, + "step": 12807 + }, + { + "epoch": 2.103442612854885, + "grad_norm": 0.4147066715116571, + "learning_rate": 5.688412556915891e-06, + "loss": 0.4351, + "step": 12808 + }, + { + "epoch": 2.1036068400632275, + "grad_norm": 0.3180074558597533, + "learning_rate": 5.687960227911197e-06, + "loss": 0.434, + "step": 12809 + }, + { + "epoch": 2.10377106727157, + "grad_norm": 0.3715894218391581, + "learning_rate": 5.6875078860040854e-06, + "loss": 0.4319, + "step": 12810 + }, + { + "epoch": 2.103935294479913, + "grad_norm": 0.3526111909409907, + "learning_rate": 5.687055531199473e-06, + "loss": 0.4302, + "step": 12811 + }, + { + "epoch": 2.104099521688256, + "grad_norm": 0.3747430339445163, + "learning_rate": 5.6866031635022664e-06, + "loss": 0.4376, + "step": 12812 + }, + { + "epoch": 2.1042637488965985, + "grad_norm": 0.4540276918521262, + "learning_rate": 5.686150782917382e-06, + "loss": 0.4383, + "step": 12813 + }, + { + "epoch": 2.104427976104941, + "grad_norm": 0.3795862762748715, + "learning_rate": 5.685698389449735e-06, + "loss": 0.4659, + "step": 12814 + }, + { + "epoch": 2.104592203313284, + "grad_norm": 0.41673579474541966, + "learning_rate": 5.685245983104235e-06, + "loss": 0.4393, + "step": 12815 + }, + { + "epoch": 2.104756430521627, + "grad_norm": 0.3116492369443997, + "learning_rate": 5.684793563885799e-06, + "loss": 0.4574, + "step": 12816 + }, + { + "epoch": 2.1049206577299695, + "grad_norm": 0.3399815985478469, + "learning_rate": 5.684341131799338e-06, + "loss": 0.4369, + "step": 12817 + }, + { + "epoch": 2.105084884938312, + "grad_norm": 0.5281393777848307, + "learning_rate": 5.683888686849769e-06, + "loss": 0.4143, + "step": 12818 + }, + { + "epoch": 2.1052491121466548, + "grad_norm": 0.3429045769350071, + "learning_rate": 5.6834362290420015e-06, + "loss": 0.4436, + "step": 12819 + }, + { + "epoch": 2.105413339354998, + "grad_norm": 0.4486855426294437, + "learning_rate": 5.682983758380955e-06, + "loss": 0.438, + "step": 12820 + }, + { + "epoch": 2.1055775665633405, + "grad_norm": 0.29769448844466256, + "learning_rate": 5.682531274871538e-06, + "loss": 0.4387, + "step": 12821 + }, + { + "epoch": 2.105741793771683, + "grad_norm": 0.35663890431486234, + "learning_rate": 5.68207877851867e-06, + "loss": 0.4351, + "step": 12822 + }, + { + "epoch": 2.1059060209800258, + "grad_norm": 0.3455155676747153, + "learning_rate": 5.6816262693272625e-06, + "loss": 0.4459, + "step": 12823 + }, + { + "epoch": 2.1060702481883684, + "grad_norm": 0.3476365547950273, + "learning_rate": 5.681173747302231e-06, + "loss": 0.437, + "step": 12824 + }, + { + "epoch": 2.1062344753967115, + "grad_norm": 0.31649578705530995, + "learning_rate": 5.680721212448492e-06, + "loss": 0.4493, + "step": 12825 + }, + { + "epoch": 2.106398702605054, + "grad_norm": 0.3246652992220023, + "learning_rate": 5.680268664770957e-06, + "loss": 0.4527, + "step": 12826 + }, + { + "epoch": 2.1065629298133968, + "grad_norm": 0.3578087030598692, + "learning_rate": 5.679816104274546e-06, + "loss": 0.4339, + "step": 12827 + }, + { + "epoch": 2.1067271570217394, + "grad_norm": 0.4092237618364479, + "learning_rate": 5.679363530964167e-06, + "loss": 0.447, + "step": 12828 + }, + { + "epoch": 2.1068913842300825, + "grad_norm": 0.3361602929137724, + "learning_rate": 5.678910944844742e-06, + "loss": 0.4472, + "step": 12829 + }, + { + "epoch": 2.107055611438425, + "grad_norm": 0.38061197908341743, + "learning_rate": 5.6784583459211855e-06, + "loss": 0.437, + "step": 12830 + }, + { + "epoch": 2.1072198386467678, + "grad_norm": 0.33944842725725227, + "learning_rate": 5.678005734198412e-06, + "loss": 0.4142, + "step": 12831 + }, + { + "epoch": 2.1073840658551104, + "grad_norm": 0.27748281160012517, + "learning_rate": 5.677553109681335e-06, + "loss": 0.4437, + "step": 12832 + }, + { + "epoch": 2.1075482930634535, + "grad_norm": 0.493159606649079, + "learning_rate": 5.677100472374873e-06, + "loss": 0.4537, + "step": 12833 + }, + { + "epoch": 2.107712520271796, + "grad_norm": 0.33143414634234014, + "learning_rate": 5.676647822283942e-06, + "loss": 0.4473, + "step": 12834 + }, + { + "epoch": 2.1078767474801388, + "grad_norm": 0.33655801079061937, + "learning_rate": 5.67619515941346e-06, + "loss": 0.439, + "step": 12835 + }, + { + "epoch": 2.1080409746884814, + "grad_norm": 0.30323548739001926, + "learning_rate": 5.675742483768339e-06, + "loss": 0.4321, + "step": 12836 + }, + { + "epoch": 2.1082052018968245, + "grad_norm": 0.3308812641204237, + "learning_rate": 5.675289795353498e-06, + "loss": 0.4344, + "step": 12837 + }, + { + "epoch": 2.108369429105167, + "grad_norm": 0.3358438917250515, + "learning_rate": 5.674837094173854e-06, + "loss": 0.4495, + "step": 12838 + }, + { + "epoch": 2.1085336563135098, + "grad_norm": 0.4482835836763625, + "learning_rate": 5.674384380234323e-06, + "loss": 0.4488, + "step": 12839 + }, + { + "epoch": 2.1086978835218524, + "grad_norm": 0.29048868571265907, + "learning_rate": 5.673931653539824e-06, + "loss": 0.4486, + "step": 12840 + }, + { + "epoch": 2.108862110730195, + "grad_norm": 0.4323304845732, + "learning_rate": 5.6734789140952695e-06, + "loss": 0.4321, + "step": 12841 + }, + { + "epoch": 2.109026337938538, + "grad_norm": 0.30474831684058123, + "learning_rate": 5.673026161905581e-06, + "loss": 0.4372, + "step": 12842 + }, + { + "epoch": 2.1091905651468807, + "grad_norm": 0.38957900370384485, + "learning_rate": 5.672573396975674e-06, + "loss": 0.4331, + "step": 12843 + }, + { + "epoch": 2.1093547923552234, + "grad_norm": 0.33247541581000345, + "learning_rate": 5.672120619310466e-06, + "loss": 0.4318, + "step": 12844 + }, + { + "epoch": 2.109519019563566, + "grad_norm": 0.366253574432419, + "learning_rate": 5.671667828914876e-06, + "loss": 0.4543, + "step": 12845 + }, + { + "epoch": 2.109683246771909, + "grad_norm": 0.28846496806996375, + "learning_rate": 5.6712150257938196e-06, + "loss": 0.4418, + "step": 12846 + }, + { + "epoch": 2.1098474739802517, + "grad_norm": 0.37577112889588415, + "learning_rate": 5.670762209952215e-06, + "loss": 0.4423, + "step": 12847 + }, + { + "epoch": 2.1100117011885944, + "grad_norm": 0.327766941899745, + "learning_rate": 5.670309381394982e-06, + "loss": 0.4452, + "step": 12848 + }, + { + "epoch": 2.110175928396937, + "grad_norm": 0.3545541182125981, + "learning_rate": 5.669856540127037e-06, + "loss": 0.4251, + "step": 12849 + }, + { + "epoch": 2.11034015560528, + "grad_norm": 0.3924250514834875, + "learning_rate": 5.6694036861533e-06, + "loss": 0.4482, + "step": 12850 + }, + { + "epoch": 2.1105043828136227, + "grad_norm": 0.3583855348922854, + "learning_rate": 5.668950819478688e-06, + "loss": 0.4352, + "step": 12851 + }, + { + "epoch": 2.1106686100219654, + "grad_norm": 0.3968481426345676, + "learning_rate": 5.66849794010812e-06, + "loss": 0.4337, + "step": 12852 + }, + { + "epoch": 2.110832837230308, + "grad_norm": 0.3224990826352743, + "learning_rate": 5.6680450480465145e-06, + "loss": 0.4443, + "step": 12853 + }, + { + "epoch": 2.110997064438651, + "grad_norm": 0.39144802472570384, + "learning_rate": 5.667592143298791e-06, + "loss": 0.4289, + "step": 12854 + }, + { + "epoch": 2.1111612916469937, + "grad_norm": 0.3718451290345085, + "learning_rate": 5.667139225869867e-06, + "loss": 0.4506, + "step": 12855 + }, + { + "epoch": 2.1113255188553364, + "grad_norm": 0.3297370105246588, + "learning_rate": 5.666686295764665e-06, + "loss": 0.4506, + "step": 12856 + }, + { + "epoch": 2.111489746063679, + "grad_norm": 0.5862997461169424, + "learning_rate": 5.6662333529880994e-06, + "loss": 0.4494, + "step": 12857 + }, + { + "epoch": 2.1116539732720216, + "grad_norm": 0.42111008173194764, + "learning_rate": 5.665780397545093e-06, + "loss": 0.4436, + "step": 12858 + }, + { + "epoch": 2.1118182004803647, + "grad_norm": 0.33023195184528636, + "learning_rate": 5.665327429440566e-06, + "loss": 0.4374, + "step": 12859 + }, + { + "epoch": 2.1119824276887074, + "grad_norm": 0.39047191719718477, + "learning_rate": 5.664874448679434e-06, + "loss": 0.4404, + "step": 12860 + }, + { + "epoch": 2.11214665489705, + "grad_norm": 0.36364259425271644, + "learning_rate": 5.6644214552666205e-06, + "loss": 0.4453, + "step": 12861 + }, + { + "epoch": 2.1123108821053926, + "grad_norm": 0.47280712222169236, + "learning_rate": 5.663968449207044e-06, + "loss": 0.442, + "step": 12862 + }, + { + "epoch": 2.1124751093137357, + "grad_norm": 0.34497358185281746, + "learning_rate": 5.663515430505626e-06, + "loss": 0.4632, + "step": 12863 + }, + { + "epoch": 2.1126393365220784, + "grad_norm": 0.35565401681845893, + "learning_rate": 5.663062399167285e-06, + "loss": 0.4572, + "step": 12864 + }, + { + "epoch": 2.112803563730421, + "grad_norm": 0.3225578882958658, + "learning_rate": 5.662609355196944e-06, + "loss": 0.4578, + "step": 12865 + }, + { + "epoch": 2.1129677909387636, + "grad_norm": 0.3207304288053452, + "learning_rate": 5.662156298599518e-06, + "loss": 0.4319, + "step": 12866 + }, + { + "epoch": 2.1131320181471067, + "grad_norm": 0.6186662136471093, + "learning_rate": 5.661703229379933e-06, + "loss": 0.4241, + "step": 12867 + }, + { + "epoch": 2.1132962453554494, + "grad_norm": 0.3004416227688589, + "learning_rate": 5.661250147543107e-06, + "loss": 0.4552, + "step": 12868 + }, + { + "epoch": 2.113460472563792, + "grad_norm": 0.3337040866259995, + "learning_rate": 5.660797053093965e-06, + "loss": 0.4431, + "step": 12869 + }, + { + "epoch": 2.1136246997721346, + "grad_norm": 0.30755631851279613, + "learning_rate": 5.6603439460374226e-06, + "loss": 0.4349, + "step": 12870 + }, + { + "epoch": 2.1137889269804777, + "grad_norm": 0.35471867913214444, + "learning_rate": 5.659890826378403e-06, + "loss": 0.4443, + "step": 12871 + }, + { + "epoch": 2.1139531541888203, + "grad_norm": 0.29714877631996933, + "learning_rate": 5.659437694121827e-06, + "loss": 0.4423, + "step": 12872 + }, + { + "epoch": 2.114117381397163, + "grad_norm": 0.2906786915109368, + "learning_rate": 5.658984549272619e-06, + "loss": 0.4479, + "step": 12873 + }, + { + "epoch": 2.1142816086055056, + "grad_norm": 0.32187424124789155, + "learning_rate": 5.658531391835699e-06, + "loss": 0.4357, + "step": 12874 + }, + { + "epoch": 2.1144458358138483, + "grad_norm": 0.29924613569598135, + "learning_rate": 5.658078221815986e-06, + "loss": 0.4509, + "step": 12875 + }, + { + "epoch": 2.1146100630221913, + "grad_norm": 0.30270006577595426, + "learning_rate": 5.657625039218405e-06, + "loss": 0.4335, + "step": 12876 + }, + { + "epoch": 2.114774290230534, + "grad_norm": 0.3369789030042965, + "learning_rate": 5.6571718440478774e-06, + "loss": 0.4334, + "step": 12877 + }, + { + "epoch": 2.1149385174388766, + "grad_norm": 0.6159556214342785, + "learning_rate": 5.656718636309324e-06, + "loss": 0.4382, + "step": 12878 + }, + { + "epoch": 2.1151027446472193, + "grad_norm": 0.3931772247618362, + "learning_rate": 5.65626541600767e-06, + "loss": 0.4486, + "step": 12879 + }, + { + "epoch": 2.1152669718555623, + "grad_norm": 0.3023065547371581, + "learning_rate": 5.655812183147834e-06, + "loss": 0.4478, + "step": 12880 + }, + { + "epoch": 2.115431199063905, + "grad_norm": 0.46112021793977825, + "learning_rate": 5.655358937734742e-06, + "loss": 0.4535, + "step": 12881 + }, + { + "epoch": 2.1155954262722476, + "grad_norm": 0.31824756429454715, + "learning_rate": 5.654905679773315e-06, + "loss": 0.4336, + "step": 12882 + }, + { + "epoch": 2.1157596534805903, + "grad_norm": 0.45968576013510043, + "learning_rate": 5.654452409268476e-06, + "loss": 0.4484, + "step": 12883 + }, + { + "epoch": 2.1159238806889333, + "grad_norm": 0.32054414238138046, + "learning_rate": 5.653999126225148e-06, + "loss": 0.4297, + "step": 12884 + }, + { + "epoch": 2.116088107897276, + "grad_norm": 1.3758552124140686, + "learning_rate": 5.653545830648254e-06, + "loss": 0.4522, + "step": 12885 + }, + { + "epoch": 2.1162523351056186, + "grad_norm": 0.3020933416951688, + "learning_rate": 5.653092522542717e-06, + "loss": 0.4419, + "step": 12886 + }, + { + "epoch": 2.1164165623139612, + "grad_norm": 0.29168531190257574, + "learning_rate": 5.652639201913461e-06, + "loss": 0.4563, + "step": 12887 + }, + { + "epoch": 2.1165807895223043, + "grad_norm": 0.39239511980870456, + "learning_rate": 5.652185868765409e-06, + "loss": 0.4533, + "step": 12888 + }, + { + "epoch": 2.116745016730647, + "grad_norm": 0.328826136046041, + "learning_rate": 5.651732523103485e-06, + "loss": 0.4404, + "step": 12889 + }, + { + "epoch": 2.1169092439389896, + "grad_norm": 0.3431931638495928, + "learning_rate": 5.6512791649326136e-06, + "loss": 0.4611, + "step": 12890 + }, + { + "epoch": 2.1170734711473322, + "grad_norm": 0.28812679712227424, + "learning_rate": 5.650825794257716e-06, + "loss": 0.438, + "step": 12891 + }, + { + "epoch": 2.117237698355675, + "grad_norm": 0.2902608751383587, + "learning_rate": 5.650372411083718e-06, + "loss": 0.4342, + "step": 12892 + }, + { + "epoch": 2.117401925564018, + "grad_norm": 0.37581087709899136, + "learning_rate": 5.649919015415546e-06, + "loss": 0.4447, + "step": 12893 + }, + { + "epoch": 2.1175661527723606, + "grad_norm": 0.32938488910246017, + "learning_rate": 5.649465607258122e-06, + "loss": 0.4396, + "step": 12894 + }, + { + "epoch": 2.1177303799807032, + "grad_norm": 0.4706196191524123, + "learning_rate": 5.649012186616368e-06, + "loss": 0.4348, + "step": 12895 + }, + { + "epoch": 2.117894607189046, + "grad_norm": 0.41697900587103304, + "learning_rate": 5.648558753495212e-06, + "loss": 0.4366, + "step": 12896 + }, + { + "epoch": 2.118058834397389, + "grad_norm": 0.31704559733383314, + "learning_rate": 5.648105307899579e-06, + "loss": 0.4481, + "step": 12897 + }, + { + "epoch": 2.1182230616057316, + "grad_norm": 0.47313061714381116, + "learning_rate": 5.647651849834392e-06, + "loss": 0.4421, + "step": 12898 + }, + { + "epoch": 2.1183872888140742, + "grad_norm": 0.3885098414913026, + "learning_rate": 5.647198379304578e-06, + "loss": 0.4371, + "step": 12899 + }, + { + "epoch": 2.118551516022417, + "grad_norm": 0.35465476613932184, + "learning_rate": 5.646744896315059e-06, + "loss": 0.453, + "step": 12900 + }, + { + "epoch": 2.11871574323076, + "grad_norm": 0.33812434114988693, + "learning_rate": 5.646291400870763e-06, + "loss": 0.4476, + "step": 12901 + }, + { + "epoch": 2.1188799704391026, + "grad_norm": 0.2907497876750063, + "learning_rate": 5.645837892976615e-06, + "loss": 0.4291, + "step": 12902 + }, + { + "epoch": 2.1190441976474452, + "grad_norm": 0.2562377614658291, + "learning_rate": 5.6453843726375395e-06, + "loss": 0.4528, + "step": 12903 + }, + { + "epoch": 2.119208424855788, + "grad_norm": 0.4183063196730311, + "learning_rate": 5.644930839858463e-06, + "loss": 0.4566, + "step": 12904 + }, + { + "epoch": 2.119372652064131, + "grad_norm": 0.3360083856619884, + "learning_rate": 5.644477294644312e-06, + "loss": 0.4612, + "step": 12905 + }, + { + "epoch": 2.1195368792724736, + "grad_norm": 0.3550177847074318, + "learning_rate": 5.644023737000011e-06, + "loss": 0.413, + "step": 12906 + }, + { + "epoch": 2.119701106480816, + "grad_norm": 0.3168948460799242, + "learning_rate": 5.643570166930485e-06, + "loss": 0.4267, + "step": 12907 + }, + { + "epoch": 2.119865333689159, + "grad_norm": 0.41299478080841584, + "learning_rate": 5.643116584440665e-06, + "loss": 0.4389, + "step": 12908 + }, + { + "epoch": 2.1200295608975015, + "grad_norm": 0.285914803678043, + "learning_rate": 5.642662989535472e-06, + "loss": 0.4204, + "step": 12909 + }, + { + "epoch": 2.1201937881058446, + "grad_norm": 0.3225596692159442, + "learning_rate": 5.642209382219836e-06, + "loss": 0.4421, + "step": 12910 + }, + { + "epoch": 2.120358015314187, + "grad_norm": 0.3499417079033318, + "learning_rate": 5.6417557624986815e-06, + "loss": 0.4402, + "step": 12911 + }, + { + "epoch": 2.12052224252253, + "grad_norm": 0.29267360443334167, + "learning_rate": 5.641302130376935e-06, + "loss": 0.4348, + "step": 12912 + }, + { + "epoch": 2.1206864697308725, + "grad_norm": 0.32426656914720464, + "learning_rate": 5.640848485859526e-06, + "loss": 0.4419, + "step": 12913 + }, + { + "epoch": 2.1208506969392156, + "grad_norm": 0.3281027482639434, + "learning_rate": 5.6403948289513795e-06, + "loss": 0.4561, + "step": 12914 + }, + { + "epoch": 2.121014924147558, + "grad_norm": 0.3220048109109444, + "learning_rate": 5.6399411596574245e-06, + "loss": 0.4575, + "step": 12915 + }, + { + "epoch": 2.121179151355901, + "grad_norm": 0.5050677501725908, + "learning_rate": 5.639487477982585e-06, + "loss": 0.4443, + "step": 12916 + }, + { + "epoch": 2.1213433785642435, + "grad_norm": 0.33091551603894176, + "learning_rate": 5.639033783931792e-06, + "loss": 0.438, + "step": 12917 + }, + { + "epoch": 2.1215076057725866, + "grad_norm": 0.5370997172267648, + "learning_rate": 5.63858007750997e-06, + "loss": 0.442, + "step": 12918 + }, + { + "epoch": 2.121671832980929, + "grad_norm": 0.3195961149692968, + "learning_rate": 5.638126358722049e-06, + "loss": 0.4289, + "step": 12919 + }, + { + "epoch": 2.121836060189272, + "grad_norm": 0.29324147405114565, + "learning_rate": 5.637672627572955e-06, + "loss": 0.4474, + "step": 12920 + }, + { + "epoch": 2.1220002873976145, + "grad_norm": 0.2773735461045715, + "learning_rate": 5.637218884067618e-06, + "loss": 0.4613, + "step": 12921 + }, + { + "epoch": 2.1221645146059576, + "grad_norm": 0.3540915443199416, + "learning_rate": 5.636765128210965e-06, + "loss": 0.4309, + "step": 12922 + }, + { + "epoch": 2.1223287418143, + "grad_norm": 0.3887095467543331, + "learning_rate": 5.636311360007924e-06, + "loss": 0.4282, + "step": 12923 + }, + { + "epoch": 2.122492969022643, + "grad_norm": 0.3299693991544904, + "learning_rate": 5.635857579463423e-06, + "loss": 0.4328, + "step": 12924 + }, + { + "epoch": 2.1226571962309855, + "grad_norm": 0.37227913141766533, + "learning_rate": 5.635403786582392e-06, + "loss": 0.4406, + "step": 12925 + }, + { + "epoch": 2.122821423439328, + "grad_norm": 0.3635459484822922, + "learning_rate": 5.634949981369758e-06, + "loss": 0.4633, + "step": 12926 + }, + { + "epoch": 2.122985650647671, + "grad_norm": 0.34839992097364925, + "learning_rate": 5.634496163830452e-06, + "loss": 0.4488, + "step": 12927 + }, + { + "epoch": 2.123149877856014, + "grad_norm": 0.333284343465869, + "learning_rate": 5.634042333969401e-06, + "loss": 0.4568, + "step": 12928 + }, + { + "epoch": 2.1233141050643565, + "grad_norm": 0.3805554693548058, + "learning_rate": 5.633588491791533e-06, + "loss": 0.4514, + "step": 12929 + }, + { + "epoch": 2.123478332272699, + "grad_norm": 0.33782011930290967, + "learning_rate": 5.63313463730178e-06, + "loss": 0.4413, + "step": 12930 + }, + { + "epoch": 2.123642559481042, + "grad_norm": 0.3804235495657651, + "learning_rate": 5.63268077050507e-06, + "loss": 0.4547, + "step": 12931 + }, + { + "epoch": 2.123806786689385, + "grad_norm": 0.34878407931263294, + "learning_rate": 5.632226891406332e-06, + "loss": 0.4447, + "step": 12932 + }, + { + "epoch": 2.1239710138977275, + "grad_norm": 0.2980431569749951, + "learning_rate": 5.631773000010497e-06, + "loss": 0.4501, + "step": 12933 + }, + { + "epoch": 2.12413524110607, + "grad_norm": 0.2758512424001063, + "learning_rate": 5.631319096322493e-06, + "loss": 0.4375, + "step": 12934 + }, + { + "epoch": 2.124299468314413, + "grad_norm": 0.31581503124840343, + "learning_rate": 5.6308651803472505e-06, + "loss": 0.4398, + "step": 12935 + }, + { + "epoch": 2.124463695522756, + "grad_norm": 0.4350563326476109, + "learning_rate": 5.630411252089699e-06, + "loss": 0.4385, + "step": 12936 + }, + { + "epoch": 2.1246279227310985, + "grad_norm": 0.32043068856204204, + "learning_rate": 5.62995731155477e-06, + "loss": 0.4418, + "step": 12937 + }, + { + "epoch": 2.124792149939441, + "grad_norm": 0.3878698223491185, + "learning_rate": 5.629503358747392e-06, + "loss": 0.4548, + "step": 12938 + }, + { + "epoch": 2.124956377147784, + "grad_norm": 0.395039731182331, + "learning_rate": 5.6290493936724965e-06, + "loss": 0.4387, + "step": 12939 + }, + { + "epoch": 2.125120604356127, + "grad_norm": 0.3315098767767503, + "learning_rate": 5.628595416335014e-06, + "loss": 0.4409, + "step": 12940 + }, + { + "epoch": 2.1252848315644695, + "grad_norm": 0.3856410406992659, + "learning_rate": 5.628141426739875e-06, + "loss": 0.4578, + "step": 12941 + }, + { + "epoch": 2.125449058772812, + "grad_norm": 0.2973728241039122, + "learning_rate": 5.627687424892011e-06, + "loss": 0.4412, + "step": 12942 + }, + { + "epoch": 2.1256132859811547, + "grad_norm": 0.7612320769649644, + "learning_rate": 5.62723341079635e-06, + "loss": 0.4356, + "step": 12943 + }, + { + "epoch": 2.125777513189498, + "grad_norm": 0.34729546541020695, + "learning_rate": 5.626779384457826e-06, + "loss": 0.4264, + "step": 12944 + }, + { + "epoch": 2.1259417403978405, + "grad_norm": 0.3043427111694188, + "learning_rate": 5.6263253458813706e-06, + "loss": 0.4497, + "step": 12945 + }, + { + "epoch": 2.126105967606183, + "grad_norm": 0.32606508097107895, + "learning_rate": 5.625871295071912e-06, + "loss": 0.4416, + "step": 12946 + }, + { + "epoch": 2.1262701948145257, + "grad_norm": 0.4127021304590552, + "learning_rate": 5.625417232034384e-06, + "loss": 0.45, + "step": 12947 + }, + { + "epoch": 2.126434422022869, + "grad_norm": 0.41019044971934865, + "learning_rate": 5.624963156773718e-06, + "loss": 0.4496, + "step": 12948 + }, + { + "epoch": 2.1265986492312114, + "grad_norm": 0.31596254214135494, + "learning_rate": 5.624509069294845e-06, + "loss": 0.4331, + "step": 12949 + }, + { + "epoch": 2.126762876439554, + "grad_norm": 0.5525985814660054, + "learning_rate": 5.6240549696026975e-06, + "loss": 0.4287, + "step": 12950 + }, + { + "epoch": 2.1269271036478967, + "grad_norm": 0.34763719641785923, + "learning_rate": 5.623600857702207e-06, + "loss": 0.4355, + "step": 12951 + }, + { + "epoch": 2.12709133085624, + "grad_norm": 0.3335035146638821, + "learning_rate": 5.6231467335983055e-06, + "loss": 0.4353, + "step": 12952 + }, + { + "epoch": 2.1272555580645824, + "grad_norm": 0.3217134828394556, + "learning_rate": 5.622692597295925e-06, + "loss": 0.4277, + "step": 12953 + }, + { + "epoch": 2.127419785272925, + "grad_norm": 0.3820768103292228, + "learning_rate": 5.622238448799999e-06, + "loss": 0.4487, + "step": 12954 + }, + { + "epoch": 2.1275840124812677, + "grad_norm": 0.4124972463916858, + "learning_rate": 5.621784288115459e-06, + "loss": 0.4457, + "step": 12955 + }, + { + "epoch": 2.127748239689611, + "grad_norm": 0.41009248794214526, + "learning_rate": 5.621330115247238e-06, + "loss": 0.4431, + "step": 12956 + }, + { + "epoch": 2.1279124668979534, + "grad_norm": 0.324419514380881, + "learning_rate": 5.620875930200269e-06, + "loss": 0.4171, + "step": 12957 + }, + { + "epoch": 2.128076694106296, + "grad_norm": 0.44635639840875285, + "learning_rate": 5.620421732979484e-06, + "loss": 0.4335, + "step": 12958 + }, + { + "epoch": 2.1282409213146387, + "grad_norm": 0.3072615539433608, + "learning_rate": 5.619967523589817e-06, + "loss": 0.427, + "step": 12959 + }, + { + "epoch": 2.1284051485229813, + "grad_norm": 0.4381451304638284, + "learning_rate": 5.619513302036201e-06, + "loss": 0.4552, + "step": 12960 + }, + { + "epoch": 2.1285693757313244, + "grad_norm": 0.6381434192285261, + "learning_rate": 5.6190590683235686e-06, + "loss": 0.4299, + "step": 12961 + }, + { + "epoch": 2.128733602939667, + "grad_norm": 0.45459376598759865, + "learning_rate": 5.618604822456854e-06, + "loss": 0.4414, + "step": 12962 + }, + { + "epoch": 2.1288978301480097, + "grad_norm": 0.49530451943785425, + "learning_rate": 5.6181505644409904e-06, + "loss": 0.462, + "step": 12963 + }, + { + "epoch": 2.1290620573563523, + "grad_norm": 0.3506050848447432, + "learning_rate": 5.617696294280911e-06, + "loss": 0.4587, + "step": 12964 + }, + { + "epoch": 2.1292262845646954, + "grad_norm": 0.3191382725265907, + "learning_rate": 5.617242011981551e-06, + "loss": 0.4386, + "step": 12965 + }, + { + "epoch": 2.129390511773038, + "grad_norm": 0.3174610775036302, + "learning_rate": 5.616787717547844e-06, + "loss": 0.4326, + "step": 12966 + }, + { + "epoch": 2.1295547389813807, + "grad_norm": 0.537093039177515, + "learning_rate": 5.616333410984723e-06, + "loss": 0.4605, + "step": 12967 + }, + { + "epoch": 2.1297189661897233, + "grad_norm": 0.31637109473600866, + "learning_rate": 5.615879092297121e-06, + "loss": 0.4557, + "step": 12968 + }, + { + "epoch": 2.1298831933980664, + "grad_norm": 0.3918704384575548, + "learning_rate": 5.615424761489978e-06, + "loss": 0.4314, + "step": 12969 + }, + { + "epoch": 2.130047420606409, + "grad_norm": 0.35561557929773774, + "learning_rate": 5.6149704185682215e-06, + "loss": 0.4345, + "step": 12970 + }, + { + "epoch": 2.1302116478147517, + "grad_norm": 0.4669581383106852, + "learning_rate": 5.614516063536791e-06, + "loss": 0.4602, + "step": 12971 + }, + { + "epoch": 2.1303758750230943, + "grad_norm": 0.398092381638841, + "learning_rate": 5.614061696400619e-06, + "loss": 0.4342, + "step": 12972 + }, + { + "epoch": 2.1305401022314374, + "grad_norm": 0.5613327187613648, + "learning_rate": 5.6136073171646404e-06, + "loss": 0.4566, + "step": 12973 + }, + { + "epoch": 2.13070432943978, + "grad_norm": 0.34100163249184556, + "learning_rate": 5.6131529258337906e-06, + "loss": 0.444, + "step": 12974 + }, + { + "epoch": 2.1308685566481227, + "grad_norm": 0.4985210923911517, + "learning_rate": 5.612698522413005e-06, + "loss": 0.4402, + "step": 12975 + }, + { + "epoch": 2.1310327838564653, + "grad_norm": 0.3421448979356347, + "learning_rate": 5.61224410690722e-06, + "loss": 0.4419, + "step": 12976 + }, + { + "epoch": 2.131197011064808, + "grad_norm": 0.34251735534570665, + "learning_rate": 5.611789679321369e-06, + "loss": 0.4548, + "step": 12977 + }, + { + "epoch": 2.131361238273151, + "grad_norm": 0.3535445694441817, + "learning_rate": 5.611335239660387e-06, + "loss": 0.4449, + "step": 12978 + }, + { + "epoch": 2.1315254654814937, + "grad_norm": 0.32032005716195605, + "learning_rate": 5.610880787929211e-06, + "loss": 0.454, + "step": 12979 + }, + { + "epoch": 2.1316896926898363, + "grad_norm": 0.32004352432492167, + "learning_rate": 5.610426324132778e-06, + "loss": 0.4466, + "step": 12980 + }, + { + "epoch": 2.131853919898179, + "grad_norm": 0.29014953643068214, + "learning_rate": 5.6099718482760235e-06, + "loss": 0.4714, + "step": 12981 + }, + { + "epoch": 2.132018147106522, + "grad_norm": 0.34390516599209603, + "learning_rate": 5.609517360363881e-06, + "loss": 0.4304, + "step": 12982 + }, + { + "epoch": 2.1321823743148647, + "grad_norm": 0.3213270673623128, + "learning_rate": 5.6090628604012875e-06, + "loss": 0.4484, + "step": 12983 + }, + { + "epoch": 2.1323466015232073, + "grad_norm": 0.3058927295010632, + "learning_rate": 5.608608348393181e-06, + "loss": 0.4505, + "step": 12984 + }, + { + "epoch": 2.13251082873155, + "grad_norm": 0.4679978012777797, + "learning_rate": 5.608153824344498e-06, + "loss": 0.4449, + "step": 12985 + }, + { + "epoch": 2.132675055939893, + "grad_norm": 0.3197622088020438, + "learning_rate": 5.607699288260174e-06, + "loss": 0.4454, + "step": 12986 + }, + { + "epoch": 2.1328392831482357, + "grad_norm": 0.27578517030301564, + "learning_rate": 5.607244740145145e-06, + "loss": 0.4387, + "step": 12987 + }, + { + "epoch": 2.1330035103565783, + "grad_norm": 0.395659876071687, + "learning_rate": 5.606790180004349e-06, + "loss": 0.4415, + "step": 12988 + }, + { + "epoch": 2.133167737564921, + "grad_norm": 0.3075443442701608, + "learning_rate": 5.6063356078427225e-06, + "loss": 0.4536, + "step": 12989 + }, + { + "epoch": 2.133331964773264, + "grad_norm": 0.41313177332432993, + "learning_rate": 5.605881023665203e-06, + "loss": 0.4289, + "step": 12990 + }, + { + "epoch": 2.1334961919816067, + "grad_norm": 0.3571092549071176, + "learning_rate": 5.605426427476729e-06, + "loss": 0.4664, + "step": 12991 + }, + { + "epoch": 2.1336604191899493, + "grad_norm": 0.39056888922382665, + "learning_rate": 5.604971819282235e-06, + "loss": 0.4258, + "step": 12992 + }, + { + "epoch": 2.133824646398292, + "grad_norm": 0.46537769479849345, + "learning_rate": 5.60451719908666e-06, + "loss": 0.449, + "step": 12993 + }, + { + "epoch": 2.1339888736066346, + "grad_norm": 0.32186461409923345, + "learning_rate": 5.604062566894941e-06, + "loss": 0.4595, + "step": 12994 + }, + { + "epoch": 2.1341531008149777, + "grad_norm": 0.31168989365248334, + "learning_rate": 5.603607922712017e-06, + "loss": 0.4561, + "step": 12995 + }, + { + "epoch": 2.1343173280233203, + "grad_norm": 0.31585319111399035, + "learning_rate": 5.603153266542826e-06, + "loss": 0.431, + "step": 12996 + }, + { + "epoch": 2.134481555231663, + "grad_norm": 0.30053858074909556, + "learning_rate": 5.602698598392304e-06, + "loss": 0.4168, + "step": 12997 + }, + { + "epoch": 2.1346457824400056, + "grad_norm": 0.36234676368672447, + "learning_rate": 5.602243918265391e-06, + "loss": 0.4358, + "step": 12998 + }, + { + "epoch": 2.1348100096483487, + "grad_norm": 0.33428480744979727, + "learning_rate": 5.601789226167023e-06, + "loss": 0.4185, + "step": 12999 + }, + { + "epoch": 2.1349742368566913, + "grad_norm": 0.3244302536241494, + "learning_rate": 5.601334522102142e-06, + "loss": 0.4707, + "step": 13000 + }, + { + "epoch": 2.135138464065034, + "grad_norm": 0.3502757192894509, + "learning_rate": 5.600879806075683e-06, + "loss": 0.4303, + "step": 13001 + }, + { + "epoch": 2.1353026912733766, + "grad_norm": 0.3685548422888394, + "learning_rate": 5.600425078092588e-06, + "loss": 0.436, + "step": 13002 + }, + { + "epoch": 2.1354669184817197, + "grad_norm": 0.2929653807287389, + "learning_rate": 5.599970338157792e-06, + "loss": 0.4328, + "step": 13003 + }, + { + "epoch": 2.1356311456900623, + "grad_norm": 0.28914285602883866, + "learning_rate": 5.599515586276236e-06, + "loss": 0.4346, + "step": 13004 + }, + { + "epoch": 2.135795372898405, + "grad_norm": 0.3748436051569583, + "learning_rate": 5.59906082245286e-06, + "loss": 0.4497, + "step": 13005 + }, + { + "epoch": 2.1359596001067476, + "grad_norm": 0.4301403569066467, + "learning_rate": 5.598606046692603e-06, + "loss": 0.449, + "step": 13006 + }, + { + "epoch": 2.1361238273150907, + "grad_norm": 0.2750459024303177, + "learning_rate": 5.598151259000401e-06, + "loss": 0.437, + "step": 13007 + }, + { + "epoch": 2.1362880545234333, + "grad_norm": 0.32575515399845445, + "learning_rate": 5.597696459381197e-06, + "loss": 0.4617, + "step": 13008 + }, + { + "epoch": 2.136452281731776, + "grad_norm": 0.2996017078795059, + "learning_rate": 5.597241647839928e-06, + "loss": 0.4336, + "step": 13009 + }, + { + "epoch": 2.1366165089401186, + "grad_norm": 0.3353749243950319, + "learning_rate": 5.596786824381538e-06, + "loss": 0.4374, + "step": 13010 + }, + { + "epoch": 2.136780736148461, + "grad_norm": 0.3015536802983604, + "learning_rate": 5.596331989010964e-06, + "loss": 0.4537, + "step": 13011 + }, + { + "epoch": 2.1369449633568043, + "grad_norm": 0.7102526684523022, + "learning_rate": 5.595877141733144e-06, + "loss": 0.4264, + "step": 13012 + }, + { + "epoch": 2.137109190565147, + "grad_norm": 1.0802129528539435, + "learning_rate": 5.595422282553021e-06, + "loss": 0.4365, + "step": 13013 + }, + { + "epoch": 2.1372734177734896, + "grad_norm": 0.3765904066273181, + "learning_rate": 5.594967411475532e-06, + "loss": 0.4365, + "step": 13014 + }, + { + "epoch": 2.137437644981832, + "grad_norm": 0.3296267997367895, + "learning_rate": 5.594512528505624e-06, + "loss": 0.452, + "step": 13015 + }, + { + "epoch": 2.1376018721901753, + "grad_norm": 0.32635813808393965, + "learning_rate": 5.59405763364823e-06, + "loss": 0.4335, + "step": 13016 + }, + { + "epoch": 2.137766099398518, + "grad_norm": 0.3054992350198616, + "learning_rate": 5.593602726908295e-06, + "loss": 0.441, + "step": 13017 + }, + { + "epoch": 2.1379303266068606, + "grad_norm": 0.3160707553495685, + "learning_rate": 5.593147808290756e-06, + "loss": 0.4384, + "step": 13018 + }, + { + "epoch": 2.138094553815203, + "grad_norm": 0.28928587834988345, + "learning_rate": 5.592692877800559e-06, + "loss": 0.4279, + "step": 13019 + }, + { + "epoch": 2.1382587810235463, + "grad_norm": 0.401047698366232, + "learning_rate": 5.592237935442642e-06, + "loss": 0.4183, + "step": 13020 + }, + { + "epoch": 2.138423008231889, + "grad_norm": 0.3045454687637298, + "learning_rate": 5.591782981221946e-06, + "loss": 0.4626, + "step": 13021 + }, + { + "epoch": 2.1385872354402315, + "grad_norm": 0.2681046998791144, + "learning_rate": 5.591328015143411e-06, + "loss": 0.44, + "step": 13022 + }, + { + "epoch": 2.138751462648574, + "grad_norm": 0.4218919748405687, + "learning_rate": 5.590873037211982e-06, + "loss": 0.4417, + "step": 13023 + }, + { + "epoch": 2.1389156898569173, + "grad_norm": 0.2793908490461997, + "learning_rate": 5.590418047432597e-06, + "loss": 0.4348, + "step": 13024 + }, + { + "epoch": 2.13907991706526, + "grad_norm": 0.3172246138905424, + "learning_rate": 5.589963045810202e-06, + "loss": 0.446, + "step": 13025 + }, + { + "epoch": 2.1392441442736025, + "grad_norm": 0.312775867484758, + "learning_rate": 5.589508032349734e-06, + "loss": 0.4463, + "step": 13026 + }, + { + "epoch": 2.139408371481945, + "grad_norm": 0.3568040949452443, + "learning_rate": 5.589053007056136e-06, + "loss": 0.4454, + "step": 13027 + }, + { + "epoch": 2.139572598690288, + "grad_norm": 0.3577210247324337, + "learning_rate": 5.588597969934353e-06, + "loss": 0.4448, + "step": 13028 + }, + { + "epoch": 2.139736825898631, + "grad_norm": 0.33210112769047656, + "learning_rate": 5.588142920989323e-06, + "loss": 0.443, + "step": 13029 + }, + { + "epoch": 2.1399010531069735, + "grad_norm": 0.3524122026419944, + "learning_rate": 5.587687860225991e-06, + "loss": 0.4571, + "step": 13030 + }, + { + "epoch": 2.140065280315316, + "grad_norm": 0.5054906330538123, + "learning_rate": 5.5872327876493e-06, + "loss": 0.4429, + "step": 13031 + }, + { + "epoch": 2.140229507523659, + "grad_norm": 0.32656193120293153, + "learning_rate": 5.58677770326419e-06, + "loss": 0.4489, + "step": 13032 + }, + { + "epoch": 2.140393734732002, + "grad_norm": 0.3670053673951288, + "learning_rate": 5.586322607075604e-06, + "loss": 0.4258, + "step": 13033 + }, + { + "epoch": 2.1405579619403445, + "grad_norm": 0.3701069054773225, + "learning_rate": 5.585867499088488e-06, + "loss": 0.4419, + "step": 13034 + }, + { + "epoch": 2.140722189148687, + "grad_norm": 0.34285488173179895, + "learning_rate": 5.5854123793077805e-06, + "loss": 0.4371, + "step": 13035 + }, + { + "epoch": 2.14088641635703, + "grad_norm": 0.2893085117521016, + "learning_rate": 5.5849572477384276e-06, + "loss": 0.4504, + "step": 13036 + }, + { + "epoch": 2.141050643565373, + "grad_norm": 0.4135270238157861, + "learning_rate": 5.584502104385371e-06, + "loss": 0.4536, + "step": 13037 + }, + { + "epoch": 2.1412148707737155, + "grad_norm": 0.9039359209112584, + "learning_rate": 5.584046949253554e-06, + "loss": 0.4541, + "step": 13038 + }, + { + "epoch": 2.141379097982058, + "grad_norm": 0.4604628288870824, + "learning_rate": 5.583591782347923e-06, + "loss": 0.4627, + "step": 13039 + }, + { + "epoch": 2.141543325190401, + "grad_norm": 0.361287982585956, + "learning_rate": 5.583136603673417e-06, + "loss": 0.4319, + "step": 13040 + }, + { + "epoch": 2.141707552398744, + "grad_norm": 0.28686311031123324, + "learning_rate": 5.582681413234982e-06, + "loss": 0.4349, + "step": 13041 + }, + { + "epoch": 2.1418717796070865, + "grad_norm": 0.35422337043200885, + "learning_rate": 5.582226211037562e-06, + "loss": 0.4418, + "step": 13042 + }, + { + "epoch": 2.142036006815429, + "grad_norm": 0.3200420784217416, + "learning_rate": 5.5817709970861e-06, + "loss": 0.4364, + "step": 13043 + }, + { + "epoch": 2.142200234023772, + "grad_norm": 0.35940330185959896, + "learning_rate": 5.581315771385542e-06, + "loss": 0.4302, + "step": 13044 + }, + { + "epoch": 2.1423644612321144, + "grad_norm": 0.288818054045889, + "learning_rate": 5.580860533940831e-06, + "loss": 0.4481, + "step": 13045 + }, + { + "epoch": 2.1425286884404575, + "grad_norm": 0.34310898496235703, + "learning_rate": 5.5804052847569096e-06, + "loss": 0.4445, + "step": 13046 + }, + { + "epoch": 2.1426929156488, + "grad_norm": 0.3507973417433449, + "learning_rate": 5.579950023838725e-06, + "loss": 0.4413, + "step": 13047 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.40522847529414024, + "learning_rate": 5.57949475119122e-06, + "loss": 0.4522, + "step": 13048 + }, + { + "epoch": 2.1430213700654854, + "grad_norm": 0.3351266446909605, + "learning_rate": 5.579039466819341e-06, + "loss": 0.4465, + "step": 13049 + }, + { + "epoch": 2.1431855972738285, + "grad_norm": 0.3245795390439118, + "learning_rate": 5.578584170728031e-06, + "loss": 0.4415, + "step": 13050 + }, + { + "epoch": 2.143349824482171, + "grad_norm": 0.2817301746950823, + "learning_rate": 5.578128862922235e-06, + "loss": 0.443, + "step": 13051 + }, + { + "epoch": 2.143514051690514, + "grad_norm": 0.3772109800765201, + "learning_rate": 5.5776735434069e-06, + "loss": 0.452, + "step": 13052 + }, + { + "epoch": 2.1436782788988564, + "grad_norm": 0.41514420047183903, + "learning_rate": 5.577218212186968e-06, + "loss": 0.4514, + "step": 13053 + }, + { + "epoch": 2.1438425061071995, + "grad_norm": 0.34853574492762074, + "learning_rate": 5.576762869267388e-06, + "loss": 0.4421, + "step": 13054 + }, + { + "epoch": 2.144006733315542, + "grad_norm": 0.37626637398815654, + "learning_rate": 5.576307514653103e-06, + "loss": 0.4602, + "step": 13055 + }, + { + "epoch": 2.144170960523885, + "grad_norm": 0.3908924960128689, + "learning_rate": 5.5758521483490605e-06, + "loss": 0.4251, + "step": 13056 + }, + { + "epoch": 2.1443351877322274, + "grad_norm": 0.3037037346436557, + "learning_rate": 5.575396770360205e-06, + "loss": 0.4376, + "step": 13057 + }, + { + "epoch": 2.1444994149405705, + "grad_norm": 0.39166416556086264, + "learning_rate": 5.5749413806914825e-06, + "loss": 0.4551, + "step": 13058 + }, + { + "epoch": 2.144663642148913, + "grad_norm": 0.2758285721291899, + "learning_rate": 5.57448597934784e-06, + "loss": 0.4538, + "step": 13059 + }, + { + "epoch": 2.144827869357256, + "grad_norm": 0.30518189424114334, + "learning_rate": 5.57403056633422e-06, + "loss": 0.4432, + "step": 13060 + }, + { + "epoch": 2.1449920965655984, + "grad_norm": 0.3021189985917047, + "learning_rate": 5.573575141655574e-06, + "loss": 0.4256, + "step": 13061 + }, + { + "epoch": 2.145156323773941, + "grad_norm": 0.34734420320468, + "learning_rate": 5.573119705316844e-06, + "loss": 0.4478, + "step": 13062 + }, + { + "epoch": 2.145320550982284, + "grad_norm": 0.48479298084851663, + "learning_rate": 5.572664257322978e-06, + "loss": 0.4456, + "step": 13063 + }, + { + "epoch": 2.1454847781906268, + "grad_norm": 0.9737469501992383, + "learning_rate": 5.572208797678923e-06, + "loss": 0.4519, + "step": 13064 + }, + { + "epoch": 2.1456490053989694, + "grad_norm": 0.373845968709997, + "learning_rate": 5.571753326389628e-06, + "loss": 0.4225, + "step": 13065 + }, + { + "epoch": 2.145813232607312, + "grad_norm": 0.32952687545014325, + "learning_rate": 5.571297843460035e-06, + "loss": 0.4551, + "step": 13066 + }, + { + "epoch": 2.145977459815655, + "grad_norm": 0.28185179231913293, + "learning_rate": 5.570842348895093e-06, + "loss": 0.4522, + "step": 13067 + }, + { + "epoch": 2.1461416870239978, + "grad_norm": 0.3750226506331507, + "learning_rate": 5.570386842699751e-06, + "loss": 0.4388, + "step": 13068 + }, + { + "epoch": 2.1463059142323404, + "grad_norm": 0.363792534235299, + "learning_rate": 5.569931324878955e-06, + "loss": 0.4566, + "step": 13069 + }, + { + "epoch": 2.146470141440683, + "grad_norm": 0.4173978857711169, + "learning_rate": 5.56947579543765e-06, + "loss": 0.4456, + "step": 13070 + }, + { + "epoch": 2.146634368649026, + "grad_norm": 0.3617383410012619, + "learning_rate": 5.5690202543807866e-06, + "loss": 0.4536, + "step": 13071 + }, + { + "epoch": 2.1467985958573688, + "grad_norm": 0.31683670414164145, + "learning_rate": 5.568564701713312e-06, + "loss": 0.4339, + "step": 13072 + }, + { + "epoch": 2.1469628230657114, + "grad_norm": 0.6722978519914278, + "learning_rate": 5.568109137440174e-06, + "loss": 0.4453, + "step": 13073 + }, + { + "epoch": 2.147127050274054, + "grad_norm": 0.31359460500682335, + "learning_rate": 5.567653561566319e-06, + "loss": 0.4406, + "step": 13074 + }, + { + "epoch": 2.147291277482397, + "grad_norm": 0.3756857719735049, + "learning_rate": 5.567197974096695e-06, + "loss": 0.4512, + "step": 13075 + }, + { + "epoch": 2.1474555046907398, + "grad_norm": 0.28206587584500525, + "learning_rate": 5.566742375036252e-06, + "loss": 0.4431, + "step": 13076 + }, + { + "epoch": 2.1476197318990824, + "grad_norm": 0.3034882846676163, + "learning_rate": 5.566286764389937e-06, + "loss": 0.4387, + "step": 13077 + }, + { + "epoch": 2.147783959107425, + "grad_norm": 0.33707031344715366, + "learning_rate": 5.5658311421627e-06, + "loss": 0.4486, + "step": 13078 + }, + { + "epoch": 2.1479481863157677, + "grad_norm": 0.43035245643608405, + "learning_rate": 5.5653755083594865e-06, + "loss": 0.4515, + "step": 13079 + }, + { + "epoch": 2.1481124135241108, + "grad_norm": 0.3180318636784757, + "learning_rate": 5.564919862985248e-06, + "loss": 0.4323, + "step": 13080 + }, + { + "epoch": 2.1482766407324534, + "grad_norm": 0.7311184003595356, + "learning_rate": 5.564464206044931e-06, + "loss": 0.462, + "step": 13081 + }, + { + "epoch": 2.148440867940796, + "grad_norm": 0.3498635077663437, + "learning_rate": 5.5640085375434855e-06, + "loss": 0.4308, + "step": 13082 + }, + { + "epoch": 2.1486050951491387, + "grad_norm": 0.9829592808567664, + "learning_rate": 5.5635528574858614e-06, + "loss": 0.4572, + "step": 13083 + }, + { + "epoch": 2.1487693223574817, + "grad_norm": 0.3950510374950475, + "learning_rate": 5.563097165877006e-06, + "loss": 0.4555, + "step": 13084 + }, + { + "epoch": 2.1489335495658244, + "grad_norm": 1.013249951682912, + "learning_rate": 5.562641462721869e-06, + "loss": 0.4385, + "step": 13085 + }, + { + "epoch": 2.149097776774167, + "grad_norm": 0.3005075649363099, + "learning_rate": 5.562185748025402e-06, + "loss": 0.4302, + "step": 13086 + }, + { + "epoch": 2.1492620039825097, + "grad_norm": 0.7128528813599668, + "learning_rate": 5.561730021792551e-06, + "loss": 0.4462, + "step": 13087 + }, + { + "epoch": 2.1494262311908527, + "grad_norm": 0.34082831138362135, + "learning_rate": 5.561274284028269e-06, + "loss": 0.4103, + "step": 13088 + }, + { + "epoch": 2.1495904583991954, + "grad_norm": 0.27912374758491476, + "learning_rate": 5.560818534737502e-06, + "loss": 0.4734, + "step": 13089 + }, + { + "epoch": 2.149754685607538, + "grad_norm": 0.41419567612684344, + "learning_rate": 5.560362773925204e-06, + "loss": 0.4371, + "step": 13090 + }, + { + "epoch": 2.1499189128158807, + "grad_norm": 0.533304143338659, + "learning_rate": 5.559907001596322e-06, + "loss": 0.4488, + "step": 13091 + }, + { + "epoch": 2.1500831400242237, + "grad_norm": 0.3287216931610325, + "learning_rate": 5.559451217755807e-06, + "loss": 0.423, + "step": 13092 + }, + { + "epoch": 2.1502473672325664, + "grad_norm": 0.2987444710279113, + "learning_rate": 5.55899542240861e-06, + "loss": 0.4284, + "step": 13093 + }, + { + "epoch": 2.150411594440909, + "grad_norm": 0.3915364216911103, + "learning_rate": 5.558539615559681e-06, + "loss": 0.435, + "step": 13094 + }, + { + "epoch": 2.1505758216492517, + "grad_norm": 0.34923159544476823, + "learning_rate": 5.55808379721397e-06, + "loss": 0.4442, + "step": 13095 + }, + { + "epoch": 2.1507400488575943, + "grad_norm": 0.3603631249694652, + "learning_rate": 5.557627967376427e-06, + "loss": 0.4224, + "step": 13096 + }, + { + "epoch": 2.1509042760659374, + "grad_norm": 0.47614439422632243, + "learning_rate": 5.557172126052005e-06, + "loss": 0.4412, + "step": 13097 + }, + { + "epoch": 2.15106850327428, + "grad_norm": 0.3048139485131453, + "learning_rate": 5.556716273245654e-06, + "loss": 0.4444, + "step": 13098 + }, + { + "epoch": 2.1512327304826226, + "grad_norm": 0.3342919021364949, + "learning_rate": 5.556260408962323e-06, + "loss": 0.4726, + "step": 13099 + }, + { + "epoch": 2.1513969576909653, + "grad_norm": 0.29465856880907676, + "learning_rate": 5.5558045332069645e-06, + "loss": 0.4508, + "step": 13100 + }, + { + "epoch": 2.1515611848993084, + "grad_norm": 0.2961227755518122, + "learning_rate": 5.555348645984531e-06, + "loss": 0.4494, + "step": 13101 + }, + { + "epoch": 2.151725412107651, + "grad_norm": 0.27934800568638346, + "learning_rate": 5.554892747299973e-06, + "loss": 0.4391, + "step": 13102 + }, + { + "epoch": 2.1518896393159936, + "grad_norm": 0.33111901066285837, + "learning_rate": 5.554436837158242e-06, + "loss": 0.4348, + "step": 13103 + }, + { + "epoch": 2.1520538665243363, + "grad_norm": 0.3684615911145497, + "learning_rate": 5.553980915564289e-06, + "loss": 0.4521, + "step": 13104 + }, + { + "epoch": 2.1522180937326794, + "grad_norm": 0.4292233032330315, + "learning_rate": 5.553524982523065e-06, + "loss": 0.4528, + "step": 13105 + }, + { + "epoch": 2.152382320941022, + "grad_norm": 0.43574829372998947, + "learning_rate": 5.553069038039525e-06, + "loss": 0.4537, + "step": 13106 + }, + { + "epoch": 2.1525465481493646, + "grad_norm": 0.294165085934928, + "learning_rate": 5.55261308211862e-06, + "loss": 0.4509, + "step": 13107 + }, + { + "epoch": 2.1527107753577073, + "grad_norm": 0.31222154571936955, + "learning_rate": 5.5521571147653e-06, + "loss": 0.4509, + "step": 13108 + }, + { + "epoch": 2.1528750025660504, + "grad_norm": 0.27790905680021066, + "learning_rate": 5.551701135984519e-06, + "loss": 0.4542, + "step": 13109 + }, + { + "epoch": 2.153039229774393, + "grad_norm": 0.4239281111220191, + "learning_rate": 5.551245145781228e-06, + "loss": 0.4555, + "step": 13110 + }, + { + "epoch": 2.1532034569827356, + "grad_norm": 0.33405406926044195, + "learning_rate": 5.550789144160381e-06, + "loss": 0.4255, + "step": 13111 + }, + { + "epoch": 2.1533676841910783, + "grad_norm": 0.3045577111345963, + "learning_rate": 5.550333131126931e-06, + "loss": 0.4337, + "step": 13112 + }, + { + "epoch": 2.153531911399421, + "grad_norm": 0.3667448605274694, + "learning_rate": 5.549877106685829e-06, + "loss": 0.4398, + "step": 13113 + }, + { + "epoch": 2.153696138607764, + "grad_norm": 0.3587313899783721, + "learning_rate": 5.549421070842028e-06, + "loss": 0.4337, + "step": 13114 + }, + { + "epoch": 2.1538603658161066, + "grad_norm": 0.31611137498570135, + "learning_rate": 5.548965023600482e-06, + "loss": 0.43, + "step": 13115 + }, + { + "epoch": 2.1540245930244493, + "grad_norm": 0.41761280354962393, + "learning_rate": 5.548508964966144e-06, + "loss": 0.4389, + "step": 13116 + }, + { + "epoch": 2.154188820232792, + "grad_norm": 0.3276668465800213, + "learning_rate": 5.548052894943968e-06, + "loss": 0.4386, + "step": 13117 + }, + { + "epoch": 2.154353047441135, + "grad_norm": 0.3058632740019484, + "learning_rate": 5.547596813538905e-06, + "loss": 0.4275, + "step": 13118 + }, + { + "epoch": 2.1545172746494776, + "grad_norm": 0.3134859102629066, + "learning_rate": 5.547140720755911e-06, + "loss": 0.4368, + "step": 13119 + }, + { + "epoch": 2.1546815018578203, + "grad_norm": 1.9136277979976022, + "learning_rate": 5.546684616599937e-06, + "loss": 0.4645, + "step": 13120 + }, + { + "epoch": 2.154845729066163, + "grad_norm": 0.3081164038829003, + "learning_rate": 5.5462285010759385e-06, + "loss": 0.4404, + "step": 13121 + }, + { + "epoch": 2.155009956274506, + "grad_norm": 0.7771526801527265, + "learning_rate": 5.545772374188871e-06, + "loss": 0.4397, + "step": 13122 + }, + { + "epoch": 2.1551741834828486, + "grad_norm": 0.2715873990185136, + "learning_rate": 5.545316235943686e-06, + "loss": 0.4275, + "step": 13123 + }, + { + "epoch": 2.1553384106911913, + "grad_norm": 0.33280980643281594, + "learning_rate": 5.544860086345337e-06, + "loss": 0.4512, + "step": 13124 + }, + { + "epoch": 2.155502637899534, + "grad_norm": 0.3851568549443537, + "learning_rate": 5.54440392539878e-06, + "loss": 0.4319, + "step": 13125 + }, + { + "epoch": 2.155666865107877, + "grad_norm": 0.3063502288189881, + "learning_rate": 5.5439477531089685e-06, + "loss": 0.4368, + "step": 13126 + }, + { + "epoch": 2.1558310923162196, + "grad_norm": 0.32417940549514135, + "learning_rate": 5.543491569480859e-06, + "loss": 0.4548, + "step": 13127 + }, + { + "epoch": 2.1559953195245622, + "grad_norm": 0.2819287851853112, + "learning_rate": 5.543035374519403e-06, + "loss": 0.4314, + "step": 13128 + }, + { + "epoch": 2.156159546732905, + "grad_norm": 0.3343017480828349, + "learning_rate": 5.542579168229557e-06, + "loss": 0.4413, + "step": 13129 + }, + { + "epoch": 2.1563237739412475, + "grad_norm": 0.30548747472069426, + "learning_rate": 5.542122950616274e-06, + "loss": 0.443, + "step": 13130 + }, + { + "epoch": 2.1564880011495906, + "grad_norm": 0.27993439186124225, + "learning_rate": 5.5416667216845124e-06, + "loss": 0.4386, + "step": 13131 + }, + { + "epoch": 2.1566522283579332, + "grad_norm": 0.32577795398226056, + "learning_rate": 5.541210481439225e-06, + "loss": 0.4435, + "step": 13132 + }, + { + "epoch": 2.156816455566276, + "grad_norm": 0.32823780085029325, + "learning_rate": 5.540754229885367e-06, + "loss": 0.4507, + "step": 13133 + }, + { + "epoch": 2.1569806827746185, + "grad_norm": 0.3047427313061152, + "learning_rate": 5.5402979670278946e-06, + "loss": 0.43, + "step": 13134 + }, + { + "epoch": 2.1571449099829616, + "grad_norm": 0.340813084378149, + "learning_rate": 5.539841692871761e-06, + "loss": 0.4279, + "step": 13135 + }, + { + "epoch": 2.1573091371913042, + "grad_norm": 0.46663307065196113, + "learning_rate": 5.539385407421925e-06, + "loss": 0.4634, + "step": 13136 + }, + { + "epoch": 2.157473364399647, + "grad_norm": 0.37566933247851336, + "learning_rate": 5.538929110683342e-06, + "loss": 0.4472, + "step": 13137 + }, + { + "epoch": 2.1576375916079895, + "grad_norm": 0.30868472477797043, + "learning_rate": 5.538472802660965e-06, + "loss": 0.4586, + "step": 13138 + }, + { + "epoch": 2.1578018188163326, + "grad_norm": 0.34066658358138063, + "learning_rate": 5.538016483359751e-06, + "loss": 0.4448, + "step": 13139 + }, + { + "epoch": 2.1579660460246752, + "grad_norm": 0.4073911081151271, + "learning_rate": 5.537560152784659e-06, + "loss": 0.4563, + "step": 13140 + }, + { + "epoch": 2.158130273233018, + "grad_norm": 0.29822675903780166, + "learning_rate": 5.537103810940641e-06, + "loss": 0.4621, + "step": 13141 + }, + { + "epoch": 2.1582945004413605, + "grad_norm": 0.31971502442009747, + "learning_rate": 5.536647457832656e-06, + "loss": 0.4553, + "step": 13142 + }, + { + "epoch": 2.1584587276497036, + "grad_norm": 0.3426810765747357, + "learning_rate": 5.53619109346566e-06, + "loss": 0.4477, + "step": 13143 + }, + { + "epoch": 2.1586229548580462, + "grad_norm": 0.945124020682834, + "learning_rate": 5.5357347178446086e-06, + "loss": 0.4475, + "step": 13144 + }, + { + "epoch": 2.158787182066389, + "grad_norm": 0.4918830844122075, + "learning_rate": 5.535278330974459e-06, + "loss": 0.4576, + "step": 13145 + }, + { + "epoch": 2.1589514092747315, + "grad_norm": 0.3058756489796773, + "learning_rate": 5.534821932860169e-06, + "loss": 0.4337, + "step": 13146 + }, + { + "epoch": 2.159115636483074, + "grad_norm": 0.3120707299763528, + "learning_rate": 5.534365523506694e-06, + "loss": 0.4735, + "step": 13147 + }, + { + "epoch": 2.1592798636914172, + "grad_norm": 0.2846250730751281, + "learning_rate": 5.5339091029189925e-06, + "loss": 0.437, + "step": 13148 + }, + { + "epoch": 2.15944409089976, + "grad_norm": 0.3201064753442044, + "learning_rate": 5.53345267110202e-06, + "loss": 0.469, + "step": 13149 + }, + { + "epoch": 2.1596083181081025, + "grad_norm": 0.29766138661669544, + "learning_rate": 5.532996228060735e-06, + "loss": 0.429, + "step": 13150 + }, + { + "epoch": 2.159772545316445, + "grad_norm": 0.2846032186777436, + "learning_rate": 5.532539773800095e-06, + "loss": 0.4401, + "step": 13151 + }, + { + "epoch": 2.159936772524788, + "grad_norm": 0.3129947601427393, + "learning_rate": 5.5320833083250565e-06, + "loss": 0.446, + "step": 13152 + }, + { + "epoch": 2.160100999733131, + "grad_norm": 0.3144839696313304, + "learning_rate": 5.531626831640578e-06, + "loss": 0.452, + "step": 13153 + }, + { + "epoch": 2.1602652269414735, + "grad_norm": 0.2959377033395708, + "learning_rate": 5.531170343751617e-06, + "loss": 0.4373, + "step": 13154 + }, + { + "epoch": 2.160429454149816, + "grad_norm": 0.40203476934860716, + "learning_rate": 5.530713844663132e-06, + "loss": 0.4332, + "step": 13155 + }, + { + "epoch": 2.160593681358159, + "grad_norm": 0.34219461068539536, + "learning_rate": 5.530257334380081e-06, + "loss": 0.4348, + "step": 13156 + }, + { + "epoch": 2.160757908566502, + "grad_norm": 0.33771282218080734, + "learning_rate": 5.529800812907421e-06, + "loss": 0.4639, + "step": 13157 + }, + { + "epoch": 2.1609221357748445, + "grad_norm": 0.31856610527953794, + "learning_rate": 5.529344280250111e-06, + "loss": 0.4474, + "step": 13158 + }, + { + "epoch": 2.161086362983187, + "grad_norm": 0.3171282603923975, + "learning_rate": 5.528887736413109e-06, + "loss": 0.457, + "step": 13159 + }, + { + "epoch": 2.16125059019153, + "grad_norm": 0.35217117196778364, + "learning_rate": 5.528431181401375e-06, + "loss": 0.4619, + "step": 13160 + }, + { + "epoch": 2.161414817399873, + "grad_norm": 0.6060833146178379, + "learning_rate": 5.527974615219866e-06, + "loss": 0.4392, + "step": 13161 + }, + { + "epoch": 2.1615790446082155, + "grad_norm": 0.589550773106566, + "learning_rate": 5.527518037873542e-06, + "loss": 0.4371, + "step": 13162 + }, + { + "epoch": 2.161743271816558, + "grad_norm": 0.3145128961142026, + "learning_rate": 5.527061449367359e-06, + "loss": 0.4506, + "step": 13163 + }, + { + "epoch": 2.1619074990249008, + "grad_norm": 0.28483661690939105, + "learning_rate": 5.52660484970628e-06, + "loss": 0.4407, + "step": 13164 + }, + { + "epoch": 2.162071726233244, + "grad_norm": 0.31160106147746397, + "learning_rate": 5.526148238895262e-06, + "loss": 0.4325, + "step": 13165 + }, + { + "epoch": 2.1622359534415865, + "grad_norm": 0.8214018457380634, + "learning_rate": 5.525691616939266e-06, + "loss": 0.449, + "step": 13166 + }, + { + "epoch": 2.162400180649929, + "grad_norm": 0.42579552944763127, + "learning_rate": 5.525234983843247e-06, + "loss": 0.4375, + "step": 13167 + }, + { + "epoch": 2.1625644078582718, + "grad_norm": 0.3172437304419483, + "learning_rate": 5.524778339612168e-06, + "loss": 0.4406, + "step": 13168 + }, + { + "epoch": 2.162728635066615, + "grad_norm": 0.3581226513019574, + "learning_rate": 5.5243216842509895e-06, + "loss": 0.4547, + "step": 13169 + }, + { + "epoch": 2.1628928622749575, + "grad_norm": 1.0570246497495848, + "learning_rate": 5.523865017764668e-06, + "loss": 0.4544, + "step": 13170 + }, + { + "epoch": 2.1630570894833, + "grad_norm": 0.3390380057126517, + "learning_rate": 5.523408340158167e-06, + "loss": 0.4214, + "step": 13171 + }, + { + "epoch": 2.1632213166916427, + "grad_norm": 0.36776919352984283, + "learning_rate": 5.5229516514364426e-06, + "loss": 0.4381, + "step": 13172 + }, + { + "epoch": 2.163385543899986, + "grad_norm": 0.30982836563580607, + "learning_rate": 5.522494951604457e-06, + "loss": 0.4489, + "step": 13173 + }, + { + "epoch": 2.1635497711083285, + "grad_norm": 0.44372920909803376, + "learning_rate": 5.522038240667172e-06, + "loss": 0.455, + "step": 13174 + }, + { + "epoch": 2.163713998316671, + "grad_norm": 0.3466451063504817, + "learning_rate": 5.521581518629544e-06, + "loss": 0.4525, + "step": 13175 + }, + { + "epoch": 2.1638782255250137, + "grad_norm": 0.3106973226661059, + "learning_rate": 5.521124785496538e-06, + "loss": 0.4483, + "step": 13176 + }, + { + "epoch": 2.164042452733357, + "grad_norm": 0.28579331113902645, + "learning_rate": 5.52066804127311e-06, + "loss": 0.445, + "step": 13177 + }, + { + "epoch": 2.1642066799416995, + "grad_norm": 0.26832241525859035, + "learning_rate": 5.5202112859642245e-06, + "loss": 0.4222, + "step": 13178 + }, + { + "epoch": 2.164370907150042, + "grad_norm": 0.32216668813831933, + "learning_rate": 5.51975451957484e-06, + "loss": 0.4538, + "step": 13179 + }, + { + "epoch": 2.1645351343583847, + "grad_norm": 0.34484591491438377, + "learning_rate": 5.519297742109918e-06, + "loss": 0.4371, + "step": 13180 + }, + { + "epoch": 2.1646993615667274, + "grad_norm": 0.328862402254495, + "learning_rate": 5.518840953574418e-06, + "loss": 0.4318, + "step": 13181 + }, + { + "epoch": 2.1648635887750705, + "grad_norm": 0.27757721400634844, + "learning_rate": 5.518384153973306e-06, + "loss": 0.4412, + "step": 13182 + }, + { + "epoch": 2.165027815983413, + "grad_norm": 0.32631979517754256, + "learning_rate": 5.517927343311538e-06, + "loss": 0.4317, + "step": 13183 + }, + { + "epoch": 2.1651920431917557, + "grad_norm": 0.27432476169664666, + "learning_rate": 5.517470521594078e-06, + "loss": 0.4293, + "step": 13184 + }, + { + "epoch": 2.1653562704000984, + "grad_norm": 0.28966912344757134, + "learning_rate": 5.517013688825888e-06, + "loss": 0.4549, + "step": 13185 + }, + { + "epoch": 2.1655204976084415, + "grad_norm": 0.6730067083934345, + "learning_rate": 5.516556845011929e-06, + "loss": 0.4474, + "step": 13186 + }, + { + "epoch": 2.165684724816784, + "grad_norm": 0.29913936874253383, + "learning_rate": 5.516099990157161e-06, + "loss": 0.4675, + "step": 13187 + }, + { + "epoch": 2.1658489520251267, + "grad_norm": 0.3424981637374919, + "learning_rate": 5.515643124266546e-06, + "loss": 0.4373, + "step": 13188 + }, + { + "epoch": 2.1660131792334694, + "grad_norm": 0.30630459205370697, + "learning_rate": 5.51518624734505e-06, + "loss": 0.4465, + "step": 13189 + }, + { + "epoch": 2.1661774064418124, + "grad_norm": 0.34883759198472747, + "learning_rate": 5.514729359397632e-06, + "loss": 0.4404, + "step": 13190 + }, + { + "epoch": 2.166341633650155, + "grad_norm": 0.302842026651347, + "learning_rate": 5.5142724604292555e-06, + "loss": 0.4529, + "step": 13191 + }, + { + "epoch": 2.1665058608584977, + "grad_norm": 0.3322886428250413, + "learning_rate": 5.513815550444881e-06, + "loss": 0.4315, + "step": 13192 + }, + { + "epoch": 2.1666700880668404, + "grad_norm": 0.3008715144635831, + "learning_rate": 5.513358629449472e-06, + "loss": 0.4503, + "step": 13193 + }, + { + "epoch": 2.1668343152751834, + "grad_norm": 0.31224920669877515, + "learning_rate": 5.512901697447992e-06, + "loss": 0.4351, + "step": 13194 + }, + { + "epoch": 2.166998542483526, + "grad_norm": 0.28463358286996604, + "learning_rate": 5.512444754445403e-06, + "loss": 0.4423, + "step": 13195 + }, + { + "epoch": 2.1671627696918687, + "grad_norm": 0.4034444644429901, + "learning_rate": 5.511987800446668e-06, + "loss": 0.4404, + "step": 13196 + }, + { + "epoch": 2.1673269969002114, + "grad_norm": 0.38665482565892567, + "learning_rate": 5.511530835456749e-06, + "loss": 0.4366, + "step": 13197 + }, + { + "epoch": 2.167491224108554, + "grad_norm": 0.3494804160464828, + "learning_rate": 5.51107385948061e-06, + "loss": 0.4525, + "step": 13198 + }, + { + "epoch": 2.167655451316897, + "grad_norm": 0.3841231205093899, + "learning_rate": 5.510616872523214e-06, + "loss": 0.4509, + "step": 13199 + }, + { + "epoch": 2.1678196785252397, + "grad_norm": 0.28514366579375827, + "learning_rate": 5.510159874589527e-06, + "loss": 0.4426, + "step": 13200 + }, + { + "epoch": 2.1679839057335824, + "grad_norm": 0.3620023664594434, + "learning_rate": 5.5097028656845065e-06, + "loss": 0.4358, + "step": 13201 + }, + { + "epoch": 2.168148132941925, + "grad_norm": 0.32201321607882394, + "learning_rate": 5.509245845813121e-06, + "loss": 0.4403, + "step": 13202 + }, + { + "epoch": 2.168312360150268, + "grad_norm": 0.3210763284964427, + "learning_rate": 5.508788814980333e-06, + "loss": 0.4568, + "step": 13203 + }, + { + "epoch": 2.1684765873586107, + "grad_norm": 0.3317228382026454, + "learning_rate": 5.508331773191104e-06, + "loss": 0.432, + "step": 13204 + }, + { + "epoch": 2.1686408145669533, + "grad_norm": 0.30859052454192315, + "learning_rate": 5.507874720450403e-06, + "loss": 0.4565, + "step": 13205 + }, + { + "epoch": 2.168805041775296, + "grad_norm": 0.27373704128356613, + "learning_rate": 5.507417656763189e-06, + "loss": 0.4408, + "step": 13206 + }, + { + "epoch": 2.168969268983639, + "grad_norm": 0.390830204231793, + "learning_rate": 5.506960582134428e-06, + "loss": 0.4389, + "step": 13207 + }, + { + "epoch": 2.1691334961919817, + "grad_norm": 0.316536079441099, + "learning_rate": 5.506503496569085e-06, + "loss": 0.4676, + "step": 13208 + }, + { + "epoch": 2.1692977234003243, + "grad_norm": 0.30981856257732066, + "learning_rate": 5.506046400072122e-06, + "loss": 0.4464, + "step": 13209 + }, + { + "epoch": 2.169461950608667, + "grad_norm": 0.8040089066478628, + "learning_rate": 5.505589292648508e-06, + "loss": 0.4571, + "step": 13210 + }, + { + "epoch": 2.16962617781701, + "grad_norm": 0.29944619080582335, + "learning_rate": 5.505132174303204e-06, + "loss": 0.4429, + "step": 13211 + }, + { + "epoch": 2.1697904050253527, + "grad_norm": 0.2931380112774569, + "learning_rate": 5.504675045041174e-06, + "loss": 0.432, + "step": 13212 + }, + { + "epoch": 2.1699546322336953, + "grad_norm": 0.32723713071086563, + "learning_rate": 5.504217904867386e-06, + "loss": 0.4673, + "step": 13213 + }, + { + "epoch": 2.170118859442038, + "grad_norm": 0.2680010066570768, + "learning_rate": 5.503760753786804e-06, + "loss": 0.4677, + "step": 13214 + }, + { + "epoch": 2.1702830866503806, + "grad_norm": 0.29538207235909225, + "learning_rate": 5.503303591804392e-06, + "loss": 0.4396, + "step": 13215 + }, + { + "epoch": 2.1704473138587237, + "grad_norm": 0.3072993424577965, + "learning_rate": 5.5028464189251155e-06, + "loss": 0.4505, + "step": 13216 + }, + { + "epoch": 2.1706115410670663, + "grad_norm": 0.311688439811001, + "learning_rate": 5.502389235153941e-06, + "loss": 0.4313, + "step": 13217 + }, + { + "epoch": 2.170775768275409, + "grad_norm": 0.35361331964980885, + "learning_rate": 5.501932040495832e-06, + "loss": 0.4646, + "step": 13218 + }, + { + "epoch": 2.1709399954837516, + "grad_norm": 0.339514386407374, + "learning_rate": 5.501474834955756e-06, + "loss": 0.4427, + "step": 13219 + }, + { + "epoch": 2.1711042226920947, + "grad_norm": 0.3359321760848037, + "learning_rate": 5.501017618538679e-06, + "loss": 0.4365, + "step": 13220 + }, + { + "epoch": 2.1712684499004373, + "grad_norm": 0.29013968578222354, + "learning_rate": 5.500560391249565e-06, + "loss": 0.4308, + "step": 13221 + }, + { + "epoch": 2.17143267710878, + "grad_norm": 0.33718799981468217, + "learning_rate": 5.5001031530933794e-06, + "loss": 0.4393, + "step": 13222 + }, + { + "epoch": 2.1715969043171226, + "grad_norm": 0.3303170855751071, + "learning_rate": 5.499645904075091e-06, + "loss": 0.4309, + "step": 13223 + }, + { + "epoch": 2.1717611315254657, + "grad_norm": 0.3150015557314923, + "learning_rate": 5.499188644199664e-06, + "loss": 0.4339, + "step": 13224 + }, + { + "epoch": 2.1719253587338083, + "grad_norm": 0.3075108331038909, + "learning_rate": 5.4987313734720665e-06, + "loss": 0.451, + "step": 13225 + }, + { + "epoch": 2.172089585942151, + "grad_norm": 0.33633278099740577, + "learning_rate": 5.4982740918972625e-06, + "loss": 0.431, + "step": 13226 + }, + { + "epoch": 2.1722538131504936, + "grad_norm": 0.2829229596881379, + "learning_rate": 5.497816799480219e-06, + "loss": 0.4352, + "step": 13227 + }, + { + "epoch": 2.1724180403588367, + "grad_norm": 0.2738157895536735, + "learning_rate": 5.497359496225905e-06, + "loss": 0.4403, + "step": 13228 + }, + { + "epoch": 2.1725822675671793, + "grad_norm": 0.3134244993684418, + "learning_rate": 5.496902182139286e-06, + "loss": 0.4459, + "step": 13229 + }, + { + "epoch": 2.172746494775522, + "grad_norm": 0.3357126734986133, + "learning_rate": 5.496444857225326e-06, + "loss": 0.4443, + "step": 13230 + }, + { + "epoch": 2.1729107219838646, + "grad_norm": 0.3924307054131407, + "learning_rate": 5.495987521488996e-06, + "loss": 0.437, + "step": 13231 + }, + { + "epoch": 2.1730749491922072, + "grad_norm": 0.3104051211706088, + "learning_rate": 5.495530174935261e-06, + "loss": 0.446, + "step": 13232 + }, + { + "epoch": 2.1732391764005503, + "grad_norm": 0.38846412274853226, + "learning_rate": 5.49507281756909e-06, + "loss": 0.4555, + "step": 13233 + }, + { + "epoch": 2.173403403608893, + "grad_norm": 0.3313264905531258, + "learning_rate": 5.4946154493954495e-06, + "loss": 0.4344, + "step": 13234 + }, + { + "epoch": 2.1735676308172356, + "grad_norm": 0.3243910241314963, + "learning_rate": 5.494158070419304e-06, + "loss": 0.4423, + "step": 13235 + }, + { + "epoch": 2.1737318580255782, + "grad_norm": 0.5319929603313633, + "learning_rate": 5.493700680645626e-06, + "loss": 0.4391, + "step": 13236 + }, + { + "epoch": 2.1738960852339213, + "grad_norm": 0.26307427359923147, + "learning_rate": 5.49324328007938e-06, + "loss": 0.4479, + "step": 13237 + }, + { + "epoch": 2.174060312442264, + "grad_norm": 0.3178794271208854, + "learning_rate": 5.492785868725535e-06, + "loss": 0.4577, + "step": 13238 + }, + { + "epoch": 2.1742245396506066, + "grad_norm": 0.2809221309088171, + "learning_rate": 5.49232844658906e-06, + "loss": 0.469, + "step": 13239 + }, + { + "epoch": 2.174388766858949, + "grad_norm": 0.3735418362737004, + "learning_rate": 5.491871013674921e-06, + "loss": 0.4233, + "step": 13240 + }, + { + "epoch": 2.1745529940672923, + "grad_norm": 0.2725105093139469, + "learning_rate": 5.491413569988085e-06, + "loss": 0.4543, + "step": 13241 + }, + { + "epoch": 2.174717221275635, + "grad_norm": 0.3151526891984052, + "learning_rate": 5.490956115533523e-06, + "loss": 0.4607, + "step": 13242 + }, + { + "epoch": 2.1748814484839776, + "grad_norm": 0.36699330490616217, + "learning_rate": 5.4904986503162035e-06, + "loss": 0.4554, + "step": 13243 + }, + { + "epoch": 2.17504567569232, + "grad_norm": 0.3380410448393794, + "learning_rate": 5.490041174341094e-06, + "loss": 0.4351, + "step": 13244 + }, + { + "epoch": 2.1752099029006633, + "grad_norm": 0.3373211564238718, + "learning_rate": 5.489583687613164e-06, + "loss": 0.4429, + "step": 13245 + }, + { + "epoch": 2.175374130109006, + "grad_norm": 0.3425286568997985, + "learning_rate": 5.48912619013738e-06, + "loss": 0.4482, + "step": 13246 + }, + { + "epoch": 2.1755383573173486, + "grad_norm": 0.30984839575858375, + "learning_rate": 5.488668681918712e-06, + "loss": 0.4354, + "step": 13247 + }, + { + "epoch": 2.175702584525691, + "grad_norm": 0.2957698078978961, + "learning_rate": 5.488211162962132e-06, + "loss": 0.4426, + "step": 13248 + }, + { + "epoch": 2.175866811734034, + "grad_norm": 0.3545053186690607, + "learning_rate": 5.487753633272605e-06, + "loss": 0.4529, + "step": 13249 + }, + { + "epoch": 2.176031038942377, + "grad_norm": 0.345155396866791, + "learning_rate": 5.4872960928551015e-06, + "loss": 0.4319, + "step": 13250 + }, + { + "epoch": 2.1761952661507196, + "grad_norm": 0.3096685753746912, + "learning_rate": 5.4868385417145905e-06, + "loss": 0.4251, + "step": 13251 + }, + { + "epoch": 2.176359493359062, + "grad_norm": 0.34819121870325165, + "learning_rate": 5.486380979856042e-06, + "loss": 0.439, + "step": 13252 + }, + { + "epoch": 2.176523720567405, + "grad_norm": 0.365145070586753, + "learning_rate": 5.485923407284428e-06, + "loss": 0.4552, + "step": 13253 + }, + { + "epoch": 2.176687947775748, + "grad_norm": 0.302409582339075, + "learning_rate": 5.4854658240047145e-06, + "loss": 0.4576, + "step": 13254 + }, + { + "epoch": 2.1768521749840906, + "grad_norm": 0.3099827305656743, + "learning_rate": 5.4850082300218725e-06, + "loss": 0.4182, + "step": 13255 + }, + { + "epoch": 2.177016402192433, + "grad_norm": 0.3303933405482676, + "learning_rate": 5.4845506253408705e-06, + "loss": 0.4438, + "step": 13256 + }, + { + "epoch": 2.177180629400776, + "grad_norm": 0.32728970434710136, + "learning_rate": 5.484093009966682e-06, + "loss": 0.4346, + "step": 13257 + }, + { + "epoch": 2.177344856609119, + "grad_norm": 0.32484025588411, + "learning_rate": 5.483635383904273e-06, + "loss": 0.4507, + "step": 13258 + }, + { + "epoch": 2.1775090838174616, + "grad_norm": 0.3801502598555356, + "learning_rate": 5.483177747158619e-06, + "loss": 0.4434, + "step": 13259 + }, + { + "epoch": 2.177673311025804, + "grad_norm": 0.2890146137698474, + "learning_rate": 5.482720099734686e-06, + "loss": 0.4617, + "step": 13260 + }, + { + "epoch": 2.177837538234147, + "grad_norm": 0.3036280477508943, + "learning_rate": 5.482262441637445e-06, + "loss": 0.4692, + "step": 13261 + }, + { + "epoch": 2.17800176544249, + "grad_norm": 0.2767654696097153, + "learning_rate": 5.481804772871868e-06, + "loss": 0.4392, + "step": 13262 + }, + { + "epoch": 2.1781659926508325, + "grad_norm": 0.3103322912436335, + "learning_rate": 5.481347093442926e-06, + "loss": 0.4351, + "step": 13263 + }, + { + "epoch": 2.178330219859175, + "grad_norm": 0.3730970128678796, + "learning_rate": 5.480889403355589e-06, + "loss": 0.446, + "step": 13264 + }, + { + "epoch": 2.178494447067518, + "grad_norm": 0.4406758847281105, + "learning_rate": 5.4804317026148274e-06, + "loss": 0.449, + "step": 13265 + }, + { + "epoch": 2.1786586742758605, + "grad_norm": 0.31286994077533276, + "learning_rate": 5.4799739912256126e-06, + "loss": 0.4337, + "step": 13266 + }, + { + "epoch": 2.1788229014842035, + "grad_norm": 0.31232970447969627, + "learning_rate": 5.479516269192915e-06, + "loss": 0.4538, + "step": 13267 + }, + { + "epoch": 2.178987128692546, + "grad_norm": 0.39424467153664117, + "learning_rate": 5.47905853652171e-06, + "loss": 0.4461, + "step": 13268 + }, + { + "epoch": 2.179151355900889, + "grad_norm": 0.2763369646076082, + "learning_rate": 5.4786007932169634e-06, + "loss": 0.4412, + "step": 13269 + }, + { + "epoch": 2.1793155831092315, + "grad_norm": 0.3549322006277845, + "learning_rate": 5.478143039283651e-06, + "loss": 0.4395, + "step": 13270 + }, + { + "epoch": 2.1794798103175745, + "grad_norm": 0.3806948554055574, + "learning_rate": 5.477685274726741e-06, + "loss": 0.4377, + "step": 13271 + }, + { + "epoch": 2.179644037525917, + "grad_norm": 0.36432326690094835, + "learning_rate": 5.477227499551208e-06, + "loss": 0.4354, + "step": 13272 + }, + { + "epoch": 2.17980826473426, + "grad_norm": 0.4599377738684147, + "learning_rate": 5.476769713762024e-06, + "loss": 0.4674, + "step": 13273 + }, + { + "epoch": 2.1799724919426025, + "grad_norm": 0.3250357559341009, + "learning_rate": 5.47631191736416e-06, + "loss": 0.4174, + "step": 13274 + }, + { + "epoch": 2.1801367191509455, + "grad_norm": 0.32868619559098194, + "learning_rate": 5.475854110362586e-06, + "loss": 0.4413, + "step": 13275 + }, + { + "epoch": 2.180300946359288, + "grad_norm": 0.34100178065605913, + "learning_rate": 5.475396292762278e-06, + "loss": 0.4561, + "step": 13276 + }, + { + "epoch": 2.180465173567631, + "grad_norm": 0.5793751621914814, + "learning_rate": 5.4749384645682054e-06, + "loss": 0.4586, + "step": 13277 + }, + { + "epoch": 2.1806294007759734, + "grad_norm": 0.4069008528073191, + "learning_rate": 5.474480625785343e-06, + "loss": 0.4164, + "step": 13278 + }, + { + "epoch": 2.1807936279843165, + "grad_norm": 0.3734986648935865, + "learning_rate": 5.474022776418661e-06, + "loss": 0.4493, + "step": 13279 + }, + { + "epoch": 2.180957855192659, + "grad_norm": 0.3441849653741825, + "learning_rate": 5.473564916473134e-06, + "loss": 0.4106, + "step": 13280 + }, + { + "epoch": 2.181122082401002, + "grad_norm": 0.29951787954088177, + "learning_rate": 5.473107045953734e-06, + "loss": 0.4416, + "step": 13281 + }, + { + "epoch": 2.1812863096093444, + "grad_norm": 0.4575444456948254, + "learning_rate": 5.472649164865434e-06, + "loss": 0.4501, + "step": 13282 + }, + { + "epoch": 2.181450536817687, + "grad_norm": 0.3720739585342384, + "learning_rate": 5.472191273213208e-06, + "loss": 0.433, + "step": 13283 + }, + { + "epoch": 2.18161476402603, + "grad_norm": 0.3481388997776785, + "learning_rate": 5.471733371002027e-06, + "loss": 0.4431, + "step": 13284 + }, + { + "epoch": 2.181778991234373, + "grad_norm": 0.31569298160947046, + "learning_rate": 5.471275458236865e-06, + "loss": 0.4709, + "step": 13285 + }, + { + "epoch": 2.1819432184427154, + "grad_norm": 0.3675943375619008, + "learning_rate": 5.470817534922698e-06, + "loss": 0.4618, + "step": 13286 + }, + { + "epoch": 2.182107445651058, + "grad_norm": 0.2889262086312248, + "learning_rate": 5.470359601064495e-06, + "loss": 0.4383, + "step": 13287 + }, + { + "epoch": 2.182271672859401, + "grad_norm": 0.38329425216994023, + "learning_rate": 5.469901656667235e-06, + "loss": 0.437, + "step": 13288 + }, + { + "epoch": 2.182435900067744, + "grad_norm": 0.3052468220993611, + "learning_rate": 5.469443701735887e-06, + "loss": 0.4634, + "step": 13289 + }, + { + "epoch": 2.1826001272760864, + "grad_norm": 0.3603417523141972, + "learning_rate": 5.468985736275426e-06, + "loss": 0.4242, + "step": 13290 + }, + { + "epoch": 2.182764354484429, + "grad_norm": 0.27351967521952214, + "learning_rate": 5.468527760290828e-06, + "loss": 0.4479, + "step": 13291 + }, + { + "epoch": 2.182928581692772, + "grad_norm": 0.7082922791523382, + "learning_rate": 5.468069773787066e-06, + "loss": 0.4306, + "step": 13292 + }, + { + "epoch": 2.183092808901115, + "grad_norm": 0.3798345765622769, + "learning_rate": 5.467611776769112e-06, + "loss": 0.4247, + "step": 13293 + }, + { + "epoch": 2.1832570361094574, + "grad_norm": 0.532693372970332, + "learning_rate": 5.467153769241942e-06, + "loss": 0.4451, + "step": 13294 + }, + { + "epoch": 2.1834212633178, + "grad_norm": 0.37091825250503574, + "learning_rate": 5.466695751210532e-06, + "loss": 0.4563, + "step": 13295 + }, + { + "epoch": 2.183585490526143, + "grad_norm": 0.3210104071626388, + "learning_rate": 5.466237722679854e-06, + "loss": 0.435, + "step": 13296 + }, + { + "epoch": 2.183749717734486, + "grad_norm": 0.280191494056349, + "learning_rate": 5.465779683654884e-06, + "loss": 0.4162, + "step": 13297 + }, + { + "epoch": 2.1839139449428284, + "grad_norm": 0.3062510831224376, + "learning_rate": 5.465321634140597e-06, + "loss": 0.4377, + "step": 13298 + }, + { + "epoch": 2.184078172151171, + "grad_norm": 0.5031795325550483, + "learning_rate": 5.464863574141968e-06, + "loss": 0.4578, + "step": 13299 + }, + { + "epoch": 2.1842423993595137, + "grad_norm": 0.3211714977323895, + "learning_rate": 5.464405503663969e-06, + "loss": 0.436, + "step": 13300 + }, + { + "epoch": 2.184406626567857, + "grad_norm": 0.3563243829030005, + "learning_rate": 5.463947422711578e-06, + "loss": 0.4407, + "step": 13301 + }, + { + "epoch": 2.1845708537761994, + "grad_norm": 0.5806060878516683, + "learning_rate": 5.46348933128977e-06, + "loss": 0.4444, + "step": 13302 + }, + { + "epoch": 2.184735080984542, + "grad_norm": 0.3560099487190159, + "learning_rate": 5.463031229403521e-06, + "loss": 0.4552, + "step": 13303 + }, + { + "epoch": 2.1848993081928847, + "grad_norm": 0.27206041929266034, + "learning_rate": 5.462573117057804e-06, + "loss": 0.4567, + "step": 13304 + }, + { + "epoch": 2.1850635354012278, + "grad_norm": 0.3331834230070897, + "learning_rate": 5.462114994257596e-06, + "loss": 0.4334, + "step": 13305 + }, + { + "epoch": 2.1852277626095704, + "grad_norm": 0.31765346174580844, + "learning_rate": 5.461656861007872e-06, + "loss": 0.4502, + "step": 13306 + }, + { + "epoch": 2.185391989817913, + "grad_norm": 0.38588394716998564, + "learning_rate": 5.461198717313611e-06, + "loss": 0.4341, + "step": 13307 + }, + { + "epoch": 2.1855562170262557, + "grad_norm": 0.37336262040504764, + "learning_rate": 5.460740563179784e-06, + "loss": 0.4542, + "step": 13308 + }, + { + "epoch": 2.1857204442345988, + "grad_norm": 0.30693822439100865, + "learning_rate": 5.46028239861137e-06, + "loss": 0.4285, + "step": 13309 + }, + { + "epoch": 2.1858846714429414, + "grad_norm": 0.35159606342901895, + "learning_rate": 5.4598242236133434e-06, + "loss": 0.439, + "step": 13310 + }, + { + "epoch": 2.186048898651284, + "grad_norm": 0.5059138852284416, + "learning_rate": 5.459366038190682e-06, + "loss": 0.4463, + "step": 13311 + }, + { + "epoch": 2.1862131258596267, + "grad_norm": 0.42008476646526727, + "learning_rate": 5.458907842348362e-06, + "loss": 0.4468, + "step": 13312 + }, + { + "epoch": 2.1863773530679698, + "grad_norm": 0.4795580884704337, + "learning_rate": 5.458449636091359e-06, + "loss": 0.4393, + "step": 13313 + }, + { + "epoch": 2.1865415802763124, + "grad_norm": 0.2752909964148785, + "learning_rate": 5.457991419424649e-06, + "loss": 0.4263, + "step": 13314 + }, + { + "epoch": 2.186705807484655, + "grad_norm": 0.2727469759578638, + "learning_rate": 5.45753319235321e-06, + "loss": 0.4538, + "step": 13315 + }, + { + "epoch": 2.1868700346929977, + "grad_norm": 0.4963736171783304, + "learning_rate": 5.45707495488202e-06, + "loss": 0.4378, + "step": 13316 + }, + { + "epoch": 2.1870342619013403, + "grad_norm": 0.3257932535539851, + "learning_rate": 5.456616707016054e-06, + "loss": 0.46, + "step": 13317 + }, + { + "epoch": 2.1871984891096834, + "grad_norm": 0.30857930741888084, + "learning_rate": 5.456158448760289e-06, + "loss": 0.4387, + "step": 13318 + }, + { + "epoch": 2.187362716318026, + "grad_norm": 0.3837863937799594, + "learning_rate": 5.455700180119701e-06, + "loss": 0.4317, + "step": 13319 + }, + { + "epoch": 2.1875269435263687, + "grad_norm": 0.46225930270367893, + "learning_rate": 5.45524190109927e-06, + "loss": 0.4528, + "step": 13320 + }, + { + "epoch": 2.1876911707347113, + "grad_norm": 0.302233763714421, + "learning_rate": 5.454783611703972e-06, + "loss": 0.4556, + "step": 13321 + }, + { + "epoch": 2.1878553979430544, + "grad_norm": 0.34696738770294017, + "learning_rate": 5.454325311938786e-06, + "loss": 0.4173, + "step": 13322 + }, + { + "epoch": 2.188019625151397, + "grad_norm": 0.28421677391663946, + "learning_rate": 5.453867001808686e-06, + "loss": 0.4347, + "step": 13323 + }, + { + "epoch": 2.1881838523597397, + "grad_norm": 0.2761714620499444, + "learning_rate": 5.453408681318653e-06, + "loss": 0.4571, + "step": 13324 + }, + { + "epoch": 2.1883480795680823, + "grad_norm": 0.28369005370351114, + "learning_rate": 5.452950350473663e-06, + "loss": 0.4234, + "step": 13325 + }, + { + "epoch": 2.1885123067764254, + "grad_norm": 0.3172399933133409, + "learning_rate": 5.452492009278697e-06, + "loss": 0.4257, + "step": 13326 + }, + { + "epoch": 2.188676533984768, + "grad_norm": 0.38036218554406415, + "learning_rate": 5.452033657738727e-06, + "loss": 0.4449, + "step": 13327 + }, + { + "epoch": 2.1888407611931107, + "grad_norm": 0.2983968064999166, + "learning_rate": 5.4515752958587376e-06, + "loss": 0.4528, + "step": 13328 + }, + { + "epoch": 2.1890049884014533, + "grad_norm": 0.4085831022949406, + "learning_rate": 5.4511169236437026e-06, + "loss": 0.4323, + "step": 13329 + }, + { + "epoch": 2.1891692156097964, + "grad_norm": 0.3073148198837579, + "learning_rate": 5.450658541098603e-06, + "loss": 0.4251, + "step": 13330 + }, + { + "epoch": 2.189333442818139, + "grad_norm": 0.3552304549244225, + "learning_rate": 5.450200148228416e-06, + "loss": 0.4332, + "step": 13331 + }, + { + "epoch": 2.1894976700264817, + "grad_norm": 0.4129208416334564, + "learning_rate": 5.449741745038121e-06, + "loss": 0.4513, + "step": 13332 + }, + { + "epoch": 2.1896618972348243, + "grad_norm": 0.45681481402253127, + "learning_rate": 5.449283331532696e-06, + "loss": 0.4273, + "step": 13333 + }, + { + "epoch": 2.189826124443167, + "grad_norm": 0.28285821802334854, + "learning_rate": 5.4488249077171185e-06, + "loss": 0.4364, + "step": 13334 + }, + { + "epoch": 2.18999035165151, + "grad_norm": 0.3035145508717711, + "learning_rate": 5.44836647359637e-06, + "loss": 0.4488, + "step": 13335 + }, + { + "epoch": 2.1901545788598527, + "grad_norm": 0.44449304721783334, + "learning_rate": 5.447908029175429e-06, + "loss": 0.437, + "step": 13336 + }, + { + "epoch": 2.1903188060681953, + "grad_norm": 0.3076759098785597, + "learning_rate": 5.447449574459275e-06, + "loss": 0.4598, + "step": 13337 + }, + { + "epoch": 2.190483033276538, + "grad_norm": 0.35226795756882684, + "learning_rate": 5.446991109452884e-06, + "loss": 0.4451, + "step": 13338 + }, + { + "epoch": 2.190647260484881, + "grad_norm": 0.3400446784915832, + "learning_rate": 5.44653263416124e-06, + "loss": 0.4341, + "step": 13339 + }, + { + "epoch": 2.1908114876932236, + "grad_norm": 0.3271085565370917, + "learning_rate": 5.446074148589319e-06, + "loss": 0.4204, + "step": 13340 + }, + { + "epoch": 2.1909757149015663, + "grad_norm": 0.31458599529445347, + "learning_rate": 5.445615652742105e-06, + "loss": 0.4514, + "step": 13341 + }, + { + "epoch": 2.191139942109909, + "grad_norm": 0.3009846308571748, + "learning_rate": 5.445157146624571e-06, + "loss": 0.4541, + "step": 13342 + }, + { + "epoch": 2.191304169318252, + "grad_norm": 0.31103393177917293, + "learning_rate": 5.444698630241701e-06, + "loss": 0.4652, + "step": 13343 + }, + { + "epoch": 2.1914683965265946, + "grad_norm": 0.283057430214247, + "learning_rate": 5.444240103598475e-06, + "loss": 0.4532, + "step": 13344 + }, + { + "epoch": 2.1916326237349373, + "grad_norm": 0.29297163882420746, + "learning_rate": 5.4437815666998725e-06, + "loss": 0.4457, + "step": 13345 + }, + { + "epoch": 2.19179685094328, + "grad_norm": 0.3287114100302679, + "learning_rate": 5.4433230195508744e-06, + "loss": 0.4414, + "step": 13346 + }, + { + "epoch": 2.191961078151623, + "grad_norm": 0.33619588189444394, + "learning_rate": 5.442864462156459e-06, + "loss": 0.4293, + "step": 13347 + }, + { + "epoch": 2.1921253053599656, + "grad_norm": 0.30645737669849693, + "learning_rate": 5.442405894521608e-06, + "loss": 0.4648, + "step": 13348 + }, + { + "epoch": 2.1922895325683083, + "grad_norm": 0.29798132461815596, + "learning_rate": 5.441947316651303e-06, + "loss": 0.4462, + "step": 13349 + }, + { + "epoch": 2.192453759776651, + "grad_norm": 0.3376869196756182, + "learning_rate": 5.441488728550522e-06, + "loss": 0.4192, + "step": 13350 + }, + { + "epoch": 2.1926179869849935, + "grad_norm": 0.3381138888087342, + "learning_rate": 5.4410301302242485e-06, + "loss": 0.4398, + "step": 13351 + }, + { + "epoch": 2.1927822141933366, + "grad_norm": 0.3274713175485761, + "learning_rate": 5.440571521677461e-06, + "loss": 0.4326, + "step": 13352 + }, + { + "epoch": 2.1929464414016793, + "grad_norm": 0.3378498619220711, + "learning_rate": 5.440112902915141e-06, + "loss": 0.4504, + "step": 13353 + }, + { + "epoch": 2.193110668610022, + "grad_norm": 0.3158528798529663, + "learning_rate": 5.439654273942271e-06, + "loss": 0.4214, + "step": 13354 + }, + { + "epoch": 2.1932748958183645, + "grad_norm": 0.3505857824282715, + "learning_rate": 5.439195634763829e-06, + "loss": 0.4612, + "step": 13355 + }, + { + "epoch": 2.1934391230267076, + "grad_norm": 0.28799065331510204, + "learning_rate": 5.438736985384801e-06, + "loss": 0.4298, + "step": 13356 + }, + { + "epoch": 2.1936033502350503, + "grad_norm": 0.6200882777286091, + "learning_rate": 5.438278325810165e-06, + "loss": 0.4536, + "step": 13357 + }, + { + "epoch": 2.193767577443393, + "grad_norm": 0.270771121171262, + "learning_rate": 5.437819656044903e-06, + "loss": 0.4481, + "step": 13358 + }, + { + "epoch": 2.1939318046517355, + "grad_norm": 0.42757081634472144, + "learning_rate": 5.437360976093996e-06, + "loss": 0.4518, + "step": 13359 + }, + { + "epoch": 2.1940960318600786, + "grad_norm": 0.365960276967426, + "learning_rate": 5.436902285962429e-06, + "loss": 0.4292, + "step": 13360 + }, + { + "epoch": 2.1942602590684213, + "grad_norm": 0.3204066220765556, + "learning_rate": 5.436443585655178e-06, + "loss": 0.4467, + "step": 13361 + }, + { + "epoch": 2.194424486276764, + "grad_norm": 0.623873347080686, + "learning_rate": 5.435984875177231e-06, + "loss": 0.4261, + "step": 13362 + }, + { + "epoch": 2.1945887134851065, + "grad_norm": 0.338318483075795, + "learning_rate": 5.435526154533565e-06, + "loss": 0.4425, + "step": 13363 + }, + { + "epoch": 2.1947529406934496, + "grad_norm": 0.2930560099271347, + "learning_rate": 5.4350674237291666e-06, + "loss": 0.4622, + "step": 13364 + }, + { + "epoch": 2.1949171679017923, + "grad_norm": 0.4904425915483045, + "learning_rate": 5.434608682769016e-06, + "loss": 0.4576, + "step": 13365 + }, + { + "epoch": 2.195081395110135, + "grad_norm": 0.36543303457059684, + "learning_rate": 5.434149931658095e-06, + "loss": 0.4477, + "step": 13366 + }, + { + "epoch": 2.1952456223184775, + "grad_norm": 0.373200542196656, + "learning_rate": 5.433691170401385e-06, + "loss": 0.4362, + "step": 13367 + }, + { + "epoch": 2.19540984952682, + "grad_norm": 0.3640609524375743, + "learning_rate": 5.433232399003872e-06, + "loss": 0.424, + "step": 13368 + }, + { + "epoch": 2.1955740767351632, + "grad_norm": 0.31082790107927044, + "learning_rate": 5.4327736174705355e-06, + "loss": 0.4329, + "step": 13369 + }, + { + "epoch": 2.195738303943506, + "grad_norm": 0.5255387391693609, + "learning_rate": 5.432314825806362e-06, + "loss": 0.4538, + "step": 13370 + }, + { + "epoch": 2.1959025311518485, + "grad_norm": 0.3212060850181198, + "learning_rate": 5.431856024016333e-06, + "loss": 0.4353, + "step": 13371 + }, + { + "epoch": 2.196066758360191, + "grad_norm": 0.313522628120879, + "learning_rate": 5.4313972121054275e-06, + "loss": 0.4384, + "step": 13372 + }, + { + "epoch": 2.1962309855685342, + "grad_norm": 0.3749338646774257, + "learning_rate": 5.430938390078634e-06, + "loss": 0.4467, + "step": 13373 + }, + { + "epoch": 2.196395212776877, + "grad_norm": 0.2977863779812921, + "learning_rate": 5.430479557940933e-06, + "loss": 0.4495, + "step": 13374 + }, + { + "epoch": 2.1965594399852195, + "grad_norm": 0.28315765122651704, + "learning_rate": 5.430020715697309e-06, + "loss": 0.4448, + "step": 13375 + }, + { + "epoch": 2.196723667193562, + "grad_norm": 0.30598231606697035, + "learning_rate": 5.429561863352744e-06, + "loss": 0.4422, + "step": 13376 + }, + { + "epoch": 2.1968878944019052, + "grad_norm": 0.35302107028478596, + "learning_rate": 5.429103000912222e-06, + "loss": 0.4544, + "step": 13377 + }, + { + "epoch": 2.197052121610248, + "grad_norm": 0.8169332006027017, + "learning_rate": 5.42864412838073e-06, + "loss": 0.4185, + "step": 13378 + }, + { + "epoch": 2.1972163488185905, + "grad_norm": 0.3353800209447468, + "learning_rate": 5.4281852457632475e-06, + "loss": 0.4456, + "step": 13379 + }, + { + "epoch": 2.197380576026933, + "grad_norm": 0.29430933701926504, + "learning_rate": 5.42772635306476e-06, + "loss": 0.4505, + "step": 13380 + }, + { + "epoch": 2.1975448032352762, + "grad_norm": 0.448133699112051, + "learning_rate": 5.427267450290251e-06, + "loss": 0.4529, + "step": 13381 + }, + { + "epoch": 2.197709030443619, + "grad_norm": 0.32960691354841337, + "learning_rate": 5.426808537444707e-06, + "loss": 0.4342, + "step": 13382 + }, + { + "epoch": 2.1978732576519615, + "grad_norm": 0.33870856955694467, + "learning_rate": 5.42634961453311e-06, + "loss": 0.4419, + "step": 13383 + }, + { + "epoch": 2.198037484860304, + "grad_norm": 0.34599685380848416, + "learning_rate": 5.425890681560443e-06, + "loss": 0.4368, + "step": 13384 + }, + { + "epoch": 2.198201712068647, + "grad_norm": 0.5129549940134407, + "learning_rate": 5.425431738531693e-06, + "loss": 0.4605, + "step": 13385 + }, + { + "epoch": 2.19836593927699, + "grad_norm": 0.3833834807966303, + "learning_rate": 5.424972785451844e-06, + "loss": 0.4247, + "step": 13386 + }, + { + "epoch": 2.1985301664853325, + "grad_norm": 0.3204893316070029, + "learning_rate": 5.424513822325881e-06, + "loss": 0.4422, + "step": 13387 + }, + { + "epoch": 2.198694393693675, + "grad_norm": 0.36698443042704193, + "learning_rate": 5.4240548491587885e-06, + "loss": 0.413, + "step": 13388 + }, + { + "epoch": 2.198858620902018, + "grad_norm": 0.3811017115206296, + "learning_rate": 5.4235958659555495e-06, + "loss": 0.4781, + "step": 13389 + }, + { + "epoch": 2.199022848110361, + "grad_norm": 0.32255326893783004, + "learning_rate": 5.4231368727211526e-06, + "loss": 0.4523, + "step": 13390 + }, + { + "epoch": 2.1991870753187035, + "grad_norm": 0.572634516628206, + "learning_rate": 5.422677869460581e-06, + "loss": 0.4524, + "step": 13391 + }, + { + "epoch": 2.199351302527046, + "grad_norm": 0.3633323388041515, + "learning_rate": 5.422218856178818e-06, + "loss": 0.422, + "step": 13392 + }, + { + "epoch": 2.1995155297353888, + "grad_norm": 0.34242897491608737, + "learning_rate": 5.421759832880852e-06, + "loss": 0.4436, + "step": 13393 + }, + { + "epoch": 2.199679756943732, + "grad_norm": 0.36406147142191303, + "learning_rate": 5.421300799571668e-06, + "loss": 0.4312, + "step": 13394 + }, + { + "epoch": 2.1998439841520745, + "grad_norm": 0.2844107960510927, + "learning_rate": 5.420841756256251e-06, + "loss": 0.4572, + "step": 13395 + }, + { + "epoch": 2.200008211360417, + "grad_norm": 0.33568246893123604, + "learning_rate": 5.420382702939585e-06, + "loss": 0.4307, + "step": 13396 + }, + { + "epoch": 2.2001724385687598, + "grad_norm": 0.310811054875232, + "learning_rate": 5.419923639626657e-06, + "loss": 0.4317, + "step": 13397 + }, + { + "epoch": 2.200336665777103, + "grad_norm": 0.3290758135242331, + "learning_rate": 5.419464566322454e-06, + "loss": 0.4512, + "step": 13398 + }, + { + "epoch": 2.2005008929854455, + "grad_norm": 0.30047132711866265, + "learning_rate": 5.419005483031963e-06, + "loss": 0.4323, + "step": 13399 + }, + { + "epoch": 2.200665120193788, + "grad_norm": 0.2852061143153248, + "learning_rate": 5.4185463897601675e-06, + "loss": 0.4317, + "step": 13400 + }, + { + "epoch": 2.2008293474021308, + "grad_norm": 0.3378154655198677, + "learning_rate": 5.418087286512053e-06, + "loss": 0.4562, + "step": 13401 + }, + { + "epoch": 2.2009935746104734, + "grad_norm": 0.3812354025796814, + "learning_rate": 5.4176281732926076e-06, + "loss": 0.4594, + "step": 13402 + }, + { + "epoch": 2.2011578018188165, + "grad_norm": 0.41651687708085594, + "learning_rate": 5.417169050106818e-06, + "loss": 0.4339, + "step": 13403 + }, + { + "epoch": 2.201322029027159, + "grad_norm": 0.2878243822875417, + "learning_rate": 5.41670991695967e-06, + "loss": 0.4233, + "step": 13404 + }, + { + "epoch": 2.2014862562355018, + "grad_norm": 0.32510137346731693, + "learning_rate": 5.416250773856151e-06, + "loss": 0.4496, + "step": 13405 + }, + { + "epoch": 2.2016504834438444, + "grad_norm": 0.2746385162503503, + "learning_rate": 5.415791620801245e-06, + "loss": 0.4438, + "step": 13406 + }, + { + "epoch": 2.2018147106521875, + "grad_norm": 0.34944524205436805, + "learning_rate": 5.415332457799944e-06, + "loss": 0.45, + "step": 13407 + }, + { + "epoch": 2.20197893786053, + "grad_norm": 0.35040689222504817, + "learning_rate": 5.41487328485723e-06, + "loss": 0.4436, + "step": 13408 + }, + { + "epoch": 2.2021431650688728, + "grad_norm": 0.2910862319373962, + "learning_rate": 5.414414101978092e-06, + "loss": 0.4252, + "step": 13409 + }, + { + "epoch": 2.2023073922772154, + "grad_norm": 0.28937276374288046, + "learning_rate": 5.413954909167518e-06, + "loss": 0.4517, + "step": 13410 + }, + { + "epoch": 2.2024716194855585, + "grad_norm": 0.5219373842745609, + "learning_rate": 5.413495706430494e-06, + "loss": 0.4383, + "step": 13411 + }, + { + "epoch": 2.202635846693901, + "grad_norm": 8.93823507260522, + "learning_rate": 5.4130364937720085e-06, + "loss": 0.4431, + "step": 13412 + }, + { + "epoch": 2.2028000739022437, + "grad_norm": 0.31732089025480203, + "learning_rate": 5.412577271197047e-06, + "loss": 0.4258, + "step": 13413 + }, + { + "epoch": 2.2029643011105864, + "grad_norm": 0.3292152186596089, + "learning_rate": 5.4121180387105995e-06, + "loss": 0.4336, + "step": 13414 + }, + { + "epoch": 2.2031285283189295, + "grad_norm": 0.3087280941473258, + "learning_rate": 5.411658796317653e-06, + "loss": 0.4587, + "step": 13415 + }, + { + "epoch": 2.203292755527272, + "grad_norm": 0.3653836232230834, + "learning_rate": 5.411199544023195e-06, + "loss": 0.4342, + "step": 13416 + }, + { + "epoch": 2.2034569827356147, + "grad_norm": 0.27311329098667947, + "learning_rate": 5.410740281832212e-06, + "loss": 0.4418, + "step": 13417 + }, + { + "epoch": 2.2036212099439574, + "grad_norm": 0.49616160219727334, + "learning_rate": 5.410281009749694e-06, + "loss": 0.4256, + "step": 13418 + }, + { + "epoch": 2.2037854371523, + "grad_norm": 0.4126504742938762, + "learning_rate": 5.40982172778063e-06, + "loss": 0.4317, + "step": 13419 + }, + { + "epoch": 2.203949664360643, + "grad_norm": 0.5683373155278401, + "learning_rate": 5.409362435930006e-06, + "loss": 0.446, + "step": 13420 + }, + { + "epoch": 2.2041138915689857, + "grad_norm": 0.5772482685621865, + "learning_rate": 5.408903134202812e-06, + "loss": 0.4537, + "step": 13421 + }, + { + "epoch": 2.2042781187773284, + "grad_norm": 0.9024243798707035, + "learning_rate": 5.408443822604033e-06, + "loss": 0.4324, + "step": 13422 + }, + { + "epoch": 2.204442345985671, + "grad_norm": 0.432771176343393, + "learning_rate": 5.407984501138664e-06, + "loss": 0.4397, + "step": 13423 + }, + { + "epoch": 2.204606573194014, + "grad_norm": 0.6234869815136771, + "learning_rate": 5.407525169811689e-06, + "loss": 0.4374, + "step": 13424 + }, + { + "epoch": 2.2047708004023567, + "grad_norm": 0.5594566730793743, + "learning_rate": 5.4070658286280965e-06, + "loss": 0.4396, + "step": 13425 + }, + { + "epoch": 2.2049350276106994, + "grad_norm": 0.5564798666728442, + "learning_rate": 5.406606477592876e-06, + "loss": 0.4682, + "step": 13426 + }, + { + "epoch": 2.205099254819042, + "grad_norm": 0.4843442564867565, + "learning_rate": 5.406147116711019e-06, + "loss": 0.437, + "step": 13427 + }, + { + "epoch": 2.205263482027385, + "grad_norm": 0.36792018386639114, + "learning_rate": 5.405687745987512e-06, + "loss": 0.4345, + "step": 13428 + }, + { + "epoch": 2.2054277092357277, + "grad_norm": 0.3879908470046889, + "learning_rate": 5.405228365427346e-06, + "loss": 0.4384, + "step": 13429 + }, + { + "epoch": 2.2055919364440704, + "grad_norm": 0.45374078326582784, + "learning_rate": 5.404768975035508e-06, + "loss": 0.4356, + "step": 13430 + }, + { + "epoch": 2.205756163652413, + "grad_norm": 0.3407757604959021, + "learning_rate": 5.404309574816988e-06, + "loss": 0.4366, + "step": 13431 + }, + { + "epoch": 2.205920390860756, + "grad_norm": 0.3141407896315137, + "learning_rate": 5.4038501647767785e-06, + "loss": 0.4357, + "step": 13432 + }, + { + "epoch": 2.2060846180690987, + "grad_norm": 0.3267762123383814, + "learning_rate": 5.403390744919866e-06, + "loss": 0.4614, + "step": 13433 + }, + { + "epoch": 2.2062488452774414, + "grad_norm": 0.37945900480305744, + "learning_rate": 5.4029313152512424e-06, + "loss": 0.4551, + "step": 13434 + }, + { + "epoch": 2.206413072485784, + "grad_norm": 0.4124331195609138, + "learning_rate": 5.402471875775894e-06, + "loss": 0.4538, + "step": 13435 + }, + { + "epoch": 2.2065772996941266, + "grad_norm": 0.6909163990609498, + "learning_rate": 5.402012426498814e-06, + "loss": 0.4542, + "step": 13436 + }, + { + "epoch": 2.2067415269024697, + "grad_norm": 0.4938727070001304, + "learning_rate": 5.401552967424993e-06, + "loss": 0.4599, + "step": 13437 + }, + { + "epoch": 2.2069057541108124, + "grad_norm": 0.4031025804524951, + "learning_rate": 5.401093498559418e-06, + "loss": 0.4301, + "step": 13438 + }, + { + "epoch": 2.207069981319155, + "grad_norm": 0.3031333079133587, + "learning_rate": 5.400634019907082e-06, + "loss": 0.429, + "step": 13439 + }, + { + "epoch": 2.2072342085274976, + "grad_norm": 0.4760746129352209, + "learning_rate": 5.400174531472973e-06, + "loss": 0.4522, + "step": 13440 + }, + { + "epoch": 2.2073984357358407, + "grad_norm": 0.37261624908788993, + "learning_rate": 5.3997150332620855e-06, + "loss": 0.448, + "step": 13441 + }, + { + "epoch": 2.2075626629441834, + "grad_norm": 0.3805111962488991, + "learning_rate": 5.399255525279405e-06, + "loss": 0.449, + "step": 13442 + }, + { + "epoch": 2.207726890152526, + "grad_norm": 0.36182810691107475, + "learning_rate": 5.398796007529926e-06, + "loss": 0.4328, + "step": 13443 + }, + { + "epoch": 2.2078911173608686, + "grad_norm": 0.34366558370047795, + "learning_rate": 5.398336480018638e-06, + "loss": 0.4285, + "step": 13444 + }, + { + "epoch": 2.2080553445692117, + "grad_norm": 0.3058709969084702, + "learning_rate": 5.397876942750532e-06, + "loss": 0.4481, + "step": 13445 + }, + { + "epoch": 2.2082195717775543, + "grad_norm": 0.33400468632077346, + "learning_rate": 5.397417395730599e-06, + "loss": 0.449, + "step": 13446 + }, + { + "epoch": 2.208383798985897, + "grad_norm": 0.4026208309464146, + "learning_rate": 5.396957838963829e-06, + "loss": 0.4412, + "step": 13447 + }, + { + "epoch": 2.2085480261942396, + "grad_norm": 0.35216660485768597, + "learning_rate": 5.396498272455217e-06, + "loss": 0.4534, + "step": 13448 + }, + { + "epoch": 2.2087122534025827, + "grad_norm": 0.4944410426286778, + "learning_rate": 5.39603869620975e-06, + "loss": 0.451, + "step": 13449 + }, + { + "epoch": 2.2088764806109253, + "grad_norm": 0.35289336830677026, + "learning_rate": 5.395579110232421e-06, + "loss": 0.443, + "step": 13450 + }, + { + "epoch": 2.209040707819268, + "grad_norm": 0.3493194840023415, + "learning_rate": 5.395119514528222e-06, + "loss": 0.4473, + "step": 13451 + }, + { + "epoch": 2.2092049350276106, + "grad_norm": 0.2907115504572143, + "learning_rate": 5.394659909102144e-06, + "loss": 0.4417, + "step": 13452 + }, + { + "epoch": 2.2093691622359533, + "grad_norm": 0.3184738041292627, + "learning_rate": 5.39420029395918e-06, + "loss": 0.4483, + "step": 13453 + }, + { + "epoch": 2.2095333894442963, + "grad_norm": 0.3785358669834699, + "learning_rate": 5.393740669104321e-06, + "loss": 0.4491, + "step": 13454 + }, + { + "epoch": 2.209697616652639, + "grad_norm": 0.4155139242934717, + "learning_rate": 5.393281034542559e-06, + "loss": 0.4555, + "step": 13455 + }, + { + "epoch": 2.2098618438609816, + "grad_norm": 0.3500073461421561, + "learning_rate": 5.392821390278885e-06, + "loss": 0.4247, + "step": 13456 + }, + { + "epoch": 2.2100260710693242, + "grad_norm": 0.293089681541331, + "learning_rate": 5.392361736318293e-06, + "loss": 0.4445, + "step": 13457 + }, + { + "epoch": 2.2101902982776673, + "grad_norm": 0.37913419132543585, + "learning_rate": 5.3919020726657746e-06, + "loss": 0.4511, + "step": 13458 + }, + { + "epoch": 2.21035452548601, + "grad_norm": 0.3325337362826304, + "learning_rate": 5.391442399326323e-06, + "loss": 0.4466, + "step": 13459 + }, + { + "epoch": 2.2105187526943526, + "grad_norm": 0.30312378625958575, + "learning_rate": 5.390982716304928e-06, + "loss": 0.4444, + "step": 13460 + }, + { + "epoch": 2.2106829799026952, + "grad_norm": 0.45363941132765107, + "learning_rate": 5.3905230236065845e-06, + "loss": 0.4339, + "step": 13461 + }, + { + "epoch": 2.2108472071110383, + "grad_norm": 0.35081016919396296, + "learning_rate": 5.390063321236287e-06, + "loss": 0.4468, + "step": 13462 + }, + { + "epoch": 2.211011434319381, + "grad_norm": 0.4229490886794608, + "learning_rate": 5.389603609199025e-06, + "loss": 0.4377, + "step": 13463 + }, + { + "epoch": 2.2111756615277236, + "grad_norm": 0.5187989672492346, + "learning_rate": 5.389143887499791e-06, + "loss": 0.4276, + "step": 13464 + }, + { + "epoch": 2.2113398887360662, + "grad_norm": 0.349141035977585, + "learning_rate": 5.38868415614358e-06, + "loss": 0.4585, + "step": 13465 + }, + { + "epoch": 2.2115041159444093, + "grad_norm": 0.38330880903839587, + "learning_rate": 5.388224415135387e-06, + "loss": 0.4477, + "step": 13466 + }, + { + "epoch": 2.211668343152752, + "grad_norm": 0.3643086489980097, + "learning_rate": 5.387764664480201e-06, + "loss": 0.435, + "step": 13467 + }, + { + "epoch": 2.2118325703610946, + "grad_norm": 1.2701962885891152, + "learning_rate": 5.387304904183018e-06, + "loss": 0.4448, + "step": 13468 + }, + { + "epoch": 2.2119967975694372, + "grad_norm": 0.30489125033902276, + "learning_rate": 5.38684513424883e-06, + "loss": 0.4342, + "step": 13469 + }, + { + "epoch": 2.21216102477778, + "grad_norm": 0.31767007262386243, + "learning_rate": 5.386385354682632e-06, + "loss": 0.4261, + "step": 13470 + }, + { + "epoch": 2.212325251986123, + "grad_norm": 0.4085825381441104, + "learning_rate": 5.385925565489416e-06, + "loss": 0.4199, + "step": 13471 + }, + { + "epoch": 2.2124894791944656, + "grad_norm": 0.33488337192858, + "learning_rate": 5.385465766674178e-06, + "loss": 0.4192, + "step": 13472 + }, + { + "epoch": 2.2126537064028082, + "grad_norm": 0.3389412489135619, + "learning_rate": 5.385005958241911e-06, + "loss": 0.441, + "step": 13473 + }, + { + "epoch": 2.212817933611151, + "grad_norm": 0.3722364049933887, + "learning_rate": 5.384546140197607e-06, + "loss": 0.4543, + "step": 13474 + }, + { + "epoch": 2.212982160819494, + "grad_norm": 0.38592443024169204, + "learning_rate": 5.384086312546262e-06, + "loss": 0.4365, + "step": 13475 + }, + { + "epoch": 2.2131463880278366, + "grad_norm": 0.3146545371530525, + "learning_rate": 5.383626475292869e-06, + "loss": 0.4475, + "step": 13476 + }, + { + "epoch": 2.2133106152361792, + "grad_norm": 0.27347639969455967, + "learning_rate": 5.383166628442426e-06, + "loss": 0.4403, + "step": 13477 + }, + { + "epoch": 2.213474842444522, + "grad_norm": 0.3374210661622069, + "learning_rate": 5.382706771999923e-06, + "loss": 0.4394, + "step": 13478 + }, + { + "epoch": 2.213639069652865, + "grad_norm": 0.45448892335658087, + "learning_rate": 5.382246905970355e-06, + "loss": 0.4253, + "step": 13479 + }, + { + "epoch": 2.2138032968612076, + "grad_norm": 0.28749697984270106, + "learning_rate": 5.381787030358718e-06, + "loss": 0.4429, + "step": 13480 + }, + { + "epoch": 2.21396752406955, + "grad_norm": 0.36495583828274214, + "learning_rate": 5.3813271451700064e-06, + "loss": 0.4351, + "step": 13481 + }, + { + "epoch": 2.214131751277893, + "grad_norm": 0.4129797086459647, + "learning_rate": 5.380867250409214e-06, + "loss": 0.4359, + "step": 13482 + }, + { + "epoch": 2.214295978486236, + "grad_norm": 0.3320847636462863, + "learning_rate": 5.380407346081338e-06, + "loss": 0.4417, + "step": 13483 + }, + { + "epoch": 2.2144602056945786, + "grad_norm": 0.33595000981170686, + "learning_rate": 5.379947432191372e-06, + "loss": 0.4466, + "step": 13484 + }, + { + "epoch": 2.214624432902921, + "grad_norm": 0.3459516900532747, + "learning_rate": 5.379487508744311e-06, + "loss": 0.4249, + "step": 13485 + }, + { + "epoch": 2.214788660111264, + "grad_norm": 0.289936742009504, + "learning_rate": 5.3790275757451496e-06, + "loss": 0.4356, + "step": 13486 + }, + { + "epoch": 2.2149528873196065, + "grad_norm": 0.33670869144840854, + "learning_rate": 5.378567633198885e-06, + "loss": 0.4436, + "step": 13487 + }, + { + "epoch": 2.2151171145279496, + "grad_norm": 0.36515225989280037, + "learning_rate": 5.378107681110511e-06, + "loss": 0.4227, + "step": 13488 + }, + { + "epoch": 2.215281341736292, + "grad_norm": 0.3082061760174833, + "learning_rate": 5.377647719485024e-06, + "loss": 0.4461, + "step": 13489 + }, + { + "epoch": 2.215445568944635, + "grad_norm": 0.3461068541761683, + "learning_rate": 5.377187748327418e-06, + "loss": 0.4383, + "step": 13490 + }, + { + "epoch": 2.2156097961529775, + "grad_norm": 0.3146220360337419, + "learning_rate": 5.3767277676426915e-06, + "loss": 0.4579, + "step": 13491 + }, + { + "epoch": 2.2157740233613206, + "grad_norm": 0.2933583327660923, + "learning_rate": 5.376267777435838e-06, + "loss": 0.4662, + "step": 13492 + }, + { + "epoch": 2.215938250569663, + "grad_norm": 0.36036615968876007, + "learning_rate": 5.375807777711855e-06, + "loss": 0.4282, + "step": 13493 + }, + { + "epoch": 2.216102477778006, + "grad_norm": 0.3082561878071788, + "learning_rate": 5.3753477684757366e-06, + "loss": 0.446, + "step": 13494 + }, + { + "epoch": 2.2162667049863485, + "grad_norm": 0.2860483461505643, + "learning_rate": 5.374887749732482e-06, + "loss": 0.4432, + "step": 13495 + }, + { + "epoch": 2.2164309321946916, + "grad_norm": 0.30065915277707855, + "learning_rate": 5.3744277214870836e-06, + "loss": 0.4567, + "step": 13496 + }, + { + "epoch": 2.216595159403034, + "grad_norm": 0.28339697722762697, + "learning_rate": 5.3739676837445414e-06, + "loss": 0.4367, + "step": 13497 + }, + { + "epoch": 2.216759386611377, + "grad_norm": 0.374467941980816, + "learning_rate": 5.37350763650985e-06, + "loss": 0.4438, + "step": 13498 + }, + { + "epoch": 2.2169236138197195, + "grad_norm": 0.35613700649582364, + "learning_rate": 5.3730475797880066e-06, + "loss": 0.4382, + "step": 13499 + }, + { + "epoch": 2.2170878410280626, + "grad_norm": 0.40460989848519385, + "learning_rate": 5.372587513584006e-06, + "loss": 0.4322, + "step": 13500 + }, + { + "epoch": 2.217252068236405, + "grad_norm": 0.276120695564519, + "learning_rate": 5.372127437902847e-06, + "loss": 0.4412, + "step": 13501 + }, + { + "epoch": 2.217416295444748, + "grad_norm": 0.2981034665655008, + "learning_rate": 5.3716673527495275e-06, + "loss": 0.4262, + "step": 13502 + }, + { + "epoch": 2.2175805226530905, + "grad_norm": 0.3595831031378883, + "learning_rate": 5.371207258129041e-06, + "loss": 0.4305, + "step": 13503 + }, + { + "epoch": 2.217744749861433, + "grad_norm": 0.35041234040561664, + "learning_rate": 5.3707471540463884e-06, + "loss": 0.457, + "step": 13504 + }, + { + "epoch": 2.217908977069776, + "grad_norm": 0.3376497856717965, + "learning_rate": 5.370287040506563e-06, + "loss": 0.4454, + "step": 13505 + }, + { + "epoch": 2.218073204278119, + "grad_norm": 0.3492901426130223, + "learning_rate": 5.369826917514565e-06, + "loss": 0.4239, + "step": 13506 + }, + { + "epoch": 2.2182374314864615, + "grad_norm": 0.3528616323045615, + "learning_rate": 5.369366785075392e-06, + "loss": 0.4418, + "step": 13507 + }, + { + "epoch": 2.218401658694804, + "grad_norm": 0.5411887062016001, + "learning_rate": 5.368906643194039e-06, + "loss": 0.4467, + "step": 13508 + }, + { + "epoch": 2.218565885903147, + "grad_norm": 0.281710369398488, + "learning_rate": 5.3684464918755054e-06, + "loss": 0.4289, + "step": 13509 + }, + { + "epoch": 2.21873011311149, + "grad_norm": 0.31953154742558165, + "learning_rate": 5.367986331124787e-06, + "loss": 0.4363, + "step": 13510 + }, + { + "epoch": 2.2188943403198325, + "grad_norm": 0.27411536805995634, + "learning_rate": 5.367526160946885e-06, + "loss": 0.4382, + "step": 13511 + }, + { + "epoch": 2.219058567528175, + "grad_norm": 0.28347580644466774, + "learning_rate": 5.367065981346796e-06, + "loss": 0.4318, + "step": 13512 + }, + { + "epoch": 2.219222794736518, + "grad_norm": 0.35565024009430696, + "learning_rate": 5.366605792329516e-06, + "loss": 0.4408, + "step": 13513 + }, + { + "epoch": 2.219387021944861, + "grad_norm": 0.3173430366533922, + "learning_rate": 5.366145593900044e-06, + "loss": 0.4578, + "step": 13514 + }, + { + "epoch": 2.2195512491532035, + "grad_norm": 0.35393030124900166, + "learning_rate": 5.365685386063378e-06, + "loss": 0.4452, + "step": 13515 + }, + { + "epoch": 2.219715476361546, + "grad_norm": 0.27084181578202887, + "learning_rate": 5.365225168824519e-06, + "loss": 0.4474, + "step": 13516 + }, + { + "epoch": 2.219879703569889, + "grad_norm": 0.6068977121317608, + "learning_rate": 5.364764942188463e-06, + "loss": 0.4433, + "step": 13517 + }, + { + "epoch": 2.220043930778232, + "grad_norm": 0.2827573163324981, + "learning_rate": 5.364304706160208e-06, + "loss": 0.4488, + "step": 13518 + }, + { + "epoch": 2.2202081579865744, + "grad_norm": 0.3787643336445636, + "learning_rate": 5.363844460744755e-06, + "loss": 0.4316, + "step": 13519 + }, + { + "epoch": 2.220372385194917, + "grad_norm": 0.30874648437768004, + "learning_rate": 5.3633842059471e-06, + "loss": 0.4378, + "step": 13520 + }, + { + "epoch": 2.2205366124032597, + "grad_norm": 0.336792254197232, + "learning_rate": 5.3629239417722444e-06, + "loss": 0.4524, + "step": 13521 + }, + { + "epoch": 2.220700839611603, + "grad_norm": 0.3227196910096211, + "learning_rate": 5.362463668225184e-06, + "loss": 0.4412, + "step": 13522 + }, + { + "epoch": 2.2208650668199454, + "grad_norm": 0.32507962705134064, + "learning_rate": 5.362003385310921e-06, + "loss": 0.4536, + "step": 13523 + }, + { + "epoch": 2.221029294028288, + "grad_norm": 0.2788200062922952, + "learning_rate": 5.361543093034452e-06, + "loss": 0.4432, + "step": 13524 + }, + { + "epoch": 2.2211935212366307, + "grad_norm": 0.31509433182889934, + "learning_rate": 5.361082791400778e-06, + "loss": 0.4341, + "step": 13525 + }, + { + "epoch": 2.221357748444974, + "grad_norm": 0.3591196718424824, + "learning_rate": 5.360622480414898e-06, + "loss": 0.4329, + "step": 13526 + }, + { + "epoch": 2.2215219756533164, + "grad_norm": 0.30416959370332597, + "learning_rate": 5.360162160081811e-06, + "loss": 0.4394, + "step": 13527 + }, + { + "epoch": 2.221686202861659, + "grad_norm": 0.6191722097673911, + "learning_rate": 5.359701830406516e-06, + "loss": 0.4481, + "step": 13528 + }, + { + "epoch": 2.2218504300700017, + "grad_norm": 0.7503304964039875, + "learning_rate": 5.359241491394014e-06, + "loss": 0.4292, + "step": 13529 + }, + { + "epoch": 2.222014657278345, + "grad_norm": 0.42825504406683945, + "learning_rate": 5.358781143049304e-06, + "loss": 0.4342, + "step": 13530 + }, + { + "epoch": 2.2221788844866874, + "grad_norm": 0.29088118407198305, + "learning_rate": 5.358320785377386e-06, + "loss": 0.4375, + "step": 13531 + }, + { + "epoch": 2.22234311169503, + "grad_norm": 0.33448670629242305, + "learning_rate": 5.357860418383259e-06, + "loss": 0.432, + "step": 13532 + }, + { + "epoch": 2.2225073389033727, + "grad_norm": 0.28810069927283627, + "learning_rate": 5.357400042071925e-06, + "loss": 0.4489, + "step": 13533 + }, + { + "epoch": 2.222671566111716, + "grad_norm": 0.39361019961728927, + "learning_rate": 5.356939656448381e-06, + "loss": 0.4466, + "step": 13534 + }, + { + "epoch": 2.2228357933200584, + "grad_norm": 0.3077810757273589, + "learning_rate": 5.356479261517631e-06, + "loss": 0.4344, + "step": 13535 + }, + { + "epoch": 2.223000020528401, + "grad_norm": 0.44608262878290833, + "learning_rate": 5.356018857284672e-06, + "loss": 0.4395, + "step": 13536 + }, + { + "epoch": 2.2231642477367437, + "grad_norm": 0.45678308542254015, + "learning_rate": 5.355558443754508e-06, + "loss": 0.4452, + "step": 13537 + }, + { + "epoch": 2.2233284749450863, + "grad_norm": 0.3910797651862828, + "learning_rate": 5.355098020932136e-06, + "loss": 0.4339, + "step": 13538 + }, + { + "epoch": 2.2234927021534294, + "grad_norm": 1.0072820509471674, + "learning_rate": 5.354637588822559e-06, + "loss": 0.4631, + "step": 13539 + }, + { + "epoch": 2.223656929361772, + "grad_norm": 0.3495902620317632, + "learning_rate": 5.354177147430777e-06, + "loss": 0.4667, + "step": 13540 + }, + { + "epoch": 2.2238211565701147, + "grad_norm": 0.35133521693691183, + "learning_rate": 5.353716696761791e-06, + "loss": 0.4499, + "step": 13541 + }, + { + "epoch": 2.2239853837784573, + "grad_norm": 0.40684209571486746, + "learning_rate": 5.3532562368206006e-06, + "loss": 0.4511, + "step": 13542 + }, + { + "epoch": 2.2241496109868004, + "grad_norm": 0.652470986013063, + "learning_rate": 5.3527957676122085e-06, + "loss": 0.451, + "step": 13543 + }, + { + "epoch": 2.224313838195143, + "grad_norm": 0.32350572923745236, + "learning_rate": 5.352335289141614e-06, + "loss": 0.4437, + "step": 13544 + }, + { + "epoch": 2.2244780654034857, + "grad_norm": 0.28213502193799567, + "learning_rate": 5.351874801413822e-06, + "loss": 0.4281, + "step": 13545 + }, + { + "epoch": 2.2246422926118283, + "grad_norm": 0.34122318350658276, + "learning_rate": 5.351414304433832e-06, + "loss": 0.4425, + "step": 13546 + }, + { + "epoch": 2.2248065198201714, + "grad_norm": 0.34055176377424634, + "learning_rate": 5.350953798206643e-06, + "loss": 0.4641, + "step": 13547 + }, + { + "epoch": 2.224970747028514, + "grad_norm": 0.3056182901324451, + "learning_rate": 5.350493282737258e-06, + "loss": 0.4472, + "step": 13548 + }, + { + "epoch": 2.2251349742368567, + "grad_norm": 0.4295171215648557, + "learning_rate": 5.3500327580306804e-06, + "loss": 0.4411, + "step": 13549 + }, + { + "epoch": 2.2252992014451993, + "grad_norm": 0.31663958292820704, + "learning_rate": 5.349572224091911e-06, + "loss": 0.4465, + "step": 13550 + }, + { + "epoch": 2.2254634286535424, + "grad_norm": 0.3231332831732216, + "learning_rate": 5.34911168092595e-06, + "loss": 0.4488, + "step": 13551 + }, + { + "epoch": 2.225627655861885, + "grad_norm": 0.3500335217632181, + "learning_rate": 5.348651128537802e-06, + "loss": 0.4341, + "step": 13552 + }, + { + "epoch": 2.2257918830702277, + "grad_norm": 0.282708868398423, + "learning_rate": 5.348190566932467e-06, + "loss": 0.4248, + "step": 13553 + }, + { + "epoch": 2.2259561102785703, + "grad_norm": 0.3214462040558992, + "learning_rate": 5.347729996114949e-06, + "loss": 0.4301, + "step": 13554 + }, + { + "epoch": 2.226120337486913, + "grad_norm": 0.31372033167420005, + "learning_rate": 5.347269416090249e-06, + "loss": 0.4259, + "step": 13555 + }, + { + "epoch": 2.226284564695256, + "grad_norm": 0.3327297401900336, + "learning_rate": 5.3468088268633695e-06, + "loss": 0.4441, + "step": 13556 + }, + { + "epoch": 2.2264487919035987, + "grad_norm": 0.3294442381478894, + "learning_rate": 5.346348228439312e-06, + "loss": 0.4471, + "step": 13557 + }, + { + "epoch": 2.2266130191119413, + "grad_norm": 0.3929457500037696, + "learning_rate": 5.345887620823081e-06, + "loss": 0.446, + "step": 13558 + }, + { + "epoch": 2.226777246320284, + "grad_norm": 0.3631153626815294, + "learning_rate": 5.345427004019677e-06, + "loss": 0.4353, + "step": 13559 + }, + { + "epoch": 2.226941473528627, + "grad_norm": 0.2869505335422441, + "learning_rate": 5.344966378034106e-06, + "loss": 0.4031, + "step": 13560 + }, + { + "epoch": 2.2271057007369697, + "grad_norm": 1.0574105052879286, + "learning_rate": 5.3445057428713675e-06, + "loss": 0.4474, + "step": 13561 + }, + { + "epoch": 2.2272699279453123, + "grad_norm": 0.3579479567718406, + "learning_rate": 5.344045098536466e-06, + "loss": 0.4628, + "step": 13562 + }, + { + "epoch": 2.227434155153655, + "grad_norm": 0.3108652105716026, + "learning_rate": 5.343584445034403e-06, + "loss": 0.4389, + "step": 13563 + }, + { + "epoch": 2.227598382361998, + "grad_norm": 0.31838087198749193, + "learning_rate": 5.343123782370185e-06, + "loss": 0.4672, + "step": 13564 + }, + { + "epoch": 2.2277626095703407, + "grad_norm": 0.348928364328581, + "learning_rate": 5.342663110548813e-06, + "loss": 0.4227, + "step": 13565 + }, + { + "epoch": 2.2279268367786833, + "grad_norm": 0.375223157962436, + "learning_rate": 5.34220242957529e-06, + "loss": 0.4516, + "step": 13566 + }, + { + "epoch": 2.228091063987026, + "grad_norm": 0.36572646774424566, + "learning_rate": 5.3417417394546195e-06, + "loss": 0.4651, + "step": 13567 + }, + { + "epoch": 2.228255291195369, + "grad_norm": 0.3584240750256261, + "learning_rate": 5.341281040191805e-06, + "loss": 0.4305, + "step": 13568 + }, + { + "epoch": 2.2284195184037117, + "grad_norm": 0.3472023415751713, + "learning_rate": 5.340820331791852e-06, + "loss": 0.4582, + "step": 13569 + }, + { + "epoch": 2.2285837456120543, + "grad_norm": 0.36169487795111643, + "learning_rate": 5.340359614259764e-06, + "loss": 0.4476, + "step": 13570 + }, + { + "epoch": 2.228747972820397, + "grad_norm": 0.4990169623639773, + "learning_rate": 5.3398988876005415e-06, + "loss": 0.448, + "step": 13571 + }, + { + "epoch": 2.2289122000287396, + "grad_norm": 0.3237664127605256, + "learning_rate": 5.339438151819192e-06, + "loss": 0.44, + "step": 13572 + }, + { + "epoch": 2.2290764272370827, + "grad_norm": 0.3970367551739623, + "learning_rate": 5.338977406920717e-06, + "loss": 0.434, + "step": 13573 + }, + { + "epoch": 2.2292406544454253, + "grad_norm": 0.35942224426552793, + "learning_rate": 5.338516652910123e-06, + "loss": 0.4468, + "step": 13574 + }, + { + "epoch": 2.229404881653768, + "grad_norm": 0.27415087723762693, + "learning_rate": 5.338055889792414e-06, + "loss": 0.4583, + "step": 13575 + }, + { + "epoch": 2.2295691088621106, + "grad_norm": 0.30038302397071287, + "learning_rate": 5.337595117572591e-06, + "loss": 0.4448, + "step": 13576 + }, + { + "epoch": 2.2297333360704537, + "grad_norm": 0.6853846292788964, + "learning_rate": 5.337134336255663e-06, + "loss": 0.4408, + "step": 13577 + }, + { + "epoch": 2.2298975632787963, + "grad_norm": 0.5243657367788079, + "learning_rate": 5.336673545846631e-06, + "loss": 0.4418, + "step": 13578 + }, + { + "epoch": 2.230061790487139, + "grad_norm": 0.3239461966116936, + "learning_rate": 5.336212746350502e-06, + "loss": 0.4591, + "step": 13579 + }, + { + "epoch": 2.2302260176954816, + "grad_norm": 0.380531154323319, + "learning_rate": 5.33575193777228e-06, + "loss": 0.4608, + "step": 13580 + }, + { + "epoch": 2.2303902449038246, + "grad_norm": 0.30471466475214776, + "learning_rate": 5.33529112011697e-06, + "loss": 0.4535, + "step": 13581 + }, + { + "epoch": 2.2305544721121673, + "grad_norm": 0.42057761873332117, + "learning_rate": 5.334830293389576e-06, + "loss": 0.4575, + "step": 13582 + }, + { + "epoch": 2.23071869932051, + "grad_norm": 0.30684393851680114, + "learning_rate": 5.334369457595104e-06, + "loss": 0.4538, + "step": 13583 + }, + { + "epoch": 2.2308829265288526, + "grad_norm": 0.4505269932130537, + "learning_rate": 5.333908612738558e-06, + "loss": 0.4441, + "step": 13584 + }, + { + "epoch": 2.2310471537371956, + "grad_norm": 0.3104451390730639, + "learning_rate": 5.333447758824945e-06, + "loss": 0.4415, + "step": 13585 + }, + { + "epoch": 2.2312113809455383, + "grad_norm": 0.3042509785408218, + "learning_rate": 5.332986895859269e-06, + "loss": 0.4442, + "step": 13586 + }, + { + "epoch": 2.231375608153881, + "grad_norm": 0.33087608068349694, + "learning_rate": 5.332526023846536e-06, + "loss": 0.4431, + "step": 13587 + }, + { + "epoch": 2.2315398353622236, + "grad_norm": 0.3253474702931535, + "learning_rate": 5.332065142791751e-06, + "loss": 0.4242, + "step": 13588 + }, + { + "epoch": 2.231704062570566, + "grad_norm": 0.3869258011453058, + "learning_rate": 5.33160425269992e-06, + "loss": 0.4458, + "step": 13589 + }, + { + "epoch": 2.2318682897789093, + "grad_norm": 0.4440672206002762, + "learning_rate": 5.331143353576048e-06, + "loss": 0.4346, + "step": 13590 + }, + { + "epoch": 2.232032516987252, + "grad_norm": 0.36785815536200994, + "learning_rate": 5.330682445425143e-06, + "loss": 0.4452, + "step": 13591 + }, + { + "epoch": 2.2321967441955946, + "grad_norm": 0.44275564494307584, + "learning_rate": 5.330221528252207e-06, + "loss": 0.4502, + "step": 13592 + }, + { + "epoch": 2.232360971403937, + "grad_norm": 0.343543553199038, + "learning_rate": 5.3297606020622495e-06, + "loss": 0.4496, + "step": 13593 + }, + { + "epoch": 2.2325251986122803, + "grad_norm": 0.37180636084536056, + "learning_rate": 5.3292996668602765e-06, + "loss": 0.4493, + "step": 13594 + }, + { + "epoch": 2.232689425820623, + "grad_norm": 0.29019902675559567, + "learning_rate": 5.328838722651292e-06, + "loss": 0.441, + "step": 13595 + }, + { + "epoch": 2.2328536530289655, + "grad_norm": 0.45008975878901003, + "learning_rate": 5.3283777694403045e-06, + "loss": 0.4499, + "step": 13596 + }, + { + "epoch": 2.233017880237308, + "grad_norm": 0.34751380757528144, + "learning_rate": 5.327916807232318e-06, + "loss": 0.4276, + "step": 13597 + }, + { + "epoch": 2.2331821074456513, + "grad_norm": 0.4329884068791182, + "learning_rate": 5.32745583603234e-06, + "loss": 0.4444, + "step": 13598 + }, + { + "epoch": 2.233346334653994, + "grad_norm": 0.34680911321201413, + "learning_rate": 5.32699485584538e-06, + "loss": 0.4257, + "step": 13599 + }, + { + "epoch": 2.2335105618623365, + "grad_norm": 0.4622230443995473, + "learning_rate": 5.326533866676443e-06, + "loss": 0.4285, + "step": 13600 + }, + { + "epoch": 2.233674789070679, + "grad_norm": 0.5131870445694399, + "learning_rate": 5.3260728685305314e-06, + "loss": 0.4491, + "step": 13601 + }, + { + "epoch": 2.2338390162790223, + "grad_norm": 0.40076083507511995, + "learning_rate": 5.325611861412656e-06, + "loss": 0.4386, + "step": 13602 + }, + { + "epoch": 2.234003243487365, + "grad_norm": 0.27194120329673643, + "learning_rate": 5.325150845327826e-06, + "loss": 0.4402, + "step": 13603 + }, + { + "epoch": 2.2341674706957075, + "grad_norm": 0.3627492697365076, + "learning_rate": 5.3246898202810455e-06, + "loss": 0.4561, + "step": 13604 + }, + { + "epoch": 2.23433169790405, + "grad_norm": 0.36811125848416526, + "learning_rate": 5.324228786277321e-06, + "loss": 0.4442, + "step": 13605 + }, + { + "epoch": 2.234495925112393, + "grad_norm": 0.32740568146167454, + "learning_rate": 5.323767743321661e-06, + "loss": 0.4416, + "step": 13606 + }, + { + "epoch": 2.234660152320736, + "grad_norm": 0.2928833678123084, + "learning_rate": 5.323306691419072e-06, + "loss": 0.4526, + "step": 13607 + }, + { + "epoch": 2.2348243795290785, + "grad_norm": 0.5940056253711327, + "learning_rate": 5.322845630574564e-06, + "loss": 0.4452, + "step": 13608 + }, + { + "epoch": 2.234988606737421, + "grad_norm": 0.42021729424037163, + "learning_rate": 5.322384560793143e-06, + "loss": 0.4168, + "step": 13609 + }, + { + "epoch": 2.235152833945764, + "grad_norm": 0.3073837421956184, + "learning_rate": 5.321923482079815e-06, + "loss": 0.4486, + "step": 13610 + }, + { + "epoch": 2.235317061154107, + "grad_norm": 0.3522817208717473, + "learning_rate": 5.321462394439591e-06, + "loss": 0.4284, + "step": 13611 + }, + { + "epoch": 2.2354812883624495, + "grad_norm": 0.3598916939940968, + "learning_rate": 5.3210012978774765e-06, + "loss": 0.4464, + "step": 13612 + }, + { + "epoch": 2.235645515570792, + "grad_norm": 0.2617858004173523, + "learning_rate": 5.320540192398479e-06, + "loss": 0.4511, + "step": 13613 + }, + { + "epoch": 2.235809742779135, + "grad_norm": 0.31940286486217156, + "learning_rate": 5.3200790780076096e-06, + "loss": 0.4472, + "step": 13614 + }, + { + "epoch": 2.235973969987478, + "grad_norm": 0.2959868358849483, + "learning_rate": 5.319617954709873e-06, + "loss": 0.4249, + "step": 13615 + }, + { + "epoch": 2.2361381971958205, + "grad_norm": 0.3030407022951804, + "learning_rate": 5.319156822510281e-06, + "loss": 0.448, + "step": 13616 + }, + { + "epoch": 2.236302424404163, + "grad_norm": 0.34246789595360566, + "learning_rate": 5.318695681413839e-06, + "loss": 0.4412, + "step": 13617 + }, + { + "epoch": 2.236466651612506, + "grad_norm": 0.3426164905586456, + "learning_rate": 5.318234531425555e-06, + "loss": 0.4696, + "step": 13618 + }, + { + "epoch": 2.236630878820849, + "grad_norm": 0.32189875802648454, + "learning_rate": 5.317773372550441e-06, + "loss": 0.4516, + "step": 13619 + }, + { + "epoch": 2.2367951060291915, + "grad_norm": 0.29844986670895207, + "learning_rate": 5.317312204793502e-06, + "loss": 0.4394, + "step": 13620 + }, + { + "epoch": 2.236959333237534, + "grad_norm": 0.30412717030286257, + "learning_rate": 5.316851028159749e-06, + "loss": 0.4352, + "step": 13621 + }, + { + "epoch": 2.237123560445877, + "grad_norm": 0.3608272137967386, + "learning_rate": 5.3163898426541905e-06, + "loss": 0.4342, + "step": 13622 + }, + { + "epoch": 2.2372877876542194, + "grad_norm": 0.2920497088883626, + "learning_rate": 5.315928648281834e-06, + "loss": 0.445, + "step": 13623 + }, + { + "epoch": 2.2374520148625625, + "grad_norm": 0.3006645750815423, + "learning_rate": 5.315467445047691e-06, + "loss": 0.4137, + "step": 13624 + }, + { + "epoch": 2.237616242070905, + "grad_norm": 0.28330615816664806, + "learning_rate": 5.315006232956768e-06, + "loss": 0.4441, + "step": 13625 + }, + { + "epoch": 2.237780469279248, + "grad_norm": 0.4482961606066074, + "learning_rate": 5.314545012014077e-06, + "loss": 0.4477, + "step": 13626 + }, + { + "epoch": 2.2379446964875904, + "grad_norm": 0.5955992178112777, + "learning_rate": 5.314083782224624e-06, + "loss": 0.4501, + "step": 13627 + }, + { + "epoch": 2.2381089236959335, + "grad_norm": 0.33806720327208106, + "learning_rate": 5.3136225435934215e-06, + "loss": 0.449, + "step": 13628 + }, + { + "epoch": 2.238273150904276, + "grad_norm": 0.3012297840190392, + "learning_rate": 5.313161296125477e-06, + "loss": 0.4313, + "step": 13629 + }, + { + "epoch": 2.238437378112619, + "grad_norm": 0.34520611961056774, + "learning_rate": 5.312700039825801e-06, + "loss": 0.4413, + "step": 13630 + }, + { + "epoch": 2.2386016053209614, + "grad_norm": 0.3207154044252607, + "learning_rate": 5.312238774699403e-06, + "loss": 0.4581, + "step": 13631 + }, + { + "epoch": 2.2387658325293045, + "grad_norm": 0.3156908540345743, + "learning_rate": 5.311777500751293e-06, + "loss": 0.4465, + "step": 13632 + }, + { + "epoch": 2.238930059737647, + "grad_norm": 0.5587797292971918, + "learning_rate": 5.31131621798648e-06, + "loss": 0.4327, + "step": 13633 + }, + { + "epoch": 2.2390942869459898, + "grad_norm": 0.3836716634336738, + "learning_rate": 5.310854926409975e-06, + "loss": 0.4342, + "step": 13634 + }, + { + "epoch": 2.2392585141543324, + "grad_norm": 0.39705515751436415, + "learning_rate": 5.310393626026789e-06, + "loss": 0.4496, + "step": 13635 + }, + { + "epoch": 2.2394227413626755, + "grad_norm": 0.4116210776291705, + "learning_rate": 5.3099323168419276e-06, + "loss": 0.4644, + "step": 13636 + }, + { + "epoch": 2.239586968571018, + "grad_norm": 0.48621166225056617, + "learning_rate": 5.309470998860407e-06, + "loss": 0.428, + "step": 13637 + }, + { + "epoch": 2.2397511957793608, + "grad_norm": 0.29530363881665117, + "learning_rate": 5.309009672087236e-06, + "loss": 0.4504, + "step": 13638 + }, + { + "epoch": 2.2399154229877034, + "grad_norm": 0.3208941816493528, + "learning_rate": 5.308548336527421e-06, + "loss": 0.4145, + "step": 13639 + }, + { + "epoch": 2.240079650196046, + "grad_norm": 0.34304395384197434, + "learning_rate": 5.3080869921859765e-06, + "loss": 0.4554, + "step": 13640 + }, + { + "epoch": 2.240243877404389, + "grad_norm": 0.2997489296447226, + "learning_rate": 5.3076256390679136e-06, + "loss": 0.4306, + "step": 13641 + }, + { + "epoch": 2.2404081046127318, + "grad_norm": 0.3598332489608276, + "learning_rate": 5.30716427717824e-06, + "loss": 0.4744, + "step": 13642 + }, + { + "epoch": 2.2405723318210744, + "grad_norm": 0.3640856566145552, + "learning_rate": 5.306702906521969e-06, + "loss": 0.4426, + "step": 13643 + }, + { + "epoch": 2.240736559029417, + "grad_norm": 0.25957988968195783, + "learning_rate": 5.30624152710411e-06, + "loss": 0.4347, + "step": 13644 + }, + { + "epoch": 2.24090078623776, + "grad_norm": 0.3488760731941589, + "learning_rate": 5.305780138929676e-06, + "loss": 0.4667, + "step": 13645 + }, + { + "epoch": 2.2410650134461028, + "grad_norm": 0.4165200065998101, + "learning_rate": 5.305318742003677e-06, + "loss": 0.4525, + "step": 13646 + }, + { + "epoch": 2.2412292406544454, + "grad_norm": 0.31530256152853486, + "learning_rate": 5.304857336331123e-06, + "loss": 0.4622, + "step": 13647 + }, + { + "epoch": 2.241393467862788, + "grad_norm": 0.4446666704947999, + "learning_rate": 5.304395921917027e-06, + "loss": 0.4301, + "step": 13648 + }, + { + "epoch": 2.241557695071131, + "grad_norm": 0.4053189269354466, + "learning_rate": 5.303934498766399e-06, + "loss": 0.4412, + "step": 13649 + }, + { + "epoch": 2.2417219222794738, + "grad_norm": 0.3625864205313529, + "learning_rate": 5.303473066884254e-06, + "loss": 0.4487, + "step": 13650 + }, + { + "epoch": 2.2418861494878164, + "grad_norm": 0.3885493040306324, + "learning_rate": 5.303011626275599e-06, + "loss": 0.4359, + "step": 13651 + }, + { + "epoch": 2.242050376696159, + "grad_norm": 0.318432682101398, + "learning_rate": 5.3025501769454475e-06, + "loss": 0.4322, + "step": 13652 + }, + { + "epoch": 2.242214603904502, + "grad_norm": 0.3007696684784351, + "learning_rate": 5.302088718898812e-06, + "loss": 0.4489, + "step": 13653 + }, + { + "epoch": 2.2423788311128448, + "grad_norm": 0.3887145358064661, + "learning_rate": 5.3016272521407036e-06, + "loss": 0.4483, + "step": 13654 + }, + { + "epoch": 2.2425430583211874, + "grad_norm": 0.3409044775291692, + "learning_rate": 5.301165776676134e-06, + "loss": 0.4433, + "step": 13655 + }, + { + "epoch": 2.24270728552953, + "grad_norm": 0.3670942744271576, + "learning_rate": 5.300704292510116e-06, + "loss": 0.4526, + "step": 13656 + }, + { + "epoch": 2.2428715127378727, + "grad_norm": 0.29240835654932273, + "learning_rate": 5.300242799647662e-06, + "loss": 0.4306, + "step": 13657 + }, + { + "epoch": 2.2430357399462157, + "grad_norm": 0.39224385150203395, + "learning_rate": 5.299781298093785e-06, + "loss": 0.4398, + "step": 13658 + }, + { + "epoch": 2.2431999671545584, + "grad_norm": 0.3124724267924632, + "learning_rate": 5.299319787853495e-06, + "loss": 0.4426, + "step": 13659 + }, + { + "epoch": 2.243364194362901, + "grad_norm": 0.3118206048528069, + "learning_rate": 5.298858268931805e-06, + "loss": 0.4599, + "step": 13660 + }, + { + "epoch": 2.2435284215712437, + "grad_norm": 0.3472209576784062, + "learning_rate": 5.298396741333729e-06, + "loss": 0.4427, + "step": 13661 + }, + { + "epoch": 2.2436926487795867, + "grad_norm": 0.28816801110315016, + "learning_rate": 5.297935205064279e-06, + "loss": 0.4155, + "step": 13662 + }, + { + "epoch": 2.2438568759879294, + "grad_norm": 0.3280013408771476, + "learning_rate": 5.297473660128469e-06, + "loss": 0.4351, + "step": 13663 + }, + { + "epoch": 2.244021103196272, + "grad_norm": 0.3013157024739304, + "learning_rate": 5.297012106531308e-06, + "loss": 0.4542, + "step": 13664 + }, + { + "epoch": 2.2441853304046147, + "grad_norm": 0.2698186431923782, + "learning_rate": 5.296550544277813e-06, + "loss": 0.4358, + "step": 13665 + }, + { + "epoch": 2.2443495576129573, + "grad_norm": 0.3152377394168143, + "learning_rate": 5.296088973372994e-06, + "loss": 0.4271, + "step": 13666 + }, + { + "epoch": 2.2445137848213004, + "grad_norm": 0.332333820287266, + "learning_rate": 5.295627393821867e-06, + "loss": 0.4439, + "step": 13667 + }, + { + "epoch": 2.244678012029643, + "grad_norm": 0.3540494011551216, + "learning_rate": 5.295165805629443e-06, + "loss": 0.427, + "step": 13668 + }, + { + "epoch": 2.2448422392379856, + "grad_norm": 0.27766510632642816, + "learning_rate": 5.294704208800735e-06, + "loss": 0.4408, + "step": 13669 + }, + { + "epoch": 2.2450064664463287, + "grad_norm": 0.32517347498764476, + "learning_rate": 5.2942426033407585e-06, + "loss": 0.4429, + "step": 13670 + }, + { + "epoch": 2.2451706936546714, + "grad_norm": 0.3013033368759928, + "learning_rate": 5.293780989254526e-06, + "loss": 0.4392, + "step": 13671 + }, + { + "epoch": 2.245334920863014, + "grad_norm": 0.282745086702727, + "learning_rate": 5.293319366547051e-06, + "loss": 0.4381, + "step": 13672 + }, + { + "epoch": 2.2454991480713566, + "grad_norm": 0.3523847887648517, + "learning_rate": 5.292857735223346e-06, + "loss": 0.4364, + "step": 13673 + }, + { + "epoch": 2.2456633752796993, + "grad_norm": 0.3750636418052584, + "learning_rate": 5.292396095288428e-06, + "loss": 0.4701, + "step": 13674 + }, + { + "epoch": 2.2458276024880424, + "grad_norm": 0.6454444354941448, + "learning_rate": 5.2919344467473074e-06, + "loss": 0.4425, + "step": 13675 + }, + { + "epoch": 2.245991829696385, + "grad_norm": 0.32847604128266267, + "learning_rate": 5.2914727896049996e-06, + "loss": 0.4349, + "step": 13676 + }, + { + "epoch": 2.2461560569047276, + "grad_norm": 0.27768628742635093, + "learning_rate": 5.291011123866519e-06, + "loss": 0.4275, + "step": 13677 + }, + { + "epoch": 2.2463202841130703, + "grad_norm": 0.29142901921932585, + "learning_rate": 5.290549449536879e-06, + "loss": 0.4339, + "step": 13678 + }, + { + "epoch": 2.2464845113214134, + "grad_norm": 0.5690887619651148, + "learning_rate": 5.290087766621095e-06, + "loss": 0.429, + "step": 13679 + }, + { + "epoch": 2.246648738529756, + "grad_norm": 0.3532823427162118, + "learning_rate": 5.289626075124179e-06, + "loss": 0.444, + "step": 13680 + }, + { + "epoch": 2.2468129657380986, + "grad_norm": 0.2671035256621192, + "learning_rate": 5.289164375051147e-06, + "loss": 0.46, + "step": 13681 + }, + { + "epoch": 2.2469771929464413, + "grad_norm": 0.34637078136166244, + "learning_rate": 5.2887026664070154e-06, + "loss": 0.4498, + "step": 13682 + }, + { + "epoch": 2.247141420154784, + "grad_norm": 0.3572211705234191, + "learning_rate": 5.288240949196796e-06, + "loss": 0.4573, + "step": 13683 + }, + { + "epoch": 2.247305647363127, + "grad_norm": 0.3579960088428089, + "learning_rate": 5.287779223425504e-06, + "loss": 0.4302, + "step": 13684 + }, + { + "epoch": 2.2474698745714696, + "grad_norm": 0.34066301797241494, + "learning_rate": 5.287317489098154e-06, + "loss": 0.443, + "step": 13685 + }, + { + "epoch": 2.2476341017798123, + "grad_norm": 0.32676299112996604, + "learning_rate": 5.286855746219762e-06, + "loss": 0.4399, + "step": 13686 + }, + { + "epoch": 2.2477983289881553, + "grad_norm": 0.36409542955257634, + "learning_rate": 5.2863939947953435e-06, + "loss": 0.4546, + "step": 13687 + }, + { + "epoch": 2.247962556196498, + "grad_norm": 0.4273691550545424, + "learning_rate": 5.285932234829911e-06, + "loss": 0.4636, + "step": 13688 + }, + { + "epoch": 2.2481267834048406, + "grad_norm": 0.36166863699389046, + "learning_rate": 5.285470466328482e-06, + "loss": 0.4517, + "step": 13689 + }, + { + "epoch": 2.2482910106131833, + "grad_norm": 0.3815787960529812, + "learning_rate": 5.285008689296069e-06, + "loss": 0.4625, + "step": 13690 + }, + { + "epoch": 2.248455237821526, + "grad_norm": 0.26063558836316875, + "learning_rate": 5.284546903737692e-06, + "loss": 0.4617, + "step": 13691 + }, + { + "epoch": 2.248619465029869, + "grad_norm": 0.4614932545774242, + "learning_rate": 5.284085109658363e-06, + "loss": 0.4235, + "step": 13692 + }, + { + "epoch": 2.2487836922382116, + "grad_norm": 0.32931830311298993, + "learning_rate": 5.283623307063098e-06, + "loss": 0.427, + "step": 13693 + }, + { + "epoch": 2.2489479194465543, + "grad_norm": 2.310641384368478, + "learning_rate": 5.283161495956912e-06, + "loss": 0.4346, + "step": 13694 + }, + { + "epoch": 2.249112146654897, + "grad_norm": 0.5378084523050332, + "learning_rate": 5.282699676344822e-06, + "loss": 0.4452, + "step": 13695 + }, + { + "epoch": 2.24927637386324, + "grad_norm": 0.34680472456278405, + "learning_rate": 5.282237848231844e-06, + "loss": 0.4457, + "step": 13696 + }, + { + "epoch": 2.2494406010715826, + "grad_norm": 0.3445890634659044, + "learning_rate": 5.281776011622994e-06, + "loss": 0.4428, + "step": 13697 + }, + { + "epoch": 2.2496048282799253, + "grad_norm": 0.2858278812975166, + "learning_rate": 5.2813141665232865e-06, + "loss": 0.4467, + "step": 13698 + }, + { + "epoch": 2.249769055488268, + "grad_norm": 0.3145020721528898, + "learning_rate": 5.280852312937738e-06, + "loss": 0.433, + "step": 13699 + }, + { + "epoch": 2.2499332826966105, + "grad_norm": 0.3817643009363534, + "learning_rate": 5.280390450871367e-06, + "loss": 0.446, + "step": 13700 + }, + { + "epoch": 2.2500975099049536, + "grad_norm": 0.4631720137309753, + "learning_rate": 5.279928580329187e-06, + "loss": 0.4365, + "step": 13701 + }, + { + "epoch": 2.2502617371132962, + "grad_norm": 0.3874126600272311, + "learning_rate": 5.279466701316214e-06, + "loss": 0.4451, + "step": 13702 + }, + { + "epoch": 2.250425964321639, + "grad_norm": 0.7340837113996945, + "learning_rate": 5.279004813837466e-06, + "loss": 0.454, + "step": 13703 + }, + { + "epoch": 2.250590191529982, + "grad_norm": 0.32826893954558084, + "learning_rate": 5.278542917897961e-06, + "loss": 0.4456, + "step": 13704 + }, + { + "epoch": 2.2507544187383246, + "grad_norm": 0.30313075634072284, + "learning_rate": 5.2780810135027115e-06, + "loss": 0.467, + "step": 13705 + }, + { + "epoch": 2.2509186459466672, + "grad_norm": 0.2936344465324073, + "learning_rate": 5.277619100656739e-06, + "loss": 0.4437, + "step": 13706 + }, + { + "epoch": 2.25108287315501, + "grad_norm": 0.32540102092866996, + "learning_rate": 5.277157179365056e-06, + "loss": 0.4375, + "step": 13707 + }, + { + "epoch": 2.2512471003633525, + "grad_norm": 0.5418154782829453, + "learning_rate": 5.276695249632683e-06, + "loss": 0.428, + "step": 13708 + }, + { + "epoch": 2.2514113275716956, + "grad_norm": 0.4711780155828875, + "learning_rate": 5.276233311464635e-06, + "loss": 0.4598, + "step": 13709 + }, + { + "epoch": 2.2515755547800382, + "grad_norm": 0.4652902743526495, + "learning_rate": 5.275771364865929e-06, + "loss": 0.4705, + "step": 13710 + }, + { + "epoch": 2.251739781988381, + "grad_norm": 0.38330167114251695, + "learning_rate": 5.275309409841583e-06, + "loss": 0.4371, + "step": 13711 + }, + { + "epoch": 2.2519040091967235, + "grad_norm": 0.34378733317073845, + "learning_rate": 5.274847446396615e-06, + "loss": 0.4462, + "step": 13712 + }, + { + "epoch": 2.2520682364050666, + "grad_norm": 0.3504403391095108, + "learning_rate": 5.27438547453604e-06, + "loss": 0.4499, + "step": 13713 + }, + { + "epoch": 2.2522324636134092, + "grad_norm": 0.3535033845955811, + "learning_rate": 5.273923494264877e-06, + "loss": 0.4429, + "step": 13714 + }, + { + "epoch": 2.252396690821752, + "grad_norm": 0.33591679853041095, + "learning_rate": 5.273461505588142e-06, + "loss": 0.4497, + "step": 13715 + }, + { + "epoch": 2.2525609180300945, + "grad_norm": 0.3383253978626295, + "learning_rate": 5.272999508510857e-06, + "loss": 0.4391, + "step": 13716 + }, + { + "epoch": 2.252725145238437, + "grad_norm": 0.32385647020423586, + "learning_rate": 5.272537503038036e-06, + "loss": 0.4337, + "step": 13717 + }, + { + "epoch": 2.2528893724467802, + "grad_norm": 0.3396561220455788, + "learning_rate": 5.2720754891746965e-06, + "loss": 0.4611, + "step": 13718 + }, + { + "epoch": 2.253053599655123, + "grad_norm": 0.38140665585784617, + "learning_rate": 5.271613466925859e-06, + "loss": 0.4453, + "step": 13719 + }, + { + "epoch": 2.2532178268634655, + "grad_norm": 0.6540812656662718, + "learning_rate": 5.2711514362965395e-06, + "loss": 0.443, + "step": 13720 + }, + { + "epoch": 2.2533820540718086, + "grad_norm": 0.28094182270807777, + "learning_rate": 5.270689397291757e-06, + "loss": 0.4211, + "step": 13721 + }, + { + "epoch": 2.253546281280151, + "grad_norm": 0.425522068083951, + "learning_rate": 5.2702273499165286e-06, + "loss": 0.4424, + "step": 13722 + }, + { + "epoch": 2.253710508488494, + "grad_norm": 0.41584840792055144, + "learning_rate": 5.2697652941758724e-06, + "loss": 0.4358, + "step": 13723 + }, + { + "epoch": 2.2538747356968365, + "grad_norm": 0.3177024211408362, + "learning_rate": 5.26930323007481e-06, + "loss": 0.434, + "step": 13724 + }, + { + "epoch": 2.254038962905179, + "grad_norm": 0.3330303286606085, + "learning_rate": 5.2688411576183565e-06, + "loss": 0.4462, + "step": 13725 + }, + { + "epoch": 2.254203190113522, + "grad_norm": 0.294027547485759, + "learning_rate": 5.268379076811532e-06, + "loss": 0.4463, + "step": 13726 + }, + { + "epoch": 2.254367417321865, + "grad_norm": 0.3264056370297087, + "learning_rate": 5.267916987659355e-06, + "loss": 0.4393, + "step": 13727 + }, + { + "epoch": 2.2545316445302075, + "grad_norm": 0.29553128037110177, + "learning_rate": 5.267454890166842e-06, + "loss": 0.4719, + "step": 13728 + }, + { + "epoch": 2.25469587173855, + "grad_norm": 0.331978086815522, + "learning_rate": 5.266992784339016e-06, + "loss": 0.4409, + "step": 13729 + }, + { + "epoch": 2.254860098946893, + "grad_norm": 0.2809751050096038, + "learning_rate": 5.266530670180894e-06, + "loss": 0.4285, + "step": 13730 + }, + { + "epoch": 2.255024326155236, + "grad_norm": 0.3182632580568042, + "learning_rate": 5.266068547697493e-06, + "loss": 0.4454, + "step": 13731 + }, + { + "epoch": 2.2551885533635785, + "grad_norm": 0.32836916473275135, + "learning_rate": 5.265606416893835e-06, + "loss": 0.4307, + "step": 13732 + }, + { + "epoch": 2.255352780571921, + "grad_norm": 0.2853170249483263, + "learning_rate": 5.265144277774938e-06, + "loss": 0.4344, + "step": 13733 + }, + { + "epoch": 2.2555170077802638, + "grad_norm": 0.5178439532805238, + "learning_rate": 5.2646821303458215e-06, + "loss": 0.4399, + "step": 13734 + }, + { + "epoch": 2.255681234988607, + "grad_norm": 0.5507828085958464, + "learning_rate": 5.264219974611505e-06, + "loss": 0.4655, + "step": 13735 + }, + { + "epoch": 2.2558454621969495, + "grad_norm": 0.3382196997965826, + "learning_rate": 5.263757810577006e-06, + "loss": 0.4631, + "step": 13736 + }, + { + "epoch": 2.256009689405292, + "grad_norm": 0.28330362320821995, + "learning_rate": 5.263295638247347e-06, + "loss": 0.4392, + "step": 13737 + }, + { + "epoch": 2.256173916613635, + "grad_norm": 0.35874744100148126, + "learning_rate": 5.262833457627546e-06, + "loss": 0.4491, + "step": 13738 + }, + { + "epoch": 2.256338143821978, + "grad_norm": 0.267510615311155, + "learning_rate": 5.262371268722623e-06, + "loss": 0.4176, + "step": 13739 + }, + { + "epoch": 2.2565023710303205, + "grad_norm": 0.5092229768402633, + "learning_rate": 5.261909071537598e-06, + "loss": 0.4427, + "step": 13740 + }, + { + "epoch": 2.256666598238663, + "grad_norm": 0.2804393487003495, + "learning_rate": 5.261446866077491e-06, + "loss": 0.433, + "step": 13741 + }, + { + "epoch": 2.2568308254470058, + "grad_norm": 0.3899613005160599, + "learning_rate": 5.260984652347323e-06, + "loss": 0.4338, + "step": 13742 + }, + { + "epoch": 2.256995052655349, + "grad_norm": 0.3052554308900633, + "learning_rate": 5.260522430352111e-06, + "loss": 0.418, + "step": 13743 + }, + { + "epoch": 2.2571592798636915, + "grad_norm": 0.3072133071489857, + "learning_rate": 5.260060200096877e-06, + "loss": 0.4692, + "step": 13744 + }, + { + "epoch": 2.257323507072034, + "grad_norm": 0.2834940365753804, + "learning_rate": 5.259597961586644e-06, + "loss": 0.4443, + "step": 13745 + }, + { + "epoch": 2.2574877342803767, + "grad_norm": 0.34360106449572925, + "learning_rate": 5.259135714826429e-06, + "loss": 0.4458, + "step": 13746 + }, + { + "epoch": 2.25765196148872, + "grad_norm": 0.34200371546354, + "learning_rate": 5.2586734598212515e-06, + "loss": 0.4496, + "step": 13747 + }, + { + "epoch": 2.2578161886970625, + "grad_norm": 0.32078529673268424, + "learning_rate": 5.258211196576134e-06, + "loss": 0.4541, + "step": 13748 + }, + { + "epoch": 2.257980415905405, + "grad_norm": 0.7634754044032883, + "learning_rate": 5.257748925096098e-06, + "loss": 0.4563, + "step": 13749 + }, + { + "epoch": 2.2581446431137477, + "grad_norm": 0.341481961545268, + "learning_rate": 5.257286645386164e-06, + "loss": 0.4456, + "step": 13750 + }, + { + "epoch": 2.2583088703220904, + "grad_norm": 0.2911188279442169, + "learning_rate": 5.256824357451351e-06, + "loss": 0.4547, + "step": 13751 + }, + { + "epoch": 2.2584730975304335, + "grad_norm": 0.37741366724925646, + "learning_rate": 5.256362061296681e-06, + "loss": 0.4595, + "step": 13752 + }, + { + "epoch": 2.258637324738776, + "grad_norm": 0.3611787880484088, + "learning_rate": 5.255899756927174e-06, + "loss": 0.4582, + "step": 13753 + }, + { + "epoch": 2.2588015519471187, + "grad_norm": 0.913157147165417, + "learning_rate": 5.255437444347852e-06, + "loss": 0.4317, + "step": 13754 + }, + { + "epoch": 2.258965779155462, + "grad_norm": 0.7085119346882607, + "learning_rate": 5.254975123563737e-06, + "loss": 0.4393, + "step": 13755 + }, + { + "epoch": 2.2591300063638045, + "grad_norm": 0.4123409307715226, + "learning_rate": 5.25451279457985e-06, + "loss": 0.4388, + "step": 13756 + }, + { + "epoch": 2.259294233572147, + "grad_norm": 0.3130367328214307, + "learning_rate": 5.25405045740121e-06, + "loss": 0.461, + "step": 13757 + }, + { + "epoch": 2.2594584607804897, + "grad_norm": 0.4009373961373831, + "learning_rate": 5.253588112032841e-06, + "loss": 0.4546, + "step": 13758 + }, + { + "epoch": 2.2596226879888324, + "grad_norm": 0.3549456349719573, + "learning_rate": 5.253125758479763e-06, + "loss": 0.4515, + "step": 13759 + }, + { + "epoch": 2.2597869151971754, + "grad_norm": 0.3200803902850578, + "learning_rate": 5.2526633967469995e-06, + "loss": 0.4402, + "step": 13760 + }, + { + "epoch": 2.259951142405518, + "grad_norm": 0.3332251927527049, + "learning_rate": 5.25220102683957e-06, + "loss": 0.4173, + "step": 13761 + }, + { + "epoch": 2.2601153696138607, + "grad_norm": 0.3379245162929123, + "learning_rate": 5.251738648762497e-06, + "loss": 0.4477, + "step": 13762 + }, + { + "epoch": 2.2602795968222034, + "grad_norm": 0.4068887227418874, + "learning_rate": 5.251276262520804e-06, + "loss": 0.4476, + "step": 13763 + }, + { + "epoch": 2.2604438240305464, + "grad_norm": 0.35292379389392914, + "learning_rate": 5.25081386811951e-06, + "loss": 0.4647, + "step": 13764 + }, + { + "epoch": 2.260608051238889, + "grad_norm": 0.31261175083494874, + "learning_rate": 5.2503514655636405e-06, + "loss": 0.456, + "step": 13765 + }, + { + "epoch": 2.2607722784472317, + "grad_norm": 0.3023902044167681, + "learning_rate": 5.249889054858214e-06, + "loss": 0.4486, + "step": 13766 + }, + { + "epoch": 2.2609365056555744, + "grad_norm": 0.3510764596785331, + "learning_rate": 5.249426636008257e-06, + "loss": 0.4648, + "step": 13767 + }, + { + "epoch": 2.261100732863917, + "grad_norm": 0.29579015381658, + "learning_rate": 5.248964209018787e-06, + "loss": 0.4333, + "step": 13768 + }, + { + "epoch": 2.26126496007226, + "grad_norm": 0.3517170589499364, + "learning_rate": 5.24850177389483e-06, + "loss": 0.4315, + "step": 13769 + }, + { + "epoch": 2.2614291872806027, + "grad_norm": 0.30858214290887587, + "learning_rate": 5.248039330641407e-06, + "loss": 0.4664, + "step": 13770 + }, + { + "epoch": 2.2615934144889454, + "grad_norm": 0.6687128463622946, + "learning_rate": 5.247576879263542e-06, + "loss": 0.4483, + "step": 13771 + }, + { + "epoch": 2.2617576416972884, + "grad_norm": 0.3588713029547824, + "learning_rate": 5.247114419766255e-06, + "loss": 0.426, + "step": 13772 + }, + { + "epoch": 2.261921868905631, + "grad_norm": 0.3184520396061027, + "learning_rate": 5.246651952154569e-06, + "loss": 0.4415, + "step": 13773 + }, + { + "epoch": 2.2620860961139737, + "grad_norm": 0.3182348076386226, + "learning_rate": 5.246189476433512e-06, + "loss": 0.4154, + "step": 13774 + }, + { + "epoch": 2.2622503233223163, + "grad_norm": 0.3472059557046651, + "learning_rate": 5.245726992608101e-06, + "loss": 0.4297, + "step": 13775 + }, + { + "epoch": 2.262414550530659, + "grad_norm": 0.4028328577668929, + "learning_rate": 5.245264500683361e-06, + "loss": 0.4502, + "step": 13776 + }, + { + "epoch": 2.262578777739002, + "grad_norm": 0.30761601943596584, + "learning_rate": 5.244802000664314e-06, + "loss": 0.4434, + "step": 13777 + }, + { + "epoch": 2.2627430049473447, + "grad_norm": 0.388515157470138, + "learning_rate": 5.244339492555987e-06, + "loss": 0.4364, + "step": 13778 + }, + { + "epoch": 2.2629072321556873, + "grad_norm": 0.35450795824264086, + "learning_rate": 5.2438769763634e-06, + "loss": 0.4367, + "step": 13779 + }, + { + "epoch": 2.26307145936403, + "grad_norm": 0.3294525187120768, + "learning_rate": 5.243414452091578e-06, + "loss": 0.4377, + "step": 13780 + }, + { + "epoch": 2.263235686572373, + "grad_norm": 0.3722129129869963, + "learning_rate": 5.242951919745542e-06, + "loss": 0.4388, + "step": 13781 + }, + { + "epoch": 2.2633999137807157, + "grad_norm": 0.4507616586121919, + "learning_rate": 5.242489379330317e-06, + "loss": 0.4447, + "step": 13782 + }, + { + "epoch": 2.2635641409890583, + "grad_norm": 0.3598737035818175, + "learning_rate": 5.242026830850928e-06, + "loss": 0.4402, + "step": 13783 + }, + { + "epoch": 2.263728368197401, + "grad_norm": 0.3324765686675211, + "learning_rate": 5.241564274312398e-06, + "loss": 0.4466, + "step": 13784 + }, + { + "epoch": 2.2638925954057436, + "grad_norm": 0.4083224933688828, + "learning_rate": 5.241101709719749e-06, + "loss": 0.4498, + "step": 13785 + }, + { + "epoch": 2.2640568226140867, + "grad_norm": 0.2933933784871219, + "learning_rate": 5.240639137078007e-06, + "loss": 0.4265, + "step": 13786 + }, + { + "epoch": 2.2642210498224293, + "grad_norm": 0.27710009965559146, + "learning_rate": 5.240176556392194e-06, + "loss": 0.4314, + "step": 13787 + }, + { + "epoch": 2.264385277030772, + "grad_norm": 0.29841634796489386, + "learning_rate": 5.239713967667338e-06, + "loss": 0.4508, + "step": 13788 + }, + { + "epoch": 2.264549504239115, + "grad_norm": 0.5230942248164453, + "learning_rate": 5.23925137090846e-06, + "loss": 0.4558, + "step": 13789 + }, + { + "epoch": 2.2647137314474577, + "grad_norm": 0.5844725375386487, + "learning_rate": 5.238788766120583e-06, + "loss": 0.4491, + "step": 13790 + }, + { + "epoch": 2.2648779586558003, + "grad_norm": 0.35880413792086935, + "learning_rate": 5.238326153308733e-06, + "loss": 0.4613, + "step": 13791 + }, + { + "epoch": 2.265042185864143, + "grad_norm": 0.27176190824021323, + "learning_rate": 5.237863532477936e-06, + "loss": 0.4297, + "step": 13792 + }, + { + "epoch": 2.2652064130724856, + "grad_norm": 0.31010986715177674, + "learning_rate": 5.237400903633215e-06, + "loss": 0.4429, + "step": 13793 + }, + { + "epoch": 2.2653706402808287, + "grad_norm": 0.35101079799031804, + "learning_rate": 5.236938266779595e-06, + "loss": 0.4443, + "step": 13794 + }, + { + "epoch": 2.2655348674891713, + "grad_norm": 0.2913660569347403, + "learning_rate": 5.236475621922099e-06, + "loss": 0.4161, + "step": 13795 + }, + { + "epoch": 2.265699094697514, + "grad_norm": 0.37533689834149453, + "learning_rate": 5.236012969065754e-06, + "loss": 0.434, + "step": 13796 + }, + { + "epoch": 2.2658633219058566, + "grad_norm": 0.312108000748107, + "learning_rate": 5.2355503082155835e-06, + "loss": 0.4432, + "step": 13797 + }, + { + "epoch": 2.2660275491141997, + "grad_norm": 0.33505756371900425, + "learning_rate": 5.2350876393766135e-06, + "loss": 0.4509, + "step": 13798 + }, + { + "epoch": 2.2661917763225423, + "grad_norm": 0.2958642981997617, + "learning_rate": 5.2346249625538684e-06, + "loss": 0.4313, + "step": 13799 + }, + { + "epoch": 2.266356003530885, + "grad_norm": 0.3161571807919032, + "learning_rate": 5.234162277752374e-06, + "loss": 0.4637, + "step": 13800 + }, + { + "epoch": 2.2665202307392276, + "grad_norm": 0.2850364464418512, + "learning_rate": 5.233699584977154e-06, + "loss": 0.4434, + "step": 13801 + }, + { + "epoch": 2.2666844579475702, + "grad_norm": 0.44128845128357014, + "learning_rate": 5.233236884233234e-06, + "loss": 0.4314, + "step": 13802 + }, + { + "epoch": 2.2668486851559133, + "grad_norm": 0.2709781947969362, + "learning_rate": 5.232774175525642e-06, + "loss": 0.4356, + "step": 13803 + }, + { + "epoch": 2.267012912364256, + "grad_norm": 0.3143610695517449, + "learning_rate": 5.232311458859401e-06, + "loss": 0.452, + "step": 13804 + }, + { + "epoch": 2.2671771395725986, + "grad_norm": 0.2811302211053564, + "learning_rate": 5.231848734239536e-06, + "loss": 0.4454, + "step": 13805 + }, + { + "epoch": 2.2673413667809417, + "grad_norm": 0.29524919404369326, + "learning_rate": 5.231386001671074e-06, + "loss": 0.4252, + "step": 13806 + }, + { + "epoch": 2.2675055939892843, + "grad_norm": 0.3036902739174187, + "learning_rate": 5.23092326115904e-06, + "loss": 0.4454, + "step": 13807 + }, + { + "epoch": 2.267669821197627, + "grad_norm": 0.3115339770671733, + "learning_rate": 5.230460512708461e-06, + "loss": 0.4635, + "step": 13808 + }, + { + "epoch": 2.2678340484059696, + "grad_norm": 0.29688931339668434, + "learning_rate": 5.2299977563243635e-06, + "loss": 0.4468, + "step": 13809 + }, + { + "epoch": 2.267998275614312, + "grad_norm": 0.3085715958936347, + "learning_rate": 5.229534992011769e-06, + "loss": 0.4583, + "step": 13810 + }, + { + "epoch": 2.2681625028226553, + "grad_norm": 0.3095940496568009, + "learning_rate": 5.229072219775708e-06, + "loss": 0.4407, + "step": 13811 + }, + { + "epoch": 2.268326730030998, + "grad_norm": 0.4908325226525298, + "learning_rate": 5.228609439621206e-06, + "loss": 0.4401, + "step": 13812 + }, + { + "epoch": 2.2684909572393406, + "grad_norm": 0.44589233555087854, + "learning_rate": 5.2281466515532894e-06, + "loss": 0.4209, + "step": 13813 + }, + { + "epoch": 2.268655184447683, + "grad_norm": 0.2802820328979476, + "learning_rate": 5.227683855576983e-06, + "loss": 0.4287, + "step": 13814 + }, + { + "epoch": 2.2688194116560263, + "grad_norm": 0.4046303814982037, + "learning_rate": 5.227221051697313e-06, + "loss": 0.4517, + "step": 13815 + }, + { + "epoch": 2.268983638864369, + "grad_norm": 0.33943476882464263, + "learning_rate": 5.226758239919308e-06, + "loss": 0.4378, + "step": 13816 + }, + { + "epoch": 2.2691478660727116, + "grad_norm": 0.3936793154256377, + "learning_rate": 5.226295420247994e-06, + "loss": 0.459, + "step": 13817 + }, + { + "epoch": 2.269312093281054, + "grad_norm": 0.2697716266058795, + "learning_rate": 5.225832592688397e-06, + "loss": 0.4345, + "step": 13818 + }, + { + "epoch": 2.269476320489397, + "grad_norm": 0.3475167102931843, + "learning_rate": 5.2253697572455424e-06, + "loss": 0.4517, + "step": 13819 + }, + { + "epoch": 2.26964054769774, + "grad_norm": 0.3057864002599708, + "learning_rate": 5.224906913924459e-06, + "loss": 0.4186, + "step": 13820 + }, + { + "epoch": 2.2698047749060826, + "grad_norm": 0.3029753715957564, + "learning_rate": 5.224444062730174e-06, + "loss": 0.4541, + "step": 13821 + }, + { + "epoch": 2.269969002114425, + "grad_norm": 0.307432523810183, + "learning_rate": 5.2239812036677135e-06, + "loss": 0.4453, + "step": 13822 + }, + { + "epoch": 2.2701332293227683, + "grad_norm": 0.3060059931511143, + "learning_rate": 5.2235183367421055e-06, + "loss": 0.4426, + "step": 13823 + }, + { + "epoch": 2.270297456531111, + "grad_norm": 0.36256508614149663, + "learning_rate": 5.2230554619583756e-06, + "loss": 0.4367, + "step": 13824 + }, + { + "epoch": 2.2704616837394536, + "grad_norm": 0.3647724136370156, + "learning_rate": 5.222592579321552e-06, + "loss": 0.4479, + "step": 13825 + }, + { + "epoch": 2.270625910947796, + "grad_norm": 0.31823959283917014, + "learning_rate": 5.2221296888366615e-06, + "loss": 0.442, + "step": 13826 + }, + { + "epoch": 2.270790138156139, + "grad_norm": 0.30892461908438634, + "learning_rate": 5.221666790508733e-06, + "loss": 0.4305, + "step": 13827 + }, + { + "epoch": 2.270954365364482, + "grad_norm": 0.5498779110061026, + "learning_rate": 5.221203884342793e-06, + "loss": 0.4374, + "step": 13828 + }, + { + "epoch": 2.2711185925728246, + "grad_norm": 0.3437254626690269, + "learning_rate": 5.22074097034387e-06, + "loss": 0.4173, + "step": 13829 + }, + { + "epoch": 2.271282819781167, + "grad_norm": 0.39163066483566433, + "learning_rate": 5.22027804851699e-06, + "loss": 0.4317, + "step": 13830 + }, + { + "epoch": 2.27144704698951, + "grad_norm": 0.31973453986137745, + "learning_rate": 5.219815118867182e-06, + "loss": 0.417, + "step": 13831 + }, + { + "epoch": 2.271611274197853, + "grad_norm": 0.5012814391620765, + "learning_rate": 5.219352181399474e-06, + "loss": 0.4577, + "step": 13832 + }, + { + "epoch": 2.2717755014061956, + "grad_norm": 0.3326116761911855, + "learning_rate": 5.218889236118892e-06, + "loss": 0.4128, + "step": 13833 + }, + { + "epoch": 2.271939728614538, + "grad_norm": 0.34504752744659983, + "learning_rate": 5.218426283030467e-06, + "loss": 0.4525, + "step": 13834 + }, + { + "epoch": 2.272103955822881, + "grad_norm": 0.3250596236621934, + "learning_rate": 5.2179633221392255e-06, + "loss": 0.4545, + "step": 13835 + }, + { + "epoch": 2.2722681830312235, + "grad_norm": 0.28065194575482644, + "learning_rate": 5.217500353450196e-06, + "loss": 0.4345, + "step": 13836 + }, + { + "epoch": 2.2724324102395665, + "grad_norm": 0.7255066479196198, + "learning_rate": 5.217037376968406e-06, + "loss": 0.4438, + "step": 13837 + }, + { + "epoch": 2.272596637447909, + "grad_norm": 0.4017089397573672, + "learning_rate": 5.216574392698886e-06, + "loss": 0.4591, + "step": 13838 + }, + { + "epoch": 2.272760864656252, + "grad_norm": 0.5304928041774932, + "learning_rate": 5.216111400646662e-06, + "loss": 0.4441, + "step": 13839 + }, + { + "epoch": 2.272925091864595, + "grad_norm": 0.2609976185282584, + "learning_rate": 5.215648400816763e-06, + "loss": 0.4351, + "step": 13840 + }, + { + "epoch": 2.2730893190729375, + "grad_norm": 0.358583283765098, + "learning_rate": 5.215185393214219e-06, + "loss": 0.4385, + "step": 13841 + }, + { + "epoch": 2.27325354628128, + "grad_norm": 0.3020230812447199, + "learning_rate": 5.2147223778440586e-06, + "loss": 0.4399, + "step": 13842 + }, + { + "epoch": 2.273417773489623, + "grad_norm": 0.3367916444539525, + "learning_rate": 5.21425935471131e-06, + "loss": 0.4235, + "step": 13843 + }, + { + "epoch": 2.2735820006979655, + "grad_norm": 0.2941663693437456, + "learning_rate": 5.213796323821002e-06, + "loss": 0.4502, + "step": 13844 + }, + { + "epoch": 2.2737462279063085, + "grad_norm": 0.4697417001010032, + "learning_rate": 5.2133332851781624e-06, + "loss": 0.4267, + "step": 13845 + }, + { + "epoch": 2.273910455114651, + "grad_norm": 0.3388810367942243, + "learning_rate": 5.212870238787823e-06, + "loss": 0.4564, + "step": 13846 + }, + { + "epoch": 2.274074682322994, + "grad_norm": 0.2844059312289679, + "learning_rate": 5.2124071846550115e-06, + "loss": 0.4527, + "step": 13847 + }, + { + "epoch": 2.2742389095313364, + "grad_norm": 0.4828332149280781, + "learning_rate": 5.2119441227847555e-06, + "loss": 0.4426, + "step": 13848 + }, + { + "epoch": 2.2744031367396795, + "grad_norm": 0.33234453953277954, + "learning_rate": 5.2114810531820864e-06, + "loss": 0.456, + "step": 13849 + }, + { + "epoch": 2.274567363948022, + "grad_norm": 0.29849007460004134, + "learning_rate": 5.211017975852035e-06, + "loss": 0.4232, + "step": 13850 + }, + { + "epoch": 2.274731591156365, + "grad_norm": 0.31706913669563674, + "learning_rate": 5.210554890799626e-06, + "loss": 0.4273, + "step": 13851 + }, + { + "epoch": 2.2748958183647074, + "grad_norm": 0.4073711965734478, + "learning_rate": 5.210091798029894e-06, + "loss": 0.4376, + "step": 13852 + }, + { + "epoch": 2.27506004557305, + "grad_norm": 0.5229393755986401, + "learning_rate": 5.209628697547865e-06, + "loss": 0.4459, + "step": 13853 + }, + { + "epoch": 2.275224272781393, + "grad_norm": 0.3537211489217148, + "learning_rate": 5.209165589358571e-06, + "loss": 0.4747, + "step": 13854 + }, + { + "epoch": 2.275388499989736, + "grad_norm": 0.40748720524206644, + "learning_rate": 5.20870247346704e-06, + "loss": 0.4416, + "step": 13855 + }, + { + "epoch": 2.2755527271980784, + "grad_norm": 0.3216274797784098, + "learning_rate": 5.2082393498783036e-06, + "loss": 0.4572, + "step": 13856 + }, + { + "epoch": 2.2757169544064215, + "grad_norm": 0.35278286290399846, + "learning_rate": 5.207776218597391e-06, + "loss": 0.4452, + "step": 13857 + }, + { + "epoch": 2.275881181614764, + "grad_norm": 0.4098238080258137, + "learning_rate": 5.207313079629331e-06, + "loss": 0.432, + "step": 13858 + }, + { + "epoch": 2.276045408823107, + "grad_norm": 0.4580092831691726, + "learning_rate": 5.2068499329791575e-06, + "loss": 0.4349, + "step": 13859 + }, + { + "epoch": 2.2762096360314494, + "grad_norm": 0.38826013386276875, + "learning_rate": 5.206386778651896e-06, + "loss": 0.4141, + "step": 13860 + }, + { + "epoch": 2.276373863239792, + "grad_norm": 0.2941958339679804, + "learning_rate": 5.20592361665258e-06, + "loss": 0.4247, + "step": 13861 + }, + { + "epoch": 2.276538090448135, + "grad_norm": 0.3003495984838666, + "learning_rate": 5.20546044698624e-06, + "loss": 0.4258, + "step": 13862 + }, + { + "epoch": 2.276702317656478, + "grad_norm": 0.33772284519341567, + "learning_rate": 5.204997269657905e-06, + "loss": 0.439, + "step": 13863 + }, + { + "epoch": 2.2768665448648204, + "grad_norm": 0.35383257962280384, + "learning_rate": 5.2045340846726054e-06, + "loss": 0.4494, + "step": 13864 + }, + { + "epoch": 2.277030772073163, + "grad_norm": 0.3434125913276977, + "learning_rate": 5.204070892035372e-06, + "loss": 0.4356, + "step": 13865 + }, + { + "epoch": 2.277194999281506, + "grad_norm": 0.30071978455766885, + "learning_rate": 5.203607691751237e-06, + "loss": 0.4401, + "step": 13866 + }, + { + "epoch": 2.277359226489849, + "grad_norm": 0.29424246947418803, + "learning_rate": 5.203144483825231e-06, + "loss": 0.4518, + "step": 13867 + }, + { + "epoch": 2.2775234536981914, + "grad_norm": 0.7205369293800823, + "learning_rate": 5.202681268262382e-06, + "loss": 0.433, + "step": 13868 + }, + { + "epoch": 2.277687680906534, + "grad_norm": 0.4550218086842287, + "learning_rate": 5.202218045067724e-06, + "loss": 0.4227, + "step": 13869 + }, + { + "epoch": 2.2778519081148767, + "grad_norm": 0.5522939727022292, + "learning_rate": 5.201754814246286e-06, + "loss": 0.4329, + "step": 13870 + }, + { + "epoch": 2.27801613532322, + "grad_norm": 0.34802550224546164, + "learning_rate": 5.201291575803103e-06, + "loss": 0.4396, + "step": 13871 + }, + { + "epoch": 2.2781803625315624, + "grad_norm": 0.29412082409797263, + "learning_rate": 5.200828329743202e-06, + "loss": 0.4474, + "step": 13872 + }, + { + "epoch": 2.278344589739905, + "grad_norm": 0.3387381717897766, + "learning_rate": 5.200365076071616e-06, + "loss": 0.4344, + "step": 13873 + }, + { + "epoch": 2.278508816948248, + "grad_norm": 0.3520361563968808, + "learning_rate": 5.199901814793376e-06, + "loss": 0.4562, + "step": 13874 + }, + { + "epoch": 2.2786730441565908, + "grad_norm": 0.3288366800434973, + "learning_rate": 5.199438545913514e-06, + "loss": 0.4355, + "step": 13875 + }, + { + "epoch": 2.2788372713649334, + "grad_norm": 0.29865338535780817, + "learning_rate": 5.19897526943706e-06, + "loss": 0.4518, + "step": 13876 + }, + { + "epoch": 2.279001498573276, + "grad_norm": 0.3404207815125893, + "learning_rate": 5.198511985369049e-06, + "loss": 0.4322, + "step": 13877 + }, + { + "epoch": 2.2791657257816187, + "grad_norm": 0.31980604254782763, + "learning_rate": 5.198048693714509e-06, + "loss": 0.4281, + "step": 13878 + }, + { + "epoch": 2.2793299529899618, + "grad_norm": 0.2872471598384732, + "learning_rate": 5.197585394478474e-06, + "loss": 0.4482, + "step": 13879 + }, + { + "epoch": 2.2794941801983044, + "grad_norm": 0.3154232003073397, + "learning_rate": 5.1971220876659745e-06, + "loss": 0.4364, + "step": 13880 + }, + { + "epoch": 2.279658407406647, + "grad_norm": 0.6348972132812276, + "learning_rate": 5.196658773282044e-06, + "loss": 0.4298, + "step": 13881 + }, + { + "epoch": 2.2798226346149897, + "grad_norm": 0.28308749448615994, + "learning_rate": 5.196195451331715e-06, + "loss": 0.4315, + "step": 13882 + }, + { + "epoch": 2.2799868618233328, + "grad_norm": 0.37848023486848437, + "learning_rate": 5.195732121820016e-06, + "loss": 0.4639, + "step": 13883 + }, + { + "epoch": 2.2801510890316754, + "grad_norm": 0.40271339721462446, + "learning_rate": 5.1952687847519835e-06, + "loss": 0.4328, + "step": 13884 + }, + { + "epoch": 2.280315316240018, + "grad_norm": 0.3014707376437167, + "learning_rate": 5.194805440132647e-06, + "loss": 0.4214, + "step": 13885 + }, + { + "epoch": 2.2804795434483607, + "grad_norm": 0.3373711819342816, + "learning_rate": 5.19434208796704e-06, + "loss": 0.4473, + "step": 13886 + }, + { + "epoch": 2.2806437706567033, + "grad_norm": 0.38535680109940185, + "learning_rate": 5.193878728260194e-06, + "loss": 0.423, + "step": 13887 + }, + { + "epoch": 2.2808079978650464, + "grad_norm": 0.39323458035612957, + "learning_rate": 5.193415361017145e-06, + "loss": 0.4376, + "step": 13888 + }, + { + "epoch": 2.280972225073389, + "grad_norm": 0.40926189382068584, + "learning_rate": 5.19295198624292e-06, + "loss": 0.4268, + "step": 13889 + }, + { + "epoch": 2.2811364522817317, + "grad_norm": 0.39605642816296704, + "learning_rate": 5.192488603942555e-06, + "loss": 0.441, + "step": 13890 + }, + { + "epoch": 2.2813006794900748, + "grad_norm": 0.3321036515817321, + "learning_rate": 5.192025214121084e-06, + "loss": 0.4469, + "step": 13891 + }, + { + "epoch": 2.2814649066984174, + "grad_norm": 0.3084896280179283, + "learning_rate": 5.191561816783538e-06, + "loss": 0.4458, + "step": 13892 + }, + { + "epoch": 2.28162913390676, + "grad_norm": 0.4768889527534172, + "learning_rate": 5.1910984119349495e-06, + "loss": 0.4617, + "step": 13893 + }, + { + "epoch": 2.2817933611151027, + "grad_norm": 0.479253467678791, + "learning_rate": 5.190634999580352e-06, + "loss": 0.4393, + "step": 13894 + }, + { + "epoch": 2.2819575883234453, + "grad_norm": 0.34832956361346284, + "learning_rate": 5.190171579724779e-06, + "loss": 0.4353, + "step": 13895 + }, + { + "epoch": 2.2821218155317884, + "grad_norm": 0.3443708744422341, + "learning_rate": 5.189708152373266e-06, + "loss": 0.4213, + "step": 13896 + }, + { + "epoch": 2.282286042740131, + "grad_norm": 0.5197437799964241, + "learning_rate": 5.189244717530841e-06, + "loss": 0.4352, + "step": 13897 + }, + { + "epoch": 2.2824502699484737, + "grad_norm": 0.3786963383411827, + "learning_rate": 5.188781275202542e-06, + "loss": 0.4144, + "step": 13898 + }, + { + "epoch": 2.2826144971568163, + "grad_norm": 0.31878254563006886, + "learning_rate": 5.188317825393398e-06, + "loss": 0.4375, + "step": 13899 + }, + { + "epoch": 2.2827787243651594, + "grad_norm": 0.32371021620229423, + "learning_rate": 5.187854368108448e-06, + "loss": 0.467, + "step": 13900 + }, + { + "epoch": 2.282942951573502, + "grad_norm": 0.27065989521911527, + "learning_rate": 5.1873909033527225e-06, + "loss": 0.4494, + "step": 13901 + }, + { + "epoch": 2.2831071787818447, + "grad_norm": 0.36088684689567835, + "learning_rate": 5.186927431131254e-06, + "loss": 0.4493, + "step": 13902 + }, + { + "epoch": 2.2832714059901873, + "grad_norm": 0.3054866896938407, + "learning_rate": 5.186463951449079e-06, + "loss": 0.444, + "step": 13903 + }, + { + "epoch": 2.28343563319853, + "grad_norm": 0.9427890070184183, + "learning_rate": 5.1860004643112295e-06, + "loss": 0.4566, + "step": 13904 + }, + { + "epoch": 2.283599860406873, + "grad_norm": 0.36980738576340033, + "learning_rate": 5.1855369697227405e-06, + "loss": 0.4283, + "step": 13905 + }, + { + "epoch": 2.2837640876152157, + "grad_norm": 0.3261290341141939, + "learning_rate": 5.185073467688646e-06, + "loss": 0.4368, + "step": 13906 + }, + { + "epoch": 2.2839283148235583, + "grad_norm": 0.33320968485632513, + "learning_rate": 5.184609958213978e-06, + "loss": 0.4469, + "step": 13907 + }, + { + "epoch": 2.2840925420319014, + "grad_norm": 0.2856889531244492, + "learning_rate": 5.184146441303773e-06, + "loss": 0.4525, + "step": 13908 + }, + { + "epoch": 2.284256769240244, + "grad_norm": 0.3942354513878941, + "learning_rate": 5.183682916963066e-06, + "loss": 0.4316, + "step": 13909 + }, + { + "epoch": 2.2844209964485866, + "grad_norm": 0.3732289311915922, + "learning_rate": 5.183219385196887e-06, + "loss": 0.4307, + "step": 13910 + }, + { + "epoch": 2.2845852236569293, + "grad_norm": 0.33045744667353805, + "learning_rate": 5.182755846010276e-06, + "loss": 0.4419, + "step": 13911 + }, + { + "epoch": 2.284749450865272, + "grad_norm": 0.3139712346377227, + "learning_rate": 5.1822922994082636e-06, + "loss": 0.4438, + "step": 13912 + }, + { + "epoch": 2.284913678073615, + "grad_norm": 0.4955002940911547, + "learning_rate": 5.181828745395886e-06, + "loss": 0.4432, + "step": 13913 + }, + { + "epoch": 2.2850779052819576, + "grad_norm": 0.353124834913557, + "learning_rate": 5.181365183978176e-06, + "loss": 0.4328, + "step": 13914 + }, + { + "epoch": 2.2852421324903003, + "grad_norm": 0.33109773121135133, + "learning_rate": 5.180901615160172e-06, + "loss": 0.4238, + "step": 13915 + }, + { + "epoch": 2.285406359698643, + "grad_norm": 0.3091236114028852, + "learning_rate": 5.180438038946905e-06, + "loss": 0.4395, + "step": 13916 + }, + { + "epoch": 2.285570586906986, + "grad_norm": 0.3072253498064372, + "learning_rate": 5.179974455343412e-06, + "loss": 0.4536, + "step": 13917 + }, + { + "epoch": 2.2857348141153286, + "grad_norm": 0.36653777780275587, + "learning_rate": 5.179510864354727e-06, + "loss": 0.4406, + "step": 13918 + }, + { + "epoch": 2.2858990413236713, + "grad_norm": 0.3347389257699555, + "learning_rate": 5.179047265985885e-06, + "loss": 0.4269, + "step": 13919 + }, + { + "epoch": 2.286063268532014, + "grad_norm": 0.30147606850707453, + "learning_rate": 5.178583660241923e-06, + "loss": 0.4446, + "step": 13920 + }, + { + "epoch": 2.2862274957403566, + "grad_norm": 0.37986592730870994, + "learning_rate": 5.178120047127874e-06, + "loss": 0.4437, + "step": 13921 + }, + { + "epoch": 2.2863917229486996, + "grad_norm": 0.4007300546013079, + "learning_rate": 5.177656426648774e-06, + "loss": 0.4545, + "step": 13922 + }, + { + "epoch": 2.2865559501570423, + "grad_norm": 0.32051943336442223, + "learning_rate": 5.177192798809658e-06, + "loss": 0.4105, + "step": 13923 + }, + { + "epoch": 2.286720177365385, + "grad_norm": 0.38011102453141177, + "learning_rate": 5.1767291636155625e-06, + "loss": 0.4485, + "step": 13924 + }, + { + "epoch": 2.286884404573728, + "grad_norm": 0.35926434734994367, + "learning_rate": 5.176265521071523e-06, + "loss": 0.4362, + "step": 13925 + }, + { + "epoch": 2.2870486317820706, + "grad_norm": 0.3731633735917125, + "learning_rate": 5.175801871182575e-06, + "loss": 0.4455, + "step": 13926 + }, + { + "epoch": 2.2872128589904133, + "grad_norm": 0.29885224871406696, + "learning_rate": 5.175338213953752e-06, + "loss": 0.457, + "step": 13927 + }, + { + "epoch": 2.287377086198756, + "grad_norm": 0.3104186968740682, + "learning_rate": 5.174874549390092e-06, + "loss": 0.4183, + "step": 13928 + }, + { + "epoch": 2.2875413134070985, + "grad_norm": 0.34129434291138006, + "learning_rate": 5.17441087749663e-06, + "loss": 0.4402, + "step": 13929 + }, + { + "epoch": 2.2877055406154416, + "grad_norm": 0.31123742064990884, + "learning_rate": 5.173947198278405e-06, + "loss": 0.4332, + "step": 13930 + }, + { + "epoch": 2.2878697678237843, + "grad_norm": 0.29266834175594275, + "learning_rate": 5.173483511740448e-06, + "loss": 0.444, + "step": 13931 + }, + { + "epoch": 2.288033995032127, + "grad_norm": 0.3725237989488707, + "learning_rate": 5.173019817887798e-06, + "loss": 0.4677, + "step": 13932 + }, + { + "epoch": 2.2881982222404695, + "grad_norm": 0.31685516806527486, + "learning_rate": 5.1725561167254895e-06, + "loss": 0.4456, + "step": 13933 + }, + { + "epoch": 2.2883624494488126, + "grad_norm": 0.39825508760706196, + "learning_rate": 5.172092408258562e-06, + "loss": 0.4505, + "step": 13934 + }, + { + "epoch": 2.2885266766571553, + "grad_norm": 0.3634662074018649, + "learning_rate": 5.171628692492049e-06, + "loss": 0.4398, + "step": 13935 + }, + { + "epoch": 2.288690903865498, + "grad_norm": 0.4549972944347865, + "learning_rate": 5.171164969430987e-06, + "loss": 0.4439, + "step": 13936 + }, + { + "epoch": 2.2888551310738405, + "grad_norm": 0.40498915261239143, + "learning_rate": 5.1707012390804125e-06, + "loss": 0.4444, + "step": 13937 + }, + { + "epoch": 2.289019358282183, + "grad_norm": 0.4218955172962593, + "learning_rate": 5.1702375014453645e-06, + "loss": 0.4519, + "step": 13938 + }, + { + "epoch": 2.2891835854905263, + "grad_norm": 0.3348184465306751, + "learning_rate": 5.1697737565308755e-06, + "loss": 0.4556, + "step": 13939 + }, + { + "epoch": 2.289347812698869, + "grad_norm": 0.3186255812768499, + "learning_rate": 5.169310004341987e-06, + "loss": 0.4308, + "step": 13940 + }, + { + "epoch": 2.2895120399072115, + "grad_norm": 0.38078161937545435, + "learning_rate": 5.1688462448837315e-06, + "loss": 0.4242, + "step": 13941 + }, + { + "epoch": 2.2896762671155546, + "grad_norm": 0.2922132896482114, + "learning_rate": 5.168382478161149e-06, + "loss": 0.4328, + "step": 13942 + }, + { + "epoch": 2.2898404943238972, + "grad_norm": 0.34740487598494213, + "learning_rate": 5.167918704179275e-06, + "loss": 0.4308, + "step": 13943 + }, + { + "epoch": 2.29000472153224, + "grad_norm": 0.3537760593250267, + "learning_rate": 5.167454922943146e-06, + "loss": 0.4307, + "step": 13944 + }, + { + "epoch": 2.2901689487405825, + "grad_norm": 0.2928523746459882, + "learning_rate": 5.1669911344578e-06, + "loss": 0.4521, + "step": 13945 + }, + { + "epoch": 2.290333175948925, + "grad_norm": 0.47139678614977115, + "learning_rate": 5.166527338728275e-06, + "loss": 0.4305, + "step": 13946 + }, + { + "epoch": 2.2904974031572682, + "grad_norm": 0.2985657027832332, + "learning_rate": 5.166063535759606e-06, + "loss": 0.4491, + "step": 13947 + }, + { + "epoch": 2.290661630365611, + "grad_norm": 0.3731853926343701, + "learning_rate": 5.165599725556832e-06, + "loss": 0.4415, + "step": 13948 + }, + { + "epoch": 2.2908258575739535, + "grad_norm": 0.31532136412340483, + "learning_rate": 5.165135908124991e-06, + "loss": 0.4377, + "step": 13949 + }, + { + "epoch": 2.290990084782296, + "grad_norm": 1.126183260087053, + "learning_rate": 5.1646720834691185e-06, + "loss": 0.433, + "step": 13950 + }, + { + "epoch": 2.2911543119906392, + "grad_norm": 0.2954449428221947, + "learning_rate": 5.164208251594255e-06, + "loss": 0.4184, + "step": 13951 + }, + { + "epoch": 2.291318539198982, + "grad_norm": 0.3198516287960699, + "learning_rate": 5.163744412505434e-06, + "loss": 0.4518, + "step": 13952 + }, + { + "epoch": 2.2914827664073245, + "grad_norm": 0.417184169928783, + "learning_rate": 5.163280566207697e-06, + "loss": 0.4285, + "step": 13953 + }, + { + "epoch": 2.291646993615667, + "grad_norm": 0.3120139219686837, + "learning_rate": 5.162816712706081e-06, + "loss": 0.4569, + "step": 13954 + }, + { + "epoch": 2.29181122082401, + "grad_norm": 0.33460930796334654, + "learning_rate": 5.162352852005622e-06, + "loss": 0.4061, + "step": 13955 + }, + { + "epoch": 2.291975448032353, + "grad_norm": 0.5502547933745806, + "learning_rate": 5.16188898411136e-06, + "loss": 0.4349, + "step": 13956 + }, + { + "epoch": 2.2921396752406955, + "grad_norm": 0.3768704001174054, + "learning_rate": 5.161425109028332e-06, + "loss": 0.4496, + "step": 13957 + }, + { + "epoch": 2.292303902449038, + "grad_norm": 0.36340695623770736, + "learning_rate": 5.160961226761576e-06, + "loss": 0.4414, + "step": 13958 + }, + { + "epoch": 2.2924681296573812, + "grad_norm": 0.40539103376015434, + "learning_rate": 5.160497337316133e-06, + "loss": 0.4497, + "step": 13959 + }, + { + "epoch": 2.292632356865724, + "grad_norm": 0.3861065338987038, + "learning_rate": 5.160033440697038e-06, + "loss": 0.4512, + "step": 13960 + }, + { + "epoch": 2.2927965840740665, + "grad_norm": 0.2884253200044417, + "learning_rate": 5.15956953690933e-06, + "loss": 0.4278, + "step": 13961 + }, + { + "epoch": 2.292960811282409, + "grad_norm": 0.30611232750057144, + "learning_rate": 5.159105625958048e-06, + "loss": 0.4382, + "step": 13962 + }, + { + "epoch": 2.2931250384907518, + "grad_norm": 0.34316335987687846, + "learning_rate": 5.158641707848231e-06, + "loss": 0.4671, + "step": 13963 + }, + { + "epoch": 2.293289265699095, + "grad_norm": 0.3970977754941982, + "learning_rate": 5.158177782584917e-06, + "loss": 0.4288, + "step": 13964 + }, + { + "epoch": 2.2934534929074375, + "grad_norm": 0.3403187216231823, + "learning_rate": 5.1577138501731435e-06, + "loss": 0.4418, + "step": 13965 + }, + { + "epoch": 2.29361772011578, + "grad_norm": 0.5190818158430706, + "learning_rate": 5.1572499106179515e-06, + "loss": 0.4337, + "step": 13966 + }, + { + "epoch": 2.2937819473241228, + "grad_norm": 0.3232592487442511, + "learning_rate": 5.156785963924378e-06, + "loss": 0.4505, + "step": 13967 + }, + { + "epoch": 2.293946174532466, + "grad_norm": 0.2813019946585075, + "learning_rate": 5.156322010097464e-06, + "loss": 0.4342, + "step": 13968 + }, + { + "epoch": 2.2941104017408085, + "grad_norm": 0.3356722636096625, + "learning_rate": 5.155858049142247e-06, + "loss": 0.4353, + "step": 13969 + }, + { + "epoch": 2.294274628949151, + "grad_norm": 0.2909214828377573, + "learning_rate": 5.155394081063766e-06, + "loss": 0.4321, + "step": 13970 + }, + { + "epoch": 2.2944388561574938, + "grad_norm": 0.3733982655008716, + "learning_rate": 5.154930105867061e-06, + "loss": 0.472, + "step": 13971 + }, + { + "epoch": 2.2946030833658364, + "grad_norm": 0.4654045248653441, + "learning_rate": 5.154466123557169e-06, + "loss": 0.4429, + "step": 13972 + }, + { + "epoch": 2.2947673105741795, + "grad_norm": 0.3637665009212365, + "learning_rate": 5.154002134139132e-06, + "loss": 0.4238, + "step": 13973 + }, + { + "epoch": 2.294931537782522, + "grad_norm": 0.2909657126305831, + "learning_rate": 5.15353813761799e-06, + "loss": 0.4607, + "step": 13974 + }, + { + "epoch": 2.2950957649908648, + "grad_norm": 0.42400086471079623, + "learning_rate": 5.153074133998778e-06, + "loss": 0.4594, + "step": 13975 + }, + { + "epoch": 2.295259992199208, + "grad_norm": 0.36972331015652127, + "learning_rate": 5.15261012328654e-06, + "loss": 0.4262, + "step": 13976 + }, + { + "epoch": 2.2954242194075505, + "grad_norm": 0.4389775606377693, + "learning_rate": 5.152146105486313e-06, + "loss": 0.4249, + "step": 13977 + }, + { + "epoch": 2.295588446615893, + "grad_norm": 0.33415273938695683, + "learning_rate": 5.1516820806031395e-06, + "loss": 0.4354, + "step": 13978 + }, + { + "epoch": 2.2957526738242358, + "grad_norm": 0.4278522839527496, + "learning_rate": 5.151218048642055e-06, + "loss": 0.4476, + "step": 13979 + }, + { + "epoch": 2.2959169010325784, + "grad_norm": 0.3741320360256869, + "learning_rate": 5.150754009608105e-06, + "loss": 0.4484, + "step": 13980 + }, + { + "epoch": 2.2960811282409215, + "grad_norm": 0.311783369735317, + "learning_rate": 5.150289963506323e-06, + "loss": 0.4509, + "step": 13981 + }, + { + "epoch": 2.296245355449264, + "grad_norm": 0.5405852343668192, + "learning_rate": 5.149825910341753e-06, + "loss": 0.4455, + "step": 13982 + }, + { + "epoch": 2.2964095826576068, + "grad_norm": 0.3519118169672042, + "learning_rate": 5.1493618501194355e-06, + "loss": 0.4337, + "step": 13983 + }, + { + "epoch": 2.2965738098659494, + "grad_norm": 0.28809793111921655, + "learning_rate": 5.148897782844409e-06, + "loss": 0.4485, + "step": 13984 + }, + { + "epoch": 2.2967380370742925, + "grad_norm": 0.37024335296106775, + "learning_rate": 5.1484337085217134e-06, + "loss": 0.4317, + "step": 13985 + }, + { + "epoch": 2.296902264282635, + "grad_norm": 0.38276051392136357, + "learning_rate": 5.14796962715639e-06, + "loss": 0.4423, + "step": 13986 + }, + { + "epoch": 2.2970664914909777, + "grad_norm": 0.29217104055044624, + "learning_rate": 5.147505538753478e-06, + "loss": 0.4473, + "step": 13987 + }, + { + "epoch": 2.2972307186993204, + "grad_norm": 0.47709697511449556, + "learning_rate": 5.147041443318021e-06, + "loss": 0.4508, + "step": 13988 + }, + { + "epoch": 2.297394945907663, + "grad_norm": 0.30567178053755234, + "learning_rate": 5.146577340855056e-06, + "loss": 0.4429, + "step": 13989 + }, + { + "epoch": 2.297559173116006, + "grad_norm": 0.3211430439562128, + "learning_rate": 5.146113231369625e-06, + "loss": 0.4527, + "step": 13990 + }, + { + "epoch": 2.2977234003243487, + "grad_norm": 0.30219116042974015, + "learning_rate": 5.145649114866768e-06, + "loss": 0.457, + "step": 13991 + }, + { + "epoch": 2.2978876275326914, + "grad_norm": 0.2898625848815241, + "learning_rate": 5.145184991351529e-06, + "loss": 0.4148, + "step": 13992 + }, + { + "epoch": 2.2980518547410345, + "grad_norm": 1.1202762278001295, + "learning_rate": 5.144720860828944e-06, + "loss": 0.4187, + "step": 13993 + }, + { + "epoch": 2.298216081949377, + "grad_norm": 0.5445625866550631, + "learning_rate": 5.144256723304056e-06, + "loss": 0.4431, + "step": 13994 + }, + { + "epoch": 2.2983803091577197, + "grad_norm": 0.35936423083423646, + "learning_rate": 5.143792578781906e-06, + "loss": 0.4579, + "step": 13995 + }, + { + "epoch": 2.2985445363660624, + "grad_norm": 0.3387039272937696, + "learning_rate": 5.143328427267535e-06, + "loss": 0.4532, + "step": 13996 + }, + { + "epoch": 2.298708763574405, + "grad_norm": 0.35295839414607105, + "learning_rate": 5.142864268765985e-06, + "loss": 0.4416, + "step": 13997 + }, + { + "epoch": 2.298872990782748, + "grad_norm": 0.28125146920926414, + "learning_rate": 5.142400103282298e-06, + "loss": 0.4393, + "step": 13998 + }, + { + "epoch": 2.2990372179910907, + "grad_norm": 0.4012659500929637, + "learning_rate": 5.141935930821512e-06, + "loss": 0.4453, + "step": 13999 + }, + { + "epoch": 2.2992014451994334, + "grad_norm": 0.3300866074431469, + "learning_rate": 5.14147175138867e-06, + "loss": 0.4331, + "step": 14000 + }, + { + "epoch": 2.299365672407776, + "grad_norm": 0.4325211113663812, + "learning_rate": 5.1410075649888156e-06, + "loss": 0.4416, + "step": 14001 + }, + { + "epoch": 2.299529899616119, + "grad_norm": 0.32184158336621926, + "learning_rate": 5.1405433716269865e-06, + "loss": 0.4513, + "step": 14002 + }, + { + "epoch": 2.2996941268244617, + "grad_norm": 0.35911228396405964, + "learning_rate": 5.140079171308228e-06, + "loss": 0.4395, + "step": 14003 + }, + { + "epoch": 2.2998583540328044, + "grad_norm": 0.3489742010041331, + "learning_rate": 5.139614964037577e-06, + "loss": 0.4434, + "step": 14004 + }, + { + "epoch": 2.300022581241147, + "grad_norm": 0.37801129331195343, + "learning_rate": 5.139150749820081e-06, + "loss": 0.4453, + "step": 14005 + }, + { + "epoch": 2.3001868084494896, + "grad_norm": 0.3762648304082034, + "learning_rate": 5.138686528660778e-06, + "loss": 0.4245, + "step": 14006 + }, + { + "epoch": 2.3003510356578327, + "grad_norm": 0.33802117667366877, + "learning_rate": 5.1382223005647094e-06, + "loss": 0.4454, + "step": 14007 + }, + { + "epoch": 2.3005152628661754, + "grad_norm": 0.4124698432484217, + "learning_rate": 5.137758065536921e-06, + "loss": 0.4344, + "step": 14008 + }, + { + "epoch": 2.300679490074518, + "grad_norm": 0.4053973640415479, + "learning_rate": 5.137293823582452e-06, + "loss": 0.4372, + "step": 14009 + }, + { + "epoch": 2.300843717282861, + "grad_norm": 0.32205027479758536, + "learning_rate": 5.1368295747063455e-06, + "loss": 0.4201, + "step": 14010 + }, + { + "epoch": 2.3010079444912037, + "grad_norm": 0.3158597315788733, + "learning_rate": 5.136365318913641e-06, + "loss": 0.4489, + "step": 14011 + }, + { + "epoch": 2.3011721716995464, + "grad_norm": 0.37379440368033845, + "learning_rate": 5.1359010562093855e-06, + "loss": 0.4481, + "step": 14012 + }, + { + "epoch": 2.301336398907889, + "grad_norm": 0.3371250590690597, + "learning_rate": 5.135436786598619e-06, + "loss": 0.4418, + "step": 14013 + }, + { + "epoch": 2.3015006261162316, + "grad_norm": 0.3469445965971112, + "learning_rate": 5.134972510086382e-06, + "loss": 0.4428, + "step": 14014 + }, + { + "epoch": 2.3016648533245747, + "grad_norm": 0.35999734715615955, + "learning_rate": 5.134508226677719e-06, + "loss": 0.4387, + "step": 14015 + }, + { + "epoch": 2.3018290805329173, + "grad_norm": 0.3987865712873361, + "learning_rate": 5.134043936377672e-06, + "loss": 0.4541, + "step": 14016 + }, + { + "epoch": 2.30199330774126, + "grad_norm": 0.30289366918869837, + "learning_rate": 5.133579639191286e-06, + "loss": 0.4473, + "step": 14017 + }, + { + "epoch": 2.3021575349496026, + "grad_norm": 0.403215916390885, + "learning_rate": 5.133115335123601e-06, + "loss": 0.4545, + "step": 14018 + }, + { + "epoch": 2.3023217621579457, + "grad_norm": 0.34769404599655834, + "learning_rate": 5.13265102417966e-06, + "loss": 0.4373, + "step": 14019 + }, + { + "epoch": 2.3024859893662883, + "grad_norm": 0.4296539613249186, + "learning_rate": 5.132186706364507e-06, + "loss": 0.4341, + "step": 14020 + }, + { + "epoch": 2.302650216574631, + "grad_norm": 0.3154857891196642, + "learning_rate": 5.131722381683183e-06, + "loss": 0.4581, + "step": 14021 + }, + { + "epoch": 2.3028144437829736, + "grad_norm": 0.3455431355471438, + "learning_rate": 5.131258050140734e-06, + "loss": 0.4238, + "step": 14022 + }, + { + "epoch": 2.3029786709913163, + "grad_norm": 0.5078514368461862, + "learning_rate": 5.130793711742201e-06, + "loss": 0.4437, + "step": 14023 + }, + { + "epoch": 2.3031428981996593, + "grad_norm": 0.34871399726253866, + "learning_rate": 5.130329366492628e-06, + "loss": 0.4279, + "step": 14024 + }, + { + "epoch": 2.303307125408002, + "grad_norm": 0.4938279579093215, + "learning_rate": 5.129865014397057e-06, + "loss": 0.4359, + "step": 14025 + }, + { + "epoch": 2.3034713526163446, + "grad_norm": 0.4455533239162793, + "learning_rate": 5.129400655460533e-06, + "loss": 0.4302, + "step": 14026 + }, + { + "epoch": 2.3036355798246877, + "grad_norm": 0.317398104299277, + "learning_rate": 5.1289362896881e-06, + "loss": 0.4441, + "step": 14027 + }, + { + "epoch": 2.3037998070330303, + "grad_norm": 0.3335278163989015, + "learning_rate": 5.128471917084798e-06, + "loss": 0.4409, + "step": 14028 + }, + { + "epoch": 2.303964034241373, + "grad_norm": 0.3116408451208168, + "learning_rate": 5.128007537655673e-06, + "loss": 0.4565, + "step": 14029 + }, + { + "epoch": 2.3041282614497156, + "grad_norm": 0.3616035436306267, + "learning_rate": 5.1275431514057686e-06, + "loss": 0.4402, + "step": 14030 + }, + { + "epoch": 2.3042924886580582, + "grad_norm": 0.3156254049719913, + "learning_rate": 5.127078758340128e-06, + "loss": 0.4342, + "step": 14031 + }, + { + "epoch": 2.3044567158664013, + "grad_norm": 0.37717368073785584, + "learning_rate": 5.126614358463795e-06, + "loss": 0.4355, + "step": 14032 + }, + { + "epoch": 2.304620943074744, + "grad_norm": 0.6232263943964357, + "learning_rate": 5.126149951781814e-06, + "loss": 0.4383, + "step": 14033 + }, + { + "epoch": 2.3047851702830866, + "grad_norm": 0.3038821240087618, + "learning_rate": 5.1256855382992285e-06, + "loss": 0.4616, + "step": 14034 + }, + { + "epoch": 2.3049493974914292, + "grad_norm": 0.3415423211697857, + "learning_rate": 5.125221118021082e-06, + "loss": 0.437, + "step": 14035 + }, + { + "epoch": 2.3051136246997723, + "grad_norm": 0.273495773934862, + "learning_rate": 5.124756690952418e-06, + "loss": 0.4553, + "step": 14036 + }, + { + "epoch": 2.305277851908115, + "grad_norm": 0.32609384140101105, + "learning_rate": 5.124292257098284e-06, + "loss": 0.4358, + "step": 14037 + }, + { + "epoch": 2.3054420791164576, + "grad_norm": 0.29790082821128433, + "learning_rate": 5.123827816463722e-06, + "loss": 0.4455, + "step": 14038 + }, + { + "epoch": 2.3056063063248002, + "grad_norm": 0.4301738879695801, + "learning_rate": 5.123363369053774e-06, + "loss": 0.4351, + "step": 14039 + }, + { + "epoch": 2.305770533533143, + "grad_norm": 0.4560846085606717, + "learning_rate": 5.122898914873487e-06, + "loss": 0.4325, + "step": 14040 + }, + { + "epoch": 2.305934760741486, + "grad_norm": 0.3587378842032936, + "learning_rate": 5.122434453927905e-06, + "loss": 0.4325, + "step": 14041 + }, + { + "epoch": 2.3060989879498286, + "grad_norm": 0.4712292651053053, + "learning_rate": 5.121969986222074e-06, + "loss": 0.4323, + "step": 14042 + }, + { + "epoch": 2.3062632151581712, + "grad_norm": 0.40531640859089124, + "learning_rate": 5.121505511761036e-06, + "loss": 0.4324, + "step": 14043 + }, + { + "epoch": 2.3064274423665143, + "grad_norm": 0.28030142930937785, + "learning_rate": 5.121041030549835e-06, + "loss": 0.4453, + "step": 14044 + }, + { + "epoch": 2.306591669574857, + "grad_norm": 0.3595351679875266, + "learning_rate": 5.120576542593519e-06, + "loss": 0.4409, + "step": 14045 + }, + { + "epoch": 2.3067558967831996, + "grad_norm": 0.30426305503665396, + "learning_rate": 5.120112047897132e-06, + "loss": 0.4528, + "step": 14046 + }, + { + "epoch": 2.3069201239915422, + "grad_norm": 0.5848941502153416, + "learning_rate": 5.119647546465717e-06, + "loss": 0.4311, + "step": 14047 + }, + { + "epoch": 2.307084351199885, + "grad_norm": 0.32612350991860606, + "learning_rate": 5.11918303830432e-06, + "loss": 0.4377, + "step": 14048 + }, + { + "epoch": 2.307248578408228, + "grad_norm": 0.2763098121789989, + "learning_rate": 5.118718523417985e-06, + "loss": 0.4235, + "step": 14049 + }, + { + "epoch": 2.3074128056165706, + "grad_norm": 0.3295502056022296, + "learning_rate": 5.118254001811759e-06, + "loss": 0.4374, + "step": 14050 + }, + { + "epoch": 2.307577032824913, + "grad_norm": 0.3994712281531475, + "learning_rate": 5.117789473490688e-06, + "loss": 0.4412, + "step": 14051 + }, + { + "epoch": 2.307741260033256, + "grad_norm": 0.3115856036129265, + "learning_rate": 5.117324938459813e-06, + "loss": 0.4284, + "step": 14052 + }, + { + "epoch": 2.307905487241599, + "grad_norm": 0.9618678183501287, + "learning_rate": 5.116860396724183e-06, + "loss": 0.451, + "step": 14053 + }, + { + "epoch": 2.3080697144499416, + "grad_norm": 0.3948070348891769, + "learning_rate": 5.116395848288842e-06, + "loss": 0.4334, + "step": 14054 + }, + { + "epoch": 2.308233941658284, + "grad_norm": 0.3034880244011292, + "learning_rate": 5.115931293158835e-06, + "loss": 0.4449, + "step": 14055 + }, + { + "epoch": 2.308398168866627, + "grad_norm": 0.31571765021660086, + "learning_rate": 5.11546673133921e-06, + "loss": 0.4207, + "step": 14056 + }, + { + "epoch": 2.3085623960749695, + "grad_norm": 0.29877701295733133, + "learning_rate": 5.1150021628350095e-06, + "loss": 0.4526, + "step": 14057 + }, + { + "epoch": 2.3087266232833126, + "grad_norm": 0.34530395459750013, + "learning_rate": 5.114537587651279e-06, + "loss": 0.4274, + "step": 14058 + }, + { + "epoch": 2.308890850491655, + "grad_norm": 0.369050788922719, + "learning_rate": 5.114073005793068e-06, + "loss": 0.4483, + "step": 14059 + }, + { + "epoch": 2.309055077699998, + "grad_norm": 0.28538396109289, + "learning_rate": 5.113608417265419e-06, + "loss": 0.4338, + "step": 14060 + }, + { + "epoch": 2.309219304908341, + "grad_norm": 0.32589537320587636, + "learning_rate": 5.113143822073379e-06, + "loss": 0.4394, + "step": 14061 + }, + { + "epoch": 2.3093835321166836, + "grad_norm": 0.32042454658452446, + "learning_rate": 5.1126792202219936e-06, + "loss": 0.4452, + "step": 14062 + }, + { + "epoch": 2.309547759325026, + "grad_norm": 0.33793205302871954, + "learning_rate": 5.11221461171631e-06, + "loss": 0.4362, + "step": 14063 + }, + { + "epoch": 2.309711986533369, + "grad_norm": 0.2879793855044109, + "learning_rate": 5.111749996561371e-06, + "loss": 0.4442, + "step": 14064 + }, + { + "epoch": 2.3098762137417115, + "grad_norm": 0.3600649860400257, + "learning_rate": 5.111285374762227e-06, + "loss": 0.4214, + "step": 14065 + }, + { + "epoch": 2.3100404409500546, + "grad_norm": 0.3584892274240055, + "learning_rate": 5.110820746323924e-06, + "loss": 0.4371, + "step": 14066 + }, + { + "epoch": 2.310204668158397, + "grad_norm": 0.3154401212773581, + "learning_rate": 5.1103561112515035e-06, + "loss": 0.4408, + "step": 14067 + }, + { + "epoch": 2.31036889536674, + "grad_norm": 0.42719130845995357, + "learning_rate": 5.109891469550018e-06, + "loss": 0.4538, + "step": 14068 + }, + { + "epoch": 2.3105331225750825, + "grad_norm": 0.33674862607670436, + "learning_rate": 5.109426821224509e-06, + "loss": 0.4304, + "step": 14069 + }, + { + "epoch": 2.3106973497834256, + "grad_norm": 0.33291473916346687, + "learning_rate": 5.108962166280025e-06, + "loss": 0.4515, + "step": 14070 + }, + { + "epoch": 2.310861576991768, + "grad_norm": 0.34892866493515384, + "learning_rate": 5.108497504721614e-06, + "loss": 0.4331, + "step": 14071 + }, + { + "epoch": 2.311025804200111, + "grad_norm": 0.300162383893827, + "learning_rate": 5.108032836554321e-06, + "loss": 0.4546, + "step": 14072 + }, + { + "epoch": 2.3111900314084535, + "grad_norm": 0.28162989647215003, + "learning_rate": 5.107568161783193e-06, + "loss": 0.4533, + "step": 14073 + }, + { + "epoch": 2.311354258616796, + "grad_norm": 0.38128347717669486, + "learning_rate": 5.107103480413277e-06, + "loss": 0.4519, + "step": 14074 + }, + { + "epoch": 2.311518485825139, + "grad_norm": 0.29761447186213036, + "learning_rate": 5.106638792449619e-06, + "loss": 0.4547, + "step": 14075 + }, + { + "epoch": 2.311682713033482, + "grad_norm": 0.40785614066545356, + "learning_rate": 5.10617409789727e-06, + "loss": 0.419, + "step": 14076 + }, + { + "epoch": 2.3118469402418245, + "grad_norm": 0.35949262056041725, + "learning_rate": 5.105709396761271e-06, + "loss": 0.4461, + "step": 14077 + }, + { + "epoch": 2.3120111674501675, + "grad_norm": 0.460989553636019, + "learning_rate": 5.105244689046672e-06, + "loss": 0.4462, + "step": 14078 + }, + { + "epoch": 2.31217539465851, + "grad_norm": 0.2828943692652613, + "learning_rate": 5.104779974758521e-06, + "loss": 0.4257, + "step": 14079 + }, + { + "epoch": 2.312339621866853, + "grad_norm": 0.3003531542996423, + "learning_rate": 5.1043152539018645e-06, + "loss": 0.4605, + "step": 14080 + }, + { + "epoch": 2.3125038490751955, + "grad_norm": 0.3975382133409971, + "learning_rate": 5.103850526481751e-06, + "loss": 0.4359, + "step": 14081 + }, + { + "epoch": 2.312668076283538, + "grad_norm": 0.32455473999813356, + "learning_rate": 5.103385792503224e-06, + "loss": 0.4535, + "step": 14082 + }, + { + "epoch": 2.312832303491881, + "grad_norm": 0.3051514804701622, + "learning_rate": 5.102921051971335e-06, + "loss": 0.465, + "step": 14083 + }, + { + "epoch": 2.312996530700224, + "grad_norm": 0.3355561230778918, + "learning_rate": 5.102456304891131e-06, + "loss": 0.4458, + "step": 14084 + }, + { + "epoch": 2.3131607579085665, + "grad_norm": 0.37683145351901637, + "learning_rate": 5.101991551267657e-06, + "loss": 0.4505, + "step": 14085 + }, + { + "epoch": 2.313324985116909, + "grad_norm": 0.37557984452306886, + "learning_rate": 5.101526791105964e-06, + "loss": 0.4602, + "step": 14086 + }, + { + "epoch": 2.313489212325252, + "grad_norm": 0.3750722304582995, + "learning_rate": 5.101062024411098e-06, + "loss": 0.4527, + "step": 14087 + }, + { + "epoch": 2.313653439533595, + "grad_norm": 0.27858798055563705, + "learning_rate": 5.100597251188107e-06, + "loss": 0.4474, + "step": 14088 + }, + { + "epoch": 2.3138176667419375, + "grad_norm": 0.299630465893534, + "learning_rate": 5.100132471442038e-06, + "loss": 0.4342, + "step": 14089 + }, + { + "epoch": 2.31398189395028, + "grad_norm": 0.3387134968545231, + "learning_rate": 5.0996676851779405e-06, + "loss": 0.4392, + "step": 14090 + }, + { + "epoch": 2.3141461211586227, + "grad_norm": 0.4640134154275601, + "learning_rate": 5.099202892400863e-06, + "loss": 0.4295, + "step": 14091 + }, + { + "epoch": 2.314310348366966, + "grad_norm": 0.33222568022393845, + "learning_rate": 5.098738093115851e-06, + "loss": 0.4542, + "step": 14092 + }, + { + "epoch": 2.3144745755753084, + "grad_norm": 0.4566089439660268, + "learning_rate": 5.098273287327954e-06, + "loss": 0.4321, + "step": 14093 + }, + { + "epoch": 2.314638802783651, + "grad_norm": 0.3340150445386211, + "learning_rate": 5.097808475042221e-06, + "loss": 0.4348, + "step": 14094 + }, + { + "epoch": 2.314803029991994, + "grad_norm": 0.31587059691726316, + "learning_rate": 5.097343656263701e-06, + "loss": 0.44, + "step": 14095 + }, + { + "epoch": 2.314967257200337, + "grad_norm": 0.29369202918284504, + "learning_rate": 5.0968788309974405e-06, + "loss": 0.4436, + "step": 14096 + }, + { + "epoch": 2.3151314844086794, + "grad_norm": 0.3455351361430038, + "learning_rate": 5.096413999248489e-06, + "loss": 0.4494, + "step": 14097 + }, + { + "epoch": 2.315295711617022, + "grad_norm": 0.29585478735310294, + "learning_rate": 5.095949161021894e-06, + "loss": 0.4273, + "step": 14098 + }, + { + "epoch": 2.3154599388253647, + "grad_norm": 0.26622562749246065, + "learning_rate": 5.0954843163227035e-06, + "loss": 0.435, + "step": 14099 + }, + { + "epoch": 2.315624166033708, + "grad_norm": 0.2716855705390083, + "learning_rate": 5.09501946515597e-06, + "loss": 0.4576, + "step": 14100 + }, + { + "epoch": 2.3157883932420504, + "grad_norm": 0.38750734835384804, + "learning_rate": 5.094554607526738e-06, + "loss": 0.4542, + "step": 14101 + }, + { + "epoch": 2.315952620450393, + "grad_norm": 0.39359876643785513, + "learning_rate": 5.094089743440059e-06, + "loss": 0.4345, + "step": 14102 + }, + { + "epoch": 2.3161168476587357, + "grad_norm": 0.2808952967455288, + "learning_rate": 5.09362487290098e-06, + "loss": 0.4414, + "step": 14103 + }, + { + "epoch": 2.316281074867079, + "grad_norm": 0.3194014628944141, + "learning_rate": 5.093159995914551e-06, + "loss": 0.4424, + "step": 14104 + }, + { + "epoch": 2.3164453020754214, + "grad_norm": 0.4332979821627784, + "learning_rate": 5.092695112485822e-06, + "loss": 0.4527, + "step": 14105 + }, + { + "epoch": 2.316609529283764, + "grad_norm": 0.5310827151398166, + "learning_rate": 5.09223022261984e-06, + "loss": 0.446, + "step": 14106 + }, + { + "epoch": 2.3167737564921067, + "grad_norm": 0.2786846133670859, + "learning_rate": 5.091765326321655e-06, + "loss": 0.4418, + "step": 14107 + }, + { + "epoch": 2.3169379837004493, + "grad_norm": 0.329155169416799, + "learning_rate": 5.091300423596316e-06, + "loss": 0.4484, + "step": 14108 + }, + { + "epoch": 2.3171022109087924, + "grad_norm": 0.3980959735763232, + "learning_rate": 5.0908355144488736e-06, + "loss": 0.4526, + "step": 14109 + }, + { + "epoch": 2.317266438117135, + "grad_norm": 0.32127663985472377, + "learning_rate": 5.090370598884376e-06, + "loss": 0.4498, + "step": 14110 + }, + { + "epoch": 2.3174306653254777, + "grad_norm": 0.34683145841006363, + "learning_rate": 5.089905676907873e-06, + "loss": 0.4459, + "step": 14111 + }, + { + "epoch": 2.317594892533821, + "grad_norm": 0.43147739958741066, + "learning_rate": 5.0894407485244124e-06, + "loss": 0.4443, + "step": 14112 + }, + { + "epoch": 2.3177591197421634, + "grad_norm": 0.4648840760522442, + "learning_rate": 5.0889758137390466e-06, + "loss": 0.4456, + "step": 14113 + }, + { + "epoch": 2.317923346950506, + "grad_norm": 0.31100822351587076, + "learning_rate": 5.0885108725568235e-06, + "loss": 0.4501, + "step": 14114 + }, + { + "epoch": 2.3180875741588487, + "grad_norm": 0.31343052539393956, + "learning_rate": 5.088045924982794e-06, + "loss": 0.4409, + "step": 14115 + }, + { + "epoch": 2.3182518013671913, + "grad_norm": 0.34227482246820434, + "learning_rate": 5.0875809710220075e-06, + "loss": 0.4308, + "step": 14116 + }, + { + "epoch": 2.3184160285755344, + "grad_norm": 0.27676355153912735, + "learning_rate": 5.087116010679511e-06, + "loss": 0.4558, + "step": 14117 + }, + { + "epoch": 2.318580255783877, + "grad_norm": 0.3073291035116375, + "learning_rate": 5.08665104396036e-06, + "loss": 0.4345, + "step": 14118 + }, + { + "epoch": 2.3187444829922197, + "grad_norm": 0.32193629097359744, + "learning_rate": 5.086186070869601e-06, + "loss": 0.439, + "step": 14119 + }, + { + "epoch": 2.3189087102005623, + "grad_norm": 0.281479804541597, + "learning_rate": 5.085721091412284e-06, + "loss": 0.4354, + "step": 14120 + }, + { + "epoch": 2.3190729374089054, + "grad_norm": 0.35718102691551273, + "learning_rate": 5.085256105593459e-06, + "loss": 0.424, + "step": 14121 + }, + { + "epoch": 2.319237164617248, + "grad_norm": 0.28737312989718716, + "learning_rate": 5.084791113418178e-06, + "loss": 0.4309, + "step": 14122 + }, + { + "epoch": 2.3194013918255907, + "grad_norm": 0.26492875726681686, + "learning_rate": 5.08432611489149e-06, + "loss": 0.4475, + "step": 14123 + }, + { + "epoch": 2.3195656190339333, + "grad_norm": 0.3803862478716756, + "learning_rate": 5.083861110018444e-06, + "loss": 0.4277, + "step": 14124 + }, + { + "epoch": 2.319729846242276, + "grad_norm": 0.3684425492831474, + "learning_rate": 5.083396098804093e-06, + "loss": 0.4391, + "step": 14125 + }, + { + "epoch": 2.319894073450619, + "grad_norm": 0.2797802642574338, + "learning_rate": 5.082931081253487e-06, + "loss": 0.4504, + "step": 14126 + }, + { + "epoch": 2.3200583006589617, + "grad_norm": 0.31374455966117754, + "learning_rate": 5.0824660573716756e-06, + "loss": 0.432, + "step": 14127 + }, + { + "epoch": 2.3202225278673043, + "grad_norm": 0.33890373708963784, + "learning_rate": 5.082001027163708e-06, + "loss": 0.4218, + "step": 14128 + }, + { + "epoch": 2.3203867550756474, + "grad_norm": 0.3227194866548745, + "learning_rate": 5.081535990634639e-06, + "loss": 0.4614, + "step": 14129 + }, + { + "epoch": 2.32055098228399, + "grad_norm": 0.4470375783159691, + "learning_rate": 5.081070947789517e-06, + "loss": 0.4233, + "step": 14130 + }, + { + "epoch": 2.3207152094923327, + "grad_norm": 0.32100958956086817, + "learning_rate": 5.080605898633392e-06, + "loss": 0.4519, + "step": 14131 + }, + { + "epoch": 2.3208794367006753, + "grad_norm": 0.3035182537145447, + "learning_rate": 5.080140843171315e-06, + "loss": 0.4415, + "step": 14132 + }, + { + "epoch": 2.321043663909018, + "grad_norm": 0.38367426741253585, + "learning_rate": 5.079675781408337e-06, + "loss": 0.4198, + "step": 14133 + }, + { + "epoch": 2.321207891117361, + "grad_norm": 0.28865208882220506, + "learning_rate": 5.079210713349512e-06, + "loss": 0.4461, + "step": 14134 + }, + { + "epoch": 2.3213721183257037, + "grad_norm": 0.36621689632654064, + "learning_rate": 5.078745638999888e-06, + "loss": 0.4345, + "step": 14135 + }, + { + "epoch": 2.3215363455340463, + "grad_norm": 0.4429051331999256, + "learning_rate": 5.0782805583645165e-06, + "loss": 0.4532, + "step": 14136 + }, + { + "epoch": 2.321700572742389, + "grad_norm": 0.31515172267910496, + "learning_rate": 5.077815471448449e-06, + "loss": 0.4311, + "step": 14137 + }, + { + "epoch": 2.321864799950732, + "grad_norm": 0.3330602157170481, + "learning_rate": 5.077350378256737e-06, + "loss": 0.4416, + "step": 14138 + }, + { + "epoch": 2.3220290271590747, + "grad_norm": 0.31717514726939133, + "learning_rate": 5.076885278794433e-06, + "loss": 0.4261, + "step": 14139 + }, + { + "epoch": 2.3221932543674173, + "grad_norm": 0.3302356919478289, + "learning_rate": 5.0764201730665866e-06, + "loss": 0.4234, + "step": 14140 + }, + { + "epoch": 2.32235748157576, + "grad_norm": 0.33608792819627553, + "learning_rate": 5.0759550610782494e-06, + "loss": 0.439, + "step": 14141 + }, + { + "epoch": 2.3225217087841026, + "grad_norm": 0.36175993010131097, + "learning_rate": 5.075489942834474e-06, + "loss": 0.4306, + "step": 14142 + }, + { + "epoch": 2.3226859359924457, + "grad_norm": 0.2664133792406407, + "learning_rate": 5.075024818340312e-06, + "loss": 0.4309, + "step": 14143 + }, + { + "epoch": 2.3228501632007883, + "grad_norm": 0.30003761719849853, + "learning_rate": 5.0745596876008145e-06, + "loss": 0.4443, + "step": 14144 + }, + { + "epoch": 2.323014390409131, + "grad_norm": 0.3464197651882214, + "learning_rate": 5.074094550621033e-06, + "loss": 0.4303, + "step": 14145 + }, + { + "epoch": 2.323178617617474, + "grad_norm": 0.304019749705254, + "learning_rate": 5.07362940740602e-06, + "loss": 0.4499, + "step": 14146 + }, + { + "epoch": 2.3233428448258167, + "grad_norm": 0.3001217873675227, + "learning_rate": 5.073164257960828e-06, + "loss": 0.4441, + "step": 14147 + }, + { + "epoch": 2.3235070720341593, + "grad_norm": 0.46064457005809356, + "learning_rate": 5.072699102290509e-06, + "loss": 0.4646, + "step": 14148 + }, + { + "epoch": 2.323671299242502, + "grad_norm": 0.3779222203657981, + "learning_rate": 5.072233940400112e-06, + "loss": 0.4595, + "step": 14149 + }, + { + "epoch": 2.3238355264508446, + "grad_norm": 0.41772714934031113, + "learning_rate": 5.071768772294692e-06, + "loss": 0.4357, + "step": 14150 + }, + { + "epoch": 2.3239997536591877, + "grad_norm": 0.3716025171384937, + "learning_rate": 5.0713035979793025e-06, + "loss": 0.4315, + "step": 14151 + }, + { + "epoch": 2.3241639808675303, + "grad_norm": 0.305323170356014, + "learning_rate": 5.070838417458992e-06, + "loss": 0.4386, + "step": 14152 + }, + { + "epoch": 2.324328208075873, + "grad_norm": 0.39248809891274733, + "learning_rate": 5.070373230738815e-06, + "loss": 0.4287, + "step": 14153 + }, + { + "epoch": 2.3244924352842156, + "grad_norm": 0.38032600364508357, + "learning_rate": 5.069908037823823e-06, + "loss": 0.4473, + "step": 14154 + }, + { + "epoch": 2.3246566624925586, + "grad_norm": 0.33848005664175346, + "learning_rate": 5.069442838719071e-06, + "loss": 0.449, + "step": 14155 + }, + { + "epoch": 2.3248208897009013, + "grad_norm": 0.3314219058532578, + "learning_rate": 5.068977633429607e-06, + "loss": 0.4281, + "step": 14156 + }, + { + "epoch": 2.324985116909244, + "grad_norm": 0.33796106931331144, + "learning_rate": 5.068512421960487e-06, + "loss": 0.438, + "step": 14157 + }, + { + "epoch": 2.3251493441175866, + "grad_norm": 0.3271594956649946, + "learning_rate": 5.068047204316763e-06, + "loss": 0.4406, + "step": 14158 + }, + { + "epoch": 2.325313571325929, + "grad_norm": 0.4212547832783069, + "learning_rate": 5.067581980503489e-06, + "loss": 0.4389, + "step": 14159 + }, + { + "epoch": 2.3254777985342723, + "grad_norm": 0.31153221402017683, + "learning_rate": 5.067116750525714e-06, + "loss": 0.4556, + "step": 14160 + }, + { + "epoch": 2.325642025742615, + "grad_norm": 0.29817177042555487, + "learning_rate": 5.066651514388493e-06, + "loss": 0.411, + "step": 14161 + }, + { + "epoch": 2.3258062529509576, + "grad_norm": 0.5029510933292733, + "learning_rate": 5.06618627209688e-06, + "loss": 0.4246, + "step": 14162 + }, + { + "epoch": 2.3259704801593006, + "grad_norm": 0.3514442344266123, + "learning_rate": 5.065721023655927e-06, + "loss": 0.4491, + "step": 14163 + }, + { + "epoch": 2.3261347073676433, + "grad_norm": 0.4915963134364145, + "learning_rate": 5.065255769070687e-06, + "loss": 0.4187, + "step": 14164 + }, + { + "epoch": 2.326298934575986, + "grad_norm": 0.2865691070834308, + "learning_rate": 5.064790508346213e-06, + "loss": 0.4314, + "step": 14165 + }, + { + "epoch": 2.3264631617843285, + "grad_norm": 0.3247883587286542, + "learning_rate": 5.064325241487559e-06, + "loss": 0.4283, + "step": 14166 + }, + { + "epoch": 2.326627388992671, + "grad_norm": 0.35794421758596046, + "learning_rate": 5.063859968499777e-06, + "loss": 0.4566, + "step": 14167 + }, + { + "epoch": 2.3267916162010143, + "grad_norm": 0.46948198967138616, + "learning_rate": 5.063394689387921e-06, + "loss": 0.4318, + "step": 14168 + }, + { + "epoch": 2.326955843409357, + "grad_norm": 0.30464816469968276, + "learning_rate": 5.062929404157046e-06, + "loss": 0.4245, + "step": 14169 + }, + { + "epoch": 2.3271200706176995, + "grad_norm": 0.4028945435079524, + "learning_rate": 5.062464112812202e-06, + "loss": 0.416, + "step": 14170 + }, + { + "epoch": 2.327284297826042, + "grad_norm": 0.3313330258097169, + "learning_rate": 5.061998815358444e-06, + "loss": 0.4552, + "step": 14171 + }, + { + "epoch": 2.3274485250343853, + "grad_norm": 0.3077777867179962, + "learning_rate": 5.061533511800827e-06, + "loss": 0.4567, + "step": 14172 + }, + { + "epoch": 2.327612752242728, + "grad_norm": 0.29356991243950614, + "learning_rate": 5.061068202144404e-06, + "loss": 0.4314, + "step": 14173 + }, + { + "epoch": 2.3277769794510705, + "grad_norm": 0.3028690657624948, + "learning_rate": 5.060602886394227e-06, + "loss": 0.4399, + "step": 14174 + }, + { + "epoch": 2.327941206659413, + "grad_norm": 0.3859579578351086, + "learning_rate": 5.060137564555352e-06, + "loss": 0.4357, + "step": 14175 + }, + { + "epoch": 2.328105433867756, + "grad_norm": 0.4652567023941955, + "learning_rate": 5.0596722366328316e-06, + "loss": 0.4401, + "step": 14176 + }, + { + "epoch": 2.328269661076099, + "grad_norm": 0.3837296610403499, + "learning_rate": 5.059206902631719e-06, + "loss": 0.4507, + "step": 14177 + }, + { + "epoch": 2.3284338882844415, + "grad_norm": 0.2869324518141024, + "learning_rate": 5.0587415625570725e-06, + "loss": 0.4319, + "step": 14178 + }, + { + "epoch": 2.328598115492784, + "grad_norm": 0.3169258499686373, + "learning_rate": 5.05827621641394e-06, + "loss": 0.4254, + "step": 14179 + }, + { + "epoch": 2.3287623427011273, + "grad_norm": 0.6569998287700313, + "learning_rate": 5.057810864207379e-06, + "loss": 0.4293, + "step": 14180 + }, + { + "epoch": 2.32892656990947, + "grad_norm": 0.4219660136497421, + "learning_rate": 5.057345505942444e-06, + "loss": 0.4211, + "step": 14181 + }, + { + "epoch": 2.3290907971178125, + "grad_norm": 0.3104954360549804, + "learning_rate": 5.056880141624187e-06, + "loss": 0.4445, + "step": 14182 + }, + { + "epoch": 2.329255024326155, + "grad_norm": 0.27808873262914974, + "learning_rate": 5.056414771257665e-06, + "loss": 0.415, + "step": 14183 + }, + { + "epoch": 2.329419251534498, + "grad_norm": 0.38868025370533194, + "learning_rate": 5.055949394847932e-06, + "loss": 0.4325, + "step": 14184 + }, + { + "epoch": 2.329583478742841, + "grad_norm": 0.34073141928086287, + "learning_rate": 5.05548401240004e-06, + "loss": 0.4338, + "step": 14185 + }, + { + "epoch": 2.3297477059511835, + "grad_norm": 0.3401716194953938, + "learning_rate": 5.0550186239190445e-06, + "loss": 0.4423, + "step": 14186 + }, + { + "epoch": 2.329911933159526, + "grad_norm": 0.3208427891596354, + "learning_rate": 5.0545532294100016e-06, + "loss": 0.4463, + "step": 14187 + }, + { + "epoch": 2.330076160367869, + "grad_norm": 0.36160948952617955, + "learning_rate": 5.054087828877966e-06, + "loss": 0.4596, + "step": 14188 + }, + { + "epoch": 2.330240387576212, + "grad_norm": 0.31086819709984803, + "learning_rate": 5.053622422327991e-06, + "loss": 0.4191, + "step": 14189 + }, + { + "epoch": 2.3304046147845545, + "grad_norm": 0.34567113619909157, + "learning_rate": 5.053157009765131e-06, + "loss": 0.4426, + "step": 14190 + }, + { + "epoch": 2.330568841992897, + "grad_norm": 0.3963209834997856, + "learning_rate": 5.052691591194442e-06, + "loss": 0.4531, + "step": 14191 + }, + { + "epoch": 2.33073306920124, + "grad_norm": 0.3103594508662998, + "learning_rate": 5.052226166620978e-06, + "loss": 0.4437, + "step": 14192 + }, + { + "epoch": 2.3308972964095824, + "grad_norm": 0.6589932994119513, + "learning_rate": 5.051760736049797e-06, + "loss": 0.4587, + "step": 14193 + }, + { + "epoch": 2.3310615236179255, + "grad_norm": 0.3181522531554116, + "learning_rate": 5.051295299485949e-06, + "loss": 0.4471, + "step": 14194 + }, + { + "epoch": 2.331225750826268, + "grad_norm": 0.5168257697310911, + "learning_rate": 5.0508298569344915e-06, + "loss": 0.4496, + "step": 14195 + }, + { + "epoch": 2.331389978034611, + "grad_norm": 0.39529469305292186, + "learning_rate": 5.05036440840048e-06, + "loss": 0.4473, + "step": 14196 + }, + { + "epoch": 2.331554205242954, + "grad_norm": 0.9181809301881988, + "learning_rate": 5.049898953888971e-06, + "loss": 0.4445, + "step": 14197 + }, + { + "epoch": 2.3317184324512965, + "grad_norm": 0.324874626375649, + "learning_rate": 5.049433493405018e-06, + "loss": 0.4317, + "step": 14198 + }, + { + "epoch": 2.331882659659639, + "grad_norm": 0.3660678533136915, + "learning_rate": 5.048968026953676e-06, + "loss": 0.4453, + "step": 14199 + }, + { + "epoch": 2.332046886867982, + "grad_norm": 0.29991457677255184, + "learning_rate": 5.048502554540001e-06, + "loss": 0.4349, + "step": 14200 + }, + { + "epoch": 2.3322111140763244, + "grad_norm": 0.41932979749369553, + "learning_rate": 5.048037076169049e-06, + "loss": 0.4427, + "step": 14201 + }, + { + "epoch": 2.3323753412846675, + "grad_norm": 0.3987618004851579, + "learning_rate": 5.047571591845875e-06, + "loss": 0.4478, + "step": 14202 + }, + { + "epoch": 2.33253956849301, + "grad_norm": 0.3376552578233765, + "learning_rate": 5.047106101575535e-06, + "loss": 0.4224, + "step": 14203 + }, + { + "epoch": 2.332703795701353, + "grad_norm": 0.3926530091665801, + "learning_rate": 5.046640605363084e-06, + "loss": 0.4436, + "step": 14204 + }, + { + "epoch": 2.3328680229096954, + "grad_norm": 0.6240595384014374, + "learning_rate": 5.046175103213579e-06, + "loss": 0.4306, + "step": 14205 + }, + { + "epoch": 2.3330322501180385, + "grad_norm": 0.4986024189108446, + "learning_rate": 5.045709595132074e-06, + "loss": 0.4649, + "step": 14206 + }, + { + "epoch": 2.333196477326381, + "grad_norm": 0.2973310236981245, + "learning_rate": 5.045244081123627e-06, + "loss": 0.4206, + "step": 14207 + }, + { + "epoch": 2.3333607045347238, + "grad_norm": 0.5038718583325013, + "learning_rate": 5.044778561193291e-06, + "loss": 0.425, + "step": 14208 + }, + { + "epoch": 2.3335249317430664, + "grad_norm": 0.32869318551080595, + "learning_rate": 5.044313035346126e-06, + "loss": 0.4238, + "step": 14209 + }, + { + "epoch": 2.333689158951409, + "grad_norm": 0.7844142557053408, + "learning_rate": 5.043847503587184e-06, + "loss": 0.4586, + "step": 14210 + }, + { + "epoch": 2.333853386159752, + "grad_norm": 0.39747748879408473, + "learning_rate": 5.043381965921524e-06, + "loss": 0.425, + "step": 14211 + }, + { + "epoch": 2.3340176133680948, + "grad_norm": 0.3712658915797102, + "learning_rate": 5.042916422354202e-06, + "loss": 0.4311, + "step": 14212 + }, + { + "epoch": 2.3341818405764374, + "grad_norm": 0.35805916644196095, + "learning_rate": 5.042450872890272e-06, + "loss": 0.4497, + "step": 14213 + }, + { + "epoch": 2.3343460677847805, + "grad_norm": 0.3489653733566996, + "learning_rate": 5.041985317534793e-06, + "loss": 0.4386, + "step": 14214 + }, + { + "epoch": 2.334510294993123, + "grad_norm": 1.552344521578485, + "learning_rate": 5.04151975629282e-06, + "loss": 0.4399, + "step": 14215 + }, + { + "epoch": 2.3346745222014658, + "grad_norm": 0.4330518043111234, + "learning_rate": 5.041054189169409e-06, + "loss": 0.4358, + "step": 14216 + }, + { + "epoch": 2.3348387494098084, + "grad_norm": 0.32002591932723284, + "learning_rate": 5.040588616169618e-06, + "loss": 0.419, + "step": 14217 + }, + { + "epoch": 2.335002976618151, + "grad_norm": 0.31767776648707025, + "learning_rate": 5.040123037298503e-06, + "loss": 0.4407, + "step": 14218 + }, + { + "epoch": 2.335167203826494, + "grad_norm": 0.366803435400509, + "learning_rate": 5.039657452561119e-06, + "loss": 0.4371, + "step": 14219 + }, + { + "epoch": 2.3353314310348368, + "grad_norm": 0.3296447167812067, + "learning_rate": 5.039191861962524e-06, + "loss": 0.4202, + "step": 14220 + }, + { + "epoch": 2.3354956582431794, + "grad_norm": 0.3349276761092498, + "learning_rate": 5.0387262655077755e-06, + "loss": 0.4325, + "step": 14221 + }, + { + "epoch": 2.335659885451522, + "grad_norm": 0.3980378072812971, + "learning_rate": 5.0382606632019325e-06, + "loss": 0.4229, + "step": 14222 + }, + { + "epoch": 2.335824112659865, + "grad_norm": 0.3202898785116211, + "learning_rate": 5.037795055050046e-06, + "loss": 0.4743, + "step": 14223 + }, + { + "epoch": 2.3359883398682078, + "grad_norm": 0.31778100820349403, + "learning_rate": 5.037329441057176e-06, + "loss": 0.4217, + "step": 14224 + }, + { + "epoch": 2.3361525670765504, + "grad_norm": 0.32080705990736375, + "learning_rate": 5.0368638212283795e-06, + "loss": 0.4228, + "step": 14225 + }, + { + "epoch": 2.336316794284893, + "grad_norm": 0.3102344359741497, + "learning_rate": 5.036398195568716e-06, + "loss": 0.4358, + "step": 14226 + }, + { + "epoch": 2.3364810214932357, + "grad_norm": 0.29092912120353454, + "learning_rate": 5.035932564083238e-06, + "loss": 0.4297, + "step": 14227 + }, + { + "epoch": 2.3366452487015787, + "grad_norm": 0.27798039965725674, + "learning_rate": 5.035466926777007e-06, + "loss": 0.4143, + "step": 14228 + }, + { + "epoch": 2.3368094759099214, + "grad_norm": 0.44537914816795954, + "learning_rate": 5.035001283655076e-06, + "loss": 0.4425, + "step": 14229 + }, + { + "epoch": 2.336973703118264, + "grad_norm": 0.3381259638195042, + "learning_rate": 5.0345356347225065e-06, + "loss": 0.4383, + "step": 14230 + }, + { + "epoch": 2.337137930326607, + "grad_norm": 0.36687226092077213, + "learning_rate": 5.034069979984353e-06, + "loss": 0.4437, + "step": 14231 + }, + { + "epoch": 2.3373021575349497, + "grad_norm": 0.46311466060791356, + "learning_rate": 5.033604319445676e-06, + "loss": 0.4397, + "step": 14232 + }, + { + "epoch": 2.3374663847432924, + "grad_norm": 0.3617763925092038, + "learning_rate": 5.0331386531115285e-06, + "loss": 0.4663, + "step": 14233 + }, + { + "epoch": 2.337630611951635, + "grad_norm": 0.44891382965557786, + "learning_rate": 5.032672980986972e-06, + "loss": 0.4556, + "step": 14234 + }, + { + "epoch": 2.3377948391599777, + "grad_norm": 0.36788493675452716, + "learning_rate": 5.032207303077063e-06, + "loss": 0.4325, + "step": 14235 + }, + { + "epoch": 2.3379590663683207, + "grad_norm": 0.3044382854335584, + "learning_rate": 5.031741619386858e-06, + "loss": 0.4566, + "step": 14236 + }, + { + "epoch": 2.3381232935766634, + "grad_norm": 0.3440239656824285, + "learning_rate": 5.0312759299214175e-06, + "loss": 0.4456, + "step": 14237 + }, + { + "epoch": 2.338287520785006, + "grad_norm": 0.3696861896699254, + "learning_rate": 5.030810234685796e-06, + "loss": 0.4308, + "step": 14238 + }, + { + "epoch": 2.3384517479933487, + "grad_norm": 0.4248817510969493, + "learning_rate": 5.030344533685054e-06, + "loss": 0.4313, + "step": 14239 + }, + { + "epoch": 2.3386159752016917, + "grad_norm": 0.3077063680986963, + "learning_rate": 5.02987882692425e-06, + "loss": 0.4457, + "step": 14240 + }, + { + "epoch": 2.3387802024100344, + "grad_norm": 0.337582640428433, + "learning_rate": 5.02941311440844e-06, + "loss": 0.4135, + "step": 14241 + }, + { + "epoch": 2.338944429618377, + "grad_norm": 0.3671872733120523, + "learning_rate": 5.028947396142681e-06, + "loss": 0.4155, + "step": 14242 + }, + { + "epoch": 2.3391086568267196, + "grad_norm": 0.3805787562666042, + "learning_rate": 5.028481672132034e-06, + "loss": 0.4486, + "step": 14243 + }, + { + "epoch": 2.3392728840350623, + "grad_norm": 0.5836936584296454, + "learning_rate": 5.028015942381555e-06, + "loss": 0.4465, + "step": 14244 + }, + { + "epoch": 2.3394371112434054, + "grad_norm": 0.36682613928154184, + "learning_rate": 5.027550206896304e-06, + "loss": 0.4313, + "step": 14245 + }, + { + "epoch": 2.339601338451748, + "grad_norm": 0.3211669947248319, + "learning_rate": 5.027084465681339e-06, + "loss": 0.4331, + "step": 14246 + }, + { + "epoch": 2.3397655656600906, + "grad_norm": 0.33075234650746704, + "learning_rate": 5.026618718741719e-06, + "loss": 0.4381, + "step": 14247 + }, + { + "epoch": 2.3399297928684337, + "grad_norm": 0.3224871130791294, + "learning_rate": 5.0261529660824994e-06, + "loss": 0.441, + "step": 14248 + }, + { + "epoch": 2.3400940200767764, + "grad_norm": 0.5106742651433831, + "learning_rate": 5.025687207708743e-06, + "loss": 0.4349, + "step": 14249 + }, + { + "epoch": 2.340258247285119, + "grad_norm": 0.3670987534944953, + "learning_rate": 5.025221443625504e-06, + "loss": 0.43, + "step": 14250 + }, + { + "epoch": 2.3404224744934616, + "grad_norm": 0.3152223583268576, + "learning_rate": 5.024755673837845e-06, + "loss": 0.4203, + "step": 14251 + }, + { + "epoch": 2.3405867017018043, + "grad_norm": 0.3104109942911494, + "learning_rate": 5.024289898350825e-06, + "loss": 0.4453, + "step": 14252 + }, + { + "epoch": 2.3407509289101474, + "grad_norm": 0.3354252677596092, + "learning_rate": 5.0238241171694974e-06, + "loss": 0.426, + "step": 14253 + }, + { + "epoch": 2.34091515611849, + "grad_norm": 0.3096400471568136, + "learning_rate": 5.023358330298925e-06, + "loss": 0.4414, + "step": 14254 + }, + { + "epoch": 2.3410793833268326, + "grad_norm": 0.4627023481999118, + "learning_rate": 5.022892537744167e-06, + "loss": 0.4296, + "step": 14255 + }, + { + "epoch": 2.3412436105351753, + "grad_norm": 0.47554609569492284, + "learning_rate": 5.022426739510283e-06, + "loss": 0.4319, + "step": 14256 + }, + { + "epoch": 2.3414078377435183, + "grad_norm": 0.33244191992096805, + "learning_rate": 5.021960935602329e-06, + "loss": 0.4346, + "step": 14257 + }, + { + "epoch": 2.341572064951861, + "grad_norm": 0.3671414791598005, + "learning_rate": 5.021495126025366e-06, + "loss": 0.4197, + "step": 14258 + }, + { + "epoch": 2.3417362921602036, + "grad_norm": 0.3001605538598186, + "learning_rate": 5.021029310784453e-06, + "loss": 0.435, + "step": 14259 + }, + { + "epoch": 2.3419005193685463, + "grad_norm": 0.41049676168709537, + "learning_rate": 5.020563489884649e-06, + "loss": 0.4452, + "step": 14260 + }, + { + "epoch": 2.342064746576889, + "grad_norm": 0.39958421223807394, + "learning_rate": 5.020097663331014e-06, + "loss": 0.4378, + "step": 14261 + }, + { + "epoch": 2.342228973785232, + "grad_norm": 0.3526260350919074, + "learning_rate": 5.019631831128605e-06, + "loss": 0.4508, + "step": 14262 + }, + { + "epoch": 2.3423932009935746, + "grad_norm": 0.36019790514805233, + "learning_rate": 5.019165993282483e-06, + "loss": 0.4508, + "step": 14263 + }, + { + "epoch": 2.3425574282019173, + "grad_norm": 0.39219392948370696, + "learning_rate": 5.018700149797709e-06, + "loss": 0.4285, + "step": 14264 + }, + { + "epoch": 2.3427216554102603, + "grad_norm": 0.34781631591555273, + "learning_rate": 5.018234300679341e-06, + "loss": 0.4335, + "step": 14265 + }, + { + "epoch": 2.342885882618603, + "grad_norm": 0.33544889426458624, + "learning_rate": 5.017768445932438e-06, + "loss": 0.4483, + "step": 14266 + }, + { + "epoch": 2.3430501098269456, + "grad_norm": 0.3154253538619397, + "learning_rate": 5.01730258556206e-06, + "loss": 0.4427, + "step": 14267 + }, + { + "epoch": 2.3432143370352883, + "grad_norm": 0.33192751785748736, + "learning_rate": 5.016836719573268e-06, + "loss": 0.4133, + "step": 14268 + }, + { + "epoch": 2.343378564243631, + "grad_norm": 0.313437034515309, + "learning_rate": 5.01637084797112e-06, + "loss": 0.4341, + "step": 14269 + }, + { + "epoch": 2.343542791451974, + "grad_norm": 0.31116640811452134, + "learning_rate": 5.015904970760677e-06, + "loss": 0.4455, + "step": 14270 + }, + { + "epoch": 2.3437070186603166, + "grad_norm": 0.44806362836241137, + "learning_rate": 5.015439087946998e-06, + "loss": 0.4522, + "step": 14271 + }, + { + "epoch": 2.3438712458686592, + "grad_norm": 0.34147456503692797, + "learning_rate": 5.0149731995351445e-06, + "loss": 0.4347, + "step": 14272 + }, + { + "epoch": 2.344035473077002, + "grad_norm": 0.31152642014698967, + "learning_rate": 5.014507305530173e-06, + "loss": 0.4205, + "step": 14273 + }, + { + "epoch": 2.344199700285345, + "grad_norm": 0.34475814543629774, + "learning_rate": 5.014041405937147e-06, + "loss": 0.4342, + "step": 14274 + }, + { + "epoch": 2.3443639274936876, + "grad_norm": 0.4015184961878321, + "learning_rate": 5.013575500761127e-06, + "loss": 0.4257, + "step": 14275 + }, + { + "epoch": 2.3445281547020302, + "grad_norm": 0.3033020240845556, + "learning_rate": 5.013109590007172e-06, + "loss": 0.4382, + "step": 14276 + }, + { + "epoch": 2.344692381910373, + "grad_norm": 0.338000873883839, + "learning_rate": 5.012643673680339e-06, + "loss": 0.4404, + "step": 14277 + }, + { + "epoch": 2.3448566091187155, + "grad_norm": 0.31550507098643704, + "learning_rate": 5.012177751785694e-06, + "loss": 0.4527, + "step": 14278 + }, + { + "epoch": 2.3450208363270586, + "grad_norm": 0.2923251975366749, + "learning_rate": 5.0117118243282925e-06, + "loss": 0.4509, + "step": 14279 + }, + { + "epoch": 2.3451850635354012, + "grad_norm": 0.4025916966292508, + "learning_rate": 5.011245891313199e-06, + "loss": 0.4418, + "step": 14280 + }, + { + "epoch": 2.345349290743744, + "grad_norm": 0.3495439251380644, + "learning_rate": 5.010779952745472e-06, + "loss": 0.4466, + "step": 14281 + }, + { + "epoch": 2.345513517952087, + "grad_norm": 0.3030842299624673, + "learning_rate": 5.010314008630171e-06, + "loss": 0.451, + "step": 14282 + }, + { + "epoch": 2.3456777451604296, + "grad_norm": 0.25459738553951666, + "learning_rate": 5.009848058972359e-06, + "loss": 0.4288, + "step": 14283 + }, + { + "epoch": 2.3458419723687722, + "grad_norm": 0.34130323628216985, + "learning_rate": 5.009382103777093e-06, + "loss": 0.4415, + "step": 14284 + }, + { + "epoch": 2.346006199577115, + "grad_norm": 0.31855931505260354, + "learning_rate": 5.008916143049439e-06, + "loss": 0.4409, + "step": 14285 + }, + { + "epoch": 2.3461704267854575, + "grad_norm": 0.529587397164376, + "learning_rate": 5.008450176794455e-06, + "loss": 0.4339, + "step": 14286 + }, + { + "epoch": 2.3463346539938006, + "grad_norm": 0.30583306940272903, + "learning_rate": 5.0079842050172e-06, + "loss": 0.4358, + "step": 14287 + }, + { + "epoch": 2.3464988812021432, + "grad_norm": 0.3376306795943431, + "learning_rate": 5.007518227722738e-06, + "loss": 0.4744, + "step": 14288 + }, + { + "epoch": 2.346663108410486, + "grad_norm": 0.3799763220578188, + "learning_rate": 5.007052244916129e-06, + "loss": 0.4431, + "step": 14289 + }, + { + "epoch": 2.3468273356188285, + "grad_norm": 0.2766437623030712, + "learning_rate": 5.006586256602433e-06, + "loss": 0.4062, + "step": 14290 + }, + { + "epoch": 2.3469915628271716, + "grad_norm": 0.3172427330468487, + "learning_rate": 5.006120262786712e-06, + "loss": 0.4506, + "step": 14291 + }, + { + "epoch": 2.3471557900355142, + "grad_norm": 0.370368306164017, + "learning_rate": 5.005654263474026e-06, + "loss": 0.4444, + "step": 14292 + }, + { + "epoch": 2.347320017243857, + "grad_norm": 0.5424344754056988, + "learning_rate": 5.0051882586694384e-06, + "loss": 0.4411, + "step": 14293 + }, + { + "epoch": 2.3474842444521995, + "grad_norm": 0.37539884285858194, + "learning_rate": 5.004722248378009e-06, + "loss": 0.4245, + "step": 14294 + }, + { + "epoch": 2.347648471660542, + "grad_norm": 0.317396319175305, + "learning_rate": 5.0042562326048e-06, + "loss": 0.4444, + "step": 14295 + }, + { + "epoch": 2.347812698868885, + "grad_norm": 0.28224187924061256, + "learning_rate": 5.003790211354872e-06, + "loss": 0.4551, + "step": 14296 + }, + { + "epoch": 2.347976926077228, + "grad_norm": 0.3263896166098868, + "learning_rate": 5.003324184633286e-06, + "loss": 0.4499, + "step": 14297 + }, + { + "epoch": 2.3481411532855705, + "grad_norm": 0.3283510564766459, + "learning_rate": 5.002858152445104e-06, + "loss": 0.439, + "step": 14298 + }, + { + "epoch": 2.3483053804939136, + "grad_norm": 0.34528668929370976, + "learning_rate": 5.002392114795388e-06, + "loss": 0.4474, + "step": 14299 + }, + { + "epoch": 2.348469607702256, + "grad_norm": 0.3675402895714062, + "learning_rate": 5.0019260716892e-06, + "loss": 0.466, + "step": 14300 + }, + { + "epoch": 2.348633834910599, + "grad_norm": 0.32888646075463274, + "learning_rate": 5.001460023131601e-06, + "loss": 0.4308, + "step": 14301 + }, + { + "epoch": 2.3487980621189415, + "grad_norm": 0.34194545071114996, + "learning_rate": 5.000993969127652e-06, + "loss": 0.4381, + "step": 14302 + }, + { + "epoch": 2.348962289327284, + "grad_norm": 0.3481120091692677, + "learning_rate": 5.000527909682415e-06, + "loss": 0.4281, + "step": 14303 + }, + { + "epoch": 2.349126516535627, + "grad_norm": 0.30426600918544544, + "learning_rate": 5.000061844800953e-06, + "loss": 0.4242, + "step": 14304 + }, + { + "epoch": 2.34929074374397, + "grad_norm": 0.33042610605754225, + "learning_rate": 4.9995957744883286e-06, + "loss": 0.4362, + "step": 14305 + }, + { + "epoch": 2.3494549709523125, + "grad_norm": 0.6139370838276929, + "learning_rate": 4.999129698749602e-06, + "loss": 0.4475, + "step": 14306 + }, + { + "epoch": 2.349619198160655, + "grad_norm": 0.3091523983404541, + "learning_rate": 4.998663617589835e-06, + "loss": 0.4586, + "step": 14307 + }, + { + "epoch": 2.349783425368998, + "grad_norm": 0.3671883433413459, + "learning_rate": 4.998197531014091e-06, + "loss": 0.4205, + "step": 14308 + }, + { + "epoch": 2.349947652577341, + "grad_norm": 0.28508625616699385, + "learning_rate": 4.997731439027432e-06, + "loss": 0.4259, + "step": 14309 + }, + { + "epoch": 2.3501118797856835, + "grad_norm": 0.33190313156947665, + "learning_rate": 4.99726534163492e-06, + "loss": 0.4272, + "step": 14310 + }, + { + "epoch": 2.350276106994026, + "grad_norm": 0.3684075426938841, + "learning_rate": 4.996799238841616e-06, + "loss": 0.4357, + "step": 14311 + }, + { + "epoch": 2.3504403342023688, + "grad_norm": 0.29104654696695337, + "learning_rate": 4.9963331306525834e-06, + "loss": 0.4507, + "step": 14312 + }, + { + "epoch": 2.350604561410712, + "grad_norm": 0.45342829692727055, + "learning_rate": 4.995867017072885e-06, + "loss": 0.4348, + "step": 14313 + }, + { + "epoch": 2.3507687886190545, + "grad_norm": 0.28462488186154083, + "learning_rate": 4.995400898107584e-06, + "loss": 0.4408, + "step": 14314 + }, + { + "epoch": 2.350933015827397, + "grad_norm": 0.32996492095223623, + "learning_rate": 4.994934773761742e-06, + "loss": 0.455, + "step": 14315 + }, + { + "epoch": 2.35109724303574, + "grad_norm": 0.4413132936257363, + "learning_rate": 4.994468644040419e-06, + "loss": 0.4453, + "step": 14316 + }, + { + "epoch": 2.351261470244083, + "grad_norm": 0.287439219083887, + "learning_rate": 4.994002508948682e-06, + "loss": 0.45, + "step": 14317 + }, + { + "epoch": 2.3514256974524255, + "grad_norm": 0.6687016558497426, + "learning_rate": 4.993536368491592e-06, + "loss": 0.4322, + "step": 14318 + }, + { + "epoch": 2.351589924660768, + "grad_norm": 0.31833933331295167, + "learning_rate": 4.99307022267421e-06, + "loss": 0.4233, + "step": 14319 + }, + { + "epoch": 2.3517541518691107, + "grad_norm": 0.2936523952712435, + "learning_rate": 4.992604071501601e-06, + "loss": 0.424, + "step": 14320 + }, + { + "epoch": 2.351918379077454, + "grad_norm": 0.29199165743680827, + "learning_rate": 4.992137914978827e-06, + "loss": 0.4367, + "step": 14321 + }, + { + "epoch": 2.3520826062857965, + "grad_norm": 0.38049347369291886, + "learning_rate": 4.991671753110952e-06, + "loss": 0.4562, + "step": 14322 + }, + { + "epoch": 2.352246833494139, + "grad_norm": 0.28514980201093715, + "learning_rate": 4.991205585903037e-06, + "loss": 0.4403, + "step": 14323 + }, + { + "epoch": 2.3524110607024817, + "grad_norm": 0.29063242226173736, + "learning_rate": 4.990739413360147e-06, + "loss": 0.4173, + "step": 14324 + }, + { + "epoch": 2.352575287910825, + "grad_norm": 0.33956667259069695, + "learning_rate": 4.990273235487343e-06, + "loss": 0.4395, + "step": 14325 + }, + { + "epoch": 2.3527395151191675, + "grad_norm": 0.29213608629516113, + "learning_rate": 4.9898070522896885e-06, + "loss": 0.436, + "step": 14326 + }, + { + "epoch": 2.35290374232751, + "grad_norm": 0.3184200244453, + "learning_rate": 4.9893408637722504e-06, + "loss": 0.4421, + "step": 14327 + }, + { + "epoch": 2.3530679695358527, + "grad_norm": 0.9054970444536435, + "learning_rate": 4.988874669940086e-06, + "loss": 0.4359, + "step": 14328 + }, + { + "epoch": 2.3532321967441954, + "grad_norm": 0.8009512337096192, + "learning_rate": 4.988408470798264e-06, + "loss": 0.4652, + "step": 14329 + }, + { + "epoch": 2.3533964239525385, + "grad_norm": 0.3000048361872114, + "learning_rate": 4.987942266351845e-06, + "loss": 0.4389, + "step": 14330 + }, + { + "epoch": 2.353560651160881, + "grad_norm": 0.4435834381481093, + "learning_rate": 4.987476056605892e-06, + "loss": 0.4462, + "step": 14331 + }, + { + "epoch": 2.3537248783692237, + "grad_norm": 0.31409223317024465, + "learning_rate": 4.987009841565469e-06, + "loss": 0.4288, + "step": 14332 + }, + { + "epoch": 2.353889105577567, + "grad_norm": 0.3775543340666963, + "learning_rate": 4.986543621235641e-06, + "loss": 0.4687, + "step": 14333 + }, + { + "epoch": 2.3540533327859094, + "grad_norm": 0.3154684698397416, + "learning_rate": 4.986077395621471e-06, + "loss": 0.4511, + "step": 14334 + }, + { + "epoch": 2.354217559994252, + "grad_norm": 0.33377923891010136, + "learning_rate": 4.985611164728022e-06, + "loss": 0.4495, + "step": 14335 + }, + { + "epoch": 2.3543817872025947, + "grad_norm": 0.38798308412223753, + "learning_rate": 4.9851449285603575e-06, + "loss": 0.4527, + "step": 14336 + }, + { + "epoch": 2.3545460144109374, + "grad_norm": 0.3595666147784764, + "learning_rate": 4.984678687123542e-06, + "loss": 0.4423, + "step": 14337 + }, + { + "epoch": 2.3547102416192804, + "grad_norm": 0.35797627990956515, + "learning_rate": 4.984212440422639e-06, + "loss": 0.4325, + "step": 14338 + }, + { + "epoch": 2.354874468827623, + "grad_norm": 0.29402622902463466, + "learning_rate": 4.9837461884627134e-06, + "loss": 0.4498, + "step": 14339 + }, + { + "epoch": 2.3550386960359657, + "grad_norm": 0.3619177067733234, + "learning_rate": 4.983279931248827e-06, + "loss": 0.4209, + "step": 14340 + }, + { + "epoch": 2.3552029232443084, + "grad_norm": 0.3469217107065268, + "learning_rate": 4.982813668786045e-06, + "loss": 0.4301, + "step": 14341 + }, + { + "epoch": 2.3553671504526514, + "grad_norm": 0.2869461199173222, + "learning_rate": 4.982347401079432e-06, + "loss": 0.4353, + "step": 14342 + }, + { + "epoch": 2.355531377660994, + "grad_norm": 0.3310294155726151, + "learning_rate": 4.981881128134052e-06, + "loss": 0.4306, + "step": 14343 + }, + { + "epoch": 2.3556956048693367, + "grad_norm": 0.44544791162175035, + "learning_rate": 4.981414849954969e-06, + "loss": 0.4457, + "step": 14344 + }, + { + "epoch": 2.3558598320776793, + "grad_norm": 0.2821848497782474, + "learning_rate": 4.980948566547246e-06, + "loss": 0.4283, + "step": 14345 + }, + { + "epoch": 2.356024059286022, + "grad_norm": 0.2899346904371948, + "learning_rate": 4.980482277915948e-06, + "loss": 0.4384, + "step": 14346 + }, + { + "epoch": 2.356188286494365, + "grad_norm": 0.28895582908267986, + "learning_rate": 4.9800159840661416e-06, + "loss": 0.4425, + "step": 14347 + }, + { + "epoch": 2.3563525137027077, + "grad_norm": 0.32261166832480315, + "learning_rate": 4.979549685002888e-06, + "loss": 0.4473, + "step": 14348 + }, + { + "epoch": 2.3565167409110503, + "grad_norm": 0.41278684584248443, + "learning_rate": 4.979083380731254e-06, + "loss": 0.4489, + "step": 14349 + }, + { + "epoch": 2.3566809681193934, + "grad_norm": 0.36717664735562605, + "learning_rate": 4.978617071256302e-06, + "loss": 0.4333, + "step": 14350 + }, + { + "epoch": 2.356845195327736, + "grad_norm": 0.2871702500281055, + "learning_rate": 4.978150756583098e-06, + "loss": 0.4314, + "step": 14351 + }, + { + "epoch": 2.3570094225360787, + "grad_norm": 0.296831622491759, + "learning_rate": 4.977684436716707e-06, + "loss": 0.4459, + "step": 14352 + }, + { + "epoch": 2.3571736497444213, + "grad_norm": 0.3152784713613363, + "learning_rate": 4.977218111662193e-06, + "loss": 0.4313, + "step": 14353 + }, + { + "epoch": 2.357337876952764, + "grad_norm": 0.3179123907686836, + "learning_rate": 4.97675178142462e-06, + "loss": 0.4383, + "step": 14354 + }, + { + "epoch": 2.357502104161107, + "grad_norm": 0.33097652269284605, + "learning_rate": 4.976285446009053e-06, + "loss": 0.4608, + "step": 14355 + }, + { + "epoch": 2.3576663313694497, + "grad_norm": 0.26370984506069994, + "learning_rate": 4.9758191054205595e-06, + "loss": 0.4194, + "step": 14356 + }, + { + "epoch": 2.3578305585777923, + "grad_norm": 0.3503675341705753, + "learning_rate": 4.975352759664201e-06, + "loss": 0.4318, + "step": 14357 + }, + { + "epoch": 2.357994785786135, + "grad_norm": 0.26075769429226026, + "learning_rate": 4.974886408745045e-06, + "loss": 0.4337, + "step": 14358 + }, + { + "epoch": 2.358159012994478, + "grad_norm": 0.3167737749473058, + "learning_rate": 4.9744200526681545e-06, + "loss": 0.4472, + "step": 14359 + }, + { + "epoch": 2.3583232402028207, + "grad_norm": 0.3267289682222068, + "learning_rate": 4.973953691438595e-06, + "loss": 0.4489, + "step": 14360 + }, + { + "epoch": 2.3584874674111633, + "grad_norm": 0.48223262088179, + "learning_rate": 4.973487325061433e-06, + "loss": 0.4413, + "step": 14361 + }, + { + "epoch": 2.358651694619506, + "grad_norm": 0.29004525308874013, + "learning_rate": 4.973020953541732e-06, + "loss": 0.4247, + "step": 14362 + }, + { + "epoch": 2.3588159218278486, + "grad_norm": 0.30160171868034047, + "learning_rate": 4.972554576884559e-06, + "loss": 0.448, + "step": 14363 + }, + { + "epoch": 2.3589801490361917, + "grad_norm": 0.41944901184902755, + "learning_rate": 4.972088195094978e-06, + "loss": 0.4512, + "step": 14364 + }, + { + "epoch": 2.3591443762445343, + "grad_norm": 0.28947942760816986, + "learning_rate": 4.971621808178054e-06, + "loss": 0.4422, + "step": 14365 + }, + { + "epoch": 2.359308603452877, + "grad_norm": 0.3238543328135968, + "learning_rate": 4.971155416138853e-06, + "loss": 0.4397, + "step": 14366 + }, + { + "epoch": 2.35947283066122, + "grad_norm": 0.3703052518219256, + "learning_rate": 4.970689018982442e-06, + "loss": 0.4297, + "step": 14367 + }, + { + "epoch": 2.3596370578695627, + "grad_norm": 0.32595968322039665, + "learning_rate": 4.9702226167138855e-06, + "loss": 0.4132, + "step": 14368 + }, + { + "epoch": 2.3598012850779053, + "grad_norm": 0.36003670628504997, + "learning_rate": 4.9697562093382475e-06, + "loss": 0.4519, + "step": 14369 + }, + { + "epoch": 2.359965512286248, + "grad_norm": 0.35066351988470934, + "learning_rate": 4.969289796860595e-06, + "loss": 0.4557, + "step": 14370 + }, + { + "epoch": 2.3601297394945906, + "grad_norm": 0.28461795043565574, + "learning_rate": 4.968823379285993e-06, + "loss": 0.4465, + "step": 14371 + }, + { + "epoch": 2.3602939667029337, + "grad_norm": 0.3349794950571723, + "learning_rate": 4.96835695661951e-06, + "loss": 0.4298, + "step": 14372 + }, + { + "epoch": 2.3604581939112763, + "grad_norm": 0.4623462596162397, + "learning_rate": 4.967890528866209e-06, + "loss": 0.4376, + "step": 14373 + }, + { + "epoch": 2.360622421119619, + "grad_norm": 0.3045711511063072, + "learning_rate": 4.967424096031155e-06, + "loss": 0.4281, + "step": 14374 + }, + { + "epoch": 2.3607866483279616, + "grad_norm": 0.28236555126313645, + "learning_rate": 4.966957658119415e-06, + "loss": 0.4293, + "step": 14375 + }, + { + "epoch": 2.3609508755363047, + "grad_norm": 0.329845338092337, + "learning_rate": 4.966491215136056e-06, + "loss": 0.4448, + "step": 14376 + }, + { + "epoch": 2.3611151027446473, + "grad_norm": 0.2780889848515211, + "learning_rate": 4.966024767086145e-06, + "loss": 0.4521, + "step": 14377 + }, + { + "epoch": 2.36127932995299, + "grad_norm": 0.7223244736692684, + "learning_rate": 4.965558313974746e-06, + "loss": 0.4267, + "step": 14378 + }, + { + "epoch": 2.3614435571613326, + "grad_norm": 0.29452548364476305, + "learning_rate": 4.965091855806925e-06, + "loss": 0.4378, + "step": 14379 + }, + { + "epoch": 2.3616077843696752, + "grad_norm": 0.34535556397656414, + "learning_rate": 4.964625392587749e-06, + "loss": 0.4374, + "step": 14380 + }, + { + "epoch": 2.3617720115780183, + "grad_norm": 0.28214696649413246, + "learning_rate": 4.9641589243222845e-06, + "loss": 0.4398, + "step": 14381 + }, + { + "epoch": 2.361936238786361, + "grad_norm": 0.9998617351395512, + "learning_rate": 4.963692451015597e-06, + "loss": 0.4421, + "step": 14382 + }, + { + "epoch": 2.3621004659947036, + "grad_norm": 0.31297145796805065, + "learning_rate": 4.963225972672753e-06, + "loss": 0.4216, + "step": 14383 + }, + { + "epoch": 2.3622646932030467, + "grad_norm": 0.2939100139357004, + "learning_rate": 4.96275948929882e-06, + "loss": 0.4648, + "step": 14384 + }, + { + "epoch": 2.3624289204113893, + "grad_norm": 0.530158610934547, + "learning_rate": 4.962293000898864e-06, + "loss": 0.4314, + "step": 14385 + }, + { + "epoch": 2.362593147619732, + "grad_norm": 0.40901577717616516, + "learning_rate": 4.9618265074779496e-06, + "loss": 0.428, + "step": 14386 + }, + { + "epoch": 2.3627573748280746, + "grad_norm": 0.27364877261831405, + "learning_rate": 4.961360009041146e-06, + "loss": 0.44, + "step": 14387 + }, + { + "epoch": 2.362921602036417, + "grad_norm": 0.30209721956900093, + "learning_rate": 4.9608935055935175e-06, + "loss": 0.4211, + "step": 14388 + }, + { + "epoch": 2.3630858292447603, + "grad_norm": 0.39013537477111404, + "learning_rate": 4.960426997140134e-06, + "loss": 0.4411, + "step": 14389 + }, + { + "epoch": 2.363250056453103, + "grad_norm": 0.5139156781012, + "learning_rate": 4.959960483686059e-06, + "loss": 0.4373, + "step": 14390 + }, + { + "epoch": 2.3634142836614456, + "grad_norm": 0.3278145517121621, + "learning_rate": 4.959493965236361e-06, + "loss": 0.4433, + "step": 14391 + }, + { + "epoch": 2.363578510869788, + "grad_norm": 0.3005983703147699, + "learning_rate": 4.959027441796107e-06, + "loss": 0.4535, + "step": 14392 + }, + { + "epoch": 2.3637427380781313, + "grad_norm": 0.4589025460002175, + "learning_rate": 4.958560913370363e-06, + "loss": 0.4363, + "step": 14393 + }, + { + "epoch": 2.363906965286474, + "grad_norm": 0.504401185692525, + "learning_rate": 4.958094379964196e-06, + "loss": 0.4448, + "step": 14394 + }, + { + "epoch": 2.3640711924948166, + "grad_norm": 0.29769659114726227, + "learning_rate": 4.9576278415826725e-06, + "loss": 0.4592, + "step": 14395 + }, + { + "epoch": 2.364235419703159, + "grad_norm": 0.29314450078630233, + "learning_rate": 4.957161298230861e-06, + "loss": 0.4562, + "step": 14396 + }, + { + "epoch": 2.364399646911502, + "grad_norm": 0.33090458937336886, + "learning_rate": 4.956694749913829e-06, + "loss": 0.4701, + "step": 14397 + }, + { + "epoch": 2.364563874119845, + "grad_norm": 0.2934499635019847, + "learning_rate": 4.956228196636643e-06, + "loss": 0.4375, + "step": 14398 + }, + { + "epoch": 2.3647281013281876, + "grad_norm": 0.3996609817040107, + "learning_rate": 4.9557616384043685e-06, + "loss": 0.4257, + "step": 14399 + }, + { + "epoch": 2.36489232853653, + "grad_norm": 0.32228752842073993, + "learning_rate": 4.955295075222074e-06, + "loss": 0.4337, + "step": 14400 + }, + { + "epoch": 2.3650565557448733, + "grad_norm": 0.3656720599456349, + "learning_rate": 4.954828507094828e-06, + "loss": 0.4355, + "step": 14401 + }, + { + "epoch": 2.365220782953216, + "grad_norm": 0.31683870295473615, + "learning_rate": 4.954361934027697e-06, + "loss": 0.4442, + "step": 14402 + }, + { + "epoch": 2.3653850101615586, + "grad_norm": 0.3107408972347142, + "learning_rate": 4.953895356025748e-06, + "loss": 0.4561, + "step": 14403 + }, + { + "epoch": 2.365549237369901, + "grad_norm": 0.32621837651394164, + "learning_rate": 4.9534287730940486e-06, + "loss": 0.4338, + "step": 14404 + }, + { + "epoch": 2.365713464578244, + "grad_norm": 0.29890532466434366, + "learning_rate": 4.952962185237667e-06, + "loss": 0.4527, + "step": 14405 + }, + { + "epoch": 2.365877691786587, + "grad_norm": 0.5040022006465182, + "learning_rate": 4.952495592461671e-06, + "loss": 0.4354, + "step": 14406 + }, + { + "epoch": 2.3660419189949295, + "grad_norm": 0.31470278545685143, + "learning_rate": 4.952028994771127e-06, + "loss": 0.4347, + "step": 14407 + }, + { + "epoch": 2.366206146203272, + "grad_norm": 0.3173507245055464, + "learning_rate": 4.951562392171103e-06, + "loss": 0.4437, + "step": 14408 + }, + { + "epoch": 2.366370373411615, + "grad_norm": 0.3154547169516024, + "learning_rate": 4.9510957846666665e-06, + "loss": 0.4601, + "step": 14409 + }, + { + "epoch": 2.366534600619958, + "grad_norm": 0.38068921310440645, + "learning_rate": 4.950629172262888e-06, + "loss": 0.4304, + "step": 14410 + }, + { + "epoch": 2.3666988278283005, + "grad_norm": 0.4921138013809312, + "learning_rate": 4.9501625549648315e-06, + "loss": 0.4571, + "step": 14411 + }, + { + "epoch": 2.366863055036643, + "grad_norm": 0.3367350539856582, + "learning_rate": 4.949695932777568e-06, + "loss": 0.4411, + "step": 14412 + }, + { + "epoch": 2.367027282244986, + "grad_norm": 0.2890413534719377, + "learning_rate": 4.949229305706163e-06, + "loss": 0.4387, + "step": 14413 + }, + { + "epoch": 2.3671915094533285, + "grad_norm": 0.29384243153256306, + "learning_rate": 4.948762673755688e-06, + "loss": 0.4309, + "step": 14414 + }, + { + "epoch": 2.3673557366616715, + "grad_norm": 0.30973836082355344, + "learning_rate": 4.948296036931206e-06, + "loss": 0.4332, + "step": 14415 + }, + { + "epoch": 2.367519963870014, + "grad_norm": 0.3154947522688878, + "learning_rate": 4.947829395237789e-06, + "loss": 0.4325, + "step": 14416 + }, + { + "epoch": 2.367684191078357, + "grad_norm": 0.3468663049157948, + "learning_rate": 4.947362748680506e-06, + "loss": 0.4446, + "step": 14417 + }, + { + "epoch": 2.3678484182867, + "grad_norm": 0.6625064405457736, + "learning_rate": 4.946896097264421e-06, + "loss": 0.4359, + "step": 14418 + }, + { + "epoch": 2.3680126454950425, + "grad_norm": 0.36755003431616734, + "learning_rate": 4.946429440994606e-06, + "loss": 0.4501, + "step": 14419 + }, + { + "epoch": 2.368176872703385, + "grad_norm": 0.3180908616830341, + "learning_rate": 4.945962779876127e-06, + "loss": 0.4367, + "step": 14420 + }, + { + "epoch": 2.368341099911728, + "grad_norm": 0.2969125348286323, + "learning_rate": 4.945496113914055e-06, + "loss": 0.4397, + "step": 14421 + }, + { + "epoch": 2.3685053271200704, + "grad_norm": 0.3484937349702599, + "learning_rate": 4.945029443113455e-06, + "loss": 0.4477, + "step": 14422 + }, + { + "epoch": 2.3686695543284135, + "grad_norm": 0.3364151217982416, + "learning_rate": 4.9445627674794e-06, + "loss": 0.4408, + "step": 14423 + }, + { + "epoch": 2.368833781536756, + "grad_norm": 0.27632261732359437, + "learning_rate": 4.944096087016953e-06, + "loss": 0.4327, + "step": 14424 + }, + { + "epoch": 2.368998008745099, + "grad_norm": 0.33643617461694814, + "learning_rate": 4.943629401731187e-06, + "loss": 0.4327, + "step": 14425 + }, + { + "epoch": 2.3691622359534414, + "grad_norm": 0.251371038402946, + "learning_rate": 4.94316271162717e-06, + "loss": 0.4295, + "step": 14426 + }, + { + "epoch": 2.3693264631617845, + "grad_norm": 0.273291389563221, + "learning_rate": 4.942696016709969e-06, + "loss": 0.4571, + "step": 14427 + }, + { + "epoch": 2.369490690370127, + "grad_norm": 0.3027602849916757, + "learning_rate": 4.942229316984654e-06, + "loss": 0.4303, + "step": 14428 + }, + { + "epoch": 2.36965491757847, + "grad_norm": 0.29359944961044193, + "learning_rate": 4.941762612456292e-06, + "loss": 0.447, + "step": 14429 + }, + { + "epoch": 2.3698191447868124, + "grad_norm": 0.30796589557839593, + "learning_rate": 4.941295903129954e-06, + "loss": 0.4462, + "step": 14430 + }, + { + "epoch": 2.369983371995155, + "grad_norm": 0.3321031226177786, + "learning_rate": 4.94082918901071e-06, + "loss": 0.442, + "step": 14431 + }, + { + "epoch": 2.370147599203498, + "grad_norm": 0.28573543373725724, + "learning_rate": 4.940362470103627e-06, + "loss": 0.4416, + "step": 14432 + }, + { + "epoch": 2.370311826411841, + "grad_norm": 0.34980867774696045, + "learning_rate": 4.939895746413773e-06, + "loss": 0.4405, + "step": 14433 + }, + { + "epoch": 2.3704760536201834, + "grad_norm": 0.38809888263301406, + "learning_rate": 4.939429017946218e-06, + "loss": 0.4441, + "step": 14434 + }, + { + "epoch": 2.3706402808285265, + "grad_norm": 0.2776636961242803, + "learning_rate": 4.938962284706034e-06, + "loss": 0.4449, + "step": 14435 + }, + { + "epoch": 2.370804508036869, + "grad_norm": 0.31336049708314334, + "learning_rate": 4.9384955466982855e-06, + "loss": 0.4198, + "step": 14436 + }, + { + "epoch": 2.370968735245212, + "grad_norm": 0.2897266594367285, + "learning_rate": 4.938028803928044e-06, + "loss": 0.4437, + "step": 14437 + }, + { + "epoch": 2.3711329624535544, + "grad_norm": 0.3676946069702938, + "learning_rate": 4.93756205640038e-06, + "loss": 0.4314, + "step": 14438 + }, + { + "epoch": 2.371297189661897, + "grad_norm": 0.29396140174936525, + "learning_rate": 4.937095304120362e-06, + "loss": 0.4426, + "step": 14439 + }, + { + "epoch": 2.37146141687024, + "grad_norm": 0.3154282077143417, + "learning_rate": 4.936628547093057e-06, + "loss": 0.4423, + "step": 14440 + }, + { + "epoch": 2.371625644078583, + "grad_norm": 0.5470785694493393, + "learning_rate": 4.936161785323538e-06, + "loss": 0.4369, + "step": 14441 + }, + { + "epoch": 2.3717898712869254, + "grad_norm": 0.2680266468948304, + "learning_rate": 4.935695018816872e-06, + "loss": 0.4333, + "step": 14442 + }, + { + "epoch": 2.371954098495268, + "grad_norm": 0.310046436229158, + "learning_rate": 4.935228247578129e-06, + "loss": 0.4326, + "step": 14443 + }, + { + "epoch": 2.372118325703611, + "grad_norm": 0.4633103707884507, + "learning_rate": 4.93476147161238e-06, + "loss": 0.4542, + "step": 14444 + }, + { + "epoch": 2.372282552911954, + "grad_norm": 0.4442241811118408, + "learning_rate": 4.9342946909246935e-06, + "loss": 0.439, + "step": 14445 + }, + { + "epoch": 2.3724467801202964, + "grad_norm": 0.3754637769732102, + "learning_rate": 4.933827905520139e-06, + "loss": 0.4583, + "step": 14446 + }, + { + "epoch": 2.372611007328639, + "grad_norm": 0.2746592828941314, + "learning_rate": 4.933361115403787e-06, + "loss": 0.4348, + "step": 14447 + }, + { + "epoch": 2.3727752345369817, + "grad_norm": 0.48162906768990915, + "learning_rate": 4.932894320580707e-06, + "loss": 0.4588, + "step": 14448 + }, + { + "epoch": 2.3729394617453248, + "grad_norm": 0.27825006803186564, + "learning_rate": 4.9324275210559675e-06, + "loss": 0.4456, + "step": 14449 + }, + { + "epoch": 2.3731036889536674, + "grad_norm": 0.6205579010136238, + "learning_rate": 4.931960716834641e-06, + "loss": 0.4453, + "step": 14450 + }, + { + "epoch": 2.37326791616201, + "grad_norm": 0.3124084196903969, + "learning_rate": 4.931493907921796e-06, + "loss": 0.4307, + "step": 14451 + }, + { + "epoch": 2.373432143370353, + "grad_norm": 0.3028093135665966, + "learning_rate": 4.931027094322503e-06, + "loss": 0.436, + "step": 14452 + }, + { + "epoch": 2.3735963705786958, + "grad_norm": 0.3348730023292204, + "learning_rate": 4.93056027604183e-06, + "loss": 0.4355, + "step": 14453 + }, + { + "epoch": 2.3737605977870384, + "grad_norm": 0.33042531193006286, + "learning_rate": 4.93009345308485e-06, + "loss": 0.4485, + "step": 14454 + }, + { + "epoch": 2.373924824995381, + "grad_norm": 0.28904118773468906, + "learning_rate": 4.929626625456633e-06, + "loss": 0.437, + "step": 14455 + }, + { + "epoch": 2.3740890522037237, + "grad_norm": 0.5665194365436337, + "learning_rate": 4.929159793162247e-06, + "loss": 0.4356, + "step": 14456 + }, + { + "epoch": 2.3742532794120668, + "grad_norm": 0.3139144028595357, + "learning_rate": 4.9286929562067635e-06, + "loss": 0.4295, + "step": 14457 + }, + { + "epoch": 2.3744175066204094, + "grad_norm": 0.3337755379913973, + "learning_rate": 4.928226114595252e-06, + "loss": 0.4425, + "step": 14458 + }, + { + "epoch": 2.374581733828752, + "grad_norm": 0.6960305232843772, + "learning_rate": 4.927759268332783e-06, + "loss": 0.4401, + "step": 14459 + }, + { + "epoch": 2.3747459610370947, + "grad_norm": 0.3636555112169848, + "learning_rate": 4.927292417424429e-06, + "loss": 0.4307, + "step": 14460 + }, + { + "epoch": 2.3749101882454378, + "grad_norm": 0.4151227793689786, + "learning_rate": 4.92682556187526e-06, + "loss": 0.4312, + "step": 14461 + }, + { + "epoch": 2.3750744154537804, + "grad_norm": 0.3043352830410617, + "learning_rate": 4.926358701690343e-06, + "loss": 0.4435, + "step": 14462 + }, + { + "epoch": 2.375238642662123, + "grad_norm": 0.41972681854749694, + "learning_rate": 4.925891836874751e-06, + "loss": 0.4381, + "step": 14463 + }, + { + "epoch": 2.3754028698704657, + "grad_norm": 0.3183437051748933, + "learning_rate": 4.925424967433557e-06, + "loss": 0.4492, + "step": 14464 + }, + { + "epoch": 2.3755670970788083, + "grad_norm": 0.3825905561550247, + "learning_rate": 4.924958093371828e-06, + "loss": 0.4457, + "step": 14465 + }, + { + "epoch": 2.3757313242871514, + "grad_norm": 0.3127106608442882, + "learning_rate": 4.924491214694636e-06, + "loss": 0.4438, + "step": 14466 + }, + { + "epoch": 2.375895551495494, + "grad_norm": 0.30704550035120054, + "learning_rate": 4.924024331407051e-06, + "loss": 0.4409, + "step": 14467 + }, + { + "epoch": 2.3760597787038367, + "grad_norm": 0.5557837596183262, + "learning_rate": 4.923557443514145e-06, + "loss": 0.4268, + "step": 14468 + }, + { + "epoch": 2.3762240059121797, + "grad_norm": 0.31056296034006503, + "learning_rate": 4.92309055102099e-06, + "loss": 0.4226, + "step": 14469 + }, + { + "epoch": 2.3763882331205224, + "grad_norm": 0.3614259977141636, + "learning_rate": 4.922623653932655e-06, + "loss": 0.437, + "step": 14470 + }, + { + "epoch": 2.376552460328865, + "grad_norm": 0.32688326087771474, + "learning_rate": 4.92215675225421e-06, + "loss": 0.4247, + "step": 14471 + }, + { + "epoch": 2.3767166875372077, + "grad_norm": 0.29109959383026757, + "learning_rate": 4.921689845990726e-06, + "loss": 0.4389, + "step": 14472 + }, + { + "epoch": 2.3768809147455503, + "grad_norm": 0.3201231594951292, + "learning_rate": 4.921222935147279e-06, + "loss": 0.4474, + "step": 14473 + }, + { + "epoch": 2.3770451419538934, + "grad_norm": 0.30953462437114765, + "learning_rate": 4.920756019728934e-06, + "loss": 0.4296, + "step": 14474 + }, + { + "epoch": 2.377209369162236, + "grad_norm": 0.33172813843029597, + "learning_rate": 4.9202890997407656e-06, + "loss": 0.4333, + "step": 14475 + }, + { + "epoch": 2.3773735963705787, + "grad_norm": 0.2837245728085035, + "learning_rate": 4.9198221751878435e-06, + "loss": 0.4304, + "step": 14476 + }, + { + "epoch": 2.3775378235789213, + "grad_norm": 0.36359957078420707, + "learning_rate": 4.919355246075241e-06, + "loss": 0.4302, + "step": 14477 + }, + { + "epoch": 2.3777020507872644, + "grad_norm": 0.3465557420820277, + "learning_rate": 4.918888312408026e-06, + "loss": 0.4385, + "step": 14478 + }, + { + "epoch": 2.377866277995607, + "grad_norm": 0.28285182301127754, + "learning_rate": 4.918421374191272e-06, + "loss": 0.4361, + "step": 14479 + }, + { + "epoch": 2.3780305052039497, + "grad_norm": 0.2794892656742042, + "learning_rate": 4.917954431430051e-06, + "loss": 0.4353, + "step": 14480 + }, + { + "epoch": 2.3781947324122923, + "grad_norm": 0.9462095011637344, + "learning_rate": 4.917487484129434e-06, + "loss": 0.442, + "step": 14481 + }, + { + "epoch": 2.378358959620635, + "grad_norm": 0.34605196349121137, + "learning_rate": 4.917020532294491e-06, + "loss": 0.4431, + "step": 14482 + }, + { + "epoch": 2.378523186828978, + "grad_norm": 0.29326419873734, + "learning_rate": 4.916553575930295e-06, + "loss": 0.429, + "step": 14483 + }, + { + "epoch": 2.3786874140373206, + "grad_norm": 0.32590839117850195, + "learning_rate": 4.9160866150419185e-06, + "loss": 0.4518, + "step": 14484 + }, + { + "epoch": 2.3788516412456633, + "grad_norm": 0.3073546322282407, + "learning_rate": 4.9156196496344315e-06, + "loss": 0.4276, + "step": 14485 + }, + { + "epoch": 2.3790158684540064, + "grad_norm": 0.36654859711759064, + "learning_rate": 4.915152679712905e-06, + "loss": 0.4439, + "step": 14486 + }, + { + "epoch": 2.379180095662349, + "grad_norm": 0.3038320472398642, + "learning_rate": 4.914685705282413e-06, + "loss": 0.4282, + "step": 14487 + }, + { + "epoch": 2.3793443228706916, + "grad_norm": 0.34343378873985786, + "learning_rate": 4.914218726348026e-06, + "loss": 0.4538, + "step": 14488 + }, + { + "epoch": 2.3795085500790343, + "grad_norm": 0.30341262594748736, + "learning_rate": 4.913751742914817e-06, + "loss": 0.4309, + "step": 14489 + }, + { + "epoch": 2.379672777287377, + "grad_norm": 0.29250488553478204, + "learning_rate": 4.913284754987856e-06, + "loss": 0.4135, + "step": 14490 + }, + { + "epoch": 2.37983700449572, + "grad_norm": 0.27498243268008893, + "learning_rate": 4.912817762572216e-06, + "loss": 0.4461, + "step": 14491 + }, + { + "epoch": 2.3800012317040626, + "grad_norm": 0.34557233743510063, + "learning_rate": 4.912350765672968e-06, + "loss": 0.419, + "step": 14492 + }, + { + "epoch": 2.3801654589124053, + "grad_norm": 0.2647786611846677, + "learning_rate": 4.911883764295186e-06, + "loss": 0.4275, + "step": 14493 + }, + { + "epoch": 2.380329686120748, + "grad_norm": 0.3375971662260146, + "learning_rate": 4.911416758443941e-06, + "loss": 0.4321, + "step": 14494 + }, + { + "epoch": 2.380493913329091, + "grad_norm": 0.29484030285466734, + "learning_rate": 4.910949748124306e-06, + "loss": 0.4441, + "step": 14495 + }, + { + "epoch": 2.3806581405374336, + "grad_norm": 0.3515144629906051, + "learning_rate": 4.9104827333413515e-06, + "loss": 0.4307, + "step": 14496 + }, + { + "epoch": 2.3808223677457763, + "grad_norm": 0.2737921671106952, + "learning_rate": 4.91001571410015e-06, + "loss": 0.4262, + "step": 14497 + }, + { + "epoch": 2.380986594954119, + "grad_norm": 0.3125825530773969, + "learning_rate": 4.909548690405777e-06, + "loss": 0.4474, + "step": 14498 + }, + { + "epoch": 2.3811508221624615, + "grad_norm": 0.3069349138335879, + "learning_rate": 4.909081662263299e-06, + "loss": 0.4183, + "step": 14499 + }, + { + "epoch": 2.3813150493708046, + "grad_norm": 0.31701743882648536, + "learning_rate": 4.908614629677794e-06, + "loss": 0.4235, + "step": 14500 + }, + { + "epoch": 2.3814792765791473, + "grad_norm": 0.34108602787338077, + "learning_rate": 4.908147592654332e-06, + "loss": 0.4231, + "step": 14501 + }, + { + "epoch": 2.38164350378749, + "grad_norm": 0.3935501178826395, + "learning_rate": 4.9076805511979845e-06, + "loss": 0.4539, + "step": 14502 + }, + { + "epoch": 2.381807730995833, + "grad_norm": 0.3131609275378316, + "learning_rate": 4.907213505313825e-06, + "loss": 0.4473, + "step": 14503 + }, + { + "epoch": 2.3819719582041756, + "grad_norm": 0.3002432702582786, + "learning_rate": 4.9067464550069275e-06, + "loss": 0.442, + "step": 14504 + }, + { + "epoch": 2.3821361854125183, + "grad_norm": 0.3096378279868265, + "learning_rate": 4.906279400282362e-06, + "loss": 0.4402, + "step": 14505 + }, + { + "epoch": 2.382300412620861, + "grad_norm": 0.2763473957324622, + "learning_rate": 4.905812341145204e-06, + "loss": 0.4298, + "step": 14506 + }, + { + "epoch": 2.3824646398292035, + "grad_norm": 0.26518791364843874, + "learning_rate": 4.905345277600524e-06, + "loss": 0.4342, + "step": 14507 + }, + { + "epoch": 2.3826288670375466, + "grad_norm": 0.2906595974916181, + "learning_rate": 4.904878209653394e-06, + "loss": 0.4429, + "step": 14508 + }, + { + "epoch": 2.3827930942458893, + "grad_norm": 0.3391787092663081, + "learning_rate": 4.904411137308889e-06, + "loss": 0.4355, + "step": 14509 + }, + { + "epoch": 2.382957321454232, + "grad_norm": 0.300445441947826, + "learning_rate": 4.9039440605720834e-06, + "loss": 0.4338, + "step": 14510 + }, + { + "epoch": 2.3831215486625745, + "grad_norm": 0.26723588598079895, + "learning_rate": 4.903476979448045e-06, + "loss": 0.4346, + "step": 14511 + }, + { + "epoch": 2.3832857758709176, + "grad_norm": 0.2838691958214916, + "learning_rate": 4.903009893941851e-06, + "loss": 0.4504, + "step": 14512 + }, + { + "epoch": 2.3834500030792602, + "grad_norm": 0.36319610087457616, + "learning_rate": 4.902542804058573e-06, + "loss": 0.4413, + "step": 14513 + }, + { + "epoch": 2.383614230287603, + "grad_norm": 0.4271662136062928, + "learning_rate": 4.902075709803284e-06, + "loss": 0.4456, + "step": 14514 + }, + { + "epoch": 2.3837784574959455, + "grad_norm": 0.5264160203486998, + "learning_rate": 4.901608611181057e-06, + "loss": 0.4405, + "step": 14515 + }, + { + "epoch": 2.383942684704288, + "grad_norm": 0.28902641518996575, + "learning_rate": 4.901141508196965e-06, + "loss": 0.4243, + "step": 14516 + }, + { + "epoch": 2.3841069119126312, + "grad_norm": 0.28998258912696195, + "learning_rate": 4.900674400856082e-06, + "loss": 0.4389, + "step": 14517 + }, + { + "epoch": 2.384271139120974, + "grad_norm": 0.297485830160264, + "learning_rate": 4.900207289163482e-06, + "loss": 0.4473, + "step": 14518 + }, + { + "epoch": 2.3844353663293165, + "grad_norm": 0.32857178706650425, + "learning_rate": 4.899740173124236e-06, + "loss": 0.437, + "step": 14519 + }, + { + "epoch": 2.3845995935376596, + "grad_norm": 0.33976806970867174, + "learning_rate": 4.899273052743418e-06, + "loss": 0.439, + "step": 14520 + }, + { + "epoch": 2.3847638207460022, + "grad_norm": 0.2981178461359611, + "learning_rate": 4.898805928026102e-06, + "loss": 0.4224, + "step": 14521 + }, + { + "epoch": 2.384928047954345, + "grad_norm": 0.40828126605470355, + "learning_rate": 4.8983387989773605e-06, + "loss": 0.4499, + "step": 14522 + }, + { + "epoch": 2.3850922751626875, + "grad_norm": 0.28611327371513295, + "learning_rate": 4.8978716656022686e-06, + "loss": 0.4221, + "step": 14523 + }, + { + "epoch": 2.38525650237103, + "grad_norm": 0.39824063682909056, + "learning_rate": 4.8974045279059e-06, + "loss": 0.4388, + "step": 14524 + }, + { + "epoch": 2.3854207295793732, + "grad_norm": 0.5609924737582214, + "learning_rate": 4.896937385893327e-06, + "loss": 0.4526, + "step": 14525 + }, + { + "epoch": 2.385584956787716, + "grad_norm": 0.31889911235677965, + "learning_rate": 4.896470239569622e-06, + "loss": 0.4425, + "step": 14526 + }, + { + "epoch": 2.3857491839960585, + "grad_norm": 0.302289456244296, + "learning_rate": 4.8960030889398605e-06, + "loss": 0.4016, + "step": 14527 + }, + { + "epoch": 2.385913411204401, + "grad_norm": 0.3105265703326025, + "learning_rate": 4.895535934009116e-06, + "loss": 0.4446, + "step": 14528 + }, + { + "epoch": 2.3860776384127442, + "grad_norm": 0.29037798400142933, + "learning_rate": 4.895068774782463e-06, + "loss": 0.4385, + "step": 14529 + }, + { + "epoch": 2.386241865621087, + "grad_norm": 0.3254765727629465, + "learning_rate": 4.894601611264973e-06, + "loss": 0.4645, + "step": 14530 + }, + { + "epoch": 2.3864060928294295, + "grad_norm": 0.28695733641227344, + "learning_rate": 4.894134443461723e-06, + "loss": 0.4496, + "step": 14531 + }, + { + "epoch": 2.386570320037772, + "grad_norm": 0.31091703634070156, + "learning_rate": 4.893667271377783e-06, + "loss": 0.4481, + "step": 14532 + }, + { + "epoch": 2.386734547246115, + "grad_norm": 0.3169028926886554, + "learning_rate": 4.8932000950182316e-06, + "loss": 0.4358, + "step": 14533 + }, + { + "epoch": 2.386898774454458, + "grad_norm": 0.32378659995173825, + "learning_rate": 4.892732914388138e-06, + "loss": 0.4216, + "step": 14534 + }, + { + "epoch": 2.3870630016628005, + "grad_norm": 0.3424600596134925, + "learning_rate": 4.89226572949258e-06, + "loss": 0.4347, + "step": 14535 + }, + { + "epoch": 2.387227228871143, + "grad_norm": 0.4083489609261254, + "learning_rate": 4.891798540336628e-06, + "loss": 0.4335, + "step": 14536 + }, + { + "epoch": 2.387391456079486, + "grad_norm": 0.31418734811658605, + "learning_rate": 4.891331346925361e-06, + "loss": 0.4284, + "step": 14537 + }, + { + "epoch": 2.387555683287829, + "grad_norm": 0.3118148361256724, + "learning_rate": 4.89086414926385e-06, + "loss": 0.4376, + "step": 14538 + }, + { + "epoch": 2.3877199104961715, + "grad_norm": 0.3205728376077856, + "learning_rate": 4.890396947357169e-06, + "loss": 0.4489, + "step": 14539 + }, + { + "epoch": 2.387884137704514, + "grad_norm": 0.3037510041274979, + "learning_rate": 4.889929741210394e-06, + "loss": 0.4459, + "step": 14540 + }, + { + "epoch": 2.3880483649128568, + "grad_norm": 0.29716183603278645, + "learning_rate": 4.889462530828597e-06, + "loss": 0.4372, + "step": 14541 + }, + { + "epoch": 2.3882125921212, + "grad_norm": 0.29372871219067426, + "learning_rate": 4.888995316216855e-06, + "loss": 0.4338, + "step": 14542 + }, + { + "epoch": 2.3883768193295425, + "grad_norm": 0.31376207464816375, + "learning_rate": 4.888528097380241e-06, + "loss": 0.4438, + "step": 14543 + }, + { + "epoch": 2.388541046537885, + "grad_norm": 0.3638794588880243, + "learning_rate": 4.88806087432383e-06, + "loss": 0.4374, + "step": 14544 + }, + { + "epoch": 2.3887052737462278, + "grad_norm": 0.4157192070799767, + "learning_rate": 4.8875936470526956e-06, + "loss": 0.4471, + "step": 14545 + }, + { + "epoch": 2.388869500954571, + "grad_norm": 0.4629312247969477, + "learning_rate": 4.887126415571912e-06, + "loss": 0.4498, + "step": 14546 + }, + { + "epoch": 2.3890337281629135, + "grad_norm": 0.35641292496732957, + "learning_rate": 4.886659179886555e-06, + "loss": 0.4136, + "step": 14547 + }, + { + "epoch": 2.389197955371256, + "grad_norm": 0.29823085621147055, + "learning_rate": 4.886191940001701e-06, + "loss": 0.4372, + "step": 14548 + }, + { + "epoch": 2.3893621825795988, + "grad_norm": 0.3648444197028791, + "learning_rate": 4.88572469592242e-06, + "loss": 0.4408, + "step": 14549 + }, + { + "epoch": 2.3895264097879414, + "grad_norm": 0.3131996192986303, + "learning_rate": 4.88525744765379e-06, + "loss": 0.4271, + "step": 14550 + }, + { + "epoch": 2.3896906369962845, + "grad_norm": 0.9393732568333811, + "learning_rate": 4.884790195200884e-06, + "loss": 0.4277, + "step": 14551 + }, + { + "epoch": 2.389854864204627, + "grad_norm": 0.49807294959646065, + "learning_rate": 4.88432293856878e-06, + "loss": 0.4137, + "step": 14552 + }, + { + "epoch": 2.3900190914129698, + "grad_norm": 0.2795714012023261, + "learning_rate": 4.88385567776255e-06, + "loss": 0.426, + "step": 14553 + }, + { + "epoch": 2.390183318621313, + "grad_norm": 0.3680107524133055, + "learning_rate": 4.883388412787269e-06, + "loss": 0.4374, + "step": 14554 + }, + { + "epoch": 2.3903475458296555, + "grad_norm": 0.2576176856027463, + "learning_rate": 4.882921143648013e-06, + "loss": 0.4269, + "step": 14555 + }, + { + "epoch": 2.390511773037998, + "grad_norm": 0.33334827288723884, + "learning_rate": 4.882453870349858e-06, + "loss": 0.4424, + "step": 14556 + }, + { + "epoch": 2.3906760002463407, + "grad_norm": 0.32855118523344423, + "learning_rate": 4.881986592897875e-06, + "loss": 0.4258, + "step": 14557 + }, + { + "epoch": 2.3908402274546834, + "grad_norm": 0.2641498222991579, + "learning_rate": 4.881519311297145e-06, + "loss": 0.4335, + "step": 14558 + }, + { + "epoch": 2.3910044546630265, + "grad_norm": 0.30007217089842214, + "learning_rate": 4.881052025552737e-06, + "loss": 0.4448, + "step": 14559 + }, + { + "epoch": 2.391168681871369, + "grad_norm": 0.2652220391149274, + "learning_rate": 4.880584735669731e-06, + "loss": 0.4102, + "step": 14560 + }, + { + "epoch": 2.3913329090797117, + "grad_norm": 0.7023406690762278, + "learning_rate": 4.880117441653199e-06, + "loss": 0.4472, + "step": 14561 + }, + { + "epoch": 2.3914971362880544, + "grad_norm": 0.3832103540577619, + "learning_rate": 4.879650143508217e-06, + "loss": 0.443, + "step": 14562 + }, + { + "epoch": 2.3916613634963975, + "grad_norm": 0.3768401551082294, + "learning_rate": 4.879182841239863e-06, + "loss": 0.4443, + "step": 14563 + }, + { + "epoch": 2.39182559070474, + "grad_norm": 0.529999664933227, + "learning_rate": 4.87871553485321e-06, + "loss": 0.4524, + "step": 14564 + }, + { + "epoch": 2.3919898179130827, + "grad_norm": 0.5284450836711834, + "learning_rate": 4.878248224353334e-06, + "loss": 0.4441, + "step": 14565 + }, + { + "epoch": 2.3921540451214254, + "grad_norm": 0.32265336090808, + "learning_rate": 4.877780909745308e-06, + "loss": 0.4271, + "step": 14566 + }, + { + "epoch": 2.392318272329768, + "grad_norm": 0.30649852459360966, + "learning_rate": 4.8773135910342105e-06, + "loss": 0.4307, + "step": 14567 + }, + { + "epoch": 2.392482499538111, + "grad_norm": 0.34600620366711066, + "learning_rate": 4.876846268225117e-06, + "loss": 0.426, + "step": 14568 + }, + { + "epoch": 2.3926467267464537, + "grad_norm": 0.2898991988895646, + "learning_rate": 4.876378941323102e-06, + "loss": 0.4361, + "step": 14569 + }, + { + "epoch": 2.3928109539547964, + "grad_norm": 0.31657080709066865, + "learning_rate": 4.87591161033324e-06, + "loss": 0.4468, + "step": 14570 + }, + { + "epoch": 2.392975181163139, + "grad_norm": 0.3682597438220124, + "learning_rate": 4.875444275260609e-06, + "loss": 0.4413, + "step": 14571 + }, + { + "epoch": 2.393139408371482, + "grad_norm": 0.2818996138587926, + "learning_rate": 4.8749769361102855e-06, + "loss": 0.4281, + "step": 14572 + }, + { + "epoch": 2.3933036355798247, + "grad_norm": 0.2760481393412478, + "learning_rate": 4.874509592887342e-06, + "loss": 0.4633, + "step": 14573 + }, + { + "epoch": 2.3934678627881674, + "grad_norm": 0.3090199962692466, + "learning_rate": 4.874042245596856e-06, + "loss": 0.4345, + "step": 14574 + }, + { + "epoch": 2.39363208999651, + "grad_norm": 0.3753236126182619, + "learning_rate": 4.873574894243902e-06, + "loss": 0.4334, + "step": 14575 + }, + { + "epoch": 2.393796317204853, + "grad_norm": 0.49666749402603405, + "learning_rate": 4.873107538833558e-06, + "loss": 0.4138, + "step": 14576 + }, + { + "epoch": 2.3939605444131957, + "grad_norm": 0.44873693999064757, + "learning_rate": 4.8726401793709e-06, + "loss": 0.4385, + "step": 14577 + }, + { + "epoch": 2.3941247716215384, + "grad_norm": 0.5624188771958173, + "learning_rate": 4.872172815861003e-06, + "loss": 0.4535, + "step": 14578 + }, + { + "epoch": 2.394288998829881, + "grad_norm": 0.36776524540715916, + "learning_rate": 4.871705448308942e-06, + "loss": 0.4431, + "step": 14579 + }, + { + "epoch": 2.394453226038224, + "grad_norm": 0.40112504999237575, + "learning_rate": 4.871238076719794e-06, + "loss": 0.4004, + "step": 14580 + }, + { + "epoch": 2.3946174532465667, + "grad_norm": 0.38816955768952405, + "learning_rate": 4.8707707010986365e-06, + "loss": 0.444, + "step": 14581 + }, + { + "epoch": 2.3947816804549094, + "grad_norm": 0.40890090055450345, + "learning_rate": 4.870303321450544e-06, + "loss": 0.4244, + "step": 14582 + }, + { + "epoch": 2.394945907663252, + "grad_norm": 0.347335496686221, + "learning_rate": 4.869835937780592e-06, + "loss": 0.4351, + "step": 14583 + }, + { + "epoch": 2.3951101348715946, + "grad_norm": 0.37816795611763476, + "learning_rate": 4.869368550093859e-06, + "loss": 0.4281, + "step": 14584 + }, + { + "epoch": 2.3952743620799377, + "grad_norm": 0.31915528359769213, + "learning_rate": 4.868901158395418e-06, + "loss": 0.4397, + "step": 14585 + }, + { + "epoch": 2.3954385892882804, + "grad_norm": 0.33747799228179265, + "learning_rate": 4.86843376269035e-06, + "loss": 0.4453, + "step": 14586 + }, + { + "epoch": 2.395602816496623, + "grad_norm": 0.3983997690577079, + "learning_rate": 4.867966362983728e-06, + "loss": 0.4484, + "step": 14587 + }, + { + "epoch": 2.3957670437049656, + "grad_norm": 0.475030543908353, + "learning_rate": 4.86749895928063e-06, + "loss": 0.428, + "step": 14588 + }, + { + "epoch": 2.3959312709133087, + "grad_norm": 0.3346654030306633, + "learning_rate": 4.86703155158613e-06, + "loss": 0.4342, + "step": 14589 + }, + { + "epoch": 2.3960954981216513, + "grad_norm": 0.286620871577916, + "learning_rate": 4.866564139905308e-06, + "loss": 0.4243, + "step": 14590 + }, + { + "epoch": 2.396259725329994, + "grad_norm": 0.3188961526696404, + "learning_rate": 4.866096724243238e-06, + "loss": 0.4302, + "step": 14591 + }, + { + "epoch": 2.3964239525383366, + "grad_norm": 0.4410066533907783, + "learning_rate": 4.8656293046049976e-06, + "loss": 0.4246, + "step": 14592 + }, + { + "epoch": 2.3965881797466797, + "grad_norm": 0.30105566284819324, + "learning_rate": 4.865161880995663e-06, + "loss": 0.4263, + "step": 14593 + }, + { + "epoch": 2.3967524069550223, + "grad_norm": 0.30798326084830907, + "learning_rate": 4.864694453420312e-06, + "loss": 0.447, + "step": 14594 + }, + { + "epoch": 2.396916634163365, + "grad_norm": 0.32150354987246904, + "learning_rate": 4.864227021884018e-06, + "loss": 0.4431, + "step": 14595 + }, + { + "epoch": 2.3970808613717076, + "grad_norm": 0.42606120355256905, + "learning_rate": 4.863759586391862e-06, + "loss": 0.4228, + "step": 14596 + }, + { + "epoch": 2.3972450885800507, + "grad_norm": 0.3043593996027058, + "learning_rate": 4.863292146948919e-06, + "loss": 0.4576, + "step": 14597 + }, + { + "epoch": 2.3974093157883933, + "grad_norm": 0.24487691882867033, + "learning_rate": 4.862824703560266e-06, + "loss": 0.4333, + "step": 14598 + }, + { + "epoch": 2.397573542996736, + "grad_norm": 0.4344466671080413, + "learning_rate": 4.862357256230979e-06, + "loss": 0.4571, + "step": 14599 + }, + { + "epoch": 2.3977377702050786, + "grad_norm": 0.4551198744757983, + "learning_rate": 4.861889804966136e-06, + "loss": 0.4434, + "step": 14600 + }, + { + "epoch": 2.3979019974134212, + "grad_norm": 0.3458326007684332, + "learning_rate": 4.861422349770814e-06, + "loss": 0.413, + "step": 14601 + }, + { + "epoch": 2.3980662246217643, + "grad_norm": 0.28749146184798846, + "learning_rate": 4.8609548906500895e-06, + "loss": 0.4352, + "step": 14602 + }, + { + "epoch": 2.398230451830107, + "grad_norm": 0.3190568605296688, + "learning_rate": 4.860487427609039e-06, + "loss": 0.412, + "step": 14603 + }, + { + "epoch": 2.3983946790384496, + "grad_norm": 0.30187148550272713, + "learning_rate": 4.860019960652741e-06, + "loss": 0.438, + "step": 14604 + }, + { + "epoch": 2.3985589062467922, + "grad_norm": 0.3017561582254539, + "learning_rate": 4.859552489786272e-06, + "loss": 0.4505, + "step": 14605 + }, + { + "epoch": 2.3987231334551353, + "grad_norm": 1.293891871086484, + "learning_rate": 4.85908501501471e-06, + "loss": 0.469, + "step": 14606 + }, + { + "epoch": 2.398887360663478, + "grad_norm": 0.5743909815937656, + "learning_rate": 4.858617536343131e-06, + "loss": 0.4301, + "step": 14607 + }, + { + "epoch": 2.3990515878718206, + "grad_norm": 0.30495791593097715, + "learning_rate": 4.858150053776612e-06, + "loss": 0.4623, + "step": 14608 + }, + { + "epoch": 2.3992158150801632, + "grad_norm": 0.34780932489427824, + "learning_rate": 4.857682567320231e-06, + "loss": 0.4429, + "step": 14609 + }, + { + "epoch": 2.3993800422885063, + "grad_norm": 0.3017423370784942, + "learning_rate": 4.857215076979065e-06, + "loss": 0.4366, + "step": 14610 + }, + { + "epoch": 2.399544269496849, + "grad_norm": 0.49134203031797363, + "learning_rate": 4.856747582758193e-06, + "loss": 0.4387, + "step": 14611 + }, + { + "epoch": 2.3997084967051916, + "grad_norm": 0.36562730185281356, + "learning_rate": 4.856280084662692e-06, + "loss": 0.4644, + "step": 14612 + }, + { + "epoch": 2.3998727239135342, + "grad_norm": 0.3097029681533732, + "learning_rate": 4.855812582697637e-06, + "loss": 0.4408, + "step": 14613 + }, + { + "epoch": 2.4000369511218773, + "grad_norm": 0.38873300419101187, + "learning_rate": 4.8553450768681075e-06, + "loss": 0.4431, + "step": 14614 + }, + { + "epoch": 2.40020117833022, + "grad_norm": 0.31761409622079567, + "learning_rate": 4.854877567179182e-06, + "loss": 0.4001, + "step": 14615 + }, + { + "epoch": 2.4003654055385626, + "grad_norm": 0.2777570280076414, + "learning_rate": 4.8544100536359375e-06, + "loss": 0.4245, + "step": 14616 + }, + { + "epoch": 2.4005296327469052, + "grad_norm": 0.31920044070247267, + "learning_rate": 4.853942536243449e-06, + "loss": 0.4412, + "step": 14617 + }, + { + "epoch": 2.400693859955248, + "grad_norm": 0.37106495638871273, + "learning_rate": 4.8534750150067965e-06, + "loss": 0.4423, + "step": 14618 + }, + { + "epoch": 2.400858087163591, + "grad_norm": 0.30136802472197904, + "learning_rate": 4.853007489931059e-06, + "loss": 0.4284, + "step": 14619 + }, + { + "epoch": 2.4010223143719336, + "grad_norm": 0.34743551515882315, + "learning_rate": 4.8525399610213115e-06, + "loss": 0.4357, + "step": 14620 + }, + { + "epoch": 2.4011865415802762, + "grad_norm": 0.4630662332483221, + "learning_rate": 4.852072428282635e-06, + "loss": 0.4507, + "step": 14621 + }, + { + "epoch": 2.401350768788619, + "grad_norm": 0.34421273907801864, + "learning_rate": 4.851604891720104e-06, + "loss": 0.4261, + "step": 14622 + }, + { + "epoch": 2.401514995996962, + "grad_norm": 0.30429866073920575, + "learning_rate": 4.851137351338798e-06, + "loss": 0.4375, + "step": 14623 + }, + { + "epoch": 2.4016792232053046, + "grad_norm": 0.30710285649408753, + "learning_rate": 4.850669807143795e-06, + "loss": 0.4438, + "step": 14624 + }, + { + "epoch": 2.401843450413647, + "grad_norm": 0.3111504370306694, + "learning_rate": 4.850202259140173e-06, + "loss": 0.4482, + "step": 14625 + }, + { + "epoch": 2.40200767762199, + "grad_norm": 0.2772359140975464, + "learning_rate": 4.84973470733301e-06, + "loss": 0.4459, + "step": 14626 + }, + { + "epoch": 2.402171904830333, + "grad_norm": 0.34639818656776106, + "learning_rate": 4.849267151727385e-06, + "loss": 0.4418, + "step": 14627 + }, + { + "epoch": 2.4023361320386756, + "grad_norm": 0.2964347781930256, + "learning_rate": 4.848799592328374e-06, + "loss": 0.4443, + "step": 14628 + }, + { + "epoch": 2.402500359247018, + "grad_norm": 0.33595929206815733, + "learning_rate": 4.848332029141055e-06, + "loss": 0.4301, + "step": 14629 + }, + { + "epoch": 2.402664586455361, + "grad_norm": 0.4015804960132361, + "learning_rate": 4.847864462170509e-06, + "loss": 0.4376, + "step": 14630 + }, + { + "epoch": 2.402828813663704, + "grad_norm": 0.36583852333102235, + "learning_rate": 4.847396891421814e-06, + "loss": 0.4408, + "step": 14631 + }, + { + "epoch": 2.4029930408720466, + "grad_norm": 0.3243934301895329, + "learning_rate": 4.8469293169000455e-06, + "loss": 0.432, + "step": 14632 + }, + { + "epoch": 2.403157268080389, + "grad_norm": 1.1242199526297239, + "learning_rate": 4.846461738610282e-06, + "loss": 0.4221, + "step": 14633 + }, + { + "epoch": 2.403321495288732, + "grad_norm": 0.6124803657632618, + "learning_rate": 4.845994156557604e-06, + "loss": 0.4373, + "step": 14634 + }, + { + "epoch": 2.4034857224970745, + "grad_norm": 0.31726548703950835, + "learning_rate": 4.84552657074709e-06, + "loss": 0.4681, + "step": 14635 + }, + { + "epoch": 2.4036499497054176, + "grad_norm": 0.3402830482709174, + "learning_rate": 4.845058981183817e-06, + "loss": 0.4427, + "step": 14636 + }, + { + "epoch": 2.40381417691376, + "grad_norm": 0.41415711718620096, + "learning_rate": 4.8445913878728644e-06, + "loss": 0.4509, + "step": 14637 + }, + { + "epoch": 2.403978404122103, + "grad_norm": 0.30062268597490205, + "learning_rate": 4.844123790819309e-06, + "loss": 0.4277, + "step": 14638 + }, + { + "epoch": 2.4041426313304455, + "grad_norm": 0.3777055667239158, + "learning_rate": 4.84365619002823e-06, + "loss": 0.4244, + "step": 14639 + }, + { + "epoch": 2.4043068585387886, + "grad_norm": 0.35823953526949825, + "learning_rate": 4.84318858550471e-06, + "loss": 0.4382, + "step": 14640 + }, + { + "epoch": 2.404471085747131, + "grad_norm": 0.29672629622495195, + "learning_rate": 4.842720977253822e-06, + "loss": 0.4321, + "step": 14641 + }, + { + "epoch": 2.404635312955474, + "grad_norm": 0.4199981609070766, + "learning_rate": 4.842253365280647e-06, + "loss": 0.4396, + "step": 14642 + }, + { + "epoch": 2.4047995401638165, + "grad_norm": 0.3244624973865936, + "learning_rate": 4.841785749590264e-06, + "loss": 0.4583, + "step": 14643 + }, + { + "epoch": 2.4049637673721596, + "grad_norm": 0.30771806534999885, + "learning_rate": 4.841318130187752e-06, + "loss": 0.4359, + "step": 14644 + }, + { + "epoch": 2.405127994580502, + "grad_norm": 0.29862090519592716, + "learning_rate": 4.840850507078189e-06, + "loss": 0.4414, + "step": 14645 + }, + { + "epoch": 2.405292221788845, + "grad_norm": 0.32810428176479883, + "learning_rate": 4.840382880266654e-06, + "loss": 0.4249, + "step": 14646 + }, + { + "epoch": 2.4054564489971875, + "grad_norm": 0.36740941744035893, + "learning_rate": 4.8399152497582255e-06, + "loss": 0.4569, + "step": 14647 + }, + { + "epoch": 2.4056206762055306, + "grad_norm": 0.2634679213019516, + "learning_rate": 4.839447615557984e-06, + "loss": 0.4461, + "step": 14648 + }, + { + "epoch": 2.405784903413873, + "grad_norm": 0.29881940701178017, + "learning_rate": 4.838979977671007e-06, + "loss": 0.4378, + "step": 14649 + }, + { + "epoch": 2.405949130622216, + "grad_norm": 0.29574009739461327, + "learning_rate": 4.838512336102374e-06, + "loss": 0.4135, + "step": 14650 + }, + { + "epoch": 2.4061133578305585, + "grad_norm": 0.30183737594772203, + "learning_rate": 4.838044690857163e-06, + "loss": 0.4418, + "step": 14651 + }, + { + "epoch": 2.406277585038901, + "grad_norm": 0.3247926430668289, + "learning_rate": 4.8375770419404566e-06, + "loss": 0.4605, + "step": 14652 + }, + { + "epoch": 2.406441812247244, + "grad_norm": 0.2958133357958267, + "learning_rate": 4.837109389357329e-06, + "loss": 0.4294, + "step": 14653 + }, + { + "epoch": 2.406606039455587, + "grad_norm": 0.36391652836693766, + "learning_rate": 4.836641733112861e-06, + "loss": 0.4277, + "step": 14654 + }, + { + "epoch": 2.4067702666639295, + "grad_norm": 0.332486098628157, + "learning_rate": 4.836174073212136e-06, + "loss": 0.4281, + "step": 14655 + }, + { + "epoch": 2.406934493872272, + "grad_norm": 0.31970043225608835, + "learning_rate": 4.835706409660227e-06, + "loss": 0.4152, + "step": 14656 + }, + { + "epoch": 2.407098721080615, + "grad_norm": 0.38834894755127186, + "learning_rate": 4.8352387424622174e-06, + "loss": 0.4234, + "step": 14657 + }, + { + "epoch": 2.407262948288958, + "grad_norm": 0.4903854226706532, + "learning_rate": 4.834771071623184e-06, + "loss": 0.4418, + "step": 14658 + }, + { + "epoch": 2.4074271754973005, + "grad_norm": 0.36363960384397537, + "learning_rate": 4.834303397148208e-06, + "loss": 0.4549, + "step": 14659 + }, + { + "epoch": 2.407591402705643, + "grad_norm": 0.29685954160347816, + "learning_rate": 4.8338357190423684e-06, + "loss": 0.4276, + "step": 14660 + }, + { + "epoch": 2.407755629913986, + "grad_norm": 0.29239727092336887, + "learning_rate": 4.833368037310746e-06, + "loss": 0.4477, + "step": 14661 + }, + { + "epoch": 2.407919857122329, + "grad_norm": 0.25791649206692546, + "learning_rate": 4.832900351958416e-06, + "loss": 0.4322, + "step": 14662 + }, + { + "epoch": 2.4080840843306714, + "grad_norm": 0.3088996194131839, + "learning_rate": 4.832432662990462e-06, + "loss": 0.4291, + "step": 14663 + }, + { + "epoch": 2.408248311539014, + "grad_norm": 0.41234587740782985, + "learning_rate": 4.831964970411962e-06, + "loss": 0.4531, + "step": 14664 + }, + { + "epoch": 2.408412538747357, + "grad_norm": 0.28658592688301243, + "learning_rate": 4.831497274227996e-06, + "loss": 0.4164, + "step": 14665 + }, + { + "epoch": 2.4085767659557, + "grad_norm": 0.340510853960804, + "learning_rate": 4.831029574443644e-06, + "loss": 0.4475, + "step": 14666 + }, + { + "epoch": 2.4087409931640424, + "grad_norm": 0.28144928044547207, + "learning_rate": 4.830561871063983e-06, + "loss": 0.4171, + "step": 14667 + }, + { + "epoch": 2.408905220372385, + "grad_norm": 0.28529132479570535, + "learning_rate": 4.830094164094096e-06, + "loss": 0.4457, + "step": 14668 + }, + { + "epoch": 2.4090694475807277, + "grad_norm": 0.3332582267058933, + "learning_rate": 4.829626453539062e-06, + "loss": 0.4128, + "step": 14669 + }, + { + "epoch": 2.409233674789071, + "grad_norm": 0.3058023319027211, + "learning_rate": 4.829158739403962e-06, + "loss": 0.4351, + "step": 14670 + }, + { + "epoch": 2.4093979019974134, + "grad_norm": 0.294349833850524, + "learning_rate": 4.828691021693872e-06, + "loss": 0.4613, + "step": 14671 + }, + { + "epoch": 2.409562129205756, + "grad_norm": 0.45108829225684566, + "learning_rate": 4.828223300413873e-06, + "loss": 0.4235, + "step": 14672 + }, + { + "epoch": 2.4097263564140987, + "grad_norm": 0.29688811430073514, + "learning_rate": 4.8277555755690495e-06, + "loss": 0.45, + "step": 14673 + }, + { + "epoch": 2.409890583622442, + "grad_norm": 0.3621360599742629, + "learning_rate": 4.827287847164475e-06, + "loss": 0.4473, + "step": 14674 + }, + { + "epoch": 2.4100548108307844, + "grad_norm": 0.27637854458418865, + "learning_rate": 4.826820115205234e-06, + "loss": 0.4417, + "step": 14675 + }, + { + "epoch": 2.410219038039127, + "grad_norm": 0.5931964755976143, + "learning_rate": 4.8263523796964045e-06, + "loss": 0.4292, + "step": 14676 + }, + { + "epoch": 2.4103832652474697, + "grad_norm": 0.27390405093577597, + "learning_rate": 4.825884640643067e-06, + "loss": 0.4545, + "step": 14677 + }, + { + "epoch": 2.410547492455813, + "grad_norm": 0.397743748353218, + "learning_rate": 4.825416898050302e-06, + "loss": 0.4375, + "step": 14678 + }, + { + "epoch": 2.4107117196641554, + "grad_norm": 0.31255784691419425, + "learning_rate": 4.82494915192319e-06, + "loss": 0.4449, + "step": 14679 + }, + { + "epoch": 2.410875946872498, + "grad_norm": 1.4403416299574465, + "learning_rate": 4.824481402266809e-06, + "loss": 0.442, + "step": 14680 + }, + { + "epoch": 2.4110401740808407, + "grad_norm": 0.33922366749113997, + "learning_rate": 4.82401364908624e-06, + "loss": 0.4288, + "step": 14681 + }, + { + "epoch": 2.411204401289184, + "grad_norm": 0.31757509275868, + "learning_rate": 4.823545892386567e-06, + "loss": 0.4422, + "step": 14682 + }, + { + "epoch": 2.4113686284975264, + "grad_norm": 0.304788723585256, + "learning_rate": 4.823078132172865e-06, + "loss": 0.4255, + "step": 14683 + }, + { + "epoch": 2.411532855705869, + "grad_norm": 0.27335390554437755, + "learning_rate": 4.822610368450218e-06, + "loss": 0.4283, + "step": 14684 + }, + { + "epoch": 2.4116970829142117, + "grad_norm": 0.3469198590453585, + "learning_rate": 4.822142601223704e-06, + "loss": 0.4274, + "step": 14685 + }, + { + "epoch": 2.4118613101225543, + "grad_norm": 0.3807364827960831, + "learning_rate": 4.8216748304984055e-06, + "loss": 0.4338, + "step": 14686 + }, + { + "epoch": 2.4120255373308974, + "grad_norm": 0.3371271371858553, + "learning_rate": 4.8212070562794e-06, + "loss": 0.435, + "step": 14687 + }, + { + "epoch": 2.41218976453924, + "grad_norm": 0.43622776633348503, + "learning_rate": 4.820739278571771e-06, + "loss": 0.4447, + "step": 14688 + }, + { + "epoch": 2.4123539917475827, + "grad_norm": 0.334119379770926, + "learning_rate": 4.820271497380598e-06, + "loss": 0.4353, + "step": 14689 + }, + { + "epoch": 2.4125182189559253, + "grad_norm": 0.41180417575499706, + "learning_rate": 4.819803712710961e-06, + "loss": 0.4331, + "step": 14690 + }, + { + "epoch": 2.4126824461642684, + "grad_norm": 0.31571247354022247, + "learning_rate": 4.819335924567942e-06, + "loss": 0.4396, + "step": 14691 + }, + { + "epoch": 2.412846673372611, + "grad_norm": 0.34680247785096713, + "learning_rate": 4.818868132956619e-06, + "loss": 0.4331, + "step": 14692 + }, + { + "epoch": 2.4130109005809537, + "grad_norm": 0.3080440473620233, + "learning_rate": 4.818400337882075e-06, + "loss": 0.43, + "step": 14693 + }, + { + "epoch": 2.4131751277892963, + "grad_norm": 0.2817006501283312, + "learning_rate": 4.8179325393493906e-06, + "loss": 0.449, + "step": 14694 + }, + { + "epoch": 2.4133393549976394, + "grad_norm": 0.3112562541286652, + "learning_rate": 4.817464737363646e-06, + "loss": 0.4373, + "step": 14695 + }, + { + "epoch": 2.413503582205982, + "grad_norm": 0.3130396013840511, + "learning_rate": 4.816996931929922e-06, + "loss": 0.4451, + "step": 14696 + }, + { + "epoch": 2.4136678094143247, + "grad_norm": 0.32321218750431113, + "learning_rate": 4.816529123053298e-06, + "loss": 0.4302, + "step": 14697 + }, + { + "epoch": 2.4138320366226673, + "grad_norm": 0.3698459892322004, + "learning_rate": 4.816061310738859e-06, + "loss": 0.4359, + "step": 14698 + }, + { + "epoch": 2.4139962638310104, + "grad_norm": 0.42413402293263597, + "learning_rate": 4.815593494991681e-06, + "loss": 0.4245, + "step": 14699 + }, + { + "epoch": 2.414160491039353, + "grad_norm": 0.30761605557733, + "learning_rate": 4.815125675816848e-06, + "loss": 0.4256, + "step": 14700 + }, + { + "epoch": 2.4143247182476957, + "grad_norm": 0.24302570900623616, + "learning_rate": 4.81465785321944e-06, + "loss": 0.4306, + "step": 14701 + }, + { + "epoch": 2.4144889454560383, + "grad_norm": 0.2927736876723968, + "learning_rate": 4.814190027204536e-06, + "loss": 0.4504, + "step": 14702 + }, + { + "epoch": 2.414653172664381, + "grad_norm": 0.38682221463914745, + "learning_rate": 4.813722197777223e-06, + "loss": 0.4254, + "step": 14703 + }, + { + "epoch": 2.414817399872724, + "grad_norm": 0.29190540997776415, + "learning_rate": 4.813254364942578e-06, + "loss": 0.4435, + "step": 14704 + }, + { + "epoch": 2.4149816270810667, + "grad_norm": 0.300206389002567, + "learning_rate": 4.81278652870568e-06, + "loss": 0.4327, + "step": 14705 + }, + { + "epoch": 2.4151458542894093, + "grad_norm": 0.39121257578811, + "learning_rate": 4.812318689071613e-06, + "loss": 0.4254, + "step": 14706 + }, + { + "epoch": 2.415310081497752, + "grad_norm": 0.2755867218892501, + "learning_rate": 4.811850846045459e-06, + "loss": 0.4484, + "step": 14707 + }, + { + "epoch": 2.415474308706095, + "grad_norm": 0.3882210233624939, + "learning_rate": 4.811382999632297e-06, + "loss": 0.4283, + "step": 14708 + }, + { + "epoch": 2.4156385359144377, + "grad_norm": 0.33194414181235643, + "learning_rate": 4.81091514983721e-06, + "loss": 0.4329, + "step": 14709 + }, + { + "epoch": 2.4158027631227803, + "grad_norm": 0.5485778450887616, + "learning_rate": 4.810447296665278e-06, + "loss": 0.4398, + "step": 14710 + }, + { + "epoch": 2.415966990331123, + "grad_norm": 0.2714483130812466, + "learning_rate": 4.809979440121583e-06, + "loss": 0.4438, + "step": 14711 + }, + { + "epoch": 2.416131217539466, + "grad_norm": 0.442827826944641, + "learning_rate": 4.8095115802112055e-06, + "loss": 0.4292, + "step": 14712 + }, + { + "epoch": 2.4162954447478087, + "grad_norm": 0.2997106074840441, + "learning_rate": 4.809043716939229e-06, + "loss": 0.43, + "step": 14713 + }, + { + "epoch": 2.4164596719561513, + "grad_norm": 0.3101144702754857, + "learning_rate": 4.8085758503107335e-06, + "loss": 0.4637, + "step": 14714 + }, + { + "epoch": 2.416623899164494, + "grad_norm": 0.4345488387599415, + "learning_rate": 4.8081079803308006e-06, + "loss": 0.441, + "step": 14715 + }, + { + "epoch": 2.416788126372837, + "grad_norm": 0.31524736528031577, + "learning_rate": 4.807640107004511e-06, + "loss": 0.4385, + "step": 14716 + }, + { + "epoch": 2.4169523535811797, + "grad_norm": 0.9910231052912541, + "learning_rate": 4.807172230336947e-06, + "loss": 0.451, + "step": 14717 + }, + { + "epoch": 2.4171165807895223, + "grad_norm": 0.353751030084181, + "learning_rate": 4.806704350333191e-06, + "loss": 0.4298, + "step": 14718 + }, + { + "epoch": 2.417280807997865, + "grad_norm": 0.2960552918752241, + "learning_rate": 4.8062364669983255e-06, + "loss": 0.4353, + "step": 14719 + }, + { + "epoch": 2.4174450352062076, + "grad_norm": 0.44033970780659726, + "learning_rate": 4.80576858033743e-06, + "loss": 0.4103, + "step": 14720 + }, + { + "epoch": 2.4176092624145507, + "grad_norm": 0.37219487130298756, + "learning_rate": 4.8053006903555846e-06, + "loss": 0.4263, + "step": 14721 + }, + { + "epoch": 2.4177734896228933, + "grad_norm": 0.30714372461167905, + "learning_rate": 4.804832797057875e-06, + "loss": 0.4256, + "step": 14722 + }, + { + "epoch": 2.417937716831236, + "grad_norm": 0.31272508721749476, + "learning_rate": 4.804364900449382e-06, + "loss": 0.4506, + "step": 14723 + }, + { + "epoch": 2.4181019440395786, + "grad_norm": 0.4030508319923359, + "learning_rate": 4.803897000535186e-06, + "loss": 0.4464, + "step": 14724 + }, + { + "epoch": 2.4182661712479216, + "grad_norm": 0.3141823854058452, + "learning_rate": 4.8034290973203695e-06, + "loss": 0.4415, + "step": 14725 + }, + { + "epoch": 2.4184303984562643, + "grad_norm": 0.33834956229051, + "learning_rate": 4.802961190810014e-06, + "loss": 0.4135, + "step": 14726 + }, + { + "epoch": 2.418594625664607, + "grad_norm": 0.33520231537538203, + "learning_rate": 4.802493281009202e-06, + "loss": 0.4389, + "step": 14727 + }, + { + "epoch": 2.4187588528729496, + "grad_norm": 0.289427403704886, + "learning_rate": 4.802025367923017e-06, + "loss": 0.4206, + "step": 14728 + }, + { + "epoch": 2.4189230800812926, + "grad_norm": 0.31902463019262406, + "learning_rate": 4.801557451556538e-06, + "loss": 0.4171, + "step": 14729 + }, + { + "epoch": 2.4190873072896353, + "grad_norm": 0.34344005937323363, + "learning_rate": 4.801089531914847e-06, + "loss": 0.4319, + "step": 14730 + }, + { + "epoch": 2.419251534497978, + "grad_norm": 0.3326226745260404, + "learning_rate": 4.80062160900303e-06, + "loss": 0.4359, + "step": 14731 + }, + { + "epoch": 2.4194157617063206, + "grad_norm": 0.34417344186097304, + "learning_rate": 4.800153682826166e-06, + "loss": 0.4375, + "step": 14732 + }, + { + "epoch": 2.4195799889146636, + "grad_norm": 0.2824968263701154, + "learning_rate": 4.799685753389338e-06, + "loss": 0.4317, + "step": 14733 + }, + { + "epoch": 2.4197442161230063, + "grad_norm": 0.4136191893113875, + "learning_rate": 4.799217820697626e-06, + "loss": 0.4159, + "step": 14734 + }, + { + "epoch": 2.419908443331349, + "grad_norm": 0.31603338795216673, + "learning_rate": 4.7987498847561146e-06, + "loss": 0.4325, + "step": 14735 + }, + { + "epoch": 2.4200726705396916, + "grad_norm": 0.34601978001013006, + "learning_rate": 4.798281945569887e-06, + "loss": 0.4193, + "step": 14736 + }, + { + "epoch": 2.420236897748034, + "grad_norm": 0.31763167069909876, + "learning_rate": 4.797814003144023e-06, + "loss": 0.4432, + "step": 14737 + }, + { + "epoch": 2.4204011249563773, + "grad_norm": 0.47743166016080013, + "learning_rate": 4.797346057483606e-06, + "loss": 0.4225, + "step": 14738 + }, + { + "epoch": 2.42056535216472, + "grad_norm": 0.36547044513866017, + "learning_rate": 4.796878108593718e-06, + "loss": 0.4388, + "step": 14739 + }, + { + "epoch": 2.4207295793730625, + "grad_norm": 0.3777551355717173, + "learning_rate": 4.796410156479443e-06, + "loss": 0.4363, + "step": 14740 + }, + { + "epoch": 2.420893806581405, + "grad_norm": 0.35697617648628965, + "learning_rate": 4.79594220114586e-06, + "loss": 0.4224, + "step": 14741 + }, + { + "epoch": 2.4210580337897483, + "grad_norm": 0.3862378017848256, + "learning_rate": 4.795474242598054e-06, + "loss": 0.4375, + "step": 14742 + }, + { + "epoch": 2.421222260998091, + "grad_norm": 0.30417337392006183, + "learning_rate": 4.7950062808411085e-06, + "loss": 0.4333, + "step": 14743 + }, + { + "epoch": 2.4213864882064335, + "grad_norm": 0.42732171860737683, + "learning_rate": 4.794538315880103e-06, + "loss": 0.4333, + "step": 14744 + }, + { + "epoch": 2.421550715414776, + "grad_norm": 0.31445632031102666, + "learning_rate": 4.7940703477201225e-06, + "loss": 0.4304, + "step": 14745 + }, + { + "epoch": 2.4217149426231193, + "grad_norm": 0.6221104220450133, + "learning_rate": 4.793602376366248e-06, + "loss": 0.4356, + "step": 14746 + }, + { + "epoch": 2.421879169831462, + "grad_norm": 0.32177085663706345, + "learning_rate": 4.793134401823564e-06, + "loss": 0.4323, + "step": 14747 + }, + { + "epoch": 2.4220433970398045, + "grad_norm": 0.410211944772348, + "learning_rate": 4.792666424097151e-06, + "loss": 0.4331, + "step": 14748 + }, + { + "epoch": 2.422207624248147, + "grad_norm": 0.3075597976329248, + "learning_rate": 4.7921984431920935e-06, + "loss": 0.43, + "step": 14749 + }, + { + "epoch": 2.4223718514564903, + "grad_norm": 0.35261253135593046, + "learning_rate": 4.791730459113472e-06, + "loss": 0.4492, + "step": 14750 + }, + { + "epoch": 2.422536078664833, + "grad_norm": 0.36899376715600435, + "learning_rate": 4.791262471866372e-06, + "loss": 0.4108, + "step": 14751 + }, + { + "epoch": 2.4227003058731755, + "grad_norm": 0.37434397764969857, + "learning_rate": 4.790794481455874e-06, + "loss": 0.4432, + "step": 14752 + }, + { + "epoch": 2.422864533081518, + "grad_norm": 0.31972500121132164, + "learning_rate": 4.7903264878870635e-06, + "loss": 0.4082, + "step": 14753 + }, + { + "epoch": 2.423028760289861, + "grad_norm": 0.3123339025336729, + "learning_rate": 4.78985849116502e-06, + "loss": 0.4427, + "step": 14754 + }, + { + "epoch": 2.423192987498204, + "grad_norm": 0.2795028420999253, + "learning_rate": 4.789390491294827e-06, + "loss": 0.433, + "step": 14755 + }, + { + "epoch": 2.4233572147065465, + "grad_norm": 0.328024548668862, + "learning_rate": 4.78892248828157e-06, + "loss": 0.4298, + "step": 14756 + }, + { + "epoch": 2.423521441914889, + "grad_norm": 0.31231298140414665, + "learning_rate": 4.788454482130332e-06, + "loss": 0.4279, + "step": 14757 + }, + { + "epoch": 2.423685669123232, + "grad_norm": 0.3172952986153162, + "learning_rate": 4.787986472846193e-06, + "loss": 0.4454, + "step": 14758 + }, + { + "epoch": 2.423849896331575, + "grad_norm": 0.35895962136883086, + "learning_rate": 4.787518460434238e-06, + "loss": 0.4469, + "step": 14759 + }, + { + "epoch": 2.4240141235399175, + "grad_norm": 0.25783531582906527, + "learning_rate": 4.7870504448995495e-06, + "loss": 0.4376, + "step": 14760 + }, + { + "epoch": 2.42417835074826, + "grad_norm": 0.31120025248246125, + "learning_rate": 4.78658242624721e-06, + "loss": 0.4412, + "step": 14761 + }, + { + "epoch": 2.424342577956603, + "grad_norm": 0.28799008577143, + "learning_rate": 4.786114404482305e-06, + "loss": 0.4224, + "step": 14762 + }, + { + "epoch": 2.424506805164946, + "grad_norm": 0.32125630055880444, + "learning_rate": 4.7856463796099156e-06, + "loss": 0.4404, + "step": 14763 + }, + { + "epoch": 2.4246710323732885, + "grad_norm": 0.32524375153824536, + "learning_rate": 4.785178351635124e-06, + "loss": 0.4425, + "step": 14764 + }, + { + "epoch": 2.424835259581631, + "grad_norm": 0.43420083943177323, + "learning_rate": 4.784710320563016e-06, + "loss": 0.4355, + "step": 14765 + }, + { + "epoch": 2.424999486789974, + "grad_norm": 0.34655407456659604, + "learning_rate": 4.784242286398674e-06, + "loss": 0.4326, + "step": 14766 + }, + { + "epoch": 2.425163713998317, + "grad_norm": 0.2747114594008047, + "learning_rate": 4.78377424914718e-06, + "loss": 0.426, + "step": 14767 + }, + { + "epoch": 2.4253279412066595, + "grad_norm": 0.34923290616290603, + "learning_rate": 4.78330620881362e-06, + "loss": 0.4456, + "step": 14768 + }, + { + "epoch": 2.425492168415002, + "grad_norm": 0.26275802173147667, + "learning_rate": 4.782838165403076e-06, + "loss": 0.4401, + "step": 14769 + }, + { + "epoch": 2.425656395623345, + "grad_norm": 0.3533306405127362, + "learning_rate": 4.7823701189206295e-06, + "loss": 0.4389, + "step": 14770 + }, + { + "epoch": 2.4258206228316874, + "grad_norm": 0.3336264849677881, + "learning_rate": 4.781902069371367e-06, + "loss": 0.4475, + "step": 14771 + }, + { + "epoch": 2.4259848500400305, + "grad_norm": 0.496776075438404, + "learning_rate": 4.78143401676037e-06, + "loss": 0.4364, + "step": 14772 + }, + { + "epoch": 2.426149077248373, + "grad_norm": 0.3384395539167162, + "learning_rate": 4.780965961092722e-06, + "loss": 0.4439, + "step": 14773 + }, + { + "epoch": 2.426313304456716, + "grad_norm": 0.33369923606988255, + "learning_rate": 4.78049790237351e-06, + "loss": 0.4317, + "step": 14774 + }, + { + "epoch": 2.4264775316650584, + "grad_norm": 0.29929761882085637, + "learning_rate": 4.780029840607812e-06, + "loss": 0.4413, + "step": 14775 + }, + { + "epoch": 2.4266417588734015, + "grad_norm": 0.30729134782795475, + "learning_rate": 4.7795617758007145e-06, + "loss": 0.429, + "step": 14776 + }, + { + "epoch": 2.426805986081744, + "grad_norm": 0.28668811524646165, + "learning_rate": 4.779093707957303e-06, + "loss": 0.4459, + "step": 14777 + }, + { + "epoch": 2.4269702132900868, + "grad_norm": 0.9480953824797685, + "learning_rate": 4.7786256370826586e-06, + "loss": 0.4403, + "step": 14778 + }, + { + "epoch": 2.4271344404984294, + "grad_norm": 0.2905434139198909, + "learning_rate": 4.7781575631818645e-06, + "loss": 0.44, + "step": 14779 + }, + { + "epoch": 2.4272986677067725, + "grad_norm": 0.36488714133691247, + "learning_rate": 4.777689486260006e-06, + "loss": 0.4242, + "step": 14780 + }, + { + "epoch": 2.427462894915115, + "grad_norm": 0.5288134862615336, + "learning_rate": 4.777221406322168e-06, + "loss": 0.4292, + "step": 14781 + }, + { + "epoch": 2.4276271221234578, + "grad_norm": 0.2905834476410685, + "learning_rate": 4.776753323373432e-06, + "loss": 0.4505, + "step": 14782 + }, + { + "epoch": 2.4277913493318004, + "grad_norm": 0.33826065646395237, + "learning_rate": 4.7762852374188815e-06, + "loss": 0.4299, + "step": 14783 + }, + { + "epoch": 2.4279555765401435, + "grad_norm": 0.40177622398477075, + "learning_rate": 4.7758171484636015e-06, + "loss": 0.4552, + "step": 14784 + }, + { + "epoch": 2.428119803748486, + "grad_norm": 0.27650968021480343, + "learning_rate": 4.775349056512676e-06, + "loss": 0.431, + "step": 14785 + }, + { + "epoch": 2.4282840309568288, + "grad_norm": 0.4170100154683804, + "learning_rate": 4.77488096157119e-06, + "loss": 0.4399, + "step": 14786 + }, + { + "epoch": 2.4284482581651714, + "grad_norm": 0.3064746958240771, + "learning_rate": 4.774412863644227e-06, + "loss": 0.4325, + "step": 14787 + }, + { + "epoch": 2.428612485373514, + "grad_norm": 0.6095324612409638, + "learning_rate": 4.773944762736868e-06, + "loss": 0.4377, + "step": 14788 + }, + { + "epoch": 2.428776712581857, + "grad_norm": 0.2843106785607604, + "learning_rate": 4.7734766588542005e-06, + "loss": 0.4241, + "step": 14789 + }, + { + "epoch": 2.4289409397901998, + "grad_norm": 0.3693598879437371, + "learning_rate": 4.773008552001308e-06, + "loss": 0.4398, + "step": 14790 + }, + { + "epoch": 2.4291051669985424, + "grad_norm": 0.2911478294752035, + "learning_rate": 4.772540442183273e-06, + "loss": 0.431, + "step": 14791 + }, + { + "epoch": 2.429269394206885, + "grad_norm": 0.27983158639217526, + "learning_rate": 4.772072329405182e-06, + "loss": 0.432, + "step": 14792 + }, + { + "epoch": 2.429433621415228, + "grad_norm": 0.35636279705662716, + "learning_rate": 4.771604213672116e-06, + "loss": 0.4412, + "step": 14793 + }, + { + "epoch": 2.4295978486235708, + "grad_norm": 0.3399555636216479, + "learning_rate": 4.771136094989162e-06, + "loss": 0.435, + "step": 14794 + }, + { + "epoch": 2.4297620758319134, + "grad_norm": 0.34618491205707147, + "learning_rate": 4.770667973361403e-06, + "loss": 0.4566, + "step": 14795 + }, + { + "epoch": 2.429926303040256, + "grad_norm": 0.28967649101482, + "learning_rate": 4.770199848793924e-06, + "loss": 0.4202, + "step": 14796 + }, + { + "epoch": 2.430090530248599, + "grad_norm": 0.2779166934004358, + "learning_rate": 4.7697317212918075e-06, + "loss": 0.4289, + "step": 14797 + }, + { + "epoch": 2.4302547574569417, + "grad_norm": 0.2812164799099954, + "learning_rate": 4.76926359086014e-06, + "loss": 0.4519, + "step": 14798 + }, + { + "epoch": 2.4304189846652844, + "grad_norm": 0.32263573432382836, + "learning_rate": 4.768795457504005e-06, + "loss": 0.4667, + "step": 14799 + }, + { + "epoch": 2.430583211873627, + "grad_norm": 0.28947225570522306, + "learning_rate": 4.768327321228487e-06, + "loss": 0.4348, + "step": 14800 + }, + { + "epoch": 2.43074743908197, + "grad_norm": 0.2764048458026499, + "learning_rate": 4.7678591820386705e-06, + "loss": 0.4371, + "step": 14801 + }, + { + "epoch": 2.4309116662903127, + "grad_norm": 0.2854607261745386, + "learning_rate": 4.7673910399396396e-06, + "loss": 0.4262, + "step": 14802 + }, + { + "epoch": 2.4310758934986554, + "grad_norm": 0.26922749726743767, + "learning_rate": 4.766922894936479e-06, + "loss": 0.4454, + "step": 14803 + }, + { + "epoch": 2.431240120706998, + "grad_norm": 0.2646681350023495, + "learning_rate": 4.766454747034273e-06, + "loss": 0.4247, + "step": 14804 + }, + { + "epoch": 2.4314043479153407, + "grad_norm": 0.34550230812725563, + "learning_rate": 4.765986596238106e-06, + "loss": 0.4341, + "step": 14805 + }, + { + "epoch": 2.4315685751236837, + "grad_norm": 0.2869108230808502, + "learning_rate": 4.765518442553063e-06, + "loss": 0.417, + "step": 14806 + }, + { + "epoch": 2.4317328023320264, + "grad_norm": 0.2849337599012945, + "learning_rate": 4.765050285984229e-06, + "loss": 0.4495, + "step": 14807 + }, + { + "epoch": 2.431897029540369, + "grad_norm": 0.4105925836148604, + "learning_rate": 4.7645821265366875e-06, + "loss": 0.425, + "step": 14808 + }, + { + "epoch": 2.4320612567487117, + "grad_norm": 0.3121318395626099, + "learning_rate": 4.764113964215523e-06, + "loss": 0.4407, + "step": 14809 + }, + { + "epoch": 2.4322254839570547, + "grad_norm": 0.28767812785841906, + "learning_rate": 4.763645799025822e-06, + "loss": 0.4481, + "step": 14810 + }, + { + "epoch": 2.4323897111653974, + "grad_norm": 0.3154937609679195, + "learning_rate": 4.763177630972669e-06, + "loss": 0.4548, + "step": 14811 + }, + { + "epoch": 2.43255393837374, + "grad_norm": 0.3992594414402425, + "learning_rate": 4.762709460061147e-06, + "loss": 0.4427, + "step": 14812 + }, + { + "epoch": 2.4327181655820826, + "grad_norm": 0.3513545926571055, + "learning_rate": 4.762241286296342e-06, + "loss": 0.47, + "step": 14813 + }, + { + "epoch": 2.4328823927904257, + "grad_norm": 0.29704178929318303, + "learning_rate": 4.761773109683338e-06, + "loss": 0.4328, + "step": 14814 + }, + { + "epoch": 2.4330466199987684, + "grad_norm": 0.31782862402263434, + "learning_rate": 4.761304930227222e-06, + "loss": 0.4252, + "step": 14815 + }, + { + "epoch": 2.433210847207111, + "grad_norm": 0.28176998645123574, + "learning_rate": 4.760836747933077e-06, + "loss": 0.4255, + "step": 14816 + }, + { + "epoch": 2.4333750744154536, + "grad_norm": 0.3323179661972225, + "learning_rate": 4.760368562805988e-06, + "loss": 0.4447, + "step": 14817 + }, + { + "epoch": 2.4335393016237967, + "grad_norm": 0.3144062587438668, + "learning_rate": 4.759900374851038e-06, + "loss": 0.431, + "step": 14818 + }, + { + "epoch": 2.4337035288321394, + "grad_norm": 0.31873388546233744, + "learning_rate": 4.759432184073317e-06, + "loss": 0.43, + "step": 14819 + }, + { + "epoch": 2.433867756040482, + "grad_norm": 0.35986862702714584, + "learning_rate": 4.758963990477906e-06, + "loss": 0.4175, + "step": 14820 + }, + { + "epoch": 2.4340319832488246, + "grad_norm": 0.4216160669476413, + "learning_rate": 4.758495794069893e-06, + "loss": 0.4186, + "step": 14821 + }, + { + "epoch": 2.4341962104571673, + "grad_norm": 0.2656357025434144, + "learning_rate": 4.758027594854359e-06, + "loss": 0.4245, + "step": 14822 + }, + { + "epoch": 2.4343604376655104, + "grad_norm": 0.27275238122045936, + "learning_rate": 4.757559392836393e-06, + "loss": 0.4171, + "step": 14823 + }, + { + "epoch": 2.434524664873853, + "grad_norm": 0.4642214461339467, + "learning_rate": 4.757091188021078e-06, + "loss": 0.4447, + "step": 14824 + }, + { + "epoch": 2.4346888920821956, + "grad_norm": 0.32736316829012824, + "learning_rate": 4.7566229804135e-06, + "loss": 0.4379, + "step": 14825 + }, + { + "epoch": 2.4348531192905383, + "grad_norm": 0.3461060149654547, + "learning_rate": 4.7561547700187435e-06, + "loss": 0.4493, + "step": 14826 + }, + { + "epoch": 2.4350173464988814, + "grad_norm": 0.3414152378529399, + "learning_rate": 4.755686556841894e-06, + "loss": 0.4467, + "step": 14827 + }, + { + "epoch": 2.435181573707224, + "grad_norm": 0.3408110403504788, + "learning_rate": 4.755218340888038e-06, + "loss": 0.436, + "step": 14828 + }, + { + "epoch": 2.4353458009155666, + "grad_norm": 0.2892055813816026, + "learning_rate": 4.754750122162258e-06, + "loss": 0.4512, + "step": 14829 + }, + { + "epoch": 2.4355100281239093, + "grad_norm": 0.6586018155691247, + "learning_rate": 4.754281900669644e-06, + "loss": 0.4342, + "step": 14830 + }, + { + "epoch": 2.4356742553322523, + "grad_norm": 0.3309761857325862, + "learning_rate": 4.753813676415275e-06, + "loss": 0.435, + "step": 14831 + }, + { + "epoch": 2.435838482540595, + "grad_norm": 0.47816462488938366, + "learning_rate": 4.753345449404242e-06, + "loss": 0.4482, + "step": 14832 + }, + { + "epoch": 2.4360027097489376, + "grad_norm": 0.4051812130197788, + "learning_rate": 4.752877219641628e-06, + "loss": 0.4585, + "step": 14833 + }, + { + "epoch": 2.4361669369572803, + "grad_norm": 0.3283393580462784, + "learning_rate": 4.752408987132517e-06, + "loss": 0.4471, + "step": 14834 + }, + { + "epoch": 2.4363311641656233, + "grad_norm": 0.29286177768403737, + "learning_rate": 4.751940751881998e-06, + "loss": 0.4231, + "step": 14835 + }, + { + "epoch": 2.436495391373966, + "grad_norm": 0.2906969999573708, + "learning_rate": 4.751472513895154e-06, + "loss": 0.4326, + "step": 14836 + }, + { + "epoch": 2.4366596185823086, + "grad_norm": 0.39150745671413895, + "learning_rate": 4.75100427317707e-06, + "loss": 0.4434, + "step": 14837 + }, + { + "epoch": 2.4368238457906513, + "grad_norm": 0.27745309394386025, + "learning_rate": 4.750536029732834e-06, + "loss": 0.4194, + "step": 14838 + }, + { + "epoch": 2.436988072998994, + "grad_norm": 0.7360407236143521, + "learning_rate": 4.750067783567528e-06, + "loss": 0.4238, + "step": 14839 + }, + { + "epoch": 2.437152300207337, + "grad_norm": 0.4518500230554605, + "learning_rate": 4.749599534686242e-06, + "loss": 0.4224, + "step": 14840 + }, + { + "epoch": 2.4373165274156796, + "grad_norm": 0.34881458565898915, + "learning_rate": 4.74913128309406e-06, + "loss": 0.4303, + "step": 14841 + }, + { + "epoch": 2.4374807546240222, + "grad_norm": 0.2615419271331008, + "learning_rate": 4.748663028796065e-06, + "loss": 0.4413, + "step": 14842 + }, + { + "epoch": 2.437644981832365, + "grad_norm": 0.31938658995749575, + "learning_rate": 4.748194771797346e-06, + "loss": 0.4383, + "step": 14843 + }, + { + "epoch": 2.437809209040708, + "grad_norm": 0.34806719512201467, + "learning_rate": 4.747726512102988e-06, + "loss": 0.4449, + "step": 14844 + }, + { + "epoch": 2.4379734362490506, + "grad_norm": 0.2952998266610244, + "learning_rate": 4.747258249718077e-06, + "loss": 0.4133, + "step": 14845 + }, + { + "epoch": 2.4381376634573932, + "grad_norm": 0.4080042462948629, + "learning_rate": 4.746789984647696e-06, + "loss": 0.4174, + "step": 14846 + }, + { + "epoch": 2.438301890665736, + "grad_norm": 0.2617233550855106, + "learning_rate": 4.7463217168969335e-06, + "loss": 0.4516, + "step": 14847 + }, + { + "epoch": 2.438466117874079, + "grad_norm": 0.3167837063772311, + "learning_rate": 4.7458534464708746e-06, + "loss": 0.4207, + "step": 14848 + }, + { + "epoch": 2.4386303450824216, + "grad_norm": 0.34189588475116073, + "learning_rate": 4.745385173374608e-06, + "loss": 0.4362, + "step": 14849 + }, + { + "epoch": 2.4387945722907642, + "grad_norm": 0.37358037512844017, + "learning_rate": 4.7449168976132145e-06, + "loss": 0.4224, + "step": 14850 + }, + { + "epoch": 2.438958799499107, + "grad_norm": 0.3431725832259083, + "learning_rate": 4.744448619191783e-06, + "loss": 0.4262, + "step": 14851 + }, + { + "epoch": 2.43912302670745, + "grad_norm": 0.314791986594665, + "learning_rate": 4.743980338115398e-06, + "loss": 0.4557, + "step": 14852 + }, + { + "epoch": 2.4392872539157926, + "grad_norm": 0.5013463893524989, + "learning_rate": 4.743512054389148e-06, + "loss": 0.4394, + "step": 14853 + }, + { + "epoch": 2.4394514811241352, + "grad_norm": 0.2725217761544163, + "learning_rate": 4.743043768018117e-06, + "loss": 0.4114, + "step": 14854 + }, + { + "epoch": 2.439615708332478, + "grad_norm": 0.37105193624079863, + "learning_rate": 4.742575479007393e-06, + "loss": 0.4172, + "step": 14855 + }, + { + "epoch": 2.4397799355408205, + "grad_norm": 0.4803472831557481, + "learning_rate": 4.742107187362058e-06, + "loss": 0.4385, + "step": 14856 + }, + { + "epoch": 2.4399441627491636, + "grad_norm": 0.2943878838182107, + "learning_rate": 4.741638893087203e-06, + "loss": 0.4354, + "step": 14857 + }, + { + "epoch": 2.4401083899575062, + "grad_norm": 0.2980819471686045, + "learning_rate": 4.74117059618791e-06, + "loss": 0.4249, + "step": 14858 + }, + { + "epoch": 2.440272617165849, + "grad_norm": 0.7392078808610509, + "learning_rate": 4.740702296669269e-06, + "loss": 0.4452, + "step": 14859 + }, + { + "epoch": 2.4404368443741915, + "grad_norm": 0.335397733043894, + "learning_rate": 4.740233994536363e-06, + "loss": 0.4384, + "step": 14860 + }, + { + "epoch": 2.4406010715825346, + "grad_norm": 0.3189866960758374, + "learning_rate": 4.73976568979428e-06, + "loss": 0.4306, + "step": 14861 + }, + { + "epoch": 2.4407652987908772, + "grad_norm": 0.2752736569035358, + "learning_rate": 4.739297382448105e-06, + "loss": 0.4227, + "step": 14862 + }, + { + "epoch": 2.44092952599922, + "grad_norm": 0.37664773038821614, + "learning_rate": 4.738829072502925e-06, + "loss": 0.4311, + "step": 14863 + }, + { + "epoch": 2.4410937532075625, + "grad_norm": 0.3309783756164121, + "learning_rate": 4.738360759963827e-06, + "loss": 0.4323, + "step": 14864 + }, + { + "epoch": 2.4412579804159056, + "grad_norm": 0.3763551459442455, + "learning_rate": 4.737892444835896e-06, + "loss": 0.4403, + "step": 14865 + }, + { + "epoch": 2.441422207624248, + "grad_norm": 0.35097168113421806, + "learning_rate": 4.737424127124219e-06, + "loss": 0.4442, + "step": 14866 + }, + { + "epoch": 2.441586434832591, + "grad_norm": 0.3712543226418984, + "learning_rate": 4.7369558068338825e-06, + "loss": 0.4372, + "step": 14867 + }, + { + "epoch": 2.4417506620409335, + "grad_norm": 0.32506729393422107, + "learning_rate": 4.736487483969972e-06, + "loss": 0.4395, + "step": 14868 + }, + { + "epoch": 2.4419148892492766, + "grad_norm": 0.3064457747338616, + "learning_rate": 4.7360191585375765e-06, + "loss": 0.4452, + "step": 14869 + }, + { + "epoch": 2.442079116457619, + "grad_norm": 0.44702514644854086, + "learning_rate": 4.735550830541781e-06, + "loss": 0.4159, + "step": 14870 + }, + { + "epoch": 2.442243343665962, + "grad_norm": 0.3616030848631678, + "learning_rate": 4.73508249998767e-06, + "loss": 0.4256, + "step": 14871 + }, + { + "epoch": 2.4424075708743045, + "grad_norm": 0.28232285830936266, + "learning_rate": 4.734614166880332e-06, + "loss": 0.4406, + "step": 14872 + }, + { + "epoch": 2.442571798082647, + "grad_norm": 0.2805074286554534, + "learning_rate": 4.734145831224853e-06, + "loss": 0.4344, + "step": 14873 + }, + { + "epoch": 2.44273602529099, + "grad_norm": 0.2956435605595372, + "learning_rate": 4.733677493026323e-06, + "loss": 0.429, + "step": 14874 + }, + { + "epoch": 2.442900252499333, + "grad_norm": 0.2989704174696099, + "learning_rate": 4.7332091522898225e-06, + "loss": 0.4158, + "step": 14875 + }, + { + "epoch": 2.4430644797076755, + "grad_norm": 0.26915929922268195, + "learning_rate": 4.73274080902044e-06, + "loss": 0.4651, + "step": 14876 + }, + { + "epoch": 2.443228706916018, + "grad_norm": 0.3133310818964482, + "learning_rate": 4.732272463223265e-06, + "loss": 0.4208, + "step": 14877 + }, + { + "epoch": 2.443392934124361, + "grad_norm": 0.2808808633684279, + "learning_rate": 4.731804114903384e-06, + "loss": 0.4278, + "step": 14878 + }, + { + "epoch": 2.443557161332704, + "grad_norm": 0.9563998333839273, + "learning_rate": 4.731335764065881e-06, + "loss": 0.4451, + "step": 14879 + }, + { + "epoch": 2.4437213885410465, + "grad_norm": 0.30893292698775016, + "learning_rate": 4.730867410715844e-06, + "loss": 0.4448, + "step": 14880 + }, + { + "epoch": 2.443885615749389, + "grad_norm": 0.2708141085292231, + "learning_rate": 4.7303990548583584e-06, + "loss": 0.4341, + "step": 14881 + }, + { + "epoch": 2.444049842957732, + "grad_norm": 0.43372099717902635, + "learning_rate": 4.729930696498515e-06, + "loss": 0.441, + "step": 14882 + }, + { + "epoch": 2.444214070166075, + "grad_norm": 0.2895325076784331, + "learning_rate": 4.729462335641396e-06, + "loss": 0.426, + "step": 14883 + }, + { + "epoch": 2.4443782973744175, + "grad_norm": 0.33145935474776, + "learning_rate": 4.728993972292091e-06, + "loss": 0.437, + "step": 14884 + }, + { + "epoch": 2.44454252458276, + "grad_norm": 0.3935456848771356, + "learning_rate": 4.728525606455686e-06, + "loss": 0.4555, + "step": 14885 + }, + { + "epoch": 2.444706751791103, + "grad_norm": 0.2952308962107992, + "learning_rate": 4.728057238137269e-06, + "loss": 0.438, + "step": 14886 + }, + { + "epoch": 2.444870978999446, + "grad_norm": 0.3062234555252227, + "learning_rate": 4.727588867341925e-06, + "loss": 0.4143, + "step": 14887 + }, + { + "epoch": 2.4450352062077885, + "grad_norm": 0.32180242990583197, + "learning_rate": 4.727120494074741e-06, + "loss": 0.4416, + "step": 14888 + }, + { + "epoch": 2.445199433416131, + "grad_norm": 0.32843161134064053, + "learning_rate": 4.726652118340808e-06, + "loss": 0.418, + "step": 14889 + }, + { + "epoch": 2.4453636606244737, + "grad_norm": 0.3390374896278502, + "learning_rate": 4.726183740145208e-06, + "loss": 0.4518, + "step": 14890 + }, + { + "epoch": 2.445527887832817, + "grad_norm": 0.28347479023720434, + "learning_rate": 4.725715359493031e-06, + "loss": 0.4379, + "step": 14891 + }, + { + "epoch": 2.4456921150411595, + "grad_norm": 0.30168778550602693, + "learning_rate": 4.725246976389361e-06, + "loss": 0.4447, + "step": 14892 + }, + { + "epoch": 2.445856342249502, + "grad_norm": 0.3182228860270116, + "learning_rate": 4.724778590839289e-06, + "loss": 0.4436, + "step": 14893 + }, + { + "epoch": 2.4460205694578447, + "grad_norm": 0.26929248249368787, + "learning_rate": 4.7243102028479e-06, + "loss": 0.4283, + "step": 14894 + }, + { + "epoch": 2.446184796666188, + "grad_norm": 0.3502740325919489, + "learning_rate": 4.723841812420282e-06, + "loss": 0.441, + "step": 14895 + }, + { + "epoch": 2.4463490238745305, + "grad_norm": 0.4357248159824678, + "learning_rate": 4.723373419561521e-06, + "loss": 0.449, + "step": 14896 + }, + { + "epoch": 2.446513251082873, + "grad_norm": 0.2956091582918993, + "learning_rate": 4.722905024276704e-06, + "loss": 0.4464, + "step": 14897 + }, + { + "epoch": 2.4466774782912157, + "grad_norm": 0.9827935727181517, + "learning_rate": 4.72243662657092e-06, + "loss": 0.4575, + "step": 14898 + }, + { + "epoch": 2.446841705499559, + "grad_norm": 0.8911376698958159, + "learning_rate": 4.7219682264492565e-06, + "loss": 0.4177, + "step": 14899 + }, + { + "epoch": 2.4470059327079015, + "grad_norm": 0.4204288106361778, + "learning_rate": 4.721499823916798e-06, + "loss": 0.4407, + "step": 14900 + }, + { + "epoch": 2.447170159916244, + "grad_norm": 0.4214831462365801, + "learning_rate": 4.7210314189786335e-06, + "loss": 0.4385, + "step": 14901 + }, + { + "epoch": 2.4473343871245867, + "grad_norm": 0.2856166432697078, + "learning_rate": 4.72056301163985e-06, + "loss": 0.4083, + "step": 14902 + }, + { + "epoch": 2.44749861433293, + "grad_norm": 0.39927665904922716, + "learning_rate": 4.720094601905538e-06, + "loss": 0.4393, + "step": 14903 + }, + { + "epoch": 2.4476628415412724, + "grad_norm": 0.27247089834361105, + "learning_rate": 4.719626189780781e-06, + "loss": 0.4343, + "step": 14904 + }, + { + "epoch": 2.447827068749615, + "grad_norm": 0.3379655021876495, + "learning_rate": 4.719157775270666e-06, + "loss": 0.4368, + "step": 14905 + }, + { + "epoch": 2.4479912959579577, + "grad_norm": 0.5812707202881656, + "learning_rate": 4.718689358380282e-06, + "loss": 0.427, + "step": 14906 + }, + { + "epoch": 2.4481555231663004, + "grad_norm": 0.288175714574787, + "learning_rate": 4.718220939114718e-06, + "loss": 0.4393, + "step": 14907 + }, + { + "epoch": 2.4483197503746434, + "grad_norm": 0.31558613912668265, + "learning_rate": 4.717752517479059e-06, + "loss": 0.4263, + "step": 14908 + }, + { + "epoch": 2.448483977582986, + "grad_norm": 0.3122999644429297, + "learning_rate": 4.7172840934783935e-06, + "loss": 0.4497, + "step": 14909 + }, + { + "epoch": 2.4486482047913287, + "grad_norm": 0.3856354602008668, + "learning_rate": 4.716815667117808e-06, + "loss": 0.4209, + "step": 14910 + }, + { + "epoch": 2.4488124319996714, + "grad_norm": 0.3183549391768059, + "learning_rate": 4.716347238402393e-06, + "loss": 0.4361, + "step": 14911 + }, + { + "epoch": 2.4489766592080144, + "grad_norm": 0.27665517262562817, + "learning_rate": 4.715878807337233e-06, + "loss": 0.407, + "step": 14912 + }, + { + "epoch": 2.449140886416357, + "grad_norm": 0.5377088495858262, + "learning_rate": 4.715410373927417e-06, + "loss": 0.4374, + "step": 14913 + }, + { + "epoch": 2.4493051136246997, + "grad_norm": 0.30183317730928727, + "learning_rate": 4.714941938178032e-06, + "loss": 0.4332, + "step": 14914 + }, + { + "epoch": 2.4494693408330424, + "grad_norm": 0.3264467535903471, + "learning_rate": 4.714473500094166e-06, + "loss": 0.4374, + "step": 14915 + }, + { + "epoch": 2.4496335680413854, + "grad_norm": 0.33462011748221, + "learning_rate": 4.714005059680908e-06, + "loss": 0.4584, + "step": 14916 + }, + { + "epoch": 2.449797795249728, + "grad_norm": 0.3117046569026179, + "learning_rate": 4.713536616943342e-06, + "loss": 0.446, + "step": 14917 + }, + { + "epoch": 2.4499620224580707, + "grad_norm": 0.3103767396060371, + "learning_rate": 4.71306817188656e-06, + "loss": 0.4251, + "step": 14918 + }, + { + "epoch": 2.4501262496664133, + "grad_norm": 0.37468063922930944, + "learning_rate": 4.712599724515649e-06, + "loss": 0.4316, + "step": 14919 + }, + { + "epoch": 2.4502904768747564, + "grad_norm": 0.2992503309515945, + "learning_rate": 4.712131274835694e-06, + "loss": 0.4332, + "step": 14920 + }, + { + "epoch": 2.450454704083099, + "grad_norm": 0.2914771504239795, + "learning_rate": 4.711662822851785e-06, + "loss": 0.4394, + "step": 14921 + }, + { + "epoch": 2.4506189312914417, + "grad_norm": 0.4249734365242678, + "learning_rate": 4.711194368569009e-06, + "loss": 0.448, + "step": 14922 + }, + { + "epoch": 2.4507831584997843, + "grad_norm": 0.2915814818062091, + "learning_rate": 4.710725911992456e-06, + "loss": 0.4271, + "step": 14923 + }, + { + "epoch": 2.450947385708127, + "grad_norm": 0.5198274597992836, + "learning_rate": 4.710257453127212e-06, + "loss": 0.4353, + "step": 14924 + }, + { + "epoch": 2.45111161291647, + "grad_norm": 0.38566805754977934, + "learning_rate": 4.709788991978364e-06, + "loss": 0.4318, + "step": 14925 + }, + { + "epoch": 2.4512758401248127, + "grad_norm": 0.33962611959497585, + "learning_rate": 4.7093205285510024e-06, + "loss": 0.4317, + "step": 14926 + }, + { + "epoch": 2.4514400673331553, + "grad_norm": 0.3431469761470781, + "learning_rate": 4.7088520628502134e-06, + "loss": 0.4209, + "step": 14927 + }, + { + "epoch": 2.451604294541498, + "grad_norm": 0.2557889173861677, + "learning_rate": 4.708383594881086e-06, + "loss": 0.444, + "step": 14928 + }, + { + "epoch": 2.451768521749841, + "grad_norm": 0.3034543392005885, + "learning_rate": 4.707915124648707e-06, + "loss": 0.4489, + "step": 14929 + }, + { + "epoch": 2.4519327489581837, + "grad_norm": 0.3355927507316989, + "learning_rate": 4.707446652158164e-06, + "loss": 0.4363, + "step": 14930 + }, + { + "epoch": 2.4520969761665263, + "grad_norm": 0.43993410862279914, + "learning_rate": 4.706978177414548e-06, + "loss": 0.4439, + "step": 14931 + }, + { + "epoch": 2.452261203374869, + "grad_norm": 0.309306723684051, + "learning_rate": 4.7065097004229445e-06, + "loss": 0.459, + "step": 14932 + }, + { + "epoch": 2.452425430583212, + "grad_norm": 0.36210017974478553, + "learning_rate": 4.706041221188444e-06, + "loss": 0.458, + "step": 14933 + }, + { + "epoch": 2.4525896577915547, + "grad_norm": 0.44914379194192844, + "learning_rate": 4.705572739716132e-06, + "loss": 0.4297, + "step": 14934 + }, + { + "epoch": 2.4527538849998973, + "grad_norm": 0.4199867608706427, + "learning_rate": 4.705104256011097e-06, + "loss": 0.4251, + "step": 14935 + }, + { + "epoch": 2.45291811220824, + "grad_norm": 0.2960066711495447, + "learning_rate": 4.70463577007843e-06, + "loss": 0.4098, + "step": 14936 + }, + { + "epoch": 2.453082339416583, + "grad_norm": 0.29052926852285893, + "learning_rate": 4.704167281923215e-06, + "loss": 0.4391, + "step": 14937 + }, + { + "epoch": 2.4532465666249257, + "grad_norm": 0.3345783118974667, + "learning_rate": 4.703698791550544e-06, + "loss": 0.4445, + "step": 14938 + }, + { + "epoch": 2.4534107938332683, + "grad_norm": 0.31411562694600764, + "learning_rate": 4.703230298965503e-06, + "loss": 0.4374, + "step": 14939 + }, + { + "epoch": 2.453575021041611, + "grad_norm": 0.39657920815497005, + "learning_rate": 4.702761804173181e-06, + "loss": 0.4297, + "step": 14940 + }, + { + "epoch": 2.4537392482499536, + "grad_norm": 0.26347298494219035, + "learning_rate": 4.7022933071786674e-06, + "loss": 0.4503, + "step": 14941 + }, + { + "epoch": 2.4539034754582967, + "grad_norm": 0.4001769996258484, + "learning_rate": 4.701824807987049e-06, + "loss": 0.4404, + "step": 14942 + }, + { + "epoch": 2.4540677026666393, + "grad_norm": 0.3490323283984835, + "learning_rate": 4.701356306603414e-06, + "loss": 0.4108, + "step": 14943 + }, + { + "epoch": 2.454231929874982, + "grad_norm": 0.32937590189847393, + "learning_rate": 4.700887803032851e-06, + "loss": 0.4331, + "step": 14944 + }, + { + "epoch": 2.4543961570833246, + "grad_norm": 0.3413660682534813, + "learning_rate": 4.70041929728045e-06, + "loss": 0.4379, + "step": 14945 + }, + { + "epoch": 2.4545603842916677, + "grad_norm": 0.5002206756837491, + "learning_rate": 4.699950789351297e-06, + "loss": 0.4483, + "step": 14946 + }, + { + "epoch": 2.4547246115000103, + "grad_norm": 0.3417412630888464, + "learning_rate": 4.699482279250482e-06, + "loss": 0.4233, + "step": 14947 + }, + { + "epoch": 2.454888838708353, + "grad_norm": 0.2913854518469454, + "learning_rate": 4.699013766983093e-06, + "loss": 0.4401, + "step": 14948 + }, + { + "epoch": 2.4550530659166956, + "grad_norm": 0.28411465528321456, + "learning_rate": 4.698545252554221e-06, + "loss": 0.4299, + "step": 14949 + }, + { + "epoch": 2.4552172931250387, + "grad_norm": 0.33716547921552187, + "learning_rate": 4.69807673596895e-06, + "loss": 0.4505, + "step": 14950 + }, + { + "epoch": 2.4553815203333813, + "grad_norm": 0.2993655726406468, + "learning_rate": 4.69760821723237e-06, + "loss": 0.4476, + "step": 14951 + }, + { + "epoch": 2.455545747541724, + "grad_norm": 0.30247931387572397, + "learning_rate": 4.6971396963495725e-06, + "loss": 0.4362, + "step": 14952 + }, + { + "epoch": 2.4557099747500666, + "grad_norm": 0.2859025796264969, + "learning_rate": 4.696671173325643e-06, + "loss": 0.4473, + "step": 14953 + }, + { + "epoch": 2.4558742019584097, + "grad_norm": 0.31087008805358723, + "learning_rate": 4.6962026481656715e-06, + "loss": 0.4323, + "step": 14954 + }, + { + "epoch": 2.4560384291667523, + "grad_norm": 0.31108806233509195, + "learning_rate": 4.695734120874745e-06, + "loss": 0.4363, + "step": 14955 + }, + { + "epoch": 2.456202656375095, + "grad_norm": 0.3713261946065773, + "learning_rate": 4.6952655914579526e-06, + "loss": 0.4582, + "step": 14956 + }, + { + "epoch": 2.4563668835834376, + "grad_norm": 0.3066338892215127, + "learning_rate": 4.694797059920388e-06, + "loss": 0.4296, + "step": 14957 + }, + { + "epoch": 2.45653111079178, + "grad_norm": 0.2810188832318168, + "learning_rate": 4.694328526267133e-06, + "loss": 0.4197, + "step": 14958 + }, + { + "epoch": 2.4566953380001233, + "grad_norm": 0.29244327846057233, + "learning_rate": 4.693859990503277e-06, + "loss": 0.4436, + "step": 14959 + }, + { + "epoch": 2.456859565208466, + "grad_norm": 0.3156738030477161, + "learning_rate": 4.693391452633913e-06, + "loss": 0.4539, + "step": 14960 + }, + { + "epoch": 2.4570237924168086, + "grad_norm": 0.3456900198117342, + "learning_rate": 4.692922912664128e-06, + "loss": 0.4107, + "step": 14961 + }, + { + "epoch": 2.457188019625151, + "grad_norm": 0.3750918202689833, + "learning_rate": 4.69245437059901e-06, + "loss": 0.4304, + "step": 14962 + }, + { + "epoch": 2.4573522468334943, + "grad_norm": 0.42389798412654855, + "learning_rate": 4.691985826443647e-06, + "loss": 0.4171, + "step": 14963 + }, + { + "epoch": 2.457516474041837, + "grad_norm": 0.28392710882294936, + "learning_rate": 4.69151728020313e-06, + "loss": 0.4347, + "step": 14964 + }, + { + "epoch": 2.4576807012501796, + "grad_norm": 0.28639762426572174, + "learning_rate": 4.691048731882546e-06, + "loss": 0.4443, + "step": 14965 + }, + { + "epoch": 2.457844928458522, + "grad_norm": 0.25684977537668713, + "learning_rate": 4.690580181486986e-06, + "loss": 0.4359, + "step": 14966 + }, + { + "epoch": 2.4580091556668653, + "grad_norm": 0.36644612008467237, + "learning_rate": 4.690111629021538e-06, + "loss": 0.4653, + "step": 14967 + }, + { + "epoch": 2.458173382875208, + "grad_norm": 0.3646384093642947, + "learning_rate": 4.68964307449129e-06, + "loss": 0.4397, + "step": 14968 + }, + { + "epoch": 2.4583376100835506, + "grad_norm": 0.3382367193847496, + "learning_rate": 4.689174517901331e-06, + "loss": 0.4406, + "step": 14969 + }, + { + "epoch": 2.458501837291893, + "grad_norm": 0.3140018837661301, + "learning_rate": 4.688705959256752e-06, + "loss": 0.4469, + "step": 14970 + }, + { + "epoch": 2.4586660645002363, + "grad_norm": 0.35977465400020864, + "learning_rate": 4.688237398562639e-06, + "loss": 0.4258, + "step": 14971 + }, + { + "epoch": 2.458830291708579, + "grad_norm": 0.3943690831857817, + "learning_rate": 4.687768835824083e-06, + "loss": 0.4461, + "step": 14972 + }, + { + "epoch": 2.4589945189169216, + "grad_norm": 0.5226466463828919, + "learning_rate": 4.687300271046173e-06, + "loss": 0.4528, + "step": 14973 + }, + { + "epoch": 2.459158746125264, + "grad_norm": 0.2765055483214686, + "learning_rate": 4.686831704233999e-06, + "loss": 0.4521, + "step": 14974 + }, + { + "epoch": 2.459322973333607, + "grad_norm": 0.29749033942911196, + "learning_rate": 4.6863631353926476e-06, + "loss": 0.4468, + "step": 14975 + }, + { + "epoch": 2.45948720054195, + "grad_norm": 0.2902094231503712, + "learning_rate": 4.68589456452721e-06, + "loss": 0.4278, + "step": 14976 + }, + { + "epoch": 2.4596514277502926, + "grad_norm": 0.3078796587727116, + "learning_rate": 4.6854259916427735e-06, + "loss": 0.4159, + "step": 14977 + }, + { + "epoch": 2.459815654958635, + "grad_norm": 0.2985730494760598, + "learning_rate": 4.684957416744429e-06, + "loss": 0.4503, + "step": 14978 + }, + { + "epoch": 2.459979882166978, + "grad_norm": 0.3189836021672542, + "learning_rate": 4.684488839837265e-06, + "loss": 0.442, + "step": 14979 + }, + { + "epoch": 2.460144109375321, + "grad_norm": 0.4842946241587704, + "learning_rate": 4.684020260926369e-06, + "loss": 0.4377, + "step": 14980 + }, + { + "epoch": 2.4603083365836635, + "grad_norm": 0.373334795378634, + "learning_rate": 4.683551680016834e-06, + "loss": 0.4456, + "step": 14981 + }, + { + "epoch": 2.460472563792006, + "grad_norm": 0.44875805856112194, + "learning_rate": 4.683083097113748e-06, + "loss": 0.445, + "step": 14982 + }, + { + "epoch": 2.460636791000349, + "grad_norm": 0.37397571261106627, + "learning_rate": 4.682614512222197e-06, + "loss": 0.4335, + "step": 14983 + }, + { + "epoch": 2.460801018208692, + "grad_norm": 0.3465551968819545, + "learning_rate": 4.682145925347273e-06, + "loss": 0.4215, + "step": 14984 + }, + { + "epoch": 2.4609652454170345, + "grad_norm": 0.29430252055096734, + "learning_rate": 4.681677336494065e-06, + "loss": 0.4457, + "step": 14985 + }, + { + "epoch": 2.461129472625377, + "grad_norm": 0.32286831624943985, + "learning_rate": 4.6812087456676644e-06, + "loss": 0.4154, + "step": 14986 + }, + { + "epoch": 2.46129369983372, + "grad_norm": 0.3793808917437682, + "learning_rate": 4.680740152873157e-06, + "loss": 0.4303, + "step": 14987 + }, + { + "epoch": 2.461457927042063, + "grad_norm": 0.34371076774506265, + "learning_rate": 4.680271558115635e-06, + "loss": 0.4444, + "step": 14988 + }, + { + "epoch": 2.4616221542504055, + "grad_norm": 0.2800293307566269, + "learning_rate": 4.679802961400184e-06, + "loss": 0.4263, + "step": 14989 + }, + { + "epoch": 2.461786381458748, + "grad_norm": 0.4105776277011872, + "learning_rate": 4.679334362731898e-06, + "loss": 0.442, + "step": 14990 + }, + { + "epoch": 2.461950608667091, + "grad_norm": 0.29117182265697783, + "learning_rate": 4.678865762115864e-06, + "loss": 0.4155, + "step": 14991 + }, + { + "epoch": 2.4621148358754334, + "grad_norm": 0.30870836843997373, + "learning_rate": 4.678397159557172e-06, + "loss": 0.4156, + "step": 14992 + }, + { + "epoch": 2.4622790630837765, + "grad_norm": 0.2643732894140815, + "learning_rate": 4.67792855506091e-06, + "loss": 0.4354, + "step": 14993 + }, + { + "epoch": 2.462443290292119, + "grad_norm": 0.2758373427519078, + "learning_rate": 4.67745994863217e-06, + "loss": 0.4397, + "step": 14994 + }, + { + "epoch": 2.462607517500462, + "grad_norm": 0.2879361585632679, + "learning_rate": 4.676991340276039e-06, + "loss": 0.4245, + "step": 14995 + }, + { + "epoch": 2.4627717447088044, + "grad_norm": 0.3407813698263583, + "learning_rate": 4.676522729997611e-06, + "loss": 0.424, + "step": 14996 + }, + { + "epoch": 2.4629359719171475, + "grad_norm": 0.3317369355754718, + "learning_rate": 4.676054117801969e-06, + "loss": 0.445, + "step": 14997 + }, + { + "epoch": 2.46310019912549, + "grad_norm": 0.6803115759149154, + "learning_rate": 4.675585503694208e-06, + "loss": 0.4384, + "step": 14998 + }, + { + "epoch": 2.463264426333833, + "grad_norm": 0.3831499501518102, + "learning_rate": 4.6751168876794164e-06, + "loss": 0.447, + "step": 14999 + }, + { + "epoch": 2.4634286535421754, + "grad_norm": 0.8359767289627413, + "learning_rate": 4.674648269762681e-06, + "loss": 0.4318, + "step": 15000 + }, + { + "epoch": 2.4635928807505185, + "grad_norm": 0.2966619920680067, + "learning_rate": 4.674179649949095e-06, + "loss": 0.4424, + "step": 15001 + }, + { + "epoch": 2.463757107958861, + "grad_norm": 0.2732458183573598, + "learning_rate": 4.673711028243746e-06, + "loss": 0.4509, + "step": 15002 + }, + { + "epoch": 2.463921335167204, + "grad_norm": 0.29770838372139535, + "learning_rate": 4.673242404651725e-06, + "loss": 0.4195, + "step": 15003 + }, + { + "epoch": 2.4640855623755464, + "grad_norm": 0.2502454178475141, + "learning_rate": 4.67277377917812e-06, + "loss": 0.4325, + "step": 15004 + }, + { + "epoch": 2.4642497895838895, + "grad_norm": 0.27202883419282115, + "learning_rate": 4.672305151828022e-06, + "loss": 0.4389, + "step": 15005 + }, + { + "epoch": 2.464414016792232, + "grad_norm": 0.316940999995647, + "learning_rate": 4.671836522606521e-06, + "loss": 0.4244, + "step": 15006 + }, + { + "epoch": 2.464578244000575, + "grad_norm": 0.3077409686336453, + "learning_rate": 4.671367891518707e-06, + "loss": 0.4431, + "step": 15007 + }, + { + "epoch": 2.4647424712089174, + "grad_norm": 0.2991331964822248, + "learning_rate": 4.670899258569668e-06, + "loss": 0.4186, + "step": 15008 + }, + { + "epoch": 2.46490669841726, + "grad_norm": 0.30160160597116736, + "learning_rate": 4.670430623764495e-06, + "loss": 0.4209, + "step": 15009 + }, + { + "epoch": 2.465070925625603, + "grad_norm": 1.2057903308954052, + "learning_rate": 4.6699619871082775e-06, + "loss": 0.4507, + "step": 15010 + }, + { + "epoch": 2.465235152833946, + "grad_norm": 0.36722155486617347, + "learning_rate": 4.669493348606105e-06, + "loss": 0.4246, + "step": 15011 + }, + { + "epoch": 2.4653993800422884, + "grad_norm": 0.2732815069756437, + "learning_rate": 4.669024708263071e-06, + "loss": 0.4399, + "step": 15012 + }, + { + "epoch": 2.465563607250631, + "grad_norm": 0.3759797860911508, + "learning_rate": 4.668556066084259e-06, + "loss": 0.4429, + "step": 15013 + }, + { + "epoch": 2.465727834458974, + "grad_norm": 0.29556366049368965, + "learning_rate": 4.668087422074763e-06, + "loss": 0.4399, + "step": 15014 + }, + { + "epoch": 2.465892061667317, + "grad_norm": 0.33310926302971894, + "learning_rate": 4.667618776239674e-06, + "loss": 0.4282, + "step": 15015 + }, + { + "epoch": 2.4660562888756594, + "grad_norm": 0.28561588310789915, + "learning_rate": 4.6671501285840795e-06, + "loss": 0.4207, + "step": 15016 + }, + { + "epoch": 2.466220516084002, + "grad_norm": 0.31983473101658594, + "learning_rate": 4.666681479113069e-06, + "loss": 0.4324, + "step": 15017 + }, + { + "epoch": 2.466384743292345, + "grad_norm": 0.3606193430776502, + "learning_rate": 4.666212827831733e-06, + "loss": 0.4311, + "step": 15018 + }, + { + "epoch": 2.4665489705006878, + "grad_norm": 0.3478030075169639, + "learning_rate": 4.665744174745164e-06, + "loss": 0.4234, + "step": 15019 + }, + { + "epoch": 2.4667131977090304, + "grad_norm": 0.2944069325615738, + "learning_rate": 4.66527551985845e-06, + "loss": 0.4405, + "step": 15020 + }, + { + "epoch": 2.466877424917373, + "grad_norm": 0.4024844019253324, + "learning_rate": 4.6648068631766816e-06, + "loss": 0.4406, + "step": 15021 + }, + { + "epoch": 2.467041652125716, + "grad_norm": 0.2907487570514868, + "learning_rate": 4.664338204704947e-06, + "loss": 0.4362, + "step": 15022 + }, + { + "epoch": 2.4672058793340588, + "grad_norm": 0.3599279459135872, + "learning_rate": 4.663869544448338e-06, + "loss": 0.4249, + "step": 15023 + }, + { + "epoch": 2.4673701065424014, + "grad_norm": 0.4021341046173282, + "learning_rate": 4.663400882411946e-06, + "loss": 0.4577, + "step": 15024 + }, + { + "epoch": 2.467534333750744, + "grad_norm": 0.29310096697738297, + "learning_rate": 4.662932218600859e-06, + "loss": 0.4242, + "step": 15025 + }, + { + "epoch": 2.4676985609590867, + "grad_norm": 0.3008874854202207, + "learning_rate": 4.662463553020167e-06, + "loss": 0.4483, + "step": 15026 + }, + { + "epoch": 2.4678627881674298, + "grad_norm": 0.31891265100092614, + "learning_rate": 4.661994885674962e-06, + "loss": 0.4422, + "step": 15027 + }, + { + "epoch": 2.4680270153757724, + "grad_norm": 0.4460526086221128, + "learning_rate": 4.661526216570332e-06, + "loss": 0.4276, + "step": 15028 + }, + { + "epoch": 2.468191242584115, + "grad_norm": 0.3133604060641469, + "learning_rate": 4.661057545711369e-06, + "loss": 0.4528, + "step": 15029 + }, + { + "epoch": 2.4683554697924577, + "grad_norm": 0.28146024737318615, + "learning_rate": 4.660588873103164e-06, + "loss": 0.4345, + "step": 15030 + }, + { + "epoch": 2.4685196970008008, + "grad_norm": 0.528866055449549, + "learning_rate": 4.6601201987508035e-06, + "loss": 0.4364, + "step": 15031 + }, + { + "epoch": 2.4686839242091434, + "grad_norm": 0.2872816059039179, + "learning_rate": 4.659651522659382e-06, + "loss": 0.4372, + "step": 15032 + }, + { + "epoch": 2.468848151417486, + "grad_norm": 0.34378625498393467, + "learning_rate": 4.659182844833987e-06, + "loss": 0.4253, + "step": 15033 + }, + { + "epoch": 2.4690123786258287, + "grad_norm": 0.3196887697710157, + "learning_rate": 4.6587141652797085e-06, + "loss": 0.44, + "step": 15034 + }, + { + "epoch": 2.4691766058341718, + "grad_norm": 0.2937573874985924, + "learning_rate": 4.658245484001641e-06, + "loss": 0.4304, + "step": 15035 + }, + { + "epoch": 2.4693408330425144, + "grad_norm": 0.30747920832229947, + "learning_rate": 4.6577768010048685e-06, + "loss": 0.434, + "step": 15036 + }, + { + "epoch": 2.469505060250857, + "grad_norm": 0.3798041029245597, + "learning_rate": 4.657308116294488e-06, + "loss": 0.4494, + "step": 15037 + }, + { + "epoch": 2.4696692874591997, + "grad_norm": 0.30374441085047227, + "learning_rate": 4.656839429875584e-06, + "loss": 0.4268, + "step": 15038 + }, + { + "epoch": 2.4698335146675428, + "grad_norm": 0.3892278187656976, + "learning_rate": 4.656370741753251e-06, + "loss": 0.4299, + "step": 15039 + }, + { + "epoch": 2.4699977418758854, + "grad_norm": 0.2855739563377692, + "learning_rate": 4.655902051932576e-06, + "loss": 0.433, + "step": 15040 + }, + { + "epoch": 2.470161969084228, + "grad_norm": 0.3673913461509335, + "learning_rate": 4.655433360418654e-06, + "loss": 0.4347, + "step": 15041 + }, + { + "epoch": 2.4703261962925707, + "grad_norm": 0.6614279208112778, + "learning_rate": 4.65496466721657e-06, + "loss": 0.4454, + "step": 15042 + }, + { + "epoch": 2.4704904235009133, + "grad_norm": 0.27867910155523756, + "learning_rate": 4.654495972331418e-06, + "loss": 0.4296, + "step": 15043 + }, + { + "epoch": 2.4706546507092564, + "grad_norm": 0.41239281456362475, + "learning_rate": 4.65402727576829e-06, + "loss": 0.436, + "step": 15044 + }, + { + "epoch": 2.470818877917599, + "grad_norm": 0.3045078974339018, + "learning_rate": 4.653558577532274e-06, + "loss": 0.431, + "step": 15045 + }, + { + "epoch": 2.4709831051259417, + "grad_norm": 0.3162849530056356, + "learning_rate": 4.653089877628458e-06, + "loss": 0.4205, + "step": 15046 + }, + { + "epoch": 2.4711473323342843, + "grad_norm": 0.29902768192013507, + "learning_rate": 4.652621176061936e-06, + "loss": 0.4433, + "step": 15047 + }, + { + "epoch": 2.4713115595426274, + "grad_norm": 0.31846117954541137, + "learning_rate": 4.652152472837798e-06, + "loss": 0.4314, + "step": 15048 + }, + { + "epoch": 2.47147578675097, + "grad_norm": 0.2710159655321698, + "learning_rate": 4.651683767961136e-06, + "loss": 0.4532, + "step": 15049 + }, + { + "epoch": 2.4716400139593127, + "grad_norm": 0.3420916679093024, + "learning_rate": 4.651215061437038e-06, + "loss": 0.4279, + "step": 15050 + }, + { + "epoch": 2.4718042411676553, + "grad_norm": 0.27750671662652043, + "learning_rate": 4.650746353270595e-06, + "loss": 0.4153, + "step": 15051 + }, + { + "epoch": 2.4719684683759984, + "grad_norm": 0.3031546521062631, + "learning_rate": 4.650277643466899e-06, + "loss": 0.42, + "step": 15052 + }, + { + "epoch": 2.472132695584341, + "grad_norm": 0.3337669951335687, + "learning_rate": 4.64980893203104e-06, + "loss": 0.4325, + "step": 15053 + }, + { + "epoch": 2.4722969227926836, + "grad_norm": 0.2946419765296823, + "learning_rate": 4.64934021896811e-06, + "loss": 0.4389, + "step": 15054 + }, + { + "epoch": 2.4724611500010263, + "grad_norm": 0.6566714479694089, + "learning_rate": 4.648871504283196e-06, + "loss": 0.4273, + "step": 15055 + }, + { + "epoch": 2.4726253772093694, + "grad_norm": 0.4074657555880319, + "learning_rate": 4.6484027879813905e-06, + "loss": 0.4383, + "step": 15056 + }, + { + "epoch": 2.472789604417712, + "grad_norm": 0.27201573725819184, + "learning_rate": 4.6479340700677865e-06, + "loss": 0.4318, + "step": 15057 + }, + { + "epoch": 2.4729538316260546, + "grad_norm": 0.26289819417609633, + "learning_rate": 4.647465350547473e-06, + "loss": 0.4327, + "step": 15058 + }, + { + "epoch": 2.4731180588343973, + "grad_norm": 0.3174931715785728, + "learning_rate": 4.64699662942554e-06, + "loss": 0.4239, + "step": 15059 + }, + { + "epoch": 2.47328228604274, + "grad_norm": 0.2919191884008475, + "learning_rate": 4.64652790670708e-06, + "loss": 0.457, + "step": 15060 + }, + { + "epoch": 2.473446513251083, + "grad_norm": 0.31143441804325606, + "learning_rate": 4.646059182397181e-06, + "loss": 0.4143, + "step": 15061 + }, + { + "epoch": 2.4736107404594256, + "grad_norm": 0.3528192752492953, + "learning_rate": 4.645590456500937e-06, + "loss": 0.4373, + "step": 15062 + }, + { + "epoch": 2.4737749676677683, + "grad_norm": 0.334288162317713, + "learning_rate": 4.645121729023436e-06, + "loss": 0.4261, + "step": 15063 + }, + { + "epoch": 2.473939194876111, + "grad_norm": 0.49576018154807827, + "learning_rate": 4.644652999969772e-06, + "loss": 0.4409, + "step": 15064 + }, + { + "epoch": 2.474103422084454, + "grad_norm": 0.2739197448585547, + "learning_rate": 4.644184269345033e-06, + "loss": 0.4462, + "step": 15065 + }, + { + "epoch": 2.4742676492927966, + "grad_norm": 0.3100637983651266, + "learning_rate": 4.643715537154312e-06, + "loss": 0.4469, + "step": 15066 + }, + { + "epoch": 2.4744318765011393, + "grad_norm": 0.3359245159861427, + "learning_rate": 4.643246803402698e-06, + "loss": 0.4606, + "step": 15067 + }, + { + "epoch": 2.474596103709482, + "grad_norm": 0.384617259495692, + "learning_rate": 4.642778068095282e-06, + "loss": 0.4261, + "step": 15068 + }, + { + "epoch": 2.474760330917825, + "grad_norm": 0.29517197642369547, + "learning_rate": 4.642309331237157e-06, + "loss": 0.429, + "step": 15069 + }, + { + "epoch": 2.4749245581261676, + "grad_norm": 0.32753736877117223, + "learning_rate": 4.641840592833413e-06, + "loss": 0.4431, + "step": 15070 + }, + { + "epoch": 2.4750887853345103, + "grad_norm": 0.2791454602503895, + "learning_rate": 4.641371852889139e-06, + "loss": 0.4071, + "step": 15071 + }, + { + "epoch": 2.475253012542853, + "grad_norm": 0.37888047503031486, + "learning_rate": 4.640903111409428e-06, + "loss": 0.4417, + "step": 15072 + }, + { + "epoch": 2.475417239751196, + "grad_norm": 0.366354730985903, + "learning_rate": 4.640434368399371e-06, + "loss": 0.4554, + "step": 15073 + }, + { + "epoch": 2.4755814669595386, + "grad_norm": 0.31104150908583245, + "learning_rate": 4.63996562386406e-06, + "loss": 0.4289, + "step": 15074 + }, + { + "epoch": 2.4757456941678813, + "grad_norm": 0.37956424179769677, + "learning_rate": 4.6394968778085825e-06, + "loss": 0.4193, + "step": 15075 + }, + { + "epoch": 2.475909921376224, + "grad_norm": 0.335801656073786, + "learning_rate": 4.6390281302380325e-06, + "loss": 0.4467, + "step": 15076 + }, + { + "epoch": 2.4760741485845665, + "grad_norm": 0.2643865236365438, + "learning_rate": 4.6385593811575e-06, + "loss": 0.4419, + "step": 15077 + }, + { + "epoch": 2.4762383757929096, + "grad_norm": 0.30764530125320155, + "learning_rate": 4.638090630572076e-06, + "loss": 0.4202, + "step": 15078 + }, + { + "epoch": 2.4764026030012523, + "grad_norm": 0.32420305420903595, + "learning_rate": 4.637621878486853e-06, + "loss": 0.4522, + "step": 15079 + }, + { + "epoch": 2.476566830209595, + "grad_norm": 0.3363197644758546, + "learning_rate": 4.63715312490692e-06, + "loss": 0.4223, + "step": 15080 + }, + { + "epoch": 2.4767310574179375, + "grad_norm": 0.3091369151927837, + "learning_rate": 4.636684369837368e-06, + "loss": 0.4365, + "step": 15081 + }, + { + "epoch": 2.4768952846262806, + "grad_norm": 0.29132422243990463, + "learning_rate": 4.636215613283291e-06, + "loss": 0.4294, + "step": 15082 + }, + { + "epoch": 2.4770595118346233, + "grad_norm": 0.32071462618152996, + "learning_rate": 4.635746855249779e-06, + "loss": 0.4581, + "step": 15083 + }, + { + "epoch": 2.477223739042966, + "grad_norm": 0.39568575870858036, + "learning_rate": 4.635278095741922e-06, + "loss": 0.4408, + "step": 15084 + }, + { + "epoch": 2.4773879662513085, + "grad_norm": 0.2705314149096496, + "learning_rate": 4.634809334764811e-06, + "loss": 0.4407, + "step": 15085 + }, + { + "epoch": 2.4775521934596516, + "grad_norm": 0.4562209852816167, + "learning_rate": 4.634340572323538e-06, + "loss": 0.4238, + "step": 15086 + }, + { + "epoch": 2.4777164206679942, + "grad_norm": 0.35692718672318324, + "learning_rate": 4.633871808423195e-06, + "loss": 0.4366, + "step": 15087 + }, + { + "epoch": 2.477880647876337, + "grad_norm": 0.353983621673839, + "learning_rate": 4.633403043068873e-06, + "loss": 0.4546, + "step": 15088 + }, + { + "epoch": 2.4780448750846795, + "grad_norm": 0.42116045761854776, + "learning_rate": 4.632934276265661e-06, + "loss": 0.4482, + "step": 15089 + }, + { + "epoch": 2.4782091022930226, + "grad_norm": 0.3206255592467727, + "learning_rate": 4.6324655080186524e-06, + "loss": 0.4248, + "step": 15090 + }, + { + "epoch": 2.4783733295013652, + "grad_norm": 0.2792993323261359, + "learning_rate": 4.6319967383329395e-06, + "loss": 0.425, + "step": 15091 + }, + { + "epoch": 2.478537556709708, + "grad_norm": 0.2843568794714767, + "learning_rate": 4.631527967213611e-06, + "loss": 0.4416, + "step": 15092 + }, + { + "epoch": 2.4787017839180505, + "grad_norm": 0.27163007119109167, + "learning_rate": 4.631059194665759e-06, + "loss": 0.4263, + "step": 15093 + }, + { + "epoch": 2.478866011126393, + "grad_norm": 0.285132682533487, + "learning_rate": 4.630590420694475e-06, + "loss": 0.451, + "step": 15094 + }, + { + "epoch": 2.4790302383347362, + "grad_norm": 0.2962666477282085, + "learning_rate": 4.630121645304853e-06, + "loss": 0.4149, + "step": 15095 + }, + { + "epoch": 2.479194465543079, + "grad_norm": 0.3436820673601864, + "learning_rate": 4.629652868501979e-06, + "loss": 0.4354, + "step": 15096 + }, + { + "epoch": 2.4793586927514215, + "grad_norm": 0.30779119066240046, + "learning_rate": 4.629184090290948e-06, + "loss": 0.4039, + "step": 15097 + }, + { + "epoch": 2.479522919959764, + "grad_norm": 0.30109729627360404, + "learning_rate": 4.628715310676851e-06, + "loss": 0.4364, + "step": 15098 + }, + { + "epoch": 2.4796871471681072, + "grad_norm": 0.34213472664573985, + "learning_rate": 4.62824652966478e-06, + "loss": 0.4024, + "step": 15099 + }, + { + "epoch": 2.47985137437645, + "grad_norm": 0.3312074651939912, + "learning_rate": 4.627777747259825e-06, + "loss": 0.4494, + "step": 15100 + }, + { + "epoch": 2.4800156015847925, + "grad_norm": 0.3918133083512165, + "learning_rate": 4.627308963467076e-06, + "loss": 0.4383, + "step": 15101 + }, + { + "epoch": 2.480179828793135, + "grad_norm": 0.40037398856188533, + "learning_rate": 4.6268401782916285e-06, + "loss": 0.4496, + "step": 15102 + }, + { + "epoch": 2.4803440560014782, + "grad_norm": 0.29368723208387454, + "learning_rate": 4.6263713917385725e-06, + "loss": 0.4546, + "step": 15103 + }, + { + "epoch": 2.480508283209821, + "grad_norm": 0.39442925356351577, + "learning_rate": 4.625902603812998e-06, + "loss": 0.4396, + "step": 15104 + }, + { + "epoch": 2.4806725104181635, + "grad_norm": 0.49391288466102, + "learning_rate": 4.625433814519997e-06, + "loss": 0.4461, + "step": 15105 + }, + { + "epoch": 2.480836737626506, + "grad_norm": 0.6507872211415915, + "learning_rate": 4.624965023864661e-06, + "loss": 0.4257, + "step": 15106 + }, + { + "epoch": 2.481000964834849, + "grad_norm": 0.3170404081913076, + "learning_rate": 4.624496231852082e-06, + "loss": 0.4303, + "step": 15107 + }, + { + "epoch": 2.481165192043192, + "grad_norm": 0.8010381914422225, + "learning_rate": 4.624027438487352e-06, + "loss": 0.4287, + "step": 15108 + }, + { + "epoch": 2.4813294192515345, + "grad_norm": 0.3175103487015811, + "learning_rate": 4.623558643775561e-06, + "loss": 0.4396, + "step": 15109 + }, + { + "epoch": 2.481493646459877, + "grad_norm": 0.2780316156388087, + "learning_rate": 4.623089847721803e-06, + "loss": 0.4386, + "step": 15110 + }, + { + "epoch": 2.4816578736682198, + "grad_norm": 0.9957906981950299, + "learning_rate": 4.622621050331167e-06, + "loss": 0.4382, + "step": 15111 + }, + { + "epoch": 2.481822100876563, + "grad_norm": 0.3119134050343224, + "learning_rate": 4.622152251608747e-06, + "loss": 0.41, + "step": 15112 + }, + { + "epoch": 2.4819863280849055, + "grad_norm": 0.3165098283105381, + "learning_rate": 4.621683451559633e-06, + "loss": 0.4261, + "step": 15113 + }, + { + "epoch": 2.482150555293248, + "grad_norm": 0.3120583440078844, + "learning_rate": 4.621214650188916e-06, + "loss": 0.4349, + "step": 15114 + }, + { + "epoch": 2.4823147825015908, + "grad_norm": 0.2738497164372407, + "learning_rate": 4.620745847501689e-06, + "loss": 0.4321, + "step": 15115 + }, + { + "epoch": 2.482479009709934, + "grad_norm": 0.36008005991934383, + "learning_rate": 4.620277043503044e-06, + "loss": 0.4417, + "step": 15116 + }, + { + "epoch": 2.4826432369182765, + "grad_norm": 0.31989619467996416, + "learning_rate": 4.619808238198072e-06, + "loss": 0.446, + "step": 15117 + }, + { + "epoch": 2.482807464126619, + "grad_norm": 0.6824453394065131, + "learning_rate": 4.619339431591864e-06, + "loss": 0.4383, + "step": 15118 + }, + { + "epoch": 2.4829716913349618, + "grad_norm": 0.37095029373258626, + "learning_rate": 4.618870623689512e-06, + "loss": 0.4583, + "step": 15119 + }, + { + "epoch": 2.483135918543305, + "grad_norm": 0.33314170061608483, + "learning_rate": 4.618401814496109e-06, + "loss": 0.4348, + "step": 15120 + }, + { + "epoch": 2.4833001457516475, + "grad_norm": 0.33636320733646125, + "learning_rate": 4.617933004016744e-06, + "loss": 0.4554, + "step": 15121 + }, + { + "epoch": 2.48346437295999, + "grad_norm": 1.0196784103267829, + "learning_rate": 4.617464192256513e-06, + "loss": 0.4434, + "step": 15122 + }, + { + "epoch": 2.4836286001683328, + "grad_norm": 0.3193716312140261, + "learning_rate": 4.616995379220504e-06, + "loss": 0.43, + "step": 15123 + }, + { + "epoch": 2.483792827376676, + "grad_norm": 0.288516069215822, + "learning_rate": 4.616526564913811e-06, + "loss": 0.434, + "step": 15124 + }, + { + "epoch": 2.4839570545850185, + "grad_norm": 0.29912771431081314, + "learning_rate": 4.616057749341524e-06, + "loss": 0.4475, + "step": 15125 + }, + { + "epoch": 2.484121281793361, + "grad_norm": 0.2733627283094918, + "learning_rate": 4.615588932508735e-06, + "loss": 0.4382, + "step": 15126 + }, + { + "epoch": 2.4842855090017038, + "grad_norm": 0.3603048759548791, + "learning_rate": 4.615120114420538e-06, + "loss": 0.4283, + "step": 15127 + }, + { + "epoch": 2.4844497362100464, + "grad_norm": 0.3743687907784263, + "learning_rate": 4.6146512950820225e-06, + "loss": 0.448, + "step": 15128 + }, + { + "epoch": 2.4846139634183895, + "grad_norm": 0.3108138141975132, + "learning_rate": 4.614182474498282e-06, + "loss": 0.4537, + "step": 15129 + }, + { + "epoch": 2.484778190626732, + "grad_norm": 0.3197077167691703, + "learning_rate": 4.613713652674406e-06, + "loss": 0.438, + "step": 15130 + }, + { + "epoch": 2.4849424178350747, + "grad_norm": 0.4391802841487158, + "learning_rate": 4.613244829615488e-06, + "loss": 0.4311, + "step": 15131 + }, + { + "epoch": 2.4851066450434174, + "grad_norm": 0.2920981537029958, + "learning_rate": 4.612776005326621e-06, + "loss": 0.4237, + "step": 15132 + }, + { + "epoch": 2.4852708722517605, + "grad_norm": 0.29646873635485227, + "learning_rate": 4.612307179812896e-06, + "loss": 0.4341, + "step": 15133 + }, + { + "epoch": 2.485435099460103, + "grad_norm": 0.29841283343957575, + "learning_rate": 4.611838353079403e-06, + "loss": 0.4303, + "step": 15134 + }, + { + "epoch": 2.4855993266684457, + "grad_norm": 0.3250609621900752, + "learning_rate": 4.611369525131235e-06, + "loss": 0.4257, + "step": 15135 + }, + { + "epoch": 2.4857635538767884, + "grad_norm": 0.35478496086877537, + "learning_rate": 4.610900695973485e-06, + "loss": 0.4379, + "step": 15136 + }, + { + "epoch": 2.4859277810851315, + "grad_norm": 0.4670594900199308, + "learning_rate": 4.610431865611247e-06, + "loss": 0.4517, + "step": 15137 + }, + { + "epoch": 2.486092008293474, + "grad_norm": 0.31415101552518854, + "learning_rate": 4.609963034049607e-06, + "loss": 0.4298, + "step": 15138 + }, + { + "epoch": 2.4862562355018167, + "grad_norm": 0.38284691709415497, + "learning_rate": 4.609494201293661e-06, + "loss": 0.43, + "step": 15139 + }, + { + "epoch": 2.4864204627101594, + "grad_norm": 0.3092641007937, + "learning_rate": 4.6090253673484995e-06, + "loss": 0.421, + "step": 15140 + }, + { + "epoch": 2.4865846899185025, + "grad_norm": 0.35972316348470473, + "learning_rate": 4.608556532219216e-06, + "loss": 0.4597, + "step": 15141 + }, + { + "epoch": 2.486748917126845, + "grad_norm": 0.368530624076118, + "learning_rate": 4.608087695910903e-06, + "loss": 0.4258, + "step": 15142 + }, + { + "epoch": 2.4869131443351877, + "grad_norm": 0.303106049447321, + "learning_rate": 4.6076188584286505e-06, + "loss": 0.4249, + "step": 15143 + }, + { + "epoch": 2.4870773715435304, + "grad_norm": 0.48973722250178475, + "learning_rate": 4.607150019777551e-06, + "loss": 0.4328, + "step": 15144 + }, + { + "epoch": 2.487241598751873, + "grad_norm": 0.314524236592061, + "learning_rate": 4.606681179962697e-06, + "loss": 0.4424, + "step": 15145 + }, + { + "epoch": 2.487405825960216, + "grad_norm": 0.5027401543850439, + "learning_rate": 4.60621233898918e-06, + "loss": 0.4372, + "step": 15146 + }, + { + "epoch": 2.4875700531685587, + "grad_norm": 0.29423642278198925, + "learning_rate": 4.605743496862093e-06, + "loss": 0.4217, + "step": 15147 + }, + { + "epoch": 2.4877342803769014, + "grad_norm": 0.31679517006738683, + "learning_rate": 4.605274653586526e-06, + "loss": 0.4495, + "step": 15148 + }, + { + "epoch": 2.487898507585244, + "grad_norm": 0.26487858401307063, + "learning_rate": 4.604805809167574e-06, + "loss": 0.4323, + "step": 15149 + }, + { + "epoch": 2.488062734793587, + "grad_norm": 0.32293708597952736, + "learning_rate": 4.604336963610328e-06, + "loss": 0.4254, + "step": 15150 + }, + { + "epoch": 2.4882269620019297, + "grad_norm": 0.28820496702549153, + "learning_rate": 4.60386811691988e-06, + "loss": 0.4368, + "step": 15151 + }, + { + "epoch": 2.4883911892102724, + "grad_norm": 0.27518127880599397, + "learning_rate": 4.6033992691013225e-06, + "loss": 0.4453, + "step": 15152 + }, + { + "epoch": 2.488555416418615, + "grad_norm": 0.3493994813337365, + "learning_rate": 4.6029304201597456e-06, + "loss": 0.4322, + "step": 15153 + }, + { + "epoch": 2.488719643626958, + "grad_norm": 0.3251525156968644, + "learning_rate": 4.602461570100246e-06, + "loss": 0.4246, + "step": 15154 + }, + { + "epoch": 2.4888838708353007, + "grad_norm": 0.4157430492921045, + "learning_rate": 4.6019927189279096e-06, + "loss": 0.4371, + "step": 15155 + }, + { + "epoch": 2.4890480980436434, + "grad_norm": 0.30758658627784186, + "learning_rate": 4.601523866647834e-06, + "loss": 0.4267, + "step": 15156 + }, + { + "epoch": 2.489212325251986, + "grad_norm": 0.33772718364386484, + "learning_rate": 4.601055013265109e-06, + "loss": 0.4342, + "step": 15157 + }, + { + "epoch": 2.489376552460329, + "grad_norm": 0.36400549487339234, + "learning_rate": 4.6005861587848264e-06, + "loss": 0.4364, + "step": 15158 + }, + { + "epoch": 2.4895407796686717, + "grad_norm": 0.4213966493331531, + "learning_rate": 4.600117303212079e-06, + "loss": 0.3879, + "step": 15159 + }, + { + "epoch": 2.4897050068770143, + "grad_norm": 0.3207069643365226, + "learning_rate": 4.5996484465519594e-06, + "loss": 0.4352, + "step": 15160 + }, + { + "epoch": 2.489869234085357, + "grad_norm": 0.30520069096946645, + "learning_rate": 4.599179588809561e-06, + "loss": 0.4323, + "step": 15161 + }, + { + "epoch": 2.4900334612936996, + "grad_norm": 0.4020668168700281, + "learning_rate": 4.598710729989974e-06, + "loss": 0.4535, + "step": 15162 + }, + { + "epoch": 2.4901976885020427, + "grad_norm": 0.42245364077029657, + "learning_rate": 4.5982418700982905e-06, + "loss": 0.4525, + "step": 15163 + }, + { + "epoch": 2.4903619157103853, + "grad_norm": 0.8362459834605354, + "learning_rate": 4.597773009139604e-06, + "loss": 0.4362, + "step": 15164 + }, + { + "epoch": 2.490526142918728, + "grad_norm": 0.3449582767880722, + "learning_rate": 4.597304147119006e-06, + "loss": 0.4339, + "step": 15165 + }, + { + "epoch": 2.4906903701270706, + "grad_norm": 0.3066916301852268, + "learning_rate": 4.5968352840415904e-06, + "loss": 0.4309, + "step": 15166 + }, + { + "epoch": 2.4908545973354137, + "grad_norm": 0.5106999824766285, + "learning_rate": 4.596366419912448e-06, + "loss": 0.4531, + "step": 15167 + }, + { + "epoch": 2.4910188245437563, + "grad_norm": 0.34081483839278837, + "learning_rate": 4.595897554736671e-06, + "loss": 0.4411, + "step": 15168 + }, + { + "epoch": 2.491183051752099, + "grad_norm": 0.3703165743951746, + "learning_rate": 4.5954286885193514e-06, + "loss": 0.4255, + "step": 15169 + }, + { + "epoch": 2.4913472789604416, + "grad_norm": 0.2788333157420013, + "learning_rate": 4.5949598212655845e-06, + "loss": 0.4364, + "step": 15170 + }, + { + "epoch": 2.4915115061687847, + "grad_norm": 0.35393728188332935, + "learning_rate": 4.594490952980459e-06, + "loss": 0.4183, + "step": 15171 + }, + { + "epoch": 2.4916757333771273, + "grad_norm": 0.6014347064530057, + "learning_rate": 4.594022083669069e-06, + "loss": 0.4263, + "step": 15172 + }, + { + "epoch": 2.49183996058547, + "grad_norm": 0.3243854776951079, + "learning_rate": 4.593553213336507e-06, + "loss": 0.4318, + "step": 15173 + }, + { + "epoch": 2.4920041877938126, + "grad_norm": 0.392814323494369, + "learning_rate": 4.593084341987864e-06, + "loss": 0.4218, + "step": 15174 + }, + { + "epoch": 2.4921684150021557, + "grad_norm": 0.34098247165368967, + "learning_rate": 4.592615469628235e-06, + "loss": 0.4641, + "step": 15175 + }, + { + "epoch": 2.4923326422104983, + "grad_norm": 0.35972202448556473, + "learning_rate": 4.59214659626271e-06, + "loss": 0.4248, + "step": 15176 + }, + { + "epoch": 2.492496869418841, + "grad_norm": 0.2892343843339758, + "learning_rate": 4.591677721896382e-06, + "loss": 0.415, + "step": 15177 + }, + { + "epoch": 2.4926610966271836, + "grad_norm": 0.3099738318685391, + "learning_rate": 4.591208846534344e-06, + "loss": 0.4426, + "step": 15178 + }, + { + "epoch": 2.4928253238355262, + "grad_norm": 0.3571184083633324, + "learning_rate": 4.590739970181689e-06, + "loss": 0.4301, + "step": 15179 + }, + { + "epoch": 2.4929895510438693, + "grad_norm": 0.33853164613700226, + "learning_rate": 4.590271092843507e-06, + "loss": 0.449, + "step": 15180 + }, + { + "epoch": 2.493153778252212, + "grad_norm": 0.30508015812979095, + "learning_rate": 4.589802214524896e-06, + "loss": 0.4246, + "step": 15181 + }, + { + "epoch": 2.4933180054605546, + "grad_norm": 0.3546083751853678, + "learning_rate": 4.58933333523094e-06, + "loss": 0.4307, + "step": 15182 + }, + { + "epoch": 2.4934822326688972, + "grad_norm": 0.34676750407362766, + "learning_rate": 4.5888644549667384e-06, + "loss": 0.4382, + "step": 15183 + }, + { + "epoch": 2.4936464598772403, + "grad_norm": 0.33336637439983086, + "learning_rate": 4.58839557373738e-06, + "loss": 0.4363, + "step": 15184 + }, + { + "epoch": 2.493810687085583, + "grad_norm": 0.3562136027603175, + "learning_rate": 4.58792669154796e-06, + "loss": 0.4254, + "step": 15185 + }, + { + "epoch": 2.4939749142939256, + "grad_norm": 0.30767491608825387, + "learning_rate": 4.587457808403569e-06, + "loss": 0.4312, + "step": 15186 + }, + { + "epoch": 2.4941391415022682, + "grad_norm": 0.3717157302565822, + "learning_rate": 4.586988924309302e-06, + "loss": 0.4342, + "step": 15187 + }, + { + "epoch": 2.4943033687106113, + "grad_norm": 0.31073508635900254, + "learning_rate": 4.586520039270247e-06, + "loss": 0.4286, + "step": 15188 + }, + { + "epoch": 2.494467595918954, + "grad_norm": 0.300935391463759, + "learning_rate": 4.5860511532915e-06, + "loss": 0.429, + "step": 15189 + }, + { + "epoch": 2.4946318231272966, + "grad_norm": 0.2891538888734539, + "learning_rate": 4.585582266378153e-06, + "loss": 0.4273, + "step": 15190 + }, + { + "epoch": 2.4947960503356392, + "grad_norm": 0.3636855265250306, + "learning_rate": 4.5851133785353e-06, + "loss": 0.4359, + "step": 15191 + }, + { + "epoch": 2.4949602775439823, + "grad_norm": 0.31238609339074075, + "learning_rate": 4.584644489768029e-06, + "loss": 0.4496, + "step": 15192 + }, + { + "epoch": 2.495124504752325, + "grad_norm": 0.3466335457330843, + "learning_rate": 4.584175600081438e-06, + "loss": 0.4251, + "step": 15193 + }, + { + "epoch": 2.4952887319606676, + "grad_norm": 0.28513115942321365, + "learning_rate": 4.583706709480615e-06, + "loss": 0.4303, + "step": 15194 + }, + { + "epoch": 2.49545295916901, + "grad_norm": 0.3313604508733959, + "learning_rate": 4.5832378179706564e-06, + "loss": 0.4239, + "step": 15195 + }, + { + "epoch": 2.495617186377353, + "grad_norm": 0.2735952782125291, + "learning_rate": 4.582768925556653e-06, + "loss": 0.4314, + "step": 15196 + }, + { + "epoch": 2.495781413585696, + "grad_norm": 0.3199061503551554, + "learning_rate": 4.582300032243698e-06, + "loss": 0.4437, + "step": 15197 + }, + { + "epoch": 2.4959456407940386, + "grad_norm": 0.2941447415940853, + "learning_rate": 4.581831138036882e-06, + "loss": 0.4434, + "step": 15198 + }, + { + "epoch": 2.496109868002381, + "grad_norm": 0.3355786824721521, + "learning_rate": 4.5813622429413e-06, + "loss": 0.4277, + "step": 15199 + }, + { + "epoch": 2.496274095210724, + "grad_norm": 0.28103705919862076, + "learning_rate": 4.580893346962045e-06, + "loss": 0.4314, + "step": 15200 + }, + { + "epoch": 2.496438322419067, + "grad_norm": 0.3227157508902223, + "learning_rate": 4.5804244501042085e-06, + "loss": 0.4493, + "step": 15201 + }, + { + "epoch": 2.4966025496274096, + "grad_norm": 0.2970871193039058, + "learning_rate": 4.5799555523728824e-06, + "loss": 0.42, + "step": 15202 + }, + { + "epoch": 2.496766776835752, + "grad_norm": 0.4342254727787912, + "learning_rate": 4.57948665377316e-06, + "loss": 0.4333, + "step": 15203 + }, + { + "epoch": 2.496931004044095, + "grad_norm": 0.3422407517356136, + "learning_rate": 4.579017754310136e-06, + "loss": 0.4373, + "step": 15204 + }, + { + "epoch": 2.497095231252438, + "grad_norm": 0.32443689801201475, + "learning_rate": 4.578548853988901e-06, + "loss": 0.4196, + "step": 15205 + }, + { + "epoch": 2.4972594584607806, + "grad_norm": 0.3213615230691932, + "learning_rate": 4.578079952814547e-06, + "loss": 0.4284, + "step": 15206 + }, + { + "epoch": 2.497423685669123, + "grad_norm": 0.30105959294998685, + "learning_rate": 4.577611050792169e-06, + "loss": 0.4037, + "step": 15207 + }, + { + "epoch": 2.497587912877466, + "grad_norm": 0.29610221993478114, + "learning_rate": 4.577142147926859e-06, + "loss": 0.448, + "step": 15208 + }, + { + "epoch": 2.497752140085809, + "grad_norm": 0.300053251348721, + "learning_rate": 4.576673244223709e-06, + "loss": 0.4284, + "step": 15209 + }, + { + "epoch": 2.4979163672941516, + "grad_norm": 0.4108333131032182, + "learning_rate": 4.576204339687812e-06, + "loss": 0.4393, + "step": 15210 + }, + { + "epoch": 2.498080594502494, + "grad_norm": 0.42413555863246477, + "learning_rate": 4.57573543432426e-06, + "loss": 0.4404, + "step": 15211 + }, + { + "epoch": 2.498244821710837, + "grad_norm": 0.3744832314644596, + "learning_rate": 4.5752665281381474e-06, + "loss": 0.427, + "step": 15212 + }, + { + "epoch": 2.4984090489191795, + "grad_norm": 0.5036243765745887, + "learning_rate": 4.574797621134566e-06, + "loss": 0.4456, + "step": 15213 + }, + { + "epoch": 2.4985732761275226, + "grad_norm": 0.39270734558141457, + "learning_rate": 4.574328713318609e-06, + "loss": 0.4252, + "step": 15214 + }, + { + "epoch": 2.498737503335865, + "grad_norm": 0.29570628940704496, + "learning_rate": 4.57385980469537e-06, + "loss": 0.4527, + "step": 15215 + }, + { + "epoch": 2.498901730544208, + "grad_norm": 0.7676602433712431, + "learning_rate": 4.573390895269941e-06, + "loss": 0.44, + "step": 15216 + }, + { + "epoch": 2.4990659577525505, + "grad_norm": 0.6201610602230498, + "learning_rate": 4.572921985047413e-06, + "loss": 0.4306, + "step": 15217 + }, + { + "epoch": 2.4992301849608936, + "grad_norm": 1.715363184871039, + "learning_rate": 4.572453074032881e-06, + "loss": 0.4368, + "step": 15218 + }, + { + "epoch": 2.499394412169236, + "grad_norm": 0.3603458735228863, + "learning_rate": 4.571984162231437e-06, + "loss": 0.4428, + "step": 15219 + }, + { + "epoch": 2.499558639377579, + "grad_norm": 0.29670745837822965, + "learning_rate": 4.571515249648174e-06, + "loss": 0.408, + "step": 15220 + }, + { + "epoch": 2.4997228665859215, + "grad_norm": 0.28863059782458106, + "learning_rate": 4.571046336288186e-06, + "loss": 0.4034, + "step": 15221 + }, + { + "epoch": 2.4998870937942645, + "grad_norm": 0.3231624496999091, + "learning_rate": 4.570577422156564e-06, + "loss": 0.4262, + "step": 15222 + }, + { + "epoch": 2.500051321002607, + "grad_norm": 0.28074500386612955, + "learning_rate": 4.570108507258403e-06, + "loss": 0.4477, + "step": 15223 + }, + { + "epoch": 2.50021554821095, + "grad_norm": 0.2709352897613838, + "learning_rate": 4.569639591598794e-06, + "loss": 0.4358, + "step": 15224 + }, + { + "epoch": 2.5003797754192925, + "grad_norm": 0.28514705073201974, + "learning_rate": 4.569170675182831e-06, + "loss": 0.4236, + "step": 15225 + }, + { + "epoch": 2.5005440026276355, + "grad_norm": 0.2873450628834571, + "learning_rate": 4.5687017580156055e-06, + "loss": 0.4324, + "step": 15226 + }, + { + "epoch": 2.500708229835978, + "grad_norm": 0.29247613976893794, + "learning_rate": 4.568232840102211e-06, + "loss": 0.4417, + "step": 15227 + }, + { + "epoch": 2.500872457044321, + "grad_norm": 0.3039572928244555, + "learning_rate": 4.567763921447741e-06, + "loss": 0.424, + "step": 15228 + }, + { + "epoch": 2.5010366842526635, + "grad_norm": 0.3498139299117389, + "learning_rate": 4.56729500205729e-06, + "loss": 0.4579, + "step": 15229 + }, + { + "epoch": 2.501200911461006, + "grad_norm": 0.35502698847985975, + "learning_rate": 4.566826081935947e-06, + "loss": 0.4223, + "step": 15230 + }, + { + "epoch": 2.501365138669349, + "grad_norm": 0.3317335064782728, + "learning_rate": 4.566357161088808e-06, + "loss": 0.4263, + "step": 15231 + }, + { + "epoch": 2.501529365877692, + "grad_norm": 0.3393369552996207, + "learning_rate": 4.565888239520963e-06, + "loss": 0.4415, + "step": 15232 + }, + { + "epoch": 2.5016935930860345, + "grad_norm": 0.40649164473995997, + "learning_rate": 4.56541931723751e-06, + "loss": 0.4196, + "step": 15233 + }, + { + "epoch": 2.5018578202943775, + "grad_norm": 0.5651714899939198, + "learning_rate": 4.564950394243538e-06, + "loss": 0.43, + "step": 15234 + }, + { + "epoch": 2.50202204750272, + "grad_norm": 0.2903708190570917, + "learning_rate": 4.564481470544139e-06, + "loss": 0.4101, + "step": 15235 + }, + { + "epoch": 2.502186274711063, + "grad_norm": 0.407942080989367, + "learning_rate": 4.564012546144409e-06, + "loss": 0.4586, + "step": 15236 + }, + { + "epoch": 2.5023505019194054, + "grad_norm": 0.30899819635242237, + "learning_rate": 4.56354362104944e-06, + "loss": 0.4397, + "step": 15237 + }, + { + "epoch": 2.502514729127748, + "grad_norm": 0.3462923651598003, + "learning_rate": 4.563074695264324e-06, + "loss": 0.4532, + "step": 15238 + }, + { + "epoch": 2.5026789563360907, + "grad_norm": 0.3220893341552208, + "learning_rate": 4.562605768794156e-06, + "loss": 0.4317, + "step": 15239 + }, + { + "epoch": 2.502843183544434, + "grad_norm": 0.31180577802122095, + "learning_rate": 4.562136841644027e-06, + "loss": 0.4425, + "step": 15240 + }, + { + "epoch": 2.5030074107527764, + "grad_norm": 0.2764546692709857, + "learning_rate": 4.561667913819031e-06, + "loss": 0.4437, + "step": 15241 + }, + { + "epoch": 2.503171637961119, + "grad_norm": 0.31019314339615184, + "learning_rate": 4.56119898532426e-06, + "loss": 0.4204, + "step": 15242 + }, + { + "epoch": 2.503335865169462, + "grad_norm": 0.3931365921905194, + "learning_rate": 4.560730056164808e-06, + "loss": 0.4276, + "step": 15243 + }, + { + "epoch": 2.503500092377805, + "grad_norm": 0.3767687885431783, + "learning_rate": 4.560261126345769e-06, + "loss": 0.4344, + "step": 15244 + }, + { + "epoch": 2.5036643195861474, + "grad_norm": 0.35056231533364646, + "learning_rate": 4.5597921958722336e-06, + "loss": 0.439, + "step": 15245 + }, + { + "epoch": 2.50382854679449, + "grad_norm": 0.34796478913559775, + "learning_rate": 4.559323264749297e-06, + "loss": 0.4379, + "step": 15246 + }, + { + "epoch": 2.5039927740028327, + "grad_norm": 0.364929157109696, + "learning_rate": 4.5588543329820504e-06, + "loss": 0.435, + "step": 15247 + }, + { + "epoch": 2.504157001211176, + "grad_norm": 0.30504356991755455, + "learning_rate": 4.5583854005755875e-06, + "loss": 0.4206, + "step": 15248 + }, + { + "epoch": 2.5043212284195184, + "grad_norm": 0.33326552576505075, + "learning_rate": 4.5579164675350035e-06, + "loss": 0.4366, + "step": 15249 + }, + { + "epoch": 2.504485455627861, + "grad_norm": 0.30634993507320807, + "learning_rate": 4.557447533865389e-06, + "loss": 0.424, + "step": 15250 + }, + { + "epoch": 2.504649682836204, + "grad_norm": 0.3015603465660569, + "learning_rate": 4.556978599571838e-06, + "loss": 0.4437, + "step": 15251 + }, + { + "epoch": 2.504813910044547, + "grad_norm": 0.404288094881335, + "learning_rate": 4.556509664659441e-06, + "loss": 0.4333, + "step": 15252 + }, + { + "epoch": 2.5049781372528894, + "grad_norm": 0.26762791113011236, + "learning_rate": 4.556040729133297e-06, + "loss": 0.4433, + "step": 15253 + }, + { + "epoch": 2.505142364461232, + "grad_norm": 0.3917995761448291, + "learning_rate": 4.5555717929984945e-06, + "loss": 0.42, + "step": 15254 + }, + { + "epoch": 2.5053065916695747, + "grad_norm": 0.43586340257632006, + "learning_rate": 4.5551028562601255e-06, + "loss": 0.441, + "step": 15255 + }, + { + "epoch": 2.5054708188779173, + "grad_norm": 0.5715050248374042, + "learning_rate": 4.554633918923287e-06, + "loss": 0.4354, + "step": 15256 + }, + { + "epoch": 2.5056350460862604, + "grad_norm": 0.363662712363827, + "learning_rate": 4.554164980993069e-06, + "loss": 0.4175, + "step": 15257 + }, + { + "epoch": 2.505799273294603, + "grad_norm": 0.3369993889608383, + "learning_rate": 4.553696042474569e-06, + "loss": 0.4322, + "step": 15258 + }, + { + "epoch": 2.5059635005029457, + "grad_norm": 0.3051343689476861, + "learning_rate": 4.5532271033728745e-06, + "loss": 0.4337, + "step": 15259 + }, + { + "epoch": 2.5061277277112888, + "grad_norm": 0.34664385637290773, + "learning_rate": 4.552758163693083e-06, + "loss": 0.4378, + "step": 15260 + }, + { + "epoch": 2.5062919549196314, + "grad_norm": 0.38369436861216527, + "learning_rate": 4.552289223440284e-06, + "loss": 0.425, + "step": 15261 + }, + { + "epoch": 2.506456182127974, + "grad_norm": 0.31098976283094065, + "learning_rate": 4.551820282619575e-06, + "loss": 0.4392, + "step": 15262 + }, + { + "epoch": 2.5066204093363167, + "grad_norm": 0.2924212913716923, + "learning_rate": 4.551351341236044e-06, + "loss": 0.4387, + "step": 15263 + }, + { + "epoch": 2.5067846365446593, + "grad_norm": 0.3614952525688465, + "learning_rate": 4.55088239929479e-06, + "loss": 0.4477, + "step": 15264 + }, + { + "epoch": 2.5069488637530024, + "grad_norm": 0.31854411737094895, + "learning_rate": 4.5504134568009e-06, + "loss": 0.447, + "step": 15265 + }, + { + "epoch": 2.507113090961345, + "grad_norm": 0.33338819590292956, + "learning_rate": 4.549944513759473e-06, + "loss": 0.4543, + "step": 15266 + }, + { + "epoch": 2.5072773181696877, + "grad_norm": 0.37005363688745724, + "learning_rate": 4.549475570175597e-06, + "loss": 0.4504, + "step": 15267 + }, + { + "epoch": 2.5074415453780308, + "grad_norm": 0.31931176261796645, + "learning_rate": 4.5490066260543694e-06, + "loss": 0.4491, + "step": 15268 + }, + { + "epoch": 2.5076057725863734, + "grad_norm": 0.4028200604365174, + "learning_rate": 4.548537681400881e-06, + "loss": 0.4354, + "step": 15269 + }, + { + "epoch": 2.507769999794716, + "grad_norm": 0.34431494305601784, + "learning_rate": 4.548068736220224e-06, + "loss": 0.4327, + "step": 15270 + }, + { + "epoch": 2.5079342270030587, + "grad_norm": 0.3359105652446257, + "learning_rate": 4.547599790517496e-06, + "loss": 0.4452, + "step": 15271 + }, + { + "epoch": 2.5080984542114013, + "grad_norm": 0.36606184117765694, + "learning_rate": 4.547130844297786e-06, + "loss": 0.4168, + "step": 15272 + }, + { + "epoch": 2.508262681419744, + "grad_norm": 0.46539092867000703, + "learning_rate": 4.546661897566189e-06, + "loss": 0.43, + "step": 15273 + }, + { + "epoch": 2.508426908628087, + "grad_norm": 0.3260456914724407, + "learning_rate": 4.546192950327797e-06, + "loss": 0.4333, + "step": 15274 + }, + { + "epoch": 2.5085911358364297, + "grad_norm": 0.431282609159347, + "learning_rate": 4.545724002587706e-06, + "loss": 0.4328, + "step": 15275 + }, + { + "epoch": 2.5087553630447723, + "grad_norm": 0.31874399961113387, + "learning_rate": 4.5452550543510055e-06, + "loss": 0.4409, + "step": 15276 + }, + { + "epoch": 2.5089195902531154, + "grad_norm": 0.36712748209494483, + "learning_rate": 4.544786105622791e-06, + "loss": 0.4079, + "step": 15277 + }, + { + "epoch": 2.509083817461458, + "grad_norm": 0.2955054728370622, + "learning_rate": 4.544317156408157e-06, + "loss": 0.433, + "step": 15278 + }, + { + "epoch": 2.5092480446698007, + "grad_norm": 0.31795398888195725, + "learning_rate": 4.543848206712193e-06, + "loss": 0.4335, + "step": 15279 + }, + { + "epoch": 2.5094122718781433, + "grad_norm": 0.327166579049544, + "learning_rate": 4.543379256539995e-06, + "loss": 0.4405, + "step": 15280 + }, + { + "epoch": 2.509576499086486, + "grad_norm": 0.29009950534624457, + "learning_rate": 4.542910305896655e-06, + "loss": 0.4215, + "step": 15281 + }, + { + "epoch": 2.509740726294829, + "grad_norm": 0.5040405319999883, + "learning_rate": 4.542441354787268e-06, + "loss": 0.4146, + "step": 15282 + }, + { + "epoch": 2.5099049535031717, + "grad_norm": 0.3522597372682301, + "learning_rate": 4.541972403216927e-06, + "loss": 0.4387, + "step": 15283 + }, + { + "epoch": 2.5100691807115143, + "grad_norm": 0.2726311387047777, + "learning_rate": 4.5415034511907226e-06, + "loss": 0.4354, + "step": 15284 + }, + { + "epoch": 2.5102334079198574, + "grad_norm": 0.302638717120387, + "learning_rate": 4.5410344987137496e-06, + "loss": 0.435, + "step": 15285 + }, + { + "epoch": 2.5103976351282, + "grad_norm": 0.2967323251347567, + "learning_rate": 4.5405655457911026e-06, + "loss": 0.4407, + "step": 15286 + }, + { + "epoch": 2.5105618623365427, + "grad_norm": 0.31331167849939856, + "learning_rate": 4.540096592427874e-06, + "loss": 0.4412, + "step": 15287 + }, + { + "epoch": 2.5107260895448853, + "grad_norm": 0.2815771015808261, + "learning_rate": 4.539627638629157e-06, + "loss": 0.4126, + "step": 15288 + }, + { + "epoch": 2.510890316753228, + "grad_norm": 0.4327857806293426, + "learning_rate": 4.539158684400044e-06, + "loss": 0.4141, + "step": 15289 + }, + { + "epoch": 2.5110545439615706, + "grad_norm": 0.33791685760518037, + "learning_rate": 4.538689729745629e-06, + "loss": 0.4324, + "step": 15290 + }, + { + "epoch": 2.5112187711699137, + "grad_norm": 0.3428719333544963, + "learning_rate": 4.538220774671005e-06, + "loss": 0.4483, + "step": 15291 + }, + { + "epoch": 2.5113829983782563, + "grad_norm": 0.30672105224945806, + "learning_rate": 4.537751819181268e-06, + "loss": 0.4369, + "step": 15292 + }, + { + "epoch": 2.511547225586599, + "grad_norm": 0.6634715199384176, + "learning_rate": 4.537282863281509e-06, + "loss": 0.428, + "step": 15293 + }, + { + "epoch": 2.511711452794942, + "grad_norm": 0.31872231581495836, + "learning_rate": 4.536813906976819e-06, + "loss": 0.433, + "step": 15294 + }, + { + "epoch": 2.5118756800032846, + "grad_norm": 0.3231534958659712, + "learning_rate": 4.536344950272295e-06, + "loss": 0.4355, + "step": 15295 + }, + { + "epoch": 2.5120399072116273, + "grad_norm": 0.2757083960884838, + "learning_rate": 4.535875993173029e-06, + "loss": 0.424, + "step": 15296 + }, + { + "epoch": 2.51220413441997, + "grad_norm": 0.3142120617565392, + "learning_rate": 4.535407035684115e-06, + "loss": 0.4487, + "step": 15297 + }, + { + "epoch": 2.5123683616283126, + "grad_norm": 0.34568721423469645, + "learning_rate": 4.534938077810646e-06, + "loss": 0.4384, + "step": 15298 + }, + { + "epoch": 2.5125325888366556, + "grad_norm": 0.3676976562163103, + "learning_rate": 4.534469119557714e-06, + "loss": 0.4517, + "step": 15299 + }, + { + "epoch": 2.5126968160449983, + "grad_norm": 0.4372151567267689, + "learning_rate": 4.534000160930414e-06, + "loss": 0.4505, + "step": 15300 + }, + { + "epoch": 2.512861043253341, + "grad_norm": 0.38459047394944434, + "learning_rate": 4.5335312019338385e-06, + "loss": 0.4281, + "step": 15301 + }, + { + "epoch": 2.513025270461684, + "grad_norm": 0.2934325348012631, + "learning_rate": 4.533062242573082e-06, + "loss": 0.4287, + "step": 15302 + }, + { + "epoch": 2.5131894976700266, + "grad_norm": 0.3307628319172223, + "learning_rate": 4.532593282853236e-06, + "loss": 0.4387, + "step": 15303 + }, + { + "epoch": 2.5133537248783693, + "grad_norm": 0.5152030404954767, + "learning_rate": 4.532124322779395e-06, + "loss": 0.4185, + "step": 15304 + }, + { + "epoch": 2.513517952086712, + "grad_norm": 0.29601355750979563, + "learning_rate": 4.531655362356652e-06, + "loss": 0.4424, + "step": 15305 + }, + { + "epoch": 2.5136821792950546, + "grad_norm": 0.33875656452253916, + "learning_rate": 4.531186401590102e-06, + "loss": 0.4533, + "step": 15306 + }, + { + "epoch": 2.513846406503397, + "grad_norm": 0.2905312012969298, + "learning_rate": 4.530717440484836e-06, + "loss": 0.4417, + "step": 15307 + }, + { + "epoch": 2.5140106337117403, + "grad_norm": 0.45733470068768756, + "learning_rate": 4.530248479045949e-06, + "loss": 0.4504, + "step": 15308 + }, + { + "epoch": 2.514174860920083, + "grad_norm": 0.3232055665647335, + "learning_rate": 4.529779517278533e-06, + "loss": 0.4336, + "step": 15309 + }, + { + "epoch": 2.5143390881284255, + "grad_norm": 0.32595716015001, + "learning_rate": 4.529310555187681e-06, + "loss": 0.4239, + "step": 15310 + }, + { + "epoch": 2.5145033153367686, + "grad_norm": 0.32457344849632785, + "learning_rate": 4.528841592778489e-06, + "loss": 0.4606, + "step": 15311 + }, + { + "epoch": 2.5146675425451113, + "grad_norm": 0.4409322797051764, + "learning_rate": 4.528372630056049e-06, + "loss": 0.4443, + "step": 15312 + }, + { + "epoch": 2.514831769753454, + "grad_norm": 0.3427318010419413, + "learning_rate": 4.527903667025455e-06, + "loss": 0.4069, + "step": 15313 + }, + { + "epoch": 2.5149959969617965, + "grad_norm": 0.3094199668034755, + "learning_rate": 4.527434703691799e-06, + "loss": 0.4165, + "step": 15314 + }, + { + "epoch": 2.515160224170139, + "grad_norm": 0.3129531860016772, + "learning_rate": 4.526965740060174e-06, + "loss": 0.4453, + "step": 15315 + }, + { + "epoch": 2.5153244513784823, + "grad_norm": 0.36141679423305373, + "learning_rate": 4.526496776135675e-06, + "loss": 0.458, + "step": 15316 + }, + { + "epoch": 2.515488678586825, + "grad_norm": 0.32103349571056183, + "learning_rate": 4.526027811923398e-06, + "loss": 0.4344, + "step": 15317 + }, + { + "epoch": 2.5156529057951675, + "grad_norm": 0.35520418639316653, + "learning_rate": 4.52555884742843e-06, + "loss": 0.4542, + "step": 15318 + }, + { + "epoch": 2.5158171330035106, + "grad_norm": 0.38140314268757286, + "learning_rate": 4.525089882655868e-06, + "loss": 0.4279, + "step": 15319 + }, + { + "epoch": 2.5159813602118533, + "grad_norm": 0.3587596025920061, + "learning_rate": 4.524620917610805e-06, + "loss": 0.4487, + "step": 15320 + }, + { + "epoch": 2.516145587420196, + "grad_norm": 0.31918676746353725, + "learning_rate": 4.524151952298336e-06, + "loss": 0.4269, + "step": 15321 + }, + { + "epoch": 2.5163098146285385, + "grad_norm": 0.3006162767702321, + "learning_rate": 4.523682986723553e-06, + "loss": 0.4487, + "step": 15322 + }, + { + "epoch": 2.516474041836881, + "grad_norm": 0.35412734411765195, + "learning_rate": 4.523214020891549e-06, + "loss": 0.4191, + "step": 15323 + }, + { + "epoch": 2.516638269045224, + "grad_norm": 0.5091874206900988, + "learning_rate": 4.5227450548074165e-06, + "loss": 0.4503, + "step": 15324 + }, + { + "epoch": 2.516802496253567, + "grad_norm": 0.2851578874900258, + "learning_rate": 4.522276088476253e-06, + "loss": 0.4352, + "step": 15325 + }, + { + "epoch": 2.5169667234619095, + "grad_norm": 0.9997601491535643, + "learning_rate": 4.5218071219031476e-06, + "loss": 0.4468, + "step": 15326 + }, + { + "epoch": 2.517130950670252, + "grad_norm": 0.29348842306959855, + "learning_rate": 4.5213381550931955e-06, + "loss": 0.4525, + "step": 15327 + }, + { + "epoch": 2.5172951778785952, + "grad_norm": 0.2674552211461208, + "learning_rate": 4.52086918805149e-06, + "loss": 0.4416, + "step": 15328 + }, + { + "epoch": 2.517459405086938, + "grad_norm": 0.2949016922161546, + "learning_rate": 4.5204002207831255e-06, + "loss": 0.4262, + "step": 15329 + }, + { + "epoch": 2.5176236322952805, + "grad_norm": 0.32887885088298996, + "learning_rate": 4.519931253293194e-06, + "loss": 0.4439, + "step": 15330 + }, + { + "epoch": 2.517787859503623, + "grad_norm": 0.44021383142814, + "learning_rate": 4.519462285586789e-06, + "loss": 0.444, + "step": 15331 + }, + { + "epoch": 2.517952086711966, + "grad_norm": 0.3943727299935503, + "learning_rate": 4.518993317669005e-06, + "loss": 0.4211, + "step": 15332 + }, + { + "epoch": 2.518116313920309, + "grad_norm": 0.46339240270561693, + "learning_rate": 4.5185243495449346e-06, + "loss": 0.4441, + "step": 15333 + }, + { + "epoch": 2.5182805411286515, + "grad_norm": 0.2996537058649584, + "learning_rate": 4.518055381219671e-06, + "loss": 0.4177, + "step": 15334 + }, + { + "epoch": 2.518444768336994, + "grad_norm": 0.38842685502754193, + "learning_rate": 4.517586412698308e-06, + "loss": 0.4356, + "step": 15335 + }, + { + "epoch": 2.5186089955453372, + "grad_norm": 0.45335596462941113, + "learning_rate": 4.517117443985942e-06, + "loss": 0.4299, + "step": 15336 + }, + { + "epoch": 2.51877322275368, + "grad_norm": 0.3111046050312757, + "learning_rate": 4.516648475087662e-06, + "loss": 0.4326, + "step": 15337 + }, + { + "epoch": 2.5189374499620225, + "grad_norm": 0.3004520651914907, + "learning_rate": 4.516179506008563e-06, + "loss": 0.4291, + "step": 15338 + }, + { + "epoch": 2.519101677170365, + "grad_norm": 0.30623244552605644, + "learning_rate": 4.515710536753737e-06, + "loss": 0.4452, + "step": 15339 + }, + { + "epoch": 2.519265904378708, + "grad_norm": 0.29111646298893507, + "learning_rate": 4.515241567328281e-06, + "loss": 0.4212, + "step": 15340 + }, + { + "epoch": 2.5194301315870504, + "grad_norm": 0.292066653111252, + "learning_rate": 4.514772597737286e-06, + "loss": 0.4405, + "step": 15341 + }, + { + "epoch": 2.5195943587953935, + "grad_norm": 0.45941378411880673, + "learning_rate": 4.514303627985848e-06, + "loss": 0.4301, + "step": 15342 + }, + { + "epoch": 2.519758586003736, + "grad_norm": 0.36884625842137847, + "learning_rate": 4.513834658079056e-06, + "loss": 0.4311, + "step": 15343 + }, + { + "epoch": 2.519922813212079, + "grad_norm": 0.3267743781596869, + "learning_rate": 4.513365688022006e-06, + "loss": 0.4263, + "step": 15344 + }, + { + "epoch": 2.520087040420422, + "grad_norm": 0.4746571514823164, + "learning_rate": 4.512896717819792e-06, + "loss": 0.4373, + "step": 15345 + }, + { + "epoch": 2.5202512676287645, + "grad_norm": 0.3153602181102855, + "learning_rate": 4.512427747477508e-06, + "loss": 0.4146, + "step": 15346 + }, + { + "epoch": 2.520415494837107, + "grad_norm": 0.46102400221620937, + "learning_rate": 4.511958777000246e-06, + "loss": 0.4219, + "step": 15347 + }, + { + "epoch": 2.5205797220454498, + "grad_norm": 0.35545244939339854, + "learning_rate": 4.5114898063931e-06, + "loss": 0.4453, + "step": 15348 + }, + { + "epoch": 2.5207439492537924, + "grad_norm": 0.3778700125325818, + "learning_rate": 4.5110208356611625e-06, + "loss": 0.4593, + "step": 15349 + }, + { + "epoch": 2.5209081764621355, + "grad_norm": 0.43565446974417477, + "learning_rate": 4.510551864809529e-06, + "loss": 0.4214, + "step": 15350 + }, + { + "epoch": 2.521072403670478, + "grad_norm": 0.3153237572414692, + "learning_rate": 4.5100828938432915e-06, + "loss": 0.4309, + "step": 15351 + }, + { + "epoch": 2.5212366308788208, + "grad_norm": 0.3206707578049655, + "learning_rate": 4.509613922767543e-06, + "loss": 0.4325, + "step": 15352 + }, + { + "epoch": 2.521400858087164, + "grad_norm": 0.2925015730081957, + "learning_rate": 4.509144951587378e-06, + "loss": 0.4275, + "step": 15353 + }, + { + "epoch": 2.5215650852955065, + "grad_norm": 0.29391592396077754, + "learning_rate": 4.508675980307891e-06, + "loss": 0.4336, + "step": 15354 + }, + { + "epoch": 2.521729312503849, + "grad_norm": 0.3576499616385502, + "learning_rate": 4.508207008934173e-06, + "loss": 0.4186, + "step": 15355 + }, + { + "epoch": 2.5218935397121918, + "grad_norm": 0.2790704744061716, + "learning_rate": 4.50773803747132e-06, + "loss": 0.4364, + "step": 15356 + }, + { + "epoch": 2.5220577669205344, + "grad_norm": 0.2741415789587045, + "learning_rate": 4.507269065924424e-06, + "loss": 0.419, + "step": 15357 + }, + { + "epoch": 2.522221994128877, + "grad_norm": 0.31922742530373754, + "learning_rate": 4.506800094298579e-06, + "loss": 0.426, + "step": 15358 + }, + { + "epoch": 2.52238622133722, + "grad_norm": 0.30382675560648514, + "learning_rate": 4.506331122598877e-06, + "loss": 0.4343, + "step": 15359 + }, + { + "epoch": 2.5225504485455628, + "grad_norm": 0.37896547303248035, + "learning_rate": 4.505862150830413e-06, + "loss": 0.4466, + "step": 15360 + }, + { + "epoch": 2.5227146757539054, + "grad_norm": 0.3183774699805361, + "learning_rate": 4.505393178998282e-06, + "loss": 0.4527, + "step": 15361 + }, + { + "epoch": 2.5228789029622485, + "grad_norm": 0.2846751786737191, + "learning_rate": 4.504924207107573e-06, + "loss": 0.4336, + "step": 15362 + }, + { + "epoch": 2.523043130170591, + "grad_norm": 0.3478034808054333, + "learning_rate": 4.504455235163385e-06, + "loss": 0.4453, + "step": 15363 + }, + { + "epoch": 2.5232073573789338, + "grad_norm": 0.2946001247345606, + "learning_rate": 4.503986263170807e-06, + "loss": 0.4382, + "step": 15364 + }, + { + "epoch": 2.5233715845872764, + "grad_norm": 0.28990216086748805, + "learning_rate": 4.503517291134936e-06, + "loss": 0.4231, + "step": 15365 + }, + { + "epoch": 2.523535811795619, + "grad_norm": 0.3804646638287557, + "learning_rate": 4.503048319060862e-06, + "loss": 0.4286, + "step": 15366 + }, + { + "epoch": 2.523700039003962, + "grad_norm": 2.0413952323858604, + "learning_rate": 4.502579346953682e-06, + "loss": 0.445, + "step": 15367 + }, + { + "epoch": 2.5238642662123048, + "grad_norm": 0.5064912268763407, + "learning_rate": 4.5021103748184865e-06, + "loss": 0.4368, + "step": 15368 + }, + { + "epoch": 2.5240284934206474, + "grad_norm": 0.3052118739719438, + "learning_rate": 4.50164140266037e-06, + "loss": 0.4349, + "step": 15369 + }, + { + "epoch": 2.5241927206289905, + "grad_norm": 0.3198249052491719, + "learning_rate": 4.501172430484426e-06, + "loss": 0.4264, + "step": 15370 + }, + { + "epoch": 2.524356947837333, + "grad_norm": 0.5187552386652747, + "learning_rate": 4.5007034582957505e-06, + "loss": 0.4362, + "step": 15371 + }, + { + "epoch": 2.5245211750456757, + "grad_norm": 0.2570604115758852, + "learning_rate": 4.500234486099433e-06, + "loss": 0.4435, + "step": 15372 + }, + { + "epoch": 2.5246854022540184, + "grad_norm": 0.27968517151321504, + "learning_rate": 4.499765513900568e-06, + "loss": 0.4406, + "step": 15373 + }, + { + "epoch": 2.524849629462361, + "grad_norm": 0.33267787718052455, + "learning_rate": 4.499296541704251e-06, + "loss": 0.4247, + "step": 15374 + }, + { + "epoch": 2.5250138566707037, + "grad_norm": 0.3596710570956569, + "learning_rate": 4.498827569515574e-06, + "loss": 0.4202, + "step": 15375 + }, + { + "epoch": 2.5251780838790467, + "grad_norm": 0.3257269029831687, + "learning_rate": 4.49835859733963e-06, + "loss": 0.4417, + "step": 15376 + }, + { + "epoch": 2.5253423110873894, + "grad_norm": 0.4355993030836781, + "learning_rate": 4.4978896251815155e-06, + "loss": 0.4288, + "step": 15377 + }, + { + "epoch": 2.525506538295732, + "grad_norm": 0.32559613958151995, + "learning_rate": 4.49742065304632e-06, + "loss": 0.4051, + "step": 15378 + }, + { + "epoch": 2.525670765504075, + "grad_norm": 0.35226942817405693, + "learning_rate": 4.496951680939139e-06, + "loss": 0.4098, + "step": 15379 + }, + { + "epoch": 2.5258349927124177, + "grad_norm": 0.31375777585851555, + "learning_rate": 4.496482708865065e-06, + "loss": 0.4481, + "step": 15380 + }, + { + "epoch": 2.5259992199207604, + "grad_norm": 0.27628561903807153, + "learning_rate": 4.496013736829193e-06, + "loss": 0.4393, + "step": 15381 + }, + { + "epoch": 2.526163447129103, + "grad_norm": 0.3308393282880393, + "learning_rate": 4.495544764836616e-06, + "loss": 0.4384, + "step": 15382 + }, + { + "epoch": 2.5263276743374457, + "grad_norm": 0.3388520225022834, + "learning_rate": 4.495075792892426e-06, + "loss": 0.4407, + "step": 15383 + }, + { + "epoch": 2.5264919015457887, + "grad_norm": 0.2992347879039684, + "learning_rate": 4.494606821001719e-06, + "loss": 0.4266, + "step": 15384 + }, + { + "epoch": 2.5266561287541314, + "grad_norm": 0.3280214871252539, + "learning_rate": 4.4941378491695864e-06, + "loss": 0.4408, + "step": 15385 + }, + { + "epoch": 2.526820355962474, + "grad_norm": 0.31297700953904556, + "learning_rate": 4.493668877401125e-06, + "loss": 0.4237, + "step": 15386 + }, + { + "epoch": 2.526984583170817, + "grad_norm": 0.3584916739969418, + "learning_rate": 4.493199905701423e-06, + "loss": 0.4183, + "step": 15387 + }, + { + "epoch": 2.5271488103791597, + "grad_norm": 0.26870273735724437, + "learning_rate": 4.492730934075577e-06, + "loss": 0.4187, + "step": 15388 + }, + { + "epoch": 2.5273130375875024, + "grad_norm": 0.3480358281774671, + "learning_rate": 4.492261962528681e-06, + "loss": 0.4448, + "step": 15389 + }, + { + "epoch": 2.527477264795845, + "grad_norm": 0.3162081780616614, + "learning_rate": 4.491792991065828e-06, + "loss": 0.4443, + "step": 15390 + }, + { + "epoch": 2.5276414920041876, + "grad_norm": 0.2870245734500193, + "learning_rate": 4.49132401969211e-06, + "loss": 0.4417, + "step": 15391 + }, + { + "epoch": 2.5278057192125303, + "grad_norm": 0.2879612790355667, + "learning_rate": 4.490855048412621e-06, + "loss": 0.4515, + "step": 15392 + }, + { + "epoch": 2.5279699464208734, + "grad_norm": 0.3525720241438474, + "learning_rate": 4.490386077232457e-06, + "loss": 0.4362, + "step": 15393 + }, + { + "epoch": 2.528134173629216, + "grad_norm": 0.34886617957552496, + "learning_rate": 4.48991710615671e-06, + "loss": 0.4293, + "step": 15394 + }, + { + "epoch": 2.5282984008375586, + "grad_norm": 0.33907897924004743, + "learning_rate": 4.489448135190472e-06, + "loss": 0.4473, + "step": 15395 + }, + { + "epoch": 2.5284626280459017, + "grad_norm": 0.3035129464459638, + "learning_rate": 4.4889791643388385e-06, + "loss": 0.4349, + "step": 15396 + }, + { + "epoch": 2.5286268552542444, + "grad_norm": 0.30303534555734757, + "learning_rate": 4.488510193606901e-06, + "loss": 0.4357, + "step": 15397 + }, + { + "epoch": 2.528791082462587, + "grad_norm": 0.790153063740717, + "learning_rate": 4.4880412229997546e-06, + "loss": 0.427, + "step": 15398 + }, + { + "epoch": 2.5289553096709296, + "grad_norm": 0.3511038371128071, + "learning_rate": 4.487572252522493e-06, + "loss": 0.4357, + "step": 15399 + }, + { + "epoch": 2.5291195368792723, + "grad_norm": 0.41092611732691453, + "learning_rate": 4.4871032821802076e-06, + "loss": 0.438, + "step": 15400 + }, + { + "epoch": 2.5292837640876153, + "grad_norm": 0.3450527596392091, + "learning_rate": 4.4866343119779936e-06, + "loss": 0.456, + "step": 15401 + }, + { + "epoch": 2.529447991295958, + "grad_norm": 0.31325138329002983, + "learning_rate": 4.486165341920946e-06, + "loss": 0.4486, + "step": 15402 + }, + { + "epoch": 2.5296122185043006, + "grad_norm": 0.4049086399645973, + "learning_rate": 4.485696372014154e-06, + "loss": 0.4273, + "step": 15403 + }, + { + "epoch": 2.5297764457126437, + "grad_norm": 0.3570244695616439, + "learning_rate": 4.485227402262715e-06, + "loss": 0.4435, + "step": 15404 + }, + { + "epoch": 2.5299406729209863, + "grad_norm": 0.26325068167244875, + "learning_rate": 4.48475843267172e-06, + "loss": 0.4536, + "step": 15405 + }, + { + "epoch": 2.530104900129329, + "grad_norm": 0.4771733843902166, + "learning_rate": 4.484289463246263e-06, + "loss": 0.4402, + "step": 15406 + }, + { + "epoch": 2.5302691273376716, + "grad_norm": 0.5682448574100989, + "learning_rate": 4.483820493991438e-06, + "loss": 0.4431, + "step": 15407 + }, + { + "epoch": 2.5304333545460143, + "grad_norm": 0.4079954803323253, + "learning_rate": 4.483351524912339e-06, + "loss": 0.4283, + "step": 15408 + }, + { + "epoch": 2.530597581754357, + "grad_norm": 0.3742329415968252, + "learning_rate": 4.48288255601406e-06, + "loss": 0.4355, + "step": 15409 + }, + { + "epoch": 2.5307618089627, + "grad_norm": 0.31517860625091276, + "learning_rate": 4.482413587301691e-06, + "loss": 0.4219, + "step": 15410 + }, + { + "epoch": 2.5309260361710426, + "grad_norm": 0.32624001883317155, + "learning_rate": 4.48194461878033e-06, + "loss": 0.4355, + "step": 15411 + }, + { + "epoch": 2.5310902633793853, + "grad_norm": 0.4844251612004818, + "learning_rate": 4.481475650455067e-06, + "loss": 0.4177, + "step": 15412 + }, + { + "epoch": 2.5312544905877283, + "grad_norm": 0.41499405644931575, + "learning_rate": 4.481006682330996e-06, + "loss": 0.4381, + "step": 15413 + }, + { + "epoch": 2.531418717796071, + "grad_norm": 0.3254658251219828, + "learning_rate": 4.480537714413212e-06, + "loss": 0.4417, + "step": 15414 + }, + { + "epoch": 2.5315829450044136, + "grad_norm": 0.28061612293188354, + "learning_rate": 4.480068746706807e-06, + "loss": 0.4429, + "step": 15415 + }, + { + "epoch": 2.5317471722127562, + "grad_norm": 0.29524776058023783, + "learning_rate": 4.479599779216875e-06, + "loss": 0.436, + "step": 15416 + }, + { + "epoch": 2.531911399421099, + "grad_norm": 0.275370031459336, + "learning_rate": 4.47913081194851e-06, + "loss": 0.4402, + "step": 15417 + }, + { + "epoch": 2.532075626629442, + "grad_norm": 0.382911694822015, + "learning_rate": 4.478661844906805e-06, + "loss": 0.4273, + "step": 15418 + }, + { + "epoch": 2.5322398538377846, + "grad_norm": 0.2856386139248631, + "learning_rate": 4.4781928780968535e-06, + "loss": 0.4315, + "step": 15419 + }, + { + "epoch": 2.5324040810461272, + "grad_norm": 0.3662003334417495, + "learning_rate": 4.477723911523749e-06, + "loss": 0.4377, + "step": 15420 + }, + { + "epoch": 2.5325683082544703, + "grad_norm": 0.40571562617701123, + "learning_rate": 4.477254945192584e-06, + "loss": 0.4411, + "step": 15421 + }, + { + "epoch": 2.532732535462813, + "grad_norm": 0.33072718734849793, + "learning_rate": 4.4767859791084525e-06, + "loss": 0.4123, + "step": 15422 + }, + { + "epoch": 2.5328967626711556, + "grad_norm": 0.31030981069280167, + "learning_rate": 4.4763170132764474e-06, + "loss": 0.4415, + "step": 15423 + }, + { + "epoch": 2.5330609898794982, + "grad_norm": 0.3393961988335576, + "learning_rate": 4.475848047701664e-06, + "loss": 0.4484, + "step": 15424 + }, + { + "epoch": 2.533225217087841, + "grad_norm": 0.3355243579966591, + "learning_rate": 4.475379082389194e-06, + "loss": 0.4345, + "step": 15425 + }, + { + "epoch": 2.5333894442961835, + "grad_norm": 0.2739009867205716, + "learning_rate": 4.474910117344132e-06, + "loss": 0.4299, + "step": 15426 + }, + { + "epoch": 2.5335536715045266, + "grad_norm": 0.30376055594266194, + "learning_rate": 4.474441152571572e-06, + "loss": 0.4146, + "step": 15427 + }, + { + "epoch": 2.5337178987128692, + "grad_norm": 0.3275017172400074, + "learning_rate": 4.473972188076604e-06, + "loss": 0.421, + "step": 15428 + }, + { + "epoch": 2.533882125921212, + "grad_norm": 0.32453479680479597, + "learning_rate": 4.473503223864325e-06, + "loss": 0.4396, + "step": 15429 + }, + { + "epoch": 2.534046353129555, + "grad_norm": 0.48443152017160745, + "learning_rate": 4.473034259939826e-06, + "loss": 0.4342, + "step": 15430 + }, + { + "epoch": 2.5342105803378976, + "grad_norm": 0.30446815466227956, + "learning_rate": 4.472565296308202e-06, + "loss": 0.4375, + "step": 15431 + }, + { + "epoch": 2.5343748075462402, + "grad_norm": 0.28200424035613053, + "learning_rate": 4.472096332974545e-06, + "loss": 0.4148, + "step": 15432 + }, + { + "epoch": 2.534539034754583, + "grad_norm": 0.27325398958681874, + "learning_rate": 4.4716273699439515e-06, + "loss": 0.4212, + "step": 15433 + }, + { + "epoch": 2.5347032619629255, + "grad_norm": 0.2784740527403517, + "learning_rate": 4.471158407221511e-06, + "loss": 0.4329, + "step": 15434 + }, + { + "epoch": 2.5348674891712686, + "grad_norm": 0.33152751718058054, + "learning_rate": 4.470689444812318e-06, + "loss": 0.4278, + "step": 15435 + }, + { + "epoch": 2.535031716379611, + "grad_norm": 0.3668173661236447, + "learning_rate": 4.470220482721469e-06, + "loss": 0.4222, + "step": 15436 + }, + { + "epoch": 2.535195943587954, + "grad_norm": 0.30285348260361367, + "learning_rate": 4.469751520954053e-06, + "loss": 0.4429, + "step": 15437 + }, + { + "epoch": 2.535360170796297, + "grad_norm": 0.3455367713469772, + "learning_rate": 4.469282559515165e-06, + "loss": 0.4236, + "step": 15438 + }, + { + "epoch": 2.5355243980046396, + "grad_norm": 0.2989934245140012, + "learning_rate": 4.4688135984098994e-06, + "loss": 0.4492, + "step": 15439 + }, + { + "epoch": 2.535688625212982, + "grad_norm": 0.5199522243515112, + "learning_rate": 4.468344637643349e-06, + "loss": 0.4461, + "step": 15440 + }, + { + "epoch": 2.535852852421325, + "grad_norm": 0.3047139296928439, + "learning_rate": 4.467875677220605e-06, + "loss": 0.456, + "step": 15441 + }, + { + "epoch": 2.5360170796296675, + "grad_norm": 0.4383952993691737, + "learning_rate": 4.467406717146764e-06, + "loss": 0.4231, + "step": 15442 + }, + { + "epoch": 2.53618130683801, + "grad_norm": 0.29397560831634006, + "learning_rate": 4.466937757426919e-06, + "loss": 0.4269, + "step": 15443 + }, + { + "epoch": 2.536345534046353, + "grad_norm": 0.38538600677786994, + "learning_rate": 4.4664687980661635e-06, + "loss": 0.4527, + "step": 15444 + }, + { + "epoch": 2.536509761254696, + "grad_norm": 0.374965751531902, + "learning_rate": 4.465999839069588e-06, + "loss": 0.4519, + "step": 15445 + }, + { + "epoch": 2.5366739884630385, + "grad_norm": 0.3716477346667442, + "learning_rate": 4.465530880442287e-06, + "loss": 0.4409, + "step": 15446 + }, + { + "epoch": 2.5368382156713816, + "grad_norm": 1.7589956201460324, + "learning_rate": 4.4650619221893555e-06, + "loss": 0.431, + "step": 15447 + }, + { + "epoch": 2.537002442879724, + "grad_norm": 0.36684972649658787, + "learning_rate": 4.464592964315886e-06, + "loss": 0.4427, + "step": 15448 + }, + { + "epoch": 2.537166670088067, + "grad_norm": 0.33423643193368346, + "learning_rate": 4.464124006826971e-06, + "loss": 0.4296, + "step": 15449 + }, + { + "epoch": 2.5373308972964095, + "grad_norm": 0.3044931364167006, + "learning_rate": 4.463655049727705e-06, + "loss": 0.4352, + "step": 15450 + }, + { + "epoch": 2.537495124504752, + "grad_norm": 0.30416152178632283, + "learning_rate": 4.463186093023181e-06, + "loss": 0.437, + "step": 15451 + }, + { + "epoch": 2.537659351713095, + "grad_norm": 0.4942313735733957, + "learning_rate": 4.462717136718494e-06, + "loss": 0.4555, + "step": 15452 + }, + { + "epoch": 2.537823578921438, + "grad_norm": 0.3510260293567135, + "learning_rate": 4.462248180818733e-06, + "loss": 0.4535, + "step": 15453 + }, + { + "epoch": 2.5379878061297805, + "grad_norm": 0.3444097092072025, + "learning_rate": 4.461779225328995e-06, + "loss": 0.4235, + "step": 15454 + }, + { + "epoch": 2.5381520333381236, + "grad_norm": 0.3186023690214023, + "learning_rate": 4.461310270254372e-06, + "loss": 0.4269, + "step": 15455 + }, + { + "epoch": 2.538316260546466, + "grad_norm": 0.28283435452405786, + "learning_rate": 4.4608413155999574e-06, + "loss": 0.4461, + "step": 15456 + }, + { + "epoch": 2.538480487754809, + "grad_norm": 0.2835693384007139, + "learning_rate": 4.460372361370844e-06, + "loss": 0.4179, + "step": 15457 + }, + { + "epoch": 2.5386447149631515, + "grad_norm": 0.469588775510588, + "learning_rate": 4.459903407572127e-06, + "loss": 0.4278, + "step": 15458 + }, + { + "epoch": 2.538808942171494, + "grad_norm": 0.31968267198673356, + "learning_rate": 4.459434454208898e-06, + "loss": 0.4319, + "step": 15459 + }, + { + "epoch": 2.5389731693798367, + "grad_norm": 0.3139217810294942, + "learning_rate": 4.45896550128625e-06, + "loss": 0.4486, + "step": 15460 + }, + { + "epoch": 2.53913739658818, + "grad_norm": 0.6289982528503534, + "learning_rate": 4.458496548809279e-06, + "loss": 0.4383, + "step": 15461 + }, + { + "epoch": 2.5393016237965225, + "grad_norm": 0.43248669042514953, + "learning_rate": 4.458027596783075e-06, + "loss": 0.4298, + "step": 15462 + }, + { + "epoch": 2.539465851004865, + "grad_norm": 0.32175232275524884, + "learning_rate": 4.457558645212733e-06, + "loss": 0.4305, + "step": 15463 + }, + { + "epoch": 2.539630078213208, + "grad_norm": 0.33913465217808964, + "learning_rate": 4.457089694103345e-06, + "loss": 0.4423, + "step": 15464 + }, + { + "epoch": 2.539794305421551, + "grad_norm": 0.4217029864914242, + "learning_rate": 4.456620743460005e-06, + "loss": 0.4327, + "step": 15465 + }, + { + "epoch": 2.5399585326298935, + "grad_norm": 0.32107328028587867, + "learning_rate": 4.456151793287807e-06, + "loss": 0.4595, + "step": 15466 + }, + { + "epoch": 2.540122759838236, + "grad_norm": 0.4056818357852612, + "learning_rate": 4.455682843591845e-06, + "loss": 0.4314, + "step": 15467 + }, + { + "epoch": 2.5402869870465787, + "grad_norm": 0.3285580483437489, + "learning_rate": 4.455213894377208e-06, + "loss": 0.4394, + "step": 15468 + }, + { + "epoch": 2.540451214254922, + "grad_norm": 0.4463962845057985, + "learning_rate": 4.454744945648996e-06, + "loss": 0.4365, + "step": 15469 + }, + { + "epoch": 2.5406154414632645, + "grad_norm": 0.5759978564993642, + "learning_rate": 4.454275997412296e-06, + "loss": 0.4314, + "step": 15470 + }, + { + "epoch": 2.540779668671607, + "grad_norm": 0.30311744414915, + "learning_rate": 4.453807049672203e-06, + "loss": 0.4443, + "step": 15471 + }, + { + "epoch": 2.54094389587995, + "grad_norm": 0.48319619600914393, + "learning_rate": 4.453338102433812e-06, + "loss": 0.4312, + "step": 15472 + }, + { + "epoch": 2.541108123088293, + "grad_norm": 0.27961971412030734, + "learning_rate": 4.452869155702215e-06, + "loss": 0.439, + "step": 15473 + }, + { + "epoch": 2.5412723502966355, + "grad_norm": 0.33938593418946583, + "learning_rate": 4.452400209482505e-06, + "loss": 0.419, + "step": 15474 + }, + { + "epoch": 2.541436577504978, + "grad_norm": 0.31723679723348247, + "learning_rate": 4.451931263779775e-06, + "loss": 0.42, + "step": 15475 + }, + { + "epoch": 2.5416008047133207, + "grad_norm": 0.3900582478708769, + "learning_rate": 4.4514623185991195e-06, + "loss": 0.4435, + "step": 15476 + }, + { + "epoch": 2.5417650319216634, + "grad_norm": 0.3161165257957074, + "learning_rate": 4.4509933739456325e-06, + "loss": 0.4175, + "step": 15477 + }, + { + "epoch": 2.5419292591300064, + "grad_norm": 0.3988098521207848, + "learning_rate": 4.450524429824405e-06, + "loss": 0.4453, + "step": 15478 + }, + { + "epoch": 2.542093486338349, + "grad_norm": 0.32174868195541384, + "learning_rate": 4.45005548624053e-06, + "loss": 0.4372, + "step": 15479 + }, + { + "epoch": 2.5422577135466917, + "grad_norm": 0.3045389565255562, + "learning_rate": 4.449586543199101e-06, + "loss": 0.4247, + "step": 15480 + }, + { + "epoch": 2.542421940755035, + "grad_norm": 0.2908792710329188, + "learning_rate": 4.4491176007052115e-06, + "loss": 0.4271, + "step": 15481 + }, + { + "epoch": 2.5425861679633774, + "grad_norm": 0.29144215488845143, + "learning_rate": 4.448648658763957e-06, + "loss": 0.4374, + "step": 15482 + }, + { + "epoch": 2.54275039517172, + "grad_norm": 0.3977821028455875, + "learning_rate": 4.448179717380426e-06, + "loss": 0.4574, + "step": 15483 + }, + { + "epoch": 2.5429146223800627, + "grad_norm": 0.33925302540685487, + "learning_rate": 4.447710776559716e-06, + "loss": 0.4403, + "step": 15484 + }, + { + "epoch": 2.5430788495884054, + "grad_norm": 0.38977415792187947, + "learning_rate": 4.447241836306917e-06, + "loss": 0.4484, + "step": 15485 + }, + { + "epoch": 2.5432430767967484, + "grad_norm": 0.3444752824234094, + "learning_rate": 4.4467728966271265e-06, + "loss": 0.4188, + "step": 15486 + }, + { + "epoch": 2.543407304005091, + "grad_norm": 0.29387542378527076, + "learning_rate": 4.446303957525432e-06, + "loss": 0.4551, + "step": 15487 + }, + { + "epoch": 2.5435715312134337, + "grad_norm": 0.3891893197024633, + "learning_rate": 4.445835019006931e-06, + "loss": 0.4435, + "step": 15488 + }, + { + "epoch": 2.543735758421777, + "grad_norm": 0.5432113733163281, + "learning_rate": 4.445366081076714e-06, + "loss": 0.4286, + "step": 15489 + }, + { + "epoch": 2.5438999856301194, + "grad_norm": 0.30458472444959955, + "learning_rate": 4.444897143739875e-06, + "loss": 0.4291, + "step": 15490 + }, + { + "epoch": 2.544064212838462, + "grad_norm": 0.33335443953907484, + "learning_rate": 4.444428207001507e-06, + "loss": 0.4427, + "step": 15491 + }, + { + "epoch": 2.5442284400468047, + "grad_norm": 0.3114285780540954, + "learning_rate": 4.4439592708667044e-06, + "loss": 0.4315, + "step": 15492 + }, + { + "epoch": 2.5443926672551473, + "grad_norm": 0.33859893153267717, + "learning_rate": 4.443490335340558e-06, + "loss": 0.4292, + "step": 15493 + }, + { + "epoch": 2.54455689446349, + "grad_norm": 0.34687445804031664, + "learning_rate": 4.443021400428164e-06, + "loss": 0.4318, + "step": 15494 + }, + { + "epoch": 2.544721121671833, + "grad_norm": 0.2773802250144603, + "learning_rate": 4.442552466134613e-06, + "loss": 0.4369, + "step": 15495 + }, + { + "epoch": 2.5448853488801757, + "grad_norm": 0.29535434743502614, + "learning_rate": 4.4420835324649976e-06, + "loss": 0.4346, + "step": 15496 + }, + { + "epoch": 2.5450495760885183, + "grad_norm": 0.9360580914134653, + "learning_rate": 4.441614599424413e-06, + "loss": 0.4517, + "step": 15497 + }, + { + "epoch": 2.5452138032968614, + "grad_norm": 0.7513840159695082, + "learning_rate": 4.441145667017951e-06, + "loss": 0.4466, + "step": 15498 + }, + { + "epoch": 2.545378030505204, + "grad_norm": 0.34400842175373336, + "learning_rate": 4.4406767352507045e-06, + "loss": 0.4268, + "step": 15499 + }, + { + "epoch": 2.5455422577135467, + "grad_norm": 0.36609020722625096, + "learning_rate": 4.440207804127767e-06, + "loss": 0.4322, + "step": 15500 + }, + { + "epoch": 2.5457064849218893, + "grad_norm": 0.30316434981498774, + "learning_rate": 4.439738873654232e-06, + "loss": 0.4371, + "step": 15501 + }, + { + "epoch": 2.545870712130232, + "grad_norm": 0.2875496740502021, + "learning_rate": 4.439269943835192e-06, + "loss": 0.4177, + "step": 15502 + }, + { + "epoch": 2.546034939338575, + "grad_norm": 0.3142325124623637, + "learning_rate": 4.438801014675742e-06, + "loss": 0.4359, + "step": 15503 + }, + { + "epoch": 2.5461991665469177, + "grad_norm": 0.3137356864008128, + "learning_rate": 4.43833208618097e-06, + "loss": 0.4371, + "step": 15504 + }, + { + "epoch": 2.5463633937552603, + "grad_norm": 0.32170461352243107, + "learning_rate": 4.437863158355975e-06, + "loss": 0.4447, + "step": 15505 + }, + { + "epoch": 2.5465276209636034, + "grad_norm": 0.3802915803414638, + "learning_rate": 4.437394231205845e-06, + "loss": 0.4345, + "step": 15506 + }, + { + "epoch": 2.546691848171946, + "grad_norm": 0.31857101888559824, + "learning_rate": 4.436925304735677e-06, + "loss": 0.4309, + "step": 15507 + }, + { + "epoch": 2.5468560753802887, + "grad_norm": 0.2584963933988361, + "learning_rate": 4.4364563789505604e-06, + "loss": 0.4443, + "step": 15508 + }, + { + "epoch": 2.5470203025886313, + "grad_norm": 0.4038773729499395, + "learning_rate": 4.435987453855591e-06, + "loss": 0.4472, + "step": 15509 + }, + { + "epoch": 2.547184529796974, + "grad_norm": 0.31802899292435854, + "learning_rate": 4.43551852945586e-06, + "loss": 0.4401, + "step": 15510 + }, + { + "epoch": 2.5473487570053166, + "grad_norm": 0.3242892323716624, + "learning_rate": 4.435049605756464e-06, + "loss": 0.4332, + "step": 15511 + }, + { + "epoch": 2.5475129842136597, + "grad_norm": 0.26797913518694794, + "learning_rate": 4.434580682762491e-06, + "loss": 0.4278, + "step": 15512 + }, + { + "epoch": 2.5476772114220023, + "grad_norm": 0.44582865863931953, + "learning_rate": 4.434111760479037e-06, + "loss": 0.4533, + "step": 15513 + }, + { + "epoch": 2.547841438630345, + "grad_norm": 0.3055931723597788, + "learning_rate": 4.433642838911193e-06, + "loss": 0.4502, + "step": 15514 + }, + { + "epoch": 2.548005665838688, + "grad_norm": 0.3085446507289279, + "learning_rate": 4.433173918064053e-06, + "loss": 0.4381, + "step": 15515 + }, + { + "epoch": 2.5481698930470307, + "grad_norm": 0.2799455160677543, + "learning_rate": 4.432704997942711e-06, + "loss": 0.4293, + "step": 15516 + }, + { + "epoch": 2.5483341202553733, + "grad_norm": 0.3072559176696745, + "learning_rate": 4.432236078552259e-06, + "loss": 0.4313, + "step": 15517 + }, + { + "epoch": 2.548498347463716, + "grad_norm": 0.2794966100213563, + "learning_rate": 4.4317671598977885e-06, + "loss": 0.4265, + "step": 15518 + }, + { + "epoch": 2.5486625746720586, + "grad_norm": 0.28997197483099796, + "learning_rate": 4.431298241984396e-06, + "loss": 0.4536, + "step": 15519 + }, + { + "epoch": 2.5488268018804017, + "grad_norm": 0.4573616100826798, + "learning_rate": 4.430829324817171e-06, + "loss": 0.4291, + "step": 15520 + }, + { + "epoch": 2.5489910290887443, + "grad_norm": 0.4438472979662701, + "learning_rate": 4.430360408401207e-06, + "loss": 0.4267, + "step": 15521 + }, + { + "epoch": 2.549155256297087, + "grad_norm": 0.2577708578255245, + "learning_rate": 4.429891492741598e-06, + "loss": 0.448, + "step": 15522 + }, + { + "epoch": 2.54931948350543, + "grad_norm": 0.32911337426606396, + "learning_rate": 4.429422577843436e-06, + "loss": 0.4351, + "step": 15523 + }, + { + "epoch": 2.5494837107137727, + "grad_norm": 0.3524767093716376, + "learning_rate": 4.428953663711814e-06, + "loss": 0.4165, + "step": 15524 + }, + { + "epoch": 2.5496479379221153, + "grad_norm": 0.29196546007234, + "learning_rate": 4.428484750351825e-06, + "loss": 0.4344, + "step": 15525 + }, + { + "epoch": 2.549812165130458, + "grad_norm": 0.298449530861429, + "learning_rate": 4.428015837768563e-06, + "loss": 0.4259, + "step": 15526 + }, + { + "epoch": 2.5499763923388006, + "grad_norm": 0.6148962359432069, + "learning_rate": 4.42754692596712e-06, + "loss": 0.4403, + "step": 15527 + }, + { + "epoch": 2.550140619547143, + "grad_norm": 0.3280661317620682, + "learning_rate": 4.42707801495259e-06, + "loss": 0.4275, + "step": 15528 + }, + { + "epoch": 2.5503048467554863, + "grad_norm": 0.2837424072375426, + "learning_rate": 4.426609104730062e-06, + "loss": 0.4427, + "step": 15529 + }, + { + "epoch": 2.550469073963829, + "grad_norm": 0.28041871680203856, + "learning_rate": 4.426140195304631e-06, + "loss": 0.4635, + "step": 15530 + }, + { + "epoch": 2.5506333011721716, + "grad_norm": 0.350200009979362, + "learning_rate": 4.425671286681392e-06, + "loss": 0.4343, + "step": 15531 + }, + { + "epoch": 2.5507975283805147, + "grad_norm": 0.32454220273887197, + "learning_rate": 4.425202378865435e-06, + "loss": 0.4614, + "step": 15532 + }, + { + "epoch": 2.5509617555888573, + "grad_norm": 0.5367213364709559, + "learning_rate": 4.424733471861853e-06, + "loss": 0.4101, + "step": 15533 + }, + { + "epoch": 2.5511259827972, + "grad_norm": 0.28676733476987987, + "learning_rate": 4.42426456567574e-06, + "loss": 0.4259, + "step": 15534 + }, + { + "epoch": 2.5512902100055426, + "grad_norm": 0.2975809840115441, + "learning_rate": 4.423795660312189e-06, + "loss": 0.4426, + "step": 15535 + }, + { + "epoch": 2.551454437213885, + "grad_norm": 0.3249328928419927, + "learning_rate": 4.423326755776292e-06, + "loss": 0.4473, + "step": 15536 + }, + { + "epoch": 2.5516186644222283, + "grad_norm": 0.31184122620432747, + "learning_rate": 4.422857852073143e-06, + "loss": 0.4104, + "step": 15537 + }, + { + "epoch": 2.551782891630571, + "grad_norm": 0.30635029912032957, + "learning_rate": 4.422388949207831e-06, + "loss": 0.4345, + "step": 15538 + }, + { + "epoch": 2.5519471188389136, + "grad_norm": 0.33025971138949717, + "learning_rate": 4.421920047185453e-06, + "loss": 0.4492, + "step": 15539 + }, + { + "epoch": 2.5521113460472566, + "grad_norm": 0.3181232184610691, + "learning_rate": 4.421451146011099e-06, + "loss": 0.4229, + "step": 15540 + }, + { + "epoch": 2.5522755732555993, + "grad_norm": 0.28306483682463085, + "learning_rate": 4.420982245689865e-06, + "loss": 0.4365, + "step": 15541 + }, + { + "epoch": 2.552439800463942, + "grad_norm": 0.34612517738918935, + "learning_rate": 4.420513346226839e-06, + "loss": 0.432, + "step": 15542 + }, + { + "epoch": 2.5526040276722846, + "grad_norm": 0.30498092076014316, + "learning_rate": 4.420044447627118e-06, + "loss": 0.4353, + "step": 15543 + }, + { + "epoch": 2.552768254880627, + "grad_norm": 0.3009496686032841, + "learning_rate": 4.419575549895793e-06, + "loss": 0.4316, + "step": 15544 + }, + { + "epoch": 2.55293248208897, + "grad_norm": 0.29409546874362075, + "learning_rate": 4.419106653037956e-06, + "loss": 0.4305, + "step": 15545 + }, + { + "epoch": 2.553096709297313, + "grad_norm": 0.30258945834332523, + "learning_rate": 4.418637757058701e-06, + "loss": 0.4276, + "step": 15546 + }, + { + "epoch": 2.5532609365056556, + "grad_norm": 0.3330461796000069, + "learning_rate": 4.418168861963119e-06, + "loss": 0.4454, + "step": 15547 + }, + { + "epoch": 2.553425163713998, + "grad_norm": 0.3405066873911021, + "learning_rate": 4.4176999677563035e-06, + "loss": 0.4283, + "step": 15548 + }, + { + "epoch": 2.5535893909223413, + "grad_norm": 0.39288368153964764, + "learning_rate": 4.417231074443347e-06, + "loss": 0.4311, + "step": 15549 + }, + { + "epoch": 2.553753618130684, + "grad_norm": 0.37034601558345265, + "learning_rate": 4.416762182029344e-06, + "loss": 0.432, + "step": 15550 + }, + { + "epoch": 2.5539178453390265, + "grad_norm": 0.38462584820097667, + "learning_rate": 4.416293290519385e-06, + "loss": 0.452, + "step": 15551 + }, + { + "epoch": 2.554082072547369, + "grad_norm": 0.42051310281220755, + "learning_rate": 4.415824399918563e-06, + "loss": 0.435, + "step": 15552 + }, + { + "epoch": 2.554246299755712, + "grad_norm": 0.3166037955740742, + "learning_rate": 4.4153555102319725e-06, + "loss": 0.413, + "step": 15553 + }, + { + "epoch": 2.554410526964055, + "grad_norm": 0.31043044492793076, + "learning_rate": 4.414886621464702e-06, + "loss": 0.4347, + "step": 15554 + }, + { + "epoch": 2.5545747541723975, + "grad_norm": 0.34700798655471937, + "learning_rate": 4.414417733621847e-06, + "loss": 0.4421, + "step": 15555 + }, + { + "epoch": 2.55473898138074, + "grad_norm": 0.2892813037690508, + "learning_rate": 4.4139488467085004e-06, + "loss": 0.4446, + "step": 15556 + }, + { + "epoch": 2.5549032085890833, + "grad_norm": 0.3175899138027976, + "learning_rate": 4.413479960729754e-06, + "loss": 0.4314, + "step": 15557 + }, + { + "epoch": 2.555067435797426, + "grad_norm": 0.3149807022034962, + "learning_rate": 4.413011075690699e-06, + "loss": 0.4423, + "step": 15558 + }, + { + "epoch": 2.5552316630057685, + "grad_norm": 0.5347116578772787, + "learning_rate": 4.41254219159643e-06, + "loss": 0.4459, + "step": 15559 + }, + { + "epoch": 2.555395890214111, + "grad_norm": 0.37843617764625054, + "learning_rate": 4.41207330845204e-06, + "loss": 0.4341, + "step": 15560 + }, + { + "epoch": 2.555560117422454, + "grad_norm": 0.2762679102433087, + "learning_rate": 4.411604426262621e-06, + "loss": 0.4058, + "step": 15561 + }, + { + "epoch": 2.5557243446307965, + "grad_norm": 0.30060861946870354, + "learning_rate": 4.411135545033263e-06, + "loss": 0.4443, + "step": 15562 + }, + { + "epoch": 2.5558885718391395, + "grad_norm": 0.3800865842812888, + "learning_rate": 4.41066666476906e-06, + "loss": 0.43, + "step": 15563 + }, + { + "epoch": 2.556052799047482, + "grad_norm": 0.25946722154162527, + "learning_rate": 4.4101977854751055e-06, + "loss": 0.4419, + "step": 15564 + }, + { + "epoch": 2.556217026255825, + "grad_norm": 0.6759836164016616, + "learning_rate": 4.409728907156493e-06, + "loss": 0.4274, + "step": 15565 + }, + { + "epoch": 2.556381253464168, + "grad_norm": 0.6234744045352152, + "learning_rate": 4.409260029818312e-06, + "loss": 0.4346, + "step": 15566 + }, + { + "epoch": 2.5565454806725105, + "grad_norm": 0.38581918418779537, + "learning_rate": 4.408791153465655e-06, + "loss": 0.4232, + "step": 15567 + }, + { + "epoch": 2.556709707880853, + "grad_norm": 0.29573823645262287, + "learning_rate": 4.408322278103617e-06, + "loss": 0.4193, + "step": 15568 + }, + { + "epoch": 2.556873935089196, + "grad_norm": 0.3016718414824482, + "learning_rate": 4.407853403737291e-06, + "loss": 0.4545, + "step": 15569 + }, + { + "epoch": 2.5570381622975384, + "grad_norm": 0.3891485274119814, + "learning_rate": 4.407384530371766e-06, + "loss": 0.4365, + "step": 15570 + }, + { + "epoch": 2.5572023895058815, + "grad_norm": 0.2532739379279318, + "learning_rate": 4.406915658012136e-06, + "loss": 0.4317, + "step": 15571 + }, + { + "epoch": 2.557366616714224, + "grad_norm": 0.29871548761299727, + "learning_rate": 4.406446786663494e-06, + "loss": 0.4403, + "step": 15572 + }, + { + "epoch": 2.557530843922567, + "grad_norm": 0.35194308191729523, + "learning_rate": 4.405977916330931e-06, + "loss": 0.4441, + "step": 15573 + }, + { + "epoch": 2.55769507113091, + "grad_norm": 0.3031216695491342, + "learning_rate": 4.405509047019541e-06, + "loss": 0.4154, + "step": 15574 + }, + { + "epoch": 2.5578592983392525, + "grad_norm": 0.31930845237851674, + "learning_rate": 4.4050401787344165e-06, + "loss": 0.4454, + "step": 15575 + }, + { + "epoch": 2.558023525547595, + "grad_norm": 0.33002408879010403, + "learning_rate": 4.404571311480648e-06, + "loss": 0.4434, + "step": 15576 + }, + { + "epoch": 2.558187752755938, + "grad_norm": 0.30275352248274623, + "learning_rate": 4.404102445263329e-06, + "loss": 0.4509, + "step": 15577 + }, + { + "epoch": 2.5583519799642804, + "grad_norm": 0.35335386445372424, + "learning_rate": 4.4036335800875535e-06, + "loss": 0.448, + "step": 15578 + }, + { + "epoch": 2.558516207172623, + "grad_norm": 0.2800989999633153, + "learning_rate": 4.403164715958411e-06, + "loss": 0.425, + "step": 15579 + }, + { + "epoch": 2.558680434380966, + "grad_norm": 0.3127782022895306, + "learning_rate": 4.402695852880995e-06, + "loss": 0.4137, + "step": 15580 + }, + { + "epoch": 2.558844661589309, + "grad_norm": 0.3799146716087815, + "learning_rate": 4.402226990860397e-06, + "loss": 0.4266, + "step": 15581 + }, + { + "epoch": 2.5590088887976514, + "grad_norm": 0.3673831939441783, + "learning_rate": 4.401758129901711e-06, + "loss": 0.4064, + "step": 15582 + }, + { + "epoch": 2.5591731160059945, + "grad_norm": 0.3376274379342438, + "learning_rate": 4.401289270010027e-06, + "loss": 0.4233, + "step": 15583 + }, + { + "epoch": 2.559337343214337, + "grad_norm": 0.3998414262198734, + "learning_rate": 4.40082041119044e-06, + "loss": 0.4275, + "step": 15584 + }, + { + "epoch": 2.55950157042268, + "grad_norm": 0.3520853575209431, + "learning_rate": 4.40035155344804e-06, + "loss": 0.4616, + "step": 15585 + }, + { + "epoch": 2.5596657976310224, + "grad_norm": 0.37561224309961333, + "learning_rate": 4.399882696787922e-06, + "loss": 0.4394, + "step": 15586 + }, + { + "epoch": 2.559830024839365, + "grad_norm": 0.36754339045918716, + "learning_rate": 4.399413841215175e-06, + "loss": 0.4575, + "step": 15587 + }, + { + "epoch": 2.559994252047708, + "grad_norm": 0.33783166500512124, + "learning_rate": 4.398944986734892e-06, + "loss": 0.4164, + "step": 15588 + }, + { + "epoch": 2.560158479256051, + "grad_norm": 0.31452477254199035, + "learning_rate": 4.398476133352167e-06, + "loss": 0.4528, + "step": 15589 + }, + { + "epoch": 2.5603227064643934, + "grad_norm": 0.2787399568684325, + "learning_rate": 4.398007281072091e-06, + "loss": 0.4209, + "step": 15590 + }, + { + "epoch": 2.5604869336727365, + "grad_norm": 0.2767623218217474, + "learning_rate": 4.397538429899756e-06, + "loss": 0.414, + "step": 15591 + }, + { + "epoch": 2.560651160881079, + "grad_norm": 0.28888844049323786, + "learning_rate": 4.397069579840253e-06, + "loss": 0.4542, + "step": 15592 + }, + { + "epoch": 2.5608153880894218, + "grad_norm": 0.5864957714875929, + "learning_rate": 4.396600730898677e-06, + "loss": 0.4243, + "step": 15593 + }, + { + "epoch": 2.5609796152977644, + "grad_norm": 0.38962154546595434, + "learning_rate": 4.396131883080121e-06, + "loss": 0.4387, + "step": 15594 + }, + { + "epoch": 2.561143842506107, + "grad_norm": 0.27331680844407696, + "learning_rate": 4.395663036389673e-06, + "loss": 0.4464, + "step": 15595 + }, + { + "epoch": 2.5613080697144497, + "grad_norm": 0.3069416039408074, + "learning_rate": 4.395194190832426e-06, + "loss": 0.4388, + "step": 15596 + }, + { + "epoch": 2.5614722969227928, + "grad_norm": 0.3252988244608723, + "learning_rate": 4.394725346413474e-06, + "loss": 0.4312, + "step": 15597 + }, + { + "epoch": 2.5616365241311354, + "grad_norm": 0.556385918534084, + "learning_rate": 4.394256503137908e-06, + "loss": 0.4547, + "step": 15598 + }, + { + "epoch": 2.561800751339478, + "grad_norm": 0.3021824955470384, + "learning_rate": 4.393787661010821e-06, + "loss": 0.4296, + "step": 15599 + }, + { + "epoch": 2.561964978547821, + "grad_norm": 0.2910937592004871, + "learning_rate": 4.393318820037304e-06, + "loss": 0.4649, + "step": 15600 + }, + { + "epoch": 2.5621292057561638, + "grad_norm": 0.31504368710656216, + "learning_rate": 4.39284998022245e-06, + "loss": 0.4244, + "step": 15601 + }, + { + "epoch": 2.5622934329645064, + "grad_norm": 0.3230087906700632, + "learning_rate": 4.39238114157135e-06, + "loss": 0.4366, + "step": 15602 + }, + { + "epoch": 2.562457660172849, + "grad_norm": 0.33819297778333945, + "learning_rate": 4.3919123040890985e-06, + "loss": 0.4313, + "step": 15603 + }, + { + "epoch": 2.5626218873811917, + "grad_norm": 0.28026153833271106, + "learning_rate": 4.391443467780784e-06, + "loss": 0.4233, + "step": 15604 + }, + { + "epoch": 2.5627861145895348, + "grad_norm": 0.313441369830849, + "learning_rate": 4.390974632651502e-06, + "loss": 0.4295, + "step": 15605 + }, + { + "epoch": 2.5629503417978774, + "grad_norm": 0.8718709717848913, + "learning_rate": 4.3905057987063406e-06, + "loss": 0.4238, + "step": 15606 + }, + { + "epoch": 2.56311456900622, + "grad_norm": 0.26352651969380825, + "learning_rate": 4.390036965950394e-06, + "loss": 0.4334, + "step": 15607 + }, + { + "epoch": 2.563278796214563, + "grad_norm": 0.2994463931598903, + "learning_rate": 4.389568134388754e-06, + "loss": 0.4317, + "step": 15608 + }, + { + "epoch": 2.5634430234229058, + "grad_norm": 0.3646830270622841, + "learning_rate": 4.389099304026515e-06, + "loss": 0.4337, + "step": 15609 + }, + { + "epoch": 2.5636072506312484, + "grad_norm": 0.5393460161367439, + "learning_rate": 4.3886304748687644e-06, + "loss": 0.4356, + "step": 15610 + }, + { + "epoch": 2.563771477839591, + "grad_norm": 0.30025939172178034, + "learning_rate": 4.388161646920599e-06, + "loss": 0.4251, + "step": 15611 + }, + { + "epoch": 2.5639357050479337, + "grad_norm": 0.31959158175100244, + "learning_rate": 4.387692820187106e-06, + "loss": 0.436, + "step": 15612 + }, + { + "epoch": 2.5640999322562763, + "grad_norm": 0.29317675299532536, + "learning_rate": 4.38722399467338e-06, + "loss": 0.4383, + "step": 15613 + }, + { + "epoch": 2.5642641594646194, + "grad_norm": 0.34975265052513155, + "learning_rate": 4.386755170384513e-06, + "loss": 0.4398, + "step": 15614 + }, + { + "epoch": 2.564428386672962, + "grad_norm": 0.3412204875657687, + "learning_rate": 4.386286347325595e-06, + "loss": 0.4538, + "step": 15615 + }, + { + "epoch": 2.5645926138813047, + "grad_norm": 0.31861125143751867, + "learning_rate": 4.385817525501719e-06, + "loss": 0.4384, + "step": 15616 + }, + { + "epoch": 2.5647568410896477, + "grad_norm": 0.2895893168578073, + "learning_rate": 4.385348704917978e-06, + "loss": 0.4292, + "step": 15617 + }, + { + "epoch": 2.5649210682979904, + "grad_norm": 0.3212807387575279, + "learning_rate": 4.384879885579462e-06, + "loss": 0.4217, + "step": 15618 + }, + { + "epoch": 2.565085295506333, + "grad_norm": 0.37869276897825244, + "learning_rate": 4.384411067491265e-06, + "loss": 0.4226, + "step": 15619 + }, + { + "epoch": 2.5652495227146757, + "grad_norm": 0.32545970108276556, + "learning_rate": 4.383942250658478e-06, + "loss": 0.4182, + "step": 15620 + }, + { + "epoch": 2.5654137499230183, + "grad_norm": 0.5821317175586268, + "learning_rate": 4.383473435086191e-06, + "loss": 0.4307, + "step": 15621 + }, + { + "epoch": 2.5655779771313614, + "grad_norm": 0.3737727497282395, + "learning_rate": 4.383004620779497e-06, + "loss": 0.4254, + "step": 15622 + }, + { + "epoch": 2.565742204339704, + "grad_norm": 0.3252778528469744, + "learning_rate": 4.382535807743487e-06, + "loss": 0.4444, + "step": 15623 + }, + { + "epoch": 2.5659064315480467, + "grad_norm": 0.40545934497853325, + "learning_rate": 4.382066995983256e-06, + "loss": 0.4465, + "step": 15624 + }, + { + "epoch": 2.5660706587563897, + "grad_norm": 0.3786830883990562, + "learning_rate": 4.381598185503892e-06, + "loss": 0.4253, + "step": 15625 + }, + { + "epoch": 2.5662348859647324, + "grad_norm": 0.33683926885493226, + "learning_rate": 4.381129376310488e-06, + "loss": 0.4349, + "step": 15626 + }, + { + "epoch": 2.566399113173075, + "grad_norm": 0.9430359938687994, + "learning_rate": 4.380660568408136e-06, + "loss": 0.4134, + "step": 15627 + }, + { + "epoch": 2.5665633403814176, + "grad_norm": 0.4029380612559527, + "learning_rate": 4.38019176180193e-06, + "loss": 0.4342, + "step": 15628 + }, + { + "epoch": 2.5667275675897603, + "grad_norm": 0.3329457909755624, + "learning_rate": 4.379722956496958e-06, + "loss": 0.4387, + "step": 15629 + }, + { + "epoch": 2.566891794798103, + "grad_norm": 0.35796644332871863, + "learning_rate": 4.379254152498312e-06, + "loss": 0.4571, + "step": 15630 + }, + { + "epoch": 2.567056022006446, + "grad_norm": 0.34985740849790825, + "learning_rate": 4.378785349811085e-06, + "loss": 0.4408, + "step": 15631 + }, + { + "epoch": 2.5672202492147886, + "grad_norm": 0.3545240868647146, + "learning_rate": 4.378316548440369e-06, + "loss": 0.4361, + "step": 15632 + }, + { + "epoch": 2.5673844764231313, + "grad_norm": 0.297635939595374, + "learning_rate": 4.3778477483912545e-06, + "loss": 0.4233, + "step": 15633 + }, + { + "epoch": 2.5675487036314744, + "grad_norm": 0.2947778083461371, + "learning_rate": 4.377378949668833e-06, + "loss": 0.433, + "step": 15634 + }, + { + "epoch": 2.567712930839817, + "grad_norm": 0.395361891954945, + "learning_rate": 4.376910152278197e-06, + "loss": 0.4374, + "step": 15635 + }, + { + "epoch": 2.5678771580481596, + "grad_norm": 0.26747379087473455, + "learning_rate": 4.37644135622444e-06, + "loss": 0.4452, + "step": 15636 + }, + { + "epoch": 2.5680413852565023, + "grad_norm": 0.3537882378113586, + "learning_rate": 4.37597256151265e-06, + "loss": 0.4309, + "step": 15637 + }, + { + "epoch": 2.568205612464845, + "grad_norm": 0.3126543583966415, + "learning_rate": 4.375503768147918e-06, + "loss": 0.4475, + "step": 15638 + }, + { + "epoch": 2.568369839673188, + "grad_norm": 0.310994438653193, + "learning_rate": 4.375034976135341e-06, + "loss": 0.4532, + "step": 15639 + }, + { + "epoch": 2.5685340668815306, + "grad_norm": 0.2757312848098248, + "learning_rate": 4.374566185480005e-06, + "loss": 0.4254, + "step": 15640 + }, + { + "epoch": 2.5686982940898733, + "grad_norm": 0.27164222845895303, + "learning_rate": 4.374097396187003e-06, + "loss": 0.4184, + "step": 15641 + }, + { + "epoch": 2.5688625212982164, + "grad_norm": 0.27266040054382534, + "learning_rate": 4.373628608261428e-06, + "loss": 0.422, + "step": 15642 + }, + { + "epoch": 2.569026748506559, + "grad_norm": 0.29807414547875344, + "learning_rate": 4.373159821708372e-06, + "loss": 0.4379, + "step": 15643 + }, + { + "epoch": 2.5691909757149016, + "grad_norm": 0.4276170738492218, + "learning_rate": 4.372691036532923e-06, + "loss": 0.4514, + "step": 15644 + }, + { + "epoch": 2.5693552029232443, + "grad_norm": 0.30498004090935377, + "learning_rate": 4.372222252740177e-06, + "loss": 0.4365, + "step": 15645 + }, + { + "epoch": 2.569519430131587, + "grad_norm": 0.3839879430641964, + "learning_rate": 4.371753470335221e-06, + "loss": 0.4384, + "step": 15646 + }, + { + "epoch": 2.5696836573399295, + "grad_norm": 0.4011106422419418, + "learning_rate": 4.37128468932315e-06, + "loss": 0.4383, + "step": 15647 + }, + { + "epoch": 2.5698478845482726, + "grad_norm": 0.35855065427808513, + "learning_rate": 4.3708159097090536e-06, + "loss": 0.4398, + "step": 15648 + }, + { + "epoch": 2.5700121117566153, + "grad_norm": 0.3644533904290332, + "learning_rate": 4.370347131498022e-06, + "loss": 0.4257, + "step": 15649 + }, + { + "epoch": 2.570176338964958, + "grad_norm": 0.33253435890955046, + "learning_rate": 4.369878354695148e-06, + "loss": 0.4282, + "step": 15650 + }, + { + "epoch": 2.570340566173301, + "grad_norm": 0.26752244259463775, + "learning_rate": 4.369409579305525e-06, + "loss": 0.4401, + "step": 15651 + }, + { + "epoch": 2.5705047933816436, + "grad_norm": 0.27764725847093746, + "learning_rate": 4.368940805334241e-06, + "loss": 0.4348, + "step": 15652 + }, + { + "epoch": 2.5706690205899863, + "grad_norm": 0.46418971042458596, + "learning_rate": 4.3684720327863904e-06, + "loss": 0.411, + "step": 15653 + }, + { + "epoch": 2.570833247798329, + "grad_norm": 0.29686142198669646, + "learning_rate": 4.368003261667062e-06, + "loss": 0.4427, + "step": 15654 + }, + { + "epoch": 2.5709974750066715, + "grad_norm": 0.27340688193424023, + "learning_rate": 4.367534491981349e-06, + "loss": 0.4459, + "step": 15655 + }, + { + "epoch": 2.5711617022150146, + "grad_norm": 0.29452955602163794, + "learning_rate": 4.36706572373434e-06, + "loss": 0.4615, + "step": 15656 + }, + { + "epoch": 2.5713259294233572, + "grad_norm": 0.2902678707411273, + "learning_rate": 4.366596956931128e-06, + "loss": 0.4221, + "step": 15657 + }, + { + "epoch": 2.5714901566317, + "grad_norm": 0.47928414230720084, + "learning_rate": 4.366128191576806e-06, + "loss": 0.4417, + "step": 15658 + }, + { + "epoch": 2.571654383840043, + "grad_norm": 0.30012861208394687, + "learning_rate": 4.3656594276764616e-06, + "loss": 0.4254, + "step": 15659 + }, + { + "epoch": 2.5718186110483856, + "grad_norm": 0.3130155646681567, + "learning_rate": 4.365190665235189e-06, + "loss": 0.4422, + "step": 15660 + }, + { + "epoch": 2.5719828382567282, + "grad_norm": 0.3455326963780102, + "learning_rate": 4.36472190425808e-06, + "loss": 0.4318, + "step": 15661 + }, + { + "epoch": 2.572147065465071, + "grad_norm": 0.3041176120109691, + "learning_rate": 4.364253144750222e-06, + "loss": 0.4313, + "step": 15662 + }, + { + "epoch": 2.5723112926734135, + "grad_norm": 0.28055282368032514, + "learning_rate": 4.36378438671671e-06, + "loss": 0.4413, + "step": 15663 + }, + { + "epoch": 2.572475519881756, + "grad_norm": 0.5147635636054625, + "learning_rate": 4.363315630162632e-06, + "loss": 0.4397, + "step": 15664 + }, + { + "epoch": 2.5726397470900992, + "grad_norm": 0.2849554341120117, + "learning_rate": 4.362846875093081e-06, + "loss": 0.4344, + "step": 15665 + }, + { + "epoch": 2.572803974298442, + "grad_norm": 0.2846440291685015, + "learning_rate": 4.3623781215131475e-06, + "loss": 0.4277, + "step": 15666 + }, + { + "epoch": 2.5729682015067845, + "grad_norm": 0.3305141222386102, + "learning_rate": 4.3619093694279245e-06, + "loss": 0.4405, + "step": 15667 + }, + { + "epoch": 2.5731324287151276, + "grad_norm": 0.3596809632535296, + "learning_rate": 4.3614406188425005e-06, + "loss": 0.4677, + "step": 15668 + }, + { + "epoch": 2.5732966559234702, + "grad_norm": 0.4530597862843398, + "learning_rate": 4.360971869761968e-06, + "loss": 0.4387, + "step": 15669 + }, + { + "epoch": 2.573460883131813, + "grad_norm": 0.3251876593883486, + "learning_rate": 4.360503122191419e-06, + "loss": 0.4108, + "step": 15670 + }, + { + "epoch": 2.5736251103401555, + "grad_norm": 0.2803538758207167, + "learning_rate": 4.360034376135942e-06, + "loss": 0.4446, + "step": 15671 + }, + { + "epoch": 2.573789337548498, + "grad_norm": 0.28182246452077286, + "learning_rate": 4.3595656316006295e-06, + "loss": 0.4468, + "step": 15672 + }, + { + "epoch": 2.5739535647568412, + "grad_norm": 0.34577943342234163, + "learning_rate": 4.359096888590573e-06, + "loss": 0.4308, + "step": 15673 + }, + { + "epoch": 2.574117791965184, + "grad_norm": 0.38565717225066504, + "learning_rate": 4.358628147110862e-06, + "loss": 0.432, + "step": 15674 + }, + { + "epoch": 2.5742820191735265, + "grad_norm": 0.30459225924397265, + "learning_rate": 4.358159407166588e-06, + "loss": 0.4202, + "step": 15675 + }, + { + "epoch": 2.5744462463818696, + "grad_norm": 0.3196455829723776, + "learning_rate": 4.357690668762844e-06, + "loss": 0.4338, + "step": 15676 + }, + { + "epoch": 2.5746104735902122, + "grad_norm": 0.32731457911173506, + "learning_rate": 4.357221931904718e-06, + "loss": 0.4271, + "step": 15677 + }, + { + "epoch": 2.574774700798555, + "grad_norm": 0.3219906580343855, + "learning_rate": 4.356753196597304e-06, + "loss": 0.4437, + "step": 15678 + }, + { + "epoch": 2.5749389280068975, + "grad_norm": 0.29879707891187424, + "learning_rate": 4.35628446284569e-06, + "loss": 0.4442, + "step": 15679 + }, + { + "epoch": 2.57510315521524, + "grad_norm": 0.3494962037579369, + "learning_rate": 4.355815730654968e-06, + "loss": 0.4262, + "step": 15680 + }, + { + "epoch": 2.5752673824235828, + "grad_norm": 0.25523753461883125, + "learning_rate": 4.355347000030229e-06, + "loss": 0.426, + "step": 15681 + }, + { + "epoch": 2.575431609631926, + "grad_norm": 0.45019078924757433, + "learning_rate": 4.354878270976564e-06, + "loss": 0.441, + "step": 15682 + }, + { + "epoch": 2.5755958368402685, + "grad_norm": 0.32355129633184126, + "learning_rate": 4.354409543499064e-06, + "loss": 0.4299, + "step": 15683 + }, + { + "epoch": 2.575760064048611, + "grad_norm": 0.3285679353712852, + "learning_rate": 4.353940817602819e-06, + "loss": 0.4186, + "step": 15684 + }, + { + "epoch": 2.575924291256954, + "grad_norm": 0.27666075631853193, + "learning_rate": 4.35347209329292e-06, + "loss": 0.4338, + "step": 15685 + }, + { + "epoch": 2.576088518465297, + "grad_norm": 0.49437240302558794, + "learning_rate": 4.353003370574461e-06, + "loss": 0.4409, + "step": 15686 + }, + { + "epoch": 2.5762527456736395, + "grad_norm": 0.30204622384409346, + "learning_rate": 4.352534649452527e-06, + "loss": 0.4422, + "step": 15687 + }, + { + "epoch": 2.576416972881982, + "grad_norm": 0.273434931240618, + "learning_rate": 4.352065929932215e-06, + "loss": 0.4177, + "step": 15688 + }, + { + "epoch": 2.5765812000903248, + "grad_norm": 0.35155939309416256, + "learning_rate": 4.35159721201861e-06, + "loss": 0.4432, + "step": 15689 + }, + { + "epoch": 2.576745427298668, + "grad_norm": 0.3920495165730994, + "learning_rate": 4.351128495716805e-06, + "loss": 0.4235, + "step": 15690 + }, + { + "epoch": 2.5769096545070105, + "grad_norm": 0.30913151759937846, + "learning_rate": 4.350659781031891e-06, + "loss": 0.4201, + "step": 15691 + }, + { + "epoch": 2.577073881715353, + "grad_norm": 0.3076565225368957, + "learning_rate": 4.35019106796896e-06, + "loss": 0.4235, + "step": 15692 + }, + { + "epoch": 2.577238108923696, + "grad_norm": 0.29655321776974725, + "learning_rate": 4.349722356533101e-06, + "loss": 0.4459, + "step": 15693 + }, + { + "epoch": 2.577402336132039, + "grad_norm": 0.28892151775120634, + "learning_rate": 4.3492536467294044e-06, + "loss": 0.4179, + "step": 15694 + }, + { + "epoch": 2.5775665633403815, + "grad_norm": 0.41459237885226863, + "learning_rate": 4.3487849385629635e-06, + "loss": 0.4146, + "step": 15695 + }, + { + "epoch": 2.577730790548724, + "grad_norm": 0.2902977896042206, + "learning_rate": 4.348316232038865e-06, + "loss": 0.437, + "step": 15696 + }, + { + "epoch": 2.5778950177570668, + "grad_norm": 0.4496536761679526, + "learning_rate": 4.347847527162203e-06, + "loss": 0.4415, + "step": 15697 + }, + { + "epoch": 2.5780592449654094, + "grad_norm": 0.36641641828421784, + "learning_rate": 4.3473788239380645e-06, + "loss": 0.4223, + "step": 15698 + }, + { + "epoch": 2.5782234721737525, + "grad_norm": 0.36549153174167476, + "learning_rate": 4.3469101223715425e-06, + "loss": 0.4306, + "step": 15699 + }, + { + "epoch": 2.578387699382095, + "grad_norm": 0.3015497487218498, + "learning_rate": 4.3464414224677275e-06, + "loss": 0.419, + "step": 15700 + }, + { + "epoch": 2.5785519265904377, + "grad_norm": 0.27625426522960334, + "learning_rate": 4.345972724231711e-06, + "loss": 0.4503, + "step": 15701 + }, + { + "epoch": 2.578716153798781, + "grad_norm": 0.30417803932476767, + "learning_rate": 4.3455040276685805e-06, + "loss": 0.4298, + "step": 15702 + }, + { + "epoch": 2.5788803810071235, + "grad_norm": 0.31044685598214067, + "learning_rate": 4.345035332783431e-06, + "loss": 0.4572, + "step": 15703 + }, + { + "epoch": 2.579044608215466, + "grad_norm": 0.34029110494135095, + "learning_rate": 4.344566639581348e-06, + "loss": 0.4286, + "step": 15704 + }, + { + "epoch": 2.5792088354238087, + "grad_norm": 0.4220228863169494, + "learning_rate": 4.344097948067424e-06, + "loss": 0.4313, + "step": 15705 + }, + { + "epoch": 2.5793730626321514, + "grad_norm": 0.29592190663583623, + "learning_rate": 4.34362925824675e-06, + "loss": 0.4506, + "step": 15706 + }, + { + "epoch": 2.5795372898404945, + "grad_norm": 0.29158058048698404, + "learning_rate": 4.343160570124417e-06, + "loss": 0.4402, + "step": 15707 + }, + { + "epoch": 2.579701517048837, + "grad_norm": 0.3708781139752633, + "learning_rate": 4.3426918837055135e-06, + "loss": 0.4506, + "step": 15708 + }, + { + "epoch": 2.5798657442571797, + "grad_norm": 0.3318455295905244, + "learning_rate": 4.342223198995131e-06, + "loss": 0.4468, + "step": 15709 + }, + { + "epoch": 2.580029971465523, + "grad_norm": 0.4317230371408357, + "learning_rate": 4.34175451599836e-06, + "loss": 0.4466, + "step": 15710 + }, + { + "epoch": 2.5801941986738655, + "grad_norm": 0.3369867194655041, + "learning_rate": 4.341285834720292e-06, + "loss": 0.4115, + "step": 15711 + }, + { + "epoch": 2.580358425882208, + "grad_norm": 0.31912585151917744, + "learning_rate": 4.340817155166015e-06, + "loss": 0.4482, + "step": 15712 + }, + { + "epoch": 2.5805226530905507, + "grad_norm": 0.25662968380350065, + "learning_rate": 4.340348477340619e-06, + "loss": 0.4425, + "step": 15713 + }, + { + "epoch": 2.5806868802988934, + "grad_norm": 0.3063092669941935, + "learning_rate": 4.339879801249197e-06, + "loss": 0.4489, + "step": 15714 + }, + { + "epoch": 2.580851107507236, + "grad_norm": 0.2630713688895437, + "learning_rate": 4.339411126896836e-06, + "loss": 0.4542, + "step": 15715 + }, + { + "epoch": 2.581015334715579, + "grad_norm": 0.31208575964151564, + "learning_rate": 4.338942454288631e-06, + "loss": 0.4318, + "step": 15716 + }, + { + "epoch": 2.5811795619239217, + "grad_norm": 0.3594129107656559, + "learning_rate": 4.338473783429668e-06, + "loss": 0.4353, + "step": 15717 + }, + { + "epoch": 2.5813437891322644, + "grad_norm": 0.3693658716118475, + "learning_rate": 4.338005114325038e-06, + "loss": 0.4337, + "step": 15718 + }, + { + "epoch": 2.5815080163406074, + "grad_norm": 0.40035589840951313, + "learning_rate": 4.3375364469798315e-06, + "loss": 0.4231, + "step": 15719 + }, + { + "epoch": 2.58167224354895, + "grad_norm": 0.2942822598845297, + "learning_rate": 4.3370677813991425e-06, + "loss": 0.4365, + "step": 15720 + }, + { + "epoch": 2.5818364707572927, + "grad_norm": 0.2598983203441498, + "learning_rate": 4.3365991175880545e-06, + "loss": 0.43, + "step": 15721 + }, + { + "epoch": 2.5820006979656354, + "grad_norm": 0.3306450186865602, + "learning_rate": 4.336130455551662e-06, + "loss": 0.4488, + "step": 15722 + }, + { + "epoch": 2.582164925173978, + "grad_norm": 0.4172832201505027, + "learning_rate": 4.335661795295053e-06, + "loss": 0.4493, + "step": 15723 + }, + { + "epoch": 2.582329152382321, + "grad_norm": 0.4443225994832997, + "learning_rate": 4.3351931368233195e-06, + "loss": 0.4485, + "step": 15724 + }, + { + "epoch": 2.5824933795906637, + "grad_norm": 0.36022320421626164, + "learning_rate": 4.334724480141551e-06, + "loss": 0.4535, + "step": 15725 + }, + { + "epoch": 2.5826576067990064, + "grad_norm": 0.28358389028441683, + "learning_rate": 4.334255825254836e-06, + "loss": 0.4391, + "step": 15726 + }, + { + "epoch": 2.5828218340073494, + "grad_norm": 0.33976043099099573, + "learning_rate": 4.333787172168266e-06, + "loss": 0.4492, + "step": 15727 + }, + { + "epoch": 2.582986061215692, + "grad_norm": 0.2803123022258049, + "learning_rate": 4.333318520886932e-06, + "loss": 0.4182, + "step": 15728 + }, + { + "epoch": 2.5831502884240347, + "grad_norm": 0.2642610620741509, + "learning_rate": 4.332849871415922e-06, + "loss": 0.4329, + "step": 15729 + }, + { + "epoch": 2.5833145156323774, + "grad_norm": 0.2714676626725195, + "learning_rate": 4.332381223760327e-06, + "loss": 0.4358, + "step": 15730 + }, + { + "epoch": 2.58347874284072, + "grad_norm": 0.29556803031452, + "learning_rate": 4.331912577925237e-06, + "loss": 0.4179, + "step": 15731 + }, + { + "epoch": 2.5836429700490626, + "grad_norm": 0.3645808737464659, + "learning_rate": 4.3314439339157415e-06, + "loss": 0.4348, + "step": 15732 + }, + { + "epoch": 2.5838071972574057, + "grad_norm": 0.3772393709280388, + "learning_rate": 4.3309752917369305e-06, + "loss": 0.4406, + "step": 15733 + }, + { + "epoch": 2.5839714244657483, + "grad_norm": 0.27548331977469753, + "learning_rate": 4.330506651393894e-06, + "loss": 0.4471, + "step": 15734 + }, + { + "epoch": 2.584135651674091, + "grad_norm": 0.2826736816359112, + "learning_rate": 4.330038012891723e-06, + "loss": 0.4483, + "step": 15735 + }, + { + "epoch": 2.584299878882434, + "grad_norm": 0.4133515073580495, + "learning_rate": 4.329569376235506e-06, + "loss": 0.4419, + "step": 15736 + }, + { + "epoch": 2.5844641060907767, + "grad_norm": 0.3225643113998406, + "learning_rate": 4.329100741430334e-06, + "loss": 0.4471, + "step": 15737 + }, + { + "epoch": 2.5846283332991193, + "grad_norm": 0.3407019919418374, + "learning_rate": 4.3286321084812955e-06, + "loss": 0.4533, + "step": 15738 + }, + { + "epoch": 2.584792560507462, + "grad_norm": 0.3428604509352793, + "learning_rate": 4.32816347739348e-06, + "loss": 0.4755, + "step": 15739 + }, + { + "epoch": 2.5849567877158046, + "grad_norm": 0.32503432507541574, + "learning_rate": 4.327694848171979e-06, + "loss": 0.4341, + "step": 15740 + }, + { + "epoch": 2.5851210149241477, + "grad_norm": 0.36169010504914045, + "learning_rate": 4.327226220821881e-06, + "loss": 0.4603, + "step": 15741 + }, + { + "epoch": 2.5852852421324903, + "grad_norm": 0.2994689117279142, + "learning_rate": 4.326757595348276e-06, + "loss": 0.4367, + "step": 15742 + }, + { + "epoch": 2.585449469340833, + "grad_norm": 0.313911921117997, + "learning_rate": 4.326288971756254e-06, + "loss": 0.4324, + "step": 15743 + }, + { + "epoch": 2.585613696549176, + "grad_norm": 0.2735236587767174, + "learning_rate": 4.3258203500509055e-06, + "loss": 0.4339, + "step": 15744 + }, + { + "epoch": 2.5857779237575187, + "grad_norm": 0.3583785949380112, + "learning_rate": 4.32535173023732e-06, + "loss": 0.4228, + "step": 15745 + }, + { + "epoch": 2.5859421509658613, + "grad_norm": 0.3579906842061974, + "learning_rate": 4.324883112320586e-06, + "loss": 0.4374, + "step": 15746 + }, + { + "epoch": 2.586106378174204, + "grad_norm": 0.2920349175148648, + "learning_rate": 4.324414496305793e-06, + "loss": 0.448, + "step": 15747 + }, + { + "epoch": 2.5862706053825466, + "grad_norm": 0.3551419529001769, + "learning_rate": 4.323945882198031e-06, + "loss": 0.4296, + "step": 15748 + }, + { + "epoch": 2.5864348325908892, + "grad_norm": 0.38050189174987475, + "learning_rate": 4.3234772700023904e-06, + "loss": 0.4367, + "step": 15749 + }, + { + "epoch": 2.5865990597992323, + "grad_norm": 0.27693520786110415, + "learning_rate": 4.323008659723961e-06, + "loss": 0.4273, + "step": 15750 + }, + { + "epoch": 2.586763287007575, + "grad_norm": 0.2946787998493088, + "learning_rate": 4.32254005136783e-06, + "loss": 0.4506, + "step": 15751 + }, + { + "epoch": 2.5869275142159176, + "grad_norm": 0.300489672148383, + "learning_rate": 4.32207144493909e-06, + "loss": 0.4254, + "step": 15752 + }, + { + "epoch": 2.5870917414242607, + "grad_norm": 0.33785313952343987, + "learning_rate": 4.32160284044283e-06, + "loss": 0.4356, + "step": 15753 + }, + { + "epoch": 2.5872559686326033, + "grad_norm": 0.517376636026884, + "learning_rate": 4.321134237884138e-06, + "loss": 0.4211, + "step": 15754 + }, + { + "epoch": 2.587420195840946, + "grad_norm": 0.2898400184440624, + "learning_rate": 4.320665637268103e-06, + "loss": 0.4221, + "step": 15755 + }, + { + "epoch": 2.5875844230492886, + "grad_norm": 0.42197824385946514, + "learning_rate": 4.3201970385998164e-06, + "loss": 0.4449, + "step": 15756 + }, + { + "epoch": 2.5877486502576312, + "grad_norm": 0.31677534657011097, + "learning_rate": 4.319728441884366e-06, + "loss": 0.4403, + "step": 15757 + }, + { + "epoch": 2.5879128774659743, + "grad_norm": 0.2832694609001604, + "learning_rate": 4.319259847126843e-06, + "loss": 0.4378, + "step": 15758 + }, + { + "epoch": 2.588077104674317, + "grad_norm": 0.39187976296966304, + "learning_rate": 4.318791254332337e-06, + "loss": 0.4537, + "step": 15759 + }, + { + "epoch": 2.5882413318826596, + "grad_norm": 0.296304090284739, + "learning_rate": 4.318322663505934e-06, + "loss": 0.4382, + "step": 15760 + }, + { + "epoch": 2.5884055590910027, + "grad_norm": 0.2989425161690862, + "learning_rate": 4.317854074652727e-06, + "loss": 0.4451, + "step": 15761 + }, + { + "epoch": 2.5885697862993453, + "grad_norm": 0.32956500842867725, + "learning_rate": 4.317385487777805e-06, + "loss": 0.4252, + "step": 15762 + }, + { + "epoch": 2.588734013507688, + "grad_norm": 0.3554805266347347, + "learning_rate": 4.316916902886255e-06, + "loss": 0.4785, + "step": 15763 + }, + { + "epoch": 2.5888982407160306, + "grad_norm": 0.29784661781396565, + "learning_rate": 4.316448319983166e-06, + "loss": 0.449, + "step": 15764 + }, + { + "epoch": 2.5890624679243732, + "grad_norm": 0.3403796996552582, + "learning_rate": 4.315979739073631e-06, + "loss": 0.42, + "step": 15765 + }, + { + "epoch": 2.589226695132716, + "grad_norm": 0.2820204314916899, + "learning_rate": 4.315511160162736e-06, + "loss": 0.42, + "step": 15766 + }, + { + "epoch": 2.589390922341059, + "grad_norm": 0.34694290008039924, + "learning_rate": 4.315042583255571e-06, + "loss": 0.4405, + "step": 15767 + }, + { + "epoch": 2.5895551495494016, + "grad_norm": 0.285712751776153, + "learning_rate": 4.314574008357227e-06, + "loss": 0.4387, + "step": 15768 + }, + { + "epoch": 2.589719376757744, + "grad_norm": 0.2886681436122029, + "learning_rate": 4.31410543547279e-06, + "loss": 0.4346, + "step": 15769 + }, + { + "epoch": 2.5898836039660873, + "grad_norm": 0.26426338895128476, + "learning_rate": 4.3136368646073535e-06, + "loss": 0.465, + "step": 15770 + }, + { + "epoch": 2.59004783117443, + "grad_norm": 0.46212153735161743, + "learning_rate": 4.313168295766003e-06, + "loss": 0.423, + "step": 15771 + }, + { + "epoch": 2.5902120583827726, + "grad_norm": 0.27173188730977343, + "learning_rate": 4.312699728953827e-06, + "loss": 0.4151, + "step": 15772 + }, + { + "epoch": 2.590376285591115, + "grad_norm": 0.44061969942428547, + "learning_rate": 4.312231164175917e-06, + "loss": 0.437, + "step": 15773 + }, + { + "epoch": 2.590540512799458, + "grad_norm": 0.2805299615077187, + "learning_rate": 4.311762601437362e-06, + "loss": 0.4249, + "step": 15774 + }, + { + "epoch": 2.590704740007801, + "grad_norm": 0.3082668237862245, + "learning_rate": 4.3112940407432495e-06, + "loss": 0.4388, + "step": 15775 + }, + { + "epoch": 2.5908689672161436, + "grad_norm": 0.2950023105650061, + "learning_rate": 4.3108254820986685e-06, + "loss": 0.4588, + "step": 15776 + }, + { + "epoch": 2.591033194424486, + "grad_norm": 0.3201705793393197, + "learning_rate": 4.31035692550871e-06, + "loss": 0.4311, + "step": 15777 + }, + { + "epoch": 2.5911974216328293, + "grad_norm": 0.3640366984477735, + "learning_rate": 4.309888370978464e-06, + "loss": 0.4126, + "step": 15778 + }, + { + "epoch": 2.591361648841172, + "grad_norm": 0.2999324534104021, + "learning_rate": 4.309419818513014e-06, + "loss": 0.445, + "step": 15779 + }, + { + "epoch": 2.5915258760495146, + "grad_norm": 0.35312819820987595, + "learning_rate": 4.308951268117454e-06, + "loss": 0.4531, + "step": 15780 + }, + { + "epoch": 2.591690103257857, + "grad_norm": 0.3104622059310826, + "learning_rate": 4.308482719796871e-06, + "loss": 0.419, + "step": 15781 + }, + { + "epoch": 2.5918543304662, + "grad_norm": 0.40858664531530114, + "learning_rate": 4.308014173556353e-06, + "loss": 0.4378, + "step": 15782 + }, + { + "epoch": 2.5920185576745425, + "grad_norm": 0.3084003255005991, + "learning_rate": 4.3075456294009906e-06, + "loss": 0.4262, + "step": 15783 + }, + { + "epoch": 2.5921827848828856, + "grad_norm": 0.37756243501180997, + "learning_rate": 4.3070770873358725e-06, + "loss": 0.4435, + "step": 15784 + }, + { + "epoch": 2.592347012091228, + "grad_norm": 0.3318405143473803, + "learning_rate": 4.306608547366087e-06, + "loss": 0.4372, + "step": 15785 + }, + { + "epoch": 2.592511239299571, + "grad_norm": 0.31825262182092245, + "learning_rate": 4.306140009496722e-06, + "loss": 0.4355, + "step": 15786 + }, + { + "epoch": 2.592675466507914, + "grad_norm": 0.26950774595672156, + "learning_rate": 4.30567147373287e-06, + "loss": 0.4384, + "step": 15787 + }, + { + "epoch": 2.5928396937162566, + "grad_norm": 0.2773574713214042, + "learning_rate": 4.305202940079614e-06, + "loss": 0.4168, + "step": 15788 + }, + { + "epoch": 2.593003920924599, + "grad_norm": 0.27968928473718313, + "learning_rate": 4.304734408542048e-06, + "loss": 0.4126, + "step": 15789 + }, + { + "epoch": 2.593168148132942, + "grad_norm": 0.339631272148747, + "learning_rate": 4.304265879125256e-06, + "loss": 0.4252, + "step": 15790 + }, + { + "epoch": 2.5933323753412845, + "grad_norm": 0.4405429762051981, + "learning_rate": 4.30379735183433e-06, + "loss": 0.4386, + "step": 15791 + }, + { + "epoch": 2.5934966025496275, + "grad_norm": 0.33307488107275973, + "learning_rate": 4.303328826674358e-06, + "loss": 0.4191, + "step": 15792 + }, + { + "epoch": 2.59366082975797, + "grad_norm": 0.31976140803255354, + "learning_rate": 4.3028603036504286e-06, + "loss": 0.4373, + "step": 15793 + }, + { + "epoch": 2.593825056966313, + "grad_norm": 0.29829335474835267, + "learning_rate": 4.302391782767629e-06, + "loss": 0.4349, + "step": 15794 + }, + { + "epoch": 2.593989284174656, + "grad_norm": 0.3889999203353962, + "learning_rate": 4.301923264031052e-06, + "loss": 0.434, + "step": 15795 + }, + { + "epoch": 2.5941535113829985, + "grad_norm": 0.30571572031790084, + "learning_rate": 4.301454747445781e-06, + "loss": 0.4201, + "step": 15796 + }, + { + "epoch": 2.594317738591341, + "grad_norm": 0.3459232731873862, + "learning_rate": 4.300986233016907e-06, + "loss": 0.4514, + "step": 15797 + }, + { + "epoch": 2.594481965799684, + "grad_norm": 0.30146836874846944, + "learning_rate": 4.300517720749518e-06, + "loss": 0.4502, + "step": 15798 + }, + { + "epoch": 2.5946461930080265, + "grad_norm": 0.36184226392840596, + "learning_rate": 4.3000492106487035e-06, + "loss": 0.431, + "step": 15799 + }, + { + "epoch": 2.594810420216369, + "grad_norm": 0.31671853093359414, + "learning_rate": 4.299580702719551e-06, + "loss": 0.4438, + "step": 15800 + }, + { + "epoch": 2.594974647424712, + "grad_norm": 0.3050096609024856, + "learning_rate": 4.299112196967149e-06, + "loss": 0.4642, + "step": 15801 + }, + { + "epoch": 2.595138874633055, + "grad_norm": 0.41602891522432706, + "learning_rate": 4.2986436933965866e-06, + "loss": 0.4327, + "step": 15802 + }, + { + "epoch": 2.5953031018413975, + "grad_norm": 0.24868676436677487, + "learning_rate": 4.298175192012953e-06, + "loss": 0.4176, + "step": 15803 + }, + { + "epoch": 2.5954673290497405, + "grad_norm": 0.36640826122549386, + "learning_rate": 4.2977066928213345e-06, + "loss": 0.4296, + "step": 15804 + }, + { + "epoch": 2.595631556258083, + "grad_norm": 0.30540983951859807, + "learning_rate": 4.29723819582682e-06, + "loss": 0.4366, + "step": 15805 + }, + { + "epoch": 2.595795783466426, + "grad_norm": 0.3490310182612695, + "learning_rate": 4.296769701034497e-06, + "loss": 0.4336, + "step": 15806 + }, + { + "epoch": 2.5959600106747684, + "grad_norm": 0.2680422378112935, + "learning_rate": 4.296301208449456e-06, + "loss": 0.4255, + "step": 15807 + }, + { + "epoch": 2.596124237883111, + "grad_norm": 0.28315344448721086, + "learning_rate": 4.295832718076785e-06, + "loss": 0.4163, + "step": 15808 + }, + { + "epoch": 2.596288465091454, + "grad_norm": 0.2923550723051912, + "learning_rate": 4.295364229921571e-06, + "loss": 0.416, + "step": 15809 + }, + { + "epoch": 2.596452692299797, + "grad_norm": 0.28055270604204857, + "learning_rate": 4.294895743988902e-06, + "loss": 0.4469, + "step": 15810 + }, + { + "epoch": 2.5966169195081394, + "grad_norm": 0.2879848375564051, + "learning_rate": 4.294427260283868e-06, + "loss": 0.4399, + "step": 15811 + }, + { + "epoch": 2.5967811467164825, + "grad_norm": 0.5644459781722265, + "learning_rate": 4.293958778811558e-06, + "loss": 0.4669, + "step": 15812 + }, + { + "epoch": 2.596945373924825, + "grad_norm": 0.2848883445687235, + "learning_rate": 4.293490299577056e-06, + "loss": 0.421, + "step": 15813 + }, + { + "epoch": 2.597109601133168, + "grad_norm": 0.3855917851665769, + "learning_rate": 4.293021822585452e-06, + "loss": 0.4571, + "step": 15814 + }, + { + "epoch": 2.5972738283415104, + "grad_norm": 0.37697902958562113, + "learning_rate": 4.292553347841836e-06, + "loss": 0.4241, + "step": 15815 + }, + { + "epoch": 2.597438055549853, + "grad_norm": 0.3082437220187043, + "learning_rate": 4.2920848753512945e-06, + "loss": 0.4571, + "step": 15816 + }, + { + "epoch": 2.5976022827581957, + "grad_norm": 0.4806961940115057, + "learning_rate": 4.291616405118915e-06, + "loss": 0.4438, + "step": 15817 + }, + { + "epoch": 2.597766509966539, + "grad_norm": 0.33571584868959514, + "learning_rate": 4.291147937149787e-06, + "loss": 0.4355, + "step": 15818 + }, + { + "epoch": 2.5979307371748814, + "grad_norm": 0.8390119913907225, + "learning_rate": 4.290679471448998e-06, + "loss": 0.4369, + "step": 15819 + }, + { + "epoch": 2.598094964383224, + "grad_norm": 0.24477381682903024, + "learning_rate": 4.290211008021638e-06, + "loss": 0.4346, + "step": 15820 + }, + { + "epoch": 2.598259191591567, + "grad_norm": 0.36475684221611265, + "learning_rate": 4.289742546872789e-06, + "loss": 0.4485, + "step": 15821 + }, + { + "epoch": 2.59842341879991, + "grad_norm": 0.3489466071086147, + "learning_rate": 4.289274088007544e-06, + "loss": 0.4417, + "step": 15822 + }, + { + "epoch": 2.5985876460082524, + "grad_norm": 0.27971414760095065, + "learning_rate": 4.288805631430991e-06, + "loss": 0.441, + "step": 15823 + }, + { + "epoch": 2.598751873216595, + "grad_norm": 0.26586588593970034, + "learning_rate": 4.288337177148215e-06, + "loss": 0.4563, + "step": 15824 + }, + { + "epoch": 2.5989161004249377, + "grad_norm": 0.30542948972381184, + "learning_rate": 4.287868725164307e-06, + "loss": 0.4266, + "step": 15825 + }, + { + "epoch": 2.599080327633281, + "grad_norm": 0.31955052815461044, + "learning_rate": 4.287400275484351e-06, + "loss": 0.4498, + "step": 15826 + }, + { + "epoch": 2.5992445548416234, + "grad_norm": 0.31979104627134625, + "learning_rate": 4.28693182811344e-06, + "loss": 0.4317, + "step": 15827 + }, + { + "epoch": 2.599408782049966, + "grad_norm": 0.7336338061730221, + "learning_rate": 4.286463383056658e-06, + "loss": 0.4161, + "step": 15828 + }, + { + "epoch": 2.599573009258309, + "grad_norm": 0.26448341402895187, + "learning_rate": 4.285994940319094e-06, + "loss": 0.4546, + "step": 15829 + }, + { + "epoch": 2.599737236466652, + "grad_norm": 0.2814894747781621, + "learning_rate": 4.285526499905835e-06, + "loss": 0.4408, + "step": 15830 + }, + { + "epoch": 2.5999014636749944, + "grad_norm": 0.32304929685665024, + "learning_rate": 4.2850580618219685e-06, + "loss": 0.4495, + "step": 15831 + }, + { + "epoch": 2.600065690883337, + "grad_norm": 0.3003407465523042, + "learning_rate": 4.284589626072584e-06, + "loss": 0.4211, + "step": 15832 + }, + { + "epoch": 2.6002299180916797, + "grad_norm": 0.3126022106522328, + "learning_rate": 4.2841211926627685e-06, + "loss": 0.4459, + "step": 15833 + }, + { + "epoch": 2.6003941453000223, + "grad_norm": 0.3202251903747605, + "learning_rate": 4.283652761597607e-06, + "loss": 0.4123, + "step": 15834 + }, + { + "epoch": 2.6005583725083654, + "grad_norm": 0.3193033038388879, + "learning_rate": 4.283184332882192e-06, + "loss": 0.43, + "step": 15835 + }, + { + "epoch": 2.600722599716708, + "grad_norm": 0.4759091609533221, + "learning_rate": 4.282715906521607e-06, + "loss": 0.4351, + "step": 15836 + }, + { + "epoch": 2.6008868269250507, + "grad_norm": 0.34048263468535467, + "learning_rate": 4.2822474825209426e-06, + "loss": 0.4542, + "step": 15837 + }, + { + "epoch": 2.6010510541333938, + "grad_norm": 0.2964018813124492, + "learning_rate": 4.281779060885284e-06, + "loss": 0.4524, + "step": 15838 + }, + { + "epoch": 2.6012152813417364, + "grad_norm": 0.272121429844428, + "learning_rate": 4.281310641619719e-06, + "loss": 0.4389, + "step": 15839 + }, + { + "epoch": 2.601379508550079, + "grad_norm": 0.2930384076277509, + "learning_rate": 4.280842224729335e-06, + "loss": 0.4205, + "step": 15840 + }, + { + "epoch": 2.6015437357584217, + "grad_norm": 0.3932004190719817, + "learning_rate": 4.28037381021922e-06, + "loss": 0.4208, + "step": 15841 + }, + { + "epoch": 2.6017079629667643, + "grad_norm": 0.2916958143650516, + "learning_rate": 4.279905398094463e-06, + "loss": 0.4442, + "step": 15842 + }, + { + "epoch": 2.6018721901751074, + "grad_norm": 0.38320540985005647, + "learning_rate": 4.279436988360148e-06, + "loss": 0.4371, + "step": 15843 + }, + { + "epoch": 2.60203641738345, + "grad_norm": 0.3442761883932953, + "learning_rate": 4.278968581021366e-06, + "loss": 0.4267, + "step": 15844 + }, + { + "epoch": 2.6022006445917927, + "grad_norm": 0.3311876856923415, + "learning_rate": 4.278500176083204e-06, + "loss": 0.4491, + "step": 15845 + }, + { + "epoch": 2.6023648718001358, + "grad_norm": 0.271081882936446, + "learning_rate": 4.278031773550745e-06, + "loss": 0.443, + "step": 15846 + }, + { + "epoch": 2.6025290990084784, + "grad_norm": 0.27360586704891615, + "learning_rate": 4.27756337342908e-06, + "loss": 0.4234, + "step": 15847 + }, + { + "epoch": 2.602693326216821, + "grad_norm": 0.3028946730931956, + "learning_rate": 4.277094975723297e-06, + "loss": 0.4178, + "step": 15848 + }, + { + "epoch": 2.6028575534251637, + "grad_norm": 0.4211902298436584, + "learning_rate": 4.27662658043848e-06, + "loss": 0.4279, + "step": 15849 + }, + { + "epoch": 2.6030217806335063, + "grad_norm": 0.29390415171983797, + "learning_rate": 4.276158187579719e-06, + "loss": 0.4311, + "step": 15850 + }, + { + "epoch": 2.603186007841849, + "grad_norm": 0.2893391661327366, + "learning_rate": 4.275689797152101e-06, + "loss": 0.4442, + "step": 15851 + }, + { + "epoch": 2.603350235050192, + "grad_norm": 0.30309257216350804, + "learning_rate": 4.275221409160711e-06, + "loss": 0.4148, + "step": 15852 + }, + { + "epoch": 2.6035144622585347, + "grad_norm": 0.3694770688838367, + "learning_rate": 4.274753023610641e-06, + "loss": 0.4402, + "step": 15853 + }, + { + "epoch": 2.6036786894668773, + "grad_norm": 0.31224351318683174, + "learning_rate": 4.274284640506972e-06, + "loss": 0.4209, + "step": 15854 + }, + { + "epoch": 2.6038429166752204, + "grad_norm": 0.26476569410972106, + "learning_rate": 4.273816259854794e-06, + "loss": 0.4353, + "step": 15855 + }, + { + "epoch": 2.604007143883563, + "grad_norm": 0.2524068878432348, + "learning_rate": 4.273347881659193e-06, + "loss": 0.4341, + "step": 15856 + }, + { + "epoch": 2.6041713710919057, + "grad_norm": 0.3836716245953561, + "learning_rate": 4.272879505925259e-06, + "loss": 0.4405, + "step": 15857 + }, + { + "epoch": 2.6043355983002483, + "grad_norm": 0.31367484844860166, + "learning_rate": 4.272411132658076e-06, + "loss": 0.437, + "step": 15858 + }, + { + "epoch": 2.604499825508591, + "grad_norm": 0.24073228613682615, + "learning_rate": 4.271942761862731e-06, + "loss": 0.4263, + "step": 15859 + }, + { + "epoch": 2.604664052716934, + "grad_norm": 0.31112665903222925, + "learning_rate": 4.271474393544313e-06, + "loss": 0.4367, + "step": 15860 + }, + { + "epoch": 2.6048282799252767, + "grad_norm": 0.39513586710108894, + "learning_rate": 4.271006027707909e-06, + "loss": 0.4421, + "step": 15861 + }, + { + "epoch": 2.6049925071336193, + "grad_norm": 0.37016694695642754, + "learning_rate": 4.270537664358604e-06, + "loss": 0.4234, + "step": 15862 + }, + { + "epoch": 2.6051567343419624, + "grad_norm": 0.3234392124628261, + "learning_rate": 4.270069303501487e-06, + "loss": 0.4366, + "step": 15863 + }, + { + "epoch": 2.605320961550305, + "grad_norm": 0.3211553226048717, + "learning_rate": 4.269600945141642e-06, + "loss": 0.4139, + "step": 15864 + }, + { + "epoch": 2.6054851887586477, + "grad_norm": 0.297381168760795, + "learning_rate": 4.269132589284157e-06, + "loss": 0.4222, + "step": 15865 + }, + { + "epoch": 2.6056494159669903, + "grad_norm": 0.2684853314660215, + "learning_rate": 4.268664235934119e-06, + "loss": 0.4226, + "step": 15866 + }, + { + "epoch": 2.605813643175333, + "grad_norm": 0.24428461191927797, + "learning_rate": 4.268195885096617e-06, + "loss": 0.4055, + "step": 15867 + }, + { + "epoch": 2.6059778703836756, + "grad_norm": 0.30690138353801594, + "learning_rate": 4.267727536776734e-06, + "loss": 0.4327, + "step": 15868 + }, + { + "epoch": 2.6061420975920186, + "grad_norm": 0.309580731291792, + "learning_rate": 4.267259190979558e-06, + "loss": 0.4277, + "step": 15869 + }, + { + "epoch": 2.6063063248003613, + "grad_norm": 0.3521082341168475, + "learning_rate": 4.2667908477101794e-06, + "loss": 0.4369, + "step": 15870 + }, + { + "epoch": 2.606470552008704, + "grad_norm": 0.27827993463951545, + "learning_rate": 4.266322506973679e-06, + "loss": 0.4153, + "step": 15871 + }, + { + "epoch": 2.606634779217047, + "grad_norm": 0.29314152987116177, + "learning_rate": 4.265854168775148e-06, + "loss": 0.4383, + "step": 15872 + }, + { + "epoch": 2.6067990064253896, + "grad_norm": 0.4175886384913535, + "learning_rate": 4.265385833119668e-06, + "loss": 0.4374, + "step": 15873 + }, + { + "epoch": 2.6069632336337323, + "grad_norm": 0.3215341260226248, + "learning_rate": 4.264917500012331e-06, + "loss": 0.4481, + "step": 15874 + }, + { + "epoch": 2.607127460842075, + "grad_norm": 0.28916901461660643, + "learning_rate": 4.2644491694582196e-06, + "loss": 0.4291, + "step": 15875 + }, + { + "epoch": 2.6072916880504176, + "grad_norm": 0.5564639233489209, + "learning_rate": 4.263980841462424e-06, + "loss": 0.4434, + "step": 15876 + }, + { + "epoch": 2.6074559152587606, + "grad_norm": 0.4252859915961404, + "learning_rate": 4.263512516030027e-06, + "loss": 0.433, + "step": 15877 + }, + { + "epoch": 2.6076201424671033, + "grad_norm": 0.3004118104642622, + "learning_rate": 4.2630441931661195e-06, + "loss": 0.4334, + "step": 15878 + }, + { + "epoch": 2.607784369675446, + "grad_norm": 0.3107784239472434, + "learning_rate": 4.262575872875782e-06, + "loss": 0.429, + "step": 15879 + }, + { + "epoch": 2.607948596883789, + "grad_norm": 0.3734813433262263, + "learning_rate": 4.262107555164105e-06, + "loss": 0.4144, + "step": 15880 + }, + { + "epoch": 2.6081128240921316, + "grad_norm": 0.3201747720302578, + "learning_rate": 4.261639240036174e-06, + "loss": 0.4365, + "step": 15881 + }, + { + "epoch": 2.6082770513004743, + "grad_norm": 0.30335678042097514, + "learning_rate": 4.261170927497076e-06, + "loss": 0.4369, + "step": 15882 + }, + { + "epoch": 2.608441278508817, + "grad_norm": 0.30057906830959097, + "learning_rate": 4.260702617551896e-06, + "loss": 0.4274, + "step": 15883 + }, + { + "epoch": 2.6086055057171595, + "grad_norm": 0.6371499207725939, + "learning_rate": 4.260234310205721e-06, + "loss": 0.4348, + "step": 15884 + }, + { + "epoch": 2.608769732925502, + "grad_norm": 0.33918844373338264, + "learning_rate": 4.2597660054636376e-06, + "loss": 0.4335, + "step": 15885 + }, + { + "epoch": 2.6089339601338453, + "grad_norm": 0.3361159771638176, + "learning_rate": 4.259297703330732e-06, + "loss": 0.4276, + "step": 15886 + }, + { + "epoch": 2.609098187342188, + "grad_norm": 0.3028877725278089, + "learning_rate": 4.258829403812091e-06, + "loss": 0.4339, + "step": 15887 + }, + { + "epoch": 2.6092624145505305, + "grad_norm": 0.2717049246569981, + "learning_rate": 4.258361106912799e-06, + "loss": 0.422, + "step": 15888 + }, + { + "epoch": 2.6094266417588736, + "grad_norm": 0.29662739630039475, + "learning_rate": 4.257892812637942e-06, + "loss": 0.4485, + "step": 15889 + }, + { + "epoch": 2.6095908689672163, + "grad_norm": 0.32580420835705043, + "learning_rate": 4.257424520992608e-06, + "loss": 0.4304, + "step": 15890 + }, + { + "epoch": 2.609755096175559, + "grad_norm": 0.2789367086644554, + "learning_rate": 4.256956231981883e-06, + "loss": 0.4312, + "step": 15891 + }, + { + "epoch": 2.6099193233839015, + "grad_norm": 0.27050105126913654, + "learning_rate": 4.256487945610853e-06, + "loss": 0.4363, + "step": 15892 + }, + { + "epoch": 2.610083550592244, + "grad_norm": 0.32473112425656697, + "learning_rate": 4.256019661884601e-06, + "loss": 0.4514, + "step": 15893 + }, + { + "epoch": 2.6102477778005873, + "grad_norm": 1.2224814949738927, + "learning_rate": 4.255551380808217e-06, + "loss": 0.4475, + "step": 15894 + }, + { + "epoch": 2.61041200500893, + "grad_norm": 0.2899718094249228, + "learning_rate": 4.255083102386787e-06, + "loss": 0.4432, + "step": 15895 + }, + { + "epoch": 2.6105762322172725, + "grad_norm": 0.31770282494513413, + "learning_rate": 4.254614826625393e-06, + "loss": 0.4164, + "step": 15896 + }, + { + "epoch": 2.6107404594256156, + "grad_norm": 0.31746036994094073, + "learning_rate": 4.254146553529126e-06, + "loss": 0.4343, + "step": 15897 + }, + { + "epoch": 2.6109046866339582, + "grad_norm": 0.32599628063434466, + "learning_rate": 4.253678283103067e-06, + "loss": 0.4484, + "step": 15898 + }, + { + "epoch": 2.611068913842301, + "grad_norm": 0.29199535549958605, + "learning_rate": 4.2532100153523045e-06, + "loss": 0.424, + "step": 15899 + }, + { + "epoch": 2.6112331410506435, + "grad_norm": 0.30444024787236657, + "learning_rate": 4.252741750281925e-06, + "loss": 0.4227, + "step": 15900 + }, + { + "epoch": 2.611397368258986, + "grad_norm": 0.4412199564586546, + "learning_rate": 4.252273487897013e-06, + "loss": 0.4427, + "step": 15901 + }, + { + "epoch": 2.611561595467329, + "grad_norm": 0.5410335689281915, + "learning_rate": 4.251805228202654e-06, + "loss": 0.4292, + "step": 15902 + }, + { + "epoch": 2.611725822675672, + "grad_norm": 0.41213691877007574, + "learning_rate": 4.251336971203934e-06, + "loss": 0.4487, + "step": 15903 + }, + { + "epoch": 2.6118900498840145, + "grad_norm": 0.3330652790042675, + "learning_rate": 4.250868716905941e-06, + "loss": 0.4392, + "step": 15904 + }, + { + "epoch": 2.612054277092357, + "grad_norm": 0.3481140986138393, + "learning_rate": 4.250400465313758e-06, + "loss": 0.422, + "step": 15905 + }, + { + "epoch": 2.6122185043007002, + "grad_norm": 0.31634923265199455, + "learning_rate": 4.249932216432472e-06, + "loss": 0.4314, + "step": 15906 + }, + { + "epoch": 2.612382731509043, + "grad_norm": 0.2918750725823665, + "learning_rate": 4.249463970267168e-06, + "loss": 0.4341, + "step": 15907 + }, + { + "epoch": 2.6125469587173855, + "grad_norm": 1.0899497941084635, + "learning_rate": 4.24899572682293e-06, + "loss": 0.4252, + "step": 15908 + }, + { + "epoch": 2.612711185925728, + "grad_norm": 0.32946172413991354, + "learning_rate": 4.248527486104847e-06, + "loss": 0.4436, + "step": 15909 + }, + { + "epoch": 2.612875413134071, + "grad_norm": 0.28083680309894277, + "learning_rate": 4.248059248118003e-06, + "loss": 0.4234, + "step": 15910 + }, + { + "epoch": 2.613039640342414, + "grad_norm": 0.2696569450276403, + "learning_rate": 4.247591012867483e-06, + "loss": 0.4317, + "step": 15911 + }, + { + "epoch": 2.6132038675507565, + "grad_norm": 0.32797454999406345, + "learning_rate": 4.247122780358374e-06, + "loss": 0.4485, + "step": 15912 + }, + { + "epoch": 2.613368094759099, + "grad_norm": 0.2978012132300891, + "learning_rate": 4.246654550595759e-06, + "loss": 0.4444, + "step": 15913 + }, + { + "epoch": 2.6135323219674422, + "grad_norm": 0.3347378118305014, + "learning_rate": 4.246186323584725e-06, + "loss": 0.4491, + "step": 15914 + }, + { + "epoch": 2.613696549175785, + "grad_norm": 0.30672663933298117, + "learning_rate": 4.245718099330358e-06, + "loss": 0.4711, + "step": 15915 + }, + { + "epoch": 2.6138607763841275, + "grad_norm": 0.2780680965058183, + "learning_rate": 4.245249877837742e-06, + "loss": 0.4234, + "step": 15916 + }, + { + "epoch": 2.61402500359247, + "grad_norm": 0.31100650951866027, + "learning_rate": 4.2447816591119626e-06, + "loss": 0.4395, + "step": 15917 + }, + { + "epoch": 2.614189230800813, + "grad_norm": 0.3165837170282832, + "learning_rate": 4.244313443158106e-06, + "loss": 0.4534, + "step": 15918 + }, + { + "epoch": 2.6143534580091554, + "grad_norm": 0.2945574433533434, + "learning_rate": 4.243845229981256e-06, + "loss": 0.4322, + "step": 15919 + }, + { + "epoch": 2.6145176852174985, + "grad_norm": 0.2854056769828595, + "learning_rate": 4.243377019586501e-06, + "loss": 0.4216, + "step": 15920 + }, + { + "epoch": 2.614681912425841, + "grad_norm": 0.3449269045558118, + "learning_rate": 4.242908811978924e-06, + "loss": 0.4335, + "step": 15921 + }, + { + "epoch": 2.6148461396341838, + "grad_norm": 0.29707638186807517, + "learning_rate": 4.2424406071636085e-06, + "loss": 0.4307, + "step": 15922 + }, + { + "epoch": 2.615010366842527, + "grad_norm": 0.31842893735652705, + "learning_rate": 4.241972405145641e-06, + "loss": 0.4462, + "step": 15923 + }, + { + "epoch": 2.6151745940508695, + "grad_norm": 0.2784297794765863, + "learning_rate": 4.241504205930108e-06, + "loss": 0.4393, + "step": 15924 + }, + { + "epoch": 2.615338821259212, + "grad_norm": 0.637609011183369, + "learning_rate": 4.241036009522094e-06, + "loss": 0.4164, + "step": 15925 + }, + { + "epoch": 2.6155030484675548, + "grad_norm": 0.30614294622529886, + "learning_rate": 4.240567815926683e-06, + "loss": 0.4284, + "step": 15926 + }, + { + "epoch": 2.6156672756758974, + "grad_norm": 0.3447127775105285, + "learning_rate": 4.240099625148961e-06, + "loss": 0.4537, + "step": 15927 + }, + { + "epoch": 2.6158315028842405, + "grad_norm": 0.4970599090971395, + "learning_rate": 4.2396314371940125e-06, + "loss": 0.4371, + "step": 15928 + }, + { + "epoch": 2.615995730092583, + "grad_norm": 0.43555955002393065, + "learning_rate": 4.239163252066925e-06, + "loss": 0.4361, + "step": 15929 + }, + { + "epoch": 2.6161599573009258, + "grad_norm": 0.32903375832698184, + "learning_rate": 4.238695069772779e-06, + "loss": 0.4384, + "step": 15930 + }, + { + "epoch": 2.616324184509269, + "grad_norm": 0.33202092298407887, + "learning_rate": 4.238226890316663e-06, + "loss": 0.4481, + "step": 15931 + }, + { + "epoch": 2.6164884117176115, + "grad_norm": 0.45140111170335284, + "learning_rate": 4.237758713703659e-06, + "loss": 0.4361, + "step": 15932 + }, + { + "epoch": 2.616652638925954, + "grad_norm": 0.33571375004655346, + "learning_rate": 4.237290539938854e-06, + "loss": 0.4146, + "step": 15933 + }, + { + "epoch": 2.6168168661342968, + "grad_norm": 0.5085529476929775, + "learning_rate": 4.236822369027331e-06, + "loss": 0.4412, + "step": 15934 + }, + { + "epoch": 2.6169810933426394, + "grad_norm": 0.26527704034784494, + "learning_rate": 4.236354200974178e-06, + "loss": 0.4344, + "step": 15935 + }, + { + "epoch": 2.617145320550982, + "grad_norm": 0.2653581981143029, + "learning_rate": 4.235886035784477e-06, + "loss": 0.4255, + "step": 15936 + }, + { + "epoch": 2.617309547759325, + "grad_norm": 0.26429214414880825, + "learning_rate": 4.235417873463314e-06, + "loss": 0.4164, + "step": 15937 + }, + { + "epoch": 2.6174737749676678, + "grad_norm": 0.451089379274073, + "learning_rate": 4.234949714015772e-06, + "loss": 0.4303, + "step": 15938 + }, + { + "epoch": 2.6176380021760104, + "grad_norm": 0.613858816212356, + "learning_rate": 4.2344815574469376e-06, + "loss": 0.4503, + "step": 15939 + }, + { + "epoch": 2.6178022293843535, + "grad_norm": 0.42073771656410275, + "learning_rate": 4.234013403761895e-06, + "loss": 0.4332, + "step": 15940 + }, + { + "epoch": 2.617966456592696, + "grad_norm": 0.2764531610938458, + "learning_rate": 4.233545252965728e-06, + "loss": 0.4246, + "step": 15941 + }, + { + "epoch": 2.6181306838010387, + "grad_norm": 0.3069746988437526, + "learning_rate": 4.233077105063521e-06, + "loss": 0.44, + "step": 15942 + }, + { + "epoch": 2.6182949110093814, + "grad_norm": 0.34945204460664414, + "learning_rate": 4.232608960060361e-06, + "loss": 0.4338, + "step": 15943 + }, + { + "epoch": 2.618459138217724, + "grad_norm": 0.3304580150854837, + "learning_rate": 4.23214081796133e-06, + "loss": 0.4513, + "step": 15944 + }, + { + "epoch": 2.618623365426067, + "grad_norm": 0.4045233492426899, + "learning_rate": 4.231672678771513e-06, + "loss": 0.4383, + "step": 15945 + }, + { + "epoch": 2.6187875926344097, + "grad_norm": 0.3027881257346491, + "learning_rate": 4.231204542495996e-06, + "loss": 0.4323, + "step": 15946 + }, + { + "epoch": 2.6189518198427524, + "grad_norm": 0.3566048787535595, + "learning_rate": 4.230736409139861e-06, + "loss": 0.4433, + "step": 15947 + }, + { + "epoch": 2.6191160470510955, + "grad_norm": 0.28909857225231095, + "learning_rate": 4.230268278708193e-06, + "loss": 0.4371, + "step": 15948 + }, + { + "epoch": 2.619280274259438, + "grad_norm": 0.30328281123785367, + "learning_rate": 4.229800151206077e-06, + "loss": 0.4616, + "step": 15949 + }, + { + "epoch": 2.6194445014677807, + "grad_norm": 0.28871293493755906, + "learning_rate": 4.229332026638598e-06, + "loss": 0.4094, + "step": 15950 + }, + { + "epoch": 2.6196087286761234, + "grad_norm": 0.3015937440164125, + "learning_rate": 4.228863905010839e-06, + "loss": 0.4405, + "step": 15951 + }, + { + "epoch": 2.619772955884466, + "grad_norm": 0.33214719515041785, + "learning_rate": 4.2283957863278845e-06, + "loss": 0.4394, + "step": 15952 + }, + { + "epoch": 2.6199371830928087, + "grad_norm": 0.3584487107355222, + "learning_rate": 4.227927670594818e-06, + "loss": 0.4484, + "step": 15953 + }, + { + "epoch": 2.6201014103011517, + "grad_norm": 0.3666964485124772, + "learning_rate": 4.227459557816728e-06, + "loss": 0.4186, + "step": 15954 + }, + { + "epoch": 2.6202656375094944, + "grad_norm": 0.27626251940460655, + "learning_rate": 4.226991447998694e-06, + "loss": 0.4184, + "step": 15955 + }, + { + "epoch": 2.620429864717837, + "grad_norm": 0.3010169911353582, + "learning_rate": 4.226523341145801e-06, + "loss": 0.4435, + "step": 15956 + }, + { + "epoch": 2.62059409192618, + "grad_norm": 0.3042075192771724, + "learning_rate": 4.226055237263132e-06, + "loss": 0.4275, + "step": 15957 + }, + { + "epoch": 2.6207583191345227, + "grad_norm": 0.28337020528828455, + "learning_rate": 4.225587136355774e-06, + "loss": 0.4321, + "step": 15958 + }, + { + "epoch": 2.6209225463428654, + "grad_norm": 0.275572951052758, + "learning_rate": 4.22511903842881e-06, + "loss": 0.4293, + "step": 15959 + }, + { + "epoch": 2.621086773551208, + "grad_norm": 0.3765406983641153, + "learning_rate": 4.2246509434873235e-06, + "loss": 0.4488, + "step": 15960 + }, + { + "epoch": 2.6212510007595506, + "grad_norm": 0.3500604250595598, + "learning_rate": 4.224182851536398e-06, + "loss": 0.4265, + "step": 15961 + }, + { + "epoch": 2.6214152279678937, + "grad_norm": 0.3161889341013727, + "learning_rate": 4.22371476258112e-06, + "loss": 0.4507, + "step": 15962 + }, + { + "epoch": 2.6215794551762364, + "grad_norm": 0.2995552223403898, + "learning_rate": 4.223246676626571e-06, + "loss": 0.4271, + "step": 15963 + }, + { + "epoch": 2.621743682384579, + "grad_norm": 0.40480995619927845, + "learning_rate": 4.222778593677833e-06, + "loss": 0.4236, + "step": 15964 + }, + { + "epoch": 2.621907909592922, + "grad_norm": 0.3988167086614991, + "learning_rate": 4.222310513739995e-06, + "loss": 0.4419, + "step": 15965 + }, + { + "epoch": 2.6220721368012647, + "grad_norm": 0.32298607418392483, + "learning_rate": 4.221842436818136e-06, + "loss": 0.4397, + "step": 15966 + }, + { + "epoch": 2.6222363640096074, + "grad_norm": 0.29969031931968826, + "learning_rate": 4.2213743629173425e-06, + "loss": 0.4201, + "step": 15967 + }, + { + "epoch": 2.62240059121795, + "grad_norm": 0.27714271714246214, + "learning_rate": 4.220906292042697e-06, + "loss": 0.4431, + "step": 15968 + }, + { + "epoch": 2.6225648184262926, + "grad_norm": 0.2942175988231403, + "learning_rate": 4.220438224199285e-06, + "loss": 0.4351, + "step": 15969 + }, + { + "epoch": 2.6227290456346353, + "grad_norm": 0.27700577987436237, + "learning_rate": 4.21997015939219e-06, + "loss": 0.4394, + "step": 15970 + }, + { + "epoch": 2.6228932728429784, + "grad_norm": 0.31847153155102015, + "learning_rate": 4.219502097626492e-06, + "loss": 0.428, + "step": 15971 + }, + { + "epoch": 2.623057500051321, + "grad_norm": 0.2929640968271118, + "learning_rate": 4.219034038907278e-06, + "loss": 0.4342, + "step": 15972 + }, + { + "epoch": 2.6232217272596636, + "grad_norm": 0.36743081273337336, + "learning_rate": 4.21856598323963e-06, + "loss": 0.434, + "step": 15973 + }, + { + "epoch": 2.6233859544680067, + "grad_norm": 0.251752898125371, + "learning_rate": 4.218097930628634e-06, + "loss": 0.4176, + "step": 15974 + }, + { + "epoch": 2.6235501816763493, + "grad_norm": 0.26781368785221105, + "learning_rate": 4.217629881079372e-06, + "loss": 0.4441, + "step": 15975 + }, + { + "epoch": 2.623714408884692, + "grad_norm": 0.29308075035555686, + "learning_rate": 4.217161834596925e-06, + "loss": 0.4396, + "step": 15976 + }, + { + "epoch": 2.6238786360930346, + "grad_norm": 0.3126222728890951, + "learning_rate": 4.21669379118638e-06, + "loss": 0.4339, + "step": 15977 + }, + { + "epoch": 2.6240428633013773, + "grad_norm": 0.42222963655284285, + "learning_rate": 4.21622575085282e-06, + "loss": 0.4497, + "step": 15978 + }, + { + "epoch": 2.6242070905097203, + "grad_norm": 0.2680911148735966, + "learning_rate": 4.215757713601327e-06, + "loss": 0.4505, + "step": 15979 + }, + { + "epoch": 2.624371317718063, + "grad_norm": 0.26722916634092214, + "learning_rate": 4.215289679436986e-06, + "loss": 0.4328, + "step": 15980 + }, + { + "epoch": 2.6245355449264056, + "grad_norm": 0.28522394018656005, + "learning_rate": 4.214821648364877e-06, + "loss": 0.4208, + "step": 15981 + }, + { + "epoch": 2.6246997721347487, + "grad_norm": 0.3730121107654761, + "learning_rate": 4.214353620390086e-06, + "loss": 0.4349, + "step": 15982 + }, + { + "epoch": 2.6248639993430913, + "grad_norm": 0.34376190480566104, + "learning_rate": 4.2138855955176955e-06, + "loss": 0.4477, + "step": 15983 + }, + { + "epoch": 2.625028226551434, + "grad_norm": 0.35628550422099475, + "learning_rate": 4.21341757375279e-06, + "loss": 0.4342, + "step": 15984 + }, + { + "epoch": 2.6251924537597766, + "grad_norm": 0.3953333670118324, + "learning_rate": 4.212949555100451e-06, + "loss": 0.4378, + "step": 15985 + }, + { + "epoch": 2.6253566809681192, + "grad_norm": 0.3403985295123224, + "learning_rate": 4.212481539565762e-06, + "loss": 0.4206, + "step": 15986 + }, + { + "epoch": 2.625520908176462, + "grad_norm": 0.3700412707613751, + "learning_rate": 4.212013527153809e-06, + "loss": 0.4462, + "step": 15987 + }, + { + "epoch": 2.625685135384805, + "grad_norm": 0.3124818850646178, + "learning_rate": 4.211545517869669e-06, + "loss": 0.4382, + "step": 15988 + }, + { + "epoch": 2.6258493625931476, + "grad_norm": 0.37030254675822366, + "learning_rate": 4.21107751171843e-06, + "loss": 0.4218, + "step": 15989 + }, + { + "epoch": 2.6260135898014902, + "grad_norm": 0.3251159171177923, + "learning_rate": 4.210609508705173e-06, + "loss": 0.4342, + "step": 15990 + }, + { + "epoch": 2.6261778170098333, + "grad_norm": 0.28275761112001435, + "learning_rate": 4.210141508834981e-06, + "loss": 0.4313, + "step": 15991 + }, + { + "epoch": 2.626342044218176, + "grad_norm": 0.3372588684132263, + "learning_rate": 4.2096735121129385e-06, + "loss": 0.4427, + "step": 15992 + }, + { + "epoch": 2.6265062714265186, + "grad_norm": 0.35546215576563495, + "learning_rate": 4.2092055185441265e-06, + "loss": 0.4253, + "step": 15993 + }, + { + "epoch": 2.6266704986348612, + "grad_norm": 0.28538441490936256, + "learning_rate": 4.2087375281336286e-06, + "loss": 0.4459, + "step": 15994 + }, + { + "epoch": 2.626834725843204, + "grad_norm": 0.3705717411953057, + "learning_rate": 4.208269540886529e-06, + "loss": 0.4422, + "step": 15995 + }, + { + "epoch": 2.626998953051547, + "grad_norm": 0.26329011035550054, + "learning_rate": 4.207801556807909e-06, + "loss": 0.4442, + "step": 15996 + }, + { + "epoch": 2.6271631802598896, + "grad_norm": 0.28256898887020987, + "learning_rate": 4.20733357590285e-06, + "loss": 0.4422, + "step": 15997 + }, + { + "epoch": 2.6273274074682322, + "grad_norm": 0.31725685114307306, + "learning_rate": 4.206865598176437e-06, + "loss": 0.4472, + "step": 15998 + }, + { + "epoch": 2.6274916346765753, + "grad_norm": 1.181339168715183, + "learning_rate": 4.206397623633753e-06, + "loss": 0.4324, + "step": 15999 + }, + { + "epoch": 2.627655861884918, + "grad_norm": 0.5335499658081728, + "learning_rate": 4.2059296522798786e-06, + "loss": 0.4335, + "step": 16000 + }, + { + "epoch": 2.633813827817476, + "grad_norm": 0.3458413336772324, + "learning_rate": 4.188362348384085e-06, + "loss": 0.4379, + "step": 16001 + }, + { + "epoch": 2.6339784120365923, + "grad_norm": 0.5975672929918889, + "learning_rate": 4.1878934149027705e-06, + "loss": 0.4447, + "step": 16002 + }, + { + "epoch": 2.634142996255709, + "grad_norm": 0.31596954107529784, + "learning_rate": 4.18742448482704e-06, + "loss": 0.43, + "step": 16003 + }, + { + "epoch": 2.6343075804748253, + "grad_norm": 0.29705223444507267, + "learning_rate": 4.186955558162006e-06, + "loss": 0.4309, + "step": 16004 + }, + { + "epoch": 2.6344721646939417, + "grad_norm": 0.3042036993024349, + "learning_rate": 4.186486634912789e-06, + "loss": 0.4447, + "step": 16005 + }, + { + "epoch": 2.634636748913058, + "grad_norm": 0.38759716050242937, + "learning_rate": 4.1860177150845045e-06, + "loss": 0.4292, + "step": 16006 + }, + { + "epoch": 2.6348013331321747, + "grad_norm": 0.31390985601371674, + "learning_rate": 4.18554879868227e-06, + "loss": 0.4378, + "step": 16007 + }, + { + "epoch": 2.634965917351291, + "grad_norm": 0.5655176381737559, + "learning_rate": 4.1850798857112e-06, + "loss": 0.4382, + "step": 16008 + }, + { + "epoch": 2.6351305015704076, + "grad_norm": 0.3223395703472826, + "learning_rate": 4.184610976176414e-06, + "loss": 0.438, + "step": 16009 + }, + { + "epoch": 2.635295085789524, + "grad_norm": 0.35687197663501297, + "learning_rate": 4.184142070083026e-06, + "loss": 0.4237, + "step": 16010 + }, + { + "epoch": 2.6354596700086406, + "grad_norm": 0.44101539197692596, + "learning_rate": 4.183673167436155e-06, + "loss": 0.4341, + "step": 16011 + }, + { + "epoch": 2.635624254227757, + "grad_norm": 0.3007372895231009, + "learning_rate": 4.183204268240916e-06, + "loss": 0.4288, + "step": 16012 + }, + { + "epoch": 2.6357888384468735, + "grad_norm": 0.29204721833278224, + "learning_rate": 4.182735372502424e-06, + "loss": 0.4309, + "step": 16013 + }, + { + "epoch": 2.63595342266599, + "grad_norm": 0.31142547055058784, + "learning_rate": 4.1822664802258e-06, + "loss": 0.4211, + "step": 16014 + }, + { + "epoch": 2.6361180068851064, + "grad_norm": 0.4347133243104596, + "learning_rate": 4.181797591416155e-06, + "loss": 0.4404, + "step": 16015 + }, + { + "epoch": 2.636282591104223, + "grad_norm": 0.5475754911371223, + "learning_rate": 4.181328706078609e-06, + "loss": 0.427, + "step": 16016 + }, + { + "epoch": 2.6364471753233394, + "grad_norm": 0.33465708743300165, + "learning_rate": 4.180859824218275e-06, + "loss": 0.441, + "step": 16017 + }, + { + "epoch": 2.636611759542456, + "grad_norm": 0.4214285639676177, + "learning_rate": 4.180390945840273e-06, + "loss": 0.4257, + "step": 16018 + }, + { + "epoch": 2.6367763437615723, + "grad_norm": 0.380826150533155, + "learning_rate": 4.179922070949717e-06, + "loss": 0.4514, + "step": 16019 + }, + { + "epoch": 2.636940927980689, + "grad_norm": 0.2875791514989647, + "learning_rate": 4.179453199551723e-06, + "loss": 0.4438, + "step": 16020 + }, + { + "epoch": 2.6371055121998053, + "grad_norm": 0.308429485865092, + "learning_rate": 4.178984331651408e-06, + "loss": 0.4246, + "step": 16021 + }, + { + "epoch": 2.6372700964189217, + "grad_norm": 0.346006434138322, + "learning_rate": 4.178515467253888e-06, + "loss": 0.4488, + "step": 16022 + }, + { + "epoch": 2.637434680638038, + "grad_norm": 0.2868870426572875, + "learning_rate": 4.1780466063642804e-06, + "loss": 0.4168, + "step": 16023 + }, + { + "epoch": 2.6375992648571547, + "grad_norm": 0.3518715966594303, + "learning_rate": 4.177577748987697e-06, + "loss": 0.4306, + "step": 16024 + }, + { + "epoch": 2.637763849076271, + "grad_norm": 0.33694302289305983, + "learning_rate": 4.177108895129259e-06, + "loss": 0.4469, + "step": 16025 + }, + { + "epoch": 2.6379284332953876, + "grad_norm": 0.2835088136502032, + "learning_rate": 4.176640044794079e-06, + "loss": 0.4265, + "step": 16026 + }, + { + "epoch": 2.638093017514504, + "grad_norm": 0.3304200094771688, + "learning_rate": 4.176171197987273e-06, + "loss": 0.4282, + "step": 16027 + }, + { + "epoch": 2.6382576017336206, + "grad_norm": 0.29655360915312234, + "learning_rate": 4.1757023547139585e-06, + "loss": 0.4258, + "step": 16028 + }, + { + "epoch": 2.638422185952737, + "grad_norm": 0.34291231739564487, + "learning_rate": 4.175233514979249e-06, + "loss": 0.4276, + "step": 16029 + }, + { + "epoch": 2.6385867701718535, + "grad_norm": 0.32533256752976253, + "learning_rate": 4.174764678788264e-06, + "loss": 0.4225, + "step": 16030 + }, + { + "epoch": 2.63875135439097, + "grad_norm": 0.27444339256916256, + "learning_rate": 4.174295846146115e-06, + "loss": 0.4422, + "step": 16031 + }, + { + "epoch": 2.6389159386100864, + "grad_norm": 0.3594351017234265, + "learning_rate": 4.173827017057922e-06, + "loss": 0.4176, + "step": 16032 + }, + { + "epoch": 2.639080522829203, + "grad_norm": 0.3259773673795588, + "learning_rate": 4.173358191528798e-06, + "loss": 0.4315, + "step": 16033 + }, + { + "epoch": 2.6392451070483194, + "grad_norm": 0.33864295381147363, + "learning_rate": 4.172889369563857e-06, + "loss": 0.4419, + "step": 16034 + }, + { + "epoch": 2.639409691267436, + "grad_norm": 0.29615433686097753, + "learning_rate": 4.172420551168219e-06, + "loss": 0.4399, + "step": 16035 + }, + { + "epoch": 2.6395742754865523, + "grad_norm": 0.47785437472432807, + "learning_rate": 4.171951736346995e-06, + "loss": 0.4334, + "step": 16036 + }, + { + "epoch": 2.639738859705669, + "grad_norm": 0.4004573990222095, + "learning_rate": 4.171482925105304e-06, + "loss": 0.4471, + "step": 16037 + }, + { + "epoch": 2.6399034439247853, + "grad_norm": 0.3143272720035237, + "learning_rate": 4.17101411744826e-06, + "loss": 0.4294, + "step": 16038 + }, + { + "epoch": 2.6400680281439017, + "grad_norm": 0.35269234949253736, + "learning_rate": 4.170545313380978e-06, + "loss": 0.4045, + "step": 16039 + }, + { + "epoch": 2.6402326123630178, + "grad_norm": 0.340001284874301, + "learning_rate": 4.1700765129085754e-06, + "loss": 0.4191, + "step": 16040 + }, + { + "epoch": 2.6403971965821342, + "grad_norm": 0.41295947541357814, + "learning_rate": 4.169607716036165e-06, + "loss": 0.4088, + "step": 16041 + }, + { + "epoch": 2.6405617808012507, + "grad_norm": 0.31813500109467485, + "learning_rate": 4.169138922768865e-06, + "loss": 0.4433, + "step": 16042 + }, + { + "epoch": 2.640726365020367, + "grad_norm": 0.31553983739624203, + "learning_rate": 4.168670133111787e-06, + "loss": 0.4269, + "step": 16043 + }, + { + "epoch": 2.6408909492394836, + "grad_norm": 0.28908417796027375, + "learning_rate": 4.16820134707005e-06, + "loss": 0.4391, + "step": 16044 + }, + { + "epoch": 2.6410555334586, + "grad_norm": 0.25498288846337946, + "learning_rate": 4.1677325646487644e-06, + "loss": 0.4344, + "step": 16045 + }, + { + "epoch": 2.6412201176777166, + "grad_norm": 0.9148740161576957, + "learning_rate": 4.167263785853052e-06, + "loss": 0.4397, + "step": 16046 + }, + { + "epoch": 2.641384701896833, + "grad_norm": 0.3654250205826132, + "learning_rate": 4.166795010688024e-06, + "loss": 0.437, + "step": 16047 + }, + { + "epoch": 2.6415492861159495, + "grad_norm": 0.2839790292083236, + "learning_rate": 4.166326239158794e-06, + "loss": 0.4405, + "step": 16048 + }, + { + "epoch": 2.641713870335066, + "grad_norm": 0.2566802127035541, + "learning_rate": 4.16585747127048e-06, + "loss": 0.4223, + "step": 16049 + }, + { + "epoch": 2.6418784545541825, + "grad_norm": 0.3592073497185888, + "learning_rate": 4.165388707028195e-06, + "loss": 0.4197, + "step": 16050 + }, + { + "epoch": 2.642043038773299, + "grad_norm": 0.296672179833109, + "learning_rate": 4.164919946437056e-06, + "loss": 0.4494, + "step": 16051 + }, + { + "epoch": 2.6422076229924154, + "grad_norm": 0.26656374733013644, + "learning_rate": 4.164451189502176e-06, + "loss": 0.4348, + "step": 16052 + }, + { + "epoch": 2.642372207211532, + "grad_norm": 0.8647309996086165, + "learning_rate": 4.163982436228672e-06, + "loss": 0.424, + "step": 16053 + }, + { + "epoch": 2.6425367914306483, + "grad_norm": 0.2867680259115217, + "learning_rate": 4.163513686621655e-06, + "loss": 0.4553, + "step": 16054 + }, + { + "epoch": 2.642701375649765, + "grad_norm": 0.2975334360287579, + "learning_rate": 4.163044940686246e-06, + "loss": 0.4421, + "step": 16055 + }, + { + "epoch": 2.6428659598688813, + "grad_norm": 0.4094481182342718, + "learning_rate": 4.162576198427554e-06, + "loss": 0.4242, + "step": 16056 + }, + { + "epoch": 2.6430305440879978, + "grad_norm": 0.319781031677507, + "learning_rate": 4.162107459850696e-06, + "loss": 0.4294, + "step": 16057 + }, + { + "epoch": 2.6431951283071142, + "grad_norm": 0.29254963141456264, + "learning_rate": 4.1616387249607865e-06, + "loss": 0.4415, + "step": 16058 + }, + { + "epoch": 2.6433597125262307, + "grad_norm": 0.3342593584953292, + "learning_rate": 4.1611699937629395e-06, + "loss": 0.4362, + "step": 16059 + }, + { + "epoch": 2.643524296745347, + "grad_norm": 0.29622533476076573, + "learning_rate": 4.160701266262272e-06, + "loss": 0.4041, + "step": 16060 + }, + { + "epoch": 2.6436888809644636, + "grad_norm": 0.29560361745977387, + "learning_rate": 4.160232542463895e-06, + "loss": 0.4316, + "step": 16061 + }, + { + "epoch": 2.64385346518358, + "grad_norm": 0.34481303444827943, + "learning_rate": 4.159763822372926e-06, + "loss": 0.4565, + "step": 16062 + }, + { + "epoch": 2.6440180494026966, + "grad_norm": 0.44701790743094155, + "learning_rate": 4.15929510599448e-06, + "loss": 0.4322, + "step": 16063 + }, + { + "epoch": 2.644182633621813, + "grad_norm": 0.38441840228187624, + "learning_rate": 4.158826393333666e-06, + "loss": 0.4308, + "step": 16064 + }, + { + "epoch": 2.644347217840929, + "grad_norm": 0.27856025851449084, + "learning_rate": 4.158357684395606e-06, + "loss": 0.4557, + "step": 16065 + }, + { + "epoch": 2.6445118020600455, + "grad_norm": 0.346035538895123, + "learning_rate": 4.1578889791854074e-06, + "loss": 0.4371, + "step": 16066 + }, + { + "epoch": 2.644676386279162, + "grad_norm": 0.29522255544530657, + "learning_rate": 4.15742027770819e-06, + "loss": 0.4429, + "step": 16067 + }, + { + "epoch": 2.6448409704982785, + "grad_norm": 0.2568408699069682, + "learning_rate": 4.156951579969064e-06, + "loss": 0.4511, + "step": 16068 + }, + { + "epoch": 2.645005554717395, + "grad_norm": 0.2795226041069509, + "learning_rate": 4.156482885973147e-06, + "loss": 0.4317, + "step": 16069 + }, + { + "epoch": 2.6451701389365114, + "grad_norm": 0.41437289939438915, + "learning_rate": 4.156014195725552e-06, + "loss": 0.4471, + "step": 16070 + }, + { + "epoch": 2.645334723155628, + "grad_norm": 0.37249505618644185, + "learning_rate": 4.155545509231391e-06, + "loss": 0.4286, + "step": 16071 + }, + { + "epoch": 2.6454993073747444, + "grad_norm": 0.36242228653315584, + "learning_rate": 4.155076826495783e-06, + "loss": 0.439, + "step": 16072 + }, + { + "epoch": 2.645663891593861, + "grad_norm": 0.31756610529430523, + "learning_rate": 4.154608147523834e-06, + "loss": 0.4273, + "step": 16073 + }, + { + "epoch": 2.6458284758129773, + "grad_norm": 0.32159274867073534, + "learning_rate": 4.154139472320668e-06, + "loss": 0.4537, + "step": 16074 + }, + { + "epoch": 2.645993060032094, + "grad_norm": 0.335271183359619, + "learning_rate": 4.153670800891389e-06, + "loss": 0.4422, + "step": 16075 + }, + { + "epoch": 2.6461576442512103, + "grad_norm": 0.3287060062641479, + "learning_rate": 4.153202133241121e-06, + "loss": 0.4472, + "step": 16076 + }, + { + "epoch": 2.6463222284703267, + "grad_norm": 0.3426461142941525, + "learning_rate": 4.1527334693749716e-06, + "loss": 0.4241, + "step": 16077 + }, + { + "epoch": 2.646486812689443, + "grad_norm": 0.29177189773366047, + "learning_rate": 4.1522648092980546e-06, + "loss": 0.4293, + "step": 16078 + }, + { + "epoch": 2.6466513969085597, + "grad_norm": 0.35057317490047024, + "learning_rate": 4.151796153015486e-06, + "loss": 0.4336, + "step": 16079 + }, + { + "epoch": 2.646815981127676, + "grad_norm": 0.3733394546466849, + "learning_rate": 4.151327500532379e-06, + "loss": 0.4339, + "step": 16080 + }, + { + "epoch": 2.6469805653467926, + "grad_norm": 0.3274339998882645, + "learning_rate": 4.150858851853847e-06, + "loss": 0.4353, + "step": 16081 + }, + { + "epoch": 2.647145149565909, + "grad_norm": 0.39048201119481457, + "learning_rate": 4.150390206985002e-06, + "loss": 0.4288, + "step": 16082 + }, + { + "epoch": 2.6473097337850255, + "grad_norm": 0.2890017641382168, + "learning_rate": 4.149921565930962e-06, + "loss": 0.4285, + "step": 16083 + }, + { + "epoch": 2.647474318004142, + "grad_norm": 0.3689520037974736, + "learning_rate": 4.149452928696839e-06, + "loss": 0.447, + "step": 16084 + }, + { + "epoch": 2.6476389022232585, + "grad_norm": 0.32192762270808284, + "learning_rate": 4.148984295287743e-06, + "loss": 0.4378, + "step": 16085 + }, + { + "epoch": 2.647803486442375, + "grad_norm": 0.34372293910344526, + "learning_rate": 4.148515665708792e-06, + "loss": 0.4493, + "step": 16086 + }, + { + "epoch": 2.6479680706614914, + "grad_norm": 0.3676412358462734, + "learning_rate": 4.148047039965097e-06, + "loss": 0.4115, + "step": 16087 + }, + { + "epoch": 2.648132654880608, + "grad_norm": 0.32314400463920195, + "learning_rate": 4.147578418061772e-06, + "loss": 0.4302, + "step": 16088 + }, + { + "epoch": 2.6482972390997244, + "grad_norm": 0.32951777235338797, + "learning_rate": 4.147109800003931e-06, + "loss": 0.432, + "step": 16089 + }, + { + "epoch": 2.648461823318841, + "grad_norm": 0.4144155570458211, + "learning_rate": 4.146641185796687e-06, + "loss": 0.4526, + "step": 16090 + }, + { + "epoch": 2.6486264075379573, + "grad_norm": 0.32492391828837874, + "learning_rate": 4.146172575445153e-06, + "loss": 0.4171, + "step": 16091 + }, + { + "epoch": 2.6487909917570738, + "grad_norm": 0.32096250114278707, + "learning_rate": 4.1457039689544435e-06, + "loss": 0.4263, + "step": 16092 + }, + { + "epoch": 2.6489555759761902, + "grad_norm": 0.2931596596890427, + "learning_rate": 4.1452353663296715e-06, + "loss": 0.4177, + "step": 16093 + }, + { + "epoch": 2.6491201601953067, + "grad_norm": 0.42602171281467677, + "learning_rate": 4.144766767575947e-06, + "loss": 0.4397, + "step": 16094 + }, + { + "epoch": 2.649284744414423, + "grad_norm": 0.31803480047796434, + "learning_rate": 4.144298172698389e-06, + "loss": 0.4048, + "step": 16095 + }, + { + "epoch": 2.6494493286335397, + "grad_norm": 0.3114085359245422, + "learning_rate": 4.143829581702105e-06, + "loss": 0.4518, + "step": 16096 + }, + { + "epoch": 2.649613912852656, + "grad_norm": 0.27338831216461773, + "learning_rate": 4.143360994592211e-06, + "loss": 0.4443, + "step": 16097 + }, + { + "epoch": 2.6497784970717726, + "grad_norm": 0.3096832549208866, + "learning_rate": 4.14289241137382e-06, + "loss": 0.431, + "step": 16098 + }, + { + "epoch": 2.649943081290889, + "grad_norm": 0.570367929625992, + "learning_rate": 4.142423832052045e-06, + "loss": 0.433, + "step": 16099 + }, + { + "epoch": 2.6501076655100055, + "grad_norm": 0.3231277758544926, + "learning_rate": 4.141955256631997e-06, + "loss": 0.4274, + "step": 16100 + }, + { + "epoch": 2.650272249729122, + "grad_norm": 0.31080107211903213, + "learning_rate": 4.14148668511879e-06, + "loss": 0.4327, + "step": 16101 + }, + { + "epoch": 2.6504368339482385, + "grad_norm": 0.4623297273803156, + "learning_rate": 4.141018117517539e-06, + "loss": 0.4279, + "step": 16102 + }, + { + "epoch": 2.650601418167355, + "grad_norm": 0.26247998589542876, + "learning_rate": 4.1405495538333534e-06, + "loss": 0.4313, + "step": 16103 + }, + { + "epoch": 2.6507660023864714, + "grad_norm": 0.29532370622536497, + "learning_rate": 4.14008099407135e-06, + "loss": 0.4159, + "step": 16104 + }, + { + "epoch": 2.650930586605588, + "grad_norm": 0.41691707860407345, + "learning_rate": 4.139612438236636e-06, + "loss": 0.4391, + "step": 16105 + }, + { + "epoch": 2.6510951708247044, + "grad_norm": 0.4613190154024909, + "learning_rate": 4.13914388633433e-06, + "loss": 0.4255, + "step": 16106 + }, + { + "epoch": 2.6512597550438204, + "grad_norm": 0.2797038181019288, + "learning_rate": 4.138675338369541e-06, + "loss": 0.4285, + "step": 16107 + }, + { + "epoch": 2.651424339262937, + "grad_norm": 0.2871785210367013, + "learning_rate": 4.138206794347381e-06, + "loss": 0.4277, + "step": 16108 + }, + { + "epoch": 2.6515889234820533, + "grad_norm": 0.2833375328201389, + "learning_rate": 4.137738254272966e-06, + "loss": 0.4306, + "step": 16109 + }, + { + "epoch": 2.65175350770117, + "grad_norm": 0.2791742154687719, + "learning_rate": 4.137269718151405e-06, + "loss": 0.4182, + "step": 16110 + }, + { + "epoch": 2.6519180919202863, + "grad_norm": 0.2977754878777086, + "learning_rate": 4.1368011859878135e-06, + "loss": 0.4469, + "step": 16111 + }, + { + "epoch": 2.6520826761394027, + "grad_norm": 0.37579617600776255, + "learning_rate": 4.136332657787302e-06, + "loss": 0.4436, + "step": 16112 + }, + { + "epoch": 2.652247260358519, + "grad_norm": 0.2968274348201199, + "learning_rate": 4.135864133554983e-06, + "loss": 0.4178, + "step": 16113 + }, + { + "epoch": 2.6524118445776357, + "grad_norm": 0.2964357275876666, + "learning_rate": 4.135395613295971e-06, + "loss": 0.4285, + "step": 16114 + }, + { + "epoch": 2.652576428796752, + "grad_norm": 0.26860658490969397, + "learning_rate": 4.134927097015373e-06, + "loss": 0.4483, + "step": 16115 + }, + { + "epoch": 2.6527410130158686, + "grad_norm": 0.34189626645070365, + "learning_rate": 4.134458584718309e-06, + "loss": 0.4368, + "step": 16116 + }, + { + "epoch": 2.652905597234985, + "grad_norm": 0.2779968278720606, + "learning_rate": 4.133990076409884e-06, + "loss": 0.4249, + "step": 16117 + }, + { + "epoch": 2.6530701814541016, + "grad_norm": 0.30593249193352895, + "learning_rate": 4.133521572095214e-06, + "loss": 0.4443, + "step": 16118 + }, + { + "epoch": 2.653234765673218, + "grad_norm": 0.2854793229772029, + "learning_rate": 4.133053071779411e-06, + "loss": 0.4286, + "step": 16119 + }, + { + "epoch": 2.6533993498923345, + "grad_norm": 0.29006974883980136, + "learning_rate": 4.1325845754675856e-06, + "loss": 0.4163, + "step": 16120 + }, + { + "epoch": 2.653563934111451, + "grad_norm": 0.3077942187428335, + "learning_rate": 4.132116083164851e-06, + "loss": 0.45, + "step": 16121 + }, + { + "epoch": 2.6537285183305674, + "grad_norm": 0.3408742741710642, + "learning_rate": 4.131647594876319e-06, + "loss": 0.4492, + "step": 16122 + }, + { + "epoch": 2.653893102549684, + "grad_norm": 0.2926827150992984, + "learning_rate": 4.1311791106071026e-06, + "loss": 0.4253, + "step": 16123 + }, + { + "epoch": 2.6540576867688004, + "grad_norm": 0.3155775845434203, + "learning_rate": 4.13071063036231e-06, + "loss": 0.4121, + "step": 16124 + }, + { + "epoch": 2.654222270987917, + "grad_norm": 0.2646331437379279, + "learning_rate": 4.130242154147058e-06, + "loss": 0.4217, + "step": 16125 + }, + { + "epoch": 2.6543868552070333, + "grad_norm": 0.26323728951390457, + "learning_rate": 4.129773681966453e-06, + "loss": 0.4286, + "step": 16126 + }, + { + "epoch": 2.65455143942615, + "grad_norm": 0.4258820615094784, + "learning_rate": 4.129305213825614e-06, + "loss": 0.4105, + "step": 16127 + }, + { + "epoch": 2.6547160236452663, + "grad_norm": 0.2896312674933501, + "learning_rate": 4.128836749729646e-06, + "loss": 0.4373, + "step": 16128 + }, + { + "epoch": 2.6548806078643827, + "grad_norm": 0.31894235570171264, + "learning_rate": 4.128368289683663e-06, + "loss": 0.4397, + "step": 16129 + }, + { + "epoch": 2.655045192083499, + "grad_norm": 0.31359654504208, + "learning_rate": 4.127899833692778e-06, + "loss": 0.442, + "step": 16130 + }, + { + "epoch": 2.6552097763026157, + "grad_norm": 0.3494845795600111, + "learning_rate": 4.1274313817621e-06, + "loss": 0.4355, + "step": 16131 + }, + { + "epoch": 2.6553743605217317, + "grad_norm": 0.3489366047251084, + "learning_rate": 4.126962933896744e-06, + "loss": 0.4342, + "step": 16132 + }, + { + "epoch": 2.655538944740848, + "grad_norm": 0.34060977168449236, + "learning_rate": 4.126494490101818e-06, + "loss": 0.4374, + "step": 16133 + }, + { + "epoch": 2.6557035289599646, + "grad_norm": 0.2742126576753534, + "learning_rate": 4.126026050382436e-06, + "loss": 0.4451, + "step": 16134 + }, + { + "epoch": 2.655868113179081, + "grad_norm": 0.2992707000967294, + "learning_rate": 4.125557614743707e-06, + "loss": 0.4185, + "step": 16135 + }, + { + "epoch": 2.6560326973981976, + "grad_norm": 0.2877101811641106, + "learning_rate": 4.125089183190747e-06, + "loss": 0.4328, + "step": 16136 + }, + { + "epoch": 2.656197281617314, + "grad_norm": 0.4447724590169303, + "learning_rate": 4.124620755728663e-06, + "loss": 0.4457, + "step": 16137 + }, + { + "epoch": 2.6563618658364305, + "grad_norm": 0.28823615255434304, + "learning_rate": 4.124152332362565e-06, + "loss": 0.4357, + "step": 16138 + }, + { + "epoch": 2.656526450055547, + "grad_norm": 0.2877010704372036, + "learning_rate": 4.12368391309757e-06, + "loss": 0.4209, + "step": 16139 + }, + { + "epoch": 2.6566910342746635, + "grad_norm": 1.2348588948682877, + "learning_rate": 4.123215497938783e-06, + "loss": 0.434, + "step": 16140 + }, + { + "epoch": 2.65685561849378, + "grad_norm": 0.3176384519860354, + "learning_rate": 4.12274708689132e-06, + "loss": 0.4248, + "step": 16141 + }, + { + "epoch": 2.6570202027128964, + "grad_norm": 0.34314293250836697, + "learning_rate": 4.1222786799602895e-06, + "loss": 0.4255, + "step": 16142 + }, + { + "epoch": 2.657184786932013, + "grad_norm": 0.32065602089046813, + "learning_rate": 4.121810277150804e-06, + "loss": 0.4327, + "step": 16143 + }, + { + "epoch": 2.6573493711511293, + "grad_norm": 0.31641949004858255, + "learning_rate": 4.121341878467975e-06, + "loss": 0.4359, + "step": 16144 + }, + { + "epoch": 2.657513955370246, + "grad_norm": 0.31985622093017013, + "learning_rate": 4.120873483916909e-06, + "loss": 0.4469, + "step": 16145 + }, + { + "epoch": 2.6576785395893623, + "grad_norm": 0.4740004344214797, + "learning_rate": 4.1204050935027235e-06, + "loss": 0.4051, + "step": 16146 + }, + { + "epoch": 2.6578431238084788, + "grad_norm": 0.3047611687392972, + "learning_rate": 4.119936707230524e-06, + "loss": 0.4228, + "step": 16147 + }, + { + "epoch": 2.6580077080275952, + "grad_norm": 0.35193746954809046, + "learning_rate": 4.1194683251054246e-06, + "loss": 0.4341, + "step": 16148 + }, + { + "epoch": 2.6581722922467117, + "grad_norm": 0.4636378951303919, + "learning_rate": 4.118999947132533e-06, + "loss": 0.4312, + "step": 16149 + }, + { + "epoch": 2.658336876465828, + "grad_norm": 0.32009910134700315, + "learning_rate": 4.1185315733169645e-06, + "loss": 0.4396, + "step": 16150 + }, + { + "epoch": 2.6585014606849446, + "grad_norm": 0.32073795597582255, + "learning_rate": 4.1180632036638265e-06, + "loss": 0.4309, + "step": 16151 + }, + { + "epoch": 2.658666044904061, + "grad_norm": 0.4289503976591517, + "learning_rate": 4.117594838178229e-06, + "loss": 0.4286, + "step": 16152 + }, + { + "epoch": 2.6588306291231776, + "grad_norm": 0.3596741397311703, + "learning_rate": 4.1171264768652855e-06, + "loss": 0.4439, + "step": 16153 + }, + { + "epoch": 2.658995213342294, + "grad_norm": 0.35541758817558256, + "learning_rate": 4.1166581197301036e-06, + "loss": 0.4418, + "step": 16154 + }, + { + "epoch": 2.6591597975614105, + "grad_norm": 0.32675505943073413, + "learning_rate": 4.116189766777797e-06, + "loss": 0.4353, + "step": 16155 + }, + { + "epoch": 2.659324381780527, + "grad_norm": 0.3698642182004654, + "learning_rate": 4.115721418013473e-06, + "loss": 0.4203, + "step": 16156 + }, + { + "epoch": 2.6594889659996435, + "grad_norm": 0.6317514981473367, + "learning_rate": 4.115253073442245e-06, + "loss": 0.4277, + "step": 16157 + }, + { + "epoch": 2.65965355021876, + "grad_norm": 0.3042933450838122, + "learning_rate": 4.114784733069222e-06, + "loss": 0.4547, + "step": 16158 + }, + { + "epoch": 2.6598181344378764, + "grad_norm": 0.5519804724689971, + "learning_rate": 4.114316396899513e-06, + "loss": 0.4268, + "step": 16159 + }, + { + "epoch": 2.659982718656993, + "grad_norm": 0.3026651330178726, + "learning_rate": 4.11384806493823e-06, + "loss": 0.4267, + "step": 16160 + }, + { + "epoch": 2.6601473028761093, + "grad_norm": 0.2896548638142162, + "learning_rate": 4.113379737190482e-06, + "loss": 0.434, + "step": 16161 + }, + { + "epoch": 2.660311887095226, + "grad_norm": 0.3162773316354961, + "learning_rate": 4.112911413661382e-06, + "loss": 0.4183, + "step": 16162 + }, + { + "epoch": 2.6604764713143423, + "grad_norm": 0.30438307843708645, + "learning_rate": 4.112443094356036e-06, + "loss": 0.4222, + "step": 16163 + }, + { + "epoch": 2.6606410555334588, + "grad_norm": 0.325295396745359, + "learning_rate": 4.111974779279558e-06, + "loss": 0.4417, + "step": 16164 + }, + { + "epoch": 2.6608056397525752, + "grad_norm": 0.289759415015586, + "learning_rate": 4.111506468437057e-06, + "loss": 0.4367, + "step": 16165 + }, + { + "epoch": 2.6609702239716917, + "grad_norm": 0.2611358156919582, + "learning_rate": 4.11103816183364e-06, + "loss": 0.4349, + "step": 16166 + }, + { + "epoch": 2.661134808190808, + "grad_norm": 0.30681190236477174, + "learning_rate": 4.110569859474421e-06, + "loss": 0.4151, + "step": 16167 + }, + { + "epoch": 2.6612993924099246, + "grad_norm": 0.3179901230899138, + "learning_rate": 4.110101561364506e-06, + "loss": 0.4361, + "step": 16168 + }, + { + "epoch": 2.661463976629041, + "grad_norm": 0.3743547215570952, + "learning_rate": 4.109633267509009e-06, + "loss": 0.4457, + "step": 16169 + }, + { + "epoch": 2.6616285608481576, + "grad_norm": 0.3155628139691341, + "learning_rate": 4.109164977913037e-06, + "loss": 0.4396, + "step": 16170 + }, + { + "epoch": 2.661793145067274, + "grad_norm": 0.27701818933684635, + "learning_rate": 4.108696692581702e-06, + "loss": 0.4359, + "step": 16171 + }, + { + "epoch": 2.6619577292863905, + "grad_norm": 0.3668374170702665, + "learning_rate": 4.108228411520113e-06, + "loss": 0.4368, + "step": 16172 + }, + { + "epoch": 2.662122313505507, + "grad_norm": 0.3149660241315818, + "learning_rate": 4.1077601347333775e-06, + "loss": 0.4477, + "step": 16173 + }, + { + "epoch": 2.662286897724623, + "grad_norm": 0.36031363805868644, + "learning_rate": 4.107291862226608e-06, + "loss": 0.4386, + "step": 16174 + }, + { + "epoch": 2.6624514819437395, + "grad_norm": 0.3752687445537029, + "learning_rate": 4.106823594004912e-06, + "loss": 0.4326, + "step": 16175 + }, + { + "epoch": 2.662616066162856, + "grad_norm": 0.33165849121813185, + "learning_rate": 4.106355330073402e-06, + "loss": 0.4155, + "step": 16176 + }, + { + "epoch": 2.6627806503819724, + "grad_norm": 0.2996559896507021, + "learning_rate": 4.105887070437182e-06, + "loss": 0.4407, + "step": 16177 + }, + { + "epoch": 2.662945234601089, + "grad_norm": 0.3691651742015506, + "learning_rate": 4.1054188151013685e-06, + "loss": 0.4311, + "step": 16178 + }, + { + "epoch": 2.6631098188202054, + "grad_norm": 0.37706408757539167, + "learning_rate": 4.1049505640710655e-06, + "loss": 0.4414, + "step": 16179 + }, + { + "epoch": 2.663274403039322, + "grad_norm": 0.4580061387375138, + "learning_rate": 4.104482317351386e-06, + "loss": 0.425, + "step": 16180 + }, + { + "epoch": 2.6634389872584383, + "grad_norm": 0.3982468103731691, + "learning_rate": 4.104014074947436e-06, + "loss": 0.4176, + "step": 16181 + }, + { + "epoch": 2.6636035714775548, + "grad_norm": 0.31071724356969327, + "learning_rate": 4.103545836864326e-06, + "loss": 0.4289, + "step": 16182 + }, + { + "epoch": 2.6637681556966712, + "grad_norm": 0.3174287098929874, + "learning_rate": 4.103077603107167e-06, + "loss": 0.4249, + "step": 16183 + }, + { + "epoch": 2.6639327399157877, + "grad_norm": 0.28455613214851727, + "learning_rate": 4.102609373681066e-06, + "loss": 0.4451, + "step": 16184 + }, + { + "epoch": 2.664097324134904, + "grad_norm": 0.3196335886895791, + "learning_rate": 4.102141148591134e-06, + "loss": 0.4226, + "step": 16185 + }, + { + "epoch": 2.6642619083540207, + "grad_norm": 0.6233557455696098, + "learning_rate": 4.1016729278424765e-06, + "loss": 0.4241, + "step": 16186 + }, + { + "epoch": 2.664426492573137, + "grad_norm": 0.33200426377694864, + "learning_rate": 4.101204711440209e-06, + "loss": 0.4439, + "step": 16187 + }, + { + "epoch": 2.6645910767922536, + "grad_norm": 0.3156950309931716, + "learning_rate": 4.100736499389434e-06, + "loss": 0.4652, + "step": 16188 + }, + { + "epoch": 2.66475566101137, + "grad_norm": 0.29051265138452365, + "learning_rate": 4.100268291695262e-06, + "loss": 0.4058, + "step": 16189 + }, + { + "epoch": 2.6649202452304865, + "grad_norm": 0.3876607898144007, + "learning_rate": 4.099800088362805e-06, + "loss": 0.4336, + "step": 16190 + }, + { + "epoch": 2.665084829449603, + "grad_norm": 0.2801680526667986, + "learning_rate": 4.099331889397168e-06, + "loss": 0.4409, + "step": 16191 + }, + { + "epoch": 2.6652494136687195, + "grad_norm": 0.3928005012139743, + "learning_rate": 4.098863694803462e-06, + "loss": 0.4452, + "step": 16192 + }, + { + "epoch": 2.665413997887836, + "grad_norm": 0.31667327209953494, + "learning_rate": 4.098395504586795e-06, + "loss": 0.4314, + "step": 16193 + }, + { + "epoch": 2.6655785821069524, + "grad_norm": 0.4095252931530756, + "learning_rate": 4.0979273187522775e-06, + "loss": 0.4398, + "step": 16194 + }, + { + "epoch": 2.665743166326069, + "grad_norm": 0.3039674024879263, + "learning_rate": 4.097459137305016e-06, + "loss": 0.4303, + "step": 16195 + }, + { + "epoch": 2.6659077505451854, + "grad_norm": 0.3846388319882331, + "learning_rate": 4.096990960250118e-06, + "loss": 0.4087, + "step": 16196 + }, + { + "epoch": 2.666072334764302, + "grad_norm": 0.36933999442641624, + "learning_rate": 4.096522787592697e-06, + "loss": 0.435, + "step": 16197 + }, + { + "epoch": 2.6662369189834183, + "grad_norm": 0.26155978929311113, + "learning_rate": 4.0960546193378555e-06, + "loss": 0.4179, + "step": 16198 + }, + { + "epoch": 2.6664015032025343, + "grad_norm": 0.2824416471224022, + "learning_rate": 4.095586455490706e-06, + "loss": 0.4447, + "step": 16199 + }, + { + "epoch": 2.666566087421651, + "grad_norm": 0.3029193693644384, + "learning_rate": 4.095118296056355e-06, + "loss": 0.4314, + "step": 16200 + }, + { + "epoch": 2.6667306716407673, + "grad_norm": 0.5897728845044921, + "learning_rate": 4.0946501410399124e-06, + "loss": 0.4221, + "step": 16201 + }, + { + "epoch": 2.6668952558598837, + "grad_norm": 0.3111304963439345, + "learning_rate": 4.094181990446486e-06, + "loss": 0.4532, + "step": 16202 + }, + { + "epoch": 2.667059840079, + "grad_norm": 0.3465409814026947, + "learning_rate": 4.093713844281182e-06, + "loss": 0.4544, + "step": 16203 + }, + { + "epoch": 2.6672244242981167, + "grad_norm": 0.2985915021659544, + "learning_rate": 4.093245702549113e-06, + "loss": 0.4428, + "step": 16204 + }, + { + "epoch": 2.667389008517233, + "grad_norm": 0.3219223204177759, + "learning_rate": 4.092777565255381e-06, + "loss": 0.443, + "step": 16205 + }, + { + "epoch": 2.6675535927363496, + "grad_norm": 0.3190977768020272, + "learning_rate": 4.092309432405101e-06, + "loss": 0.4039, + "step": 16206 + }, + { + "epoch": 2.667718176955466, + "grad_norm": 0.29823725676161056, + "learning_rate": 4.091841304003376e-06, + "loss": 0.4328, + "step": 16207 + }, + { + "epoch": 2.6678827611745826, + "grad_norm": 0.3483582913118178, + "learning_rate": 4.091373180055317e-06, + "loss": 0.4572, + "step": 16208 + }, + { + "epoch": 2.668047345393699, + "grad_norm": 0.2992140663455201, + "learning_rate": 4.09090506056603e-06, + "loss": 0.4164, + "step": 16209 + }, + { + "epoch": 2.6682119296128155, + "grad_norm": 0.3210340795390415, + "learning_rate": 4.090436945540623e-06, + "loss": 0.4382, + "step": 16210 + }, + { + "epoch": 2.668376513831932, + "grad_norm": 0.3029225777699273, + "learning_rate": 4.089968834984206e-06, + "loss": 0.4334, + "step": 16211 + }, + { + "epoch": 2.6685410980510484, + "grad_norm": 0.29585427129308206, + "learning_rate": 4.089500728901885e-06, + "loss": 0.4171, + "step": 16212 + }, + { + "epoch": 2.668705682270165, + "grad_norm": 0.30808751863312334, + "learning_rate": 4.089032627298768e-06, + "loss": 0.4282, + "step": 16213 + }, + { + "epoch": 2.6688702664892814, + "grad_norm": 0.26945376664591575, + "learning_rate": 4.0885645301799615e-06, + "loss": 0.4313, + "step": 16214 + }, + { + "epoch": 2.669034850708398, + "grad_norm": 0.3248008830813408, + "learning_rate": 4.088096437550577e-06, + "loss": 0.4383, + "step": 16215 + }, + { + "epoch": 2.6691994349275143, + "grad_norm": 0.3257332301812428, + "learning_rate": 4.0876283494157186e-06, + "loss": 0.4334, + "step": 16216 + }, + { + "epoch": 2.669364019146631, + "grad_norm": 0.4606309903111496, + "learning_rate": 4.087160265780496e-06, + "loss": 0.4325, + "step": 16217 + }, + { + "epoch": 2.6695286033657473, + "grad_norm": 0.3205989020265071, + "learning_rate": 4.086692186650016e-06, + "loss": 0.4257, + "step": 16218 + }, + { + "epoch": 2.6696931875848637, + "grad_norm": 0.333101561605051, + "learning_rate": 4.086224112029385e-06, + "loss": 0.4371, + "step": 16219 + }, + { + "epoch": 2.66985777180398, + "grad_norm": 0.28771924507188223, + "learning_rate": 4.085756041923711e-06, + "loss": 0.438, + "step": 16220 + }, + { + "epoch": 2.6700223560230967, + "grad_norm": 0.27046072482072303, + "learning_rate": 4.085287976338102e-06, + "loss": 0.4127, + "step": 16221 + }, + { + "epoch": 2.670186940242213, + "grad_norm": 0.300922333424232, + "learning_rate": 4.084819915277665e-06, + "loss": 0.4319, + "step": 16222 + }, + { + "epoch": 2.6703515244613296, + "grad_norm": 0.24299807154068132, + "learning_rate": 4.0843518587475064e-06, + "loss": 0.415, + "step": 16223 + }, + { + "epoch": 2.670516108680446, + "grad_norm": 0.341735720442819, + "learning_rate": 4.083883806752737e-06, + "loss": 0.414, + "step": 16224 + }, + { + "epoch": 2.6706806928995626, + "grad_norm": 0.3393067511916409, + "learning_rate": 4.083415759298461e-06, + "loss": 0.429, + "step": 16225 + }, + { + "epoch": 2.670845277118679, + "grad_norm": 0.28674310194577535, + "learning_rate": 4.082947716389784e-06, + "loss": 0.429, + "step": 16226 + }, + { + "epoch": 2.6710098613377955, + "grad_norm": 0.5185646111855251, + "learning_rate": 4.082479678031817e-06, + "loss": 0.4454, + "step": 16227 + }, + { + "epoch": 2.671174445556912, + "grad_norm": 0.34007590718458663, + "learning_rate": 4.082011644229663e-06, + "loss": 0.4159, + "step": 16228 + }, + { + "epoch": 2.6713390297760284, + "grad_norm": 0.29528169814285576, + "learning_rate": 4.081543614988434e-06, + "loss": 0.4363, + "step": 16229 + }, + { + "epoch": 2.671503613995145, + "grad_norm": 0.3010034087122512, + "learning_rate": 4.0810755903132315e-06, + "loss": 0.4339, + "step": 16230 + }, + { + "epoch": 2.6716681982142614, + "grad_norm": 0.3420782797293866, + "learning_rate": 4.080607570209166e-06, + "loss": 0.4532, + "step": 16231 + }, + { + "epoch": 2.671832782433378, + "grad_norm": 0.3518435427751243, + "learning_rate": 4.080139554681343e-06, + "loss": 0.4374, + "step": 16232 + }, + { + "epoch": 2.6719973666524943, + "grad_norm": 0.3853923992274283, + "learning_rate": 4.0796715437348696e-06, + "loss": 0.4189, + "step": 16233 + }, + { + "epoch": 2.672161950871611, + "grad_norm": 0.39547593141325593, + "learning_rate": 4.079203537374853e-06, + "loss": 0.4322, + "step": 16234 + }, + { + "epoch": 2.6723265350907273, + "grad_norm": 0.4567047029006985, + "learning_rate": 4.078735535606399e-06, + "loss": 0.4189, + "step": 16235 + }, + { + "epoch": 2.6724911193098437, + "grad_norm": 0.30768753053570136, + "learning_rate": 4.078267538434616e-06, + "loss": 0.403, + "step": 16236 + }, + { + "epoch": 2.67265570352896, + "grad_norm": 0.37096345709738054, + "learning_rate": 4.077799545864607e-06, + "loss": 0.4071, + "step": 16237 + }, + { + "epoch": 2.6728202877480767, + "grad_norm": 0.2795107187902698, + "learning_rate": 4.0773315579014835e-06, + "loss": 0.4394, + "step": 16238 + }, + { + "epoch": 2.672984871967193, + "grad_norm": 0.40934872094739266, + "learning_rate": 4.076863574550349e-06, + "loss": 0.4526, + "step": 16239 + }, + { + "epoch": 2.673149456186309, + "grad_norm": 0.3011794778746138, + "learning_rate": 4.076395595816308e-06, + "loss": 0.4298, + "step": 16240 + }, + { + "epoch": 2.6733140404054256, + "grad_norm": 0.4588930576874614, + "learning_rate": 4.075927621704471e-06, + "loss": 0.4395, + "step": 16241 + }, + { + "epoch": 2.673478624624542, + "grad_norm": 0.2837479901390299, + "learning_rate": 4.075459652219941e-06, + "loss": 0.4207, + "step": 16242 + }, + { + "epoch": 2.6736432088436586, + "grad_norm": 0.37951879970028074, + "learning_rate": 4.074991687367827e-06, + "loss": 0.4461, + "step": 16243 + }, + { + "epoch": 2.673807793062775, + "grad_norm": 0.4577196555274362, + "learning_rate": 4.074523727153234e-06, + "loss": 0.4284, + "step": 16244 + }, + { + "epoch": 2.6739723772818915, + "grad_norm": 0.3745132841651956, + "learning_rate": 4.074055771581268e-06, + "loss": 0.4428, + "step": 16245 + }, + { + "epoch": 2.674136961501008, + "grad_norm": 0.5084261638812808, + "learning_rate": 4.073587820657037e-06, + "loss": 0.4531, + "step": 16246 + }, + { + "epoch": 2.6743015457201245, + "grad_norm": 0.39509714110382965, + "learning_rate": 4.073119874385644e-06, + "loss": 0.4279, + "step": 16247 + }, + { + "epoch": 2.674466129939241, + "grad_norm": 0.3186932320538688, + "learning_rate": 4.072651932772197e-06, + "loss": 0.4236, + "step": 16248 + }, + { + "epoch": 2.6746307141583574, + "grad_norm": 0.49111200088983387, + "learning_rate": 4.072183995821801e-06, + "loss": 0.4293, + "step": 16249 + }, + { + "epoch": 2.674795298377474, + "grad_norm": 0.349164641605376, + "learning_rate": 4.071716063539563e-06, + "loss": 0.4612, + "step": 16250 + }, + { + "epoch": 2.6749598825965903, + "grad_norm": 0.4703838182432665, + "learning_rate": 4.0712481359305876e-06, + "loss": 0.4506, + "step": 16251 + }, + { + "epoch": 2.675124466815707, + "grad_norm": 0.27681566344147845, + "learning_rate": 4.070780212999982e-06, + "loss": 0.4306, + "step": 16252 + }, + { + "epoch": 2.6752890510348233, + "grad_norm": 0.2706256207107712, + "learning_rate": 4.070312294752852e-06, + "loss": 0.4132, + "step": 16253 + }, + { + "epoch": 2.6754536352539398, + "grad_norm": 0.2942736546722622, + "learning_rate": 4.069844381194301e-06, + "loss": 0.4247, + "step": 16254 + }, + { + "epoch": 2.6756182194730562, + "grad_norm": 0.2887730169859917, + "learning_rate": 4.06937647232944e-06, + "loss": 0.4282, + "step": 16255 + }, + { + "epoch": 2.6757828036921727, + "grad_norm": 0.3249853712294799, + "learning_rate": 4.068908568163367e-06, + "loss": 0.4552, + "step": 16256 + }, + { + "epoch": 2.675947387911289, + "grad_norm": 0.42233120294813575, + "learning_rate": 4.068440668701195e-06, + "loss": 0.4494, + "step": 16257 + }, + { + "epoch": 2.6761119721304056, + "grad_norm": 0.39370678424915456, + "learning_rate": 4.067972773948022e-06, + "loss": 0.4425, + "step": 16258 + }, + { + "epoch": 2.676276556349522, + "grad_norm": 0.3351960219169127, + "learning_rate": 4.067504883908962e-06, + "loss": 0.4287, + "step": 16259 + }, + { + "epoch": 2.6764411405686386, + "grad_norm": 0.3646822270877999, + "learning_rate": 4.067036998589114e-06, + "loss": 0.4406, + "step": 16260 + }, + { + "epoch": 2.676605724787755, + "grad_norm": 0.27525514075166774, + "learning_rate": 4.066569117993586e-06, + "loss": 0.4177, + "step": 16261 + }, + { + "epoch": 2.6767703090068715, + "grad_norm": 0.33919213329865616, + "learning_rate": 4.066101242127483e-06, + "loss": 0.4459, + "step": 16262 + }, + { + "epoch": 2.676934893225988, + "grad_norm": 0.35308673235114507, + "learning_rate": 4.0656333709959085e-06, + "loss": 0.4379, + "step": 16263 + }, + { + "epoch": 2.6770994774451045, + "grad_norm": 0.30855438126161155, + "learning_rate": 4.065165504603971e-06, + "loss": 0.4431, + "step": 16264 + }, + { + "epoch": 2.677264061664221, + "grad_norm": 0.34653493149245607, + "learning_rate": 4.064697642956773e-06, + "loss": 0.4355, + "step": 16265 + }, + { + "epoch": 2.677428645883337, + "grad_norm": 0.29623996229396615, + "learning_rate": 4.064229786059422e-06, + "loss": 0.4255, + "step": 16266 + }, + { + "epoch": 2.6775932301024534, + "grad_norm": 0.3025284374334076, + "learning_rate": 4.06376193391702e-06, + "loss": 0.4453, + "step": 16267 + }, + { + "epoch": 2.67775781432157, + "grad_norm": 0.3830832942566304, + "learning_rate": 4.063294086534675e-06, + "loss": 0.4266, + "step": 16268 + }, + { + "epoch": 2.6779223985406864, + "grad_norm": 0.3344091233245314, + "learning_rate": 4.06282624391749e-06, + "loss": 0.4222, + "step": 16269 + }, + { + "epoch": 2.678086982759803, + "grad_norm": 0.3260377569625621, + "learning_rate": 4.06235840607057e-06, + "loss": 0.4139, + "step": 16270 + }, + { + "epoch": 2.6782515669789193, + "grad_norm": 0.4072985148156493, + "learning_rate": 4.061890572999021e-06, + "loss": 0.4392, + "step": 16271 + }, + { + "epoch": 2.6784161511980358, + "grad_norm": 0.3117617202916446, + "learning_rate": 4.061422744707947e-06, + "loss": 0.4456, + "step": 16272 + }, + { + "epoch": 2.6785807354171522, + "grad_norm": 0.34202950905471813, + "learning_rate": 4.060954921202454e-06, + "loss": 0.4252, + "step": 16273 + }, + { + "epoch": 2.6787453196362687, + "grad_norm": 0.3078055002171829, + "learning_rate": 4.060487102487644e-06, + "loss": 0.4427, + "step": 16274 + }, + { + "epoch": 2.678909903855385, + "grad_norm": 0.34492177812050034, + "learning_rate": 4.060019288568624e-06, + "loss": 0.4127, + "step": 16275 + }, + { + "epoch": 2.6790744880745017, + "grad_norm": 0.4479150830400735, + "learning_rate": 4.0595514794505e-06, + "loss": 0.41, + "step": 16276 + }, + { + "epoch": 2.679239072293618, + "grad_norm": 0.3027447731401161, + "learning_rate": 4.0590836751383704e-06, + "loss": 0.4125, + "step": 16277 + }, + { + "epoch": 2.6794036565127346, + "grad_norm": 0.3868369095960643, + "learning_rate": 4.058615875637347e-06, + "loss": 0.4229, + "step": 16278 + }, + { + "epoch": 2.679568240731851, + "grad_norm": 0.31252996703349806, + "learning_rate": 4.058148080952529e-06, + "loss": 0.4387, + "step": 16279 + }, + { + "epoch": 2.6797328249509675, + "grad_norm": 0.38131546502511204, + "learning_rate": 4.057680291089025e-06, + "loss": 0.4208, + "step": 16280 + }, + { + "epoch": 2.679897409170084, + "grad_norm": 0.31437886402272847, + "learning_rate": 4.057212506051935e-06, + "loss": 0.442, + "step": 16281 + }, + { + "epoch": 2.6800619933892005, + "grad_norm": 0.29705877207988296, + "learning_rate": 4.056744725846366e-06, + "loss": 0.4405, + "step": 16282 + }, + { + "epoch": 2.680226577608317, + "grad_norm": 0.3287127232403905, + "learning_rate": 4.0562769504774226e-06, + "loss": 0.4369, + "step": 16283 + }, + { + "epoch": 2.6803911618274334, + "grad_norm": 0.3461000338063579, + "learning_rate": 4.055809179950207e-06, + "loss": 0.4416, + "step": 16284 + }, + { + "epoch": 2.68055574604655, + "grad_norm": 0.3162634482779627, + "learning_rate": 4.055341414269825e-06, + "loss": 0.4496, + "step": 16285 + }, + { + "epoch": 2.6807203302656664, + "grad_norm": 0.36816052604616, + "learning_rate": 4.0548736534413795e-06, + "loss": 0.439, + "step": 16286 + }, + { + "epoch": 2.680884914484783, + "grad_norm": 0.354552042915324, + "learning_rate": 4.0544058974699764e-06, + "loss": 0.4346, + "step": 16287 + }, + { + "epoch": 2.6810494987038993, + "grad_norm": 0.38578018985135637, + "learning_rate": 4.053938146360715e-06, + "loss": 0.4294, + "step": 16288 + }, + { + "epoch": 2.6812140829230158, + "grad_norm": 0.26869839291294056, + "learning_rate": 4.053470400118707e-06, + "loss": 0.4321, + "step": 16289 + }, + { + "epoch": 2.6813786671421322, + "grad_norm": 0.5894318253041799, + "learning_rate": 4.053002658749049e-06, + "loss": 0.4472, + "step": 16290 + }, + { + "epoch": 2.6815432513612487, + "grad_norm": 0.32389992131136, + "learning_rate": 4.0525349222568475e-06, + "loss": 0.4352, + "step": 16291 + }, + { + "epoch": 2.681707835580365, + "grad_norm": 0.35372515325452797, + "learning_rate": 4.052067190647208e-06, + "loss": 0.4445, + "step": 16292 + }, + { + "epoch": 2.6818724197994817, + "grad_norm": 0.3780722530944741, + "learning_rate": 4.051599463925231e-06, + "loss": 0.4342, + "step": 16293 + }, + { + "epoch": 2.682037004018598, + "grad_norm": 0.3179532350010808, + "learning_rate": 4.051131742096022e-06, + "loss": 0.4373, + "step": 16294 + }, + { + "epoch": 2.6822015882377146, + "grad_norm": 0.28235609038119763, + "learning_rate": 4.050664025164684e-06, + "loss": 0.4266, + "step": 16295 + }, + { + "epoch": 2.682366172456831, + "grad_norm": 0.34937662996411334, + "learning_rate": 4.050196313136322e-06, + "loss": 0.4204, + "step": 16296 + }, + { + "epoch": 2.6825307566759475, + "grad_norm": 0.29543097261117546, + "learning_rate": 4.049728606016039e-06, + "loss": 0.4281, + "step": 16297 + }, + { + "epoch": 2.682695340895064, + "grad_norm": 0.2755549630399525, + "learning_rate": 4.049260903808936e-06, + "loss": 0.4285, + "step": 16298 + }, + { + "epoch": 2.6828599251141805, + "grad_norm": 0.3197657919433483, + "learning_rate": 4.048793206520118e-06, + "loss": 0.4312, + "step": 16299 + }, + { + "epoch": 2.683024509333297, + "grad_norm": 0.41965219409979476, + "learning_rate": 4.048325514154688e-06, + "loss": 0.4388, + "step": 16300 + }, + { + "epoch": 2.6831890935524134, + "grad_norm": 0.27678180863462526, + "learning_rate": 4.047857826717751e-06, + "loss": 0.4242, + "step": 16301 + }, + { + "epoch": 2.68335367777153, + "grad_norm": 0.347595027625458, + "learning_rate": 4.047390144214409e-06, + "loss": 0.4457, + "step": 16302 + }, + { + "epoch": 2.6835182619906464, + "grad_norm": 0.27207774173030125, + "learning_rate": 4.046922466649764e-06, + "loss": 0.4235, + "step": 16303 + }, + { + "epoch": 2.683682846209763, + "grad_norm": 0.33812820655438, + "learning_rate": 4.0464547940289205e-06, + "loss": 0.4282, + "step": 16304 + }, + { + "epoch": 2.6838474304288793, + "grad_norm": 0.31855684501272835, + "learning_rate": 4.045987126356982e-06, + "loss": 0.4508, + "step": 16305 + }, + { + "epoch": 2.6840120146479958, + "grad_norm": 0.2655956406794349, + "learning_rate": 4.045519463639052e-06, + "loss": 0.4403, + "step": 16306 + }, + { + "epoch": 2.684176598867112, + "grad_norm": 0.3066415099568566, + "learning_rate": 4.045051805880229e-06, + "loss": 0.4123, + "step": 16307 + }, + { + "epoch": 2.6843411830862283, + "grad_norm": 0.4262155359903081, + "learning_rate": 4.044584153085623e-06, + "loss": 0.4483, + "step": 16308 + }, + { + "epoch": 2.6845057673053447, + "grad_norm": 0.30152830675260467, + "learning_rate": 4.044116505260329e-06, + "loss": 0.4397, + "step": 16309 + }, + { + "epoch": 2.684670351524461, + "grad_norm": 0.3478555543350279, + "learning_rate": 4.043648862409457e-06, + "loss": 0.4255, + "step": 16310 + }, + { + "epoch": 2.6848349357435777, + "grad_norm": 0.4643246693630661, + "learning_rate": 4.043181224538104e-06, + "loss": 0.4371, + "step": 16311 + }, + { + "epoch": 2.684999519962694, + "grad_norm": 0.2839497220739598, + "learning_rate": 4.042713591651377e-06, + "loss": 0.4233, + "step": 16312 + }, + { + "epoch": 2.6851641041818106, + "grad_norm": 0.2903961775110076, + "learning_rate": 4.042245963754376e-06, + "loss": 0.4377, + "step": 16313 + }, + { + "epoch": 2.685328688400927, + "grad_norm": 0.2753754545659661, + "learning_rate": 4.041778340852204e-06, + "loss": 0.4058, + "step": 16314 + }, + { + "epoch": 2.6854932726200436, + "grad_norm": 0.3303682132491632, + "learning_rate": 4.041310722949964e-06, + "loss": 0.4355, + "step": 16315 + }, + { + "epoch": 2.68565785683916, + "grad_norm": 0.37255543523726975, + "learning_rate": 4.040843110052758e-06, + "loss": 0.4319, + "step": 16316 + }, + { + "epoch": 2.6858224410582765, + "grad_norm": 0.3054564467132957, + "learning_rate": 4.04037550216569e-06, + "loss": 0.4291, + "step": 16317 + }, + { + "epoch": 2.685987025277393, + "grad_norm": 0.28915295925285905, + "learning_rate": 4.0399078992938595e-06, + "loss": 0.463, + "step": 16318 + }, + { + "epoch": 2.6861516094965094, + "grad_norm": 0.4022542124139846, + "learning_rate": 4.0394403014423725e-06, + "loss": 0.4405, + "step": 16319 + }, + { + "epoch": 2.686316193715626, + "grad_norm": 0.4806554260121287, + "learning_rate": 4.038972708616328e-06, + "loss": 0.4386, + "step": 16320 + }, + { + "epoch": 2.6864807779347424, + "grad_norm": 0.4025570728681745, + "learning_rate": 4.038505120820829e-06, + "loss": 0.4217, + "step": 16321 + }, + { + "epoch": 2.686645362153859, + "grad_norm": 0.33411015406408345, + "learning_rate": 4.0380375380609785e-06, + "loss": 0.4164, + "step": 16322 + }, + { + "epoch": 2.6868099463729753, + "grad_norm": 0.38337978563252073, + "learning_rate": 4.037569960341877e-06, + "loss": 0.442, + "step": 16323 + }, + { + "epoch": 2.686974530592092, + "grad_norm": 0.3286192345753033, + "learning_rate": 4.0371023876686285e-06, + "loss": 0.4229, + "step": 16324 + }, + { + "epoch": 2.6871391148112083, + "grad_norm": 0.2887883452897136, + "learning_rate": 4.036634820046333e-06, + "loss": 0.429, + "step": 16325 + }, + { + "epoch": 2.6873036990303247, + "grad_norm": 0.6183909761308701, + "learning_rate": 4.036167257480095e-06, + "loss": 0.4306, + "step": 16326 + }, + { + "epoch": 2.687468283249441, + "grad_norm": 0.294742555568296, + "learning_rate": 4.0356996999750146e-06, + "loss": 0.4292, + "step": 16327 + }, + { + "epoch": 2.6876328674685577, + "grad_norm": 0.32687745293903564, + "learning_rate": 4.035232147536191e-06, + "loss": 0.421, + "step": 16328 + }, + { + "epoch": 2.687797451687674, + "grad_norm": 0.3037644479826378, + "learning_rate": 4.034764600168733e-06, + "loss": 0.4258, + "step": 16329 + }, + { + "epoch": 2.6879620359067906, + "grad_norm": 0.30225917798625956, + "learning_rate": 4.034297057877735e-06, + "loss": 0.4382, + "step": 16330 + }, + { + "epoch": 2.688126620125907, + "grad_norm": 0.38987761613900895, + "learning_rate": 4.033829520668302e-06, + "loss": 0.4328, + "step": 16331 + }, + { + "epoch": 2.688291204345023, + "grad_norm": 0.3210169110933282, + "learning_rate": 4.033361988545535e-06, + "loss": 0.4412, + "step": 16332 + }, + { + "epoch": 2.6884557885641396, + "grad_norm": 0.31864691654286775, + "learning_rate": 4.032894461514537e-06, + "loss": 0.4282, + "step": 16333 + }, + { + "epoch": 2.688620372783256, + "grad_norm": 0.4825974822411688, + "learning_rate": 4.0324269395804074e-06, + "loss": 0.4326, + "step": 16334 + }, + { + "epoch": 2.6887849570023725, + "grad_norm": 0.32769347237972846, + "learning_rate": 4.031959422748249e-06, + "loss": 0.4267, + "step": 16335 + }, + { + "epoch": 2.688949541221489, + "grad_norm": 0.38048815364433103, + "learning_rate": 4.031491911023163e-06, + "loss": 0.4389, + "step": 16336 + }, + { + "epoch": 2.6891141254406055, + "grad_norm": 1.0058854085830922, + "learning_rate": 4.031024404410248e-06, + "loss": 0.4429, + "step": 16337 + }, + { + "epoch": 2.689278709659722, + "grad_norm": 0.3754249272517528, + "learning_rate": 4.03055690291461e-06, + "loss": 0.4199, + "step": 16338 + }, + { + "epoch": 2.6894432938788384, + "grad_norm": 0.2947882584863735, + "learning_rate": 4.030089406541345e-06, + "loss": 0.4281, + "step": 16339 + }, + { + "epoch": 2.689607878097955, + "grad_norm": 0.3096317789054828, + "learning_rate": 4.029621915295561e-06, + "loss": 0.4298, + "step": 16340 + }, + { + "epoch": 2.6897724623170713, + "grad_norm": 0.35530437347375415, + "learning_rate": 4.029154429182352e-06, + "loss": 0.4357, + "step": 16341 + }, + { + "epoch": 2.689937046536188, + "grad_norm": 0.3012366191872031, + "learning_rate": 4.0286869482068215e-06, + "loss": 0.4405, + "step": 16342 + }, + { + "epoch": 2.6901016307553043, + "grad_norm": 0.31298119816673015, + "learning_rate": 4.028219472374073e-06, + "loss": 0.4375, + "step": 16343 + }, + { + "epoch": 2.6902662149744208, + "grad_norm": 0.32781224232259865, + "learning_rate": 4.027752001689203e-06, + "loss": 0.4406, + "step": 16344 + }, + { + "epoch": 2.6904307991935372, + "grad_norm": 0.414384595744943, + "learning_rate": 4.027284536157316e-06, + "loss": 0.4224, + "step": 16345 + }, + { + "epoch": 2.6905953834126537, + "grad_norm": 0.36499776322117494, + "learning_rate": 4.026817075783511e-06, + "loss": 0.4376, + "step": 16346 + }, + { + "epoch": 2.69075996763177, + "grad_norm": 0.32248926422427887, + "learning_rate": 4.02634962057289e-06, + "loss": 0.4432, + "step": 16347 + }, + { + "epoch": 2.6909245518508866, + "grad_norm": 0.338708489474019, + "learning_rate": 4.025882170530552e-06, + "loss": 0.459, + "step": 16348 + }, + { + "epoch": 2.691089136070003, + "grad_norm": 0.294052606229598, + "learning_rate": 4.025414725661601e-06, + "loss": 0.4451, + "step": 16349 + }, + { + "epoch": 2.6912537202891196, + "grad_norm": 0.2780252582537487, + "learning_rate": 4.024947285971133e-06, + "loss": 0.4453, + "step": 16350 + }, + { + "epoch": 2.691418304508236, + "grad_norm": 0.33408514253873084, + "learning_rate": 4.02447985146425e-06, + "loss": 0.4298, + "step": 16351 + }, + { + "epoch": 2.6915828887273525, + "grad_norm": 0.3270836296226998, + "learning_rate": 4.024012422146054e-06, + "loss": 0.4399, + "step": 16352 + }, + { + "epoch": 2.691747472946469, + "grad_norm": 0.36075527590040307, + "learning_rate": 4.023544998021644e-06, + "loss": 0.4183, + "step": 16353 + }, + { + "epoch": 2.6919120571655855, + "grad_norm": 0.3363764146010592, + "learning_rate": 4.023077579096121e-06, + "loss": 0.4425, + "step": 16354 + }, + { + "epoch": 2.692076641384702, + "grad_norm": 0.31465186885890206, + "learning_rate": 4.022610165374585e-06, + "loss": 0.4223, + "step": 16355 + }, + { + "epoch": 2.6922412256038184, + "grad_norm": 0.35284763955478143, + "learning_rate": 4.022142756862137e-06, + "loss": 0.4383, + "step": 16356 + }, + { + "epoch": 2.692405809822935, + "grad_norm": 0.3134771967967619, + "learning_rate": 4.021675353563877e-06, + "loss": 0.44, + "step": 16357 + }, + { + "epoch": 2.6925703940420513, + "grad_norm": 0.3802097153284701, + "learning_rate": 4.0212079554849025e-06, + "loss": 0.4319, + "step": 16358 + }, + { + "epoch": 2.692734978261168, + "grad_norm": 0.34680413997601384, + "learning_rate": 4.020740562630319e-06, + "loss": 0.4456, + "step": 16359 + }, + { + "epoch": 2.6928995624802843, + "grad_norm": 0.4307466925610047, + "learning_rate": 4.02027317500522e-06, + "loss": 0.4387, + "step": 16360 + }, + { + "epoch": 2.6930641466994008, + "grad_norm": 0.3429686840682955, + "learning_rate": 4.01980579261471e-06, + "loss": 0.4355, + "step": 16361 + }, + { + "epoch": 2.693228730918517, + "grad_norm": 0.2995725153651655, + "learning_rate": 4.019338415463888e-06, + "loss": 0.4388, + "step": 16362 + }, + { + "epoch": 2.6933933151376337, + "grad_norm": 0.5330069986000292, + "learning_rate": 4.018871043557852e-06, + "loss": 0.4396, + "step": 16363 + }, + { + "epoch": 2.69355789935675, + "grad_norm": 0.3288145812661788, + "learning_rate": 4.018403676901704e-06, + "loss": 0.4132, + "step": 16364 + }, + { + "epoch": 2.6937224835758666, + "grad_norm": 0.29583734843271503, + "learning_rate": 4.017936315500543e-06, + "loss": 0.451, + "step": 16365 + }, + { + "epoch": 2.693887067794983, + "grad_norm": 0.42497783015345564, + "learning_rate": 4.017468959359469e-06, + "loss": 0.4285, + "step": 16366 + }, + { + "epoch": 2.6940516520140996, + "grad_norm": 0.2844809828785978, + "learning_rate": 4.017001608483579e-06, + "loss": 0.4358, + "step": 16367 + }, + { + "epoch": 2.694216236233216, + "grad_norm": 0.3626814403818623, + "learning_rate": 4.016534262877978e-06, + "loss": 0.4234, + "step": 16368 + }, + { + "epoch": 2.6943808204523325, + "grad_norm": 0.28919512072085424, + "learning_rate": 4.016066922547759e-06, + "loss": 0.4327, + "step": 16369 + }, + { + "epoch": 2.694545404671449, + "grad_norm": 0.32003358765545853, + "learning_rate": 4.015599587498026e-06, + "loss": 0.4388, + "step": 16370 + }, + { + "epoch": 2.6947099888905655, + "grad_norm": 0.36882718405394466, + "learning_rate": 4.015132257733878e-06, + "loss": 0.4623, + "step": 16371 + }, + { + "epoch": 2.694874573109682, + "grad_norm": 0.27980427256449153, + "learning_rate": 4.014664933260411e-06, + "loss": 0.4161, + "step": 16372 + }, + { + "epoch": 2.6950391573287984, + "grad_norm": 0.41433725457500464, + "learning_rate": 4.0141976140827285e-06, + "loss": 0.4193, + "step": 16373 + }, + { + "epoch": 2.6952037415479144, + "grad_norm": 0.38174671828343304, + "learning_rate": 4.013730300205927e-06, + "loss": 0.4233, + "step": 16374 + }, + { + "epoch": 2.695368325767031, + "grad_norm": 0.37448167649323577, + "learning_rate": 4.0132629916351064e-06, + "loss": 0.4348, + "step": 16375 + }, + { + "epoch": 2.6955329099861474, + "grad_norm": 0.3227325820515218, + "learning_rate": 4.012795688375366e-06, + "loss": 0.4289, + "step": 16376 + }, + { + "epoch": 2.695697494205264, + "grad_norm": 0.34668685475852096, + "learning_rate": 4.012328390431804e-06, + "loss": 0.4328, + "step": 16377 + }, + { + "epoch": 2.6958620784243803, + "grad_norm": 0.27383474398604524, + "learning_rate": 4.0118610978095216e-06, + "loss": 0.4297, + "step": 16378 + }, + { + "epoch": 2.6960266626434968, + "grad_norm": 0.42749690459672113, + "learning_rate": 4.011393810513614e-06, + "loss": 0.4651, + "step": 16379 + }, + { + "epoch": 2.6961912468626132, + "grad_norm": 0.3957381987373052, + "learning_rate": 4.010926528549184e-06, + "loss": 0.4396, + "step": 16380 + }, + { + "epoch": 2.6963558310817297, + "grad_norm": 0.4465267398360211, + "learning_rate": 4.010459251921327e-06, + "loss": 0.4428, + "step": 16381 + }, + { + "epoch": 2.696520415300846, + "grad_norm": 0.33690374814706475, + "learning_rate": 4.009991980635144e-06, + "loss": 0.4245, + "step": 16382 + }, + { + "epoch": 2.6966849995199627, + "grad_norm": 0.264495976809934, + "learning_rate": 4.0095247146957325e-06, + "loss": 0.418, + "step": 16383 + }, + { + "epoch": 2.696849583739079, + "grad_norm": 0.28920170373751336, + "learning_rate": 4.009057454108191e-06, + "loss": 0.4269, + "step": 16384 + }, + { + "epoch": 2.6970141679581956, + "grad_norm": 1.0817155635714626, + "learning_rate": 4.00859019887762e-06, + "loss": 0.4385, + "step": 16385 + }, + { + "epoch": 2.697178752177312, + "grad_norm": 0.3838047671684187, + "learning_rate": 4.008122949009116e-06, + "loss": 0.4376, + "step": 16386 + }, + { + "epoch": 2.6973433363964285, + "grad_norm": 0.2556844042050589, + "learning_rate": 4.007655704507779e-06, + "loss": 0.419, + "step": 16387 + }, + { + "epoch": 2.697507920615545, + "grad_norm": 0.27940657185380346, + "learning_rate": 4.007188465378704e-06, + "loss": 0.4193, + "step": 16388 + }, + { + "epoch": 2.6976725048346615, + "grad_norm": 0.3339629542859908, + "learning_rate": 4.006721231626995e-06, + "loss": 0.4421, + "step": 16389 + }, + { + "epoch": 2.697837089053778, + "grad_norm": 0.3339466019931385, + "learning_rate": 4.006254003257744e-06, + "loss": 0.4305, + "step": 16390 + }, + { + "epoch": 2.6980016732728944, + "grad_norm": 0.28327025098936864, + "learning_rate": 4.0057867802760545e-06, + "loss": 0.4298, + "step": 16391 + }, + { + "epoch": 2.698166257492011, + "grad_norm": 0.33148361090437845, + "learning_rate": 4.005319562687021e-06, + "loss": 0.4252, + "step": 16392 + }, + { + "epoch": 2.6983308417111274, + "grad_norm": 0.32939786900837537, + "learning_rate": 4.004852350495744e-06, + "loss": 0.4211, + "step": 16393 + }, + { + "epoch": 2.698495425930244, + "grad_norm": 0.3725020408204437, + "learning_rate": 4.004385143707321e-06, + "loss": 0.4585, + "step": 16394 + }, + { + "epoch": 2.6986600101493603, + "grad_norm": 0.2853854565429146, + "learning_rate": 4.003917942326848e-06, + "loss": 0.4301, + "step": 16395 + }, + { + "epoch": 2.6988245943684768, + "grad_norm": 0.3735961596222982, + "learning_rate": 4.0034507463594254e-06, + "loss": 0.4314, + "step": 16396 + }, + { + "epoch": 2.6989891785875932, + "grad_norm": 0.3555759193710977, + "learning_rate": 4.002983555810149e-06, + "loss": 0.4408, + "step": 16397 + }, + { + "epoch": 2.6991537628067097, + "grad_norm": 0.3541206601416742, + "learning_rate": 4.00251637068412e-06, + "loss": 0.4236, + "step": 16398 + }, + { + "epoch": 2.6993183470258257, + "grad_norm": 0.2697075924755978, + "learning_rate": 4.00204919098643e-06, + "loss": 0.4272, + "step": 16399 + }, + { + "epoch": 2.699482931244942, + "grad_norm": 0.2698943695657363, + "learning_rate": 4.0015820167221844e-06, + "loss": 0.4351, + "step": 16400 + }, + { + "epoch": 2.6996475154640587, + "grad_norm": 0.3023080111451641, + "learning_rate": 4.001114847896476e-06, + "loss": 0.4313, + "step": 16401 + }, + { + "epoch": 2.699812099683175, + "grad_norm": 0.33379035571948806, + "learning_rate": 4.0006476845144015e-06, + "loss": 0.4331, + "step": 16402 + }, + { + "epoch": 2.6999766839022916, + "grad_norm": 0.4000340059845679, + "learning_rate": 4.000180526581062e-06, + "loss": 0.4484, + "step": 16403 + }, + { + "epoch": 2.700141268121408, + "grad_norm": 0.3095464294829794, + "learning_rate": 3.999713374101551e-06, + "loss": 0.4427, + "step": 16404 + }, + { + "epoch": 2.7003058523405246, + "grad_norm": 0.4313563968877461, + "learning_rate": 3.999246227080969e-06, + "loss": 0.4514, + "step": 16405 + }, + { + "epoch": 2.700470436559641, + "grad_norm": 0.3398078212298633, + "learning_rate": 3.998779085524413e-06, + "loss": 0.43, + "step": 16406 + }, + { + "epoch": 2.7006350207787575, + "grad_norm": 0.3391341204498403, + "learning_rate": 3.99831194943698e-06, + "loss": 0.4186, + "step": 16407 + }, + { + "epoch": 2.700799604997874, + "grad_norm": 0.39121165508808176, + "learning_rate": 3.997844818823767e-06, + "loss": 0.418, + "step": 16408 + }, + { + "epoch": 2.7009641892169904, + "grad_norm": 0.2978515677764767, + "learning_rate": 3.997377693689868e-06, + "loss": 0.4307, + "step": 16409 + }, + { + "epoch": 2.701128773436107, + "grad_norm": 0.269733507262398, + "learning_rate": 3.996910574040387e-06, + "loss": 0.4378, + "step": 16410 + }, + { + "epoch": 2.7012933576552234, + "grad_norm": 0.293333395883526, + "learning_rate": 3.996443459880414e-06, + "loss": 0.4246, + "step": 16411 + }, + { + "epoch": 2.70145794187434, + "grad_norm": 0.33065446535807685, + "learning_rate": 3.9959763512150505e-06, + "loss": 0.4373, + "step": 16412 + }, + { + "epoch": 2.7016225260934563, + "grad_norm": 0.4420974144520157, + "learning_rate": 3.99550924804939e-06, + "loss": 0.4277, + "step": 16413 + }, + { + "epoch": 2.701787110312573, + "grad_norm": 0.35927542344814284, + "learning_rate": 3.995042150388534e-06, + "loss": 0.4208, + "step": 16414 + }, + { + "epoch": 2.7019516945316893, + "grad_norm": 0.32015570904244817, + "learning_rate": 3.994575058237575e-06, + "loss": 0.4468, + "step": 16415 + }, + { + "epoch": 2.7021162787508057, + "grad_norm": 0.34575732730265396, + "learning_rate": 3.994107971601611e-06, + "loss": 0.4225, + "step": 16416 + }, + { + "epoch": 2.702280862969922, + "grad_norm": 0.30017266073248694, + "learning_rate": 3.9936408904857404e-06, + "loss": 0.4345, + "step": 16417 + }, + { + "epoch": 2.7024454471890387, + "grad_norm": 0.2841028088209639, + "learning_rate": 3.993173814895056e-06, + "loss": 0.4281, + "step": 16418 + }, + { + "epoch": 2.702610031408155, + "grad_norm": 0.3727144194184109, + "learning_rate": 3.9927067448346584e-06, + "loss": 0.4396, + "step": 16419 + }, + { + "epoch": 2.7027746156272716, + "grad_norm": 1.1118286108688196, + "learning_rate": 3.992239680309641e-06, + "loss": 0.4223, + "step": 16420 + }, + { + "epoch": 2.702939199846388, + "grad_norm": 0.33739735935023996, + "learning_rate": 3.991772621325103e-06, + "loss": 0.4294, + "step": 16421 + }, + { + "epoch": 2.7031037840655046, + "grad_norm": 0.3207517013717223, + "learning_rate": 3.991305567886139e-06, + "loss": 0.4354, + "step": 16422 + }, + { + "epoch": 2.703268368284621, + "grad_norm": 0.34552698173846835, + "learning_rate": 3.990838519997845e-06, + "loss": 0.4524, + "step": 16423 + }, + { + "epoch": 2.7034329525037375, + "grad_norm": 0.29631186943187254, + "learning_rate": 3.990371477665319e-06, + "loss": 0.432, + "step": 16424 + }, + { + "epoch": 2.703597536722854, + "grad_norm": 0.3395910937313192, + "learning_rate": 3.989904440893654e-06, + "loss": 0.43, + "step": 16425 + }, + { + "epoch": 2.7037621209419704, + "grad_norm": 0.30236926557935484, + "learning_rate": 3.98943740968795e-06, + "loss": 0.4171, + "step": 16426 + }, + { + "epoch": 2.703926705161087, + "grad_norm": 0.45268445138775065, + "learning_rate": 3.9889703840533e-06, + "loss": 0.451, + "step": 16427 + }, + { + "epoch": 2.7040912893802034, + "grad_norm": 0.2825909781347472, + "learning_rate": 3.9885033639948025e-06, + "loss": 0.4377, + "step": 16428 + }, + { + "epoch": 2.70425587359932, + "grad_norm": 0.6358393266769788, + "learning_rate": 3.988036349517551e-06, + "loss": 0.4415, + "step": 16429 + }, + { + "epoch": 2.7044204578184363, + "grad_norm": 0.3386341951236184, + "learning_rate": 3.987569340626644e-06, + "loss": 0.419, + "step": 16430 + }, + { + "epoch": 2.704585042037553, + "grad_norm": 0.43150716763894154, + "learning_rate": 3.987102337327176e-06, + "loss": 0.4085, + "step": 16431 + }, + { + "epoch": 2.7047496262566693, + "grad_norm": 0.33112369492597604, + "learning_rate": 3.986635339624241e-06, + "loss": 0.441, + "step": 16432 + }, + { + "epoch": 2.7049142104757857, + "grad_norm": 0.7055101510823322, + "learning_rate": 3.986168347522937e-06, + "loss": 0.4415, + "step": 16433 + }, + { + "epoch": 2.705078794694902, + "grad_norm": 0.3058808341878057, + "learning_rate": 3.985701361028358e-06, + "loss": 0.419, + "step": 16434 + }, + { + "epoch": 2.7052433789140187, + "grad_norm": 0.5188963271133904, + "learning_rate": 3.985234380145601e-06, + "loss": 0.4249, + "step": 16435 + }, + { + "epoch": 2.705407963133135, + "grad_norm": 0.36464984739773737, + "learning_rate": 3.984767404879761e-06, + "loss": 0.4299, + "step": 16436 + }, + { + "epoch": 2.7055725473522516, + "grad_norm": 0.3289296636196006, + "learning_rate": 3.9843004352359335e-06, + "loss": 0.4267, + "step": 16437 + }, + { + "epoch": 2.705737131571368, + "grad_norm": 0.33757350971555905, + "learning_rate": 3.983833471219215e-06, + "loss": 0.4125, + "step": 16438 + }, + { + "epoch": 2.7059017157904846, + "grad_norm": 0.31527342354316334, + "learning_rate": 3.983366512834697e-06, + "loss": 0.4364, + "step": 16439 + }, + { + "epoch": 2.706066300009601, + "grad_norm": 0.3019138030598178, + "learning_rate": 3.982899560087481e-06, + "loss": 0.4311, + "step": 16440 + }, + { + "epoch": 2.706230884228717, + "grad_norm": 0.31228858196188747, + "learning_rate": 3.982432612982656e-06, + "loss": 0.4314, + "step": 16441 + }, + { + "epoch": 2.7063954684478335, + "grad_norm": 0.3658219525437423, + "learning_rate": 3.98196567152532e-06, + "loss": 0.4518, + "step": 16442 + }, + { + "epoch": 2.70656005266695, + "grad_norm": 0.24508146841699552, + "learning_rate": 3.9814987357205685e-06, + "loss": 0.4343, + "step": 16443 + }, + { + "epoch": 2.7067246368860665, + "grad_norm": 0.31452290874256633, + "learning_rate": 3.981031805573496e-06, + "loss": 0.4207, + "step": 16444 + }, + { + "epoch": 2.706889221105183, + "grad_norm": 0.2755412575819989, + "learning_rate": 3.980564881089197e-06, + "loss": 0.4472, + "step": 16445 + }, + { + "epoch": 2.7070538053242994, + "grad_norm": 0.30094494464038807, + "learning_rate": 3.980097962272766e-06, + "loss": 0.4148, + "step": 16446 + }, + { + "epoch": 2.707218389543416, + "grad_norm": 0.3007444972335924, + "learning_rate": 3.9796310491293e-06, + "loss": 0.4177, + "step": 16447 + }, + { + "epoch": 2.7073829737625323, + "grad_norm": 0.3387548685818209, + "learning_rate": 3.97916414166389e-06, + "loss": 0.4534, + "step": 16448 + }, + { + "epoch": 2.707547557981649, + "grad_norm": 0.26601239553772493, + "learning_rate": 3.978697239881636e-06, + "loss": 0.4238, + "step": 16449 + }, + { + "epoch": 2.7077121422007653, + "grad_norm": 0.4986743261050538, + "learning_rate": 3.978230343787627e-06, + "loss": 0.4303, + "step": 16450 + }, + { + "epoch": 2.7078767264198818, + "grad_norm": 0.3267574436279148, + "learning_rate": 3.977763453386963e-06, + "loss": 0.4246, + "step": 16451 + }, + { + "epoch": 2.7080413106389982, + "grad_norm": 0.32715433919622156, + "learning_rate": 3.977296568684735e-06, + "loss": 0.4469, + "step": 16452 + }, + { + "epoch": 2.7082058948581147, + "grad_norm": 0.326640328060673, + "learning_rate": 3.976829689686037e-06, + "loss": 0.4384, + "step": 16453 + }, + { + "epoch": 2.708370479077231, + "grad_norm": 0.2731441484587725, + "learning_rate": 3.9763628163959646e-06, + "loss": 0.4244, + "step": 16454 + }, + { + "epoch": 2.7085350632963476, + "grad_norm": 0.29145356682928636, + "learning_rate": 3.975895948819612e-06, + "loss": 0.4303, + "step": 16455 + }, + { + "epoch": 2.708699647515464, + "grad_norm": 0.3826056619240171, + "learning_rate": 3.975429086962075e-06, + "loss": 0.4366, + "step": 16456 + }, + { + "epoch": 2.7088642317345806, + "grad_norm": 0.2903672346142324, + "learning_rate": 3.974962230828445e-06, + "loss": 0.43, + "step": 16457 + }, + { + "epoch": 2.709028815953697, + "grad_norm": 0.31055339133125603, + "learning_rate": 3.9744953804238184e-06, + "loss": 0.4243, + "step": 16458 + }, + { + "epoch": 2.7091934001728135, + "grad_norm": 0.30907072386314455, + "learning_rate": 3.97402853575329e-06, + "loss": 0.4558, + "step": 16459 + }, + { + "epoch": 2.70935798439193, + "grad_norm": 0.39217728225346443, + "learning_rate": 3.973561696821949e-06, + "loss": 0.4383, + "step": 16460 + }, + { + "epoch": 2.7095225686110465, + "grad_norm": 0.2949136541812381, + "learning_rate": 3.973094863634896e-06, + "loss": 0.4406, + "step": 16461 + }, + { + "epoch": 2.709687152830163, + "grad_norm": 0.2874774588586657, + "learning_rate": 3.972628036197219e-06, + "loss": 0.433, + "step": 16462 + }, + { + "epoch": 2.7098517370492794, + "grad_norm": 0.37726555702971565, + "learning_rate": 3.972161214514016e-06, + "loss": 0.4182, + "step": 16463 + }, + { + "epoch": 2.710016321268396, + "grad_norm": 0.3523421698654293, + "learning_rate": 3.971694398590378e-06, + "loss": 0.4313, + "step": 16464 + }, + { + "epoch": 2.7101809054875123, + "grad_norm": 0.35601099773408845, + "learning_rate": 3.9712275884314006e-06, + "loss": 0.4233, + "step": 16465 + }, + { + "epoch": 2.7103454897066284, + "grad_norm": 4.607132561798621, + "learning_rate": 3.9707607840421765e-06, + "loss": 0.4371, + "step": 16466 + }, + { + "epoch": 2.710510073925745, + "grad_norm": 0.32006363388751585, + "learning_rate": 3.9702939854277995e-06, + "loss": 0.4252, + "step": 16467 + }, + { + "epoch": 2.7106746581448613, + "grad_norm": 0.3703664092191253, + "learning_rate": 3.969827192593364e-06, + "loss": 0.4229, + "step": 16468 + }, + { + "epoch": 2.7108392423639778, + "grad_norm": 0.3535642572338403, + "learning_rate": 3.96936040554396e-06, + "loss": 0.4352, + "step": 16469 + }, + { + "epoch": 2.7110038265830942, + "grad_norm": 0.34953714902004956, + "learning_rate": 3.968893624284687e-06, + "loss": 0.4501, + "step": 16470 + }, + { + "epoch": 2.7111684108022107, + "grad_norm": 0.3261552636191001, + "learning_rate": 3.968426848820632e-06, + "loss": 0.4519, + "step": 16471 + }, + { + "epoch": 2.711332995021327, + "grad_norm": 0.4002886697589968, + "learning_rate": 3.967960079156893e-06, + "loss": 0.4319, + "step": 16472 + }, + { + "epoch": 2.7114975792404437, + "grad_norm": 0.43804861055115035, + "learning_rate": 3.96749331529856e-06, + "loss": 0.4346, + "step": 16473 + }, + { + "epoch": 2.71166216345956, + "grad_norm": 0.29537495233881805, + "learning_rate": 3.967026557250728e-06, + "loss": 0.4134, + "step": 16474 + }, + { + "epoch": 2.7118267476786766, + "grad_norm": 0.41638120350919683, + "learning_rate": 3.966559805018489e-06, + "loss": 0.3992, + "step": 16475 + }, + { + "epoch": 2.711991331897793, + "grad_norm": 0.3401077405746412, + "learning_rate": 3.966093058606936e-06, + "loss": 0.4158, + "step": 16476 + }, + { + "epoch": 2.7121559161169095, + "grad_norm": 0.49076143533819405, + "learning_rate": 3.9656263180211646e-06, + "loss": 0.4334, + "step": 16477 + }, + { + "epoch": 2.712320500336026, + "grad_norm": 0.35596719185727005, + "learning_rate": 3.965159583266263e-06, + "loss": 0.4405, + "step": 16478 + }, + { + "epoch": 2.7124850845551425, + "grad_norm": 0.2811140700107037, + "learning_rate": 3.9646928543473284e-06, + "loss": 0.4281, + "step": 16479 + }, + { + "epoch": 2.712649668774259, + "grad_norm": 0.36662815582599095, + "learning_rate": 3.964226131269451e-06, + "loss": 0.4426, + "step": 16480 + }, + { + "epoch": 2.7128142529933754, + "grad_norm": 0.3329050753007927, + "learning_rate": 3.963759414037725e-06, + "loss": 0.4567, + "step": 16481 + }, + { + "epoch": 2.712978837212492, + "grad_norm": 0.3967360921283001, + "learning_rate": 3.9632927026572415e-06, + "loss": 0.4515, + "step": 16482 + }, + { + "epoch": 2.7131434214316084, + "grad_norm": 0.4789891785118059, + "learning_rate": 3.962825997133094e-06, + "loss": 0.4336, + "step": 16483 + }, + { + "epoch": 2.713308005650725, + "grad_norm": 0.49288810478031103, + "learning_rate": 3.962359297470375e-06, + "loss": 0.433, + "step": 16484 + }, + { + "epoch": 2.7134725898698413, + "grad_norm": 0.374289866565762, + "learning_rate": 3.961892603674176e-06, + "loss": 0.419, + "step": 16485 + }, + { + "epoch": 2.7136371740889578, + "grad_norm": 0.339703176599694, + "learning_rate": 3.96142591574959e-06, + "loss": 0.4183, + "step": 16486 + }, + { + "epoch": 2.7138017583080742, + "grad_norm": 0.3705111284263445, + "learning_rate": 3.96095923370171e-06, + "loss": 0.4245, + "step": 16487 + }, + { + "epoch": 2.7139663425271907, + "grad_norm": 0.38620231082094414, + "learning_rate": 3.9604925575356285e-06, + "loss": 0.4077, + "step": 16488 + }, + { + "epoch": 2.714130926746307, + "grad_norm": 0.36650262469548656, + "learning_rate": 3.960025887256437e-06, + "loss": 0.4306, + "step": 16489 + }, + { + "epoch": 2.7142955109654237, + "grad_norm": 0.3655590240723453, + "learning_rate": 3.959559222869226e-06, + "loss": 0.4168, + "step": 16490 + }, + { + "epoch": 2.71446009518454, + "grad_norm": 0.5492373426547444, + "learning_rate": 3.959092564379091e-06, + "loss": 0.426, + "step": 16491 + }, + { + "epoch": 2.7146246794036566, + "grad_norm": 0.32705909412887524, + "learning_rate": 3.9586259117911205e-06, + "loss": 0.4088, + "step": 16492 + }, + { + "epoch": 2.714789263622773, + "grad_norm": 0.28925992733428907, + "learning_rate": 3.958159265110409e-06, + "loss": 0.4242, + "step": 16493 + }, + { + "epoch": 2.7149538478418895, + "grad_norm": 0.34237158057699313, + "learning_rate": 3.957692624342046e-06, + "loss": 0.4267, + "step": 16494 + }, + { + "epoch": 2.715118432061006, + "grad_norm": 0.36573089990123203, + "learning_rate": 3.957225989491125e-06, + "loss": 0.448, + "step": 16495 + }, + { + "epoch": 2.7152830162801225, + "grad_norm": 0.396777803147069, + "learning_rate": 3.956759360562738e-06, + "loss": 0.4295, + "step": 16496 + }, + { + "epoch": 2.715447600499239, + "grad_norm": 0.31646470571881147, + "learning_rate": 3.956292737561976e-06, + "loss": 0.4159, + "step": 16497 + }, + { + "epoch": 2.7156121847183554, + "grad_norm": 0.3603492209505748, + "learning_rate": 3.9558261204939305e-06, + "loss": 0.4007, + "step": 16498 + }, + { + "epoch": 2.715776768937472, + "grad_norm": 0.33269287449159657, + "learning_rate": 3.955359509363693e-06, + "loss": 0.4473, + "step": 16499 + }, + { + "epoch": 2.7159413531565884, + "grad_norm": 0.3743128484780302, + "learning_rate": 3.954892904176356e-06, + "loss": 0.4347, + "step": 16500 + }, + { + "epoch": 2.716105937375705, + "grad_norm": 0.31514688689823894, + "learning_rate": 3.954426304937008e-06, + "loss": 0.4237, + "step": 16501 + }, + { + "epoch": 2.7162705215948213, + "grad_norm": 1.7890578690514742, + "learning_rate": 3.953959711650745e-06, + "loss": 0.4232, + "step": 16502 + }, + { + "epoch": 2.7164351058139378, + "grad_norm": 0.30094650376522986, + "learning_rate": 3.953493124322655e-06, + "loss": 0.4214, + "step": 16503 + }, + { + "epoch": 2.7165996900330542, + "grad_norm": 0.3094817709581162, + "learning_rate": 3.953026542957829e-06, + "loss": 0.4319, + "step": 16504 + }, + { + "epoch": 2.7167642742521707, + "grad_norm": 0.34239085799206986, + "learning_rate": 3.9525599675613594e-06, + "loss": 0.4326, + "step": 16505 + }, + { + "epoch": 2.716928858471287, + "grad_norm": 0.3131738401297161, + "learning_rate": 3.952093398138336e-06, + "loss": 0.4406, + "step": 16506 + }, + { + "epoch": 2.717093442690403, + "grad_norm": 0.32423565735021526, + "learning_rate": 3.951626834693853e-06, + "loss": 0.4283, + "step": 16507 + }, + { + "epoch": 2.7172580269095197, + "grad_norm": 0.37070568272760424, + "learning_rate": 3.951160277232997e-06, + "loss": 0.4436, + "step": 16508 + }, + { + "epoch": 2.717422611128636, + "grad_norm": 0.2891131256851677, + "learning_rate": 3.950693725760863e-06, + "loss": 0.4357, + "step": 16509 + }, + { + "epoch": 2.7175871953477526, + "grad_norm": 0.28978087673201264, + "learning_rate": 3.950227180282538e-06, + "loss": 0.4372, + "step": 16510 + }, + { + "epoch": 2.717751779566869, + "grad_norm": 0.5355941888768845, + "learning_rate": 3.949760640803116e-06, + "loss": 0.4186, + "step": 16511 + }, + { + "epoch": 2.7179163637859856, + "grad_norm": 0.3526071429229731, + "learning_rate": 3.949294107327686e-06, + "loss": 0.4179, + "step": 16512 + }, + { + "epoch": 2.718080948005102, + "grad_norm": 0.5767881584871625, + "learning_rate": 3.948827579861338e-06, + "loss": 0.4172, + "step": 16513 + }, + { + "epoch": 2.7182455322242185, + "grad_norm": 0.7289027212814949, + "learning_rate": 3.948361058409165e-06, + "loss": 0.4297, + "step": 16514 + }, + { + "epoch": 2.718410116443335, + "grad_norm": 0.30909871256741495, + "learning_rate": 3.947894542976254e-06, + "loss": 0.436, + "step": 16515 + }, + { + "epoch": 2.7185747006624514, + "grad_norm": 0.3246466845888603, + "learning_rate": 3.947428033567699e-06, + "loss": 0.4501, + "step": 16516 + }, + { + "epoch": 2.718739284881568, + "grad_norm": 0.2665993992574491, + "learning_rate": 3.946961530188588e-06, + "loss": 0.4145, + "step": 16517 + }, + { + "epoch": 2.7189038691006844, + "grad_norm": 0.27713457592729585, + "learning_rate": 3.9464950328440124e-06, + "loss": 0.4148, + "step": 16518 + }, + { + "epoch": 2.719068453319801, + "grad_norm": 0.43622092377899335, + "learning_rate": 3.946028541539064e-06, + "loss": 0.4334, + "step": 16519 + }, + { + "epoch": 2.7192330375389173, + "grad_norm": 0.31195549073953505, + "learning_rate": 3.9455620562788275e-06, + "loss": 0.4259, + "step": 16520 + }, + { + "epoch": 2.719397621758034, + "grad_norm": 0.3053428947657628, + "learning_rate": 3.945095577068399e-06, + "loss": 0.4174, + "step": 16521 + }, + { + "epoch": 2.7195622059771503, + "grad_norm": 0.32858949379606484, + "learning_rate": 3.944629103912863e-06, + "loss": 0.44, + "step": 16522 + }, + { + "epoch": 2.7197267901962667, + "grad_norm": 0.3173134537251033, + "learning_rate": 3.944162636817316e-06, + "loss": 0.4361, + "step": 16523 + }, + { + "epoch": 2.719891374415383, + "grad_norm": 0.36359440024895223, + "learning_rate": 3.943696175786843e-06, + "loss": 0.4275, + "step": 16524 + }, + { + "epoch": 2.7200559586344997, + "grad_norm": 0.3136275602179564, + "learning_rate": 3.9432297208265365e-06, + "loss": 0.4161, + "step": 16525 + }, + { + "epoch": 2.720220542853616, + "grad_norm": 0.3560583689227809, + "learning_rate": 3.942763271941484e-06, + "loss": 0.4317, + "step": 16526 + }, + { + "epoch": 2.7203851270727326, + "grad_norm": 0.352877130841058, + "learning_rate": 3.942296829136776e-06, + "loss": 0.4453, + "step": 16527 + }, + { + "epoch": 2.720549711291849, + "grad_norm": 0.29121856925706796, + "learning_rate": 3.941830392417503e-06, + "loss": 0.4293, + "step": 16528 + }, + { + "epoch": 2.7207142955109656, + "grad_norm": 0.31819722571074455, + "learning_rate": 3.941363961788754e-06, + "loss": 0.4381, + "step": 16529 + }, + { + "epoch": 2.720878879730082, + "grad_norm": 0.27919041748825024, + "learning_rate": 3.940897537255619e-06, + "loss": 0.4351, + "step": 16530 + }, + { + "epoch": 2.7210434639491985, + "grad_norm": 0.3181876486582578, + "learning_rate": 3.940431118823185e-06, + "loss": 0.4341, + "step": 16531 + }, + { + "epoch": 2.7212080481683145, + "grad_norm": 0.4905460053945161, + "learning_rate": 3.939964706496546e-06, + "loss": 0.4485, + "step": 16532 + }, + { + "epoch": 2.721372632387431, + "grad_norm": 0.36060063288372324, + "learning_rate": 3.9394983002807875e-06, + "loss": 0.4614, + "step": 16533 + }, + { + "epoch": 2.7215372166065475, + "grad_norm": 0.2891799665529831, + "learning_rate": 3.939031900180999e-06, + "loss": 0.4301, + "step": 16534 + }, + { + "epoch": 2.721701800825664, + "grad_norm": 0.31653000551268723, + "learning_rate": 3.938565506202271e-06, + "loss": 0.4368, + "step": 16535 + }, + { + "epoch": 2.7218663850447804, + "grad_norm": 0.31446163576171354, + "learning_rate": 3.938099118349692e-06, + "loss": 0.4115, + "step": 16536 + }, + { + "epoch": 2.722030969263897, + "grad_norm": 0.4224125133332122, + "learning_rate": 3.9376327366283514e-06, + "loss": 0.4275, + "step": 16537 + }, + { + "epoch": 2.7221955534830133, + "grad_norm": 0.41937226295599567, + "learning_rate": 3.937166361043337e-06, + "loss": 0.4261, + "step": 16538 + }, + { + "epoch": 2.72236013770213, + "grad_norm": 0.32751387604214655, + "learning_rate": 3.93669999159974e-06, + "loss": 0.4322, + "step": 16539 + }, + { + "epoch": 2.7225247219212463, + "grad_norm": 0.31879617266630655, + "learning_rate": 3.936233628302649e-06, + "loss": 0.4266, + "step": 16540 + }, + { + "epoch": 2.7226893061403628, + "grad_norm": 0.2738330940521524, + "learning_rate": 3.935767271157148e-06, + "loss": 0.4184, + "step": 16541 + }, + { + "epoch": 2.7228538903594792, + "grad_norm": 0.33398045292163164, + "learning_rate": 3.935300920168334e-06, + "loss": 0.4391, + "step": 16542 + }, + { + "epoch": 2.7230184745785957, + "grad_norm": 0.35001300209930836, + "learning_rate": 3.934834575341287e-06, + "loss": 0.4128, + "step": 16543 + }, + { + "epoch": 2.723183058797712, + "grad_norm": 0.41861161950891773, + "learning_rate": 3.934368236681102e-06, + "loss": 0.4333, + "step": 16544 + }, + { + "epoch": 2.7233476430168286, + "grad_norm": 0.42298701569274116, + "learning_rate": 3.9339019041928625e-06, + "loss": 0.399, + "step": 16545 + }, + { + "epoch": 2.723512227235945, + "grad_norm": 0.30381499839029175, + "learning_rate": 3.9334355778816614e-06, + "loss": 0.4262, + "step": 16546 + }, + { + "epoch": 2.7236768114550616, + "grad_norm": 0.4276762118814055, + "learning_rate": 3.9329692577525854e-06, + "loss": 0.4185, + "step": 16547 + }, + { + "epoch": 2.723841395674178, + "grad_norm": 0.3489145625046093, + "learning_rate": 3.932502943810722e-06, + "loss": 0.4392, + "step": 16548 + }, + { + "epoch": 2.7240059798932945, + "grad_norm": 0.31725816333011786, + "learning_rate": 3.932036636061161e-06, + "loss": 0.4141, + "step": 16549 + }, + { + "epoch": 2.724170564112411, + "grad_norm": 0.6398241391454529, + "learning_rate": 3.931570334508987e-06, + "loss": 0.4138, + "step": 16550 + }, + { + "epoch": 2.7243351483315275, + "grad_norm": 0.4477747233220181, + "learning_rate": 3.931104039159293e-06, + "loss": 0.4398, + "step": 16551 + }, + { + "epoch": 2.724499732550644, + "grad_norm": 0.32522477304337494, + "learning_rate": 3.930637750017162e-06, + "loss": 0.4228, + "step": 16552 + }, + { + "epoch": 2.7246643167697604, + "grad_norm": 0.26621290472588405, + "learning_rate": 3.930171467087688e-06, + "loss": 0.4412, + "step": 16553 + }, + { + "epoch": 2.724828900988877, + "grad_norm": 0.31334048099794237, + "learning_rate": 3.929705190375953e-06, + "loss": 0.4332, + "step": 16554 + }, + { + "epoch": 2.7249934852079933, + "grad_norm": 0.4502108996851825, + "learning_rate": 3.929238919887049e-06, + "loss": 0.4309, + "step": 16555 + }, + { + "epoch": 2.72515806942711, + "grad_norm": 0.3080396753891864, + "learning_rate": 3.9287726556260615e-06, + "loss": 0.435, + "step": 16556 + }, + { + "epoch": 2.7253226536462263, + "grad_norm": 0.2974240979985078, + "learning_rate": 3.9283063975980785e-06, + "loss": 0.4384, + "step": 16557 + }, + { + "epoch": 2.7254872378653427, + "grad_norm": 0.3634519406751955, + "learning_rate": 3.927840145808188e-06, + "loss": 0.4346, + "step": 16558 + }, + { + "epoch": 2.725651822084459, + "grad_norm": 0.2961719414650674, + "learning_rate": 3.927373900261478e-06, + "loss": 0.4359, + "step": 16559 + }, + { + "epoch": 2.7258164063035757, + "grad_norm": 1.3632269864910096, + "learning_rate": 3.926907660963035e-06, + "loss": 0.4161, + "step": 16560 + }, + { + "epoch": 2.725980990522692, + "grad_norm": 0.48981983726866346, + "learning_rate": 3.926441427917946e-06, + "loss": 0.4361, + "step": 16561 + }, + { + "epoch": 2.7261455747418086, + "grad_norm": 0.3451352277184602, + "learning_rate": 3.925975201131302e-06, + "loss": 0.4428, + "step": 16562 + }, + { + "epoch": 2.726310158960925, + "grad_norm": 0.3423792475728588, + "learning_rate": 3.925508980608186e-06, + "loss": 0.4071, + "step": 16563 + }, + { + "epoch": 2.7264747431800416, + "grad_norm": 0.35651849165342947, + "learning_rate": 3.925042766353686e-06, + "loss": 0.4573, + "step": 16564 + }, + { + "epoch": 2.726639327399158, + "grad_norm": 0.33468366739995337, + "learning_rate": 3.9245765583728905e-06, + "loss": 0.4134, + "step": 16565 + }, + { + "epoch": 2.7268039116182745, + "grad_norm": 0.3264344724816696, + "learning_rate": 3.924110356670885e-06, + "loss": 0.4397, + "step": 16566 + }, + { + "epoch": 2.726968495837391, + "grad_norm": 0.2621226652285822, + "learning_rate": 3.923644161252759e-06, + "loss": 0.4345, + "step": 16567 + }, + { + "epoch": 2.7271330800565075, + "grad_norm": 0.38974860554227214, + "learning_rate": 3.923177972123597e-06, + "loss": 0.4474, + "step": 16568 + }, + { + "epoch": 2.727297664275624, + "grad_norm": 0.3402501223880213, + "learning_rate": 3.922711789288487e-06, + "loss": 0.4651, + "step": 16569 + }, + { + "epoch": 2.7274622484947404, + "grad_norm": 0.31851552409547534, + "learning_rate": 3.922245612752517e-06, + "loss": 0.432, + "step": 16570 + }, + { + "epoch": 2.727626832713857, + "grad_norm": 0.4068273023864668, + "learning_rate": 3.921779442520769e-06, + "loss": 0.4339, + "step": 16571 + }, + { + "epoch": 2.7277914169329733, + "grad_norm": 1.0664734260304998, + "learning_rate": 3.921313278598336e-06, + "loss": 0.4232, + "step": 16572 + }, + { + "epoch": 2.72795600115209, + "grad_norm": 0.3619878185276546, + "learning_rate": 3.920847120990299e-06, + "loss": 0.4427, + "step": 16573 + }, + { + "epoch": 2.728120585371206, + "grad_norm": 0.3263631633804949, + "learning_rate": 3.920380969701749e-06, + "loss": 0.4447, + "step": 16574 + }, + { + "epoch": 2.7282851695903223, + "grad_norm": 0.6487706367050764, + "learning_rate": 3.919914824737769e-06, + "loss": 0.4403, + "step": 16575 + }, + { + "epoch": 2.7284497538094388, + "grad_norm": 0.39630519759832494, + "learning_rate": 3.919448686103448e-06, + "loss": 0.4551, + "step": 16576 + }, + { + "epoch": 2.7286143380285552, + "grad_norm": 0.3550421170640398, + "learning_rate": 3.918982553803872e-06, + "loss": 0.4432, + "step": 16577 + }, + { + "epoch": 2.7287789222476717, + "grad_norm": 0.4224330935360739, + "learning_rate": 3.9185164278441245e-06, + "loss": 0.4201, + "step": 16578 + }, + { + "epoch": 2.728943506466788, + "grad_norm": 0.3213326665891005, + "learning_rate": 3.918050308229295e-06, + "loss": 0.4252, + "step": 16579 + }, + { + "epoch": 2.7291080906859047, + "grad_norm": 0.31491941534195894, + "learning_rate": 3.917584194964467e-06, + "loss": 0.4371, + "step": 16580 + }, + { + "epoch": 2.729272674905021, + "grad_norm": 0.35263803623632617, + "learning_rate": 3.917118088054731e-06, + "loss": 0.4346, + "step": 16581 + }, + { + "epoch": 2.7294372591241376, + "grad_norm": 0.33412122258213794, + "learning_rate": 3.916651987505166e-06, + "loss": 0.4307, + "step": 16582 + }, + { + "epoch": 2.729601843343254, + "grad_norm": 0.3795048520762357, + "learning_rate": 3.916185893320864e-06, + "loss": 0.4262, + "step": 16583 + }, + { + "epoch": 2.7297664275623705, + "grad_norm": 0.34055335812743365, + "learning_rate": 3.915719805506909e-06, + "loss": 0.4465, + "step": 16584 + }, + { + "epoch": 2.729931011781487, + "grad_norm": 0.30848973764445026, + "learning_rate": 3.915253724068384e-06, + "loss": 0.4478, + "step": 16585 + }, + { + "epoch": 2.7300955960006035, + "grad_norm": 0.3324198552438988, + "learning_rate": 3.9147876490103784e-06, + "loss": 0.4406, + "step": 16586 + }, + { + "epoch": 2.73026018021972, + "grad_norm": 0.2761581425577677, + "learning_rate": 3.914321580337975e-06, + "loss": 0.4362, + "step": 16587 + }, + { + "epoch": 2.7304247644388364, + "grad_norm": 0.2873771774415666, + "learning_rate": 3.913855518056263e-06, + "loss": 0.4433, + "step": 16588 + }, + { + "epoch": 2.730589348657953, + "grad_norm": 0.3365185935551332, + "learning_rate": 3.913389462170324e-06, + "loss": 0.4265, + "step": 16589 + }, + { + "epoch": 2.7307539328770694, + "grad_norm": 0.3202568834928568, + "learning_rate": 3.9129234126852455e-06, + "loss": 0.4205, + "step": 16590 + }, + { + "epoch": 2.730918517096186, + "grad_norm": 0.31466464812505096, + "learning_rate": 3.912457369606114e-06, + "loss": 0.4354, + "step": 16591 + }, + { + "epoch": 2.7310831013153023, + "grad_norm": 0.32197124336552835, + "learning_rate": 3.9119913329380105e-06, + "loss": 0.4313, + "step": 16592 + }, + { + "epoch": 2.7312476855344188, + "grad_norm": 0.3262108831936048, + "learning_rate": 3.911525302686025e-06, + "loss": 0.4475, + "step": 16593 + }, + { + "epoch": 2.7314122697535352, + "grad_norm": 0.3018341625636483, + "learning_rate": 3.911059278855239e-06, + "loss": 0.4259, + "step": 16594 + }, + { + "epoch": 2.7315768539726517, + "grad_norm": 0.5083638087103188, + "learning_rate": 3.9105932614507404e-06, + "loss": 0.4358, + "step": 16595 + }, + { + "epoch": 2.731741438191768, + "grad_norm": 0.39664415812454396, + "learning_rate": 3.910127250477611e-06, + "loss": 0.4399, + "step": 16596 + }, + { + "epoch": 2.7319060224108846, + "grad_norm": 0.30830712980996544, + "learning_rate": 3.90966124594094e-06, + "loss": 0.4512, + "step": 16597 + }, + { + "epoch": 2.732070606630001, + "grad_norm": 0.36449293175256364, + "learning_rate": 3.909195247845808e-06, + "loss": 0.4285, + "step": 16598 + }, + { + "epoch": 2.732235190849117, + "grad_norm": 0.2824279588786678, + "learning_rate": 3.9087292561973025e-06, + "loss": 0.4244, + "step": 16599 + }, + { + "epoch": 2.7323997750682336, + "grad_norm": 0.3732075192421565, + "learning_rate": 3.908263271000509e-06, + "loss": 0.4402, + "step": 16600 + }, + { + "epoch": 2.73256435928735, + "grad_norm": 0.3843866741677414, + "learning_rate": 3.907797292260507e-06, + "loss": 0.4059, + "step": 16601 + }, + { + "epoch": 2.7327289435064666, + "grad_norm": 0.5765114906730495, + "learning_rate": 3.907331319982388e-06, + "loss": 0.443, + "step": 16602 + }, + { + "epoch": 2.732893527725583, + "grad_norm": 0.3048172546078079, + "learning_rate": 3.9068653541712295e-06, + "loss": 0.4256, + "step": 16603 + }, + { + "epoch": 2.7330581119446995, + "grad_norm": 0.2729739798913893, + "learning_rate": 3.906399394832123e-06, + "loss": 0.4412, + "step": 16604 + }, + { + "epoch": 2.733222696163816, + "grad_norm": 0.28601395344284813, + "learning_rate": 3.905933441970147e-06, + "loss": 0.428, + "step": 16605 + }, + { + "epoch": 2.7333872803829324, + "grad_norm": 0.2899991559501144, + "learning_rate": 3.90546749559039e-06, + "loss": 0.4449, + "step": 16606 + }, + { + "epoch": 2.733551864602049, + "grad_norm": 0.383461257118734, + "learning_rate": 3.905001555697934e-06, + "loss": 0.4328, + "step": 16607 + }, + { + "epoch": 2.7337164488211654, + "grad_norm": 0.3344920575647727, + "learning_rate": 3.904535622297862e-06, + "loss": 0.4243, + "step": 16608 + }, + { + "epoch": 2.733881033040282, + "grad_norm": 0.311637742348854, + "learning_rate": 3.9040696953952615e-06, + "loss": 0.4287, + "step": 16609 + }, + { + "epoch": 2.7340456172593983, + "grad_norm": 0.39159452522404453, + "learning_rate": 3.9036037749952134e-06, + "loss": 0.4326, + "step": 16610 + }, + { + "epoch": 2.734210201478515, + "grad_norm": 0.3379876446766164, + "learning_rate": 3.903137861102804e-06, + "loss": 0.4384, + "step": 16611 + }, + { + "epoch": 2.7343747856976313, + "grad_norm": 0.32601922212061046, + "learning_rate": 3.902671953723115e-06, + "loss": 0.4263, + "step": 16612 + }, + { + "epoch": 2.7345393699167477, + "grad_norm": 0.3037710053756364, + "learning_rate": 3.902206052861233e-06, + "loss": 0.4291, + "step": 16613 + }, + { + "epoch": 2.734703954135864, + "grad_norm": 0.39634282364124573, + "learning_rate": 3.901740158522239e-06, + "loss": 0.4384, + "step": 16614 + }, + { + "epoch": 2.7348685383549807, + "grad_norm": 0.3083150422178533, + "learning_rate": 3.901274270711217e-06, + "loss": 0.4434, + "step": 16615 + }, + { + "epoch": 2.735033122574097, + "grad_norm": 0.2880021907018946, + "learning_rate": 3.900808389433251e-06, + "loss": 0.4219, + "step": 16616 + }, + { + "epoch": 2.7351977067932136, + "grad_norm": 0.28308455496725854, + "learning_rate": 3.900342514693425e-06, + "loss": 0.4293, + "step": 16617 + }, + { + "epoch": 2.73536229101233, + "grad_norm": 0.33136345258347305, + "learning_rate": 3.899876646496823e-06, + "loss": 0.42, + "step": 16618 + }, + { + "epoch": 2.7355268752314466, + "grad_norm": 0.30752082102442413, + "learning_rate": 3.899410784848526e-06, + "loss": 0.4239, + "step": 16619 + }, + { + "epoch": 2.735691459450563, + "grad_norm": 0.42132254252174034, + "learning_rate": 3.89894492975362e-06, + "loss": 0.4559, + "step": 16620 + }, + { + "epoch": 2.7358560436696795, + "grad_norm": 0.3928050979267358, + "learning_rate": 3.898479081217188e-06, + "loss": 0.4629, + "step": 16621 + }, + { + "epoch": 2.736020627888796, + "grad_norm": 0.4182456753045492, + "learning_rate": 3.89801323924431e-06, + "loss": 0.4231, + "step": 16622 + }, + { + "epoch": 2.7361852121079124, + "grad_norm": 0.401752609400978, + "learning_rate": 3.897547403840073e-06, + "loss": 0.4278, + "step": 16623 + }, + { + "epoch": 2.736349796327029, + "grad_norm": 0.3373797177237633, + "learning_rate": 3.897081575009557e-06, + "loss": 0.4372, + "step": 16624 + }, + { + "epoch": 2.7365143805461454, + "grad_norm": 0.31456143649250423, + "learning_rate": 3.896615752757847e-06, + "loss": 0.4234, + "step": 16625 + }, + { + "epoch": 2.736678964765262, + "grad_norm": 0.4505892755130963, + "learning_rate": 3.896149937090024e-06, + "loss": 0.4297, + "step": 16626 + }, + { + "epoch": 2.7368435489843783, + "grad_norm": 0.3310531708308449, + "learning_rate": 3.895684128011174e-06, + "loss": 0.4197, + "step": 16627 + }, + { + "epoch": 2.737008133203495, + "grad_norm": 0.32229938221905113, + "learning_rate": 3.895218325526376e-06, + "loss": 0.4275, + "step": 16628 + }, + { + "epoch": 2.7371727174226113, + "grad_norm": 0.3673752839728046, + "learning_rate": 3.894752529640714e-06, + "loss": 0.445, + "step": 16629 + }, + { + "epoch": 2.7373373016417277, + "grad_norm": 0.3024556838370898, + "learning_rate": 3.894286740359272e-06, + "loss": 0.4344, + "step": 16630 + }, + { + "epoch": 2.737501885860844, + "grad_norm": 0.355569626536372, + "learning_rate": 3.8938209576871305e-06, + "loss": 0.4331, + "step": 16631 + }, + { + "epoch": 2.7376664700799607, + "grad_norm": 0.3329281372530806, + "learning_rate": 3.893355181629374e-06, + "loss": 0.425, + "step": 16632 + }, + { + "epoch": 2.737831054299077, + "grad_norm": 0.2996473940409809, + "learning_rate": 3.892889412191082e-06, + "loss": 0.4515, + "step": 16633 + }, + { + "epoch": 2.7379956385181936, + "grad_norm": 0.31016466286381267, + "learning_rate": 3.8924236493773395e-06, + "loss": 0.431, + "step": 16634 + }, + { + "epoch": 2.73816022273731, + "grad_norm": 0.3464979108227558, + "learning_rate": 3.8919578931932275e-06, + "loss": 0.4303, + "step": 16635 + }, + { + "epoch": 2.7383248069564265, + "grad_norm": 0.31263854785111733, + "learning_rate": 3.891492143643829e-06, + "loss": 0.4384, + "step": 16636 + }, + { + "epoch": 2.738489391175543, + "grad_norm": 0.32162531400133154, + "learning_rate": 3.891026400734224e-06, + "loss": 0.4422, + "step": 16637 + }, + { + "epoch": 2.7386539753946595, + "grad_norm": 0.31048427444162374, + "learning_rate": 3.890560664469496e-06, + "loss": 0.4208, + "step": 16638 + }, + { + "epoch": 2.738818559613776, + "grad_norm": 0.3184756943376093, + "learning_rate": 3.890094934854727e-06, + "loss": 0.4388, + "step": 16639 + }, + { + "epoch": 2.7389831438328924, + "grad_norm": 0.32833610141557723, + "learning_rate": 3.889629211894999e-06, + "loss": 0.4252, + "step": 16640 + }, + { + "epoch": 2.7391477280520085, + "grad_norm": 0.32894370976124304, + "learning_rate": 3.889163495595393e-06, + "loss": 0.4323, + "step": 16641 + }, + { + "epoch": 2.739312312271125, + "grad_norm": 0.7708130390270826, + "learning_rate": 3.888697785960991e-06, + "loss": 0.4433, + "step": 16642 + }, + { + "epoch": 2.7394768964902414, + "grad_norm": 0.2399325823005569, + "learning_rate": 3.888232082996877e-06, + "loss": 0.4197, + "step": 16643 + }, + { + "epoch": 2.739641480709358, + "grad_norm": 0.700859157485469, + "learning_rate": 3.887766386708129e-06, + "loss": 0.4392, + "step": 16644 + }, + { + "epoch": 2.7398060649284743, + "grad_norm": 0.4217182459833526, + "learning_rate": 3.887300697099829e-06, + "loss": 0.4433, + "step": 16645 + }, + { + "epoch": 2.739970649147591, + "grad_norm": 0.46884546102603025, + "learning_rate": 3.8868350141770595e-06, + "loss": 0.4262, + "step": 16646 + }, + { + "epoch": 2.7401352333667073, + "grad_norm": 0.3110915519154355, + "learning_rate": 3.8863693379449015e-06, + "loss": 0.4337, + "step": 16647 + }, + { + "epoch": 2.7402998175858237, + "grad_norm": 0.3352084069388875, + "learning_rate": 3.8859036684084364e-06, + "loss": 0.4221, + "step": 16648 + }, + { + "epoch": 2.74046440180494, + "grad_norm": 0.36378279540182057, + "learning_rate": 3.885438005572746e-06, + "loss": 0.4214, + "step": 16649 + }, + { + "epoch": 2.7406289860240567, + "grad_norm": 0.3158512433778828, + "learning_rate": 3.884972349442911e-06, + "loss": 0.4154, + "step": 16650 + }, + { + "epoch": 2.740793570243173, + "grad_norm": 0.307548477999172, + "learning_rate": 3.8845067000240125e-06, + "loss": 0.4315, + "step": 16651 + }, + { + "epoch": 2.7409581544622896, + "grad_norm": 0.38812292332638837, + "learning_rate": 3.8840410573211286e-06, + "loss": 0.4481, + "step": 16652 + }, + { + "epoch": 2.741122738681406, + "grad_norm": 0.2783823676112584, + "learning_rate": 3.883575421339347e-06, + "loss": 0.4244, + "step": 16653 + }, + { + "epoch": 2.7412873229005226, + "grad_norm": 0.3557886541737026, + "learning_rate": 3.8831097920837395e-06, + "loss": 0.4439, + "step": 16654 + }, + { + "epoch": 2.741451907119639, + "grad_norm": 0.32008413498538474, + "learning_rate": 3.882644169559396e-06, + "loss": 0.4327, + "step": 16655 + }, + { + "epoch": 2.7416164913387555, + "grad_norm": 0.304571256600945, + "learning_rate": 3.882178553771391e-06, + "loss": 0.4448, + "step": 16656 + }, + { + "epoch": 2.741781075557872, + "grad_norm": 0.27985675276001454, + "learning_rate": 3.881712944724808e-06, + "loss": 0.4445, + "step": 16657 + }, + { + "epoch": 2.7419456597769885, + "grad_norm": 0.37539069739389663, + "learning_rate": 3.881247342424726e-06, + "loss": 0.4306, + "step": 16658 + }, + { + "epoch": 2.742110243996105, + "grad_norm": 0.3604991298614318, + "learning_rate": 3.880781746876226e-06, + "loss": 0.4246, + "step": 16659 + }, + { + "epoch": 2.7422748282152214, + "grad_norm": 0.34497239921676615, + "learning_rate": 3.880316158084388e-06, + "loss": 0.431, + "step": 16660 + }, + { + "epoch": 2.742439412434338, + "grad_norm": 0.32782234520476394, + "learning_rate": 3.879850576054293e-06, + "loss": 0.4266, + "step": 16661 + }, + { + "epoch": 2.7426039966534543, + "grad_norm": 0.355377901894709, + "learning_rate": 3.879385000791022e-06, + "loss": 0.4313, + "step": 16662 + }, + { + "epoch": 2.742768580872571, + "grad_norm": 0.3622834351699398, + "learning_rate": 3.878919432299652e-06, + "loss": 0.4421, + "step": 16663 + }, + { + "epoch": 2.7429331650916873, + "grad_norm": 0.3121404542685007, + "learning_rate": 3.878453870585268e-06, + "loss": 0.4405, + "step": 16664 + }, + { + "epoch": 2.7430977493108037, + "grad_norm": 0.5422525704023888, + "learning_rate": 3.877988315652946e-06, + "loss": 0.4363, + "step": 16665 + }, + { + "epoch": 2.7432623335299198, + "grad_norm": 0.33630247392790386, + "learning_rate": 3.877522767507767e-06, + "loss": 0.4226, + "step": 16666 + }, + { + "epoch": 2.7434269177490362, + "grad_norm": 0.3488597023051155, + "learning_rate": 3.877057226154812e-06, + "loss": 0.4228, + "step": 16667 + }, + { + "epoch": 2.7435915019681527, + "grad_norm": 0.5512200340488606, + "learning_rate": 3.876591691599159e-06, + "loss": 0.4241, + "step": 16668 + }, + { + "epoch": 2.743756086187269, + "grad_norm": 0.2967852679199231, + "learning_rate": 3.87612616384589e-06, + "loss": 0.451, + "step": 16669 + }, + { + "epoch": 2.7439206704063857, + "grad_norm": 0.329573071370927, + "learning_rate": 3.875660642900081e-06, + "loss": 0.4412, + "step": 16670 + }, + { + "epoch": 2.744085254625502, + "grad_norm": 0.3357303574720884, + "learning_rate": 3.875195128766815e-06, + "loss": 0.4352, + "step": 16671 + }, + { + "epoch": 2.7442498388446186, + "grad_norm": 0.3118697057421077, + "learning_rate": 3.874729621451172e-06, + "loss": 0.422, + "step": 16672 + }, + { + "epoch": 2.744414423063735, + "grad_norm": 0.324592192857279, + "learning_rate": 3.874264120958227e-06, + "loss": 0.433, + "step": 16673 + }, + { + "epoch": 2.7445790072828515, + "grad_norm": 0.356134316887983, + "learning_rate": 3.873798627293065e-06, + "loss": 0.4342, + "step": 16674 + }, + { + "epoch": 2.744743591501968, + "grad_norm": 0.5216140005545202, + "learning_rate": 3.873333140460761e-06, + "loss": 0.4235, + "step": 16675 + }, + { + "epoch": 2.7449081757210845, + "grad_norm": 0.5477538161873636, + "learning_rate": 3.872867660466396e-06, + "loss": 0.4384, + "step": 16676 + }, + { + "epoch": 2.745072759940201, + "grad_norm": 0.29347997209182203, + "learning_rate": 3.872402187315048e-06, + "loss": 0.4281, + "step": 16677 + }, + { + "epoch": 2.7452373441593174, + "grad_norm": 0.3099905198378355, + "learning_rate": 3.871936721011797e-06, + "loss": 0.4214, + "step": 16678 + }, + { + "epoch": 2.745401928378434, + "grad_norm": 0.4122803175887726, + "learning_rate": 3.8714712615617226e-06, + "loss": 0.445, + "step": 16679 + }, + { + "epoch": 2.7455665125975504, + "grad_norm": 0.9142629401607756, + "learning_rate": 3.8710058089699025e-06, + "loss": 0.4357, + "step": 16680 + }, + { + "epoch": 2.745731096816667, + "grad_norm": 0.3001133513388538, + "learning_rate": 3.870540363241417e-06, + "loss": 0.4291, + "step": 16681 + }, + { + "epoch": 2.7458956810357833, + "grad_norm": 0.4168866990618788, + "learning_rate": 3.8700749243813415e-06, + "loss": 0.4269, + "step": 16682 + }, + { + "epoch": 2.7460602652548998, + "grad_norm": 0.30645876076786355, + "learning_rate": 3.86960949239476e-06, + "loss": 0.4262, + "step": 16683 + }, + { + "epoch": 2.7462248494740162, + "grad_norm": 0.30916232701484836, + "learning_rate": 3.869144067286745e-06, + "loss": 0.4342, + "step": 16684 + }, + { + "epoch": 2.7463894336931327, + "grad_norm": 0.3309646231801186, + "learning_rate": 3.868678649062381e-06, + "loss": 0.4199, + "step": 16685 + }, + { + "epoch": 2.746554017912249, + "grad_norm": 0.36138802785749186, + "learning_rate": 3.868213237726742e-06, + "loss": 0.4446, + "step": 16686 + }, + { + "epoch": 2.7467186021313656, + "grad_norm": 0.33220801329003613, + "learning_rate": 3.8677478332849084e-06, + "loss": 0.4171, + "step": 16687 + }, + { + "epoch": 2.746883186350482, + "grad_norm": 0.30982777008522744, + "learning_rate": 3.867282435741959e-06, + "loss": 0.4282, + "step": 16688 + }, + { + "epoch": 2.7470477705695986, + "grad_norm": 0.36053955933065407, + "learning_rate": 3.8668170451029694e-06, + "loss": 0.4546, + "step": 16689 + }, + { + "epoch": 2.747212354788715, + "grad_norm": 0.3115291673408375, + "learning_rate": 3.8663516613730204e-06, + "loss": 0.4223, + "step": 16690 + }, + { + "epoch": 2.7473769390078315, + "grad_norm": 0.36712438720724067, + "learning_rate": 3.865886284557188e-06, + "loss": 0.4214, + "step": 16691 + }, + { + "epoch": 2.747541523226948, + "grad_norm": 0.4542084555280313, + "learning_rate": 3.865420914660553e-06, + "loss": 0.43, + "step": 16692 + }, + { + "epoch": 2.7477061074460645, + "grad_norm": 0.31832298830736644, + "learning_rate": 3.86495555168819e-06, + "loss": 0.4374, + "step": 16693 + }, + { + "epoch": 2.747870691665181, + "grad_norm": 0.3556434858838793, + "learning_rate": 3.864490195645182e-06, + "loss": 0.4368, + "step": 16694 + }, + { + "epoch": 2.7480352758842974, + "grad_norm": 0.3578218709637887, + "learning_rate": 3.864024846536602e-06, + "loss": 0.4255, + "step": 16695 + }, + { + "epoch": 2.748199860103414, + "grad_norm": 0.272539267466431, + "learning_rate": 3.863559504367527e-06, + "loss": 0.4247, + "step": 16696 + }, + { + "epoch": 2.7483644443225304, + "grad_norm": 0.36092119887107127, + "learning_rate": 3.863094169143038e-06, + "loss": 0.4139, + "step": 16697 + }, + { + "epoch": 2.748529028541647, + "grad_norm": 0.3138993620997971, + "learning_rate": 3.862628840868211e-06, + "loss": 0.4451, + "step": 16698 + }, + { + "epoch": 2.7486936127607633, + "grad_norm": 0.36947159209977626, + "learning_rate": 3.862163519548124e-06, + "loss": 0.4372, + "step": 16699 + }, + { + "epoch": 2.7488581969798798, + "grad_norm": 0.3174300936180199, + "learning_rate": 3.861698205187853e-06, + "loss": 0.4305, + "step": 16700 + }, + { + "epoch": 2.7490227811989962, + "grad_norm": 0.290319003808263, + "learning_rate": 3.861232897792478e-06, + "loss": 0.4328, + "step": 16701 + }, + { + "epoch": 2.7491873654181127, + "grad_norm": 0.3098957433837338, + "learning_rate": 3.860767597367076e-06, + "loss": 0.4244, + "step": 16702 + }, + { + "epoch": 2.749351949637229, + "grad_norm": 0.32129061982609347, + "learning_rate": 3.860302303916718e-06, + "loss": 0.4378, + "step": 16703 + }, + { + "epoch": 2.7495165338563456, + "grad_norm": 0.3653148104201589, + "learning_rate": 3.85983701744649e-06, + "loss": 0.4245, + "step": 16704 + }, + { + "epoch": 2.749681118075462, + "grad_norm": 0.7149367519498637, + "learning_rate": 3.859371737961464e-06, + "loss": 0.4298, + "step": 16705 + }, + { + "epoch": 2.7498457022945786, + "grad_norm": 0.36724382587372284, + "learning_rate": 3.858906465466718e-06, + "loss": 0.4278, + "step": 16706 + }, + { + "epoch": 2.7500102865136946, + "grad_norm": 0.2925452421742609, + "learning_rate": 3.858441199967328e-06, + "loss": 0.4211, + "step": 16707 + }, + { + "epoch": 2.750174870732811, + "grad_norm": 0.2683395976583217, + "learning_rate": 3.857975941468373e-06, + "loss": 0.4175, + "step": 16708 + }, + { + "epoch": 2.7503394549519276, + "grad_norm": 0.3136844198419058, + "learning_rate": 3.857510689974927e-06, + "loss": 0.447, + "step": 16709 + }, + { + "epoch": 2.750504039171044, + "grad_norm": 0.3030603614875939, + "learning_rate": 3.857045445492068e-06, + "loss": 0.4385, + "step": 16710 + }, + { + "epoch": 2.7506686233901605, + "grad_norm": 0.283294155437198, + "learning_rate": 3.856580208024873e-06, + "loss": 0.4033, + "step": 16711 + }, + { + "epoch": 2.750833207609277, + "grad_norm": 0.5659848655842254, + "learning_rate": 3.8561149775784174e-06, + "loss": 0.4346, + "step": 16712 + }, + { + "epoch": 2.7509977918283934, + "grad_norm": 0.3151413366114171, + "learning_rate": 3.85564975415778e-06, + "loss": 0.4431, + "step": 16713 + }, + { + "epoch": 2.75116237604751, + "grad_norm": 0.2758156307202911, + "learning_rate": 3.855184537768033e-06, + "loss": 0.4267, + "step": 16714 + }, + { + "epoch": 2.7513269602666264, + "grad_norm": 0.3780966181832298, + "learning_rate": 3.854719328414257e-06, + "loss": 0.4239, + "step": 16715 + }, + { + "epoch": 2.751491544485743, + "grad_norm": 0.2821931796559113, + "learning_rate": 3.854254126101525e-06, + "loss": 0.417, + "step": 16716 + }, + { + "epoch": 2.7516561287048593, + "grad_norm": 0.3266347677830889, + "learning_rate": 3.853788930834914e-06, + "loss": 0.4276, + "step": 16717 + }, + { + "epoch": 2.751820712923976, + "grad_norm": 0.312747836556435, + "learning_rate": 3.853323742619501e-06, + "loss": 0.4294, + "step": 16718 + }, + { + "epoch": 2.7519852971430923, + "grad_norm": 0.35643769445508905, + "learning_rate": 3.852858561460361e-06, + "loss": 0.4214, + "step": 16719 + }, + { + "epoch": 2.7521498813622087, + "grad_norm": 0.3929579469658608, + "learning_rate": 3.8523933873625695e-06, + "loss": 0.4401, + "step": 16720 + }, + { + "epoch": 2.752314465581325, + "grad_norm": 0.32304296447919406, + "learning_rate": 3.851928220331202e-06, + "loss": 0.4291, + "step": 16721 + }, + { + "epoch": 2.7524790498004417, + "grad_norm": 0.29631745968376494, + "learning_rate": 3.851463060371337e-06, + "loss": 0.4295, + "step": 16722 + }, + { + "epoch": 2.752643634019558, + "grad_norm": 0.31666231721168825, + "learning_rate": 3.850997907488048e-06, + "loss": 0.4301, + "step": 16723 + }, + { + "epoch": 2.7528082182386746, + "grad_norm": 0.3134924938039652, + "learning_rate": 3.850532761686411e-06, + "loss": 0.4151, + "step": 16724 + }, + { + "epoch": 2.752972802457791, + "grad_norm": 0.3341055846699733, + "learning_rate": 3.850067622971502e-06, + "loss": 0.4158, + "step": 16725 + }, + { + "epoch": 2.7531373866769075, + "grad_norm": 0.3944613390653974, + "learning_rate": 3.849602491348394e-06, + "loss": 0.4179, + "step": 16726 + }, + { + "epoch": 2.753301970896024, + "grad_norm": 0.34549310637003094, + "learning_rate": 3.849137366822165e-06, + "loss": 0.4228, + "step": 16727 + }, + { + "epoch": 2.7534665551151405, + "grad_norm": 0.48093379685876775, + "learning_rate": 3.848672249397888e-06, + "loss": 0.4329, + "step": 16728 + }, + { + "epoch": 2.753631139334257, + "grad_norm": 0.2653898037593713, + "learning_rate": 3.8482071390806405e-06, + "loss": 0.4263, + "step": 16729 + }, + { + "epoch": 2.7537957235533734, + "grad_norm": 0.3569534889161102, + "learning_rate": 3.8477420358754955e-06, + "loss": 0.4311, + "step": 16730 + }, + { + "epoch": 2.75396030777249, + "grad_norm": 0.37801092775394957, + "learning_rate": 3.847276939787531e-06, + "loss": 0.4154, + "step": 16731 + }, + { + "epoch": 2.754124891991606, + "grad_norm": 0.28751365365028175, + "learning_rate": 3.84681185082182e-06, + "loss": 0.4275, + "step": 16732 + }, + { + "epoch": 2.7542894762107224, + "grad_norm": 0.35444783230134985, + "learning_rate": 3.846346768983435e-06, + "loss": 0.4274, + "step": 16733 + }, + { + "epoch": 2.754454060429839, + "grad_norm": 0.35730086522911014, + "learning_rate": 3.845881694277456e-06, + "loss": 0.4165, + "step": 16734 + }, + { + "epoch": 2.7546186446489553, + "grad_norm": 0.34307664484838024, + "learning_rate": 3.845416626708952e-06, + "loss": 0.4241, + "step": 16735 + }, + { + "epoch": 2.754783228868072, + "grad_norm": 0.5066426180430484, + "learning_rate": 3.844951566283003e-06, + "loss": 0.4287, + "step": 16736 + }, + { + "epoch": 2.7549478130871883, + "grad_norm": 0.2992610263698568, + "learning_rate": 3.84448651300468e-06, + "loss": 0.4282, + "step": 16737 + }, + { + "epoch": 2.7551123973063048, + "grad_norm": 0.36298104637670003, + "learning_rate": 3.844021466879059e-06, + "loss": 0.4406, + "step": 16738 + }, + { + "epoch": 2.755276981525421, + "grad_norm": 0.25724806667635997, + "learning_rate": 3.843556427911214e-06, + "loss": 0.4136, + "step": 16739 + }, + { + "epoch": 2.7554415657445377, + "grad_norm": 0.29443173858665445, + "learning_rate": 3.843091396106218e-06, + "loss": 0.4388, + "step": 16740 + }, + { + "epoch": 2.755606149963654, + "grad_norm": 0.525197192040113, + "learning_rate": 3.842626371469148e-06, + "loss": 0.4149, + "step": 16741 + }, + { + "epoch": 2.7557707341827706, + "grad_norm": 0.31673011717278055, + "learning_rate": 3.842161354005076e-06, + "loss": 0.4333, + "step": 16742 + }, + { + "epoch": 2.755935318401887, + "grad_norm": 0.3159479306854577, + "learning_rate": 3.8416963437190776e-06, + "loss": 0.4299, + "step": 16743 + }, + { + "epoch": 2.7560999026210036, + "grad_norm": 0.3180567009797619, + "learning_rate": 3.841231340616224e-06, + "loss": 0.4415, + "step": 16744 + }, + { + "epoch": 2.75626448684012, + "grad_norm": 0.3046299149431181, + "learning_rate": 3.840766344701594e-06, + "loss": 0.431, + "step": 16745 + }, + { + "epoch": 2.7564290710592365, + "grad_norm": 0.32926788198908075, + "learning_rate": 3.840301355980257e-06, + "loss": 0.4296, + "step": 16746 + }, + { + "epoch": 2.756593655278353, + "grad_norm": 0.3202232197890211, + "learning_rate": 3.839836374457288e-06, + "loss": 0.4212, + "step": 16747 + }, + { + "epoch": 2.7567582394974695, + "grad_norm": 0.3292743896916637, + "learning_rate": 3.839371400137761e-06, + "loss": 0.4425, + "step": 16748 + }, + { + "epoch": 2.756922823716586, + "grad_norm": 0.3225566149924342, + "learning_rate": 3.838906433026749e-06, + "loss": 0.4294, + "step": 16749 + }, + { + "epoch": 2.7570874079357024, + "grad_norm": 0.43059850699838753, + "learning_rate": 3.838441473129328e-06, + "loss": 0.4374, + "step": 16750 + }, + { + "epoch": 2.757251992154819, + "grad_norm": 0.3935972124510023, + "learning_rate": 3.837976520450567e-06, + "loss": 0.4468, + "step": 16751 + }, + { + "epoch": 2.7574165763739353, + "grad_norm": 0.3989199727825013, + "learning_rate": 3.837511574995545e-06, + "loss": 0.4457, + "step": 16752 + }, + { + "epoch": 2.757581160593052, + "grad_norm": 0.3078040887317582, + "learning_rate": 3.837046636769331e-06, + "loss": 0.4484, + "step": 16753 + }, + { + "epoch": 2.7577457448121683, + "grad_norm": 0.4155164881385347, + "learning_rate": 3.836581705776998e-06, + "loss": 0.4235, + "step": 16754 + }, + { + "epoch": 2.7579103290312847, + "grad_norm": 0.9218484543356673, + "learning_rate": 3.836116782023624e-06, + "loss": 0.427, + "step": 16755 + }, + { + "epoch": 2.758074913250401, + "grad_norm": 0.4694404570897363, + "learning_rate": 3.835651865514277e-06, + "loss": 0.4346, + "step": 16756 + }, + { + "epoch": 2.7582394974695177, + "grad_norm": 0.3609226084141087, + "learning_rate": 3.835186956254031e-06, + "loss": 0.4463, + "step": 16757 + }, + { + "epoch": 2.758404081688634, + "grad_norm": 0.38001556363092925, + "learning_rate": 3.834722054247959e-06, + "loss": 0.4076, + "step": 16758 + }, + { + "epoch": 2.7585686659077506, + "grad_norm": 0.3146518163215466, + "learning_rate": 3.834257159501137e-06, + "loss": 0.4341, + "step": 16759 + }, + { + "epoch": 2.758733250126867, + "grad_norm": 0.3258618934792849, + "learning_rate": 3.8337922720186326e-06, + "loss": 0.4361, + "step": 16760 + }, + { + "epoch": 2.7588978343459836, + "grad_norm": 0.41980482455061724, + "learning_rate": 3.8333273918055226e-06, + "loss": 0.4028, + "step": 16761 + }, + { + "epoch": 2.7590624185651, + "grad_norm": 0.3035359627965373, + "learning_rate": 3.832862518866879e-06, + "loss": 0.4333, + "step": 16762 + }, + { + "epoch": 2.7592270027842165, + "grad_norm": 0.2792010265807511, + "learning_rate": 3.83239765320777e-06, + "loss": 0.4239, + "step": 16763 + }, + { + "epoch": 2.759391587003333, + "grad_norm": 0.3004060675697017, + "learning_rate": 3.8319327948332744e-06, + "loss": 0.4394, + "step": 16764 + }, + { + "epoch": 2.7595561712224494, + "grad_norm": 0.46446135062881594, + "learning_rate": 3.8314679437484594e-06, + "loss": 0.4369, + "step": 16765 + }, + { + "epoch": 2.759720755441566, + "grad_norm": 0.44409316368375035, + "learning_rate": 3.8310030999584014e-06, + "loss": 0.4264, + "step": 16766 + }, + { + "epoch": 2.7598853396606824, + "grad_norm": 0.3156353402627495, + "learning_rate": 3.830538263468169e-06, + "loss": 0.404, + "step": 16767 + }, + { + "epoch": 2.760049923879799, + "grad_norm": 0.5656328540429268, + "learning_rate": 3.830073434282837e-06, + "loss": 0.4191, + "step": 16768 + }, + { + "epoch": 2.7602145080989153, + "grad_norm": 0.34012852273365457, + "learning_rate": 3.829608612407476e-06, + "loss": 0.415, + "step": 16769 + }, + { + "epoch": 2.760379092318032, + "grad_norm": 0.3749393374332376, + "learning_rate": 3.829143797847157e-06, + "loss": 0.4375, + "step": 16770 + }, + { + "epoch": 2.7605436765371483, + "grad_norm": 0.35255460757698165, + "learning_rate": 3.828678990606955e-06, + "loss": 0.4348, + "step": 16771 + }, + { + "epoch": 2.7607082607562647, + "grad_norm": 0.26503899237119494, + "learning_rate": 3.828214190691939e-06, + "loss": 0.4285, + "step": 16772 + }, + { + "epoch": 2.760872844975381, + "grad_norm": 0.2901410596208873, + "learning_rate": 3.827749398107182e-06, + "loss": 0.4175, + "step": 16773 + }, + { + "epoch": 2.7610374291944972, + "grad_norm": 0.5798815384488164, + "learning_rate": 3.827284612857754e-06, + "loss": 0.4396, + "step": 16774 + }, + { + "epoch": 2.7612020134136137, + "grad_norm": 0.3729746218682766, + "learning_rate": 3.82681983494873e-06, + "loss": 0.4217, + "step": 16775 + }, + { + "epoch": 2.76136659763273, + "grad_norm": 0.3113239690103586, + "learning_rate": 3.826355064385179e-06, + "loss": 0.4367, + "step": 16776 + }, + { + "epoch": 2.7615311818518467, + "grad_norm": 0.3720192886770401, + "learning_rate": 3.825890301172171e-06, + "loss": 0.435, + "step": 16777 + }, + { + "epoch": 2.761695766070963, + "grad_norm": 0.29142275136633367, + "learning_rate": 3.82542554531478e-06, + "loss": 0.413, + "step": 16778 + }, + { + "epoch": 2.7618603502900796, + "grad_norm": 0.3192271465070386, + "learning_rate": 3.824960796818076e-06, + "loss": 0.4261, + "step": 16779 + }, + { + "epoch": 2.762024934509196, + "grad_norm": 0.3484695951419301, + "learning_rate": 3.82449605568713e-06, + "loss": 0.4347, + "step": 16780 + }, + { + "epoch": 2.7621895187283125, + "grad_norm": 0.31107231719861583, + "learning_rate": 3.824031321927014e-06, + "loss": 0.4563, + "step": 16781 + }, + { + "epoch": 2.762354102947429, + "grad_norm": 0.30327560992114966, + "learning_rate": 3.823566595542798e-06, + "loss": 0.4013, + "step": 16782 + }, + { + "epoch": 2.7625186871665455, + "grad_norm": 0.392686760340269, + "learning_rate": 3.823101876539556e-06, + "loss": 0.4263, + "step": 16783 + }, + { + "epoch": 2.762683271385662, + "grad_norm": 0.4008420205309258, + "learning_rate": 3.822637164922352e-06, + "loss": 0.4276, + "step": 16784 + }, + { + "epoch": 2.7628478556047784, + "grad_norm": 0.32373652463314856, + "learning_rate": 3.822172460696264e-06, + "loss": 0.4248, + "step": 16785 + }, + { + "epoch": 2.763012439823895, + "grad_norm": 0.32680014460348633, + "learning_rate": 3.821707763866358e-06, + "loss": 0.4492, + "step": 16786 + }, + { + "epoch": 2.7631770240430114, + "grad_norm": 0.35168595861593227, + "learning_rate": 3.821243074437706e-06, + "loss": 0.443, + "step": 16787 + }, + { + "epoch": 2.763341608262128, + "grad_norm": 0.3233521352290655, + "learning_rate": 3.820778392415379e-06, + "loss": 0.4318, + "step": 16788 + }, + { + "epoch": 2.7635061924812443, + "grad_norm": 0.5374254233166486, + "learning_rate": 3.820313717804448e-06, + "loss": 0.4281, + "step": 16789 + }, + { + "epoch": 2.7636707767003608, + "grad_norm": 0.35161183229666393, + "learning_rate": 3.819849050609982e-06, + "loss": 0.4144, + "step": 16790 + }, + { + "epoch": 2.7638353609194772, + "grad_norm": 0.46119190096611545, + "learning_rate": 3.81938439083705e-06, + "loss": 0.4266, + "step": 16791 + }, + { + "epoch": 2.7639999451385937, + "grad_norm": 0.2709732273695854, + "learning_rate": 3.818919738490726e-06, + "loss": 0.4288, + "step": 16792 + }, + { + "epoch": 2.76416452935771, + "grad_norm": 0.2927553770775444, + "learning_rate": 3.818455093576078e-06, + "loss": 0.4487, + "step": 16793 + }, + { + "epoch": 2.7643291135768266, + "grad_norm": 0.33049937178502276, + "learning_rate": 3.817990456098176e-06, + "loss": 0.4283, + "step": 16794 + }, + { + "epoch": 2.764493697795943, + "grad_norm": 0.5151043041028656, + "learning_rate": 3.817525826062088e-06, + "loss": 0.4361, + "step": 16795 + }, + { + "epoch": 2.7646582820150596, + "grad_norm": 0.44716660786155266, + "learning_rate": 3.8170612034728886e-06, + "loss": 0.4351, + "step": 16796 + }, + { + "epoch": 2.764822866234176, + "grad_norm": 0.3042095373540588, + "learning_rate": 3.8165965883356435e-06, + "loss": 0.4206, + "step": 16797 + }, + { + "epoch": 2.7649874504532925, + "grad_norm": 0.3198591642725671, + "learning_rate": 3.816131980655422e-06, + "loss": 0.446, + "step": 16798 + }, + { + "epoch": 2.7651520346724086, + "grad_norm": 0.3194612663957039, + "learning_rate": 3.815667380437298e-06, + "loss": 0.417, + "step": 16799 + }, + { + "epoch": 2.765316618891525, + "grad_norm": 0.5235493512889577, + "learning_rate": 3.815202787686337e-06, + "loss": 0.4101, + "step": 16800 + }, + { + "epoch": 2.7654812031106415, + "grad_norm": 0.2875249223023592, + "learning_rate": 3.8147382024076104e-06, + "loss": 0.4281, + "step": 16801 + }, + { + "epoch": 2.765645787329758, + "grad_norm": 0.4026276153596572, + "learning_rate": 3.8142736246061864e-06, + "loss": 0.4598, + "step": 16802 + }, + { + "epoch": 2.7658103715488744, + "grad_norm": 0.27404729719891796, + "learning_rate": 3.813809054287135e-06, + "loss": 0.4086, + "step": 16803 + }, + { + "epoch": 2.765974955767991, + "grad_norm": 0.3145397887623035, + "learning_rate": 3.8133444914555255e-06, + "loss": 0.4493, + "step": 16804 + }, + { + "epoch": 2.7661395399871074, + "grad_norm": 0.3397846067693523, + "learning_rate": 3.8128799361164277e-06, + "loss": 0.4198, + "step": 16805 + }, + { + "epoch": 2.766304124206224, + "grad_norm": 0.35722500103087296, + "learning_rate": 3.81241538827491e-06, + "loss": 0.4365, + "step": 16806 + }, + { + "epoch": 2.7664687084253403, + "grad_norm": 0.30238379013409583, + "learning_rate": 3.8119508479360402e-06, + "loss": 0.4608, + "step": 16807 + }, + { + "epoch": 2.766633292644457, + "grad_norm": 0.3440353515744379, + "learning_rate": 3.811486315104889e-06, + "loss": 0.4381, + "step": 16808 + }, + { + "epoch": 2.7667978768635733, + "grad_norm": 0.35348389908653965, + "learning_rate": 3.8110217897865228e-06, + "loss": 0.4347, + "step": 16809 + }, + { + "epoch": 2.7669624610826897, + "grad_norm": 0.5701370655291501, + "learning_rate": 3.810557271986013e-06, + "loss": 0.4434, + "step": 16810 + }, + { + "epoch": 2.767127045301806, + "grad_norm": 0.3437702405996044, + "learning_rate": 3.8100927617084262e-06, + "loss": 0.4401, + "step": 16811 + }, + { + "epoch": 2.7672916295209227, + "grad_norm": 0.3378670724201335, + "learning_rate": 3.8096282589588325e-06, + "loss": 0.4188, + "step": 16812 + }, + { + "epoch": 2.767456213740039, + "grad_norm": 0.35544705652232955, + "learning_rate": 3.8091637637423014e-06, + "loss": 0.4518, + "step": 16813 + }, + { + "epoch": 2.7676207979591556, + "grad_norm": 0.31464462787067055, + "learning_rate": 3.808699276063896e-06, + "loss": 0.4274, + "step": 16814 + }, + { + "epoch": 2.767785382178272, + "grad_norm": 0.2459535141240698, + "learning_rate": 3.808234795928692e-06, + "loss": 0.425, + "step": 16815 + }, + { + "epoch": 2.7679499663973886, + "grad_norm": 0.3086223489073096, + "learning_rate": 3.80777032334175e-06, + "loss": 0.4215, + "step": 16816 + }, + { + "epoch": 2.768114550616505, + "grad_norm": 0.26936310532731195, + "learning_rate": 3.807305858308145e-06, + "loss": 0.405, + "step": 16817 + }, + { + "epoch": 2.7682791348356215, + "grad_norm": 0.3025565113277676, + "learning_rate": 3.80684140083294e-06, + "loss": 0.437, + "step": 16818 + }, + { + "epoch": 2.768443719054738, + "grad_norm": 0.2957806754182231, + "learning_rate": 3.8063769509212065e-06, + "loss": 0.4352, + "step": 16819 + }, + { + "epoch": 2.7686083032738544, + "grad_norm": 0.33027851952704007, + "learning_rate": 3.8059125085780105e-06, + "loss": 0.4393, + "step": 16820 + }, + { + "epoch": 2.768772887492971, + "grad_norm": 0.36777594773645805, + "learning_rate": 3.8054480738084195e-06, + "loss": 0.4286, + "step": 16821 + }, + { + "epoch": 2.7689374717120874, + "grad_norm": 0.361574667511148, + "learning_rate": 3.804983646617503e-06, + "loss": 0.418, + "step": 16822 + }, + { + "epoch": 2.769102055931204, + "grad_norm": 0.36971154574152404, + "learning_rate": 3.804519227010326e-06, + "loss": 0.4427, + "step": 16823 + }, + { + "epoch": 2.7692666401503203, + "grad_norm": 0.32077205106904844, + "learning_rate": 3.8040548149919594e-06, + "loss": 0.4231, + "step": 16824 + }, + { + "epoch": 2.769431224369437, + "grad_norm": 0.2944188371284831, + "learning_rate": 3.8035904105674672e-06, + "loss": 0.4175, + "step": 16825 + }, + { + "epoch": 2.7695958085885533, + "grad_norm": 0.31903723649409144, + "learning_rate": 3.8031260137419207e-06, + "loss": 0.4325, + "step": 16826 + }, + { + "epoch": 2.7697603928076697, + "grad_norm": 0.3292297061095488, + "learning_rate": 3.802661624520384e-06, + "loss": 0.4273, + "step": 16827 + }, + { + "epoch": 2.769924977026786, + "grad_norm": 0.3541969818547337, + "learning_rate": 3.802197242907924e-06, + "loss": 0.4334, + "step": 16828 + }, + { + "epoch": 2.7700895612459027, + "grad_norm": 0.3672254462240884, + "learning_rate": 3.8017328689096106e-06, + "loss": 0.4251, + "step": 16829 + }, + { + "epoch": 2.770254145465019, + "grad_norm": 0.3457483782958529, + "learning_rate": 3.801268502530509e-06, + "loss": 0.4237, + "step": 16830 + }, + { + "epoch": 2.7704187296841356, + "grad_norm": 0.28709007258308455, + "learning_rate": 3.800804143775687e-06, + "loss": 0.4207, + "step": 16831 + }, + { + "epoch": 2.770583313903252, + "grad_norm": 0.2958632409294842, + "learning_rate": 3.8003397926502106e-06, + "loss": 0.4531, + "step": 16832 + }, + { + "epoch": 2.7707478981223685, + "grad_norm": 0.35996068001301706, + "learning_rate": 3.7998754491591478e-06, + "loss": 0.4378, + "step": 16833 + }, + { + "epoch": 2.770912482341485, + "grad_norm": 0.45725835910002477, + "learning_rate": 3.7994111133075653e-06, + "loss": 0.4312, + "step": 16834 + }, + { + "epoch": 2.7710770665606015, + "grad_norm": 0.37072152053424307, + "learning_rate": 3.7989467851005267e-06, + "loss": 0.4404, + "step": 16835 + }, + { + "epoch": 2.771241650779718, + "grad_norm": 0.38066903273037256, + "learning_rate": 3.7984824645431036e-06, + "loss": 0.4407, + "step": 16836 + }, + { + "epoch": 2.7714062349988344, + "grad_norm": 0.33648006260136515, + "learning_rate": 3.7980181516403584e-06, + "loss": 0.4347, + "step": 16837 + }, + { + "epoch": 2.771570819217951, + "grad_norm": 0.3286620878957204, + "learning_rate": 3.797553846397359e-06, + "loss": 0.4404, + "step": 16838 + }, + { + "epoch": 2.7717354034370674, + "grad_norm": 0.3275340096220997, + "learning_rate": 3.7970895488191717e-06, + "loss": 0.4444, + "step": 16839 + }, + { + "epoch": 2.771899987656184, + "grad_norm": 0.2860578424125304, + "learning_rate": 3.7966252589108634e-06, + "loss": 0.4433, + "step": 16840 + }, + { + "epoch": 2.7720645718753, + "grad_norm": 0.7913351638025952, + "learning_rate": 3.7961609766774994e-06, + "loss": 0.4224, + "step": 16841 + }, + { + "epoch": 2.7722291560944163, + "grad_norm": 0.4807048698131998, + "learning_rate": 3.795696702124145e-06, + "loss": 0.4401, + "step": 16842 + }, + { + "epoch": 2.772393740313533, + "grad_norm": 0.37526450105030956, + "learning_rate": 3.7952324352558676e-06, + "loss": 0.4269, + "step": 16843 + }, + { + "epoch": 2.7725583245326493, + "grad_norm": 0.2909990302337196, + "learning_rate": 3.7947681760777325e-06, + "loss": 0.4238, + "step": 16844 + }, + { + "epoch": 2.7727229087517657, + "grad_norm": 0.3265977697998563, + "learning_rate": 3.7943039245948074e-06, + "loss": 0.4365, + "step": 16845 + }, + { + "epoch": 2.772887492970882, + "grad_norm": 0.3940798235253495, + "learning_rate": 3.7938396808121525e-06, + "loss": 0.4415, + "step": 16846 + }, + { + "epoch": 2.7730520771899987, + "grad_norm": 0.320725253909283, + "learning_rate": 3.793375444734841e-06, + "loss": 0.4166, + "step": 16847 + }, + { + "epoch": 2.773216661409115, + "grad_norm": 0.3463614463862739, + "learning_rate": 3.7929112163679314e-06, + "loss": 0.4337, + "step": 16848 + }, + { + "epoch": 2.7733812456282316, + "grad_norm": 0.5034829550088746, + "learning_rate": 3.7924469957164952e-06, + "loss": 0.4265, + "step": 16849 + }, + { + "epoch": 2.773545829847348, + "grad_norm": 0.40248275758221413, + "learning_rate": 3.791982782785594e-06, + "loss": 0.4103, + "step": 16850 + }, + { + "epoch": 2.7737104140664646, + "grad_norm": 0.3980328594843701, + "learning_rate": 3.7915185775802934e-06, + "loss": 0.424, + "step": 16851 + }, + { + "epoch": 2.773874998285581, + "grad_norm": 0.3646532818983775, + "learning_rate": 3.7910543801056603e-06, + "loss": 0.4261, + "step": 16852 + }, + { + "epoch": 2.7740395825046975, + "grad_norm": 0.3362183017590112, + "learning_rate": 3.7905901903667576e-06, + "loss": 0.4297, + "step": 16853 + }, + { + "epoch": 2.774204166723814, + "grad_norm": 0.37016511030631144, + "learning_rate": 3.7901260083686523e-06, + "loss": 0.4294, + "step": 16854 + }, + { + "epoch": 2.7743687509429305, + "grad_norm": 0.361155081565757, + "learning_rate": 3.7896618341164087e-06, + "loss": 0.4351, + "step": 16855 + }, + { + "epoch": 2.774533335162047, + "grad_norm": 0.3273799523643519, + "learning_rate": 3.789197667615093e-06, + "loss": 0.4266, + "step": 16856 + }, + { + "epoch": 2.7746979193811634, + "grad_norm": 0.33376176425607657, + "learning_rate": 3.7887335088697676e-06, + "loss": 0.4245, + "step": 16857 + }, + { + "epoch": 2.77486250360028, + "grad_norm": 0.3355292741124985, + "learning_rate": 3.7882693578854975e-06, + "loss": 0.4354, + "step": 16858 + }, + { + "epoch": 2.7750270878193963, + "grad_norm": 0.5419348669241673, + "learning_rate": 3.787805214667349e-06, + "loss": 0.4349, + "step": 16859 + }, + { + "epoch": 2.775191672038513, + "grad_norm": 0.30543632162174206, + "learning_rate": 3.787341079220385e-06, + "loss": 0.4534, + "step": 16860 + }, + { + "epoch": 2.7753562562576293, + "grad_norm": 0.42483169974023705, + "learning_rate": 3.7868769515496715e-06, + "loss": 0.4455, + "step": 16861 + }, + { + "epoch": 2.7755208404767457, + "grad_norm": 0.28050471837453267, + "learning_rate": 3.7864128316602714e-06, + "loss": 0.4655, + "step": 16862 + }, + { + "epoch": 2.775685424695862, + "grad_norm": 0.3282592417587267, + "learning_rate": 3.78594871955725e-06, + "loss": 0.4241, + "step": 16863 + }, + { + "epoch": 2.7758500089149787, + "grad_norm": 0.2451870935424479, + "learning_rate": 3.7854846152456732e-06, + "loss": 0.4291, + "step": 16864 + }, + { + "epoch": 2.776014593134095, + "grad_norm": 0.32971710114845926, + "learning_rate": 3.7850205187305992e-06, + "loss": 0.4254, + "step": 16865 + }, + { + "epoch": 2.776179177353211, + "grad_norm": 0.3777895255987735, + "learning_rate": 3.7845564300170998e-06, + "loss": 0.4203, + "step": 16866 + }, + { + "epoch": 2.7763437615723277, + "grad_norm": 0.5279816235551633, + "learning_rate": 3.784092349110231e-06, + "loss": 0.4436, + "step": 16867 + }, + { + "epoch": 2.776508345791444, + "grad_norm": 0.2845430605160568, + "learning_rate": 3.7836282760150646e-06, + "loss": 0.4277, + "step": 16868 + }, + { + "epoch": 2.7766729300105606, + "grad_norm": 0.4033960800609004, + "learning_rate": 3.7831642107366587e-06, + "loss": 0.4309, + "step": 16869 + }, + { + "epoch": 2.776837514229677, + "grad_norm": 0.30830629475683824, + "learning_rate": 3.7827001532800793e-06, + "loss": 0.4338, + "step": 16870 + }, + { + "epoch": 2.7770020984487935, + "grad_norm": 0.27374971523185726, + "learning_rate": 3.7822361036503898e-06, + "loss": 0.4401, + "step": 16871 + }, + { + "epoch": 2.77716668266791, + "grad_norm": 0.36793554515865656, + "learning_rate": 3.7817720618526523e-06, + "loss": 0.4354, + "step": 16872 + }, + { + "epoch": 2.7773312668870265, + "grad_norm": 0.3654739408901122, + "learning_rate": 3.781308027891932e-06, + "loss": 0.4197, + "step": 16873 + }, + { + "epoch": 2.777495851106143, + "grad_norm": 0.2981515247509971, + "learning_rate": 3.7808440017732917e-06, + "loss": 0.4182, + "step": 16874 + }, + { + "epoch": 2.7776604353252594, + "grad_norm": 0.2876842090046348, + "learning_rate": 3.780379983501796e-06, + "loss": 0.4387, + "step": 16875 + }, + { + "epoch": 2.777825019544376, + "grad_norm": 0.5915380201347793, + "learning_rate": 3.779915973082504e-06, + "loss": 0.4339, + "step": 16876 + }, + { + "epoch": 2.7779896037634924, + "grad_norm": 0.47905263011333626, + "learning_rate": 3.779451970520484e-06, + "loss": 0.4493, + "step": 16877 + }, + { + "epoch": 2.778154187982609, + "grad_norm": 0.30270546075192667, + "learning_rate": 3.778987975820796e-06, + "loss": 0.4363, + "step": 16878 + }, + { + "epoch": 2.7783187722017253, + "grad_norm": 0.29825553662753534, + "learning_rate": 3.7785239889885024e-06, + "loss": 0.4263, + "step": 16879 + }, + { + "epoch": 2.7784833564208418, + "grad_norm": 0.33033442175320543, + "learning_rate": 3.778060010028668e-06, + "loss": 0.4245, + "step": 16880 + }, + { + "epoch": 2.7786479406399582, + "grad_norm": 0.36814904182310326, + "learning_rate": 3.7775960389463538e-06, + "loss": 0.443, + "step": 16881 + }, + { + "epoch": 2.7788125248590747, + "grad_norm": 0.43006987518328926, + "learning_rate": 3.777132075746624e-06, + "loss": 0.4367, + "step": 16882 + }, + { + "epoch": 2.778977109078191, + "grad_norm": 0.8368175177155547, + "learning_rate": 3.77666812043454e-06, + "loss": 0.4217, + "step": 16883 + }, + { + "epoch": 2.7791416932973076, + "grad_norm": 0.3555436805334529, + "learning_rate": 3.7762041730151654e-06, + "loss": 0.4227, + "step": 16884 + }, + { + "epoch": 2.779306277516424, + "grad_norm": 0.2839381903773783, + "learning_rate": 3.775740233493563e-06, + "loss": 0.4171, + "step": 16885 + }, + { + "epoch": 2.7794708617355406, + "grad_norm": 0.2696148806985721, + "learning_rate": 3.7752763018747915e-06, + "loss": 0.4255, + "step": 16886 + }, + { + "epoch": 2.779635445954657, + "grad_norm": 0.3281658213205818, + "learning_rate": 3.7748123781639183e-06, + "loss": 0.4477, + "step": 16887 + }, + { + "epoch": 2.7798000301737735, + "grad_norm": 0.38679247783092735, + "learning_rate": 3.7743484623660007e-06, + "loss": 0.4131, + "step": 16888 + }, + { + "epoch": 2.77996461439289, + "grad_norm": 0.33188086006927525, + "learning_rate": 3.7738845544861047e-06, + "loss": 0.4346, + "step": 16889 + }, + { + "epoch": 2.7801291986120065, + "grad_norm": 0.4873338195639668, + "learning_rate": 3.7734206545292896e-06, + "loss": 0.4251, + "step": 16890 + }, + { + "epoch": 2.780293782831123, + "grad_norm": 0.2832226392994431, + "learning_rate": 3.772956762500619e-06, + "loss": 0.448, + "step": 16891 + }, + { + "epoch": 2.7804583670502394, + "grad_norm": 0.3029700069785049, + "learning_rate": 3.7724928784051533e-06, + "loss": 0.4292, + "step": 16892 + }, + { + "epoch": 2.780622951269356, + "grad_norm": 0.30564716249735363, + "learning_rate": 3.772029002247956e-06, + "loss": 0.4342, + "step": 16893 + }, + { + "epoch": 2.7807875354884724, + "grad_norm": 0.3342439886152099, + "learning_rate": 3.7715651340340884e-06, + "loss": 0.4289, + "step": 16894 + }, + { + "epoch": 2.780952119707589, + "grad_norm": 0.3117023301162004, + "learning_rate": 3.7711012737686088e-06, + "loss": 0.4428, + "step": 16895 + }, + { + "epoch": 2.7811167039267053, + "grad_norm": 0.34039420971910117, + "learning_rate": 3.770637421456584e-06, + "loss": 0.4372, + "step": 16896 + }, + { + "epoch": 2.7812812881458218, + "grad_norm": 0.337347976138304, + "learning_rate": 3.7701735771030696e-06, + "loss": 0.4603, + "step": 16897 + }, + { + "epoch": 2.7814458723649382, + "grad_norm": 0.4024168661032073, + "learning_rate": 3.7697097407131326e-06, + "loss": 0.4501, + "step": 16898 + }, + { + "epoch": 2.7816104565840547, + "grad_norm": 0.36673311040074835, + "learning_rate": 3.76924591229183e-06, + "loss": 0.4253, + "step": 16899 + }, + { + "epoch": 2.781775040803171, + "grad_norm": 0.3466210120876975, + "learning_rate": 3.7687820918442248e-06, + "loss": 0.4126, + "step": 16900 + }, + { + "epoch": 2.7819396250222876, + "grad_norm": 0.4994253757949831, + "learning_rate": 3.768318279375378e-06, + "loss": 0.4371, + "step": 16901 + }, + { + "epoch": 2.782104209241404, + "grad_norm": 0.32390373399047606, + "learning_rate": 3.7678544748903487e-06, + "loss": 0.4394, + "step": 16902 + }, + { + "epoch": 2.7822687934605206, + "grad_norm": 0.467440952888479, + "learning_rate": 3.7673906783942002e-06, + "loss": 0.4308, + "step": 16903 + }, + { + "epoch": 2.782433377679637, + "grad_norm": 0.3679244911932761, + "learning_rate": 3.7669268898919917e-06, + "loss": 0.4397, + "step": 16904 + }, + { + "epoch": 2.7825979618987535, + "grad_norm": 0.28117900107872473, + "learning_rate": 3.7664631093887853e-06, + "loss": 0.4439, + "step": 16905 + }, + { + "epoch": 2.78276254611787, + "grad_norm": 0.3595577765149056, + "learning_rate": 3.765999336889639e-06, + "loss": 0.4472, + "step": 16906 + }, + { + "epoch": 2.7829271303369865, + "grad_norm": 0.3413085463391491, + "learning_rate": 3.7655355723996175e-06, + "loss": 0.4487, + "step": 16907 + }, + { + "epoch": 2.7830917145561025, + "grad_norm": 0.3171633221867142, + "learning_rate": 3.765071815923778e-06, + "loss": 0.4384, + "step": 16908 + }, + { + "epoch": 2.783256298775219, + "grad_norm": 0.41788203100764226, + "learning_rate": 3.76460806746718e-06, + "loss": 0.4313, + "step": 16909 + }, + { + "epoch": 2.7834208829943354, + "grad_norm": 0.33256616962038477, + "learning_rate": 3.7641443270348864e-06, + "loss": 0.4195, + "step": 16910 + }, + { + "epoch": 2.783585467213452, + "grad_norm": 0.4438550838839359, + "learning_rate": 3.763680594631956e-06, + "loss": 0.4469, + "step": 16911 + }, + { + "epoch": 2.7837500514325684, + "grad_norm": 0.3723740914529145, + "learning_rate": 3.763216870263449e-06, + "loss": 0.4182, + "step": 16912 + }, + { + "epoch": 2.783914635651685, + "grad_norm": 0.38287134229195346, + "learning_rate": 3.762753153934425e-06, + "loss": 0.4364, + "step": 16913 + }, + { + "epoch": 2.7840792198708013, + "grad_norm": 0.3048030213220403, + "learning_rate": 3.762289445649945e-06, + "loss": 0.4375, + "step": 16914 + }, + { + "epoch": 2.784243804089918, + "grad_norm": 0.318188085140894, + "learning_rate": 3.7618257454150693e-06, + "loss": 0.4243, + "step": 16915 + }, + { + "epoch": 2.7844083883090343, + "grad_norm": 0.34353950372326697, + "learning_rate": 3.7613620532348534e-06, + "loss": 0.4318, + "step": 16916 + }, + { + "epoch": 2.7845729725281507, + "grad_norm": 0.30445659026707206, + "learning_rate": 3.760898369114363e-06, + "loss": 0.417, + "step": 16917 + }, + { + "epoch": 2.784737556747267, + "grad_norm": 0.2983743769361913, + "learning_rate": 3.7604346930586528e-06, + "loss": 0.4367, + "step": 16918 + }, + { + "epoch": 2.7849021409663837, + "grad_norm": 0.30771506829920947, + "learning_rate": 3.7599710250727854e-06, + "loss": 0.4376, + "step": 16919 + }, + { + "epoch": 2.7850667251855, + "grad_norm": 0.35212283779278725, + "learning_rate": 3.7595073651618173e-06, + "loss": 0.4356, + "step": 16920 + }, + { + "epoch": 2.7852313094046166, + "grad_norm": 0.4328481200989893, + "learning_rate": 3.7590437133308104e-06, + "loss": 0.4232, + "step": 16921 + }, + { + "epoch": 2.785395893623733, + "grad_norm": 0.2879514901774684, + "learning_rate": 3.758580069584823e-06, + "loss": 0.4249, + "step": 16922 + }, + { + "epoch": 2.7855604778428495, + "grad_norm": 0.3926481862322684, + "learning_rate": 3.7581164339289125e-06, + "loss": 0.4157, + "step": 16923 + }, + { + "epoch": 2.785725062061966, + "grad_norm": 0.2779738532693352, + "learning_rate": 3.757652806368141e-06, + "loss": 0.42, + "step": 16924 + }, + { + "epoch": 2.7858896462810825, + "grad_norm": 0.2873667962660394, + "learning_rate": 3.7571891869075646e-06, + "loss": 0.434, + "step": 16925 + }, + { + "epoch": 2.786054230500199, + "grad_norm": 0.3209100044525901, + "learning_rate": 3.7567255755522452e-06, + "loss": 0.4236, + "step": 16926 + }, + { + "epoch": 2.7862188147193154, + "grad_norm": 0.2933805601306257, + "learning_rate": 3.7562619723072367e-06, + "loss": 0.4281, + "step": 16927 + }, + { + "epoch": 2.786383398938432, + "grad_norm": 0.3328129468806271, + "learning_rate": 3.755798377177604e-06, + "loss": 0.4346, + "step": 16928 + }, + { + "epoch": 2.7865479831575484, + "grad_norm": 0.3088428740042805, + "learning_rate": 3.7553347901683987e-06, + "loss": 0.4294, + "step": 16929 + }, + { + "epoch": 2.786712567376665, + "grad_norm": 0.35085409735244044, + "learning_rate": 3.7548712112846866e-06, + "loss": 0.4382, + "step": 16930 + }, + { + "epoch": 2.7868771515957813, + "grad_norm": 0.3807144512865194, + "learning_rate": 3.754407640531521e-06, + "loss": 0.4311, + "step": 16931 + }, + { + "epoch": 2.787041735814898, + "grad_norm": 0.48497774083128187, + "learning_rate": 3.753944077913961e-06, + "loss": 0.4377, + "step": 16932 + }, + { + "epoch": 2.787206320034014, + "grad_norm": 0.35729850499027854, + "learning_rate": 3.7534805234370668e-06, + "loss": 0.4393, + "step": 16933 + }, + { + "epoch": 2.7873709042531303, + "grad_norm": 0.5948625168020623, + "learning_rate": 3.7530169771058935e-06, + "loss": 0.4313, + "step": 16934 + }, + { + "epoch": 2.7875354884722467, + "grad_norm": 0.36432472655003784, + "learning_rate": 3.7525534389255022e-06, + "loss": 0.4484, + "step": 16935 + }, + { + "epoch": 2.787700072691363, + "grad_norm": 0.30984674335063966, + "learning_rate": 3.7520899089009486e-06, + "loss": 0.4332, + "step": 16936 + }, + { + "epoch": 2.7878646569104797, + "grad_norm": 0.7437331781268104, + "learning_rate": 3.7516263870372935e-06, + "loss": 0.4521, + "step": 16937 + }, + { + "epoch": 2.788029241129596, + "grad_norm": 0.4071443712642099, + "learning_rate": 3.7511628733395913e-06, + "loss": 0.4261, + "step": 16938 + }, + { + "epoch": 2.7881938253487126, + "grad_norm": 0.3537902345034585, + "learning_rate": 3.7506993678128995e-06, + "loss": 0.4321, + "step": 16939 + }, + { + "epoch": 2.788358409567829, + "grad_norm": 0.33455054322997646, + "learning_rate": 3.7502358704622792e-06, + "loss": 0.4215, + "step": 16940 + }, + { + "epoch": 2.7885229937869456, + "grad_norm": 0.3280338772128488, + "learning_rate": 3.7497723812927843e-06, + "loss": 0.4377, + "step": 16941 + }, + { + "epoch": 2.788687578006062, + "grad_norm": 0.34642732226049505, + "learning_rate": 3.7493089003094744e-06, + "loss": 0.4113, + "step": 16942 + }, + { + "epoch": 2.7888521622251785, + "grad_norm": 0.34188860854341835, + "learning_rate": 3.7488454275174056e-06, + "loss": 0.4193, + "step": 16943 + }, + { + "epoch": 2.789016746444295, + "grad_norm": 0.36554530240010263, + "learning_rate": 3.748381962921636e-06, + "loss": 0.4051, + "step": 16944 + }, + { + "epoch": 2.7891813306634115, + "grad_norm": 0.30335350273377804, + "learning_rate": 3.747918506527225e-06, + "loss": 0.4373, + "step": 16945 + }, + { + "epoch": 2.789345914882528, + "grad_norm": 0.34263542785101275, + "learning_rate": 3.747455058339223e-06, + "loss": 0.4121, + "step": 16946 + }, + { + "epoch": 2.7895104991016444, + "grad_norm": 0.5502643653929845, + "learning_rate": 3.7469916183626937e-06, + "loss": 0.4322, + "step": 16947 + }, + { + "epoch": 2.789675083320761, + "grad_norm": 0.3538609617300198, + "learning_rate": 3.7465281866026898e-06, + "loss": 0.4224, + "step": 16948 + }, + { + "epoch": 2.7898396675398773, + "grad_norm": 0.290319516274371, + "learning_rate": 3.7460647630642714e-06, + "loss": 0.4507, + "step": 16949 + }, + { + "epoch": 2.790004251758994, + "grad_norm": 0.3516323884400021, + "learning_rate": 3.7456013477524915e-06, + "loss": 0.4471, + "step": 16950 + }, + { + "epoch": 2.7901688359781103, + "grad_norm": 0.39564626942861164, + "learning_rate": 3.74513794067241e-06, + "loss": 0.4133, + "step": 16951 + }, + { + "epoch": 2.7903334201972267, + "grad_norm": 0.3694548077180987, + "learning_rate": 3.7446745418290816e-06, + "loss": 0.4272, + "step": 16952 + }, + { + "epoch": 2.790498004416343, + "grad_norm": 0.43330598934010617, + "learning_rate": 3.7442111512275626e-06, + "loss": 0.4454, + "step": 16953 + }, + { + "epoch": 2.7906625886354597, + "grad_norm": 0.29778710218876897, + "learning_rate": 3.7437477688729105e-06, + "loss": 0.4174, + "step": 16954 + }, + { + "epoch": 2.790827172854576, + "grad_norm": 0.42565006211331385, + "learning_rate": 3.7432843947701805e-06, + "loss": 0.4039, + "step": 16955 + }, + { + "epoch": 2.7909917570736926, + "grad_norm": 0.29606168440900626, + "learning_rate": 3.7428210289244293e-06, + "loss": 0.4128, + "step": 16956 + }, + { + "epoch": 2.791156341292809, + "grad_norm": 0.3511754169964407, + "learning_rate": 3.742357671340713e-06, + "loss": 0.435, + "step": 16957 + }, + { + "epoch": 2.7913209255119256, + "grad_norm": 0.4027590318484724, + "learning_rate": 3.7418943220240884e-06, + "loss": 0.4387, + "step": 16958 + }, + { + "epoch": 2.791485509731042, + "grad_norm": 0.3527348623847974, + "learning_rate": 3.7414309809796097e-06, + "loss": 0.4414, + "step": 16959 + }, + { + "epoch": 2.7916500939501585, + "grad_norm": 0.33095859367945063, + "learning_rate": 3.7409676482123323e-06, + "loss": 0.4443, + "step": 16960 + }, + { + "epoch": 2.791814678169275, + "grad_norm": 0.29026368944239717, + "learning_rate": 3.7405043237273143e-06, + "loss": 0.4319, + "step": 16961 + }, + { + "epoch": 2.7919792623883914, + "grad_norm": 0.32769594815474834, + "learning_rate": 3.740041007529609e-06, + "loss": 0.4364, + "step": 16962 + }, + { + "epoch": 2.792143846607508, + "grad_norm": 0.38324519531876516, + "learning_rate": 3.7395776996242737e-06, + "loss": 0.449, + "step": 16963 + }, + { + "epoch": 2.7923084308266244, + "grad_norm": 0.31201759520893474, + "learning_rate": 3.739114400016362e-06, + "loss": 0.4356, + "step": 16964 + }, + { + "epoch": 2.792473015045741, + "grad_norm": 0.32916106873222084, + "learning_rate": 3.7386511087109312e-06, + "loss": 0.4334, + "step": 16965 + }, + { + "epoch": 2.7926375992648573, + "grad_norm": 0.2519412393686649, + "learning_rate": 3.738187825713037e-06, + "loss": 0.4169, + "step": 16966 + }, + { + "epoch": 2.792802183483974, + "grad_norm": 0.3511877128500632, + "learning_rate": 3.7377245510277306e-06, + "loss": 0.429, + "step": 16967 + }, + { + "epoch": 2.7929667677030903, + "grad_norm": 0.2943120983708326, + "learning_rate": 3.7372612846600715e-06, + "loss": 0.4519, + "step": 16968 + }, + { + "epoch": 2.7931313519222067, + "grad_norm": 0.31550500106456264, + "learning_rate": 3.7367980266151117e-06, + "loss": 0.4291, + "step": 16969 + }, + { + "epoch": 2.793295936141323, + "grad_norm": 0.2594130714411502, + "learning_rate": 3.7363347768979084e-06, + "loss": 0.4216, + "step": 16970 + }, + { + "epoch": 2.7934605203604397, + "grad_norm": 0.3066082610071582, + "learning_rate": 3.7358715355135136e-06, + "loss": 0.4528, + "step": 16971 + }, + { + "epoch": 2.793625104579556, + "grad_norm": 0.2696462542228448, + "learning_rate": 3.735408302466985e-06, + "loss": 0.4357, + "step": 16972 + }, + { + "epoch": 2.7937896887986726, + "grad_norm": 0.3063848075313442, + "learning_rate": 3.734945077763375e-06, + "loss": 0.4342, + "step": 16973 + }, + { + "epoch": 2.7939542730177886, + "grad_norm": 0.2953028168617143, + "learning_rate": 3.7344818614077394e-06, + "loss": 0.427, + "step": 16974 + }, + { + "epoch": 2.794118857236905, + "grad_norm": 0.4258592757205851, + "learning_rate": 3.7340186534051324e-06, + "loss": 0.4474, + "step": 16975 + }, + { + "epoch": 2.7942834414560216, + "grad_norm": 0.27834239294905616, + "learning_rate": 3.7335554537606074e-06, + "loss": 0.442, + "step": 16976 + }, + { + "epoch": 2.794448025675138, + "grad_norm": 0.3230209326259235, + "learning_rate": 3.7330922624792216e-06, + "loss": 0.4276, + "step": 16977 + }, + { + "epoch": 2.7946126098942545, + "grad_norm": 0.3221333441755021, + "learning_rate": 3.732629079566024e-06, + "loss": 0.4458, + "step": 16978 + }, + { + "epoch": 2.794777194113371, + "grad_norm": 0.3676011024234717, + "learning_rate": 3.7321659050260737e-06, + "loss": 0.4368, + "step": 16979 + }, + { + "epoch": 2.7949417783324875, + "grad_norm": 0.2712662495086995, + "learning_rate": 3.7317027388644214e-06, + "loss": 0.4179, + "step": 16980 + }, + { + "epoch": 2.795106362551604, + "grad_norm": 0.30231518602792223, + "learning_rate": 3.731239581086123e-06, + "loss": 0.4042, + "step": 16981 + }, + { + "epoch": 2.7952709467707204, + "grad_norm": 1.6063668657993666, + "learning_rate": 3.730776431696231e-06, + "loss": 0.4352, + "step": 16982 + }, + { + "epoch": 2.795435530989837, + "grad_norm": 0.3833217390368779, + "learning_rate": 3.7303132906997994e-06, + "loss": 0.4282, + "step": 16983 + }, + { + "epoch": 2.7956001152089534, + "grad_norm": 0.3571528913706697, + "learning_rate": 3.7298501581018827e-06, + "loss": 0.4333, + "step": 16984 + }, + { + "epoch": 2.79576469942807, + "grad_norm": 0.4181591768839053, + "learning_rate": 3.729387033907533e-06, + "loss": 0.4391, + "step": 16985 + }, + { + "epoch": 2.7959292836471863, + "grad_norm": 0.3195424170629623, + "learning_rate": 3.7289239181218048e-06, + "loss": 0.4393, + "step": 16986 + }, + { + "epoch": 2.7960938678663028, + "grad_norm": 0.29082044594392087, + "learning_rate": 3.7284608107497503e-06, + "loss": 0.4378, + "step": 16987 + }, + { + "epoch": 2.7962584520854192, + "grad_norm": 0.3120472537805515, + "learning_rate": 3.727997711796426e-06, + "loss": 0.4211, + "step": 16988 + }, + { + "epoch": 2.7964230363045357, + "grad_norm": 0.2821648618544632, + "learning_rate": 3.7275346212668806e-06, + "loss": 0.4056, + "step": 16989 + }, + { + "epoch": 2.796587620523652, + "grad_norm": 0.46635738511994507, + "learning_rate": 3.727071539166169e-06, + "loss": 0.4369, + "step": 16990 + }, + { + "epoch": 2.7967522047427686, + "grad_norm": 0.34837292018747074, + "learning_rate": 3.7266084654993447e-06, + "loss": 0.4262, + "step": 16991 + }, + { + "epoch": 2.796916788961885, + "grad_norm": 0.3238602963355004, + "learning_rate": 3.7261454002714594e-06, + "loss": 0.4305, + "step": 16992 + }, + { + "epoch": 2.7970813731810016, + "grad_norm": 0.2700471790603966, + "learning_rate": 3.725682343487568e-06, + "loss": 0.4366, + "step": 16993 + }, + { + "epoch": 2.797245957400118, + "grad_norm": 0.32729950519911205, + "learning_rate": 3.72521929515272e-06, + "loss": 0.4444, + "step": 16994 + }, + { + "epoch": 2.7974105416192345, + "grad_norm": 0.3845240801769013, + "learning_rate": 3.7247562552719714e-06, + "loss": 0.4285, + "step": 16995 + }, + { + "epoch": 2.797575125838351, + "grad_norm": 0.303382662774618, + "learning_rate": 3.7242932238503742e-06, + "loss": 0.4372, + "step": 16996 + }, + { + "epoch": 2.7977397100574675, + "grad_norm": 0.3544453013233295, + "learning_rate": 3.7238302008929765e-06, + "loss": 0.448, + "step": 16997 + }, + { + "epoch": 2.797904294276584, + "grad_norm": 0.376369276483892, + "learning_rate": 3.7233671864048373e-06, + "loss": 0.452, + "step": 16998 + }, + { + "epoch": 2.7980688784957, + "grad_norm": 0.28156389454081876, + "learning_rate": 3.722904180391002e-06, + "loss": 0.4315, + "step": 16999 + }, + { + "epoch": 2.7982334627148164, + "grad_norm": 0.2676048084949686, + "learning_rate": 3.7224411828565293e-06, + "loss": 0.4146, + "step": 17000 + }, + { + "epoch": 2.798398046933933, + "grad_norm": 0.28636233503036546, + "learning_rate": 3.7219781938064657e-06, + "loss": 0.4321, + "step": 17001 + }, + { + "epoch": 2.7985626311530494, + "grad_norm": 0.28896095189299925, + "learning_rate": 3.721515213245867e-06, + "loss": 0.4383, + "step": 17002 + }, + { + "epoch": 2.798727215372166, + "grad_norm": 0.3737462974148848, + "learning_rate": 3.7210522411797837e-06, + "loss": 0.4433, + "step": 17003 + }, + { + "epoch": 2.7988917995912823, + "grad_norm": 0.33586658724596585, + "learning_rate": 3.7205892776132666e-06, + "loss": 0.4311, + "step": 17004 + }, + { + "epoch": 2.799056383810399, + "grad_norm": 0.3544658458659146, + "learning_rate": 3.720126322551368e-06, + "loss": 0.4241, + "step": 17005 + }, + { + "epoch": 2.7992209680295153, + "grad_norm": 0.3527511763637352, + "learning_rate": 3.71966337599914e-06, + "loss": 0.4472, + "step": 17006 + }, + { + "epoch": 2.7993855522486317, + "grad_norm": 0.3095155768711578, + "learning_rate": 3.7192004379616358e-06, + "loss": 0.4237, + "step": 17007 + }, + { + "epoch": 2.799550136467748, + "grad_norm": 1.0535976014865251, + "learning_rate": 3.7187375084439014e-06, + "loss": 0.4342, + "step": 17008 + }, + { + "epoch": 2.7997147206868647, + "grad_norm": 0.3112590515808455, + "learning_rate": 3.7182745874509945e-06, + "loss": 0.4138, + "step": 17009 + }, + { + "epoch": 2.799879304905981, + "grad_norm": 0.33246102620116796, + "learning_rate": 3.717811674987962e-06, + "loss": 0.4391, + "step": 17010 + }, + { + "epoch": 2.8000438891250976, + "grad_norm": 0.36308513014633353, + "learning_rate": 3.717348771059856e-06, + "loss": 0.4302, + "step": 17011 + }, + { + "epoch": 2.800208473344214, + "grad_norm": 0.43090264422151375, + "learning_rate": 3.716885875671728e-06, + "loss": 0.4362, + "step": 17012 + }, + { + "epoch": 2.8003730575633305, + "grad_norm": 0.3120753273054859, + "learning_rate": 3.7164229888286287e-06, + "loss": 0.412, + "step": 17013 + }, + { + "epoch": 2.800537641782447, + "grad_norm": 0.3137670501286763, + "learning_rate": 3.71596011053561e-06, + "loss": 0.4341, + "step": 17014 + }, + { + "epoch": 2.8007022260015635, + "grad_norm": 0.31210905483561246, + "learning_rate": 3.71549724079772e-06, + "loss": 0.4248, + "step": 17015 + }, + { + "epoch": 2.80086681022068, + "grad_norm": 0.3238692595647709, + "learning_rate": 3.7150343796200123e-06, + "loss": 0.427, + "step": 17016 + }, + { + "epoch": 2.8010313944397964, + "grad_norm": 0.3889252924366496, + "learning_rate": 3.714571527007535e-06, + "loss": 0.4369, + "step": 17017 + }, + { + "epoch": 2.801195978658913, + "grad_norm": 0.3874796764943105, + "learning_rate": 3.714108682965341e-06, + "loss": 0.4403, + "step": 17018 + }, + { + "epoch": 2.8013605628780294, + "grad_norm": 0.602048942349635, + "learning_rate": 3.7136458474984805e-06, + "loss": 0.4419, + "step": 17019 + }, + { + "epoch": 2.801525147097146, + "grad_norm": 0.4316973975224754, + "learning_rate": 3.7131830206120005e-06, + "loss": 0.4211, + "step": 17020 + }, + { + "epoch": 2.8016897313162623, + "grad_norm": 0.3025552469638141, + "learning_rate": 3.7127202023109545e-06, + "loss": 0.4494, + "step": 17021 + }, + { + "epoch": 2.801854315535379, + "grad_norm": 0.30613352483498824, + "learning_rate": 3.71225739260039e-06, + "loss": 0.4203, + "step": 17022 + }, + { + "epoch": 2.8020188997544953, + "grad_norm": 0.3390169431145897, + "learning_rate": 3.71179459148536e-06, + "loss": 0.431, + "step": 17023 + }, + { + "epoch": 2.8021834839736117, + "grad_norm": 0.32462568250008716, + "learning_rate": 3.7113317989709114e-06, + "loss": 0.4362, + "step": 17024 + }, + { + "epoch": 2.802348068192728, + "grad_norm": 0.3769661778823802, + "learning_rate": 3.710869015062097e-06, + "loss": 0.44, + "step": 17025 + }, + { + "epoch": 2.8025126524118447, + "grad_norm": 0.344819910321778, + "learning_rate": 3.7104062397639652e-06, + "loss": 0.4211, + "step": 17026 + }, + { + "epoch": 2.802677236630961, + "grad_norm": 0.28936962174503733, + "learning_rate": 3.709943473081563e-06, + "loss": 0.4193, + "step": 17027 + }, + { + "epoch": 2.8028418208500776, + "grad_norm": 0.3059987994253555, + "learning_rate": 3.7094807150199453e-06, + "loss": 0.4427, + "step": 17028 + }, + { + "epoch": 2.803006405069194, + "grad_norm": 0.3496518395650141, + "learning_rate": 3.7090179655841552e-06, + "loss": 0.4169, + "step": 17029 + }, + { + "epoch": 2.8031709892883105, + "grad_norm": 0.29300593577265654, + "learning_rate": 3.7085552247792484e-06, + "loss": 0.4374, + "step": 17030 + }, + { + "epoch": 2.803335573507427, + "grad_norm": 0.3291116815151797, + "learning_rate": 3.708092492610269e-06, + "loss": 0.4292, + "step": 17031 + }, + { + "epoch": 2.8035001577265435, + "grad_norm": 0.7757549504963241, + "learning_rate": 3.70762976908227e-06, + "loss": 0.4292, + "step": 17032 + }, + { + "epoch": 2.80366474194566, + "grad_norm": 0.3413843417394545, + "learning_rate": 3.707167054200298e-06, + "loss": 0.4519, + "step": 17033 + }, + { + "epoch": 2.8038293261647764, + "grad_norm": 0.3344160874415403, + "learning_rate": 3.706704347969402e-06, + "loss": 0.4169, + "step": 17034 + }, + { + "epoch": 2.803993910383893, + "grad_norm": 0.3432974498324751, + "learning_rate": 3.706241650394633e-06, + "loss": 0.4291, + "step": 17035 + }, + { + "epoch": 2.8041584946030094, + "grad_norm": 0.327641039036722, + "learning_rate": 3.7057789614810363e-06, + "loss": 0.4333, + "step": 17036 + }, + { + "epoch": 2.804323078822126, + "grad_norm": 0.3558890376868826, + "learning_rate": 3.7053162812336638e-06, + "loss": 0.4206, + "step": 17037 + }, + { + "epoch": 2.8044876630412423, + "grad_norm": 0.30115121607919093, + "learning_rate": 3.7048536096575627e-06, + "loss": 0.4199, + "step": 17038 + }, + { + "epoch": 2.8046522472603588, + "grad_norm": 1.5654426664598493, + "learning_rate": 3.704390946757783e-06, + "loss": 0.436, + "step": 17039 + }, + { + "epoch": 2.8048168314794752, + "grad_norm": 0.3369589627104296, + "learning_rate": 3.7039282925393706e-06, + "loss": 0.4327, + "step": 17040 + }, + { + "epoch": 2.8049814156985913, + "grad_norm": 0.2708386930655964, + "learning_rate": 3.7034656470073745e-06, + "loss": 0.4139, + "step": 17041 + }, + { + "epoch": 2.8051459999177077, + "grad_norm": 0.31847056505151394, + "learning_rate": 3.7030030101668444e-06, + "loss": 0.4155, + "step": 17042 + }, + { + "epoch": 2.805310584136824, + "grad_norm": 0.36442981805925195, + "learning_rate": 3.702540382022826e-06, + "loss": 0.4248, + "step": 17043 + }, + { + "epoch": 2.8054751683559407, + "grad_norm": 0.29620262128536895, + "learning_rate": 3.70207776258037e-06, + "loss": 0.4361, + "step": 17044 + }, + { + "epoch": 2.805639752575057, + "grad_norm": 0.9831257028070958, + "learning_rate": 3.7016151518445214e-06, + "loss": 0.4267, + "step": 17045 + }, + { + "epoch": 2.8058043367941736, + "grad_norm": 0.3674522650750558, + "learning_rate": 3.7011525498203315e-06, + "loss": 0.4439, + "step": 17046 + }, + { + "epoch": 2.80596892101329, + "grad_norm": 0.44696920449614114, + "learning_rate": 3.7006899565128467e-06, + "loss": 0.4549, + "step": 17047 + }, + { + "epoch": 2.8061335052324066, + "grad_norm": 0.48658489052050297, + "learning_rate": 3.7002273719271106e-06, + "loss": 0.4421, + "step": 17048 + }, + { + "epoch": 2.806298089451523, + "grad_norm": 0.28648773556330664, + "learning_rate": 3.699764796068178e-06, + "loss": 0.4522, + "step": 17049 + }, + { + "epoch": 2.8064626736706395, + "grad_norm": 0.6411059808424378, + "learning_rate": 3.69930222894109e-06, + "loss": 0.4365, + "step": 17050 + }, + { + "epoch": 2.806627257889756, + "grad_norm": 0.33897861852767236, + "learning_rate": 3.6988396705508977e-06, + "loss": 0.4445, + "step": 17051 + }, + { + "epoch": 2.8067918421088724, + "grad_norm": 0.3062163499760631, + "learning_rate": 3.6983771209026466e-06, + "loss": 0.4368, + "step": 17052 + }, + { + "epoch": 2.806956426327989, + "grad_norm": 0.29910086203118597, + "learning_rate": 3.6979145800013855e-06, + "loss": 0.4156, + "step": 17053 + }, + { + "epoch": 2.8071210105471054, + "grad_norm": 0.33507309223228005, + "learning_rate": 3.6974520478521587e-06, + "loss": 0.4207, + "step": 17054 + }, + { + "epoch": 2.807285594766222, + "grad_norm": 0.35606412297007517, + "learning_rate": 3.696989524460016e-06, + "loss": 0.411, + "step": 17055 + }, + { + "epoch": 2.8074501789853383, + "grad_norm": 0.4319969872987501, + "learning_rate": 3.6965270098300035e-06, + "loss": 0.4558, + "step": 17056 + }, + { + "epoch": 2.807614763204455, + "grad_norm": 0.3174102870346743, + "learning_rate": 3.6960645039671665e-06, + "loss": 0.4363, + "step": 17057 + }, + { + "epoch": 2.8077793474235713, + "grad_norm": 0.26966269521273795, + "learning_rate": 3.6956020068765547e-06, + "loss": 0.4319, + "step": 17058 + }, + { + "epoch": 2.8079439316426877, + "grad_norm": 0.3015423359547079, + "learning_rate": 3.6951395185632096e-06, + "loss": 0.4293, + "step": 17059 + }, + { + "epoch": 2.808108515861804, + "grad_norm": 0.39543701173947554, + "learning_rate": 3.6946770390321837e-06, + "loss": 0.4475, + "step": 17060 + }, + { + "epoch": 2.8082731000809207, + "grad_norm": 0.29558063112252037, + "learning_rate": 3.694214568288518e-06, + "loss": 0.4326, + "step": 17061 + }, + { + "epoch": 2.808437684300037, + "grad_norm": 0.29241953332047743, + "learning_rate": 3.693752106337264e-06, + "loss": 0.4463, + "step": 17062 + }, + { + "epoch": 2.8086022685191536, + "grad_norm": 0.36922966517747435, + "learning_rate": 3.6932896531834643e-06, + "loss": 0.4424, + "step": 17063 + }, + { + "epoch": 2.80876685273827, + "grad_norm": 0.34813899240461527, + "learning_rate": 3.692827208832165e-06, + "loss": 0.4427, + "step": 17064 + }, + { + "epoch": 2.8089314369573866, + "grad_norm": 0.2882741202352136, + "learning_rate": 3.6923647732884143e-06, + "loss": 0.4429, + "step": 17065 + }, + { + "epoch": 2.8090960211765026, + "grad_norm": 0.28096012817176935, + "learning_rate": 3.6919023465572558e-06, + "loss": 0.4275, + "step": 17066 + }, + { + "epoch": 2.809260605395619, + "grad_norm": 0.3340488275443412, + "learning_rate": 3.6914399286437373e-06, + "loss": 0.4357, + "step": 17067 + }, + { + "epoch": 2.8094251896147355, + "grad_norm": 0.3125108279274556, + "learning_rate": 3.6909775195529026e-06, + "loss": 0.4279, + "step": 17068 + }, + { + "epoch": 2.809589773833852, + "grad_norm": 0.2881721468616627, + "learning_rate": 3.6905151192898004e-06, + "loss": 0.4254, + "step": 17069 + }, + { + "epoch": 2.8097543580529685, + "grad_norm": 0.33957698439504513, + "learning_rate": 3.690052727859473e-06, + "loss": 0.4403, + "step": 17070 + }, + { + "epoch": 2.809918942272085, + "grad_norm": 0.3042511204962381, + "learning_rate": 3.6895903452669655e-06, + "loss": 0.4062, + "step": 17071 + }, + { + "epoch": 2.8100835264912014, + "grad_norm": 0.3131632403271567, + "learning_rate": 3.689127971517327e-06, + "loss": 0.4254, + "step": 17072 + }, + { + "epoch": 2.810248110710318, + "grad_norm": 0.32292931932722746, + "learning_rate": 3.6886656066155985e-06, + "loss": 0.4297, + "step": 17073 + }, + { + "epoch": 2.8104126949294344, + "grad_norm": 0.3258889677088053, + "learning_rate": 3.6882032505668283e-06, + "loss": 0.4245, + "step": 17074 + }, + { + "epoch": 2.810577279148551, + "grad_norm": 0.3213719613226214, + "learning_rate": 3.687740903376059e-06, + "loss": 0.4184, + "step": 17075 + }, + { + "epoch": 2.8107418633676673, + "grad_norm": 0.28843989368418793, + "learning_rate": 3.687278565048338e-06, + "loss": 0.423, + "step": 17076 + }, + { + "epoch": 2.8109064475867838, + "grad_norm": 0.382218246035676, + "learning_rate": 3.6868162355887096e-06, + "loss": 0.4173, + "step": 17077 + }, + { + "epoch": 2.8110710318059002, + "grad_norm": 0.3187230594758411, + "learning_rate": 3.686353915002216e-06, + "loss": 0.4292, + "step": 17078 + }, + { + "epoch": 2.8112356160250167, + "grad_norm": 0.28468997295293, + "learning_rate": 3.685891603293906e-06, + "loss": 0.4286, + "step": 17079 + }, + { + "epoch": 2.811400200244133, + "grad_norm": 0.325067665311539, + "learning_rate": 3.685429300468819e-06, + "loss": 0.4266, + "step": 17080 + }, + { + "epoch": 2.8115647844632496, + "grad_norm": 0.5363543466171763, + "learning_rate": 3.6849670065320047e-06, + "loss": 0.4149, + "step": 17081 + }, + { + "epoch": 2.811729368682366, + "grad_norm": 0.36713578219493537, + "learning_rate": 3.684504721488504e-06, + "loss": 0.4214, + "step": 17082 + }, + { + "epoch": 2.8118939529014826, + "grad_norm": 0.3556618817554257, + "learning_rate": 3.684042445343363e-06, + "loss": 0.4279, + "step": 17083 + }, + { + "epoch": 2.812058537120599, + "grad_norm": 0.3790316644542553, + "learning_rate": 3.683580178101626e-06, + "loss": 0.4204, + "step": 17084 + }, + { + "epoch": 2.8122231213397155, + "grad_norm": 0.44054579314572406, + "learning_rate": 3.6831179197683345e-06, + "loss": 0.4296, + "step": 17085 + }, + { + "epoch": 2.812387705558832, + "grad_norm": 0.3110667703692109, + "learning_rate": 3.682655670348535e-06, + "loss": 0.4293, + "step": 17086 + }, + { + "epoch": 2.8125522897779485, + "grad_norm": 0.33293568691450487, + "learning_rate": 3.68219342984727e-06, + "loss": 0.4437, + "step": 17087 + }, + { + "epoch": 2.812716873997065, + "grad_norm": 0.47825615572547775, + "learning_rate": 3.6817311982695857e-06, + "loss": 0.4353, + "step": 17088 + }, + { + "epoch": 2.8128814582161814, + "grad_norm": 0.36050258840546906, + "learning_rate": 3.6812689756205207e-06, + "loss": 0.4256, + "step": 17089 + }, + { + "epoch": 2.813046042435298, + "grad_norm": 0.2679257712308787, + "learning_rate": 3.6808067619051253e-06, + "loss": 0.4215, + "step": 17090 + }, + { + "epoch": 2.8132106266544143, + "grad_norm": 0.3498276050713189, + "learning_rate": 3.6803445571284385e-06, + "loss": 0.4165, + "step": 17091 + }, + { + "epoch": 2.813375210873531, + "grad_norm": 0.29415549838126503, + "learning_rate": 3.679882361295504e-06, + "loss": 0.425, + "step": 17092 + }, + { + "epoch": 2.8135397950926473, + "grad_norm": 0.3274825930326004, + "learning_rate": 3.679420174411367e-06, + "loss": 0.4117, + "step": 17093 + }, + { + "epoch": 2.8137043793117638, + "grad_norm": 0.380254456127662, + "learning_rate": 3.678957996481068e-06, + "loss": 0.4653, + "step": 17094 + }, + { + "epoch": 2.8138689635308802, + "grad_norm": 0.38134174916262903, + "learning_rate": 3.6784958275096528e-06, + "loss": 0.414, + "step": 17095 + }, + { + "epoch": 2.8140335477499967, + "grad_norm": 0.2777538832839576, + "learning_rate": 3.6780336675021627e-06, + "loss": 0.4113, + "step": 17096 + }, + { + "epoch": 2.814198131969113, + "grad_norm": 0.39757878901837895, + "learning_rate": 3.6775715164636414e-06, + "loss": 0.4386, + "step": 17097 + }, + { + "epoch": 2.8143627161882296, + "grad_norm": 0.2996085061697317, + "learning_rate": 3.677109374399131e-06, + "loss": 0.4326, + "step": 17098 + }, + { + "epoch": 2.814527300407346, + "grad_norm": 0.3132830922240271, + "learning_rate": 3.676647241313676e-06, + "loss": 0.4243, + "step": 17099 + }, + { + "epoch": 2.8146918846264626, + "grad_norm": 0.308012152473725, + "learning_rate": 3.6761851172123183e-06, + "loss": 0.4346, + "step": 17100 + }, + { + "epoch": 2.814856468845579, + "grad_norm": 0.28324542469615294, + "learning_rate": 3.6757230021000977e-06, + "loss": 0.4333, + "step": 17101 + }, + { + "epoch": 2.8150210530646955, + "grad_norm": 0.4152341168327462, + "learning_rate": 3.6752608959820596e-06, + "loss": 0.4325, + "step": 17102 + }, + { + "epoch": 2.815185637283812, + "grad_norm": 0.30608123523114217, + "learning_rate": 3.674798798863245e-06, + "loss": 0.4336, + "step": 17103 + }, + { + "epoch": 2.8153502215029285, + "grad_norm": 0.3394719081879419, + "learning_rate": 3.674336710748697e-06, + "loss": 0.4335, + "step": 17104 + }, + { + "epoch": 2.815514805722045, + "grad_norm": 0.3288775694862542, + "learning_rate": 3.6738746316434567e-06, + "loss": 0.4478, + "step": 17105 + }, + { + "epoch": 2.8156793899411614, + "grad_norm": 0.4548371543200483, + "learning_rate": 3.673412561552568e-06, + "loss": 0.4359, + "step": 17106 + }, + { + "epoch": 2.815843974160278, + "grad_norm": 0.3064065707396901, + "learning_rate": 3.672950500481072e-06, + "loss": 0.4131, + "step": 17107 + }, + { + "epoch": 2.816008558379394, + "grad_norm": 0.37563674605906305, + "learning_rate": 3.6724884484340074e-06, + "loss": 0.4072, + "step": 17108 + }, + { + "epoch": 2.8161731425985104, + "grad_norm": 0.31215206312051025, + "learning_rate": 3.6720264054164214e-06, + "loss": 0.4345, + "step": 17109 + }, + { + "epoch": 2.816337726817627, + "grad_norm": 0.29020191266348566, + "learning_rate": 3.67156437143335e-06, + "loss": 0.4291, + "step": 17110 + }, + { + "epoch": 2.8165023110367433, + "grad_norm": 0.3353379042670644, + "learning_rate": 3.6711023464898397e-06, + "loss": 0.4314, + "step": 17111 + }, + { + "epoch": 2.81666689525586, + "grad_norm": 0.31147570248059336, + "learning_rate": 3.670640330590929e-06, + "loss": 0.4281, + "step": 17112 + }, + { + "epoch": 2.8168314794749763, + "grad_norm": 0.3117734222192966, + "learning_rate": 3.67017832374166e-06, + "loss": 0.4448, + "step": 17113 + }, + { + "epoch": 2.8169960636940927, + "grad_norm": 0.4132043734099047, + "learning_rate": 3.6697163259470743e-06, + "loss": 0.4474, + "step": 17114 + }, + { + "epoch": 2.817160647913209, + "grad_norm": 0.4119011796010191, + "learning_rate": 3.6692543372122123e-06, + "loss": 0.4319, + "step": 17115 + }, + { + "epoch": 2.8173252321323257, + "grad_norm": 0.3155796783904074, + "learning_rate": 3.668792357542116e-06, + "loss": 0.4182, + "step": 17116 + }, + { + "epoch": 2.817489816351442, + "grad_norm": 0.30699109023803706, + "learning_rate": 3.6683303869418244e-06, + "loss": 0.4266, + "step": 17117 + }, + { + "epoch": 2.8176544005705586, + "grad_norm": 0.6887270606990578, + "learning_rate": 3.667868425416381e-06, + "loss": 0.4262, + "step": 17118 + }, + { + "epoch": 2.817818984789675, + "grad_norm": 0.2751247994882229, + "learning_rate": 3.6674064729708247e-06, + "loss": 0.4238, + "step": 17119 + }, + { + "epoch": 2.8179835690087915, + "grad_norm": 0.459838495793609, + "learning_rate": 3.666944529610198e-06, + "loss": 0.4364, + "step": 17120 + }, + { + "epoch": 2.818148153227908, + "grad_norm": 0.31715489732174246, + "learning_rate": 3.666482595339539e-06, + "loss": 0.4115, + "step": 17121 + }, + { + "epoch": 2.8183127374470245, + "grad_norm": 0.3810854234866949, + "learning_rate": 3.6660206701638885e-06, + "loss": 0.435, + "step": 17122 + }, + { + "epoch": 2.818477321666141, + "grad_norm": 0.2980712904841457, + "learning_rate": 3.665558754088289e-06, + "loss": 0.4293, + "step": 17123 + }, + { + "epoch": 2.8186419058852574, + "grad_norm": 0.31554529600333936, + "learning_rate": 3.6650968471177778e-06, + "loss": 0.4396, + "step": 17124 + }, + { + "epoch": 2.818806490104374, + "grad_norm": 0.3526745457275579, + "learning_rate": 3.664634949257398e-06, + "loss": 0.4386, + "step": 17125 + }, + { + "epoch": 2.8189710743234904, + "grad_norm": 0.26955385001081505, + "learning_rate": 3.6641730605121874e-06, + "loss": 0.4217, + "step": 17126 + }, + { + "epoch": 2.819135658542607, + "grad_norm": 0.3302344739823321, + "learning_rate": 3.663711180887187e-06, + "loss": 0.4473, + "step": 17127 + }, + { + "epoch": 2.8193002427617233, + "grad_norm": 0.38799946904704896, + "learning_rate": 3.663249310387438e-06, + "loss": 0.4147, + "step": 17128 + }, + { + "epoch": 2.8194648269808398, + "grad_norm": 0.36345893830202847, + "learning_rate": 3.6627874490179753e-06, + "loss": 0.4215, + "step": 17129 + }, + { + "epoch": 2.8196294111999562, + "grad_norm": 0.3921152616720411, + "learning_rate": 3.6623255967838453e-06, + "loss": 0.4397, + "step": 17130 + }, + { + "epoch": 2.8197939954190727, + "grad_norm": 0.3370515743146148, + "learning_rate": 3.6618637536900817e-06, + "loss": 0.423, + "step": 17131 + }, + { + "epoch": 2.819958579638189, + "grad_norm": 0.298672711834632, + "learning_rate": 3.6614019197417274e-06, + "loss": 0.4148, + "step": 17132 + }, + { + "epoch": 2.820123163857305, + "grad_norm": 0.3674852867505034, + "learning_rate": 3.6609400949438195e-06, + "loss": 0.45, + "step": 17133 + }, + { + "epoch": 2.8202877480764217, + "grad_norm": 0.4480024381728986, + "learning_rate": 3.6604782793013998e-06, + "loss": 0.422, + "step": 17134 + }, + { + "epoch": 2.820452332295538, + "grad_norm": 0.43238289475601743, + "learning_rate": 3.660016472819506e-06, + "loss": 0.4339, + "step": 17135 + }, + { + "epoch": 2.8206169165146546, + "grad_norm": 0.26018647672172557, + "learning_rate": 3.6595546755031758e-06, + "loss": 0.4051, + "step": 17136 + }, + { + "epoch": 2.820781500733771, + "grad_norm": 0.27947140164193335, + "learning_rate": 3.659092887357451e-06, + "loss": 0.4224, + "step": 17137 + }, + { + "epoch": 2.8209460849528876, + "grad_norm": 0.2779553942761288, + "learning_rate": 3.6586311083873674e-06, + "loss": 0.4294, + "step": 17138 + }, + { + "epoch": 2.821110669172004, + "grad_norm": 0.3123836058428518, + "learning_rate": 3.658169338597968e-06, + "loss": 0.3996, + "step": 17139 + }, + { + "epoch": 2.8212752533911205, + "grad_norm": 0.29982736494768003, + "learning_rate": 3.657707577994286e-06, + "loss": 0.435, + "step": 17140 + }, + { + "epoch": 2.821439837610237, + "grad_norm": 0.2998633135173355, + "learning_rate": 3.6572458265813648e-06, + "loss": 0.4326, + "step": 17141 + }, + { + "epoch": 2.8216044218293534, + "grad_norm": 1.2875260936074302, + "learning_rate": 3.6567840843642383e-06, + "loss": 0.4414, + "step": 17142 + }, + { + "epoch": 2.82176900604847, + "grad_norm": 0.36380125041897954, + "learning_rate": 3.6563223513479505e-06, + "loss": 0.4419, + "step": 17143 + }, + { + "epoch": 2.8219335902675864, + "grad_norm": 0.3031862883250551, + "learning_rate": 3.6558606275375356e-06, + "loss": 0.4327, + "step": 17144 + }, + { + "epoch": 2.822098174486703, + "grad_norm": 0.31900799897958043, + "learning_rate": 3.655398912938031e-06, + "loss": 0.4384, + "step": 17145 + }, + { + "epoch": 2.8222627587058193, + "grad_norm": 0.3025173566880341, + "learning_rate": 3.654937207554478e-06, + "loss": 0.4343, + "step": 17146 + }, + { + "epoch": 2.822427342924936, + "grad_norm": 0.2663358722032339, + "learning_rate": 3.6544755113919113e-06, + "loss": 0.4463, + "step": 17147 + }, + { + "epoch": 2.8225919271440523, + "grad_norm": 0.3513917982076639, + "learning_rate": 3.654013824455372e-06, + "loss": 0.4307, + "step": 17148 + }, + { + "epoch": 2.8227565113631687, + "grad_norm": 0.5643742825543426, + "learning_rate": 3.653552146749895e-06, + "loss": 0.433, + "step": 17149 + }, + { + "epoch": 2.822921095582285, + "grad_norm": 0.3137193911706826, + "learning_rate": 3.65309047828052e-06, + "loss": 0.4303, + "step": 17150 + }, + { + "epoch": 2.8230856798014017, + "grad_norm": 0.846721645930326, + "learning_rate": 3.652628819052285e-06, + "loss": 0.4398, + "step": 17151 + }, + { + "epoch": 2.823250264020518, + "grad_norm": 0.35462356857183863, + "learning_rate": 3.652167169070224e-06, + "loss": 0.4333, + "step": 17152 + }, + { + "epoch": 2.8234148482396346, + "grad_norm": 0.2740207894520864, + "learning_rate": 3.6517055283393776e-06, + "loss": 0.4125, + "step": 17153 + }, + { + "epoch": 2.823579432458751, + "grad_norm": 0.3036748323797673, + "learning_rate": 3.6512438968647807e-06, + "loss": 0.4514, + "step": 17154 + }, + { + "epoch": 2.8237440166778676, + "grad_norm": 0.29306246669216246, + "learning_rate": 3.6507822746514722e-06, + "loss": 0.4343, + "step": 17155 + }, + { + "epoch": 2.823908600896984, + "grad_norm": 0.2752629222420503, + "learning_rate": 3.650320661704488e-06, + "loss": 0.4429, + "step": 17156 + }, + { + "epoch": 2.8240731851161005, + "grad_norm": 0.30854742164160615, + "learning_rate": 3.6498590580288672e-06, + "loss": 0.4468, + "step": 17157 + }, + { + "epoch": 2.824237769335217, + "grad_norm": 0.31442516825330213, + "learning_rate": 3.649397463629646e-06, + "loss": 0.4469, + "step": 17158 + }, + { + "epoch": 2.8244023535543334, + "grad_norm": 0.31816277090241346, + "learning_rate": 3.6489358785118563e-06, + "loss": 0.4317, + "step": 17159 + }, + { + "epoch": 2.82456693777345, + "grad_norm": 0.3820945237326531, + "learning_rate": 3.6484743026805423e-06, + "loss": 0.4237, + "step": 17160 + }, + { + "epoch": 2.8247315219925664, + "grad_norm": 0.3340528223521202, + "learning_rate": 3.648012736140734e-06, + "loss": 0.429, + "step": 17161 + }, + { + "epoch": 2.824896106211683, + "grad_norm": 0.2914111218959242, + "learning_rate": 3.6475511788974735e-06, + "loss": 0.4571, + "step": 17162 + }, + { + "epoch": 2.8250606904307993, + "grad_norm": 0.31147738717265716, + "learning_rate": 3.6470896309557925e-06, + "loss": 0.4219, + "step": 17163 + }, + { + "epoch": 2.825225274649916, + "grad_norm": 0.31456881040370577, + "learning_rate": 3.6466280923207297e-06, + "loss": 0.4485, + "step": 17164 + }, + { + "epoch": 2.8253898588690323, + "grad_norm": 0.2519660907558552, + "learning_rate": 3.6461665629973207e-06, + "loss": 0.43, + "step": 17165 + }, + { + "epoch": 2.8255544430881487, + "grad_norm": 0.32504354549709374, + "learning_rate": 3.6457050429906e-06, + "loss": 0.4183, + "step": 17166 + }, + { + "epoch": 2.825719027307265, + "grad_norm": 0.3063441324912973, + "learning_rate": 3.6452435323056063e-06, + "loss": 0.4233, + "step": 17167 + }, + { + "epoch": 2.8258836115263817, + "grad_norm": 0.2825098525755312, + "learning_rate": 3.644782030947373e-06, + "loss": 0.4453, + "step": 17168 + }, + { + "epoch": 2.826048195745498, + "grad_norm": 0.3196720940852849, + "learning_rate": 3.6443205389209384e-06, + "loss": 0.4341, + "step": 17169 + }, + { + "epoch": 2.8262127799646146, + "grad_norm": 0.30787067603334795, + "learning_rate": 3.6438590562313345e-06, + "loss": 0.4246, + "step": 17170 + }, + { + "epoch": 2.826377364183731, + "grad_norm": 0.3398306678988993, + "learning_rate": 3.643397582883602e-06, + "loss": 0.428, + "step": 17171 + }, + { + "epoch": 2.8265419484028476, + "grad_norm": 0.6485214365786198, + "learning_rate": 3.642936118882771e-06, + "loss": 0.424, + "step": 17172 + }, + { + "epoch": 2.826706532621964, + "grad_norm": 0.287575292203491, + "learning_rate": 3.642474664233879e-06, + "loss": 0.4514, + "step": 17173 + }, + { + "epoch": 2.82687111684108, + "grad_norm": 0.3992671162860481, + "learning_rate": 3.6420132189419624e-06, + "loss": 0.4402, + "step": 17174 + }, + { + "epoch": 2.8270357010601965, + "grad_norm": 0.29128519496400274, + "learning_rate": 3.641551783012054e-06, + "loss": 0.4127, + "step": 17175 + }, + { + "epoch": 2.827200285279313, + "grad_norm": 0.33820307623461265, + "learning_rate": 3.64109035644919e-06, + "loss": 0.4298, + "step": 17176 + }, + { + "epoch": 2.8273648694984295, + "grad_norm": 0.5258143016939788, + "learning_rate": 3.640628939258405e-06, + "loss": 0.4313, + "step": 17177 + }, + { + "epoch": 2.827529453717546, + "grad_norm": 0.33421939150531105, + "learning_rate": 3.640167531444735e-06, + "loss": 0.4433, + "step": 17178 + }, + { + "epoch": 2.8276940379366624, + "grad_norm": 0.3489542375657524, + "learning_rate": 3.639706133013212e-06, + "loss": 0.421, + "step": 17179 + }, + { + "epoch": 2.827858622155779, + "grad_norm": 0.3365482386357486, + "learning_rate": 3.6392447439688744e-06, + "loss": 0.4159, + "step": 17180 + }, + { + "epoch": 2.8280232063748953, + "grad_norm": 0.33791730236571293, + "learning_rate": 3.638783364316755e-06, + "loss": 0.4452, + "step": 17181 + }, + { + "epoch": 2.828187790594012, + "grad_norm": 0.3057041855992184, + "learning_rate": 3.6383219940618863e-06, + "loss": 0.4048, + "step": 17182 + }, + { + "epoch": 2.8283523748131283, + "grad_norm": 0.33764982152904666, + "learning_rate": 3.6378606332093046e-06, + "loss": 0.4338, + "step": 17183 + }, + { + "epoch": 2.8285169590322448, + "grad_norm": 0.3030336775119331, + "learning_rate": 3.6373992817640427e-06, + "loss": 0.4195, + "step": 17184 + }, + { + "epoch": 2.8286815432513612, + "grad_norm": 0.3335775815830645, + "learning_rate": 3.636937939731137e-06, + "loss": 0.4098, + "step": 17185 + }, + { + "epoch": 2.8288461274704777, + "grad_norm": 0.2719457502171315, + "learning_rate": 3.6364766071156185e-06, + "loss": 0.4247, + "step": 17186 + }, + { + "epoch": 2.829010711689594, + "grad_norm": 0.2997921615260896, + "learning_rate": 3.636015283922524e-06, + "loss": 0.444, + "step": 17187 + }, + { + "epoch": 2.8291752959087106, + "grad_norm": 0.3194612110051689, + "learning_rate": 3.635553970156886e-06, + "loss": 0.445, + "step": 17188 + }, + { + "epoch": 2.829339880127827, + "grad_norm": 0.313158008599718, + "learning_rate": 3.6350926658237367e-06, + "loss": 0.4327, + "step": 17189 + }, + { + "epoch": 2.8295044643469436, + "grad_norm": 0.28656355456574967, + "learning_rate": 3.634631370928113e-06, + "loss": 0.4306, + "step": 17190 + }, + { + "epoch": 2.82966904856606, + "grad_norm": 0.49093999604571625, + "learning_rate": 3.6341700854750433e-06, + "loss": 0.4238, + "step": 17191 + }, + { + "epoch": 2.8298336327851765, + "grad_norm": 0.312583077459305, + "learning_rate": 3.6337088094695677e-06, + "loss": 0.4267, + "step": 17192 + }, + { + "epoch": 2.829998217004293, + "grad_norm": 0.433786744476954, + "learning_rate": 3.6332475429167125e-06, + "loss": 0.4285, + "step": 17193 + }, + { + "epoch": 2.8301628012234095, + "grad_norm": 0.3632006308997577, + "learning_rate": 3.632786285821517e-06, + "loss": 0.418, + "step": 17194 + }, + { + "epoch": 2.830327385442526, + "grad_norm": 0.30953167349296806, + "learning_rate": 3.6323250381890107e-06, + "loss": 0.4368, + "step": 17195 + }, + { + "epoch": 2.8304919696616424, + "grad_norm": 0.31743954151520987, + "learning_rate": 3.6318638000242263e-06, + "loss": 0.4372, + "step": 17196 + }, + { + "epoch": 2.830656553880759, + "grad_norm": 0.30182454303424505, + "learning_rate": 3.6314025713321985e-06, + "loss": 0.4286, + "step": 17197 + }, + { + "epoch": 2.8308211380998753, + "grad_norm": 0.38935321783769483, + "learning_rate": 3.630941352117958e-06, + "loss": 0.4297, + "step": 17198 + }, + { + "epoch": 2.8309857223189914, + "grad_norm": 0.30084220698938574, + "learning_rate": 3.63048014238654e-06, + "loss": 0.4244, + "step": 17199 + }, + { + "epoch": 2.831150306538108, + "grad_norm": 0.3576438574713621, + "learning_rate": 3.6300189421429752e-06, + "loss": 0.414, + "step": 17200 + }, + { + "epoch": 2.8313148907572243, + "grad_norm": 0.8043356118524135, + "learning_rate": 3.6295577513922975e-06, + "loss": 0.4545, + "step": 17201 + }, + { + "epoch": 2.831479474976341, + "grad_norm": 0.4242885589241204, + "learning_rate": 3.6290965701395374e-06, + "loss": 0.4416, + "step": 17202 + }, + { + "epoch": 2.8316440591954573, + "grad_norm": 0.2974338529965238, + "learning_rate": 3.6286353983897272e-06, + "loss": 0.4296, + "step": 17203 + }, + { + "epoch": 2.8318086434145737, + "grad_norm": 0.3484590649079067, + "learning_rate": 3.6281742361479004e-06, + "loss": 0.429, + "step": 17204 + }, + { + "epoch": 2.83197322763369, + "grad_norm": 0.3206797393939164, + "learning_rate": 3.627713083419088e-06, + "loss": 0.4444, + "step": 17205 + }, + { + "epoch": 2.8321378118528067, + "grad_norm": 0.32626885839653796, + "learning_rate": 3.627251940208323e-06, + "loss": 0.4112, + "step": 17206 + }, + { + "epoch": 2.832302396071923, + "grad_norm": 0.31874147694777155, + "learning_rate": 3.626790806520635e-06, + "loss": 0.4244, + "step": 17207 + }, + { + "epoch": 2.8324669802910396, + "grad_norm": 0.35748771708537413, + "learning_rate": 3.6263296823610594e-06, + "loss": 0.4601, + "step": 17208 + }, + { + "epoch": 2.832631564510156, + "grad_norm": 0.2769410845748084, + "learning_rate": 3.6258685677346256e-06, + "loss": 0.4284, + "step": 17209 + }, + { + "epoch": 2.8327961487292725, + "grad_norm": 0.34461088576620236, + "learning_rate": 3.6254074626463626e-06, + "loss": 0.4311, + "step": 17210 + }, + { + "epoch": 2.832960732948389, + "grad_norm": 0.286261312823275, + "learning_rate": 3.624946367101307e-06, + "loss": 0.4372, + "step": 17211 + }, + { + "epoch": 2.8331253171675055, + "grad_norm": 0.3474396602887607, + "learning_rate": 3.624485281104485e-06, + "loss": 0.4219, + "step": 17212 + }, + { + "epoch": 2.833289901386622, + "grad_norm": 0.2918359905236362, + "learning_rate": 3.6240242046609327e-06, + "loss": 0.4382, + "step": 17213 + }, + { + "epoch": 2.8334544856057384, + "grad_norm": 0.29510176438923397, + "learning_rate": 3.6235631377756767e-06, + "loss": 0.4456, + "step": 17214 + }, + { + "epoch": 2.833619069824855, + "grad_norm": 0.3666142725303603, + "learning_rate": 3.623102080453751e-06, + "loss": 0.4147, + "step": 17215 + }, + { + "epoch": 2.8337836540439714, + "grad_norm": 0.3449592834790887, + "learning_rate": 3.6226410327001854e-06, + "loss": 0.4254, + "step": 17216 + }, + { + "epoch": 2.833948238263088, + "grad_norm": 0.474561524529995, + "learning_rate": 3.62217999452001e-06, + "loss": 0.4259, + "step": 17217 + }, + { + "epoch": 2.8341128224822043, + "grad_norm": 0.3081507109268821, + "learning_rate": 3.6217189659182566e-06, + "loss": 0.4151, + "step": 17218 + }, + { + "epoch": 2.834277406701321, + "grad_norm": 0.3440200959668479, + "learning_rate": 3.621257946899955e-06, + "loss": 0.4122, + "step": 17219 + }, + { + "epoch": 2.8344419909204372, + "grad_norm": 0.292391216530612, + "learning_rate": 3.6207969374701375e-06, + "loss": 0.448, + "step": 17220 + }, + { + "epoch": 2.8346065751395537, + "grad_norm": 0.26545459000328425, + "learning_rate": 3.6203359376338298e-06, + "loss": 0.4372, + "step": 17221 + }, + { + "epoch": 2.83477115935867, + "grad_norm": 0.33534217289780477, + "learning_rate": 3.6198749473960685e-06, + "loss": 0.4374, + "step": 17222 + }, + { + "epoch": 2.8349357435777867, + "grad_norm": 0.35976061171612134, + "learning_rate": 3.6194139667618774e-06, + "loss": 0.4318, + "step": 17223 + }, + { + "epoch": 2.835100327796903, + "grad_norm": 0.5249155215893581, + "learning_rate": 3.618952995736293e-06, + "loss": 0.4324, + "step": 17224 + }, + { + "epoch": 2.8352649120160196, + "grad_norm": 0.3894814558327808, + "learning_rate": 3.6184920343243406e-06, + "loss": 0.4212, + "step": 17225 + }, + { + "epoch": 2.835429496235136, + "grad_norm": 0.2899878228648192, + "learning_rate": 3.6180310825310505e-06, + "loss": 0.4332, + "step": 17226 + }, + { + "epoch": 2.8355940804542525, + "grad_norm": 0.2955486946134394, + "learning_rate": 3.617570140361454e-06, + "loss": 0.4232, + "step": 17227 + }, + { + "epoch": 2.835758664673369, + "grad_norm": 0.4856743837521025, + "learning_rate": 3.6171092078205794e-06, + "loss": 0.4292, + "step": 17228 + }, + { + "epoch": 2.8359232488924855, + "grad_norm": 0.2852401565044446, + "learning_rate": 3.6166482849134573e-06, + "loss": 0.4355, + "step": 17229 + }, + { + "epoch": 2.836087833111602, + "grad_norm": 0.4321899265913684, + "learning_rate": 3.6161873716451158e-06, + "loss": 0.4199, + "step": 17230 + }, + { + "epoch": 2.8362524173307184, + "grad_norm": 0.33996527413233607, + "learning_rate": 3.6157264680205853e-06, + "loss": 0.4311, + "step": 17231 + }, + { + "epoch": 2.836417001549835, + "grad_norm": 0.3379685630381169, + "learning_rate": 3.6152655740448963e-06, + "loss": 0.437, + "step": 17232 + }, + { + "epoch": 2.8365815857689514, + "grad_norm": 0.36981142619782814, + "learning_rate": 3.6148046897230744e-06, + "loss": 0.4246, + "step": 17233 + }, + { + "epoch": 2.836746169988068, + "grad_norm": 0.3729699046682807, + "learning_rate": 3.6143438150601513e-06, + "loss": 0.4486, + "step": 17234 + }, + { + "epoch": 2.8369107542071843, + "grad_norm": 0.4633154252788908, + "learning_rate": 3.6138829500611537e-06, + "loss": 0.4265, + "step": 17235 + }, + { + "epoch": 2.8370753384263008, + "grad_norm": 0.44586719350424964, + "learning_rate": 3.6134220947311134e-06, + "loss": 0.4336, + "step": 17236 + }, + { + "epoch": 2.8372399226454172, + "grad_norm": 0.2940424118350281, + "learning_rate": 3.6129612490750555e-06, + "loss": 0.4263, + "step": 17237 + }, + { + "epoch": 2.8374045068645337, + "grad_norm": 0.25917279093973444, + "learning_rate": 3.612500413098012e-06, + "loss": 0.4268, + "step": 17238 + }, + { + "epoch": 2.83756909108365, + "grad_norm": 0.31873135212679354, + "learning_rate": 3.612039586805011e-06, + "loss": 0.4382, + "step": 17239 + }, + { + "epoch": 2.8377336753027667, + "grad_norm": 0.30085694031050964, + "learning_rate": 3.611578770201076e-06, + "loss": 0.4152, + "step": 17240 + }, + { + "epoch": 2.8378982595218827, + "grad_norm": 0.3951723660102914, + "learning_rate": 3.611117963291242e-06, + "loss": 0.4378, + "step": 17241 + }, + { + "epoch": 2.838062843740999, + "grad_norm": 0.36034714559106945, + "learning_rate": 3.6106571660805315e-06, + "loss": 0.4363, + "step": 17242 + }, + { + "epoch": 2.8382274279601156, + "grad_norm": 0.34931684788064654, + "learning_rate": 3.6101963785739774e-06, + "loss": 0.417, + "step": 17243 + }, + { + "epoch": 2.838392012179232, + "grad_norm": 0.36458261655129637, + "learning_rate": 3.609735600776604e-06, + "loss": 0.4313, + "step": 17244 + }, + { + "epoch": 2.8385565963983486, + "grad_norm": 0.30291655119692484, + "learning_rate": 3.609274832693441e-06, + "loss": 0.432, + "step": 17245 + }, + { + "epoch": 2.838721180617465, + "grad_norm": 0.25942457772664856, + "learning_rate": 3.608814074329516e-06, + "loss": 0.4488, + "step": 17246 + }, + { + "epoch": 2.8388857648365815, + "grad_norm": 0.3107357539755343, + "learning_rate": 3.6083533256898546e-06, + "loss": 0.4363, + "step": 17247 + }, + { + "epoch": 2.839050349055698, + "grad_norm": 0.38995430767614275, + "learning_rate": 3.607892586779487e-06, + "loss": 0.4233, + "step": 17248 + }, + { + "epoch": 2.8392149332748144, + "grad_norm": 0.35307916284665136, + "learning_rate": 3.6074318576034388e-06, + "loss": 0.4271, + "step": 17249 + }, + { + "epoch": 2.839379517493931, + "grad_norm": 0.3120446790305785, + "learning_rate": 3.6069711381667385e-06, + "loss": 0.4256, + "step": 17250 + }, + { + "epoch": 2.8395441017130474, + "grad_norm": 0.4791097839923694, + "learning_rate": 3.606510428474412e-06, + "loss": 0.4365, + "step": 17251 + }, + { + "epoch": 2.839708685932164, + "grad_norm": 0.3135030321580087, + "learning_rate": 3.606049728531489e-06, + "loss": 0.4154, + "step": 17252 + }, + { + "epoch": 2.8398732701512803, + "grad_norm": 0.3228397935016283, + "learning_rate": 3.6055890383429934e-06, + "loss": 0.4498, + "step": 17253 + }, + { + "epoch": 2.840037854370397, + "grad_norm": 0.5532350580857291, + "learning_rate": 3.6051283579139522e-06, + "loss": 0.4389, + "step": 17254 + }, + { + "epoch": 2.8402024385895133, + "grad_norm": 0.43181483714512964, + "learning_rate": 3.604667687249395e-06, + "loss": 0.4346, + "step": 17255 + }, + { + "epoch": 2.8403670228086297, + "grad_norm": 0.33784989362316176, + "learning_rate": 3.604207026354345e-06, + "loss": 0.4429, + "step": 17256 + }, + { + "epoch": 2.840531607027746, + "grad_norm": 0.31093719233205325, + "learning_rate": 3.603746375233831e-06, + "loss": 0.4227, + "step": 17257 + }, + { + "epoch": 2.8406961912468627, + "grad_norm": 0.25628383000837457, + "learning_rate": 3.6032857338928787e-06, + "loss": 0.4347, + "step": 17258 + }, + { + "epoch": 2.840860775465979, + "grad_norm": 0.3291742395980252, + "learning_rate": 3.602825102336515e-06, + "loss": 0.4139, + "step": 17259 + }, + { + "epoch": 2.8410253596850956, + "grad_norm": 0.286703019845901, + "learning_rate": 3.602364480569767e-06, + "loss": 0.4358, + "step": 17260 + }, + { + "epoch": 2.841189943904212, + "grad_norm": 0.3262119607648993, + "learning_rate": 3.601903868597657e-06, + "loss": 0.4492, + "step": 17261 + }, + { + "epoch": 2.8413545281233286, + "grad_norm": 0.2642906042273244, + "learning_rate": 3.601443266425216e-06, + "loss": 0.4361, + "step": 17262 + }, + { + "epoch": 2.841519112342445, + "grad_norm": 0.30197138080336206, + "learning_rate": 3.6009826740574656e-06, + "loss": 0.4273, + "step": 17263 + }, + { + "epoch": 2.8416836965615615, + "grad_norm": 0.3278386663356908, + "learning_rate": 3.600522091499434e-06, + "loss": 0.4491, + "step": 17264 + }, + { + "epoch": 2.841848280780678, + "grad_norm": 0.366249872998856, + "learning_rate": 3.600061518756146e-06, + "loss": 0.4365, + "step": 17265 + }, + { + "epoch": 2.842012864999794, + "grad_norm": 0.32644035623795087, + "learning_rate": 3.5996009558326287e-06, + "loss": 0.4083, + "step": 17266 + }, + { + "epoch": 2.8421774492189105, + "grad_norm": 0.33766069885696454, + "learning_rate": 3.5991404027339047e-06, + "loss": 0.4114, + "step": 17267 + }, + { + "epoch": 2.842342033438027, + "grad_norm": 0.34999581741863056, + "learning_rate": 3.5986798594650023e-06, + "loss": 0.4398, + "step": 17268 + }, + { + "epoch": 2.8425066176571434, + "grad_norm": 0.29690638842098677, + "learning_rate": 3.598219326030946e-06, + "loss": 0.4379, + "step": 17269 + }, + { + "epoch": 2.84267120187626, + "grad_norm": 0.5821848353060289, + "learning_rate": 3.597758802436759e-06, + "loss": 0.4272, + "step": 17270 + }, + { + "epoch": 2.8428357860953763, + "grad_norm": 0.27318836340541974, + "learning_rate": 3.5972982886874706e-06, + "loss": 0.4364, + "step": 17271 + }, + { + "epoch": 2.843000370314493, + "grad_norm": 0.2907001035804837, + "learning_rate": 3.5968377847880992e-06, + "loss": 0.4385, + "step": 17272 + }, + { + "epoch": 2.8431649545336093, + "grad_norm": 0.35475419183949747, + "learning_rate": 3.5963772907436767e-06, + "loss": 0.4404, + "step": 17273 + }, + { + "epoch": 2.8433295387527258, + "grad_norm": 0.2929670971585379, + "learning_rate": 3.5959168065592217e-06, + "loss": 0.4269, + "step": 17274 + }, + { + "epoch": 2.8434941229718422, + "grad_norm": 0.31006705955902614, + "learning_rate": 3.5954563322397647e-06, + "loss": 0.419, + "step": 17275 + }, + { + "epoch": 2.8436587071909587, + "grad_norm": 0.4804142954059058, + "learning_rate": 3.594995867790326e-06, + "loss": 0.431, + "step": 17276 + }, + { + "epoch": 2.843823291410075, + "grad_norm": 0.38870693622529257, + "learning_rate": 3.59453541321593e-06, + "loss": 0.4359, + "step": 17277 + }, + { + "epoch": 2.8439878756291916, + "grad_norm": 0.3286508395949563, + "learning_rate": 3.5940749685216036e-06, + "loss": 0.4262, + "step": 17278 + }, + { + "epoch": 2.844152459848308, + "grad_norm": 0.3152080061760849, + "learning_rate": 3.5936145337123686e-06, + "loss": 0.4265, + "step": 17279 + }, + { + "epoch": 2.8443170440674246, + "grad_norm": 0.3129194774671554, + "learning_rate": 3.593154108793251e-06, + "loss": 0.4297, + "step": 17280 + }, + { + "epoch": 2.844481628286541, + "grad_norm": 0.38874431455021785, + "learning_rate": 3.592693693769273e-06, + "loss": 0.4383, + "step": 17281 + }, + { + "epoch": 2.8446462125056575, + "grad_norm": 0.37803432983638346, + "learning_rate": 3.592233288645461e-06, + "loss": 0.4305, + "step": 17282 + }, + { + "epoch": 2.844810796724774, + "grad_norm": 0.45048005972917665, + "learning_rate": 3.591772893426836e-06, + "loss": 0.426, + "step": 17283 + }, + { + "epoch": 2.8449753809438905, + "grad_norm": 0.28005271787070796, + "learning_rate": 3.591312508118422e-06, + "loss": 0.4315, + "step": 17284 + }, + { + "epoch": 2.845139965163007, + "grad_norm": 0.33550972302654886, + "learning_rate": 3.590852132725244e-06, + "loss": 0.4234, + "step": 17285 + }, + { + "epoch": 2.8453045493821234, + "grad_norm": 0.32102823504171063, + "learning_rate": 3.5903917672523238e-06, + "loss": 0.442, + "step": 17286 + }, + { + "epoch": 2.84546913360124, + "grad_norm": 0.35483966039125975, + "learning_rate": 3.5899314117046864e-06, + "loss": 0.4241, + "step": 17287 + }, + { + "epoch": 2.8456337178203563, + "grad_norm": 0.31500137223962354, + "learning_rate": 3.5894710660873536e-06, + "loss": 0.4432, + "step": 17288 + }, + { + "epoch": 2.845798302039473, + "grad_norm": 0.29601726038616466, + "learning_rate": 3.5890107304053498e-06, + "loss": 0.4144, + "step": 17289 + }, + { + "epoch": 2.8459628862585893, + "grad_norm": 0.2838766819295395, + "learning_rate": 3.588550404663699e-06, + "loss": 0.4503, + "step": 17290 + }, + { + "epoch": 2.8461274704777058, + "grad_norm": 0.32551607430476087, + "learning_rate": 3.5880900888674185e-06, + "loss": 0.4515, + "step": 17291 + }, + { + "epoch": 2.8462920546968222, + "grad_norm": 0.303023119862285, + "learning_rate": 3.5876297830215387e-06, + "loss": 0.4282, + "step": 17292 + }, + { + "epoch": 2.8464566389159387, + "grad_norm": 0.3677300499257935, + "learning_rate": 3.5871694871310755e-06, + "loss": 0.4279, + "step": 17293 + }, + { + "epoch": 2.846621223135055, + "grad_norm": 0.2903798105398287, + "learning_rate": 3.586709201201057e-06, + "loss": 0.426, + "step": 17294 + }, + { + "epoch": 2.8467858073541716, + "grad_norm": 0.2793437932527116, + "learning_rate": 3.5862489252365017e-06, + "loss": 0.4373, + "step": 17295 + }, + { + "epoch": 2.846950391573288, + "grad_norm": 0.3094071114496996, + "learning_rate": 3.585788659242434e-06, + "loss": 0.4581, + "step": 17296 + }, + { + "epoch": 2.8471149757924046, + "grad_norm": 0.3039670945668618, + "learning_rate": 3.5853284032238763e-06, + "loss": 0.4145, + "step": 17297 + }, + { + "epoch": 2.847279560011521, + "grad_norm": 0.6203273417092992, + "learning_rate": 3.584868157185849e-06, + "loss": 0.4232, + "step": 17298 + }, + { + "epoch": 2.8474441442306375, + "grad_norm": 0.43621089193038654, + "learning_rate": 3.5844079211333753e-06, + "loss": 0.4257, + "step": 17299 + }, + { + "epoch": 2.847608728449754, + "grad_norm": 0.3902462502959947, + "learning_rate": 3.5839476950714765e-06, + "loss": 0.4344, + "step": 17300 + }, + { + "epoch": 2.8477733126688705, + "grad_norm": 0.4501508527387159, + "learning_rate": 3.5834874790051763e-06, + "loss": 0.4309, + "step": 17301 + }, + { + "epoch": 2.847937896887987, + "grad_norm": 0.37557686051108913, + "learning_rate": 3.583027272939493e-06, + "loss": 0.4415, + "step": 17302 + }, + { + "epoch": 2.8481024811071034, + "grad_norm": 0.3013870558351485, + "learning_rate": 3.5825670768794525e-06, + "loss": 0.4305, + "step": 17303 + }, + { + "epoch": 2.84826706532622, + "grad_norm": 0.3542117988239237, + "learning_rate": 3.5821068908300713e-06, + "loss": 0.4372, + "step": 17304 + }, + { + "epoch": 2.8484316495453363, + "grad_norm": 0.31145913931400526, + "learning_rate": 3.5816467147963757e-06, + "loss": 0.4244, + "step": 17305 + }, + { + "epoch": 2.848596233764453, + "grad_norm": 0.3143075586074146, + "learning_rate": 3.5811865487833844e-06, + "loss": 0.4287, + "step": 17306 + }, + { + "epoch": 2.8487608179835693, + "grad_norm": 0.38184845853847327, + "learning_rate": 3.5807263927961173e-06, + "loss": 0.4578, + "step": 17307 + }, + { + "epoch": 2.8489254022026853, + "grad_norm": 0.32585932374428855, + "learning_rate": 3.580266246839598e-06, + "loss": 0.4245, + "step": 17308 + }, + { + "epoch": 2.849089986421802, + "grad_norm": 0.3651766852599073, + "learning_rate": 3.579806110918845e-06, + "loss": 0.4211, + "step": 17309 + }, + { + "epoch": 2.8492545706409182, + "grad_norm": 0.4191280874392949, + "learning_rate": 3.5793459850388822e-06, + "loss": 0.4079, + "step": 17310 + }, + { + "epoch": 2.8494191548600347, + "grad_norm": 0.32954077567316326, + "learning_rate": 3.578885869204727e-06, + "loss": 0.4324, + "step": 17311 + }, + { + "epoch": 2.849583739079151, + "grad_norm": 0.3743297123453578, + "learning_rate": 3.578425763421403e-06, + "loss": 0.4265, + "step": 17312 + }, + { + "epoch": 2.8497483232982677, + "grad_norm": 0.2850966383078337, + "learning_rate": 3.57796566769393e-06, + "loss": 0.4212, + "step": 17313 + }, + { + "epoch": 2.849912907517384, + "grad_norm": 0.41129961107418034, + "learning_rate": 3.5775055820273255e-06, + "loss": 0.4365, + "step": 17314 + }, + { + "epoch": 2.8500774917365006, + "grad_norm": 0.3414907552588458, + "learning_rate": 3.5770455064266136e-06, + "loss": 0.4338, + "step": 17315 + }, + { + "epoch": 2.850242075955617, + "grad_norm": 0.29009696112826655, + "learning_rate": 3.576585440896811e-06, + "loss": 0.4174, + "step": 17316 + }, + { + "epoch": 2.8504066601747335, + "grad_norm": 0.4310254529123287, + "learning_rate": 3.5761253854429407e-06, + "loss": 0.4204, + "step": 17317 + }, + { + "epoch": 2.85057124439385, + "grad_norm": 0.31112114797040397, + "learning_rate": 3.575665340070021e-06, + "loss": 0.4341, + "step": 17318 + }, + { + "epoch": 2.8507358286129665, + "grad_norm": 0.3279739330863808, + "learning_rate": 3.5752053047830725e-06, + "loss": 0.4322, + "step": 17319 + }, + { + "epoch": 2.850900412832083, + "grad_norm": 0.3301961644714612, + "learning_rate": 3.5747452795871153e-06, + "loss": 0.4349, + "step": 17320 + }, + { + "epoch": 2.8510649970511994, + "grad_norm": 0.277429509503672, + "learning_rate": 3.574285264487167e-06, + "loss": 0.4504, + "step": 17321 + }, + { + "epoch": 2.851229581270316, + "grad_norm": 0.4076592298941959, + "learning_rate": 3.57382525948825e-06, + "loss": 0.4261, + "step": 17322 + }, + { + "epoch": 2.8513941654894324, + "grad_norm": 0.2610069194071665, + "learning_rate": 3.5733652645953796e-06, + "loss": 0.4286, + "step": 17323 + }, + { + "epoch": 2.851558749708549, + "grad_norm": 0.3170634069177754, + "learning_rate": 3.5729052798135806e-06, + "loss": 0.4292, + "step": 17324 + }, + { + "epoch": 2.8517233339276653, + "grad_norm": 0.37389489628158157, + "learning_rate": 3.572445305147866e-06, + "loss": 0.4369, + "step": 17325 + }, + { + "epoch": 2.8518879181467818, + "grad_norm": 0.30498221629286726, + "learning_rate": 3.571985340603261e-06, + "loss": 0.4343, + "step": 17326 + }, + { + "epoch": 2.8520525023658982, + "grad_norm": 0.32470011381080516, + "learning_rate": 3.571525386184781e-06, + "loss": 0.4253, + "step": 17327 + }, + { + "epoch": 2.8522170865850147, + "grad_norm": 0.3199015472926689, + "learning_rate": 3.5710654418974444e-06, + "loss": 0.4437, + "step": 17328 + }, + { + "epoch": 2.852381670804131, + "grad_norm": 0.3304374071937174, + "learning_rate": 3.5706055077462715e-06, + "loss": 0.4349, + "step": 17329 + }, + { + "epoch": 2.8525462550232477, + "grad_norm": 0.5648046194401402, + "learning_rate": 3.5701455837362798e-06, + "loss": 0.4487, + "step": 17330 + }, + { + "epoch": 2.852710839242364, + "grad_norm": 0.3312659649653881, + "learning_rate": 3.5696856698724896e-06, + "loss": 0.4234, + "step": 17331 + }, + { + "epoch": 2.8528754234614806, + "grad_norm": 0.2999730938000472, + "learning_rate": 3.5692257661599166e-06, + "loss": 0.4201, + "step": 17332 + }, + { + "epoch": 2.8530400076805966, + "grad_norm": 0.3054685782593052, + "learning_rate": 3.5687658726035825e-06, + "loss": 0.4441, + "step": 17333 + }, + { + "epoch": 2.853204591899713, + "grad_norm": 0.29541052668524603, + "learning_rate": 3.5683059892085025e-06, + "loss": 0.4456, + "step": 17334 + }, + { + "epoch": 2.8533691761188296, + "grad_norm": 0.39264174913702915, + "learning_rate": 3.5678461159796953e-06, + "loss": 0.4203, + "step": 17335 + }, + { + "epoch": 2.853533760337946, + "grad_norm": 0.2988594884479697, + "learning_rate": 3.56738625292218e-06, + "loss": 0.4346, + "step": 17336 + }, + { + "epoch": 2.8536983445570625, + "grad_norm": 0.29792359684867575, + "learning_rate": 3.5669264000409724e-06, + "loss": 0.4224, + "step": 17337 + }, + { + "epoch": 2.853862928776179, + "grad_norm": 0.31966824404577093, + "learning_rate": 3.566466557341093e-06, + "loss": 0.43, + "step": 17338 + }, + { + "epoch": 2.8540275129952954, + "grad_norm": 0.3570823411675509, + "learning_rate": 3.5660067248275568e-06, + "loss": 0.4297, + "step": 17339 + }, + { + "epoch": 2.854192097214412, + "grad_norm": 0.2703363496172826, + "learning_rate": 3.565546902505383e-06, + "loss": 0.4263, + "step": 17340 + }, + { + "epoch": 2.8543566814335284, + "grad_norm": 0.3790065141753752, + "learning_rate": 3.5650870903795897e-06, + "loss": 0.437, + "step": 17341 + }, + { + "epoch": 2.854521265652645, + "grad_norm": 0.3175430522401861, + "learning_rate": 3.56462728845519e-06, + "loss": 0.4308, + "step": 17342 + }, + { + "epoch": 2.8546858498717613, + "grad_norm": 0.3151100167548812, + "learning_rate": 3.5641674967372074e-06, + "loss": 0.4393, + "step": 17343 + }, + { + "epoch": 2.854850434090878, + "grad_norm": 0.33334304766233647, + "learning_rate": 3.563707715230652e-06, + "loss": 0.4384, + "step": 17344 + }, + { + "epoch": 2.8550150183099943, + "grad_norm": 0.28834839600363854, + "learning_rate": 3.5632479439405475e-06, + "loss": 0.4298, + "step": 17345 + }, + { + "epoch": 2.8551796025291107, + "grad_norm": 0.3013133808479478, + "learning_rate": 3.562788182871906e-06, + "loss": 0.4468, + "step": 17346 + }, + { + "epoch": 2.855344186748227, + "grad_norm": 0.2896137490725572, + "learning_rate": 3.5623284320297465e-06, + "loss": 0.4125, + "step": 17347 + }, + { + "epoch": 2.8555087709673437, + "grad_norm": 0.2920795502549955, + "learning_rate": 3.561868691419084e-06, + "loss": 0.4212, + "step": 17348 + }, + { + "epoch": 2.85567335518646, + "grad_norm": 0.3502524102384909, + "learning_rate": 3.561408961044937e-06, + "loss": 0.4219, + "step": 17349 + }, + { + "epoch": 2.8558379394055766, + "grad_norm": 0.2963465990646538, + "learning_rate": 3.5609492409123205e-06, + "loss": 0.4222, + "step": 17350 + }, + { + "epoch": 2.856002523624693, + "grad_norm": 0.46442209356470837, + "learning_rate": 3.5604895310262503e-06, + "loss": 0.4449, + "step": 17351 + }, + { + "epoch": 2.8561671078438096, + "grad_norm": 0.33772961723301825, + "learning_rate": 3.5600298313917452e-06, + "loss": 0.4438, + "step": 17352 + }, + { + "epoch": 2.856331692062926, + "grad_norm": 0.2988364502504436, + "learning_rate": 3.559570142013817e-06, + "loss": 0.4336, + "step": 17353 + }, + { + "epoch": 2.8564962762820425, + "grad_norm": 0.3835527090232755, + "learning_rate": 3.5591104628974865e-06, + "loss": 0.4489, + "step": 17354 + }, + { + "epoch": 2.856660860501159, + "grad_norm": 0.31238237100554256, + "learning_rate": 3.558650794047764e-06, + "loss": 0.4256, + "step": 17355 + }, + { + "epoch": 2.8568254447202754, + "grad_norm": 0.30854875596452236, + "learning_rate": 3.558191135469672e-06, + "loss": 0.4261, + "step": 17356 + }, + { + "epoch": 2.856990028939392, + "grad_norm": 0.3281040543302565, + "learning_rate": 3.5577314871682214e-06, + "loss": 0.4355, + "step": 17357 + }, + { + "epoch": 2.8571546131585084, + "grad_norm": 0.46861731742534457, + "learning_rate": 3.557271849148427e-06, + "loss": 0.4253, + "step": 17358 + }, + { + "epoch": 2.857319197377625, + "grad_norm": 0.31972857803363086, + "learning_rate": 3.556812221415307e-06, + "loss": 0.4595, + "step": 17359 + }, + { + "epoch": 2.8574837815967413, + "grad_norm": 0.34830650317616807, + "learning_rate": 3.5563526039738758e-06, + "loss": 0.4333, + "step": 17360 + }, + { + "epoch": 2.857648365815858, + "grad_norm": 0.26551698406335, + "learning_rate": 3.5558929968291484e-06, + "loss": 0.4248, + "step": 17361 + }, + { + "epoch": 2.8578129500349743, + "grad_norm": 0.3552027859489436, + "learning_rate": 3.5554333999861386e-06, + "loss": 0.4278, + "step": 17362 + }, + { + "epoch": 2.8579775342540907, + "grad_norm": 0.49062427891614496, + "learning_rate": 3.5549738134498643e-06, + "loss": 0.4365, + "step": 17363 + }, + { + "epoch": 2.858142118473207, + "grad_norm": 0.39124919194394747, + "learning_rate": 3.554514237225339e-06, + "loss": 0.4399, + "step": 17364 + }, + { + "epoch": 2.8583067026923237, + "grad_norm": 2.118110390489228, + "learning_rate": 3.554054671317576e-06, + "loss": 0.4454, + "step": 17365 + }, + { + "epoch": 2.85847128691144, + "grad_norm": 0.3123363002781529, + "learning_rate": 3.5535951157315907e-06, + "loss": 0.4008, + "step": 17366 + }, + { + "epoch": 2.8586358711305566, + "grad_norm": 0.3400430711403663, + "learning_rate": 3.5531355704723975e-06, + "loss": 0.4138, + "step": 17367 + }, + { + "epoch": 2.858800455349673, + "grad_norm": 0.34525379179719407, + "learning_rate": 3.552676035545012e-06, + "loss": 0.4349, + "step": 17368 + }, + { + "epoch": 2.8589650395687896, + "grad_norm": 0.2987836071432275, + "learning_rate": 3.552216510954446e-06, + "loss": 0.4202, + "step": 17369 + }, + { + "epoch": 2.859129623787906, + "grad_norm": 0.3772431570981102, + "learning_rate": 3.551756996705717e-06, + "loss": 0.4281, + "step": 17370 + }, + { + "epoch": 2.8592942080070225, + "grad_norm": 0.27907664529479576, + "learning_rate": 3.5512974928038372e-06, + "loss": 0.4228, + "step": 17371 + }, + { + "epoch": 2.859458792226139, + "grad_norm": 0.4616168933271382, + "learning_rate": 3.5508379992538183e-06, + "loss": 0.4043, + "step": 17372 + }, + { + "epoch": 2.8596233764452554, + "grad_norm": 0.3168197813319331, + "learning_rate": 3.5503785160606784e-06, + "loss": 0.4307, + "step": 17373 + }, + { + "epoch": 2.859787960664372, + "grad_norm": 0.31139488991072456, + "learning_rate": 3.549919043229427e-06, + "loss": 0.3988, + "step": 17374 + }, + { + "epoch": 2.859952544883488, + "grad_norm": 0.34256371374820876, + "learning_rate": 3.5494595807650833e-06, + "loss": 0.4285, + "step": 17375 + }, + { + "epoch": 2.8601171291026044, + "grad_norm": 0.32999590179031507, + "learning_rate": 3.5490001286726546e-06, + "loss": 0.4054, + "step": 17376 + }, + { + "epoch": 2.860281713321721, + "grad_norm": 0.3151567022626986, + "learning_rate": 3.5485406869571574e-06, + "loss": 0.424, + "step": 17377 + }, + { + "epoch": 2.8604462975408373, + "grad_norm": 0.2944405518922663, + "learning_rate": 3.5480812556236054e-06, + "loss": 0.404, + "step": 17378 + }, + { + "epoch": 2.860610881759954, + "grad_norm": 0.31649209921217125, + "learning_rate": 3.5476218346770096e-06, + "loss": 0.4265, + "step": 17379 + }, + { + "epoch": 2.8607754659790703, + "grad_norm": 0.32118599442886553, + "learning_rate": 3.547162424122385e-06, + "loss": 0.4076, + "step": 17380 + }, + { + "epoch": 2.8609400501981868, + "grad_norm": 0.34538102601435855, + "learning_rate": 3.546703023964743e-06, + "loss": 0.4455, + "step": 17381 + }, + { + "epoch": 2.8611046344173032, + "grad_norm": 0.5686932534326282, + "learning_rate": 3.546243634209098e-06, + "loss": 0.4399, + "step": 17382 + }, + { + "epoch": 2.8612692186364197, + "grad_norm": 0.35543383109799415, + "learning_rate": 3.545784254860461e-06, + "loss": 0.4455, + "step": 17383 + }, + { + "epoch": 2.861433802855536, + "grad_norm": 0.6460606174437876, + "learning_rate": 3.545324885923847e-06, + "loss": 0.4401, + "step": 17384 + }, + { + "epoch": 2.8615983870746526, + "grad_norm": 0.30017636310835727, + "learning_rate": 3.544865527404266e-06, + "loss": 0.4167, + "step": 17385 + }, + { + "epoch": 2.861762971293769, + "grad_norm": 0.31817270123624747, + "learning_rate": 3.5444061793067305e-06, + "loss": 0.4135, + "step": 17386 + }, + { + "epoch": 2.8619275555128856, + "grad_norm": 2.0942530739338068, + "learning_rate": 3.5439468416362544e-06, + "loss": 0.424, + "step": 17387 + }, + { + "epoch": 2.862092139732002, + "grad_norm": 0.3462842069028323, + "learning_rate": 3.543487514397848e-06, + "loss": 0.4154, + "step": 17388 + }, + { + "epoch": 2.8622567239511185, + "grad_norm": 0.31711819493067556, + "learning_rate": 3.543028197596524e-06, + "loss": 0.4368, + "step": 17389 + }, + { + "epoch": 2.862421308170235, + "grad_norm": 0.392098853077831, + "learning_rate": 3.5425688912372943e-06, + "loss": 0.4379, + "step": 17390 + }, + { + "epoch": 2.8625858923893515, + "grad_norm": 0.566164161477723, + "learning_rate": 3.5421095953251714e-06, + "loss": 0.4067, + "step": 17391 + }, + { + "epoch": 2.862750476608468, + "grad_norm": 0.6690822106221481, + "learning_rate": 3.5416503098651653e-06, + "loss": 0.4234, + "step": 17392 + }, + { + "epoch": 2.8629150608275844, + "grad_norm": 0.302852719194461, + "learning_rate": 3.5411910348622897e-06, + "loss": 0.4129, + "step": 17393 + }, + { + "epoch": 2.863079645046701, + "grad_norm": 0.2589054574663652, + "learning_rate": 3.5407317703215562e-06, + "loss": 0.4519, + "step": 17394 + }, + { + "epoch": 2.8632442292658173, + "grad_norm": 0.4101958079789623, + "learning_rate": 3.5402725162479723e-06, + "loss": 0.4272, + "step": 17395 + }, + { + "epoch": 2.863408813484934, + "grad_norm": 0.26525454642865537, + "learning_rate": 3.5398132726465524e-06, + "loss": 0.4278, + "step": 17396 + }, + { + "epoch": 2.8635733977040503, + "grad_norm": 0.3779916268979564, + "learning_rate": 3.539354039522306e-06, + "loss": 0.451, + "step": 17397 + }, + { + "epoch": 2.8637379819231668, + "grad_norm": 0.36607349629965114, + "learning_rate": 3.5388948168802463e-06, + "loss": 0.4443, + "step": 17398 + }, + { + "epoch": 2.8639025661422832, + "grad_norm": 0.3140599194344739, + "learning_rate": 3.538435604725382e-06, + "loss": 0.4313, + "step": 17399 + }, + { + "epoch": 2.8640671503613992, + "grad_norm": 0.2858037967176737, + "learning_rate": 3.5379764030627252e-06, + "loss": 0.4064, + "step": 17400 + }, + { + "epoch": 2.8642317345805157, + "grad_norm": 0.4345367756236418, + "learning_rate": 3.537517211897286e-06, + "loss": 0.4254, + "step": 17401 + }, + { + "epoch": 2.864396318799632, + "grad_norm": 0.5451837729821961, + "learning_rate": 3.5370580312340736e-06, + "loss": 0.4369, + "step": 17402 + }, + { + "epoch": 2.8645609030187487, + "grad_norm": 0.37529945803443476, + "learning_rate": 3.5365988610781017e-06, + "loss": 0.4083, + "step": 17403 + }, + { + "epoch": 2.864725487237865, + "grad_norm": 0.3037152828583387, + "learning_rate": 3.5361397014343765e-06, + "loss": 0.4587, + "step": 17404 + }, + { + "epoch": 2.8648900714569816, + "grad_norm": 0.35187267786260457, + "learning_rate": 3.5356805523079118e-06, + "loss": 0.4292, + "step": 17405 + }, + { + "epoch": 2.865054655676098, + "grad_norm": 0.3494197798213748, + "learning_rate": 3.5352214137037142e-06, + "loss": 0.435, + "step": 17406 + }, + { + "epoch": 2.8652192398952145, + "grad_norm": 0.4412802947995881, + "learning_rate": 3.534762285626798e-06, + "loss": 0.4333, + "step": 17407 + }, + { + "epoch": 2.865383824114331, + "grad_norm": 0.475036997514968, + "learning_rate": 3.5343031680821692e-06, + "loss": 0.4351, + "step": 17408 + }, + { + "epoch": 2.8655484083334475, + "grad_norm": 0.3499265166691491, + "learning_rate": 3.5338440610748383e-06, + "loss": 0.4361, + "step": 17409 + }, + { + "epoch": 2.865712992552564, + "grad_norm": 0.3023794348814161, + "learning_rate": 3.5333849646098163e-06, + "loss": 0.4304, + "step": 17410 + }, + { + "epoch": 2.8658775767716804, + "grad_norm": 0.366119061707016, + "learning_rate": 3.5329258786921108e-06, + "loss": 0.42, + "step": 17411 + }, + { + "epoch": 2.866042160990797, + "grad_norm": 0.3774323397580119, + "learning_rate": 3.532466803326733e-06, + "loss": 0.4231, + "step": 17412 + }, + { + "epoch": 2.8662067452099134, + "grad_norm": 0.31430262520379176, + "learning_rate": 3.5320077385186905e-06, + "loss": 0.4533, + "step": 17413 + }, + { + "epoch": 2.86637132942903, + "grad_norm": 0.3252067503048915, + "learning_rate": 3.5315486842729953e-06, + "loss": 0.4278, + "step": 17414 + }, + { + "epoch": 2.8665359136481463, + "grad_norm": 0.33254081249056655, + "learning_rate": 3.531089640594653e-06, + "loss": 0.4292, + "step": 17415 + }, + { + "epoch": 2.8667004978672628, + "grad_norm": 0.32644976125481445, + "learning_rate": 3.5306306074886727e-06, + "loss": 0.4156, + "step": 17416 + }, + { + "epoch": 2.8668650820863792, + "grad_norm": 0.36852603099967607, + "learning_rate": 3.5301715849600656e-06, + "loss": 0.4215, + "step": 17417 + }, + { + "epoch": 2.8670296663054957, + "grad_norm": 0.3235318335429592, + "learning_rate": 3.5297125730138384e-06, + "loss": 0.4174, + "step": 17418 + }, + { + "epoch": 2.867194250524612, + "grad_norm": 0.28132456297739444, + "learning_rate": 3.5292535716550013e-06, + "loss": 0.4454, + "step": 17419 + }, + { + "epoch": 2.8673588347437287, + "grad_norm": 0.3586742226160919, + "learning_rate": 3.5287945808885602e-06, + "loss": 0.4249, + "step": 17420 + }, + { + "epoch": 2.867523418962845, + "grad_norm": 0.3134723949749536, + "learning_rate": 3.5283356007195267e-06, + "loss": 0.4231, + "step": 17421 + }, + { + "epoch": 2.8676880031819616, + "grad_norm": 0.2842395584626507, + "learning_rate": 3.5278766311529083e-06, + "loss": 0.4224, + "step": 17422 + }, + { + "epoch": 2.867852587401078, + "grad_norm": 0.5224422130616279, + "learning_rate": 3.527417672193709e-06, + "loss": 0.4343, + "step": 17423 + }, + { + "epoch": 2.8680171716201945, + "grad_norm": 0.8431668915744212, + "learning_rate": 3.526958723846942e-06, + "loss": 0.4366, + "step": 17424 + }, + { + "epoch": 2.868181755839311, + "grad_norm": 0.30047647153435625, + "learning_rate": 3.5264997861176117e-06, + "loss": 0.4433, + "step": 17425 + }, + { + "epoch": 2.8683463400584275, + "grad_norm": 0.3399635986036373, + "learning_rate": 3.526040859010729e-06, + "loss": 0.426, + "step": 17426 + }, + { + "epoch": 2.868510924277544, + "grad_norm": 0.34800450958763834, + "learning_rate": 3.525581942531298e-06, + "loss": 0.4311, + "step": 17427 + }, + { + "epoch": 2.8686755084966604, + "grad_norm": 0.35944897049779106, + "learning_rate": 3.5251230366843294e-06, + "loss": 0.4309, + "step": 17428 + }, + { + "epoch": 2.868840092715777, + "grad_norm": 0.36495359198570226, + "learning_rate": 3.5246641414748273e-06, + "loss": 0.4274, + "step": 17429 + }, + { + "epoch": 2.8690046769348934, + "grad_norm": 0.34425709534357957, + "learning_rate": 3.524205256907803e-06, + "loss": 0.4272, + "step": 17430 + }, + { + "epoch": 2.86916926115401, + "grad_norm": 0.621363614941926, + "learning_rate": 3.52374638298826e-06, + "loss": 0.4213, + "step": 17431 + }, + { + "epoch": 2.8693338453731263, + "grad_norm": 0.33192276900525747, + "learning_rate": 3.523287519721207e-06, + "loss": 0.4098, + "step": 17432 + }, + { + "epoch": 2.8694984295922428, + "grad_norm": 0.3357576294843523, + "learning_rate": 3.522828667111653e-06, + "loss": 0.4359, + "step": 17433 + }, + { + "epoch": 2.8696630138113592, + "grad_norm": 0.3811141647375937, + "learning_rate": 3.5223698251645993e-06, + "loss": 0.4202, + "step": 17434 + }, + { + "epoch": 2.8698275980304757, + "grad_norm": 0.2769062285422329, + "learning_rate": 3.521910993885058e-06, + "loss": 0.4435, + "step": 17435 + }, + { + "epoch": 2.869992182249592, + "grad_norm": 0.3068595168744032, + "learning_rate": 3.5214521732780315e-06, + "loss": 0.4417, + "step": 17436 + }, + { + "epoch": 2.8701567664687087, + "grad_norm": 0.3188531411080381, + "learning_rate": 3.5209933633485308e-06, + "loss": 0.4268, + "step": 17437 + }, + { + "epoch": 2.870321350687825, + "grad_norm": 0.36896820242208617, + "learning_rate": 3.5205345641015593e-06, + "loss": 0.4448, + "step": 17438 + }, + { + "epoch": 2.8704859349069416, + "grad_norm": 0.39656766242190944, + "learning_rate": 3.520075775542123e-06, + "loss": 0.4211, + "step": 17439 + }, + { + "epoch": 2.870650519126058, + "grad_norm": 0.32019722022746766, + "learning_rate": 3.519616997675229e-06, + "loss": 0.4338, + "step": 17440 + }, + { + "epoch": 2.870815103345174, + "grad_norm": 0.2839626543679687, + "learning_rate": 3.519158230505882e-06, + "loss": 0.4261, + "step": 17441 + }, + { + "epoch": 2.8709796875642906, + "grad_norm": 0.3489103217881381, + "learning_rate": 3.5186994740390904e-06, + "loss": 0.4542, + "step": 17442 + }, + { + "epoch": 2.871144271783407, + "grad_norm": 0.38422954466838666, + "learning_rate": 3.5182407282798575e-06, + "loss": 0.4119, + "step": 17443 + }, + { + "epoch": 2.8713088560025235, + "grad_norm": 0.46146800085205075, + "learning_rate": 3.5177819932331905e-06, + "loss": 0.4263, + "step": 17444 + }, + { + "epoch": 2.87147344022164, + "grad_norm": 0.4116554893544578, + "learning_rate": 3.517323268904096e-06, + "loss": 0.4317, + "step": 17445 + }, + { + "epoch": 2.8716380244407564, + "grad_norm": 0.44247422368616773, + "learning_rate": 3.5168645552975754e-06, + "loss": 0.4339, + "step": 17446 + }, + { + "epoch": 2.871802608659873, + "grad_norm": 0.25576977603313145, + "learning_rate": 3.516405852418638e-06, + "loss": 0.4011, + "step": 17447 + }, + { + "epoch": 2.8719671928789894, + "grad_norm": 0.3409850047465722, + "learning_rate": 3.5159471602722856e-06, + "loss": 0.4195, + "step": 17448 + }, + { + "epoch": 2.872131777098106, + "grad_norm": 0.3528487025903805, + "learning_rate": 3.5154884788635263e-06, + "loss": 0.4265, + "step": 17449 + }, + { + "epoch": 2.8722963613172223, + "grad_norm": 0.9376489113719304, + "learning_rate": 3.515029808197363e-06, + "loss": 0.4388, + "step": 17450 + }, + { + "epoch": 2.872460945536339, + "grad_norm": 0.26099699964732215, + "learning_rate": 3.5145711482788023e-06, + "loss": 0.4194, + "step": 17451 + }, + { + "epoch": 2.8726255297554553, + "grad_norm": 0.38026553404402125, + "learning_rate": 3.5141124991128477e-06, + "loss": 0.4176, + "step": 17452 + }, + { + "epoch": 2.8727901139745717, + "grad_norm": 0.29526089747718154, + "learning_rate": 3.5136538607045034e-06, + "loss": 0.4332, + "step": 17453 + }, + { + "epoch": 2.872954698193688, + "grad_norm": 0.30887571600772057, + "learning_rate": 3.513195233058776e-06, + "loss": 0.4357, + "step": 17454 + }, + { + "epoch": 2.8731192824128047, + "grad_norm": 0.344099036251958, + "learning_rate": 3.5127366161806655e-06, + "loss": 0.4215, + "step": 17455 + }, + { + "epoch": 2.873283866631921, + "grad_norm": 0.4456054969025151, + "learning_rate": 3.512278010075182e-06, + "loss": 0.4396, + "step": 17456 + }, + { + "epoch": 2.8734484508510376, + "grad_norm": 0.4540551366967078, + "learning_rate": 3.511819414747325e-06, + "loss": 0.4468, + "step": 17457 + }, + { + "epoch": 2.873613035070154, + "grad_norm": 0.3225405214185915, + "learning_rate": 3.511360830202101e-06, + "loss": 0.4459, + "step": 17458 + }, + { + "epoch": 2.8737776192892706, + "grad_norm": 0.261252551838841, + "learning_rate": 3.5109022564445134e-06, + "loss": 0.428, + "step": 17459 + }, + { + "epoch": 2.873942203508387, + "grad_norm": 0.34881152820244593, + "learning_rate": 3.510443693479564e-06, + "loss": 0.4519, + "step": 17460 + }, + { + "epoch": 2.8741067877275035, + "grad_norm": 0.3918089656304943, + "learning_rate": 3.5099851413122596e-06, + "loss": 0.4387, + "step": 17461 + }, + { + "epoch": 2.87427137194662, + "grad_norm": 0.39959836359225365, + "learning_rate": 3.509526599947601e-06, + "loss": 0.4246, + "step": 17462 + }, + { + "epoch": 2.8744359561657364, + "grad_norm": 0.33673458398190126, + "learning_rate": 3.5090680693905937e-06, + "loss": 0.432, + "step": 17463 + }, + { + "epoch": 2.874600540384853, + "grad_norm": 0.2979815248415507, + "learning_rate": 3.50860954964624e-06, + "loss": 0.4131, + "step": 17464 + }, + { + "epoch": 2.8747651246039694, + "grad_norm": 0.39709378080258345, + "learning_rate": 3.5081510407195443e-06, + "loss": 0.4392, + "step": 17465 + }, + { + "epoch": 2.8749297088230854, + "grad_norm": 0.313235605852531, + "learning_rate": 3.5076925426155077e-06, + "loss": 0.4301, + "step": 17466 + }, + { + "epoch": 2.875094293042202, + "grad_norm": 0.3350202953389066, + "learning_rate": 3.507234055339133e-06, + "loss": 0.4333, + "step": 17467 + }, + { + "epoch": 2.8752588772613183, + "grad_norm": 0.3393102073674103, + "learning_rate": 3.506775578895426e-06, + "loss": 0.4146, + "step": 17468 + }, + { + "epoch": 2.875423461480435, + "grad_norm": 0.3597545770156154, + "learning_rate": 3.506317113289385e-06, + "loss": 0.4253, + "step": 17469 + }, + { + "epoch": 2.8755880456995513, + "grad_norm": 0.42835161910731306, + "learning_rate": 3.505858658526017e-06, + "loss": 0.4154, + "step": 17470 + }, + { + "epoch": 2.8757526299186678, + "grad_norm": 0.33497983250833635, + "learning_rate": 3.5054002146103216e-06, + "loss": 0.4248, + "step": 17471 + }, + { + "epoch": 2.8759172141377842, + "grad_norm": 0.32949795625342293, + "learning_rate": 3.5049417815473027e-06, + "loss": 0.4187, + "step": 17472 + }, + { + "epoch": 2.8760817983569007, + "grad_norm": 0.32898726958386537, + "learning_rate": 3.504483359341961e-06, + "loss": 0.4363, + "step": 17473 + }, + { + "epoch": 2.876246382576017, + "grad_norm": 0.4864876133253663, + "learning_rate": 3.504024947999301e-06, + "loss": 0.444, + "step": 17474 + }, + { + "epoch": 2.8764109667951336, + "grad_norm": 0.3015309938868187, + "learning_rate": 3.5035665475243237e-06, + "loss": 0.4062, + "step": 17475 + }, + { + "epoch": 2.87657555101425, + "grad_norm": 0.278539466113373, + "learning_rate": 3.503108157922029e-06, + "loss": 0.3998, + "step": 17476 + }, + { + "epoch": 2.8767401352333666, + "grad_norm": 0.3085679687824653, + "learning_rate": 3.502649779197421e-06, + "loss": 0.4325, + "step": 17477 + }, + { + "epoch": 2.876904719452483, + "grad_norm": 0.2981762361349608, + "learning_rate": 3.5021914113554993e-06, + "loss": 0.4345, + "step": 17478 + }, + { + "epoch": 2.8770693036715995, + "grad_norm": 0.42018455478878153, + "learning_rate": 3.5017330544012684e-06, + "loss": 0.4417, + "step": 17479 + }, + { + "epoch": 2.877233887890716, + "grad_norm": 0.3194290282093159, + "learning_rate": 3.501274708339727e-06, + "loss": 0.4217, + "step": 17480 + }, + { + "epoch": 2.8773984721098325, + "grad_norm": 0.2961955978280596, + "learning_rate": 3.500816373175878e-06, + "loss": 0.4146, + "step": 17481 + }, + { + "epoch": 2.877563056328949, + "grad_norm": 0.35320320042854264, + "learning_rate": 3.5003580489147217e-06, + "loss": 0.4167, + "step": 17482 + }, + { + "epoch": 2.8777276405480654, + "grad_norm": 0.2914516303858204, + "learning_rate": 3.499899735561259e-06, + "loss": 0.4225, + "step": 17483 + }, + { + "epoch": 2.877892224767182, + "grad_norm": 0.5742647961884108, + "learning_rate": 3.4994414331204927e-06, + "loss": 0.4424, + "step": 17484 + }, + { + "epoch": 2.8780568089862983, + "grad_norm": 0.47125135149148517, + "learning_rate": 3.4989831415974196e-06, + "loss": 0.4127, + "step": 17485 + }, + { + "epoch": 2.878221393205415, + "grad_norm": 0.277633921305222, + "learning_rate": 3.498524860997046e-06, + "loss": 0.4379, + "step": 17486 + }, + { + "epoch": 2.8783859774245313, + "grad_norm": 0.35444514695476276, + "learning_rate": 3.498066591324366e-06, + "loss": 0.415, + "step": 17487 + }, + { + "epoch": 2.8785505616436478, + "grad_norm": 0.3702431165102328, + "learning_rate": 3.4976083325843866e-06, + "loss": 0.4407, + "step": 17488 + }, + { + "epoch": 2.8787151458627642, + "grad_norm": 0.36745646953654393, + "learning_rate": 3.4971500847821044e-06, + "loss": 0.427, + "step": 17489 + }, + { + "epoch": 2.8788797300818807, + "grad_norm": 0.3012571879088693, + "learning_rate": 3.4966918479225185e-06, + "loss": 0.4189, + "step": 17490 + }, + { + "epoch": 2.879044314300997, + "grad_norm": 0.3522602230756812, + "learning_rate": 3.4962336220106322e-06, + "loss": 0.4279, + "step": 17491 + }, + { + "epoch": 2.8792088985201136, + "grad_norm": 0.47725249684852394, + "learning_rate": 3.495775407051443e-06, + "loss": 0.4388, + "step": 17492 + }, + { + "epoch": 2.87937348273923, + "grad_norm": 0.3846884788711646, + "learning_rate": 3.495317203049953e-06, + "loss": 0.4184, + "step": 17493 + }, + { + "epoch": 2.8795380669583466, + "grad_norm": 0.4336049163299836, + "learning_rate": 3.494859010011159e-06, + "loss": 0.441, + "step": 17494 + }, + { + "epoch": 2.879702651177463, + "grad_norm": 0.35240779292563884, + "learning_rate": 3.494400827940064e-06, + "loss": 0.4257, + "step": 17495 + }, + { + "epoch": 2.8798672353965795, + "grad_norm": 0.3353048719207727, + "learning_rate": 3.4939426568416664e-06, + "loss": 0.4356, + "step": 17496 + }, + { + "epoch": 2.880031819615696, + "grad_norm": 0.41796988582902395, + "learning_rate": 3.4934844967209633e-06, + "loss": 0.42, + "step": 17497 + }, + { + "epoch": 2.8801964038348125, + "grad_norm": 0.33536615585729473, + "learning_rate": 3.493026347582957e-06, + "loss": 0.4173, + "step": 17498 + }, + { + "epoch": 2.880360988053929, + "grad_norm": 0.3332212640094779, + "learning_rate": 3.4925682094326437e-06, + "loss": 0.416, + "step": 17499 + }, + { + "epoch": 2.8805255722730454, + "grad_norm": 0.3471955782673057, + "learning_rate": 3.492110082275025e-06, + "loss": 0.4401, + "step": 17500 + }, + { + "epoch": 2.880690156492162, + "grad_norm": 0.3282416402822312, + "learning_rate": 3.4916519661150986e-06, + "loss": 0.4458, + "step": 17501 + }, + { + "epoch": 2.8808547407112783, + "grad_norm": 0.36896467746199585, + "learning_rate": 3.4911938609578637e-06, + "loss": 0.443, + "step": 17502 + }, + { + "epoch": 2.881019324930395, + "grad_norm": 0.3332197037487723, + "learning_rate": 3.4907357668083204e-06, + "loss": 0.4565, + "step": 17503 + }, + { + "epoch": 2.8811839091495113, + "grad_norm": 0.34050078360924096, + "learning_rate": 3.490277683671462e-06, + "loss": 0.4408, + "step": 17504 + }, + { + "epoch": 2.8813484933686278, + "grad_norm": 0.3992178710229831, + "learning_rate": 3.489819611552294e-06, + "loss": 0.4526, + "step": 17505 + }, + { + "epoch": 2.881513077587744, + "grad_norm": 0.2679678739319319, + "learning_rate": 3.489361550455808e-06, + "loss": 0.4292, + "step": 17506 + }, + { + "epoch": 2.8816776618068607, + "grad_norm": 0.3198761892317233, + "learning_rate": 3.4889035003870087e-06, + "loss": 0.4267, + "step": 17507 + }, + { + "epoch": 2.8818422460259767, + "grad_norm": 0.2830674041493437, + "learning_rate": 3.4884454613508885e-06, + "loss": 0.4211, + "step": 17508 + }, + { + "epoch": 2.882006830245093, + "grad_norm": 0.416042072950869, + "learning_rate": 3.487987433352449e-06, + "loss": 0.4309, + "step": 17509 + }, + { + "epoch": 2.8821714144642097, + "grad_norm": 0.516522262405375, + "learning_rate": 3.4875294163966868e-06, + "loss": 0.4154, + "step": 17510 + }, + { + "epoch": 2.882335998683326, + "grad_norm": 0.49139703605895474, + "learning_rate": 3.4870714104885977e-06, + "loss": 0.4338, + "step": 17511 + }, + { + "epoch": 2.8825005829024426, + "grad_norm": 0.2969184023413605, + "learning_rate": 3.4866134156331832e-06, + "loss": 0.4435, + "step": 17512 + }, + { + "epoch": 2.882665167121559, + "grad_norm": 0.3249885720627407, + "learning_rate": 3.4861554318354375e-06, + "loss": 0.4418, + "step": 17513 + }, + { + "epoch": 2.8828297513406755, + "grad_norm": 0.3798585206088055, + "learning_rate": 3.485697459100359e-06, + "loss": 0.4363, + "step": 17514 + }, + { + "epoch": 2.882994335559792, + "grad_norm": 0.2624652605932739, + "learning_rate": 3.4852394974329453e-06, + "loss": 0.4459, + "step": 17515 + }, + { + "epoch": 2.8831589197789085, + "grad_norm": 0.3741199036591475, + "learning_rate": 3.484781546838195e-06, + "loss": 0.4175, + "step": 17516 + }, + { + "epoch": 2.883323503998025, + "grad_norm": 0.28807914592528444, + "learning_rate": 3.4843236073210996e-06, + "loss": 0.4435, + "step": 17517 + }, + { + "epoch": 2.8834880882171414, + "grad_norm": 0.3991709257107399, + "learning_rate": 3.483865678886663e-06, + "loss": 0.4195, + "step": 17518 + }, + { + "epoch": 2.883652672436258, + "grad_norm": 0.2976102140387971, + "learning_rate": 3.483407761539878e-06, + "loss": 0.4226, + "step": 17519 + }, + { + "epoch": 2.8838172566553744, + "grad_norm": 0.29345802221273715, + "learning_rate": 3.4829498552857405e-06, + "loss": 0.4297, + "step": 17520 + }, + { + "epoch": 2.883981840874491, + "grad_norm": 0.3035537774606567, + "learning_rate": 3.4824919601292494e-06, + "loss": 0.4397, + "step": 17521 + }, + { + "epoch": 2.8841464250936073, + "grad_norm": 0.29332745685733963, + "learning_rate": 3.4820340760753984e-06, + "loss": 0.4377, + "step": 17522 + }, + { + "epoch": 2.8843110093127238, + "grad_norm": 0.34426693441979905, + "learning_rate": 3.4815762031291865e-06, + "loss": 0.4279, + "step": 17523 + }, + { + "epoch": 2.8844755935318402, + "grad_norm": 0.3380721839942416, + "learning_rate": 3.481118341295608e-06, + "loss": 0.4203, + "step": 17524 + }, + { + "epoch": 2.8846401777509567, + "grad_norm": 0.28653978493406196, + "learning_rate": 3.4806604905796602e-06, + "loss": 0.4092, + "step": 17525 + }, + { + "epoch": 2.884804761970073, + "grad_norm": 0.4618970250864827, + "learning_rate": 3.480202650986339e-06, + "loss": 0.4352, + "step": 17526 + }, + { + "epoch": 2.8849693461891897, + "grad_norm": 0.7745580166610236, + "learning_rate": 3.4797448225206386e-06, + "loss": 0.4404, + "step": 17527 + }, + { + "epoch": 2.885133930408306, + "grad_norm": 0.3572989624425346, + "learning_rate": 3.479287005187556e-06, + "loss": 0.4239, + "step": 17528 + }, + { + "epoch": 2.8852985146274226, + "grad_norm": 0.3958605014986573, + "learning_rate": 3.4788291989920847e-06, + "loss": 0.4419, + "step": 17529 + }, + { + "epoch": 2.885463098846539, + "grad_norm": 0.29876933676602424, + "learning_rate": 3.478371403939223e-06, + "loss": 0.4596, + "step": 17530 + }, + { + "epoch": 2.8856276830656555, + "grad_norm": 0.30567302667861224, + "learning_rate": 3.4779136200339635e-06, + "loss": 0.4351, + "step": 17531 + }, + { + "epoch": 2.885792267284772, + "grad_norm": 0.7991158892795251, + "learning_rate": 3.477455847281304e-06, + "loss": 0.4336, + "step": 17532 + }, + { + "epoch": 2.885956851503888, + "grad_norm": 0.2860365966641438, + "learning_rate": 3.476998085686238e-06, + "loss": 0.4166, + "step": 17533 + }, + { + "epoch": 2.8861214357230045, + "grad_norm": 0.35813329852227593, + "learning_rate": 3.47654033525376e-06, + "loss": 0.428, + "step": 17534 + }, + { + "epoch": 2.886286019942121, + "grad_norm": 0.340048960958515, + "learning_rate": 3.476082595988867e-06, + "loss": 0.4089, + "step": 17535 + }, + { + "epoch": 2.8864506041612374, + "grad_norm": 0.4793172775125557, + "learning_rate": 3.475624867896549e-06, + "loss": 0.4241, + "step": 17536 + }, + { + "epoch": 2.886615188380354, + "grad_norm": 0.3980190982195709, + "learning_rate": 3.475167150981807e-06, + "loss": 0.4445, + "step": 17537 + }, + { + "epoch": 2.8867797725994704, + "grad_norm": 0.28018392859825864, + "learning_rate": 3.4747094452496293e-06, + "loss": 0.4342, + "step": 17538 + }, + { + "epoch": 2.886944356818587, + "grad_norm": 0.2853823124582563, + "learning_rate": 3.474251750705015e-06, + "loss": 0.4237, + "step": 17539 + }, + { + "epoch": 2.8871089410377033, + "grad_norm": 0.36079531254173375, + "learning_rate": 3.473794067352956e-06, + "loss": 0.4208, + "step": 17540 + }, + { + "epoch": 2.88727352525682, + "grad_norm": 0.37046200754921405, + "learning_rate": 3.4733363951984453e-06, + "loss": 0.4247, + "step": 17541 + }, + { + "epoch": 2.8874381094759363, + "grad_norm": 0.3782141148710571, + "learning_rate": 3.472878734246479e-06, + "loss": 0.4253, + "step": 17542 + }, + { + "epoch": 2.8876026936950527, + "grad_norm": 0.2860690892595623, + "learning_rate": 3.472421084502049e-06, + "loss": 0.4166, + "step": 17543 + }, + { + "epoch": 2.887767277914169, + "grad_norm": 0.2638562180584719, + "learning_rate": 3.471963445970151e-06, + "loss": 0.4282, + "step": 17544 + }, + { + "epoch": 2.8879318621332857, + "grad_norm": 0.3075702618199297, + "learning_rate": 3.471505818655777e-06, + "loss": 0.4205, + "step": 17545 + }, + { + "epoch": 2.888096446352402, + "grad_norm": 0.3551562646617458, + "learning_rate": 3.471048202563923e-06, + "loss": 0.4259, + "step": 17546 + }, + { + "epoch": 2.8882610305715186, + "grad_norm": 0.30878179949248125, + "learning_rate": 3.470590597699579e-06, + "loss": 0.4219, + "step": 17547 + }, + { + "epoch": 2.888425614790635, + "grad_norm": 0.3359741753003401, + "learning_rate": 3.4701330040677384e-06, + "loss": 0.4353, + "step": 17548 + }, + { + "epoch": 2.8885901990097516, + "grad_norm": 1.0444794328678997, + "learning_rate": 3.4696754216733972e-06, + "loss": 0.4264, + "step": 17549 + }, + { + "epoch": 2.888754783228868, + "grad_norm": 0.32698250234672555, + "learning_rate": 3.4692178505215448e-06, + "loss": 0.4573, + "step": 17550 + }, + { + "epoch": 2.8889193674479845, + "grad_norm": 0.27411367522197744, + "learning_rate": 3.4687602906171773e-06, + "loss": 0.4268, + "step": 17551 + }, + { + "epoch": 2.889083951667101, + "grad_norm": 0.33172186293468775, + "learning_rate": 3.4683027419652847e-06, + "loss": 0.4443, + "step": 17552 + }, + { + "epoch": 2.8892485358862174, + "grad_norm": 0.3046771536424512, + "learning_rate": 3.4678452045708622e-06, + "loss": 0.4141, + "step": 17553 + }, + { + "epoch": 2.889413120105334, + "grad_norm": 0.36778006334965835, + "learning_rate": 3.4673876784389016e-06, + "loss": 0.4505, + "step": 17554 + }, + { + "epoch": 2.8895777043244504, + "grad_norm": 0.49063857055608223, + "learning_rate": 3.4669301635743913e-06, + "loss": 0.4283, + "step": 17555 + }, + { + "epoch": 2.889742288543567, + "grad_norm": 0.394256688057511, + "learning_rate": 3.46647265998233e-06, + "loss": 0.4507, + "step": 17556 + }, + { + "epoch": 2.8899068727626833, + "grad_norm": 0.30983342194561003, + "learning_rate": 3.4660151676677034e-06, + "loss": 0.4348, + "step": 17557 + }, + { + "epoch": 2.8900714569818, + "grad_norm": 0.30241903659440256, + "learning_rate": 3.4655576866355097e-06, + "loss": 0.4417, + "step": 17558 + }, + { + "epoch": 2.8902360412009163, + "grad_norm": 0.3153832356377743, + "learning_rate": 3.465100216890736e-06, + "loss": 0.4471, + "step": 17559 + }, + { + "epoch": 2.8904006254200327, + "grad_norm": 0.40039286031331656, + "learning_rate": 3.4646427584383765e-06, + "loss": 0.4565, + "step": 17560 + }, + { + "epoch": 2.890565209639149, + "grad_norm": 0.3497720118990314, + "learning_rate": 3.4641853112834214e-06, + "loss": 0.46, + "step": 17561 + }, + { + "epoch": 2.8907297938582657, + "grad_norm": 0.3215586088161052, + "learning_rate": 3.463727875430863e-06, + "loss": 0.4388, + "step": 17562 + }, + { + "epoch": 2.890894378077382, + "grad_norm": 0.2806799404627749, + "learning_rate": 3.463270450885693e-06, + "loss": 0.4062, + "step": 17563 + }, + { + "epoch": 2.8910589622964986, + "grad_norm": 0.30590924071024106, + "learning_rate": 3.462813037652901e-06, + "loss": 0.4415, + "step": 17564 + }, + { + "epoch": 2.891223546515615, + "grad_norm": 0.39795707215124965, + "learning_rate": 3.462355635737481e-06, + "loss": 0.4133, + "step": 17565 + }, + { + "epoch": 2.8913881307347316, + "grad_norm": 0.28841486979329806, + "learning_rate": 3.4618982451444197e-06, + "loss": 0.4262, + "step": 17566 + }, + { + "epoch": 2.891552714953848, + "grad_norm": 0.30530892538511867, + "learning_rate": 3.4614408658787127e-06, + "loss": 0.4272, + "step": 17567 + }, + { + "epoch": 2.8917172991729645, + "grad_norm": 0.6587955945287981, + "learning_rate": 3.4609834979453457e-06, + "loss": 0.4235, + "step": 17568 + }, + { + "epoch": 2.891881883392081, + "grad_norm": 0.3581837116766279, + "learning_rate": 3.4605261413493155e-06, + "loss": 0.4399, + "step": 17569 + }, + { + "epoch": 2.8920464676111974, + "grad_norm": 0.30612813779363107, + "learning_rate": 3.4600687960956076e-06, + "loss": 0.4445, + "step": 17570 + }, + { + "epoch": 2.892211051830314, + "grad_norm": 0.31405506424736673, + "learning_rate": 3.4596114621892133e-06, + "loss": 0.4457, + "step": 17571 + }, + { + "epoch": 2.8923756360494304, + "grad_norm": 0.2900030281499961, + "learning_rate": 3.4591541396351244e-06, + "loss": 0.4448, + "step": 17572 + }, + { + "epoch": 2.892540220268547, + "grad_norm": 0.28487788863551433, + "learning_rate": 3.4586968284383298e-06, + "loss": 0.4158, + "step": 17573 + }, + { + "epoch": 2.8927048044876633, + "grad_norm": 0.27083017128369313, + "learning_rate": 3.4582395286038197e-06, + "loss": 0.4343, + "step": 17574 + }, + { + "epoch": 2.8928693887067793, + "grad_norm": 0.326629990273933, + "learning_rate": 3.4577822401365838e-06, + "loss": 0.4274, + "step": 17575 + }, + { + "epoch": 2.893033972925896, + "grad_norm": 0.3504863651738228, + "learning_rate": 3.457324963041613e-06, + "loss": 0.4321, + "step": 17576 + }, + { + "epoch": 2.8931985571450123, + "grad_norm": 0.4574549399371932, + "learning_rate": 3.4568676973238973e-06, + "loss": 0.4291, + "step": 17577 + }, + { + "epoch": 2.8933631413641288, + "grad_norm": 0.3123392716219587, + "learning_rate": 3.4564104429884223e-06, + "loss": 0.415, + "step": 17578 + }, + { + "epoch": 2.8935277255832452, + "grad_norm": 0.3238830641887869, + "learning_rate": 3.4559532000401816e-06, + "loss": 0.4326, + "step": 17579 + }, + { + "epoch": 2.8936923098023617, + "grad_norm": 0.2935385519404049, + "learning_rate": 3.455495968484162e-06, + "loss": 0.4331, + "step": 17580 + }, + { + "epoch": 2.893856894021478, + "grad_norm": 0.33603634738461574, + "learning_rate": 3.4550387483253537e-06, + "loss": 0.4509, + "step": 17581 + }, + { + "epoch": 2.8940214782405946, + "grad_norm": 0.27276667451662073, + "learning_rate": 3.4545815395687453e-06, + "loss": 0.4247, + "step": 17582 + }, + { + "epoch": 2.894186062459711, + "grad_norm": 0.27054568096228415, + "learning_rate": 3.454124342219327e-06, + "loss": 0.4311, + "step": 17583 + }, + { + "epoch": 2.8943506466788276, + "grad_norm": 0.3317891914521292, + "learning_rate": 3.453667156282087e-06, + "loss": 0.4182, + "step": 17584 + }, + { + "epoch": 2.894515230897944, + "grad_norm": 0.29678551621684296, + "learning_rate": 3.4532099817620107e-06, + "loss": 0.4264, + "step": 17585 + }, + { + "epoch": 2.8946798151170605, + "grad_norm": 0.40450783480095226, + "learning_rate": 3.452752818664092e-06, + "loss": 0.4304, + "step": 17586 + }, + { + "epoch": 2.894844399336177, + "grad_norm": 0.2833720786401857, + "learning_rate": 3.452295666993314e-06, + "loss": 0.435, + "step": 17587 + }, + { + "epoch": 2.8950089835552935, + "grad_norm": 0.3042480121931492, + "learning_rate": 3.4518385267546703e-06, + "loss": 0.4325, + "step": 17588 + }, + { + "epoch": 2.89517356777441, + "grad_norm": 0.27890726007985217, + "learning_rate": 3.451381397953144e-06, + "loss": 0.4472, + "step": 17589 + }, + { + "epoch": 2.8953381519935264, + "grad_norm": 0.354090550441332, + "learning_rate": 3.450924280593727e-06, + "loss": 0.4144, + "step": 17590 + }, + { + "epoch": 2.895502736212643, + "grad_norm": 0.3311840660126875, + "learning_rate": 3.4504671746814054e-06, + "loss": 0.4328, + "step": 17591 + }, + { + "epoch": 2.8956673204317593, + "grad_norm": 0.3337723131257527, + "learning_rate": 3.450010080221166e-06, + "loss": 0.4356, + "step": 17592 + }, + { + "epoch": 2.895831904650876, + "grad_norm": 0.3563506569008164, + "learning_rate": 3.449552997217999e-06, + "loss": 0.4425, + "step": 17593 + }, + { + "epoch": 2.8959964888699923, + "grad_norm": 0.29006337535950466, + "learning_rate": 3.4490959256768887e-06, + "loss": 0.4301, + "step": 17594 + }, + { + "epoch": 2.8961610730891088, + "grad_norm": 0.4306552211887624, + "learning_rate": 3.448638865602825e-06, + "loss": 0.4371, + "step": 17595 + }, + { + "epoch": 2.896325657308225, + "grad_norm": 0.37960210912353, + "learning_rate": 3.4481818170007944e-06, + "loss": 0.4201, + "step": 17596 + }, + { + "epoch": 2.8964902415273417, + "grad_norm": 0.2869460692042597, + "learning_rate": 3.4477247798757857e-06, + "loss": 0.4372, + "step": 17597 + }, + { + "epoch": 2.896654825746458, + "grad_norm": 0.332782372252181, + "learning_rate": 3.447267754232781e-06, + "loss": 0.4449, + "step": 17598 + }, + { + "epoch": 2.8968194099655746, + "grad_norm": 0.41277560059229834, + "learning_rate": 3.446810740076774e-06, + "loss": 0.436, + "step": 17599 + }, + { + "epoch": 2.8969839941846907, + "grad_norm": 0.3226705898444896, + "learning_rate": 3.446353737412746e-06, + "loss": 0.4327, + "step": 17600 + }, + { + "epoch": 2.897148578403807, + "grad_norm": 0.3131374708035312, + "learning_rate": 3.445896746245685e-06, + "loss": 0.4289, + "step": 17601 + }, + { + "epoch": 2.8973131626229236, + "grad_norm": 0.38371779238723275, + "learning_rate": 3.4454397665805785e-06, + "loss": 0.4148, + "step": 17602 + }, + { + "epoch": 2.89747774684204, + "grad_norm": 0.3992085554809812, + "learning_rate": 3.444982798422412e-06, + "loss": 0.4241, + "step": 17603 + }, + { + "epoch": 2.8976423310611565, + "grad_norm": 0.27723506544946885, + "learning_rate": 3.4445258417761734e-06, + "loss": 0.4323, + "step": 17604 + }, + { + "epoch": 2.897806915280273, + "grad_norm": 0.47666218029781204, + "learning_rate": 3.444068896646846e-06, + "loss": 0.4318, + "step": 17605 + }, + { + "epoch": 2.8979714994993895, + "grad_norm": 0.30510120682304553, + "learning_rate": 3.4436119630394186e-06, + "loss": 0.4209, + "step": 17606 + }, + { + "epoch": 2.898136083718506, + "grad_norm": 0.4314918849748407, + "learning_rate": 3.4431550409588767e-06, + "loss": 0.4047, + "step": 17607 + }, + { + "epoch": 2.8983006679376224, + "grad_norm": 0.35357462801580614, + "learning_rate": 3.442698130410203e-06, + "loss": 0.4384, + "step": 17608 + }, + { + "epoch": 2.898465252156739, + "grad_norm": 0.30285283816494646, + "learning_rate": 3.4422412313983867e-06, + "loss": 0.4402, + "step": 17609 + }, + { + "epoch": 2.8986298363758554, + "grad_norm": 0.36947669740277606, + "learning_rate": 3.441784343928411e-06, + "loss": 0.4324, + "step": 17610 + }, + { + "epoch": 2.898794420594972, + "grad_norm": 0.3775632311292628, + "learning_rate": 3.441327468005263e-06, + "loss": 0.4165, + "step": 17611 + }, + { + "epoch": 2.8989590048140883, + "grad_norm": 0.34658287083305633, + "learning_rate": 3.4408706036339264e-06, + "loss": 0.4354, + "step": 17612 + }, + { + "epoch": 2.8991235890332048, + "grad_norm": 0.35313998235139205, + "learning_rate": 3.440413750819388e-06, + "loss": 0.4429, + "step": 17613 + }, + { + "epoch": 2.8992881732523212, + "grad_norm": 0.2952482822347777, + "learning_rate": 3.439956909566632e-06, + "loss": 0.4444, + "step": 17614 + }, + { + "epoch": 2.8994527574714377, + "grad_norm": 0.42786781905398574, + "learning_rate": 3.4395000798806418e-06, + "loss": 0.4234, + "step": 17615 + }, + { + "epoch": 2.899617341690554, + "grad_norm": 0.3311169314032787, + "learning_rate": 3.439043261766405e-06, + "loss": 0.416, + "step": 17616 + }, + { + "epoch": 2.8997819259096707, + "grad_norm": 0.33214432687721374, + "learning_rate": 3.4385864552289023e-06, + "loss": 0.4295, + "step": 17617 + }, + { + "epoch": 2.899946510128787, + "grad_norm": 0.469509524189476, + "learning_rate": 3.438129660273124e-06, + "loss": 0.4452, + "step": 17618 + }, + { + "epoch": 2.9001110943479036, + "grad_norm": 0.3206636648928445, + "learning_rate": 3.4376728769040476e-06, + "loss": 0.4426, + "step": 17619 + }, + { + "epoch": 2.90027567856702, + "grad_norm": 0.37333838540874276, + "learning_rate": 3.437216105126664e-06, + "loss": 0.4281, + "step": 17620 + }, + { + "epoch": 2.9004402627861365, + "grad_norm": 0.340705525024164, + "learning_rate": 3.4367593449459526e-06, + "loss": 0.4219, + "step": 17621 + }, + { + "epoch": 2.900604847005253, + "grad_norm": 0.3381100580160605, + "learning_rate": 3.4363025963668984e-06, + "loss": 0.4294, + "step": 17622 + }, + { + "epoch": 2.9007694312243695, + "grad_norm": 0.28531483631558896, + "learning_rate": 3.4358458593944868e-06, + "loss": 0.4275, + "step": 17623 + }, + { + "epoch": 2.900934015443486, + "grad_norm": 0.27056745382096664, + "learning_rate": 3.4353891340336987e-06, + "loss": 0.4127, + "step": 17624 + }, + { + "epoch": 2.9010985996626024, + "grad_norm": 0.29242580050975925, + "learning_rate": 3.434932420289521e-06, + "loss": 0.4216, + "step": 17625 + }, + { + "epoch": 2.901263183881719, + "grad_norm": 0.39910999046507917, + "learning_rate": 3.434475718166935e-06, + "loss": 0.4253, + "step": 17626 + }, + { + "epoch": 2.9014277681008354, + "grad_norm": 0.32521720295621315, + "learning_rate": 3.4340190276709265e-06, + "loss": 0.4128, + "step": 17627 + }, + { + "epoch": 2.901592352319952, + "grad_norm": 0.36843049245490517, + "learning_rate": 3.433562348806475e-06, + "loss": 0.4299, + "step": 17628 + }, + { + "epoch": 2.9017569365390683, + "grad_norm": 0.614088766386188, + "learning_rate": 3.4331056815785662e-06, + "loss": 0.4194, + "step": 17629 + }, + { + "epoch": 2.9019215207581848, + "grad_norm": 0.34208998169652544, + "learning_rate": 3.432649025992183e-06, + "loss": 0.4416, + "step": 17630 + }, + { + "epoch": 2.9020861049773012, + "grad_norm": 0.41863096666227234, + "learning_rate": 3.4321923820523056e-06, + "loss": 0.4309, + "step": 17631 + }, + { + "epoch": 2.9022506891964177, + "grad_norm": 0.3387098617077992, + "learning_rate": 3.4317357497639205e-06, + "loss": 0.4542, + "step": 17632 + }, + { + "epoch": 2.902415273415534, + "grad_norm": 0.3880538504682831, + "learning_rate": 3.431279129132007e-06, + "loss": 0.4242, + "step": 17633 + }, + { + "epoch": 2.9025798576346507, + "grad_norm": 0.29517302631006787, + "learning_rate": 3.430822520161551e-06, + "loss": 0.4431, + "step": 17634 + }, + { + "epoch": 2.902744441853767, + "grad_norm": 0.3217759709408525, + "learning_rate": 3.4303659228575337e-06, + "loss": 0.4526, + "step": 17635 + }, + { + "epoch": 2.9029090260728836, + "grad_norm": 0.5737184089042462, + "learning_rate": 3.4299093372249333e-06, + "loss": 0.4197, + "step": 17636 + }, + { + "epoch": 2.903073610292, + "grad_norm": 0.38688542526269387, + "learning_rate": 3.429452763268738e-06, + "loss": 0.4194, + "step": 17637 + }, + { + "epoch": 2.9032381945111165, + "grad_norm": 0.3002994499825206, + "learning_rate": 3.4289962009939244e-06, + "loss": 0.4142, + "step": 17638 + }, + { + "epoch": 2.903402778730233, + "grad_norm": 0.35474702447463446, + "learning_rate": 3.4285396504054792e-06, + "loss": 0.4275, + "step": 17639 + }, + { + "epoch": 2.9035673629493495, + "grad_norm": 0.2754523365816297, + "learning_rate": 3.42808311150838e-06, + "loss": 0.4247, + "step": 17640 + }, + { + "epoch": 2.9037319471684655, + "grad_norm": 0.30124553560901396, + "learning_rate": 3.4276265843076113e-06, + "loss": 0.4152, + "step": 17641 + }, + { + "epoch": 2.903896531387582, + "grad_norm": 0.4355278989267508, + "learning_rate": 3.4271700688081523e-06, + "loss": 0.4205, + "step": 17642 + }, + { + "epoch": 2.9040611156066984, + "grad_norm": 0.2938263585504233, + "learning_rate": 3.4267135650149863e-06, + "loss": 0.4264, + "step": 17643 + }, + { + "epoch": 2.904225699825815, + "grad_norm": 0.30681761907619126, + "learning_rate": 3.4262570729330934e-06, + "loss": 0.4337, + "step": 17644 + }, + { + "epoch": 2.9043902840449314, + "grad_norm": 0.387173245592773, + "learning_rate": 3.425800592567454e-06, + "loss": 0.4488, + "step": 17645 + }, + { + "epoch": 2.904554868264048, + "grad_norm": 0.3232799699547336, + "learning_rate": 3.42534412392305e-06, + "loss": 0.4482, + "step": 17646 + }, + { + "epoch": 2.9047194524831643, + "grad_norm": 0.27505508379676474, + "learning_rate": 3.4248876670048623e-06, + "loss": 0.4503, + "step": 17647 + }, + { + "epoch": 2.904884036702281, + "grad_norm": 0.3042903382131154, + "learning_rate": 3.424431221817872e-06, + "loss": 0.4219, + "step": 17648 + }, + { + "epoch": 2.9050486209213973, + "grad_norm": 0.40167340114884303, + "learning_rate": 3.423974788367057e-06, + "loss": 0.4319, + "step": 17649 + }, + { + "epoch": 2.9052132051405137, + "grad_norm": 0.40027143285984046, + "learning_rate": 3.423518366657402e-06, + "loss": 0.4273, + "step": 17650 + }, + { + "epoch": 2.90537778935963, + "grad_norm": 0.38083677010822076, + "learning_rate": 3.4230619566938843e-06, + "loss": 0.4543, + "step": 17651 + }, + { + "epoch": 2.9055423735787467, + "grad_norm": 0.3445738566986874, + "learning_rate": 3.4226055584814837e-06, + "loss": 0.4232, + "step": 17652 + }, + { + "epoch": 2.905706957797863, + "grad_norm": 0.2830924066386983, + "learning_rate": 3.4221491720251826e-06, + "loss": 0.4173, + "step": 17653 + }, + { + "epoch": 2.9058715420169796, + "grad_norm": 0.4947656362937295, + "learning_rate": 3.4216927973299577e-06, + "loss": 0.4471, + "step": 17654 + }, + { + "epoch": 2.906036126236096, + "grad_norm": 0.2769119444352562, + "learning_rate": 3.4212364344007925e-06, + "loss": 0.4312, + "step": 17655 + }, + { + "epoch": 2.9062007104552126, + "grad_norm": 0.6499934734448956, + "learning_rate": 3.420780083242663e-06, + "loss": 0.4367, + "step": 17656 + }, + { + "epoch": 2.906365294674329, + "grad_norm": 0.370733265460432, + "learning_rate": 3.4203237438605524e-06, + "loss": 0.431, + "step": 17657 + }, + { + "epoch": 2.9065298788934455, + "grad_norm": 0.3411392926466954, + "learning_rate": 3.4198674162594386e-06, + "loss": 0.4339, + "step": 17658 + }, + { + "epoch": 2.906694463112562, + "grad_norm": 0.39875733082792414, + "learning_rate": 3.4194111004442983e-06, + "loss": 0.4408, + "step": 17659 + }, + { + "epoch": 2.9068590473316784, + "grad_norm": 0.39200771850558425, + "learning_rate": 3.4189547964201145e-06, + "loss": 0.428, + "step": 17660 + }, + { + "epoch": 2.907023631550795, + "grad_norm": 0.2871409612914079, + "learning_rate": 3.4184985041918627e-06, + "loss": 0.42, + "step": 17661 + }, + { + "epoch": 2.9071882157699114, + "grad_norm": 0.3061717105592732, + "learning_rate": 3.4180422237645253e-06, + "loss": 0.4212, + "step": 17662 + }, + { + "epoch": 2.907352799989028, + "grad_norm": 0.2843559900515243, + "learning_rate": 3.417585955143078e-06, + "loss": 0.4528, + "step": 17663 + }, + { + "epoch": 2.9075173842081443, + "grad_norm": 0.3243487704921083, + "learning_rate": 3.417129698332502e-06, + "loss": 0.4382, + "step": 17664 + }, + { + "epoch": 2.907681968427261, + "grad_norm": 0.2980024565274041, + "learning_rate": 3.4166734533377747e-06, + "loss": 0.4441, + "step": 17665 + }, + { + "epoch": 2.907846552646377, + "grad_norm": 0.34459119960972434, + "learning_rate": 3.416217220163873e-06, + "loss": 0.4292, + "step": 17666 + }, + { + "epoch": 2.9080111368654933, + "grad_norm": 1.0791551102000445, + "learning_rate": 3.4157609988157784e-06, + "loss": 0.4346, + "step": 17667 + }, + { + "epoch": 2.9081757210846098, + "grad_norm": 0.6016048549628633, + "learning_rate": 3.415304789298464e-06, + "loss": 0.4227, + "step": 17668 + }, + { + "epoch": 2.9083403053037262, + "grad_norm": 0.43372572493792233, + "learning_rate": 3.414848591616914e-06, + "loss": 0.4334, + "step": 17669 + }, + { + "epoch": 2.9085048895228427, + "grad_norm": 0.3356910552899443, + "learning_rate": 3.4143924057761e-06, + "loss": 0.4142, + "step": 17670 + }, + { + "epoch": 2.908669473741959, + "grad_norm": 0.3306805036575712, + "learning_rate": 3.413936231781006e-06, + "loss": 0.4217, + "step": 17671 + }, + { + "epoch": 2.9088340579610756, + "grad_norm": 0.30701960353201796, + "learning_rate": 3.4134800696366054e-06, + "loss": 0.4342, + "step": 17672 + }, + { + "epoch": 2.908998642180192, + "grad_norm": 0.29094351464733453, + "learning_rate": 3.4130239193478754e-06, + "loss": 0.4628, + "step": 17673 + }, + { + "epoch": 2.9091632263993086, + "grad_norm": 0.3301904863774064, + "learning_rate": 3.412567780919796e-06, + "loss": 0.4084, + "step": 17674 + }, + { + "epoch": 2.909327810618425, + "grad_norm": 0.3566565100217812, + "learning_rate": 3.4121116543573416e-06, + "loss": 0.4111, + "step": 17675 + }, + { + "epoch": 2.9094923948375415, + "grad_norm": 0.266684336790093, + "learning_rate": 3.411655539665492e-06, + "loss": 0.4072, + "step": 17676 + }, + { + "epoch": 2.909656979056658, + "grad_norm": 0.29303718820543445, + "learning_rate": 3.411199436849221e-06, + "loss": 0.4292, + "step": 17677 + }, + { + "epoch": 2.9098215632757745, + "grad_norm": 0.32859176300301063, + "learning_rate": 3.4107433459135096e-06, + "loss": 0.422, + "step": 17678 + }, + { + "epoch": 2.909986147494891, + "grad_norm": 0.302610520053293, + "learning_rate": 3.4102872668633315e-06, + "loss": 0.4267, + "step": 17679 + }, + { + "epoch": 2.9101507317140074, + "grad_norm": 0.3279636996325862, + "learning_rate": 3.409831199703662e-06, + "loss": 0.431, + "step": 17680 + }, + { + "epoch": 2.910315315933124, + "grad_norm": 0.33009465835008517, + "learning_rate": 3.4093751444394813e-06, + "loss": 0.4268, + "step": 17681 + }, + { + "epoch": 2.9104799001522403, + "grad_norm": 0.366280287216141, + "learning_rate": 3.4089191010757623e-06, + "loss": 0.4109, + "step": 17682 + }, + { + "epoch": 2.910644484371357, + "grad_norm": 0.39484965699830255, + "learning_rate": 3.408463069617484e-06, + "loss": 0.4315, + "step": 17683 + }, + { + "epoch": 2.9108090685904733, + "grad_norm": 0.3019101335583522, + "learning_rate": 3.4080070500696195e-06, + "loss": 0.4227, + "step": 17684 + }, + { + "epoch": 2.9109736528095898, + "grad_norm": 0.4397943867074195, + "learning_rate": 3.4075510424371476e-06, + "loss": 0.4305, + "step": 17685 + }, + { + "epoch": 2.911138237028706, + "grad_norm": 0.3450226086647439, + "learning_rate": 3.407095046725042e-06, + "loss": 0.4217, + "step": 17686 + }, + { + "epoch": 2.9113028212478227, + "grad_norm": 0.5353065765995956, + "learning_rate": 3.4066390629382793e-06, + "loss": 0.4498, + "step": 17687 + }, + { + "epoch": 2.911467405466939, + "grad_norm": 0.37676124154671603, + "learning_rate": 3.406183091081836e-06, + "loss": 0.4175, + "step": 17688 + }, + { + "epoch": 2.9116319896860556, + "grad_norm": 0.37203745809563854, + "learning_rate": 3.405727131160684e-06, + "loss": 0.4295, + "step": 17689 + }, + { + "epoch": 2.911796573905172, + "grad_norm": 0.34226933420973427, + "learning_rate": 3.405271183179803e-06, + "loss": 0.4425, + "step": 17690 + }, + { + "epoch": 2.9119611581242886, + "grad_norm": 0.38383361264212845, + "learning_rate": 3.404815247144164e-06, + "loss": 0.43, + "step": 17691 + }, + { + "epoch": 2.912125742343405, + "grad_norm": 0.2692537578894957, + "learning_rate": 3.4043593230587454e-06, + "loss": 0.4179, + "step": 17692 + }, + { + "epoch": 2.9122903265625215, + "grad_norm": 0.38348850251716265, + "learning_rate": 3.4039034109285192e-06, + "loss": 0.4272, + "step": 17693 + }, + { + "epoch": 2.912454910781638, + "grad_norm": 0.2814115397311186, + "learning_rate": 3.4034475107584626e-06, + "loss": 0.4167, + "step": 17694 + }, + { + "epoch": 2.9126194950007545, + "grad_norm": 0.3396851526485684, + "learning_rate": 3.4029916225535498e-06, + "loss": 0.4293, + "step": 17695 + }, + { + "epoch": 2.912784079219871, + "grad_norm": 0.3057095780168015, + "learning_rate": 3.4025357463187524e-06, + "loss": 0.4347, + "step": 17696 + }, + { + "epoch": 2.9129486634389874, + "grad_norm": 0.5856334016361633, + "learning_rate": 3.4020798820590497e-06, + "loss": 0.4328, + "step": 17697 + }, + { + "epoch": 2.913113247658104, + "grad_norm": 0.3165345726469618, + "learning_rate": 3.4016240297794105e-06, + "loss": 0.4376, + "step": 17698 + }, + { + "epoch": 2.9132778318772203, + "grad_norm": 0.26790571564430643, + "learning_rate": 3.401168189484814e-06, + "loss": 0.4139, + "step": 17699 + }, + { + "epoch": 2.913442416096337, + "grad_norm": 0.35081641582732853, + "learning_rate": 3.4007123611802284e-06, + "loss": 0.4417, + "step": 17700 + }, + { + "epoch": 2.9136070003154533, + "grad_norm": 0.4507318909432953, + "learning_rate": 3.400256544870634e-06, + "loss": 0.4251, + "step": 17701 + }, + { + "epoch": 2.9137715845345697, + "grad_norm": 0.29400161745694303, + "learning_rate": 3.3998007405610003e-06, + "loss": 0.446, + "step": 17702 + }, + { + "epoch": 2.913936168753686, + "grad_norm": 0.3430924489995087, + "learning_rate": 3.399344948256301e-06, + "loss": 0.426, + "step": 17703 + }, + { + "epoch": 2.9141007529728027, + "grad_norm": 0.3507783651897055, + "learning_rate": 3.398889167961511e-06, + "loss": 0.425, + "step": 17704 + }, + { + "epoch": 2.914265337191919, + "grad_norm": 0.43556437810017934, + "learning_rate": 3.3984333996816024e-06, + "loss": 0.4478, + "step": 17705 + }, + { + "epoch": 2.9144299214110356, + "grad_norm": 0.3252357809987994, + "learning_rate": 3.39797764342155e-06, + "loss": 0.4286, + "step": 17706 + }, + { + "epoch": 2.914594505630152, + "grad_norm": 0.3040779954874744, + "learning_rate": 3.397521899186324e-06, + "loss": 0.4188, + "step": 17707 + }, + { + "epoch": 2.914759089849268, + "grad_norm": 0.34531608497152533, + "learning_rate": 3.3970661669809005e-06, + "loss": 0.4204, + "step": 17708 + }, + { + "epoch": 2.9149236740683846, + "grad_norm": 0.38340548767782345, + "learning_rate": 3.396610446810252e-06, + "loss": 0.4526, + "step": 17709 + }, + { + "epoch": 2.915088258287501, + "grad_norm": 0.31692373966980675, + "learning_rate": 3.3961547386793476e-06, + "loss": 0.4431, + "step": 17710 + }, + { + "epoch": 2.9152528425066175, + "grad_norm": 0.2927901325514057, + "learning_rate": 3.3956990425931637e-06, + "loss": 0.4157, + "step": 17711 + }, + { + "epoch": 2.915417426725734, + "grad_norm": 0.3381982550939023, + "learning_rate": 3.3952433585566697e-06, + "loss": 0.4346, + "step": 17712 + }, + { + "epoch": 2.9155820109448505, + "grad_norm": 0.3908240460861359, + "learning_rate": 3.39478768657484e-06, + "loss": 0.4323, + "step": 17713 + }, + { + "epoch": 2.915746595163967, + "grad_norm": 0.35719831437371163, + "learning_rate": 3.394332026652645e-06, + "loss": 0.4052, + "step": 17714 + }, + { + "epoch": 2.9159111793830834, + "grad_norm": 0.30052725251285, + "learning_rate": 3.393876378795059e-06, + "loss": 0.4373, + "step": 17715 + }, + { + "epoch": 2.9160757636022, + "grad_norm": 0.3120089814016768, + "learning_rate": 3.393420743007053e-06, + "loss": 0.433, + "step": 17716 + }, + { + "epoch": 2.9162403478213164, + "grad_norm": 0.48157706619602825, + "learning_rate": 3.3929651192935958e-06, + "loss": 0.4404, + "step": 17717 + }, + { + "epoch": 2.916404932040433, + "grad_norm": 0.34109414817190065, + "learning_rate": 3.3925095076596646e-06, + "loss": 0.4327, + "step": 17718 + }, + { + "epoch": 2.9165695162595493, + "grad_norm": 0.293311088739958, + "learning_rate": 3.392053908110224e-06, + "loss": 0.4034, + "step": 17719 + }, + { + "epoch": 2.9167341004786658, + "grad_norm": 0.3200744243515672, + "learning_rate": 3.391598320650252e-06, + "loss": 0.4451, + "step": 17720 + }, + { + "epoch": 2.9168986846977822, + "grad_norm": 0.2798542360586772, + "learning_rate": 3.3911427452847153e-06, + "loss": 0.4271, + "step": 17721 + }, + { + "epoch": 2.9170632689168987, + "grad_norm": 0.4470121931155287, + "learning_rate": 3.390687182018587e-06, + "loss": 0.441, + "step": 17722 + }, + { + "epoch": 2.917227853136015, + "grad_norm": 0.3077008837417923, + "learning_rate": 3.3902316308568364e-06, + "loss": 0.4371, + "step": 17723 + }, + { + "epoch": 2.9173924373551317, + "grad_norm": 0.3355025362900558, + "learning_rate": 3.389776091804437e-06, + "loss": 0.4223, + "step": 17724 + }, + { + "epoch": 2.917557021574248, + "grad_norm": 0.30686200898920396, + "learning_rate": 3.389320564866357e-06, + "loss": 0.4231, + "step": 17725 + }, + { + "epoch": 2.9177216057933646, + "grad_norm": 0.368801143123311, + "learning_rate": 3.388865050047567e-06, + "loss": 0.4262, + "step": 17726 + }, + { + "epoch": 2.917886190012481, + "grad_norm": 0.4914583171959461, + "learning_rate": 3.388409547353039e-06, + "loss": 0.4434, + "step": 17727 + }, + { + "epoch": 2.9180507742315975, + "grad_norm": 0.3669387457957941, + "learning_rate": 3.3879540567877417e-06, + "loss": 0.4469, + "step": 17728 + }, + { + "epoch": 2.918215358450714, + "grad_norm": 0.2725498437245764, + "learning_rate": 3.387498578356648e-06, + "loss": 0.4399, + "step": 17729 + }, + { + "epoch": 2.9183799426698305, + "grad_norm": 0.3033736328383057, + "learning_rate": 3.3870431120647223e-06, + "loss": 0.4277, + "step": 17730 + }, + { + "epoch": 2.918544526888947, + "grad_norm": 0.3208317703581786, + "learning_rate": 3.386587657916941e-06, + "loss": 0.4261, + "step": 17731 + }, + { + "epoch": 2.9187091111080634, + "grad_norm": 0.3267134398822339, + "learning_rate": 3.38613221591827e-06, + "loss": 0.4305, + "step": 17732 + }, + { + "epoch": 2.9188736953271794, + "grad_norm": 0.35091926586516287, + "learning_rate": 3.385676786073679e-06, + "loss": 0.4268, + "step": 17733 + }, + { + "epoch": 2.919038279546296, + "grad_norm": 0.40270745330681973, + "learning_rate": 3.385221368388139e-06, + "loss": 0.4418, + "step": 17734 + }, + { + "epoch": 2.9192028637654124, + "grad_norm": 0.511617782510701, + "learning_rate": 3.3847659628666174e-06, + "loss": 0.4323, + "step": 17735 + }, + { + "epoch": 2.919367447984529, + "grad_norm": 0.3435292141125187, + "learning_rate": 3.3843105695140855e-06, + "loss": 0.4325, + "step": 17736 + }, + { + "epoch": 2.9195320322036453, + "grad_norm": 0.726359205620102, + "learning_rate": 3.38385518833551e-06, + "loss": 0.4008, + "step": 17737 + }, + { + "epoch": 2.919696616422762, + "grad_norm": 0.49802888669458983, + "learning_rate": 3.3833998193358633e-06, + "loss": 0.4347, + "step": 17738 + }, + { + "epoch": 2.9198612006418783, + "grad_norm": 0.30764054482203373, + "learning_rate": 3.3829444625201125e-06, + "loss": 0.4294, + "step": 17739 + }, + { + "epoch": 2.9200257848609947, + "grad_norm": 0.3067873725877841, + "learning_rate": 3.3824891178932242e-06, + "loss": 0.4325, + "step": 17740 + }, + { + "epoch": 2.920190369080111, + "grad_norm": 0.32780206845639454, + "learning_rate": 3.382033785460169e-06, + "loss": 0.4433, + "step": 17741 + }, + { + "epoch": 2.9203549532992277, + "grad_norm": 0.4208820936167372, + "learning_rate": 3.3815784652259143e-06, + "loss": 0.4156, + "step": 17742 + }, + { + "epoch": 2.920519537518344, + "grad_norm": 0.34487559592914324, + "learning_rate": 3.38112315719543e-06, + "loss": 0.4259, + "step": 17743 + }, + { + "epoch": 2.9206841217374606, + "grad_norm": 0.33124493288463996, + "learning_rate": 3.380667861373683e-06, + "loss": 0.4227, + "step": 17744 + }, + { + "epoch": 2.920848705956577, + "grad_norm": 0.5375485315494445, + "learning_rate": 3.3802125777656416e-06, + "loss": 0.4336, + "step": 17745 + }, + { + "epoch": 2.9210132901756936, + "grad_norm": 0.3042149214419165, + "learning_rate": 3.379757306376274e-06, + "loss": 0.4223, + "step": 17746 + }, + { + "epoch": 2.92117787439481, + "grad_norm": 0.4076199164068452, + "learning_rate": 3.3793020472105464e-06, + "loss": 0.4299, + "step": 17747 + }, + { + "epoch": 2.9213424586139265, + "grad_norm": 0.2753391754673677, + "learning_rate": 3.3788468002734295e-06, + "loss": 0.4278, + "step": 17748 + }, + { + "epoch": 2.921507042833043, + "grad_norm": 0.42015172301237264, + "learning_rate": 3.3783915655698867e-06, + "loss": 0.453, + "step": 17749 + }, + { + "epoch": 2.9216716270521594, + "grad_norm": 0.3835463068552829, + "learning_rate": 3.3779363431048895e-06, + "loss": 0.4461, + "step": 17750 + }, + { + "epoch": 2.921836211271276, + "grad_norm": 0.3857933309307179, + "learning_rate": 3.3774811328834003e-06, + "loss": 0.4282, + "step": 17751 + }, + { + "epoch": 2.9220007954903924, + "grad_norm": 0.4795193717887788, + "learning_rate": 3.3770259349103926e-06, + "loss": 0.4299, + "step": 17752 + }, + { + "epoch": 2.922165379709509, + "grad_norm": 0.3049139503695512, + "learning_rate": 3.376570749190828e-06, + "loss": 0.4354, + "step": 17753 + }, + { + "epoch": 2.9223299639286253, + "grad_norm": 0.3519921407722308, + "learning_rate": 3.3761155757296744e-06, + "loss": 0.4414, + "step": 17754 + }, + { + "epoch": 2.922494548147742, + "grad_norm": 0.27790101218221924, + "learning_rate": 3.3756604145319e-06, + "loss": 0.4282, + "step": 17755 + }, + { + "epoch": 2.9226591323668583, + "grad_norm": 0.44352435070616175, + "learning_rate": 3.3752052656024694e-06, + "loss": 0.4174, + "step": 17756 + }, + { + "epoch": 2.9228237165859747, + "grad_norm": 0.7634773223040751, + "learning_rate": 3.374750128946351e-06, + "loss": 0.4345, + "step": 17757 + }, + { + "epoch": 2.922988300805091, + "grad_norm": 0.2984704720453994, + "learning_rate": 3.3742950045685096e-06, + "loss": 0.4211, + "step": 17758 + }, + { + "epoch": 2.9231528850242077, + "grad_norm": 0.2873456683663334, + "learning_rate": 3.3738398924739133e-06, + "loss": 0.4269, + "step": 17759 + }, + { + "epoch": 2.923317469243324, + "grad_norm": 0.2681318304864954, + "learning_rate": 3.373384792667526e-06, + "loss": 0.4168, + "step": 17760 + }, + { + "epoch": 2.9234820534624406, + "grad_norm": 0.31182455738571124, + "learning_rate": 3.3729297051543125e-06, + "loss": 0.4427, + "step": 17761 + }, + { + "epoch": 2.923646637681557, + "grad_norm": 0.38556683725634705, + "learning_rate": 3.3724746299392423e-06, + "loss": 0.4332, + "step": 17762 + }, + { + "epoch": 2.9238112219006736, + "grad_norm": 0.3109254497590433, + "learning_rate": 3.3720195670272774e-06, + "loss": 0.4176, + "step": 17763 + }, + { + "epoch": 2.92397580611979, + "grad_norm": 0.41052865934839455, + "learning_rate": 3.3715645164233857e-06, + "loss": 0.4197, + "step": 17764 + }, + { + "epoch": 2.9241403903389065, + "grad_norm": 0.47507018610702884, + "learning_rate": 3.3711094781325302e-06, + "loss": 0.4279, + "step": 17765 + }, + { + "epoch": 2.924304974558023, + "grad_norm": 0.3087772347515225, + "learning_rate": 3.3706544521596794e-06, + "loss": 0.4373, + "step": 17766 + }, + { + "epoch": 2.9244695587771394, + "grad_norm": 0.5240269329553043, + "learning_rate": 3.3701994385097955e-06, + "loss": 0.4133, + "step": 17767 + }, + { + "epoch": 2.924634142996256, + "grad_norm": 0.3167803647593282, + "learning_rate": 3.3697444371878446e-06, + "loss": 0.4282, + "step": 17768 + }, + { + "epoch": 2.9247987272153724, + "grad_norm": 0.32589636198064426, + "learning_rate": 3.369289448198793e-06, + "loss": 0.4122, + "step": 17769 + }, + { + "epoch": 2.924963311434489, + "grad_norm": 0.436712996183922, + "learning_rate": 3.368834471547601e-06, + "loss": 0.4315, + "step": 17770 + }, + { + "epoch": 2.9251278956536053, + "grad_norm": 0.36515264087424926, + "learning_rate": 3.3683795072392378e-06, + "loss": 0.4334, + "step": 17771 + }, + { + "epoch": 2.925292479872722, + "grad_norm": 0.33728393279336444, + "learning_rate": 3.3679245552786643e-06, + "loss": 0.441, + "step": 17772 + }, + { + "epoch": 2.9254570640918383, + "grad_norm": 0.3032828860887325, + "learning_rate": 3.367469615670847e-06, + "loss": 0.4386, + "step": 17773 + }, + { + "epoch": 2.9256216483109547, + "grad_norm": 0.37763499166331943, + "learning_rate": 3.367014688420749e-06, + "loss": 0.4214, + "step": 17774 + }, + { + "epoch": 2.9257862325300708, + "grad_norm": 0.2951330033560652, + "learning_rate": 3.366559773533335e-06, + "loss": 0.4445, + "step": 17775 + }, + { + "epoch": 2.9259508167491872, + "grad_norm": 0.30043150877186625, + "learning_rate": 3.366104871013568e-06, + "loss": 0.4342, + "step": 17776 + }, + { + "epoch": 2.9261154009683037, + "grad_norm": 0.40274738954194816, + "learning_rate": 3.3656499808664123e-06, + "loss": 0.4189, + "step": 17777 + }, + { + "epoch": 2.92627998518742, + "grad_norm": 0.3009201296585576, + "learning_rate": 3.365195103096832e-06, + "loss": 0.4267, + "step": 17778 + }, + { + "epoch": 2.9264445694065366, + "grad_norm": 0.2916867542924242, + "learning_rate": 3.3647402377097877e-06, + "loss": 0.4137, + "step": 17779 + }, + { + "epoch": 2.926609153625653, + "grad_norm": 0.259953729653548, + "learning_rate": 3.364285384710247e-06, + "loss": 0.4436, + "step": 17780 + }, + { + "epoch": 2.9267737378447696, + "grad_norm": 0.24759841138821312, + "learning_rate": 3.3638305441031683e-06, + "loss": 0.4248, + "step": 17781 + }, + { + "epoch": 2.926938322063886, + "grad_norm": 0.4603191646651383, + "learning_rate": 3.36337571589352e-06, + "loss": 0.4267, + "step": 17782 + }, + { + "epoch": 2.9271029062830025, + "grad_norm": 0.33841942087743543, + "learning_rate": 3.3629209000862623e-06, + "loss": 0.427, + "step": 17783 + }, + { + "epoch": 2.927267490502119, + "grad_norm": 0.2910413296679468, + "learning_rate": 3.362466096686356e-06, + "loss": 0.4187, + "step": 17784 + }, + { + "epoch": 2.9274320747212355, + "grad_norm": 0.3569167002034256, + "learning_rate": 3.362011305698767e-06, + "loss": 0.4152, + "step": 17785 + }, + { + "epoch": 2.927596658940352, + "grad_norm": 0.30999334016630953, + "learning_rate": 3.3615565271284556e-06, + "loss": 0.4454, + "step": 17786 + }, + { + "epoch": 2.9277612431594684, + "grad_norm": 0.30668375560823136, + "learning_rate": 3.3611017609803855e-06, + "loss": 0.4255, + "step": 17787 + }, + { + "epoch": 2.927925827378585, + "grad_norm": 0.3213329643906008, + "learning_rate": 3.360647007259518e-06, + "loss": 0.4311, + "step": 17788 + }, + { + "epoch": 2.9280904115977013, + "grad_norm": 0.39838423121582006, + "learning_rate": 3.3601922659708165e-06, + "loss": 0.4196, + "step": 17789 + }, + { + "epoch": 2.928254995816818, + "grad_norm": 0.4026760553620033, + "learning_rate": 3.3597375371192427e-06, + "loss": 0.445, + "step": 17790 + }, + { + "epoch": 2.9284195800359343, + "grad_norm": 0.29864249003243665, + "learning_rate": 3.359282820709756e-06, + "loss": 0.4314, + "step": 17791 + }, + { + "epoch": 2.9285841642550507, + "grad_norm": 0.3050868808958821, + "learning_rate": 3.3588281167473213e-06, + "loss": 0.436, + "step": 17792 + }, + { + "epoch": 2.928748748474167, + "grad_norm": 0.35953047262041005, + "learning_rate": 3.3583734252368973e-06, + "loss": 0.4484, + "step": 17793 + }, + { + "epoch": 2.9289133326932837, + "grad_norm": 0.3837045335016808, + "learning_rate": 3.3579187461834484e-06, + "loss": 0.4034, + "step": 17794 + }, + { + "epoch": 2.9290779169124, + "grad_norm": 0.280143497290015, + "learning_rate": 3.3574640795919327e-06, + "loss": 0.4285, + "step": 17795 + }, + { + "epoch": 2.9292425011315166, + "grad_norm": 0.3197785437707039, + "learning_rate": 3.357009425467314e-06, + "loss": 0.433, + "step": 17796 + }, + { + "epoch": 2.929407085350633, + "grad_norm": 0.32012188869001085, + "learning_rate": 3.356554783814553e-06, + "loss": 0.4198, + "step": 17797 + }, + { + "epoch": 2.9295716695697496, + "grad_norm": 0.3085764226696516, + "learning_rate": 3.356100154638608e-06, + "loss": 0.4484, + "step": 17798 + }, + { + "epoch": 2.929736253788866, + "grad_norm": 0.3337658242361266, + "learning_rate": 3.3556455379444443e-06, + "loss": 0.43, + "step": 17799 + }, + { + "epoch": 2.929900838007982, + "grad_norm": 0.3262390653566618, + "learning_rate": 3.3551909337370166e-06, + "loss": 0.4364, + "step": 17800 + }, + { + "epoch": 2.9300654222270985, + "grad_norm": 0.2862178416575866, + "learning_rate": 3.354736342021291e-06, + "loss": 0.4307, + "step": 17801 + }, + { + "epoch": 2.930230006446215, + "grad_norm": 0.28962583779258144, + "learning_rate": 3.354281762802224e-06, + "loss": 0.4298, + "step": 17802 + }, + { + "epoch": 2.9303945906653315, + "grad_norm": 0.3390162700678234, + "learning_rate": 3.3538271960847778e-06, + "loss": 0.422, + "step": 17803 + }, + { + "epoch": 2.930559174884448, + "grad_norm": 0.3559782541224994, + "learning_rate": 3.353372641873912e-06, + "loss": 0.4394, + "step": 17804 + }, + { + "epoch": 2.9307237591035644, + "grad_norm": 0.33601042460541813, + "learning_rate": 3.3529181001745855e-06, + "loss": 0.4316, + "step": 17805 + }, + { + "epoch": 2.930888343322681, + "grad_norm": 0.29696711292317896, + "learning_rate": 3.352463570991759e-06, + "loss": 0.428, + "step": 17806 + }, + { + "epoch": 2.9310529275417974, + "grad_norm": 0.3066111993945832, + "learning_rate": 3.352009054330392e-06, + "loss": 0.4359, + "step": 17807 + }, + { + "epoch": 2.931217511760914, + "grad_norm": 0.29821728742624687, + "learning_rate": 3.3515545501954443e-06, + "loss": 0.4345, + "step": 17808 + }, + { + "epoch": 2.9313820959800303, + "grad_norm": 0.32587962170500345, + "learning_rate": 3.351100058591875e-06, + "loss": 0.4319, + "step": 17809 + }, + { + "epoch": 2.9315466801991468, + "grad_norm": 0.41296420067737905, + "learning_rate": 3.350645579524644e-06, + "loss": 0.4125, + "step": 17810 + }, + { + "epoch": 2.9317112644182632, + "grad_norm": 0.40766325984073193, + "learning_rate": 3.350191112998708e-06, + "loss": 0.4543, + "step": 17811 + }, + { + "epoch": 2.9318758486373797, + "grad_norm": 0.34426257609135424, + "learning_rate": 3.3497366590190294e-06, + "loss": 0.4411, + "step": 17812 + }, + { + "epoch": 2.932040432856496, + "grad_norm": 0.35398131134009875, + "learning_rate": 3.349282217590566e-06, + "loss": 0.4278, + "step": 17813 + }, + { + "epoch": 2.9322050170756127, + "grad_norm": 0.260521569928367, + "learning_rate": 3.348827788718273e-06, + "loss": 0.4142, + "step": 17814 + }, + { + "epoch": 2.932369601294729, + "grad_norm": 0.31993887898910756, + "learning_rate": 3.3483733724071136e-06, + "loss": 0.4372, + "step": 17815 + }, + { + "epoch": 2.9325341855138456, + "grad_norm": 0.30420162588792804, + "learning_rate": 3.3479189686620434e-06, + "loss": 0.4063, + "step": 17816 + }, + { + "epoch": 2.932698769732962, + "grad_norm": 0.3112488967412044, + "learning_rate": 3.347464577488022e-06, + "loss": 0.4084, + "step": 17817 + }, + { + "epoch": 2.9328633539520785, + "grad_norm": 0.3019402613651006, + "learning_rate": 3.347010198890007e-06, + "loss": 0.4246, + "step": 17818 + }, + { + "epoch": 2.933027938171195, + "grad_norm": 0.2904121659476341, + "learning_rate": 3.3465558328729574e-06, + "loss": 0.4282, + "step": 17819 + }, + { + "epoch": 2.9331925223903115, + "grad_norm": 0.3680894362455315, + "learning_rate": 3.3461014794418306e-06, + "loss": 0.4292, + "step": 17820 + }, + { + "epoch": 2.933357106609428, + "grad_norm": 0.8192649715094241, + "learning_rate": 3.3456471386015826e-06, + "loss": 0.4157, + "step": 17821 + }, + { + "epoch": 2.9335216908285444, + "grad_norm": 0.2984160228076957, + "learning_rate": 3.345192810357173e-06, + "loss": 0.4317, + "step": 17822 + }, + { + "epoch": 2.933686275047661, + "grad_norm": 0.2975559190054634, + "learning_rate": 3.3447384947135584e-06, + "loss": 0.4226, + "step": 17823 + }, + { + "epoch": 2.9338508592667774, + "grad_norm": 0.2975348369606467, + "learning_rate": 3.3442841916756962e-06, + "loss": 0.4478, + "step": 17824 + }, + { + "epoch": 2.934015443485894, + "grad_norm": 0.26314404116089213, + "learning_rate": 3.3438299012485428e-06, + "loss": 0.4198, + "step": 17825 + }, + { + "epoch": 2.9341800277050103, + "grad_norm": 0.5471598419819609, + "learning_rate": 3.3433756234370574e-06, + "loss": 0.4293, + "step": 17826 + }, + { + "epoch": 2.9343446119241268, + "grad_norm": 0.27370109412400817, + "learning_rate": 3.342921358246195e-06, + "loss": 0.4321, + "step": 17827 + }, + { + "epoch": 2.9345091961432432, + "grad_norm": 0.3078652883814931, + "learning_rate": 3.342467105680913e-06, + "loss": 0.4292, + "step": 17828 + }, + { + "epoch": 2.9346737803623597, + "grad_norm": 0.37786786549615187, + "learning_rate": 3.342012865746169e-06, + "loss": 0.4358, + "step": 17829 + }, + { + "epoch": 2.934838364581476, + "grad_norm": 0.2570317047160048, + "learning_rate": 3.341558638446916e-06, + "loss": 0.4209, + "step": 17830 + }, + { + "epoch": 2.9350029488005926, + "grad_norm": 0.292775737045322, + "learning_rate": 3.3411044237881156e-06, + "loss": 0.4218, + "step": 17831 + }, + { + "epoch": 2.935167533019709, + "grad_norm": 0.3469717961034144, + "learning_rate": 3.3406502217747186e-06, + "loss": 0.4323, + "step": 17832 + }, + { + "epoch": 2.9353321172388256, + "grad_norm": 0.2748724519305954, + "learning_rate": 3.340196032411686e-06, + "loss": 0.4294, + "step": 17833 + }, + { + "epoch": 2.935496701457942, + "grad_norm": 0.34156871435154457, + "learning_rate": 3.339741855703971e-06, + "loss": 0.4181, + "step": 17834 + }, + { + "epoch": 2.9356612856770585, + "grad_norm": 0.35911621554021983, + "learning_rate": 3.3392876916565288e-06, + "loss": 0.4466, + "step": 17835 + }, + { + "epoch": 2.935825869896175, + "grad_norm": 0.3112423328817136, + "learning_rate": 3.3388335402743165e-06, + "loss": 0.4508, + "step": 17836 + }, + { + "epoch": 2.9359904541152915, + "grad_norm": 0.3101049266574248, + "learning_rate": 3.3383794015622886e-06, + "loss": 0.4365, + "step": 17837 + }, + { + "epoch": 2.936155038334408, + "grad_norm": 0.40839422295382893, + "learning_rate": 3.337925275525402e-06, + "loss": 0.4697, + "step": 17838 + }, + { + "epoch": 2.9363196225535244, + "grad_norm": 0.3001045748710366, + "learning_rate": 3.3374711621686104e-06, + "loss": 0.4264, + "step": 17839 + }, + { + "epoch": 2.936484206772641, + "grad_norm": 0.36535566306293327, + "learning_rate": 3.33701706149687e-06, + "loss": 0.4172, + "step": 17840 + }, + { + "epoch": 2.936648790991757, + "grad_norm": 0.3262169272894157, + "learning_rate": 3.336562973515136e-06, + "loss": 0.4328, + "step": 17841 + }, + { + "epoch": 2.9368133752108734, + "grad_norm": 0.3334451630590715, + "learning_rate": 3.3361088982283614e-06, + "loss": 0.4247, + "step": 17842 + }, + { + "epoch": 2.93697795942999, + "grad_norm": 0.3085853407263294, + "learning_rate": 3.3356548356415024e-06, + "loss": 0.4309, + "step": 17843 + }, + { + "epoch": 2.9371425436491063, + "grad_norm": 0.38830741910708716, + "learning_rate": 3.335200785759512e-06, + "loss": 0.4231, + "step": 17844 + }, + { + "epoch": 2.937307127868223, + "grad_norm": 0.40923246564507443, + "learning_rate": 3.3347467485873472e-06, + "loss": 0.426, + "step": 17845 + }, + { + "epoch": 2.9374717120873393, + "grad_norm": 0.31888854734326744, + "learning_rate": 3.3342927241299597e-06, + "loss": 0.4291, + "step": 17846 + }, + { + "epoch": 2.9376362963064557, + "grad_norm": 0.3555392981963307, + "learning_rate": 3.333838712392306e-06, + "loss": 0.4407, + "step": 17847 + }, + { + "epoch": 2.937800880525572, + "grad_norm": 0.27002953860896445, + "learning_rate": 3.333384713379337e-06, + "loss": 0.4179, + "step": 17848 + }, + { + "epoch": 2.9379654647446887, + "grad_norm": 0.31473988399066993, + "learning_rate": 3.3329307270960096e-06, + "loss": 0.4227, + "step": 17849 + }, + { + "epoch": 2.938130048963805, + "grad_norm": 0.29185727361215336, + "learning_rate": 3.332476753547278e-06, + "loss": 0.4283, + "step": 17850 + }, + { + "epoch": 2.9382946331829216, + "grad_norm": 0.28049103874445924, + "learning_rate": 3.332022792738091e-06, + "loss": 0.4286, + "step": 17851 + }, + { + "epoch": 2.938459217402038, + "grad_norm": 0.30212525955101455, + "learning_rate": 3.3315688446734078e-06, + "loss": 0.4362, + "step": 17852 + }, + { + "epoch": 2.9386238016211546, + "grad_norm": 0.2904014180261415, + "learning_rate": 3.331114909358177e-06, + "loss": 0.4406, + "step": 17853 + }, + { + "epoch": 2.938788385840271, + "grad_norm": 0.3172390058145223, + "learning_rate": 3.330660986797355e-06, + "loss": 0.405, + "step": 17854 + }, + { + "epoch": 2.9389529700593875, + "grad_norm": 0.2798833508296669, + "learning_rate": 3.330207076995893e-06, + "loss": 0.4175, + "step": 17855 + }, + { + "epoch": 2.939117554278504, + "grad_norm": 0.3491267780694839, + "learning_rate": 3.3297531799587447e-06, + "loss": 0.4351, + "step": 17856 + }, + { + "epoch": 2.9392821384976204, + "grad_norm": 0.2964701733388763, + "learning_rate": 3.3292992956908635e-06, + "loss": 0.4277, + "step": 17857 + }, + { + "epoch": 2.939446722716737, + "grad_norm": 0.3005906746028151, + "learning_rate": 3.3288454241971997e-06, + "loss": 0.4321, + "step": 17858 + }, + { + "epoch": 2.9396113069358534, + "grad_norm": 0.33130600309368236, + "learning_rate": 3.328391565482708e-06, + "loss": 0.413, + "step": 17859 + }, + { + "epoch": 2.93977589115497, + "grad_norm": 0.26528857819174023, + "learning_rate": 3.3279377195523392e-06, + "loss": 0.4358, + "step": 17860 + }, + { + "epoch": 2.9399404753740863, + "grad_norm": 0.28999419057896214, + "learning_rate": 3.3274838864110487e-06, + "loss": 0.4465, + "step": 17861 + }, + { + "epoch": 2.940105059593203, + "grad_norm": 0.6408457564674434, + "learning_rate": 3.3270300660637825e-06, + "loss": 0.4379, + "step": 17862 + }, + { + "epoch": 2.9402696438123193, + "grad_norm": 0.4237045146833924, + "learning_rate": 3.3265762585154993e-06, + "loss": 0.4219, + "step": 17863 + }, + { + "epoch": 2.9404342280314357, + "grad_norm": 0.2729461510661597, + "learning_rate": 3.3261224637711465e-06, + "loss": 0.4524, + "step": 17864 + }, + { + "epoch": 2.940598812250552, + "grad_norm": 0.3003957467857865, + "learning_rate": 3.325668681835676e-06, + "loss": 0.424, + "step": 17865 + }, + { + "epoch": 2.9407633964696682, + "grad_norm": 0.29451358615558454, + "learning_rate": 3.3252149127140414e-06, + "loss": 0.4476, + "step": 17866 + }, + { + "epoch": 2.9409279806887847, + "grad_norm": 0.31582228502791126, + "learning_rate": 3.3247611564111913e-06, + "loss": 0.438, + "step": 17867 + }, + { + "epoch": 2.941092564907901, + "grad_norm": 0.3113893632208832, + "learning_rate": 3.32430741293208e-06, + "loss": 0.4042, + "step": 17868 + }, + { + "epoch": 2.9412571491270176, + "grad_norm": 0.4684080838883566, + "learning_rate": 3.3238536822816554e-06, + "loss": 0.4273, + "step": 17869 + }, + { + "epoch": 2.941421733346134, + "grad_norm": 0.3282530409190831, + "learning_rate": 3.323399964464871e-06, + "loss": 0.4317, + "step": 17870 + }, + { + "epoch": 2.9415863175652506, + "grad_norm": 0.387300135890649, + "learning_rate": 3.3229462594866778e-06, + "loss": 0.4472, + "step": 17871 + }, + { + "epoch": 2.941750901784367, + "grad_norm": 0.3230209122635716, + "learning_rate": 3.3224925673520237e-06, + "loss": 0.4375, + "step": 17872 + }, + { + "epoch": 2.9419154860034835, + "grad_norm": 0.2884446232718566, + "learning_rate": 3.322038888065861e-06, + "loss": 0.4448, + "step": 17873 + }, + { + "epoch": 2.9420800702226, + "grad_norm": 0.37083274672382366, + "learning_rate": 3.3215852216331395e-06, + "loss": 0.4327, + "step": 17874 + }, + { + "epoch": 2.9422446544417165, + "grad_norm": 0.29977657531127405, + "learning_rate": 3.3211315680588108e-06, + "loss": 0.4318, + "step": 17875 + }, + { + "epoch": 2.942409238660833, + "grad_norm": 0.47841463020664227, + "learning_rate": 3.320677927347822e-06, + "loss": 0.4537, + "step": 17876 + }, + { + "epoch": 2.9425738228799494, + "grad_norm": 0.3629239245166473, + "learning_rate": 3.320224299505127e-06, + "loss": 0.4355, + "step": 17877 + }, + { + "epoch": 2.942738407099066, + "grad_norm": 0.4161272794584303, + "learning_rate": 3.3197706845356733e-06, + "loss": 0.4212, + "step": 17878 + }, + { + "epoch": 2.9429029913181823, + "grad_norm": 0.291344277246969, + "learning_rate": 3.31931708244441e-06, + "loss": 0.4333, + "step": 17879 + }, + { + "epoch": 2.943067575537299, + "grad_norm": 0.40246922396376766, + "learning_rate": 3.318863493236289e-06, + "loss": 0.4366, + "step": 17880 + }, + { + "epoch": 2.9432321597564153, + "grad_norm": 0.29864377914785467, + "learning_rate": 3.3184099169162553e-06, + "loss": 0.4286, + "step": 17881 + }, + { + "epoch": 2.9433967439755317, + "grad_norm": 0.27655697850060473, + "learning_rate": 3.3179563534892642e-06, + "loss": 0.4205, + "step": 17882 + }, + { + "epoch": 2.943561328194648, + "grad_norm": 0.2935291739535782, + "learning_rate": 3.3175028029602586e-06, + "loss": 0.4354, + "step": 17883 + }, + { + "epoch": 2.9437259124137647, + "grad_norm": 0.49612396605339865, + "learning_rate": 3.3170492653341935e-06, + "loss": 0.4144, + "step": 17884 + }, + { + "epoch": 2.943890496632881, + "grad_norm": 0.41181039529890884, + "learning_rate": 3.3165957406160135e-06, + "loss": 0.4503, + "step": 17885 + }, + { + "epoch": 2.9440550808519976, + "grad_norm": 0.34263661056886074, + "learning_rate": 3.316142228810668e-06, + "loss": 0.4118, + "step": 17886 + }, + { + "epoch": 2.944219665071114, + "grad_norm": 0.28397946210819824, + "learning_rate": 3.3156887299231066e-06, + "loss": 0.4209, + "step": 17887 + }, + { + "epoch": 2.9443842492902306, + "grad_norm": 0.3446468897435312, + "learning_rate": 3.315235243958276e-06, + "loss": 0.4275, + "step": 17888 + }, + { + "epoch": 2.944548833509347, + "grad_norm": 0.34994566616221195, + "learning_rate": 3.314781770921127e-06, + "loss": 0.4281, + "step": 17889 + }, + { + "epoch": 2.9447134177284635, + "grad_norm": 0.2673280268924159, + "learning_rate": 3.3143283108166047e-06, + "loss": 0.4245, + "step": 17890 + }, + { + "epoch": 2.94487800194758, + "grad_norm": 0.3052985036142152, + "learning_rate": 3.3138748636496613e-06, + "loss": 0.4355, + "step": 17891 + }, + { + "epoch": 2.9450425861666965, + "grad_norm": 0.24296991418516659, + "learning_rate": 3.3134214294252385e-06, + "loss": 0.4266, + "step": 17892 + }, + { + "epoch": 2.945207170385813, + "grad_norm": 0.348211095834571, + "learning_rate": 3.3129680081482908e-06, + "loss": 0.4502, + "step": 17893 + }, + { + "epoch": 2.9453717546049294, + "grad_norm": 0.3066715616668804, + "learning_rate": 3.3125145998237613e-06, + "loss": 0.4228, + "step": 17894 + }, + { + "epoch": 2.945536338824046, + "grad_norm": 0.7027922140802841, + "learning_rate": 3.312061204456598e-06, + "loss": 0.4426, + "step": 17895 + }, + { + "epoch": 2.9457009230431623, + "grad_norm": 0.35514336309042926, + "learning_rate": 3.3116078220517486e-06, + "loss": 0.4192, + "step": 17896 + }, + { + "epoch": 2.945865507262279, + "grad_norm": 0.2990787120046496, + "learning_rate": 3.311154452614161e-06, + "loss": 0.4526, + "step": 17897 + }, + { + "epoch": 2.9460300914813953, + "grad_norm": 0.3404325301953905, + "learning_rate": 3.3107010961487814e-06, + "loss": 0.4163, + "step": 17898 + }, + { + "epoch": 2.9461946757005117, + "grad_norm": 0.3363829382675295, + "learning_rate": 3.310247752660556e-06, + "loss": 0.4156, + "step": 17899 + }, + { + "epoch": 2.946359259919628, + "grad_norm": 0.3528096123152522, + "learning_rate": 3.309794422154433e-06, + "loss": 0.4366, + "step": 17900 + }, + { + "epoch": 2.9465238441387447, + "grad_norm": 0.45103200585706277, + "learning_rate": 3.3093411046353597e-06, + "loss": 0.4289, + "step": 17901 + }, + { + "epoch": 2.946688428357861, + "grad_norm": 0.25641087569932003, + "learning_rate": 3.308887800108278e-06, + "loss": 0.421, + "step": 17902 + }, + { + "epoch": 2.9468530125769776, + "grad_norm": 0.36216526002760185, + "learning_rate": 3.3084345085781404e-06, + "loss": 0.433, + "step": 17903 + }, + { + "epoch": 2.947017596796094, + "grad_norm": 0.3100472023891358, + "learning_rate": 3.3079812300498875e-06, + "loss": 0.4348, + "step": 17904 + }, + { + "epoch": 2.9471821810152106, + "grad_norm": 0.28062098127009283, + "learning_rate": 3.3075279645284693e-06, + "loss": 0.4502, + "step": 17905 + }, + { + "epoch": 2.947346765234327, + "grad_norm": 0.3299692058165166, + "learning_rate": 3.307074712018829e-06, + "loss": 0.4278, + "step": 17906 + }, + { + "epoch": 2.9475113494534435, + "grad_norm": 0.42246795002332643, + "learning_rate": 3.3066214725259136e-06, + "loss": 0.4224, + "step": 17907 + }, + { + "epoch": 2.9476759336725595, + "grad_norm": 0.275676822494741, + "learning_rate": 3.3061682460546688e-06, + "loss": 0.4379, + "step": 17908 + }, + { + "epoch": 2.947840517891676, + "grad_norm": 0.29540839746440317, + "learning_rate": 3.3057150326100385e-06, + "loss": 0.4214, + "step": 17909 + }, + { + "epoch": 2.9480051021107925, + "grad_norm": 0.31534615907655184, + "learning_rate": 3.305261832196971e-06, + "loss": 0.4332, + "step": 17910 + }, + { + "epoch": 2.948169686329909, + "grad_norm": 0.3277670198611158, + "learning_rate": 3.3048086448204074e-06, + "loss": 0.4238, + "step": 17911 + }, + { + "epoch": 2.9483342705490254, + "grad_norm": 0.3162074351131492, + "learning_rate": 3.304355470485297e-06, + "loss": 0.4091, + "step": 17912 + }, + { + "epoch": 2.948498854768142, + "grad_norm": 0.4291195573102557, + "learning_rate": 3.30390230919658e-06, + "loss": 0.4411, + "step": 17913 + }, + { + "epoch": 2.9486634389872584, + "grad_norm": 0.31681130856431333, + "learning_rate": 3.303449160959206e-06, + "loss": 0.4334, + "step": 17914 + }, + { + "epoch": 2.948828023206375, + "grad_norm": 0.31191226705513325, + "learning_rate": 3.302996025778117e-06, + "loss": 0.4234, + "step": 17915 + }, + { + "epoch": 2.9489926074254913, + "grad_norm": 0.29734890492769767, + "learning_rate": 3.3025429036582564e-06, + "loss": 0.4267, + "step": 17916 + }, + { + "epoch": 2.9491571916446078, + "grad_norm": 0.2899155809468505, + "learning_rate": 3.3020897946045703e-06, + "loss": 0.4282, + "step": 17917 + }, + { + "epoch": 2.9493217758637242, + "grad_norm": 0.25672904390104206, + "learning_rate": 3.301636698622002e-06, + "loss": 0.4406, + "step": 17918 + }, + { + "epoch": 2.9494863600828407, + "grad_norm": 0.2789684768772823, + "learning_rate": 3.301183615715496e-06, + "loss": 0.442, + "step": 17919 + }, + { + "epoch": 2.949650944301957, + "grad_norm": 0.2896431189484109, + "learning_rate": 3.300730545889996e-06, + "loss": 0.4393, + "step": 17920 + }, + { + "epoch": 2.9498155285210736, + "grad_norm": 0.3326451688102741, + "learning_rate": 3.3002774891504457e-06, + "loss": 0.4504, + "step": 17921 + }, + { + "epoch": 2.94998011274019, + "grad_norm": 0.3025192215294995, + "learning_rate": 3.2998244455017897e-06, + "loss": 0.4544, + "step": 17922 + }, + { + "epoch": 2.9501446969593066, + "grad_norm": 0.36848992028546734, + "learning_rate": 3.299371414948969e-06, + "loss": 0.4385, + "step": 17923 + }, + { + "epoch": 2.950309281178423, + "grad_norm": 0.27464068844369605, + "learning_rate": 3.298918397496929e-06, + "loss": 0.4181, + "step": 17924 + }, + { + "epoch": 2.9504738653975395, + "grad_norm": 0.3508843613547432, + "learning_rate": 3.2984653931506105e-06, + "loss": 0.4378, + "step": 17925 + }, + { + "epoch": 2.950638449616656, + "grad_norm": 0.3168887900492193, + "learning_rate": 3.29801240191496e-06, + "loss": 0.4072, + "step": 17926 + }, + { + "epoch": 2.9508030338357725, + "grad_norm": 0.36933439551737113, + "learning_rate": 3.2975594237949166e-06, + "loss": 0.4455, + "step": 17927 + }, + { + "epoch": 2.950967618054889, + "grad_norm": 0.32323328773774185, + "learning_rate": 3.2971064587954263e-06, + "loss": 0.4321, + "step": 17928 + }, + { + "epoch": 2.9511322022740054, + "grad_norm": 0.2852129816505733, + "learning_rate": 3.2966535069214315e-06, + "loss": 0.4272, + "step": 17929 + }, + { + "epoch": 2.951296786493122, + "grad_norm": 0.3360420850201866, + "learning_rate": 3.2962005681778704e-06, + "loss": 0.443, + "step": 17930 + }, + { + "epoch": 2.9514613707122384, + "grad_norm": 0.28600922843412036, + "learning_rate": 3.2957476425696905e-06, + "loss": 0.4447, + "step": 17931 + }, + { + "epoch": 2.951625954931355, + "grad_norm": 0.35561012211751764, + "learning_rate": 3.295294730101829e-06, + "loss": 0.4325, + "step": 17932 + }, + { + "epoch": 2.951790539150471, + "grad_norm": 0.3208730436976375, + "learning_rate": 3.2948418307792336e-06, + "loss": 0.4253, + "step": 17933 + }, + { + "epoch": 2.9519551233695873, + "grad_norm": 0.2722359535079783, + "learning_rate": 3.294388944606842e-06, + "loss": 0.4368, + "step": 17934 + }, + { + "epoch": 2.952119707588704, + "grad_norm": 0.2885866212922849, + "learning_rate": 3.293936071589597e-06, + "loss": 0.4176, + "step": 17935 + }, + { + "epoch": 2.9522842918078203, + "grad_norm": 0.4040311456081655, + "learning_rate": 3.2934832117324396e-06, + "loss": 0.4256, + "step": 17936 + }, + { + "epoch": 2.9524488760269367, + "grad_norm": 0.33857774276731406, + "learning_rate": 3.2930303650403133e-06, + "loss": 0.4594, + "step": 17937 + }, + { + "epoch": 2.952613460246053, + "grad_norm": 0.3265745780732682, + "learning_rate": 3.292577531518158e-06, + "loss": 0.4259, + "step": 17938 + }, + { + "epoch": 2.9527780444651697, + "grad_norm": 0.29969628802037535, + "learning_rate": 3.292124711170913e-06, + "loss": 0.4242, + "step": 17939 + }, + { + "epoch": 2.952942628684286, + "grad_norm": 0.27139539898638215, + "learning_rate": 3.2916719040035227e-06, + "loss": 0.4326, + "step": 17940 + }, + { + "epoch": 2.9531072129034026, + "grad_norm": 0.33381717418697626, + "learning_rate": 3.2912191100209247e-06, + "loss": 0.4323, + "step": 17941 + }, + { + "epoch": 2.953271797122519, + "grad_norm": 0.2919655671312646, + "learning_rate": 3.2907663292280632e-06, + "loss": 0.4084, + "step": 17942 + }, + { + "epoch": 2.9534363813416356, + "grad_norm": 0.3061555882408243, + "learning_rate": 3.290313561629875e-06, + "loss": 0.4326, + "step": 17943 + }, + { + "epoch": 2.953600965560752, + "grad_norm": 0.2730678949072666, + "learning_rate": 3.2898608072313046e-06, + "loss": 0.4271, + "step": 17944 + }, + { + "epoch": 2.9537655497798685, + "grad_norm": 0.25751069019917944, + "learning_rate": 3.28940806603729e-06, + "loss": 0.4044, + "step": 17945 + }, + { + "epoch": 2.953930133998985, + "grad_norm": 0.32760159845054954, + "learning_rate": 3.28895533805277e-06, + "loss": 0.4326, + "step": 17946 + }, + { + "epoch": 2.9540947182181014, + "grad_norm": 0.3229338862618099, + "learning_rate": 3.2885026232826865e-06, + "loss": 0.4296, + "step": 17947 + }, + { + "epoch": 2.954259302437218, + "grad_norm": 0.283933379779198, + "learning_rate": 3.2880499217319778e-06, + "loss": 0.4437, + "step": 17948 + }, + { + "epoch": 2.9544238866563344, + "grad_norm": 0.33073947677974724, + "learning_rate": 3.2875972334055866e-06, + "loss": 0.4251, + "step": 17949 + }, + { + "epoch": 2.954588470875451, + "grad_norm": 0.30371658072334495, + "learning_rate": 3.2871445583084486e-06, + "loss": 0.4361, + "step": 17950 + }, + { + "epoch": 2.9547530550945673, + "grad_norm": 0.34983625322813805, + "learning_rate": 3.2866918964455068e-06, + "loss": 0.4238, + "step": 17951 + }, + { + "epoch": 2.954917639313684, + "grad_norm": 0.2555862807040976, + "learning_rate": 3.2862392478216996e-06, + "loss": 0.4112, + "step": 17952 + }, + { + "epoch": 2.9550822235328003, + "grad_norm": 0.511680191622086, + "learning_rate": 3.2857866124419632e-06, + "loss": 0.4365, + "step": 17953 + }, + { + "epoch": 2.9552468077519167, + "grad_norm": 0.3025622763140895, + "learning_rate": 3.2853339903112397e-06, + "loss": 0.4468, + "step": 17954 + }, + { + "epoch": 2.955411391971033, + "grad_norm": 0.30488710406531333, + "learning_rate": 3.284881381434466e-06, + "loss": 0.4548, + "step": 17955 + }, + { + "epoch": 2.9555759761901497, + "grad_norm": 0.29399243678847514, + "learning_rate": 3.284428785816583e-06, + "loss": 0.4458, + "step": 17956 + }, + { + "epoch": 2.955740560409266, + "grad_norm": 0.2704503173871928, + "learning_rate": 3.283976203462527e-06, + "loss": 0.4337, + "step": 17957 + }, + { + "epoch": 2.9559051446283826, + "grad_norm": 0.36732907875717224, + "learning_rate": 3.2835236343772375e-06, + "loss": 0.4158, + "step": 17958 + }, + { + "epoch": 2.956069728847499, + "grad_norm": 0.3164426199884768, + "learning_rate": 3.283071078565653e-06, + "loss": 0.4329, + "step": 17959 + }, + { + "epoch": 2.9562343130666155, + "grad_norm": 0.31369984071288404, + "learning_rate": 3.2826185360327103e-06, + "loss": 0.4209, + "step": 17960 + }, + { + "epoch": 2.956398897285732, + "grad_norm": 0.3399865849521089, + "learning_rate": 3.2821660067833503e-06, + "loss": 0.4232, + "step": 17961 + }, + { + "epoch": 2.9565634815048485, + "grad_norm": 0.3107880242346844, + "learning_rate": 3.281713490822506e-06, + "loss": 0.4189, + "step": 17962 + }, + { + "epoch": 2.956728065723965, + "grad_norm": 0.41034059979693965, + "learning_rate": 3.2812609881551207e-06, + "loss": 0.4374, + "step": 17963 + }, + { + "epoch": 2.9568926499430814, + "grad_norm": 0.3564405810029044, + "learning_rate": 3.2808084987861263e-06, + "loss": 0.4211, + "step": 17964 + }, + { + "epoch": 2.957057234162198, + "grad_norm": 0.36128352218561705, + "learning_rate": 3.2803560227204654e-06, + "loss": 0.4399, + "step": 17965 + }, + { + "epoch": 2.9572218183813144, + "grad_norm": 0.48125988482566323, + "learning_rate": 3.279903559963072e-06, + "loss": 0.4336, + "step": 17966 + }, + { + "epoch": 2.957386402600431, + "grad_norm": 0.30767701112235857, + "learning_rate": 3.2794511105188835e-06, + "loss": 0.4245, + "step": 17967 + }, + { + "epoch": 2.9575509868195473, + "grad_norm": 0.3268399282048589, + "learning_rate": 3.2789986743928377e-06, + "loss": 0.4268, + "step": 17968 + }, + { + "epoch": 2.957715571038664, + "grad_norm": 0.2830963299254264, + "learning_rate": 3.2785462515898706e-06, + "loss": 0.4249, + "step": 17969 + }, + { + "epoch": 2.9578801552577803, + "grad_norm": 0.2991147151326696, + "learning_rate": 3.27809384211492e-06, + "loss": 0.4392, + "step": 17970 + }, + { + "epoch": 2.9580447394768967, + "grad_norm": 0.37204084254622605, + "learning_rate": 3.277641445972921e-06, + "loss": 0.4376, + "step": 17971 + }, + { + "epoch": 2.958209323696013, + "grad_norm": 0.2818280247339522, + "learning_rate": 3.2771890631688128e-06, + "loss": 0.4306, + "step": 17972 + }, + { + "epoch": 2.9583739079151297, + "grad_norm": 0.2951545466599934, + "learning_rate": 3.276736693707526e-06, + "loss": 0.4288, + "step": 17973 + }, + { + "epoch": 2.958538492134246, + "grad_norm": 0.4873851836731497, + "learning_rate": 3.276284337594004e-06, + "loss": 0.4191, + "step": 17974 + }, + { + "epoch": 2.958703076353362, + "grad_norm": 0.3930468198978656, + "learning_rate": 3.275831994833177e-06, + "loss": 0.4402, + "step": 17975 + }, + { + "epoch": 2.9588676605724786, + "grad_norm": 0.2764863576168517, + "learning_rate": 3.275379665429982e-06, + "loss": 0.4427, + "step": 17976 + }, + { + "epoch": 2.959032244791595, + "grad_norm": 0.3303930389333476, + "learning_rate": 3.2749273493893567e-06, + "loss": 0.4386, + "step": 17977 + }, + { + "epoch": 2.9591968290107116, + "grad_norm": 0.29647947734247887, + "learning_rate": 3.2744750467162338e-06, + "loss": 0.4325, + "step": 17978 + }, + { + "epoch": 2.959361413229828, + "grad_norm": 0.3180004148333902, + "learning_rate": 3.2740227574155516e-06, + "loss": 0.4111, + "step": 17979 + }, + { + "epoch": 2.9595259974489445, + "grad_norm": 0.7645019242291001, + "learning_rate": 3.273570481492242e-06, + "loss": 0.4312, + "step": 17980 + }, + { + "epoch": 2.959690581668061, + "grad_norm": 0.2773599141407237, + "learning_rate": 3.2731182189512437e-06, + "loss": 0.4186, + "step": 17981 + }, + { + "epoch": 2.9598551658871775, + "grad_norm": 0.32322410607835417, + "learning_rate": 3.2726659697974906e-06, + "loss": 0.4221, + "step": 17982 + }, + { + "epoch": 2.960019750106294, + "grad_norm": 0.3347967700591786, + "learning_rate": 3.2722137340359128e-06, + "loss": 0.4509, + "step": 17983 + }, + { + "epoch": 2.9601843343254104, + "grad_norm": 0.32349798325468926, + "learning_rate": 3.2717615116714523e-06, + "loss": 0.4223, + "step": 17984 + }, + { + "epoch": 2.960348918544527, + "grad_norm": 0.2557811785916594, + "learning_rate": 3.271309302709038e-06, + "loss": 0.4437, + "step": 17985 + }, + { + "epoch": 2.9605135027636433, + "grad_norm": 0.2802734711444316, + "learning_rate": 3.2708571071536076e-06, + "loss": 0.424, + "step": 17986 + }, + { + "epoch": 2.96067808698276, + "grad_norm": 0.3835232587119877, + "learning_rate": 3.270404925010092e-06, + "loss": 0.4255, + "step": 17987 + }, + { + "epoch": 2.9608426712018763, + "grad_norm": 0.4379819983304958, + "learning_rate": 3.269952756283429e-06, + "loss": 0.4133, + "step": 17988 + }, + { + "epoch": 2.9610072554209927, + "grad_norm": 0.33758890094692523, + "learning_rate": 3.2695006009785503e-06, + "loss": 0.4357, + "step": 17989 + }, + { + "epoch": 2.961171839640109, + "grad_norm": 1.5701706025079465, + "learning_rate": 3.2690484591003883e-06, + "loss": 0.4507, + "step": 17990 + }, + { + "epoch": 2.9613364238592257, + "grad_norm": 0.34036360947169975, + "learning_rate": 3.26859633065388e-06, + "loss": 0.4198, + "step": 17991 + }, + { + "epoch": 2.961501008078342, + "grad_norm": 0.3288506108564918, + "learning_rate": 3.2681442156439565e-06, + "loss": 0.4209, + "step": 17992 + }, + { + "epoch": 2.9616655922974586, + "grad_norm": 0.4595635703723497, + "learning_rate": 3.2676921140755524e-06, + "loss": 0.4386, + "step": 17993 + }, + { + "epoch": 2.961830176516575, + "grad_norm": 0.4226522308980864, + "learning_rate": 3.267240025953598e-06, + "loss": 0.418, + "step": 17994 + }, + { + "epoch": 2.9619947607356916, + "grad_norm": 0.3557572729847082, + "learning_rate": 3.2667879512830316e-06, + "loss": 0.4484, + "step": 17995 + }, + { + "epoch": 2.962159344954808, + "grad_norm": 0.28465195785827985, + "learning_rate": 3.266335890068781e-06, + "loss": 0.4249, + "step": 17996 + }, + { + "epoch": 2.9623239291739245, + "grad_norm": 0.30735732145131517, + "learning_rate": 3.2658838423157806e-06, + "loss": 0.42, + "step": 17997 + }, + { + "epoch": 2.962488513393041, + "grad_norm": 0.3742831146688339, + "learning_rate": 3.265431808028964e-06, + "loss": 0.4296, + "step": 17998 + }, + { + "epoch": 2.9626530976121574, + "grad_norm": 0.3994875408359112, + "learning_rate": 3.2649797872132614e-06, + "loss": 0.4255, + "step": 17999 + }, + { + "epoch": 2.9628176818312735, + "grad_norm": 0.3290846626459086, + "learning_rate": 3.264527779873608e-06, + "loss": 0.4473, + "step": 18000 + }, + { + "epoch": 2.96298226605039, + "grad_norm": 0.3710954399446858, + "learning_rate": 3.2640757860149335e-06, + "loss": 0.4315, + "step": 18001 + }, + { + "epoch": 2.9631468502695064, + "grad_norm": 0.2933739663224296, + "learning_rate": 3.263623805642171e-06, + "loss": 0.4374, + "step": 18002 + }, + { + "epoch": 2.963311434488623, + "grad_norm": 0.32381176287437485, + "learning_rate": 3.263171838760254e-06, + "loss": 0.4303, + "step": 18003 + }, + { + "epoch": 2.9634760187077394, + "grad_norm": 0.33407116028706746, + "learning_rate": 3.26271988537411e-06, + "loss": 0.4459, + "step": 18004 + }, + { + "epoch": 2.963640602926856, + "grad_norm": 0.35993035731713136, + "learning_rate": 3.2622679454886734e-06, + "loss": 0.4172, + "step": 18005 + }, + { + "epoch": 2.9638051871459723, + "grad_norm": 0.2500691271741941, + "learning_rate": 3.2618160191088743e-06, + "loss": 0.4284, + "step": 18006 + }, + { + "epoch": 2.9639697713650888, + "grad_norm": 0.28256234629310606, + "learning_rate": 3.2613641062396454e-06, + "loss": 0.4517, + "step": 18007 + }, + { + "epoch": 2.9641343555842052, + "grad_norm": 0.449253121128817, + "learning_rate": 3.2609122068859164e-06, + "loss": 0.4362, + "step": 18008 + }, + { + "epoch": 2.9642989398033217, + "grad_norm": 0.2832709344301676, + "learning_rate": 3.2604603210526197e-06, + "loss": 0.4367, + "step": 18009 + }, + { + "epoch": 2.964463524022438, + "grad_norm": 0.8754438257605548, + "learning_rate": 3.2600084487446854e-06, + "loss": 0.4173, + "step": 18010 + }, + { + "epoch": 2.9646281082415546, + "grad_norm": 0.388661775026451, + "learning_rate": 3.259556589967043e-06, + "loss": 0.4363, + "step": 18011 + }, + { + "epoch": 2.964792692460671, + "grad_norm": 0.317062220781525, + "learning_rate": 3.259104744724626e-06, + "loss": 0.4336, + "step": 18012 + }, + { + "epoch": 2.9649572766797876, + "grad_norm": 0.3325180440188855, + "learning_rate": 3.2586529130223596e-06, + "loss": 0.4218, + "step": 18013 + }, + { + "epoch": 2.965121860898904, + "grad_norm": 0.42725282445783, + "learning_rate": 3.25820109486518e-06, + "loss": 0.4465, + "step": 18014 + }, + { + "epoch": 2.9652864451180205, + "grad_norm": 0.2973333427472893, + "learning_rate": 3.257749290258012e-06, + "loss": 0.4088, + "step": 18015 + }, + { + "epoch": 2.965451029337137, + "grad_norm": 0.29434542873997904, + "learning_rate": 3.2572974992057905e-06, + "loss": 0.4318, + "step": 18016 + }, + { + "epoch": 2.9656156135562535, + "grad_norm": 0.3345839461462118, + "learning_rate": 3.2568457217134415e-06, + "loss": 0.4231, + "step": 18017 + }, + { + "epoch": 2.96578019777537, + "grad_norm": 0.851610140823981, + "learning_rate": 3.2563939577858965e-06, + "loss": 0.4497, + "step": 18018 + }, + { + "epoch": 2.9659447819944864, + "grad_norm": 0.3371965846051764, + "learning_rate": 3.2559422074280836e-06, + "loss": 0.4251, + "step": 18019 + }, + { + "epoch": 2.966109366213603, + "grad_norm": 0.33373839748298434, + "learning_rate": 3.2554904706449325e-06, + "loss": 0.428, + "step": 18020 + }, + { + "epoch": 2.9662739504327194, + "grad_norm": 0.3017815513825485, + "learning_rate": 3.255038747441374e-06, + "loss": 0.4271, + "step": 18021 + }, + { + "epoch": 2.966438534651836, + "grad_norm": 0.44310968910509685, + "learning_rate": 3.254587037822334e-06, + "loss": 0.4318, + "step": 18022 + }, + { + "epoch": 2.9666031188709523, + "grad_norm": 0.3164345259894699, + "learning_rate": 3.254135341792746e-06, + "loss": 0.4194, + "step": 18023 + }, + { + "epoch": 2.9667677030900688, + "grad_norm": 0.3420479482428773, + "learning_rate": 3.253683659357532e-06, + "loss": 0.4386, + "step": 18024 + }, + { + "epoch": 2.9669322873091852, + "grad_norm": 0.2682684443609395, + "learning_rate": 3.253231990521628e-06, + "loss": 0.4296, + "step": 18025 + }, + { + "epoch": 2.9670968715283017, + "grad_norm": 0.2814989143324126, + "learning_rate": 3.252780335289958e-06, + "loss": 0.4304, + "step": 18026 + }, + { + "epoch": 2.967261455747418, + "grad_norm": 0.33286025813902886, + "learning_rate": 3.25232869366745e-06, + "loss": 0.4282, + "step": 18027 + }, + { + "epoch": 2.9674260399665346, + "grad_norm": 0.2880496410360446, + "learning_rate": 3.2518770656590347e-06, + "loss": 0.437, + "step": 18028 + }, + { + "epoch": 2.967590624185651, + "grad_norm": 0.3780400505630464, + "learning_rate": 3.2514254512696377e-06, + "loss": 0.4435, + "step": 18029 + }, + { + "epoch": 2.9677552084047676, + "grad_norm": 0.33580026727556184, + "learning_rate": 3.2509738505041884e-06, + "loss": 0.4404, + "step": 18030 + }, + { + "epoch": 2.967919792623884, + "grad_norm": 0.3335263603656979, + "learning_rate": 3.250522263367613e-06, + "loss": 0.4098, + "step": 18031 + }, + { + "epoch": 2.9680843768430005, + "grad_norm": 0.32127404286546213, + "learning_rate": 3.2500706898648417e-06, + "loss": 0.4358, + "step": 18032 + }, + { + "epoch": 2.968248961062117, + "grad_norm": 0.3090836506465173, + "learning_rate": 3.2496191300008004e-06, + "loss": 0.4251, + "step": 18033 + }, + { + "epoch": 2.9684135452812335, + "grad_norm": 0.3899512882733559, + "learning_rate": 3.249167583780414e-06, + "loss": 0.4194, + "step": 18034 + }, + { + "epoch": 2.96857812950035, + "grad_norm": 0.29893313967120105, + "learning_rate": 3.2487160512086143e-06, + "loss": 0.4246, + "step": 18035 + }, + { + "epoch": 2.9687427137194664, + "grad_norm": 0.6207934829357318, + "learning_rate": 3.248264532290323e-06, + "loss": 0.4358, + "step": 18036 + }, + { + "epoch": 2.968907297938583, + "grad_norm": 0.53899296820925, + "learning_rate": 3.2478130270304716e-06, + "loss": 0.4318, + "step": 18037 + }, + { + "epoch": 2.9690718821576993, + "grad_norm": 0.27980429461566236, + "learning_rate": 3.247361535433983e-06, + "loss": 0.4299, + "step": 18038 + }, + { + "epoch": 2.969236466376816, + "grad_norm": 0.3590153093987848, + "learning_rate": 3.2469100575057864e-06, + "loss": 0.4175, + "step": 18039 + }, + { + "epoch": 2.9694010505959323, + "grad_norm": 0.3317868256936113, + "learning_rate": 3.2464585932508075e-06, + "loss": 0.4384, + "step": 18040 + }, + { + "epoch": 2.9695656348150488, + "grad_norm": 1.0971192220495734, + "learning_rate": 3.246007142673971e-06, + "loss": 0.4354, + "step": 18041 + }, + { + "epoch": 2.969730219034165, + "grad_norm": 0.35863929218581314, + "learning_rate": 3.2455557057802063e-06, + "loss": 0.438, + "step": 18042 + }, + { + "epoch": 2.9698948032532813, + "grad_norm": 0.3274544277696532, + "learning_rate": 3.2451042825744333e-06, + "loss": 0.4466, + "step": 18043 + }, + { + "epoch": 2.9700593874723977, + "grad_norm": 0.3313034653954024, + "learning_rate": 3.2446528730615857e-06, + "loss": 0.4321, + "step": 18044 + }, + { + "epoch": 2.970223971691514, + "grad_norm": 0.30389316188656185, + "learning_rate": 3.2442014772465814e-06, + "loss": 0.4462, + "step": 18045 + }, + { + "epoch": 2.9703885559106307, + "grad_norm": 0.2934094414626938, + "learning_rate": 3.243750095134352e-06, + "loss": 0.4343, + "step": 18046 + }, + { + "epoch": 2.970553140129747, + "grad_norm": 0.34425230910755483, + "learning_rate": 3.24329872672982e-06, + "loss": 0.4098, + "step": 18047 + }, + { + "epoch": 2.9707177243488636, + "grad_norm": 0.30450840666561735, + "learning_rate": 3.2428473720379085e-06, + "loss": 0.4053, + "step": 18048 + }, + { + "epoch": 2.97088230856798, + "grad_norm": 0.41018405150240905, + "learning_rate": 3.242396031063547e-06, + "loss": 0.4099, + "step": 18049 + }, + { + "epoch": 2.9710468927870965, + "grad_norm": 0.29096578844948423, + "learning_rate": 3.241944703811657e-06, + "loss": 0.4463, + "step": 18050 + }, + { + "epoch": 2.971211477006213, + "grad_norm": 0.31110995772814504, + "learning_rate": 3.2414933902871647e-06, + "loss": 0.4294, + "step": 18051 + }, + { + "epoch": 2.9713760612253295, + "grad_norm": 0.28542042993102296, + "learning_rate": 3.2410420904949933e-06, + "loss": 0.4252, + "step": 18052 + }, + { + "epoch": 2.971540645444446, + "grad_norm": 0.33471298146038364, + "learning_rate": 3.2405908044400692e-06, + "loss": 0.434, + "step": 18053 + }, + { + "epoch": 2.9717052296635624, + "grad_norm": 0.2776863662421888, + "learning_rate": 3.2401395321273175e-06, + "loss": 0.4107, + "step": 18054 + }, + { + "epoch": 2.971869813882679, + "grad_norm": 0.31484487462523253, + "learning_rate": 3.2396882735616576e-06, + "loss": 0.4328, + "step": 18055 + }, + { + "epoch": 2.9720343981017954, + "grad_norm": 0.325430290067415, + "learning_rate": 3.2392370287480172e-06, + "loss": 0.4164, + "step": 18056 + }, + { + "epoch": 2.972198982320912, + "grad_norm": 0.28709782951035795, + "learning_rate": 3.238785797691318e-06, + "loss": 0.429, + "step": 18057 + }, + { + "epoch": 2.9723635665400283, + "grad_norm": 0.3961152654109752, + "learning_rate": 3.238334580396486e-06, + "loss": 0.4247, + "step": 18058 + }, + { + "epoch": 2.972528150759145, + "grad_norm": 0.473927739825349, + "learning_rate": 3.2378833768684428e-06, + "loss": 0.432, + "step": 18059 + }, + { + "epoch": 2.9726927349782613, + "grad_norm": 0.2904381855630161, + "learning_rate": 3.2374321871121134e-06, + "loss": 0.4234, + "step": 18060 + }, + { + "epoch": 2.9728573191973777, + "grad_norm": 0.33940041419371153, + "learning_rate": 3.236981011132419e-06, + "loss": 0.4241, + "step": 18061 + }, + { + "epoch": 2.973021903416494, + "grad_norm": 0.39408484606284216, + "learning_rate": 3.236529848934285e-06, + "loss": 0.4076, + "step": 18062 + }, + { + "epoch": 2.9731864876356107, + "grad_norm": 0.33323138770900007, + "learning_rate": 3.2360787005226334e-06, + "loss": 0.4065, + "step": 18063 + }, + { + "epoch": 2.973351071854727, + "grad_norm": 0.4534485413101231, + "learning_rate": 3.2356275659023844e-06, + "loss": 0.4268, + "step": 18064 + }, + { + "epoch": 2.9735156560738436, + "grad_norm": 0.30028391006421495, + "learning_rate": 3.2351764450784653e-06, + "loss": 0.4139, + "step": 18065 + }, + { + "epoch": 2.97368024029296, + "grad_norm": 0.3178120219809389, + "learning_rate": 3.2347253380557944e-06, + "loss": 0.4408, + "step": 18066 + }, + { + "epoch": 2.973844824512076, + "grad_norm": 0.39134603889554914, + "learning_rate": 3.2342742448392964e-06, + "loss": 0.4269, + "step": 18067 + }, + { + "epoch": 2.9740094087311926, + "grad_norm": 0.2806020550149464, + "learning_rate": 3.2338231654338913e-06, + "loss": 0.4288, + "step": 18068 + }, + { + "epoch": 2.974173992950309, + "grad_norm": 0.34966794647504806, + "learning_rate": 3.2333720998445036e-06, + "loss": 0.4295, + "step": 18069 + }, + { + "epoch": 2.9743385771694255, + "grad_norm": 0.30994245389032565, + "learning_rate": 3.2329210480760544e-06, + "loss": 0.4247, + "step": 18070 + }, + { + "epoch": 2.974503161388542, + "grad_norm": 0.3643813783368933, + "learning_rate": 3.2324700101334647e-06, + "loss": 0.4254, + "step": 18071 + }, + { + "epoch": 2.9746677456076585, + "grad_norm": 0.4606551678610015, + "learning_rate": 3.2320189860216563e-06, + "loss": 0.4257, + "step": 18072 + }, + { + "epoch": 2.974832329826775, + "grad_norm": 0.31573605145434186, + "learning_rate": 3.2315679757455496e-06, + "loss": 0.4299, + "step": 18073 + }, + { + "epoch": 2.9749969140458914, + "grad_norm": 0.33672582474808266, + "learning_rate": 3.2311169793100697e-06, + "loss": 0.4219, + "step": 18074 + }, + { + "epoch": 2.975161498265008, + "grad_norm": 0.3283598533366402, + "learning_rate": 3.230665996720131e-06, + "loss": 0.4217, + "step": 18075 + }, + { + "epoch": 2.9753260824841243, + "grad_norm": 0.27162495539309256, + "learning_rate": 3.2302150279806625e-06, + "loss": 0.4196, + "step": 18076 + }, + { + "epoch": 2.975490666703241, + "grad_norm": 0.4643539845857057, + "learning_rate": 3.2297640730965787e-06, + "loss": 0.4439, + "step": 18077 + }, + { + "epoch": 2.9756552509223573, + "grad_norm": 0.368683998694101, + "learning_rate": 3.229313132072802e-06, + "loss": 0.4451, + "step": 18078 + }, + { + "epoch": 2.9758198351414737, + "grad_norm": 0.3074675786557951, + "learning_rate": 3.228862204914254e-06, + "loss": 0.4114, + "step": 18079 + }, + { + "epoch": 2.97598441936059, + "grad_norm": 0.34895046305914723, + "learning_rate": 3.228411291625854e-06, + "loss": 0.4456, + "step": 18080 + }, + { + "epoch": 2.9761490035797067, + "grad_norm": 0.37398568795790305, + "learning_rate": 3.2279603922125224e-06, + "loss": 0.4327, + "step": 18081 + }, + { + "epoch": 2.976313587798823, + "grad_norm": 0.3974813545686356, + "learning_rate": 3.2275095066791794e-06, + "loss": 0.4314, + "step": 18082 + }, + { + "epoch": 2.9764781720179396, + "grad_norm": 0.4649457248662648, + "learning_rate": 3.2270586350307456e-06, + "loss": 0.4399, + "step": 18083 + }, + { + "epoch": 2.976642756237056, + "grad_norm": 0.3309637999187168, + "learning_rate": 3.2266077772721406e-06, + "loss": 0.4189, + "step": 18084 + }, + { + "epoch": 2.9768073404561726, + "grad_norm": 0.30971886634736673, + "learning_rate": 3.2261569334082805e-06, + "loss": 0.4194, + "step": 18085 + }, + { + "epoch": 2.976971924675289, + "grad_norm": 0.281886081834552, + "learning_rate": 3.2257061034440897e-06, + "loss": 0.4336, + "step": 18086 + }, + { + "epoch": 2.9771365088944055, + "grad_norm": 0.3450071004495589, + "learning_rate": 3.2252552873844843e-06, + "loss": 0.4452, + "step": 18087 + }, + { + "epoch": 2.977301093113522, + "grad_norm": 0.35518661833333515, + "learning_rate": 3.224804485234385e-06, + "loss": 0.4484, + "step": 18088 + }, + { + "epoch": 2.9774656773326384, + "grad_norm": 0.28753201503866177, + "learning_rate": 3.2243536969987096e-06, + "loss": 0.4397, + "step": 18089 + }, + { + "epoch": 2.977630261551755, + "grad_norm": 0.3600946230149472, + "learning_rate": 3.2239029226823786e-06, + "loss": 0.435, + "step": 18090 + }, + { + "epoch": 2.9777948457708714, + "grad_norm": 0.2928849477916003, + "learning_rate": 3.223452162290309e-06, + "loss": 0.4306, + "step": 18091 + }, + { + "epoch": 2.977959429989988, + "grad_norm": 0.33409587075416086, + "learning_rate": 3.22300141582742e-06, + "loss": 0.4141, + "step": 18092 + }, + { + "epoch": 2.9781240142091043, + "grad_norm": 0.36276975155674085, + "learning_rate": 3.2225506832986306e-06, + "loss": 0.433, + "step": 18093 + }, + { + "epoch": 2.978288598428221, + "grad_norm": 0.3506459464381036, + "learning_rate": 3.222099964708856e-06, + "loss": 0.4429, + "step": 18094 + }, + { + "epoch": 2.9784531826473373, + "grad_norm": 0.2919003799206049, + "learning_rate": 3.22164926006302e-06, + "loss": 0.4616, + "step": 18095 + }, + { + "epoch": 2.9786177668664537, + "grad_norm": 0.2964169570888546, + "learning_rate": 3.2211985693660334e-06, + "loss": 0.4249, + "step": 18096 + }, + { + "epoch": 2.97878235108557, + "grad_norm": 0.6718651471413585, + "learning_rate": 3.220747892622821e-06, + "loss": 0.4115, + "step": 18097 + }, + { + "epoch": 2.9789469353046867, + "grad_norm": 0.3263407815241886, + "learning_rate": 3.2202972298382957e-06, + "loss": 0.4261, + "step": 18098 + }, + { + "epoch": 2.979111519523803, + "grad_norm": 0.2992958976602193, + "learning_rate": 3.219846581017375e-06, + "loss": 0.4321, + "step": 18099 + }, + { + "epoch": 2.9792761037429196, + "grad_norm": 0.3037050417576219, + "learning_rate": 3.219395946164979e-06, + "loss": 0.4311, + "step": 18100 + }, + { + "epoch": 2.979440687962036, + "grad_norm": 0.33330487575134055, + "learning_rate": 3.218945325286023e-06, + "loss": 0.4412, + "step": 18101 + }, + { + "epoch": 2.9796052721811526, + "grad_norm": 0.39981885883986784, + "learning_rate": 3.2184947183854244e-06, + "loss": 0.4415, + "step": 18102 + }, + { + "epoch": 2.979769856400269, + "grad_norm": 0.3282863712935466, + "learning_rate": 3.218044125468099e-06, + "loss": 0.4143, + "step": 18103 + }, + { + "epoch": 2.9799344406193855, + "grad_norm": 0.37273799725123524, + "learning_rate": 3.217593546538967e-06, + "loss": 0.4079, + "step": 18104 + }, + { + "epoch": 2.980099024838502, + "grad_norm": 0.3147167169465058, + "learning_rate": 3.2171429816029397e-06, + "loss": 0.4171, + "step": 18105 + }, + { + "epoch": 2.9802636090576184, + "grad_norm": 0.3325195855156046, + "learning_rate": 3.2166924306649386e-06, + "loss": 0.4365, + "step": 18106 + }, + { + "epoch": 2.980428193276735, + "grad_norm": 0.32653489583676465, + "learning_rate": 3.216241893729876e-06, + "loss": 0.4246, + "step": 18107 + }, + { + "epoch": 2.980592777495851, + "grad_norm": 0.3390616252801089, + "learning_rate": 3.2157913708026696e-06, + "loss": 0.4163, + "step": 18108 + }, + { + "epoch": 2.9807573617149674, + "grad_norm": 0.27059271621322145, + "learning_rate": 3.215340861888235e-06, + "loss": 0.4383, + "step": 18109 + }, + { + "epoch": 2.980921945934084, + "grad_norm": 0.30289374970872623, + "learning_rate": 3.2148903669914884e-06, + "loss": 0.4012, + "step": 18110 + }, + { + "epoch": 2.9810865301532004, + "grad_norm": 0.41009121277019245, + "learning_rate": 3.214439886117346e-06, + "loss": 0.4446, + "step": 18111 + }, + { + "epoch": 2.981251114372317, + "grad_norm": 0.3620401172185879, + "learning_rate": 3.213989419270721e-06, + "loss": 0.4313, + "step": 18112 + }, + { + "epoch": 2.9814156985914333, + "grad_norm": 0.3128518314458058, + "learning_rate": 3.2135389664565316e-06, + "loss": 0.4273, + "step": 18113 + }, + { + "epoch": 2.9815802828105498, + "grad_norm": 0.5072903575923572, + "learning_rate": 3.2130885276796925e-06, + "loss": 0.4363, + "step": 18114 + }, + { + "epoch": 2.9817448670296662, + "grad_norm": 0.3272357301115453, + "learning_rate": 3.2126381029451146e-06, + "loss": 0.4381, + "step": 18115 + }, + { + "epoch": 2.9819094512487827, + "grad_norm": 0.2878470990612326, + "learning_rate": 3.2121876922577193e-06, + "loss": 0.4424, + "step": 18116 + }, + { + "epoch": 2.982074035467899, + "grad_norm": 0.3381122306172689, + "learning_rate": 3.2117372956224154e-06, + "loss": 0.427, + "step": 18117 + }, + { + "epoch": 2.9822386196870156, + "grad_norm": 0.36845413810216926, + "learning_rate": 3.2112869130441214e-06, + "loss": 0.4313, + "step": 18118 + }, + { + "epoch": 2.982403203906132, + "grad_norm": 0.3289819483762642, + "learning_rate": 3.2108365445277496e-06, + "loss": 0.4278, + "step": 18119 + }, + { + "epoch": 2.9825677881252486, + "grad_norm": 0.35928315823892576, + "learning_rate": 3.210386190078215e-06, + "loss": 0.4373, + "step": 18120 + }, + { + "epoch": 2.982732372344365, + "grad_norm": 0.6564359798749756, + "learning_rate": 3.2099358497004318e-06, + "loss": 0.4324, + "step": 18121 + }, + { + "epoch": 2.9828969565634815, + "grad_norm": 0.31058811923763524, + "learning_rate": 3.2094855233993134e-06, + "loss": 0.4344, + "step": 18122 + }, + { + "epoch": 2.983061540782598, + "grad_norm": 0.2963960899838568, + "learning_rate": 3.2090352111797756e-06, + "loss": 0.4362, + "step": 18123 + }, + { + "epoch": 2.9832261250017145, + "grad_norm": 0.32776813298408725, + "learning_rate": 3.2085849130467273e-06, + "loss": 0.4204, + "step": 18124 + }, + { + "epoch": 2.983390709220831, + "grad_norm": 0.5110102874344136, + "learning_rate": 3.2081346290050877e-06, + "loss": 0.4203, + "step": 18125 + }, + { + "epoch": 2.9835552934399474, + "grad_norm": 0.30187833255750773, + "learning_rate": 3.2076843590597653e-06, + "loss": 0.4406, + "step": 18126 + }, + { + "epoch": 2.983719877659064, + "grad_norm": 0.2855605888514474, + "learning_rate": 3.207234103215677e-06, + "loss": 0.4084, + "step": 18127 + }, + { + "epoch": 2.9838844618781803, + "grad_norm": 0.32957430467355686, + "learning_rate": 3.206783861477735e-06, + "loss": 0.4372, + "step": 18128 + }, + { + "epoch": 2.984049046097297, + "grad_norm": 0.38975332516752953, + "learning_rate": 3.2063336338508493e-06, + "loss": 0.421, + "step": 18129 + }, + { + "epoch": 2.9842136303164133, + "grad_norm": 0.2980585596483328, + "learning_rate": 3.2058834203399356e-06, + "loss": 0.4303, + "step": 18130 + }, + { + "epoch": 2.9843782145355298, + "grad_norm": 0.3398236694925661, + "learning_rate": 3.205433220949905e-06, + "loss": 0.4294, + "step": 18131 + }, + { + "epoch": 2.9845427987546462, + "grad_norm": 0.26269951204573494, + "learning_rate": 3.2049830356856713e-06, + "loss": 0.4531, + "step": 18132 + }, + { + "epoch": 2.9847073829737623, + "grad_norm": 0.36395240545705293, + "learning_rate": 3.204532864552145e-06, + "loss": 0.4328, + "step": 18133 + }, + { + "epoch": 2.9848719671928787, + "grad_norm": 0.4336705652596672, + "learning_rate": 3.20408270755424e-06, + "loss": 0.4283, + "step": 18134 + }, + { + "epoch": 2.985036551411995, + "grad_norm": 0.3006524868559794, + "learning_rate": 3.203632564696868e-06, + "loss": 0.4013, + "step": 18135 + }, + { + "epoch": 2.9852011356311117, + "grad_norm": 0.322686194402558, + "learning_rate": 3.2031824359849384e-06, + "loss": 0.3927, + "step": 18136 + }, + { + "epoch": 2.985365719850228, + "grad_norm": 0.3043040631434431, + "learning_rate": 3.202732321423365e-06, + "loss": 0.4276, + "step": 18137 + }, + { + "epoch": 2.9855303040693446, + "grad_norm": 0.3084164121969129, + "learning_rate": 3.202282221017058e-06, + "loss": 0.4297, + "step": 18138 + }, + { + "epoch": 2.985694888288461, + "grad_norm": 0.35498579787356155, + "learning_rate": 3.2018321347709307e-06, + "loss": 0.4414, + "step": 18139 + }, + { + "epoch": 2.9858594725075776, + "grad_norm": 0.27871479742328575, + "learning_rate": 3.201382062689892e-06, + "loss": 0.429, + "step": 18140 + }, + { + "epoch": 2.986024056726694, + "grad_norm": 0.3174931162113501, + "learning_rate": 3.2009320047788546e-06, + "loss": 0.4191, + "step": 18141 + }, + { + "epoch": 2.9861886409458105, + "grad_norm": 0.6164069800922533, + "learning_rate": 3.2004819610427284e-06, + "loss": 0.4144, + "step": 18142 + }, + { + "epoch": 2.986353225164927, + "grad_norm": 0.5379883698054649, + "learning_rate": 3.2000319314864245e-06, + "loss": 0.4237, + "step": 18143 + }, + { + "epoch": 2.9865178093840434, + "grad_norm": 0.28452726517508903, + "learning_rate": 3.1995819161148554e-06, + "loss": 0.4254, + "step": 18144 + }, + { + "epoch": 2.98668239360316, + "grad_norm": 0.3020895643523373, + "learning_rate": 3.1991319149329263e-06, + "loss": 0.4359, + "step": 18145 + }, + { + "epoch": 2.9868469778222764, + "grad_norm": 0.35551300631374627, + "learning_rate": 3.1986819279455528e-06, + "loss": 0.4462, + "step": 18146 + }, + { + "epoch": 2.987011562041393, + "grad_norm": 0.2904016493322226, + "learning_rate": 3.198231955157641e-06, + "loss": 0.414, + "step": 18147 + }, + { + "epoch": 2.9871761462605093, + "grad_norm": 0.35154727598182106, + "learning_rate": 3.1977819965741037e-06, + "loss": 0.4332, + "step": 18148 + }, + { + "epoch": 2.987340730479626, + "grad_norm": 0.33008611051504305, + "learning_rate": 3.1973320521998486e-06, + "loss": 0.4413, + "step": 18149 + }, + { + "epoch": 2.9875053146987423, + "grad_norm": 0.2835558509576164, + "learning_rate": 3.1968821220397872e-06, + "loss": 0.4272, + "step": 18150 + }, + { + "epoch": 2.9876698989178587, + "grad_norm": 0.3609501104969594, + "learning_rate": 3.1964322060988277e-06, + "loss": 0.4495, + "step": 18151 + }, + { + "epoch": 2.987834483136975, + "grad_norm": 0.2888185844920322, + "learning_rate": 3.1959823043818795e-06, + "loss": 0.4171, + "step": 18152 + }, + { + "epoch": 2.9879990673560917, + "grad_norm": 0.2936427475665115, + "learning_rate": 3.1955324168938523e-06, + "loss": 0.422, + "step": 18153 + }, + { + "epoch": 2.988163651575208, + "grad_norm": 0.3308612101987003, + "learning_rate": 3.195082543639654e-06, + "loss": 0.4351, + "step": 18154 + }, + { + "epoch": 2.9883282357943246, + "grad_norm": 0.30759910008070246, + "learning_rate": 3.194632684624196e-06, + "loss": 0.4394, + "step": 18155 + }, + { + "epoch": 2.988492820013441, + "grad_norm": 0.5856876981238213, + "learning_rate": 3.1941828398523823e-06, + "loss": 0.4548, + "step": 18156 + }, + { + "epoch": 2.9886574042325575, + "grad_norm": 0.3194599288209232, + "learning_rate": 3.1937330093291276e-06, + "loss": 0.4259, + "step": 18157 + }, + { + "epoch": 2.988821988451674, + "grad_norm": 0.36234103840466797, + "learning_rate": 3.1932831930593354e-06, + "loss": 0.4367, + "step": 18158 + }, + { + "epoch": 2.9889865726707905, + "grad_norm": 0.30308141678484296, + "learning_rate": 3.192833391047915e-06, + "loss": 0.4476, + "step": 18159 + }, + { + "epoch": 2.989151156889907, + "grad_norm": 0.30146952433299834, + "learning_rate": 3.192383603299775e-06, + "loss": 0.4143, + "step": 18160 + }, + { + "epoch": 2.9893157411090234, + "grad_norm": 0.5577042790559326, + "learning_rate": 3.191933829819823e-06, + "loss": 0.451, + "step": 18161 + }, + { + "epoch": 2.98948032532814, + "grad_norm": 0.35589513547973517, + "learning_rate": 3.1914840706129684e-06, + "loss": 0.444, + "step": 18162 + }, + { + "epoch": 2.9896449095472564, + "grad_norm": 0.3002996637792063, + "learning_rate": 3.191034325684116e-06, + "loss": 0.4246, + "step": 18163 + }, + { + "epoch": 2.989809493766373, + "grad_norm": 0.328292620084996, + "learning_rate": 3.1905845950381755e-06, + "loss": 0.4256, + "step": 18164 + }, + { + "epoch": 2.9899740779854893, + "grad_norm": 0.6209902002277193, + "learning_rate": 3.190134878680054e-06, + "loss": 0.4308, + "step": 18165 + }, + { + "epoch": 2.990138662204606, + "grad_norm": 0.3843180872343453, + "learning_rate": 3.1896851766146555e-06, + "loss": 0.4423, + "step": 18166 + }, + { + "epoch": 2.9903032464237222, + "grad_norm": 0.4509998126064767, + "learning_rate": 3.1892354888468922e-06, + "loss": 0.4425, + "step": 18167 + }, + { + "epoch": 2.9904678306428387, + "grad_norm": 0.29625525143325837, + "learning_rate": 3.1887858153816664e-06, + "loss": 0.4219, + "step": 18168 + }, + { + "epoch": 2.990632414861955, + "grad_norm": 0.45827014828035467, + "learning_rate": 3.188336156223887e-06, + "loss": 0.4435, + "step": 18169 + }, + { + "epoch": 2.9907969990810717, + "grad_norm": 0.37408544422334067, + "learning_rate": 3.1878865113784593e-06, + "loss": 0.4229, + "step": 18170 + }, + { + "epoch": 2.990961583300188, + "grad_norm": 0.36305505750525263, + "learning_rate": 3.1874368808502908e-06, + "loss": 0.4315, + "step": 18171 + }, + { + "epoch": 2.9911261675193046, + "grad_norm": 0.326156630131236, + "learning_rate": 3.186987264644288e-06, + "loss": 0.4212, + "step": 18172 + }, + { + "epoch": 2.991290751738421, + "grad_norm": 0.2828807883216947, + "learning_rate": 3.1865376627653548e-06, + "loss": 0.4239, + "step": 18173 + }, + { + "epoch": 2.9914553359575375, + "grad_norm": 0.3545049958634287, + "learning_rate": 3.1860880752184006e-06, + "loss": 0.43, + "step": 18174 + }, + { + "epoch": 2.9916199201766536, + "grad_norm": 0.33878685908841927, + "learning_rate": 3.1856385020083256e-06, + "loss": 0.4352, + "step": 18175 + }, + { + "epoch": 2.99178450439577, + "grad_norm": 0.39217285035326416, + "learning_rate": 3.185188943140042e-06, + "loss": 0.4303, + "step": 18176 + }, + { + "epoch": 2.9919490886148865, + "grad_norm": 0.34070472410426383, + "learning_rate": 3.1847393986184486e-06, + "loss": 0.4486, + "step": 18177 + }, + { + "epoch": 2.992113672834003, + "grad_norm": 0.654226350541991, + "learning_rate": 3.184289868448457e-06, + "loss": 0.42, + "step": 18178 + }, + { + "epoch": 2.9922782570531195, + "grad_norm": 0.35024427558591065, + "learning_rate": 3.1838403526349687e-06, + "loss": 0.4253, + "step": 18179 + }, + { + "epoch": 2.992442841272236, + "grad_norm": 0.2887993308092518, + "learning_rate": 3.183390851182888e-06, + "loss": 0.419, + "step": 18180 + }, + { + "epoch": 2.9926074254913524, + "grad_norm": 0.2852910550694381, + "learning_rate": 3.182941364097121e-06, + "loss": 0.4248, + "step": 18181 + }, + { + "epoch": 2.992772009710469, + "grad_norm": 0.31844235323231934, + "learning_rate": 3.1824918913825725e-06, + "loss": 0.4137, + "step": 18182 + }, + { + "epoch": 2.9929365939295853, + "grad_norm": 0.3990404528599743, + "learning_rate": 3.1820424330441472e-06, + "loss": 0.4466, + "step": 18183 + }, + { + "epoch": 2.993101178148702, + "grad_norm": 0.3081020974323921, + "learning_rate": 3.1815929890867476e-06, + "loss": 0.4373, + "step": 18184 + }, + { + "epoch": 2.9932657623678183, + "grad_norm": 0.29220689300256936, + "learning_rate": 3.1811435595152807e-06, + "loss": 0.4105, + "step": 18185 + }, + { + "epoch": 2.9934303465869347, + "grad_norm": 0.38694100904722095, + "learning_rate": 3.180694144334647e-06, + "loss": 0.4339, + "step": 18186 + }, + { + "epoch": 2.993594930806051, + "grad_norm": 0.38125835788420304, + "learning_rate": 3.180244743549755e-06, + "loss": 0.4457, + "step": 18187 + }, + { + "epoch": 2.9937595150251677, + "grad_norm": 1.1024776134142784, + "learning_rate": 3.1797953571655043e-06, + "loss": 0.4411, + "step": 18188 + }, + { + "epoch": 2.993924099244284, + "grad_norm": 0.45183397917621165, + "learning_rate": 3.1793459851867988e-06, + "loss": 0.4411, + "step": 18189 + }, + { + "epoch": 2.9940886834634006, + "grad_norm": 0.32011488526140774, + "learning_rate": 3.1788966276185444e-06, + "loss": 0.4292, + "step": 18190 + }, + { + "epoch": 2.994253267682517, + "grad_norm": 0.5045650725636368, + "learning_rate": 3.178447284465641e-06, + "loss": 0.4179, + "step": 18191 + }, + { + "epoch": 2.9944178519016336, + "grad_norm": 0.5554772919589578, + "learning_rate": 3.177997955732995e-06, + "loss": 0.4136, + "step": 18192 + }, + { + "epoch": 2.99458243612075, + "grad_norm": 0.475211106424764, + "learning_rate": 3.177548641425506e-06, + "loss": 0.4173, + "step": 18193 + }, + { + "epoch": 2.9947470203398665, + "grad_norm": 0.3771283509591505, + "learning_rate": 3.1770993415480795e-06, + "loss": 0.4452, + "step": 18194 + }, + { + "epoch": 2.994911604558983, + "grad_norm": 0.42594227266935014, + "learning_rate": 3.1766500561056188e-06, + "loss": 0.4117, + "step": 18195 + }, + { + "epoch": 2.9950761887780994, + "grad_norm": 0.3180131807086205, + "learning_rate": 3.176200785103021e-06, + "loss": 0.4285, + "step": 18196 + }, + { + "epoch": 2.995240772997216, + "grad_norm": 0.29612484007700896, + "learning_rate": 3.175751528545195e-06, + "loss": 0.4318, + "step": 18197 + }, + { + "epoch": 2.9954053572163324, + "grad_norm": 0.28254929449057237, + "learning_rate": 3.1753022864370376e-06, + "loss": 0.4345, + "step": 18198 + }, + { + "epoch": 2.995569941435449, + "grad_norm": 0.3183231439188339, + "learning_rate": 3.1748530587834543e-06, + "loss": 0.4349, + "step": 18199 + }, + { + "epoch": 2.995734525654565, + "grad_norm": 0.46343260585566537, + "learning_rate": 3.1744038455893443e-06, + "loss": 0.4357, + "step": 18200 + }, + { + "epoch": 2.9958991098736814, + "grad_norm": 0.29529337466275296, + "learning_rate": 3.173954646859612e-06, + "loss": 0.4184, + "step": 18201 + }, + { + "epoch": 2.996063694092798, + "grad_norm": 0.31809070614092383, + "learning_rate": 3.173505462599157e-06, + "loss": 0.43, + "step": 18202 + }, + { + "epoch": 2.9962282783119143, + "grad_norm": 0.30457104675831437, + "learning_rate": 3.1730562928128797e-06, + "loss": 0.4354, + "step": 18203 + }, + { + "epoch": 2.9963928625310308, + "grad_norm": 0.3062917076160698, + "learning_rate": 3.1726071375056836e-06, + "loss": 0.4087, + "step": 18204 + }, + { + "epoch": 2.9965574467501472, + "grad_norm": 0.3124888663990827, + "learning_rate": 3.1721579966824684e-06, + "loss": 0.4394, + "step": 18205 + }, + { + "epoch": 2.9967220309692637, + "grad_norm": 0.33867425297635956, + "learning_rate": 3.1717088703481363e-06, + "loss": 0.4486, + "step": 18206 + }, + { + "epoch": 2.99688661518838, + "grad_norm": 0.273558008512084, + "learning_rate": 3.171259758507584e-06, + "loss": 0.4071, + "step": 18207 + }, + { + "epoch": 2.9970511994074966, + "grad_norm": 0.5285508319253733, + "learning_rate": 3.1708106611657184e-06, + "loss": 0.3861, + "step": 18208 + }, + { + "epoch": 2.997215783626613, + "grad_norm": 0.3128874485399929, + "learning_rate": 3.170361578327435e-06, + "loss": 0.4164, + "step": 18209 + }, + { + "epoch": 2.9973803678457296, + "grad_norm": 0.37256940242292064, + "learning_rate": 3.1699125099976353e-06, + "loss": 0.4131, + "step": 18210 + }, + { + "epoch": 2.997544952064846, + "grad_norm": 0.35543677230722404, + "learning_rate": 3.169463456181219e-06, + "loss": 0.4285, + "step": 18211 + }, + { + "epoch": 2.9977095362839625, + "grad_norm": 0.5258426866304687, + "learning_rate": 3.1690144168830863e-06, + "loss": 0.4382, + "step": 18212 + }, + { + "epoch": 2.997874120503079, + "grad_norm": 0.3970173787688329, + "learning_rate": 3.168565392108138e-06, + "loss": 0.4528, + "step": 18213 + }, + { + "epoch": 2.9980387047221955, + "grad_norm": 0.38818627062660305, + "learning_rate": 3.1681163818612722e-06, + "loss": 0.4337, + "step": 18214 + }, + { + "epoch": 2.998203288941312, + "grad_norm": 0.32909854511156844, + "learning_rate": 3.167667386147389e-06, + "loss": 0.4244, + "step": 18215 + }, + { + "epoch": 2.9983678731604284, + "grad_norm": 0.31824541264496803, + "learning_rate": 3.1672184049713886e-06, + "loss": 0.4243, + "step": 18216 + }, + { + "epoch": 2.998532457379545, + "grad_norm": 0.2944951297577863, + "learning_rate": 3.166769438338168e-06, + "loss": 0.45, + "step": 18217 + }, + { + "epoch": 2.9986970415986614, + "grad_norm": 0.31699040399767714, + "learning_rate": 3.1663204862526272e-06, + "loss": 0.4246, + "step": 18218 + }, + { + "epoch": 2.998861625817778, + "grad_norm": 0.28009816206800014, + "learning_rate": 3.1658715487196644e-06, + "loss": 0.4535, + "step": 18219 + }, + { + "epoch": 2.9990262100368943, + "grad_norm": 0.29611811965362095, + "learning_rate": 3.16542262574418e-06, + "loss": 0.409, + "step": 18220 + }, + { + "epoch": 2.9991907942560108, + "grad_norm": 0.32801166259446557, + "learning_rate": 3.16497371733107e-06, + "loss": 0.4215, + "step": 18221 + }, + { + "epoch": 2.9993553784751272, + "grad_norm": 0.29321722798470556, + "learning_rate": 3.1645248234852354e-06, + "loss": 0.4182, + "step": 18222 + }, + { + "epoch": 2.9995199626942437, + "grad_norm": 0.40426280821449323, + "learning_rate": 3.164075944211572e-06, + "loss": 0.4137, + "step": 18223 + }, + { + "epoch": 2.99968454691336, + "grad_norm": 0.5970825070217444, + "learning_rate": 3.1636270795149784e-06, + "loss": 0.4313, + "step": 18224 + }, + { + "epoch": 2.9998491311324766, + "grad_norm": 0.2644700809917794, + "learning_rate": 3.1631782294003545e-06, + "loss": 0.4145, + "step": 18225 + }, + { + "epoch": 3.000013715351593, + "grad_norm": 0.2995399869875604, + "learning_rate": 3.162729393872594e-06, + "loss": 0.4123, + "step": 18226 + }, + { + "epoch": 3.0001782995707096, + "grad_norm": 0.27034051788182417, + "learning_rate": 3.1622805729365987e-06, + "loss": 0.4177, + "step": 18227 + }, + { + "epoch": 3.000342883789826, + "grad_norm": 0.3971305869673882, + "learning_rate": 3.1618317665972613e-06, + "loss": 0.4242, + "step": 18228 + }, + { + "epoch": 3.0005074680089425, + "grad_norm": 0.3414507516646599, + "learning_rate": 3.1613829748594845e-06, + "loss": 0.4184, + "step": 18229 + }, + { + "epoch": 3.000672052228059, + "grad_norm": 0.3465525713100099, + "learning_rate": 3.1609341977281607e-06, + "loss": 0.4125, + "step": 18230 + }, + { + "epoch": 3.0008366364471755, + "grad_norm": 0.33514919669558996, + "learning_rate": 3.160485435208189e-06, + "loss": 0.4305, + "step": 18231 + }, + { + "epoch": 3.001001220666292, + "grad_norm": 0.33085851251192505, + "learning_rate": 3.1600366873044664e-06, + "loss": 0.4442, + "step": 18232 + }, + { + "epoch": 3.0011658048854084, + "grad_norm": 0.27738310096517826, + "learning_rate": 3.159587954021887e-06, + "loss": 0.44, + "step": 18233 + }, + { + "epoch": 3.001330389104525, + "grad_norm": 0.46496425947877273, + "learning_rate": 3.1591392353653504e-06, + "loss": 0.4317, + "step": 18234 + }, + { + "epoch": 3.0014949733236413, + "grad_norm": 0.5780192641756802, + "learning_rate": 3.15869053133975e-06, + "loss": 0.4333, + "step": 18235 + }, + { + "epoch": 3.001659557542758, + "grad_norm": 0.4204830312536245, + "learning_rate": 3.1582418419499853e-06, + "loss": 0.4236, + "step": 18236 + }, + { + "epoch": 3.0018241417618743, + "grad_norm": 0.31813304116451246, + "learning_rate": 3.157793167200947e-06, + "loss": 0.4434, + "step": 18237 + }, + { + "epoch": 3.0019887259809903, + "grad_norm": 0.3340994695144974, + "learning_rate": 3.157344507097537e-06, + "loss": 0.4131, + "step": 18238 + }, + { + "epoch": 3.002153310200107, + "grad_norm": 0.32359193355644744, + "learning_rate": 3.1568958616446466e-06, + "loss": 0.4416, + "step": 18239 + }, + { + "epoch": 3.0023178944192233, + "grad_norm": 0.3914003964350955, + "learning_rate": 3.1564472308471723e-06, + "loss": 0.4414, + "step": 18240 + }, + { + "epoch": 3.0024824786383397, + "grad_norm": 1.5983605977406183, + "learning_rate": 3.1559986147100103e-06, + "loss": 0.4182, + "step": 18241 + }, + { + "epoch": 3.002647062857456, + "grad_norm": 0.2656196346512441, + "learning_rate": 3.155550013238054e-06, + "loss": 0.4238, + "step": 18242 + }, + { + "epoch": 3.0028116470765727, + "grad_norm": 0.3435030234191996, + "learning_rate": 3.1551014264362003e-06, + "loss": 0.4419, + "step": 18243 + }, + { + "epoch": 3.002976231295689, + "grad_norm": 0.2774345818391581, + "learning_rate": 3.1546528543093423e-06, + "loss": 0.4331, + "step": 18244 + }, + { + "epoch": 3.0031408155148056, + "grad_norm": 0.2795919065919006, + "learning_rate": 3.154204296862376e-06, + "loss": 0.4321, + "step": 18245 + }, + { + "epoch": 3.003305399733922, + "grad_norm": 0.3060950103109964, + "learning_rate": 3.1537557541001964e-06, + "loss": 0.4324, + "step": 18246 + }, + { + "epoch": 3.0034699839530385, + "grad_norm": 0.38995358074078834, + "learning_rate": 3.153307226027694e-06, + "loss": 0.4382, + "step": 18247 + }, + { + "epoch": 3.003634568172155, + "grad_norm": 0.352334026299767, + "learning_rate": 3.1528587126497684e-06, + "loss": 0.4104, + "step": 18248 + }, + { + "epoch": 3.0037991523912715, + "grad_norm": 0.714688159733198, + "learning_rate": 3.1524102139713094e-06, + "loss": 0.4323, + "step": 18249 + }, + { + "epoch": 3.003963736610388, + "grad_norm": 0.3182506796247081, + "learning_rate": 3.1519617299972128e-06, + "loss": 0.4201, + "step": 18250 + }, + { + "epoch": 3.0041283208295044, + "grad_norm": 0.39036458320277834, + "learning_rate": 3.151513260732371e-06, + "loss": 0.4338, + "step": 18251 + }, + { + "epoch": 3.004292905048621, + "grad_norm": 0.2987276384376063, + "learning_rate": 3.151064806181679e-06, + "loss": 0.4213, + "step": 18252 + }, + { + "epoch": 3.0044574892677374, + "grad_norm": 0.3210273705242925, + "learning_rate": 3.15061636635003e-06, + "loss": 0.4198, + "step": 18253 + }, + { + "epoch": 3.004622073486854, + "grad_norm": 0.2906514077196313, + "learning_rate": 3.1501679412423154e-06, + "loss": 0.4283, + "step": 18254 + }, + { + "epoch": 3.0047866577059703, + "grad_norm": 0.325529310036599, + "learning_rate": 3.1497195308634315e-06, + "loss": 0.4142, + "step": 18255 + }, + { + "epoch": 3.004951241925087, + "grad_norm": 0.2876029969384978, + "learning_rate": 3.149271135218266e-06, + "loss": 0.4263, + "step": 18256 + }, + { + "epoch": 3.0051158261442033, + "grad_norm": 0.3009431027312247, + "learning_rate": 3.148822754311718e-06, + "loss": 0.4323, + "step": 18257 + }, + { + "epoch": 3.0052804103633197, + "grad_norm": 1.0456543690184665, + "learning_rate": 3.148374388148674e-06, + "loss": 0.4425, + "step": 18258 + }, + { + "epoch": 3.005444994582436, + "grad_norm": 0.41242992191256006, + "learning_rate": 3.147926036734032e-06, + "loss": 0.4133, + "step": 18259 + }, + { + "epoch": 3.0056095788015527, + "grad_norm": 0.3018174934163969, + "learning_rate": 3.147477700072681e-06, + "loss": 0.4475, + "step": 18260 + }, + { + "epoch": 3.005774163020669, + "grad_norm": 0.30378887389327897, + "learning_rate": 3.1470293781695114e-06, + "loss": 0.4343, + "step": 18261 + }, + { + "epoch": 3.0059387472397856, + "grad_norm": 0.3030320265528302, + "learning_rate": 3.1465810710294193e-06, + "loss": 0.4224, + "step": 18262 + }, + { + "epoch": 3.006103331458902, + "grad_norm": 0.32468997777886277, + "learning_rate": 3.146132778657293e-06, + "loss": 0.4206, + "step": 18263 + }, + { + "epoch": 3.0062679156780185, + "grad_norm": 0.3073859418492667, + "learning_rate": 3.145684501058027e-06, + "loss": 0.4397, + "step": 18264 + }, + { + "epoch": 3.006432499897135, + "grad_norm": 0.3535746497464689, + "learning_rate": 3.1452362382365093e-06, + "loss": 0.428, + "step": 18265 + }, + { + "epoch": 3.0065970841162515, + "grad_norm": 0.3509794888005568, + "learning_rate": 3.1447879901976346e-06, + "loss": 0.4145, + "step": 18266 + }, + { + "epoch": 3.006761668335368, + "grad_norm": 0.37055426427310695, + "learning_rate": 3.1443397569462916e-06, + "loss": 0.4202, + "step": 18267 + }, + { + "epoch": 3.0069262525544844, + "grad_norm": 0.3983158517239644, + "learning_rate": 3.143891538487374e-06, + "loss": 0.4378, + "step": 18268 + }, + { + "epoch": 3.007090836773601, + "grad_norm": 0.8847329921051714, + "learning_rate": 3.1434433348257696e-06, + "loss": 0.4332, + "step": 18269 + }, + { + "epoch": 3.0072554209927174, + "grad_norm": 0.40040346566986273, + "learning_rate": 3.1429951459663688e-06, + "loss": 0.4313, + "step": 18270 + }, + { + "epoch": 3.0074200052118334, + "grad_norm": 0.30438473774909447, + "learning_rate": 3.1425469719140652e-06, + "loss": 0.4317, + "step": 18271 + }, + { + "epoch": 3.00758458943095, + "grad_norm": 0.26880189766547447, + "learning_rate": 3.142098812673746e-06, + "loss": 0.4123, + "step": 18272 + }, + { + "epoch": 3.0077491736500663, + "grad_norm": 0.43929111116823344, + "learning_rate": 3.1416506682503034e-06, + "loss": 0.4256, + "step": 18273 + }, + { + "epoch": 3.007913757869183, + "grad_norm": 0.31012222245409504, + "learning_rate": 3.1412025386486254e-06, + "loss": 0.4152, + "step": 18274 + }, + { + "epoch": 3.0080783420882993, + "grad_norm": 0.3281837024857674, + "learning_rate": 3.140754423873605e-06, + "loss": 0.4276, + "step": 18275 + }, + { + "epoch": 3.0082429263074157, + "grad_norm": 0.359541434317751, + "learning_rate": 3.14030632393013e-06, + "loss": 0.4311, + "step": 18276 + }, + { + "epoch": 3.008407510526532, + "grad_norm": 0.381024987330811, + "learning_rate": 3.1398582388230866e-06, + "loss": 0.4349, + "step": 18277 + }, + { + "epoch": 3.0085720947456487, + "grad_norm": 0.2725960858202017, + "learning_rate": 3.1394101685573705e-06, + "loss": 0.4377, + "step": 18278 + }, + { + "epoch": 3.008736678964765, + "grad_norm": 0.29303313243267587, + "learning_rate": 3.1389621131378657e-06, + "loss": 0.4281, + "step": 18279 + }, + { + "epoch": 3.0089012631838816, + "grad_norm": 0.35231873351827986, + "learning_rate": 3.1385140725694644e-06, + "loss": 0.4281, + "step": 18280 + }, + { + "epoch": 3.009065847402998, + "grad_norm": 0.32507177675599463, + "learning_rate": 3.138066046857053e-06, + "loss": 0.4141, + "step": 18281 + }, + { + "epoch": 3.0092304316221146, + "grad_norm": 0.3090038291024942, + "learning_rate": 3.137618036005522e-06, + "loss": 0.4278, + "step": 18282 + }, + { + "epoch": 3.009395015841231, + "grad_norm": 0.34411886423257904, + "learning_rate": 3.137170040019759e-06, + "loss": 0.435, + "step": 18283 + }, + { + "epoch": 3.0095596000603475, + "grad_norm": 0.3207335840825621, + "learning_rate": 3.1367220589046525e-06, + "loss": 0.4271, + "step": 18284 + }, + { + "epoch": 3.009724184279464, + "grad_norm": 0.394758107332526, + "learning_rate": 3.1362740926650915e-06, + "loss": 0.423, + "step": 18285 + }, + { + "epoch": 3.0098887684985804, + "grad_norm": 0.3173756255656595, + "learning_rate": 3.1358261413059622e-06, + "loss": 0.4243, + "step": 18286 + }, + { + "epoch": 3.010053352717697, + "grad_norm": 0.36759515811796317, + "learning_rate": 3.1353782048321555e-06, + "loss": 0.4229, + "step": 18287 + }, + { + "epoch": 3.0102179369368134, + "grad_norm": 0.3570919253203036, + "learning_rate": 3.1349302832485547e-06, + "loss": 0.4363, + "step": 18288 + }, + { + "epoch": 3.01038252115593, + "grad_norm": 0.44230721314041077, + "learning_rate": 3.1344823765600525e-06, + "loss": 0.4322, + "step": 18289 + }, + { + "epoch": 3.0105471053750463, + "grad_norm": 0.41149678706373655, + "learning_rate": 3.1340344847715324e-06, + "loss": 0.4149, + "step": 18290 + }, + { + "epoch": 3.010711689594163, + "grad_norm": 0.3896154491901678, + "learning_rate": 3.133586607887882e-06, + "loss": 0.4451, + "step": 18291 + }, + { + "epoch": 3.0108762738132793, + "grad_norm": 0.4611229387144769, + "learning_rate": 3.1331387459139904e-06, + "loss": 0.4311, + "step": 18292 + }, + { + "epoch": 3.0110408580323957, + "grad_norm": 0.41381418119138585, + "learning_rate": 3.1326908988547425e-06, + "loss": 0.4272, + "step": 18293 + }, + { + "epoch": 3.011205442251512, + "grad_norm": 0.33006048790236775, + "learning_rate": 3.1322430667150263e-06, + "loss": 0.4307, + "step": 18294 + }, + { + "epoch": 3.0113700264706287, + "grad_norm": 0.2806686050008286, + "learning_rate": 3.1317952494997273e-06, + "loss": 0.4334, + "step": 18295 + }, + { + "epoch": 3.011534610689745, + "grad_norm": 0.303403131668887, + "learning_rate": 3.1313474472137334e-06, + "loss": 0.4349, + "step": 18296 + }, + { + "epoch": 3.0116991949088616, + "grad_norm": 0.36015904202072035, + "learning_rate": 3.130899659861931e-06, + "loss": 0.4269, + "step": 18297 + }, + { + "epoch": 3.011863779127978, + "grad_norm": 0.39213784370577165, + "learning_rate": 3.1304518874492027e-06, + "loss": 0.4165, + "step": 18298 + }, + { + "epoch": 3.0120283633470946, + "grad_norm": 0.4320711827383614, + "learning_rate": 3.130004129980438e-06, + "loss": 0.4225, + "step": 18299 + }, + { + "epoch": 3.012192947566211, + "grad_norm": 0.3721589267015674, + "learning_rate": 3.1295563874605203e-06, + "loss": 0.4128, + "step": 18300 + }, + { + "epoch": 3.0123575317853275, + "grad_norm": 0.38413959309953516, + "learning_rate": 3.1291086598943376e-06, + "loss": 0.4374, + "step": 18301 + }, + { + "epoch": 3.012522116004444, + "grad_norm": 0.41329479123233775, + "learning_rate": 3.128660947286773e-06, + "loss": 0.4546, + "step": 18302 + }, + { + "epoch": 3.0126867002235604, + "grad_norm": 0.2678832781833614, + "learning_rate": 3.128213249642714e-06, + "loss": 0.4175, + "step": 18303 + }, + { + "epoch": 3.0128512844426765, + "grad_norm": 0.35521270889326156, + "learning_rate": 3.1277655669670446e-06, + "loss": 0.4205, + "step": 18304 + }, + { + "epoch": 3.013015868661793, + "grad_norm": 0.3104405779774533, + "learning_rate": 3.1273178992646486e-06, + "loss": 0.4259, + "step": 18305 + }, + { + "epoch": 3.0131804528809094, + "grad_norm": 0.2858893085394464, + "learning_rate": 3.1268702465404133e-06, + "loss": 0.4198, + "step": 18306 + }, + { + "epoch": 3.013345037100026, + "grad_norm": 0.37408477887754, + "learning_rate": 3.1264226087992197e-06, + "loss": 0.4288, + "step": 18307 + }, + { + "epoch": 3.0135096213191424, + "grad_norm": 0.4217822609844656, + "learning_rate": 3.1259749860459565e-06, + "loss": 0.4422, + "step": 18308 + }, + { + "epoch": 3.013674205538259, + "grad_norm": 0.26756539192671847, + "learning_rate": 3.125527378285504e-06, + "loss": 0.41, + "step": 18309 + }, + { + "epoch": 3.0138387897573753, + "grad_norm": 0.47541946698410265, + "learning_rate": 3.1250797855227504e-06, + "loss": 0.4163, + "step": 18310 + }, + { + "epoch": 3.0140033739764918, + "grad_norm": 0.384741342753798, + "learning_rate": 3.1246322077625755e-06, + "loss": 0.438, + "step": 18311 + }, + { + "epoch": 3.0141679581956082, + "grad_norm": 0.28876904645439844, + "learning_rate": 3.1241846450098664e-06, + "loss": 0.4262, + "step": 18312 + }, + { + "epoch": 3.0143325424147247, + "grad_norm": 0.28806536770785424, + "learning_rate": 3.1237370972695048e-06, + "loss": 0.4164, + "step": 18313 + }, + { + "epoch": 3.014497126633841, + "grad_norm": 0.3084635180596369, + "learning_rate": 3.1232895645463744e-06, + "loss": 0.4196, + "step": 18314 + }, + { + "epoch": 3.0146617108529576, + "grad_norm": 0.3054116371458221, + "learning_rate": 3.122842046845359e-06, + "loss": 0.4368, + "step": 18315 + }, + { + "epoch": 3.014826295072074, + "grad_norm": 0.3836994030013055, + "learning_rate": 3.1223945441713413e-06, + "loss": 0.4339, + "step": 18316 + }, + { + "epoch": 3.0149908792911906, + "grad_norm": 0.373344678246291, + "learning_rate": 3.1219470565292064e-06, + "loss": 0.4271, + "step": 18317 + }, + { + "epoch": 3.015155463510307, + "grad_norm": 0.3233689476672488, + "learning_rate": 3.121499583923832e-06, + "loss": 0.4241, + "step": 18318 + }, + { + "epoch": 3.0153200477294235, + "grad_norm": 0.49564569197291564, + "learning_rate": 3.121052126360107e-06, + "loss": 0.4406, + "step": 18319 + }, + { + "epoch": 3.01548463194854, + "grad_norm": 0.46988181103347443, + "learning_rate": 3.12060468384291e-06, + "loss": 0.4386, + "step": 18320 + }, + { + "epoch": 3.0156492161676565, + "grad_norm": 0.4603952940576739, + "learning_rate": 3.1201572563771234e-06, + "loss": 0.4121, + "step": 18321 + }, + { + "epoch": 3.015813800386773, + "grad_norm": 0.3319710266670054, + "learning_rate": 3.1197098439676307e-06, + "loss": 0.4435, + "step": 18322 + }, + { + "epoch": 3.0159783846058894, + "grad_norm": 0.28959240783036866, + "learning_rate": 3.1192624466193127e-06, + "loss": 0.415, + "step": 18323 + }, + { + "epoch": 3.016142968825006, + "grad_norm": 0.2921197590228239, + "learning_rate": 3.118815064337052e-06, + "loss": 0.4336, + "step": 18324 + }, + { + "epoch": 3.0163075530441223, + "grad_norm": 0.29168013057506503, + "learning_rate": 3.1183676971257303e-06, + "loss": 0.4458, + "step": 18325 + }, + { + "epoch": 3.016472137263239, + "grad_norm": 0.3304046783916581, + "learning_rate": 3.117920344990229e-06, + "loss": 0.4069, + "step": 18326 + }, + { + "epoch": 3.0166367214823553, + "grad_norm": 0.3093798804530362, + "learning_rate": 3.1174730079354304e-06, + "loss": 0.4122, + "step": 18327 + }, + { + "epoch": 3.0168013057014718, + "grad_norm": 0.26771936900248855, + "learning_rate": 3.117025685966212e-06, + "loss": 0.4209, + "step": 18328 + }, + { + "epoch": 3.0169658899205882, + "grad_norm": 0.40780476703473845, + "learning_rate": 3.11657837908746e-06, + "loss": 0.4304, + "step": 18329 + }, + { + "epoch": 3.0171304741397047, + "grad_norm": 0.32813383839828936, + "learning_rate": 3.1161310873040504e-06, + "loss": 0.4279, + "step": 18330 + }, + { + "epoch": 3.017295058358821, + "grad_norm": 0.2798937789325171, + "learning_rate": 3.1156838106208675e-06, + "loss": 0.423, + "step": 18331 + }, + { + "epoch": 3.0174596425779376, + "grad_norm": 0.32055734462181357, + "learning_rate": 3.11523654904279e-06, + "loss": 0.4241, + "step": 18332 + }, + { + "epoch": 3.017624226797054, + "grad_norm": 0.3172206049676667, + "learning_rate": 3.1147893025746985e-06, + "loss": 0.4325, + "step": 18333 + }, + { + "epoch": 3.0177888110161706, + "grad_norm": 0.3434309037151508, + "learning_rate": 3.1143420712214735e-06, + "loss": 0.4463, + "step": 18334 + }, + { + "epoch": 3.017953395235287, + "grad_norm": 0.3742062244031275, + "learning_rate": 3.113894854987994e-06, + "loss": 0.434, + "step": 18335 + }, + { + "epoch": 3.0181179794544035, + "grad_norm": 0.25831774897452026, + "learning_rate": 3.1134476538791415e-06, + "loss": 0.4124, + "step": 18336 + }, + { + "epoch": 3.0182825636735195, + "grad_norm": 0.3204505571282885, + "learning_rate": 3.1130004678997944e-06, + "loss": 0.4201, + "step": 18337 + }, + { + "epoch": 3.018447147892636, + "grad_norm": 0.3682921298167499, + "learning_rate": 3.112553297054834e-06, + "loss": 0.4264, + "step": 18338 + }, + { + "epoch": 3.0186117321117525, + "grad_norm": 0.23909293619699393, + "learning_rate": 3.1121061413491355e-06, + "loss": 0.4198, + "step": 18339 + }, + { + "epoch": 3.018776316330869, + "grad_norm": 0.29361493096236757, + "learning_rate": 3.1116590007875837e-06, + "loss": 0.4076, + "step": 18340 + }, + { + "epoch": 3.0189409005499854, + "grad_norm": 0.4612104279180189, + "learning_rate": 3.1112118753750543e-06, + "loss": 0.4377, + "step": 18341 + }, + { + "epoch": 3.019105484769102, + "grad_norm": 0.281475975355378, + "learning_rate": 3.110764765116425e-06, + "loss": 0.438, + "step": 18342 + }, + { + "epoch": 3.0192700689882184, + "grad_norm": 0.3623311275412427, + "learning_rate": 3.1103176700165773e-06, + "loss": 0.4306, + "step": 18343 + }, + { + "epoch": 3.019434653207335, + "grad_norm": 0.37699299963368665, + "learning_rate": 3.1098705900803884e-06, + "loss": 0.4141, + "step": 18344 + }, + { + "epoch": 3.0195992374264513, + "grad_norm": 0.33703887500358937, + "learning_rate": 3.109423525312737e-06, + "loss": 0.4311, + "step": 18345 + }, + { + "epoch": 3.019763821645568, + "grad_norm": 0.3045504218476557, + "learning_rate": 3.1089764757185005e-06, + "loss": 0.4219, + "step": 18346 + }, + { + "epoch": 3.0199284058646843, + "grad_norm": 0.3574769206992018, + "learning_rate": 3.108529441302558e-06, + "loss": 0.4169, + "step": 18347 + }, + { + "epoch": 3.0200929900838007, + "grad_norm": 0.31503034784782696, + "learning_rate": 3.1080824220697882e-06, + "loss": 0.4291, + "step": 18348 + }, + { + "epoch": 3.020257574302917, + "grad_norm": 0.38223697652308825, + "learning_rate": 3.107635418025066e-06, + "loss": 0.423, + "step": 18349 + }, + { + "epoch": 3.0204221585220337, + "grad_norm": 0.3895900387968357, + "learning_rate": 3.1071884291732707e-06, + "loss": 0.4337, + "step": 18350 + }, + { + "epoch": 3.02058674274115, + "grad_norm": 0.38589754748451754, + "learning_rate": 3.106741455519278e-06, + "loss": 0.4202, + "step": 18351 + }, + { + "epoch": 3.0207513269602666, + "grad_norm": 0.474879256779032, + "learning_rate": 3.106294497067968e-06, + "loss": 0.4267, + "step": 18352 + }, + { + "epoch": 3.020915911179383, + "grad_norm": 0.3209696342392641, + "learning_rate": 3.105847553824216e-06, + "loss": 0.4245, + "step": 18353 + }, + { + "epoch": 3.0210804953984995, + "grad_norm": 0.34778376983732656, + "learning_rate": 3.105400625792899e-06, + "loss": 0.4101, + "step": 18354 + }, + { + "epoch": 3.021245079617616, + "grad_norm": 0.33399183177152747, + "learning_rate": 3.1049537129788937e-06, + "loss": 0.4097, + "step": 18355 + }, + { + "epoch": 3.0214096638367325, + "grad_norm": 0.2988899506318333, + "learning_rate": 3.104506815387077e-06, + "loss": 0.4483, + "step": 18356 + }, + { + "epoch": 3.021574248055849, + "grad_norm": 0.3500392847200933, + "learning_rate": 3.1040599330223264e-06, + "loss": 0.447, + "step": 18357 + }, + { + "epoch": 3.0217388322749654, + "grad_norm": 0.48930436377342484, + "learning_rate": 3.1036130658895134e-06, + "loss": 0.4277, + "step": 18358 + }, + { + "epoch": 3.021903416494082, + "grad_norm": 0.3234127630400757, + "learning_rate": 3.1031662139935205e-06, + "loss": 0.4391, + "step": 18359 + }, + { + "epoch": 3.0220680007131984, + "grad_norm": 0.27320307065801214, + "learning_rate": 3.102719377339217e-06, + "loss": 0.4229, + "step": 18360 + }, + { + "epoch": 3.022232584932315, + "grad_norm": 0.5085343141406332, + "learning_rate": 3.1022725559314855e-06, + "loss": 0.4194, + "step": 18361 + }, + { + "epoch": 3.0223971691514313, + "grad_norm": 0.2773280004945635, + "learning_rate": 3.101825749775196e-06, + "loss": 0.4516, + "step": 18362 + }, + { + "epoch": 3.0225617533705478, + "grad_norm": 0.382392052409632, + "learning_rate": 3.1013789588752266e-06, + "loss": 0.444, + "step": 18363 + }, + { + "epoch": 3.0227263375896642, + "grad_norm": 0.36171418520805143, + "learning_rate": 3.1009321832364516e-06, + "loss": 0.4243, + "step": 18364 + }, + { + "epoch": 3.0228909218087807, + "grad_norm": 0.3450737581253108, + "learning_rate": 3.1004854228637462e-06, + "loss": 0.442, + "step": 18365 + }, + { + "epoch": 3.023055506027897, + "grad_norm": 0.49415695404695414, + "learning_rate": 3.1000386777619857e-06, + "loss": 0.4384, + "step": 18366 + }, + { + "epoch": 3.0232200902470137, + "grad_norm": 0.3017298209376994, + "learning_rate": 3.099591947936044e-06, + "loss": 0.4414, + "step": 18367 + }, + { + "epoch": 3.02338467446613, + "grad_norm": 0.4087842470656403, + "learning_rate": 3.099145233390797e-06, + "loss": 0.4327, + "step": 18368 + }, + { + "epoch": 3.0235492586852466, + "grad_norm": 0.40924406967671867, + "learning_rate": 3.0986985341311157e-06, + "loss": 0.4331, + "step": 18369 + }, + { + "epoch": 3.023713842904363, + "grad_norm": 0.2763617320415296, + "learning_rate": 3.098251850161879e-06, + "loss": 0.4223, + "step": 18370 + }, + { + "epoch": 3.023878427123479, + "grad_norm": 0.9895023088998105, + "learning_rate": 3.0978051814879594e-06, + "loss": 0.4436, + "step": 18371 + }, + { + "epoch": 3.0240430113425956, + "grad_norm": 0.3230702156672306, + "learning_rate": 3.0973585281142276e-06, + "loss": 0.4243, + "step": 18372 + }, + { + "epoch": 3.024207595561712, + "grad_norm": 0.334355242509786, + "learning_rate": 3.0969118900455616e-06, + "loss": 0.4267, + "step": 18373 + }, + { + "epoch": 3.0243721797808285, + "grad_norm": 0.30770411266118664, + "learning_rate": 3.0964652672868314e-06, + "loss": 0.4324, + "step": 18374 + }, + { + "epoch": 3.024536763999945, + "grad_norm": 0.2885193574945699, + "learning_rate": 3.096018659842913e-06, + "loss": 0.4276, + "step": 18375 + }, + { + "epoch": 3.0247013482190614, + "grad_norm": 0.34113089461111257, + "learning_rate": 3.0955720677186783e-06, + "loss": 0.4392, + "step": 18376 + }, + { + "epoch": 3.024865932438178, + "grad_norm": 0.452973237345632, + "learning_rate": 3.0951254909190017e-06, + "loss": 0.4398, + "step": 18377 + }, + { + "epoch": 3.0250305166572944, + "grad_norm": 0.34959258473425364, + "learning_rate": 3.0946789294487554e-06, + "loss": 0.4286, + "step": 18378 + }, + { + "epoch": 3.025195100876411, + "grad_norm": 0.3275405935360879, + "learning_rate": 3.0942323833128095e-06, + "loss": 0.4168, + "step": 18379 + }, + { + "epoch": 3.0253596850955273, + "grad_norm": 0.3018346913482798, + "learning_rate": 3.093785852516041e-06, + "loss": 0.4273, + "step": 18380 + }, + { + "epoch": 3.025524269314644, + "grad_norm": 0.36688729357237526, + "learning_rate": 3.0933393370633182e-06, + "loss": 0.42, + "step": 18381 + }, + { + "epoch": 3.0256888535337603, + "grad_norm": 0.4861306614021659, + "learning_rate": 3.0928928369595167e-06, + "loss": 0.3911, + "step": 18382 + }, + { + "epoch": 3.0258534377528767, + "grad_norm": 0.3492407070341485, + "learning_rate": 3.0924463522095054e-06, + "loss": 0.4361, + "step": 18383 + }, + { + "epoch": 3.026018021971993, + "grad_norm": 0.3980551315439607, + "learning_rate": 3.0919998828181593e-06, + "loss": 0.4216, + "step": 18384 + }, + { + "epoch": 3.0261826061911097, + "grad_norm": 0.38357388279611354, + "learning_rate": 3.0915534287903485e-06, + "loss": 0.419, + "step": 18385 + }, + { + "epoch": 3.026347190410226, + "grad_norm": 0.4891027846917855, + "learning_rate": 3.091106990130943e-06, + "loss": 0.4204, + "step": 18386 + }, + { + "epoch": 3.0265117746293426, + "grad_norm": 0.3391882237310598, + "learning_rate": 3.0906605668448177e-06, + "loss": 0.4179, + "step": 18387 + }, + { + "epoch": 3.026676358848459, + "grad_norm": 0.33533144601984904, + "learning_rate": 3.0902141589368396e-06, + "loss": 0.4356, + "step": 18388 + }, + { + "epoch": 3.0268409430675756, + "grad_norm": 0.3100015251700531, + "learning_rate": 3.0897677664118844e-06, + "loss": 0.4194, + "step": 18389 + }, + { + "epoch": 3.027005527286692, + "grad_norm": 0.3667714832533851, + "learning_rate": 3.089321389274818e-06, + "loss": 0.4085, + "step": 18390 + }, + { + "epoch": 3.0271701115058085, + "grad_norm": 0.32901637661456695, + "learning_rate": 3.088875027530516e-06, + "loss": 0.4409, + "step": 18391 + }, + { + "epoch": 3.027334695724925, + "grad_norm": 0.3926995176310053, + "learning_rate": 3.088428681183845e-06, + "loss": 0.425, + "step": 18392 + }, + { + "epoch": 3.0274992799440414, + "grad_norm": 0.33404719587309123, + "learning_rate": 3.087982350239677e-06, + "loss": 0.4376, + "step": 18393 + }, + { + "epoch": 3.027663864163158, + "grad_norm": 0.27830564539727404, + "learning_rate": 3.087536034702883e-06, + "loss": 0.4219, + "step": 18394 + }, + { + "epoch": 3.0278284483822744, + "grad_norm": 0.411080967064806, + "learning_rate": 3.0870897345783307e-06, + "loss": 0.4101, + "step": 18395 + }, + { + "epoch": 3.027993032601391, + "grad_norm": 0.337151936162758, + "learning_rate": 3.0866434498708924e-06, + "loss": 0.4232, + "step": 18396 + }, + { + "epoch": 3.0281576168205073, + "grad_norm": 0.28520005890645866, + "learning_rate": 3.086197180585436e-06, + "loss": 0.4461, + "step": 18397 + }, + { + "epoch": 3.028322201039624, + "grad_norm": 0.33666331248057874, + "learning_rate": 3.0857509267268324e-06, + "loss": 0.4296, + "step": 18398 + }, + { + "epoch": 3.0284867852587403, + "grad_norm": 0.30655635466216224, + "learning_rate": 3.0853046882999488e-06, + "loss": 0.4181, + "step": 18399 + }, + { + "epoch": 3.0286513694778567, + "grad_norm": 0.37866615924108316, + "learning_rate": 3.084858465309659e-06, + "loss": 0.4393, + "step": 18400 + } + ], + "logging_steps": 1.0, + "max_steps": 30375, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.8309052386849587e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}