{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.989993328885924, "eval_steps": 500, "global_step": 935, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00533689126084056, "grad_norm": 6.148000037568765, "learning_rate": 4.2553191489361704e-07, "loss": 0.9543, "step": 1 }, { "epoch": 0.01067378252168112, "grad_norm": 5.76207120136237, "learning_rate": 8.510638297872341e-07, "loss": 0.9275, "step": 2 }, { "epoch": 0.016010673782521682, "grad_norm": 5.846133487180721, "learning_rate": 1.276595744680851e-06, "loss": 0.9352, "step": 3 }, { "epoch": 0.02134756504336224, "grad_norm": 5.948943250861422, "learning_rate": 1.7021276595744682e-06, "loss": 0.9911, "step": 4 }, { "epoch": 0.0266844563042028, "grad_norm": 5.471501096137681, "learning_rate": 2.1276595744680853e-06, "loss": 0.9284, "step": 5 }, { "epoch": 0.032021347565043365, "grad_norm": 4.598154721919055, "learning_rate": 2.553191489361702e-06, "loss": 0.9151, "step": 6 }, { "epoch": 0.037358238825883926, "grad_norm": 4.191582543667081, "learning_rate": 2.978723404255319e-06, "loss": 0.9174, "step": 7 }, { "epoch": 0.04269513008672448, "grad_norm": 2.2852207165276175, "learning_rate": 3.4042553191489363e-06, "loss": 0.8733, "step": 8 }, { "epoch": 0.04803202134756504, "grad_norm": 1.9926907110993184, "learning_rate": 3.8297872340425535e-06, "loss": 0.8863, "step": 9 }, { "epoch": 0.0533689126084056, "grad_norm": 1.6897648811838724, "learning_rate": 4.255319148936171e-06, "loss": 0.8197, "step": 10 }, { "epoch": 0.05870580386924616, "grad_norm": 4.197834380211571, "learning_rate": 4.680851063829788e-06, "loss": 0.8614, "step": 11 }, { "epoch": 0.06404269513008673, "grad_norm": 4.349335585259934, "learning_rate": 5.106382978723404e-06, "loss": 0.8723, "step": 12 }, { "epoch": 0.06937958639092728, "grad_norm": 3.969299235666275, "learning_rate": 5.531914893617022e-06, "loss": 0.8121, "step": 13 }, { "epoch": 0.07471647765176785, "grad_norm": 3.1468331398249854, "learning_rate": 5.957446808510638e-06, "loss": 0.8111, "step": 14 }, { "epoch": 0.0800533689126084, "grad_norm": 3.060340690213249, "learning_rate": 6.382978723404256e-06, "loss": 0.7968, "step": 15 }, { "epoch": 0.08539026017344896, "grad_norm": 2.5011639930656364, "learning_rate": 6.808510638297873e-06, "loss": 0.7474, "step": 16 }, { "epoch": 0.09072715143428953, "grad_norm": 2.0423245778644086, "learning_rate": 7.234042553191491e-06, "loss": 0.7869, "step": 17 }, { "epoch": 0.09606404269513008, "grad_norm": 1.7376707160976537, "learning_rate": 7.659574468085107e-06, "loss": 0.7671, "step": 18 }, { "epoch": 0.10140093395597065, "grad_norm": 1.615732209627739, "learning_rate": 8.085106382978723e-06, "loss": 0.7463, "step": 19 }, { "epoch": 0.1067378252168112, "grad_norm": 1.728049657874694, "learning_rate": 8.510638297872341e-06, "loss": 0.7339, "step": 20 }, { "epoch": 0.11207471647765177, "grad_norm": 1.7850972209703333, "learning_rate": 8.936170212765958e-06, "loss": 0.7487, "step": 21 }, { "epoch": 0.11741160773849232, "grad_norm": 1.427350518416662, "learning_rate": 9.361702127659576e-06, "loss": 0.7327, "step": 22 }, { "epoch": 0.12274849899933289, "grad_norm": 1.259778470170131, "learning_rate": 9.787234042553192e-06, "loss": 0.7032, "step": 23 }, { "epoch": 0.12808539026017346, "grad_norm": 1.2541341499268899, "learning_rate": 1.0212765957446808e-05, "loss": 0.7157, "step": 24 }, { "epoch": 0.133422281521014, "grad_norm": 1.209399238284116, "learning_rate": 1.0638297872340426e-05, "loss": 0.696, "step": 25 }, { "epoch": 0.13875917278185457, "grad_norm": 1.1148869413377325, "learning_rate": 1.1063829787234044e-05, "loss": 0.7342, "step": 26 }, { "epoch": 0.14409606404269512, "grad_norm": 1.0267231678585527, "learning_rate": 1.1489361702127662e-05, "loss": 0.7014, "step": 27 }, { "epoch": 0.1494329553035357, "grad_norm": 1.053000012577453, "learning_rate": 1.1914893617021277e-05, "loss": 0.7193, "step": 28 }, { "epoch": 0.15476984656437626, "grad_norm": 0.9236120588909748, "learning_rate": 1.2340425531914895e-05, "loss": 0.6848, "step": 29 }, { "epoch": 0.1601067378252168, "grad_norm": 0.707299824861324, "learning_rate": 1.2765957446808513e-05, "loss": 0.6967, "step": 30 }, { "epoch": 0.16544362908605736, "grad_norm": 0.7954547209115258, "learning_rate": 1.3191489361702127e-05, "loss": 0.7101, "step": 31 }, { "epoch": 0.17078052034689792, "grad_norm": 0.7740855587703414, "learning_rate": 1.3617021276595745e-05, "loss": 0.6959, "step": 32 }, { "epoch": 0.1761174116077385, "grad_norm": 0.6911657011135235, "learning_rate": 1.4042553191489363e-05, "loss": 0.6649, "step": 33 }, { "epoch": 0.18145430286857905, "grad_norm": 0.6352745397687202, "learning_rate": 1.4468085106382981e-05, "loss": 0.7012, "step": 34 }, { "epoch": 0.1867911941294196, "grad_norm": 0.5651606409699009, "learning_rate": 1.4893617021276596e-05, "loss": 0.6294, "step": 35 }, { "epoch": 0.19212808539026016, "grad_norm": 0.7512985605813014, "learning_rate": 1.5319148936170214e-05, "loss": 0.6641, "step": 36 }, { "epoch": 0.19746497665110074, "grad_norm": 0.6367644839194788, "learning_rate": 1.5744680851063832e-05, "loss": 0.6414, "step": 37 }, { "epoch": 0.2028018679119413, "grad_norm": 0.6727260374168498, "learning_rate": 1.6170212765957446e-05, "loss": 0.6606, "step": 38 }, { "epoch": 0.20813875917278185, "grad_norm": 0.5863518339156979, "learning_rate": 1.6595744680851064e-05, "loss": 0.629, "step": 39 }, { "epoch": 0.2134756504336224, "grad_norm": 0.5957413274178028, "learning_rate": 1.7021276595744682e-05, "loss": 0.6433, "step": 40 }, { "epoch": 0.218812541694463, "grad_norm": 0.6599440739948166, "learning_rate": 1.74468085106383e-05, "loss": 0.6458, "step": 41 }, { "epoch": 0.22414943295530354, "grad_norm": 0.6721754940213285, "learning_rate": 1.7872340425531915e-05, "loss": 0.6492, "step": 42 }, { "epoch": 0.2294863242161441, "grad_norm": 0.7632199119447193, "learning_rate": 1.8297872340425533e-05, "loss": 0.6746, "step": 43 }, { "epoch": 0.23482321547698465, "grad_norm": 0.5166294877933729, "learning_rate": 1.872340425531915e-05, "loss": 0.6295, "step": 44 }, { "epoch": 0.24016010673782523, "grad_norm": 0.829534817664272, "learning_rate": 1.914893617021277e-05, "loss": 0.6375, "step": 45 }, { "epoch": 0.24549699799866578, "grad_norm": 0.5857977032260275, "learning_rate": 1.9574468085106384e-05, "loss": 0.6572, "step": 46 }, { "epoch": 0.25083388925950634, "grad_norm": 0.7823636354279289, "learning_rate": 2e-05, "loss": 0.6504, "step": 47 }, { "epoch": 0.2561707805203469, "grad_norm": 0.7047081341160781, "learning_rate": 2.0425531914893616e-05, "loss": 0.6228, "step": 48 }, { "epoch": 0.26150767178118745, "grad_norm": 0.6863987126187889, "learning_rate": 2.0851063829787238e-05, "loss": 0.6277, "step": 49 }, { "epoch": 0.266844563042028, "grad_norm": 0.6190735802489663, "learning_rate": 2.1276595744680852e-05, "loss": 0.6425, "step": 50 }, { "epoch": 0.27218145430286855, "grad_norm": 0.6101823544223539, "learning_rate": 2.1702127659574467e-05, "loss": 0.6437, "step": 51 }, { "epoch": 0.27751834556370913, "grad_norm": 0.6644263069643335, "learning_rate": 2.2127659574468088e-05, "loss": 0.6387, "step": 52 }, { "epoch": 0.2828552368245497, "grad_norm": 0.5986962358886303, "learning_rate": 2.2553191489361703e-05, "loss": 0.6385, "step": 53 }, { "epoch": 0.28819212808539024, "grad_norm": 0.6094843736073496, "learning_rate": 2.2978723404255324e-05, "loss": 0.6117, "step": 54 }, { "epoch": 0.2935290193462308, "grad_norm": 0.6020778721421863, "learning_rate": 2.340425531914894e-05, "loss": 0.6254, "step": 55 }, { "epoch": 0.2988659106070714, "grad_norm": 0.811050821660795, "learning_rate": 2.3829787234042553e-05, "loss": 0.6128, "step": 56 }, { "epoch": 0.30420280186791193, "grad_norm": 0.5754298827220308, "learning_rate": 2.4255319148936175e-05, "loss": 0.6311, "step": 57 }, { "epoch": 0.3095396931287525, "grad_norm": 0.7520600773223384, "learning_rate": 2.468085106382979e-05, "loss": 0.6179, "step": 58 }, { "epoch": 0.31487658438959304, "grad_norm": 0.6029568660086547, "learning_rate": 2.5106382978723404e-05, "loss": 0.6297, "step": 59 }, { "epoch": 0.3202134756504336, "grad_norm": 0.5648234220070932, "learning_rate": 2.5531914893617025e-05, "loss": 0.6281, "step": 60 }, { "epoch": 0.3255503669112742, "grad_norm": 0.6476246719033275, "learning_rate": 2.595744680851064e-05, "loss": 0.627, "step": 61 }, { "epoch": 0.33088725817211473, "grad_norm": 0.7385881909771188, "learning_rate": 2.6382978723404255e-05, "loss": 0.6173, "step": 62 }, { "epoch": 0.3362241494329553, "grad_norm": 0.7821703232556236, "learning_rate": 2.6808510638297876e-05, "loss": 0.6044, "step": 63 }, { "epoch": 0.34156104069379584, "grad_norm": 0.6836861306090984, "learning_rate": 2.723404255319149e-05, "loss": 0.588, "step": 64 }, { "epoch": 0.3468979319546364, "grad_norm": 0.6580791298040481, "learning_rate": 2.7659574468085112e-05, "loss": 0.6286, "step": 65 }, { "epoch": 0.352234823215477, "grad_norm": 0.864506493478045, "learning_rate": 2.8085106382978727e-05, "loss": 0.5923, "step": 66 }, { "epoch": 0.3575717144763175, "grad_norm": 0.7017167406064279, "learning_rate": 2.851063829787234e-05, "loss": 0.5999, "step": 67 }, { "epoch": 0.3629086057371581, "grad_norm": 0.9052162942115397, "learning_rate": 2.8936170212765963e-05, "loss": 0.6348, "step": 68 }, { "epoch": 0.3682454969979987, "grad_norm": 0.7977337352191972, "learning_rate": 2.9361702127659577e-05, "loss": 0.5823, "step": 69 }, { "epoch": 0.3735823882588392, "grad_norm": 0.741861801885776, "learning_rate": 2.9787234042553192e-05, "loss": 0.6078, "step": 70 }, { "epoch": 0.3789192795196798, "grad_norm": 0.8436055916627975, "learning_rate": 3.0212765957446813e-05, "loss": 0.65, "step": 71 }, { "epoch": 0.3842561707805203, "grad_norm": 0.9156977886774698, "learning_rate": 3.063829787234043e-05, "loss": 0.6131, "step": 72 }, { "epoch": 0.3895930620413609, "grad_norm": 0.7176731849912837, "learning_rate": 3.1063829787234046e-05, "loss": 0.6065, "step": 73 }, { "epoch": 0.3949299533022015, "grad_norm": 1.0096379849584094, "learning_rate": 3.1489361702127664e-05, "loss": 0.6437, "step": 74 }, { "epoch": 0.400266844563042, "grad_norm": 0.9015077136133018, "learning_rate": 3.191489361702128e-05, "loss": 0.5947, "step": 75 }, { "epoch": 0.4056037358238826, "grad_norm": 0.9057483735249481, "learning_rate": 3.234042553191489e-05, "loss": 0.6039, "step": 76 }, { "epoch": 0.4109406270847232, "grad_norm": 1.036920332839965, "learning_rate": 3.276595744680851e-05, "loss": 0.6058, "step": 77 }, { "epoch": 0.4162775183455637, "grad_norm": 0.6532671217107379, "learning_rate": 3.319148936170213e-05, "loss": 0.6039, "step": 78 }, { "epoch": 0.4216144096064043, "grad_norm": 0.861543032559001, "learning_rate": 3.361702127659575e-05, "loss": 0.6169, "step": 79 }, { "epoch": 0.4269513008672448, "grad_norm": 0.9427328391918289, "learning_rate": 3.4042553191489365e-05, "loss": 0.5972, "step": 80 }, { "epoch": 0.4322881921280854, "grad_norm": 0.8268349751981375, "learning_rate": 3.446808510638298e-05, "loss": 0.6028, "step": 81 }, { "epoch": 0.437625083388926, "grad_norm": 0.8010280657011346, "learning_rate": 3.48936170212766e-05, "loss": 0.5937, "step": 82 }, { "epoch": 0.4429619746497665, "grad_norm": 1.4163426245382202, "learning_rate": 3.531914893617022e-05, "loss": 0.5801, "step": 83 }, { "epoch": 0.4482988659106071, "grad_norm": 0.8866001712478151, "learning_rate": 3.574468085106383e-05, "loss": 0.6121, "step": 84 }, { "epoch": 0.4536357571714476, "grad_norm": 1.2990003965290253, "learning_rate": 3.617021276595745e-05, "loss": 0.6509, "step": 85 }, { "epoch": 0.4589726484322882, "grad_norm": 0.9828125705811529, "learning_rate": 3.6595744680851066e-05, "loss": 0.6122, "step": 86 }, { "epoch": 0.46430953969312877, "grad_norm": 1.1075341839055335, "learning_rate": 3.7021276595744684e-05, "loss": 0.6258, "step": 87 }, { "epoch": 0.4696464309539693, "grad_norm": 1.0144524046084673, "learning_rate": 3.74468085106383e-05, "loss": 0.6086, "step": 88 }, { "epoch": 0.4749833222148099, "grad_norm": 1.0772540630578744, "learning_rate": 3.787234042553192e-05, "loss": 0.5999, "step": 89 }, { "epoch": 0.48032021347565046, "grad_norm": 0.881806469299722, "learning_rate": 3.829787234042554e-05, "loss": 0.6498, "step": 90 }, { "epoch": 0.485657104736491, "grad_norm": 0.9769728043077871, "learning_rate": 3.872340425531915e-05, "loss": 0.632, "step": 91 }, { "epoch": 0.49099399599733157, "grad_norm": 0.9506924647295948, "learning_rate": 3.914893617021277e-05, "loss": 0.5646, "step": 92 }, { "epoch": 0.4963308872581721, "grad_norm": 0.697157023956671, "learning_rate": 3.9574468085106385e-05, "loss": 0.5814, "step": 93 }, { "epoch": 0.5016677785190127, "grad_norm": 0.8702722934298305, "learning_rate": 4e-05, "loss": 0.6086, "step": 94 }, { "epoch": 0.5070046697798533, "grad_norm": 0.8781146273756057, "learning_rate": 3.9999860457259224e-05, "loss": 0.579, "step": 95 }, { "epoch": 0.5123415610406938, "grad_norm": 0.6961405489963192, "learning_rate": 3.99994418309841e-05, "loss": 0.6133, "step": 96 }, { "epoch": 0.5176784523015343, "grad_norm": 0.7087239184079108, "learning_rate": 3.9998744127016264e-05, "loss": 0.5902, "step": 97 }, { "epoch": 0.5230153435623749, "grad_norm": 0.5997862763213605, "learning_rate": 3.999776735509166e-05, "loss": 0.5875, "step": 98 }, { "epoch": 0.5283522348232155, "grad_norm": 0.660612211013302, "learning_rate": 3.999651152884044e-05, "loss": 0.6003, "step": 99 }, { "epoch": 0.533689126084056, "grad_norm": 0.6214742049455745, "learning_rate": 3.999497666578674e-05, "loss": 0.6351, "step": 100 }, { "epoch": 0.5390260173448966, "grad_norm": 0.764941637291837, "learning_rate": 3.999316278734846e-05, "loss": 0.5883, "step": 101 }, { "epoch": 0.5443629086057371, "grad_norm": 0.5428320010347989, "learning_rate": 3.9991069918836966e-05, "loss": 0.607, "step": 102 }, { "epoch": 0.5496997998665777, "grad_norm": 0.6973175643455113, "learning_rate": 3.998869808945671e-05, "loss": 0.5755, "step": 103 }, { "epoch": 0.5550366911274183, "grad_norm": 0.5367648197100708, "learning_rate": 3.998604733230485e-05, "loss": 0.577, "step": 104 }, { "epoch": 0.5603735823882589, "grad_norm": 0.6880065518848788, "learning_rate": 3.998311768437078e-05, "loss": 0.606, "step": 105 }, { "epoch": 0.5657104736490994, "grad_norm": 0.6838905198209162, "learning_rate": 3.9979909186535606e-05, "loss": 0.5624, "step": 106 }, { "epoch": 0.57104736490994, "grad_norm": 0.6426687525240344, "learning_rate": 3.9976421883571594e-05, "loss": 0.6195, "step": 107 }, { "epoch": 0.5763842561707805, "grad_norm": 0.7965064222410392, "learning_rate": 3.9972655824141524e-05, "loss": 0.6357, "step": 108 }, { "epoch": 0.5817211474316211, "grad_norm": 0.5885934942607214, "learning_rate": 3.996861106079801e-05, "loss": 0.558, "step": 109 }, { "epoch": 0.5870580386924616, "grad_norm": 0.7774147945831619, "learning_rate": 3.9964287649982805e-05, "loss": 0.5971, "step": 110 }, { "epoch": 0.5923949299533022, "grad_norm": 0.5353107229236527, "learning_rate": 3.9959685652025954e-05, "loss": 0.5731, "step": 111 }, { "epoch": 0.5977318212141428, "grad_norm": 0.8291069965223162, "learning_rate": 3.995480513114501e-05, "loss": 0.6127, "step": 112 }, { "epoch": 0.6030687124749833, "grad_norm": 0.7325810415707809, "learning_rate": 3.994964615544409e-05, "loss": 0.6041, "step": 113 }, { "epoch": 0.6084056037358239, "grad_norm": 0.6428699302157482, "learning_rate": 3.994420879691296e-05, "loss": 0.5808, "step": 114 }, { "epoch": 0.6137424949966644, "grad_norm": 0.5760017273903351, "learning_rate": 3.993849313142601e-05, "loss": 0.5625, "step": 115 }, { "epoch": 0.619079386257505, "grad_norm": 0.6798380160844573, "learning_rate": 3.9932499238741205e-05, "loss": 0.5739, "step": 116 }, { "epoch": 0.6244162775183456, "grad_norm": 0.6045010645880844, "learning_rate": 3.992622720249896e-05, "loss": 0.5535, "step": 117 }, { "epoch": 0.6297531687791861, "grad_norm": 0.6499452053823308, "learning_rate": 3.991967711022099e-05, "loss": 0.5774, "step": 118 }, { "epoch": 0.6350900600400267, "grad_norm": 0.852341222777626, "learning_rate": 3.991284905330908e-05, "loss": 0.6156, "step": 119 }, { "epoch": 0.6404269513008672, "grad_norm": 0.7942631863588528, "learning_rate": 3.99057431270438e-05, "loss": 0.6146, "step": 120 }, { "epoch": 0.6457638425617078, "grad_norm": 0.7346243276112419, "learning_rate": 3.989835943058321e-05, "loss": 0.6245, "step": 121 }, { "epoch": 0.6511007338225484, "grad_norm": 0.5552728410457584, "learning_rate": 3.989069806696141e-05, "loss": 0.5767, "step": 122 }, { "epoch": 0.6564376250833889, "grad_norm": 0.7057507722370929, "learning_rate": 3.9882759143087194e-05, "loss": 0.588, "step": 123 }, { "epoch": 0.6617745163442295, "grad_norm": 0.6039953078185621, "learning_rate": 3.9874542769742465e-05, "loss": 0.5562, "step": 124 }, { "epoch": 0.66711140760507, "grad_norm": 0.6614177548639164, "learning_rate": 3.9866049061580754e-05, "loss": 0.6246, "step": 125 }, { "epoch": 0.6724482988659106, "grad_norm": 0.4994146494205324, "learning_rate": 3.985727813712559e-05, "loss": 0.5781, "step": 126 }, { "epoch": 0.6777851901267512, "grad_norm": 0.6316687122403261, "learning_rate": 3.984823011876885e-05, "loss": 0.5849, "step": 127 }, { "epoch": 0.6831220813875917, "grad_norm": 0.6451958765097028, "learning_rate": 3.983890513276908e-05, "loss": 0.5729, "step": 128 }, { "epoch": 0.6884589726484323, "grad_norm": 0.6080146197253071, "learning_rate": 3.982930330924968e-05, "loss": 0.581, "step": 129 }, { "epoch": 0.6937958639092728, "grad_norm": 0.6403798321958414, "learning_rate": 3.981942478219712e-05, "loss": 0.58, "step": 130 }, { "epoch": 0.6991327551701134, "grad_norm": 0.5940580400091781, "learning_rate": 3.980926968945909e-05, "loss": 0.5851, "step": 131 }, { "epoch": 0.704469646430954, "grad_norm": 0.6114900367863954, "learning_rate": 3.9798838172742523e-05, "loss": 0.5861, "step": 132 }, { "epoch": 0.7098065376917946, "grad_norm": 0.578905757005195, "learning_rate": 3.978813037761167e-05, "loss": 0.5686, "step": 133 }, { "epoch": 0.715143428952635, "grad_norm": 0.4992188099121227, "learning_rate": 3.977714645348603e-05, "loss": 0.5839, "step": 134 }, { "epoch": 0.7204803202134756, "grad_norm": 0.6287390942847498, "learning_rate": 3.9765886553638305e-05, "loss": 0.5935, "step": 135 }, { "epoch": 0.7258172114743162, "grad_norm": 0.5377587251455253, "learning_rate": 3.975435083519221e-05, "loss": 0.5908, "step": 136 }, { "epoch": 0.7311541027351568, "grad_norm": 0.5666232050777832, "learning_rate": 3.974253945912033e-05, "loss": 0.5761, "step": 137 }, { "epoch": 0.7364909939959974, "grad_norm": 0.5157376399011453, "learning_rate": 3.9730452590241855e-05, "loss": 0.5535, "step": 138 }, { "epoch": 0.7418278852568378, "grad_norm": 0.5906288982124854, "learning_rate": 3.9718090397220235e-05, "loss": 0.5669, "step": 139 }, { "epoch": 0.7471647765176784, "grad_norm": 0.4830392788438412, "learning_rate": 3.9705453052560935e-05, "loss": 0.5834, "step": 140 }, { "epoch": 0.752501667778519, "grad_norm": 0.49576189569537626, "learning_rate": 3.9692540732608895e-05, "loss": 0.5695, "step": 141 }, { "epoch": 0.7578385590393596, "grad_norm": 0.5736086879059951, "learning_rate": 3.9679353617546185e-05, "loss": 0.5677, "step": 142 }, { "epoch": 0.7631754503002002, "grad_norm": 0.6081982434820203, "learning_rate": 3.966589189138941e-05, "loss": 0.5737, "step": 143 }, { "epoch": 0.7685123415610406, "grad_norm": 0.4513895583255699, "learning_rate": 3.9652155741987204e-05, "loss": 0.5747, "step": 144 }, { "epoch": 0.7738492328218812, "grad_norm": 0.5662887213755944, "learning_rate": 3.963814536101756e-05, "loss": 0.5642, "step": 145 }, { "epoch": 0.7791861240827218, "grad_norm": 0.5224566159232965, "learning_rate": 3.962386094398515e-05, "loss": 0.557, "step": 146 }, { "epoch": 0.7845230153435624, "grad_norm": 0.630041060013869, "learning_rate": 3.960930269021866e-05, "loss": 0.5868, "step": 147 }, { "epoch": 0.789859906604403, "grad_norm": 0.4804049278016382, "learning_rate": 3.959447080286795e-05, "loss": 0.562, "step": 148 }, { "epoch": 0.7951967978652434, "grad_norm": 0.6899935340653105, "learning_rate": 3.957936548890126e-05, "loss": 0.5842, "step": 149 }, { "epoch": 0.800533689126084, "grad_norm": 0.5121084821707802, "learning_rate": 3.956398695910225e-05, "loss": 0.5835, "step": 150 }, { "epoch": 0.8058705803869246, "grad_norm": 0.668616893098578, "learning_rate": 3.954833542806716e-05, "loss": 0.5861, "step": 151 }, { "epoch": 0.8112074716477652, "grad_norm": 0.45695972645097455, "learning_rate": 3.953241111420174e-05, "loss": 0.5684, "step": 152 }, { "epoch": 0.8165443629086058, "grad_norm": 0.5756971939096189, "learning_rate": 3.951621423971822e-05, "loss": 0.5609, "step": 153 }, { "epoch": 0.8218812541694464, "grad_norm": 0.6608106731864758, "learning_rate": 3.949974503063224e-05, "loss": 0.5832, "step": 154 }, { "epoch": 0.8272181454302868, "grad_norm": 0.45567367046023793, "learning_rate": 3.9483003716759656e-05, "loss": 0.5733, "step": 155 }, { "epoch": 0.8325550366911274, "grad_norm": 0.5216817660800224, "learning_rate": 3.946599053171334e-05, "loss": 0.5863, "step": 156 }, { "epoch": 0.837891927951968, "grad_norm": 0.5621476528316465, "learning_rate": 3.944870571289995e-05, "loss": 0.6054, "step": 157 }, { "epoch": 0.8432288192128086, "grad_norm": 0.5106888550302184, "learning_rate": 3.943114950151658e-05, "loss": 0.5567, "step": 158 }, { "epoch": 0.8485657104736491, "grad_norm": 0.48507154178393364, "learning_rate": 3.94133221425474e-05, "loss": 0.5759, "step": 159 }, { "epoch": 0.8539026017344896, "grad_norm": 0.5726162474627128, "learning_rate": 3.93952238847603e-05, "loss": 0.5689, "step": 160 }, { "epoch": 0.8592394929953302, "grad_norm": 0.5476504710325754, "learning_rate": 3.9376854980703305e-05, "loss": 0.5509, "step": 161 }, { "epoch": 0.8645763842561708, "grad_norm": 0.5841063194600445, "learning_rate": 3.935821568670113e-05, "loss": 0.5787, "step": 162 }, { "epoch": 0.8699132755170114, "grad_norm": 0.5131288550994832, "learning_rate": 3.9339306262851604e-05, "loss": 0.5543, "step": 163 }, { "epoch": 0.875250166777852, "grad_norm": 0.46011085571103316, "learning_rate": 3.932012697302202e-05, "loss": 0.5432, "step": 164 }, { "epoch": 0.8805870580386924, "grad_norm": 1.3675570339416427, "learning_rate": 3.9300678084845414e-05, "loss": 0.5746, "step": 165 }, { "epoch": 0.885923949299533, "grad_norm": 0.5279362760899026, "learning_rate": 3.928095986971693e-05, "loss": 0.5498, "step": 166 }, { "epoch": 0.8912608405603736, "grad_norm": 0.5332378885535611, "learning_rate": 3.926097260278994e-05, "loss": 0.5896, "step": 167 }, { "epoch": 0.8965977318212142, "grad_norm": 0.45490314214957983, "learning_rate": 3.924071656297224e-05, "loss": 0.5788, "step": 168 }, { "epoch": 0.9019346230820547, "grad_norm": 0.57394261542064, "learning_rate": 3.922019203292217e-05, "loss": 0.572, "step": 169 }, { "epoch": 0.9072715143428952, "grad_norm": 0.5549810760866458, "learning_rate": 3.9199399299044636e-05, "loss": 0.604, "step": 170 }, { "epoch": 0.9126084056037358, "grad_norm": 0.47674471227279036, "learning_rate": 3.9178338651487146e-05, "loss": 0.5958, "step": 171 }, { "epoch": 0.9179452968645764, "grad_norm": 0.5092134083211615, "learning_rate": 3.915701038413575e-05, "loss": 0.5463, "step": 172 }, { "epoch": 0.923282188125417, "grad_norm": 0.5631301439234547, "learning_rate": 3.913541479461095e-05, "loss": 0.5829, "step": 173 }, { "epoch": 0.9286190793862575, "grad_norm": 0.46094438063306387, "learning_rate": 3.9113552184263506e-05, "loss": 0.5647, "step": 174 }, { "epoch": 0.933955970647098, "grad_norm": 0.5137591466008057, "learning_rate": 3.9091422858170275e-05, "loss": 0.571, "step": 175 }, { "epoch": 0.9392928619079386, "grad_norm": 0.5226838432471194, "learning_rate": 3.906902712512994e-05, "loss": 0.5626, "step": 176 }, { "epoch": 0.9446297531687792, "grad_norm": 0.5466147422388645, "learning_rate": 3.904636529765872e-05, "loss": 0.5726, "step": 177 }, { "epoch": 0.9499666444296198, "grad_norm": 0.5487628898316014, "learning_rate": 3.902343769198592e-05, "loss": 0.5629, "step": 178 }, { "epoch": 0.9553035356904603, "grad_norm": 0.5645880019209112, "learning_rate": 3.900024462804968e-05, "loss": 0.5379, "step": 179 }, { "epoch": 0.9606404269513009, "grad_norm": 0.5714192698614569, "learning_rate": 3.897678642949234e-05, "loss": 0.559, "step": 180 }, { "epoch": 0.9659773182121414, "grad_norm": 0.8028140925883746, "learning_rate": 3.8953063423656055e-05, "loss": 0.5528, "step": 181 }, { "epoch": 0.971314209472982, "grad_norm": 0.5192388665650057, "learning_rate": 3.892907594157813e-05, "loss": 0.6081, "step": 182 }, { "epoch": 0.9766511007338226, "grad_norm": 0.6005085196506503, "learning_rate": 3.8904824317986475e-05, "loss": 0.597, "step": 183 }, { "epoch": 0.9819879919946631, "grad_norm": 0.5300178874667031, "learning_rate": 3.8880308891294894e-05, "loss": 0.5569, "step": 184 }, { "epoch": 0.9873248832555037, "grad_norm": 0.507750454533361, "learning_rate": 3.885553000359836e-05, "loss": 0.5902, "step": 185 }, { "epoch": 0.9926617745163442, "grad_norm": 0.5220015768920386, "learning_rate": 3.8830488000668276e-05, "loss": 0.5907, "step": 186 }, { "epoch": 0.9979986657771848, "grad_norm": 0.4864525986978283, "learning_rate": 3.8805183231947605e-05, "loss": 0.5545, "step": 187 }, { "epoch": 1.0033355570380253, "grad_norm": 0.7889929121631584, "learning_rate": 3.8779616050546035e-05, "loss": 0.8624, "step": 188 }, { "epoch": 1.0086724482988658, "grad_norm": 0.6971636670787393, "learning_rate": 3.875378681323501e-05, "loss": 0.4685, "step": 189 }, { "epoch": 1.0140093395597065, "grad_norm": 0.8621305134857299, "learning_rate": 3.872769588044279e-05, "loss": 0.5079, "step": 190 }, { "epoch": 1.019346230820547, "grad_norm": 0.868037119002654, "learning_rate": 3.8701343616249415e-05, "loss": 0.4286, "step": 191 }, { "epoch": 1.0246831220813877, "grad_norm": 0.8244134271269803, "learning_rate": 3.867473038838158e-05, "loss": 0.5353, "step": 192 }, { "epoch": 1.0300200133422281, "grad_norm": 0.750497350373635, "learning_rate": 3.864785656820758e-05, "loss": 0.4478, "step": 193 }, { "epoch": 1.0353569046030686, "grad_norm": 0.7802579665937702, "learning_rate": 3.862072253073207e-05, "loss": 0.4809, "step": 194 }, { "epoch": 1.0406937958639093, "grad_norm": 0.6332504376170986, "learning_rate": 3.859332865459082e-05, "loss": 0.4659, "step": 195 }, { "epoch": 1.0460306871247498, "grad_norm": 0.5212841037738816, "learning_rate": 3.856567532204551e-05, "loss": 0.4419, "step": 196 }, { "epoch": 1.0513675783855905, "grad_norm": 0.5429591096952704, "learning_rate": 3.853776291897831e-05, "loss": 0.471, "step": 197 }, { "epoch": 1.056704469646431, "grad_norm": 0.6105400333523633, "learning_rate": 3.850959183488655e-05, "loss": 0.4869, "step": 198 }, { "epoch": 1.0620413609072714, "grad_norm": 1.389605521972471, "learning_rate": 3.848116246287725e-05, "loss": 0.4194, "step": 199 }, { "epoch": 1.067378252168112, "grad_norm": 0.6500172584353179, "learning_rate": 3.845247519966167e-05, "loss": 0.4742, "step": 200 }, { "epoch": 1.0727151434289526, "grad_norm": 0.6231565335876623, "learning_rate": 3.842353044554973e-05, "loss": 0.4883, "step": 201 }, { "epoch": 1.0780520346897933, "grad_norm": 0.5326635797858091, "learning_rate": 3.839432860444447e-05, "loss": 0.4684, "step": 202 }, { "epoch": 1.0833889259506337, "grad_norm": 0.5630660263478705, "learning_rate": 3.836487008383638e-05, "loss": 0.4974, "step": 203 }, { "epoch": 1.0887258172114742, "grad_norm": 0.5426950088640516, "learning_rate": 3.8335155294797744e-05, "loss": 0.4966, "step": 204 }, { "epoch": 1.094062708472315, "grad_norm": 0.46720899935844223, "learning_rate": 3.8305184651976855e-05, "loss": 0.4518, "step": 205 }, { "epoch": 1.0993995997331554, "grad_norm": 0.6590650012988468, "learning_rate": 3.827495857359228e-05, "loss": 0.5123, "step": 206 }, { "epoch": 1.104736490993996, "grad_norm": 0.4688566921090766, "learning_rate": 3.824447748142701e-05, "loss": 0.4665, "step": 207 }, { "epoch": 1.1100733822548365, "grad_norm": 0.6539956626751293, "learning_rate": 3.821374180082256e-05, "loss": 0.4836, "step": 208 }, { "epoch": 1.115410273515677, "grad_norm": 0.4735771906265032, "learning_rate": 3.8182751960673024e-05, "loss": 0.4854, "step": 209 }, { "epoch": 1.1207471647765177, "grad_norm": 0.5752246665082068, "learning_rate": 3.815150839341915e-05, "loss": 0.4598, "step": 210 }, { "epoch": 1.1260840560373582, "grad_norm": 0.5222212002375315, "learning_rate": 3.812001153504221e-05, "loss": 0.4436, "step": 211 }, { "epoch": 1.1314209472981989, "grad_norm": 0.4870834555429964, "learning_rate": 3.8088261825058025e-05, "loss": 0.468, "step": 212 }, { "epoch": 1.1367578385590393, "grad_norm": 0.5849873353511134, "learning_rate": 3.8056259706510735e-05, "loss": 0.4751, "step": 213 }, { "epoch": 1.1420947298198798, "grad_norm": 0.6135788225861464, "learning_rate": 3.802400562596668e-05, "loss": 0.4372, "step": 214 }, { "epoch": 1.1474316210807205, "grad_norm": 0.4949403107444529, "learning_rate": 3.799150003350813e-05, "loss": 0.4886, "step": 215 }, { "epoch": 1.152768512341561, "grad_norm": 0.4908480402002135, "learning_rate": 3.795874338272705e-05, "loss": 0.4244, "step": 216 }, { "epoch": 1.1581054036024017, "grad_norm": 0.5302662495518609, "learning_rate": 3.79257361307187e-05, "loss": 0.5117, "step": 217 }, { "epoch": 1.1634422948632421, "grad_norm": 0.47953450158842736, "learning_rate": 3.7892478738075335e-05, "loss": 0.4814, "step": 218 }, { "epoch": 1.1687791861240826, "grad_norm": 0.4762686857338102, "learning_rate": 3.785897166887973e-05, "loss": 0.4668, "step": 219 }, { "epoch": 1.1741160773849233, "grad_norm": 0.4687082531990241, "learning_rate": 3.7825215390698696e-05, "loss": 0.4596, "step": 220 }, { "epoch": 1.1794529686457638, "grad_norm": 0.4882003652888224, "learning_rate": 3.779121037457661e-05, "loss": 0.4827, "step": 221 }, { "epoch": 1.1847898599066045, "grad_norm": 0.4673722770115682, "learning_rate": 3.7756957095028776e-05, "loss": 0.4739, "step": 222 }, { "epoch": 1.190126751167445, "grad_norm": 0.46193789537235297, "learning_rate": 3.772245603003485e-05, "loss": 0.4785, "step": 223 }, { "epoch": 1.1954636424282854, "grad_norm": 0.41695599713095927, "learning_rate": 3.768770766103214e-05, "loss": 0.4574, "step": 224 }, { "epoch": 1.200800533689126, "grad_norm": 0.48103374543287214, "learning_rate": 3.765271247290892e-05, "loss": 0.4968, "step": 225 }, { "epoch": 1.2061374249499666, "grad_norm": 0.4107744420495834, "learning_rate": 3.761747095399764e-05, "loss": 0.4691, "step": 226 }, { "epoch": 1.2114743162108073, "grad_norm": 0.5216753656288083, "learning_rate": 3.75819835960681e-05, "loss": 0.4655, "step": 227 }, { "epoch": 1.2168112074716477, "grad_norm": 0.47480701486406535, "learning_rate": 3.754625089432062e-05, "loss": 0.4659, "step": 228 }, { "epoch": 1.2221480987324884, "grad_norm": 0.5167936671852781, "learning_rate": 3.751027334737913e-05, "loss": 0.4789, "step": 229 }, { "epoch": 1.227484989993329, "grad_norm": 0.5894608057326902, "learning_rate": 3.747405145728416e-05, "loss": 0.4857, "step": 230 }, { "epoch": 1.2328218812541694, "grad_norm": 0.4441843183602042, "learning_rate": 3.743758572948591e-05, "loss": 0.4711, "step": 231 }, { "epoch": 1.23815877251501, "grad_norm": 0.44680579976734436, "learning_rate": 3.740087667283712e-05, "loss": 0.4913, "step": 232 }, { "epoch": 1.2434956637758505, "grad_norm": 0.4767148142121902, "learning_rate": 3.736392479958606e-05, "loss": 0.4583, "step": 233 }, { "epoch": 1.2488325550366912, "grad_norm": 0.4360066844228599, "learning_rate": 3.732673062536926e-05, "loss": 0.461, "step": 234 }, { "epoch": 1.2541694462975317, "grad_norm": 0.49211626438882655, "learning_rate": 3.728929466920445e-05, "loss": 0.4771, "step": 235 }, { "epoch": 1.2595063375583724, "grad_norm": 0.4462865895003735, "learning_rate": 3.72516174534832e-05, "loss": 0.4735, "step": 236 }, { "epoch": 1.2648432288192129, "grad_norm": 0.3962897023044732, "learning_rate": 3.721369950396373e-05, "loss": 0.44, "step": 237 }, { "epoch": 1.2701801200800533, "grad_norm": 0.47458867881560224, "learning_rate": 3.7175541349763474e-05, "loss": 0.4798, "step": 238 }, { "epoch": 1.2755170113408938, "grad_norm": 0.47364034729607424, "learning_rate": 3.7137143523351787e-05, "loss": 0.4918, "step": 239 }, { "epoch": 1.2808539026017345, "grad_norm": 0.5140571591821321, "learning_rate": 3.7098506560542464e-05, "loss": 0.4755, "step": 240 }, { "epoch": 1.2861907938625752, "grad_norm": 0.4575077757654535, "learning_rate": 3.705963100048627e-05, "loss": 0.4618, "step": 241 }, { "epoch": 1.2915276851234156, "grad_norm": 0.4752155163431786, "learning_rate": 3.702051738566343e-05, "loss": 0.4805, "step": 242 }, { "epoch": 1.2968645763842561, "grad_norm": 0.44739905544161696, "learning_rate": 3.698116626187603e-05, "loss": 0.4553, "step": 243 }, { "epoch": 1.3022014676450968, "grad_norm": 0.4313224074744495, "learning_rate": 3.694157817824046e-05, "loss": 0.4586, "step": 244 }, { "epoch": 1.3075383589059373, "grad_norm": 0.4540941645048732, "learning_rate": 3.6901753687179674e-05, "loss": 0.4484, "step": 245 }, { "epoch": 1.312875250166778, "grad_norm": 0.4544044695491594, "learning_rate": 3.686169334441554e-05, "loss": 0.4662, "step": 246 }, { "epoch": 1.3182121414276184, "grad_norm": 0.4324267581370308, "learning_rate": 3.6821397708961045e-05, "loss": 0.4973, "step": 247 }, { "epoch": 1.323549032688459, "grad_norm": 0.5525969912230031, "learning_rate": 3.678086734311256e-05, "loss": 0.4824, "step": 248 }, { "epoch": 1.3288859239492996, "grad_norm": 0.48134801141058897, "learning_rate": 3.67401028124419e-05, "loss": 0.4953, "step": 249 }, { "epoch": 1.33422281521014, "grad_norm": 0.5606903378778597, "learning_rate": 3.66991046857885e-05, "loss": 0.4763, "step": 250 }, { "epoch": 1.3395597064709808, "grad_norm": 0.4778001351815925, "learning_rate": 3.6657873535251456e-05, "loss": 0.4427, "step": 251 }, { "epoch": 1.3448965977318212, "grad_norm": 0.5198221637041993, "learning_rate": 3.661640993618155e-05, "loss": 0.4962, "step": 252 }, { "epoch": 1.3502334889926617, "grad_norm": 0.614188855305235, "learning_rate": 3.6574714467173194e-05, "loss": 0.4621, "step": 253 }, { "epoch": 1.3555703802535024, "grad_norm": 0.47715507181630035, "learning_rate": 3.6532787710056405e-05, "loss": 0.4506, "step": 254 }, { "epoch": 1.3609072715143429, "grad_norm": 0.4823822750756398, "learning_rate": 3.649063024988864e-05, "loss": 0.4813, "step": 255 }, { "epoch": 1.3662441627751836, "grad_norm": 0.5035590732756817, "learning_rate": 3.644824267494664e-05, "loss": 0.4732, "step": 256 }, { "epoch": 1.371581054036024, "grad_norm": 0.4775276009416485, "learning_rate": 3.6405625576718256e-05, "loss": 0.502, "step": 257 }, { "epoch": 1.3769179452968645, "grad_norm": 0.4616087736602264, "learning_rate": 3.6362779549894155e-05, "loss": 0.4687, "step": 258 }, { "epoch": 1.3822548365577052, "grad_norm": 0.5051090242818488, "learning_rate": 3.631970519235954e-05, "loss": 0.4527, "step": 259 }, { "epoch": 1.3875917278185457, "grad_norm": 0.46170933564301747, "learning_rate": 3.62764031051858e-05, "loss": 0.4531, "step": 260 }, { "epoch": 1.3929286190793864, "grad_norm": 0.4597839229598545, "learning_rate": 3.623287389262211e-05, "loss": 0.4428, "step": 261 }, { "epoch": 1.3982655103402268, "grad_norm": 0.47396042868358945, "learning_rate": 3.618911816208707e-05, "loss": 0.4748, "step": 262 }, { "epoch": 1.4036024016010673, "grad_norm": 0.46865196681556354, "learning_rate": 3.614513652416011e-05, "loss": 0.4555, "step": 263 }, { "epoch": 1.408939292861908, "grad_norm": 0.4941893702835204, "learning_rate": 3.610092959257306e-05, "loss": 0.4475, "step": 264 }, { "epoch": 1.4142761841227485, "grad_norm": 0.45053813237543944, "learning_rate": 3.6056497984201566e-05, "loss": 0.5037, "step": 265 }, { "epoch": 1.4196130753835892, "grad_norm": 0.4602967972562359, "learning_rate": 3.601184231905647e-05, "loss": 0.4625, "step": 266 }, { "epoch": 1.4249499666444296, "grad_norm": 0.40312077116267125, "learning_rate": 3.5966963220275155e-05, "loss": 0.4322, "step": 267 }, { "epoch": 1.43028685790527, "grad_norm": 0.43605945616048775, "learning_rate": 3.592186131411288e-05, "loss": 0.4758, "step": 268 }, { "epoch": 1.4356237491661108, "grad_norm": 0.4490964338856413, "learning_rate": 3.5876537229933994e-05, "loss": 0.4606, "step": 269 }, { "epoch": 1.4409606404269513, "grad_norm": 0.4488693733839842, "learning_rate": 3.583099160020319e-05, "loss": 0.5358, "step": 270 }, { "epoch": 1.446297531687792, "grad_norm": 0.4399709509429179, "learning_rate": 3.578522506047667e-05, "loss": 0.4585, "step": 271 }, { "epoch": 1.4516344229486324, "grad_norm": 0.48797858146525214, "learning_rate": 3.573923824939327e-05, "loss": 0.4934, "step": 272 }, { "epoch": 1.456971314209473, "grad_norm": 0.43113845680671775, "learning_rate": 3.5693031808665563e-05, "loss": 0.4624, "step": 273 }, { "epoch": 1.4623082054703136, "grad_norm": 0.4622009600777627, "learning_rate": 3.564660638307088e-05, "loss": 0.4418, "step": 274 }, { "epoch": 1.467645096731154, "grad_norm": 0.5575580830867934, "learning_rate": 3.5599962620442344e-05, "loss": 0.507, "step": 275 }, { "epoch": 1.4729819879919948, "grad_norm": 0.4878453026449952, "learning_rate": 3.555310117165979e-05, "loss": 0.4176, "step": 276 }, { "epoch": 1.4783188792528352, "grad_norm": 0.5728589696523062, "learning_rate": 3.550602269064073e-05, "loss": 0.5278, "step": 277 }, { "epoch": 1.4836557705136757, "grad_norm": 0.48279056904217577, "learning_rate": 3.545872783433118e-05, "loss": 0.4131, "step": 278 }, { "epoch": 1.4889926617745164, "grad_norm": 0.5888361010983522, "learning_rate": 3.541121726269654e-05, "loss": 0.4494, "step": 279 }, { "epoch": 1.4943295530353569, "grad_norm": 0.5243711854925368, "learning_rate": 3.5363491638712326e-05, "loss": 0.4546, "step": 280 }, { "epoch": 1.4996664442961976, "grad_norm": 0.565934472490769, "learning_rate": 3.531555162835501e-05, "loss": 0.4774, "step": 281 }, { "epoch": 1.505003335557038, "grad_norm": 0.46628567085954914, "learning_rate": 3.52673979005926e-05, "loss": 0.4572, "step": 282 }, { "epoch": 1.5103402268178785, "grad_norm": 0.5254588931292046, "learning_rate": 3.521903112737544e-05, "loss": 0.5014, "step": 283 }, { "epoch": 1.5156771180787192, "grad_norm": 0.4149936994490417, "learning_rate": 3.517045198362672e-05, "loss": 0.4611, "step": 284 }, { "epoch": 1.5210140093395597, "grad_norm": 0.4142527610245841, "learning_rate": 3.512166114723314e-05, "loss": 0.4378, "step": 285 }, { "epoch": 1.5263509006004004, "grad_norm": 0.4952367594365313, "learning_rate": 3.507265929903539e-05, "loss": 0.5056, "step": 286 }, { "epoch": 1.5316877918612408, "grad_norm": 0.49022347663893007, "learning_rate": 3.5023447122818696e-05, "loss": 0.4144, "step": 287 }, { "epoch": 1.5370246831220813, "grad_norm": 0.5284452745454441, "learning_rate": 3.497402530530326e-05, "loss": 0.4864, "step": 288 }, { "epoch": 1.542361574382922, "grad_norm": 0.6563737965515248, "learning_rate": 3.492439453613466e-05, "loss": 0.4772, "step": 289 }, { "epoch": 1.5476984656437625, "grad_norm": 0.4605849913602888, "learning_rate": 3.487455550787426e-05, "loss": 0.4519, "step": 290 }, { "epoch": 1.5530353569046031, "grad_norm": 0.5170611719530955, "learning_rate": 3.482450891598951e-05, "loss": 0.4967, "step": 291 }, { "epoch": 1.5583722481654436, "grad_norm": 0.46268930563862615, "learning_rate": 3.4774255458844273e-05, "loss": 0.4515, "step": 292 }, { "epoch": 1.563709139426284, "grad_norm": 0.4177933995256324, "learning_rate": 3.472379583768906e-05, "loss": 0.4557, "step": 293 }, { "epoch": 1.5690460306871248, "grad_norm": 0.4790527598758891, "learning_rate": 3.4673130756651266e-05, "loss": 0.4557, "step": 294 }, { "epoch": 1.5743829219479655, "grad_norm": 0.38274726550797006, "learning_rate": 3.4622260922725315e-05, "loss": 0.4655, "step": 295 }, { "epoch": 1.579719813208806, "grad_norm": 0.5046386994690183, "learning_rate": 3.457118704576281e-05, "loss": 0.5072, "step": 296 }, { "epoch": 1.5850567044696464, "grad_norm": 0.3438922118300985, "learning_rate": 3.451990983846262e-05, "loss": 0.4092, "step": 297 }, { "epoch": 1.5903935957304869, "grad_norm": 0.47843066059231854, "learning_rate": 3.4468430016360955e-05, "loss": 0.4719, "step": 298 }, { "epoch": 1.5957304869913276, "grad_norm": 0.41900316347012834, "learning_rate": 3.4416748297821375e-05, "loss": 0.4697, "step": 299 }, { "epoch": 1.6010673782521683, "grad_norm": 0.4504191244471495, "learning_rate": 3.4364865404024725e-05, "loss": 0.4716, "step": 300 }, { "epoch": 1.6064042695130087, "grad_norm": 0.40206430223864725, "learning_rate": 3.4312782058959136e-05, "loss": 0.4693, "step": 301 }, { "epoch": 1.6117411607738492, "grad_norm": 0.4131717908261127, "learning_rate": 3.426049898940988e-05, "loss": 0.4326, "step": 302 }, { "epoch": 1.6170780520346897, "grad_norm": 0.4192801860140271, "learning_rate": 3.420801692494923e-05, "loss": 0.4816, "step": 303 }, { "epoch": 1.6224149432955304, "grad_norm": 0.45720379651134796, "learning_rate": 3.415533659792631e-05, "loss": 0.4762, "step": 304 }, { "epoch": 1.627751834556371, "grad_norm": 0.4322890724213698, "learning_rate": 3.4102458743456836e-05, "loss": 0.4956, "step": 305 }, { "epoch": 1.6330887258172115, "grad_norm": 0.3701032350873174, "learning_rate": 3.404938409941288e-05, "loss": 0.4258, "step": 306 }, { "epoch": 1.638425617078052, "grad_norm": 0.4418583797952319, "learning_rate": 3.3996113406412575e-05, "loss": 0.4635, "step": 307 }, { "epoch": 1.6437625083388925, "grad_norm": 0.5119651096884537, "learning_rate": 3.394264740780977e-05, "loss": 0.4565, "step": 308 }, { "epoch": 1.6490993995997332, "grad_norm": 0.439255187163439, "learning_rate": 3.388898684968367e-05, "loss": 0.4244, "step": 309 }, { "epoch": 1.6544362908605739, "grad_norm": 0.6314072128471104, "learning_rate": 3.3835132480828395e-05, "loss": 0.4979, "step": 310 }, { "epoch": 1.6597731821214143, "grad_norm": 0.45356149569718207, "learning_rate": 3.3781085052742587e-05, "loss": 0.4659, "step": 311 }, { "epoch": 1.6651100733822548, "grad_norm": 0.6595936619453585, "learning_rate": 3.372684531961885e-05, "loss": 0.4715, "step": 312 }, { "epoch": 1.6704469646430953, "grad_norm": 0.40146590046819736, "learning_rate": 3.3672414038333294e-05, "loss": 0.442, "step": 313 }, { "epoch": 1.675783855903936, "grad_norm": 0.5439115291324319, "learning_rate": 3.361779196843495e-05, "loss": 0.4642, "step": 314 }, { "epoch": 1.6811207471647767, "grad_norm": 0.4925416574124429, "learning_rate": 3.356297987213514e-05, "loss": 0.4799, "step": 315 }, { "epoch": 1.6864576384256171, "grad_norm": 0.41997671654731134, "learning_rate": 3.350797851429688e-05, "loss": 0.4485, "step": 316 }, { "epoch": 1.6917945296864576, "grad_norm": 0.5655372175649206, "learning_rate": 3.345278866242419e-05, "loss": 0.4933, "step": 317 }, { "epoch": 1.697131420947298, "grad_norm": 0.45516429030940203, "learning_rate": 3.339741108665139e-05, "loss": 0.4693, "step": 318 }, { "epoch": 1.7024683122081388, "grad_norm": 0.5182373828396348, "learning_rate": 3.334184655973236e-05, "loss": 0.4318, "step": 319 }, { "epoch": 1.7078052034689795, "grad_norm": 0.5043642496605346, "learning_rate": 3.3286095857029724e-05, "loss": 0.5043, "step": 320 }, { "epoch": 1.71314209472982, "grad_norm": 0.5143484476959619, "learning_rate": 3.3230159756504065e-05, "loss": 0.4523, "step": 321 }, { "epoch": 1.7184789859906604, "grad_norm": 0.4004155764578505, "learning_rate": 3.317403903870308e-05, "loss": 0.4542, "step": 322 }, { "epoch": 1.7238158772515009, "grad_norm": 0.43510446375283907, "learning_rate": 3.311773448675063e-05, "loss": 0.4591, "step": 323 }, { "epoch": 1.7291527685123416, "grad_norm": 0.40735475539646576, "learning_rate": 3.3061246886335866e-05, "loss": 0.4767, "step": 324 }, { "epoch": 1.7344896597731823, "grad_norm": 0.5027103748999582, "learning_rate": 3.300457702570225e-05, "loss": 0.42, "step": 325 }, { "epoch": 1.7398265510340227, "grad_norm": 0.39562071018506284, "learning_rate": 3.294772569563656e-05, "loss": 0.5089, "step": 326 }, { "epoch": 1.7451634422948632, "grad_norm": 0.45829391562540117, "learning_rate": 3.2890693689457817e-05, "loss": 0.4785, "step": 327 }, { "epoch": 1.7505003335557037, "grad_norm": 0.3733919681858633, "learning_rate": 3.283348180300627e-05, "loss": 0.4503, "step": 328 }, { "epoch": 1.7558372248165444, "grad_norm": 0.4599706292196285, "learning_rate": 3.277609083463228e-05, "loss": 0.4637, "step": 329 }, { "epoch": 1.761174116077385, "grad_norm": 0.4095854985895356, "learning_rate": 3.271852158518514e-05, "loss": 0.4707, "step": 330 }, { "epoch": 1.7665110073382255, "grad_norm": 0.4230424744489382, "learning_rate": 3.266077485800192e-05, "loss": 0.4611, "step": 331 }, { "epoch": 1.771847898599066, "grad_norm": 0.40727774451181376, "learning_rate": 3.26028514588963e-05, "loss": 0.4899, "step": 332 }, { "epoch": 1.7771847898599065, "grad_norm": 0.4257028861000557, "learning_rate": 3.2544752196147266e-05, "loss": 0.4759, "step": 333 }, { "epoch": 1.7825216811207472, "grad_norm": 0.4044619820594187, "learning_rate": 3.248647788048784e-05, "loss": 0.4589, "step": 334 }, { "epoch": 1.7878585723815879, "grad_norm": 0.41896755044660233, "learning_rate": 3.2428029325093794e-05, "loss": 0.457, "step": 335 }, { "epoch": 1.7931954636424283, "grad_norm": 0.4226741156494024, "learning_rate": 3.23694073455723e-05, "loss": 0.468, "step": 336 }, { "epoch": 1.7985323549032688, "grad_norm": 0.3803247543345477, "learning_rate": 3.2310612759950535e-05, "loss": 0.4548, "step": 337 }, { "epoch": 1.8038692461641093, "grad_norm": 0.4456607835139093, "learning_rate": 3.225164638866424e-05, "loss": 0.4808, "step": 338 }, { "epoch": 1.80920613742495, "grad_norm": 0.43894216697827915, "learning_rate": 3.219250905454633e-05, "loss": 0.4538, "step": 339 }, { "epoch": 1.8145430286857906, "grad_norm": 0.3622459739187082, "learning_rate": 3.213320158281538e-05, "loss": 0.429, "step": 340 }, { "epoch": 1.8198799199466311, "grad_norm": 0.45236032311104785, "learning_rate": 3.207372480106409e-05, "loss": 0.4955, "step": 341 }, { "epoch": 1.8252168112074716, "grad_norm": 0.40834732197197826, "learning_rate": 3.201407953924779e-05, "loss": 0.4419, "step": 342 }, { "epoch": 1.830553702468312, "grad_norm": 0.4934429140575739, "learning_rate": 3.195426662967281e-05, "loss": 0.5097, "step": 343 }, { "epoch": 1.8358905937291528, "grad_norm": 0.40870574603025267, "learning_rate": 3.189428690698487e-05, "loss": 0.4398, "step": 344 }, { "epoch": 1.8412274849899934, "grad_norm": 0.459285280694887, "learning_rate": 3.183414120815747e-05, "loss": 0.4808, "step": 345 }, { "epoch": 1.846564376250834, "grad_norm": 0.3783731629311537, "learning_rate": 3.177383037248018e-05, "loss": 0.4393, "step": 346 }, { "epoch": 1.8519012675116744, "grad_norm": 0.43203892549748285, "learning_rate": 3.171335524154691e-05, "loss": 0.4496, "step": 347 }, { "epoch": 1.8572381587725149, "grad_norm": 0.5083672035817517, "learning_rate": 3.165271665924424e-05, "loss": 0.4537, "step": 348 }, { "epoch": 1.8625750500333556, "grad_norm": 0.470971515949764, "learning_rate": 3.159191547173955e-05, "loss": 0.4534, "step": 349 }, { "epoch": 1.8679119412941962, "grad_norm": 0.5043411580642327, "learning_rate": 3.153095252746928e-05, "loss": 0.455, "step": 350 }, { "epoch": 1.8732488325550367, "grad_norm": 0.5422278049069252, "learning_rate": 3.146982867712706e-05, "loss": 0.4976, "step": 351 }, { "epoch": 1.8785857238158772, "grad_norm": 0.4604934576130623, "learning_rate": 3.140854477365185e-05, "loss": 0.4338, "step": 352 }, { "epoch": 1.8839226150767177, "grad_norm": 0.5227311072550253, "learning_rate": 3.134710167221604e-05, "loss": 0.4867, "step": 353 }, { "epoch": 1.8892595063375583, "grad_norm": 0.5016387520943354, "learning_rate": 3.12855002302135e-05, "loss": 0.4463, "step": 354 }, { "epoch": 1.894596397598399, "grad_norm": 0.48681527381444745, "learning_rate": 3.122374130724765e-05, "loss": 0.4878, "step": 355 }, { "epoch": 1.8999332888592395, "grad_norm": 0.4753924890233577, "learning_rate": 3.116182576511941e-05, "loss": 0.4614, "step": 356 }, { "epoch": 1.90527018012008, "grad_norm": 0.40839824511015216, "learning_rate": 3.1099754467815244e-05, "loss": 0.4551, "step": 357 }, { "epoch": 1.9106070713809205, "grad_norm": 0.5233984189857435, "learning_rate": 3.103752828149502e-05, "loss": 0.4852, "step": 358 }, { "epoch": 1.9159439626417611, "grad_norm": 0.44864050338135375, "learning_rate": 3.0975148074480026e-05, "loss": 0.4786, "step": 359 }, { "epoch": 1.9212808539026018, "grad_norm": 0.43437456170806327, "learning_rate": 3.0912614717240745e-05, "loss": 0.464, "step": 360 }, { "epoch": 1.9266177451634423, "grad_norm": 0.399850990382915, "learning_rate": 3.08499290823848e-05, "loss": 0.4438, "step": 361 }, { "epoch": 1.9319546364242828, "grad_norm": 0.43768868935382793, "learning_rate": 3.07870920446447e-05, "loss": 0.4603, "step": 362 }, { "epoch": 1.9372915276851232, "grad_norm": 0.5552601097943353, "learning_rate": 3.072410448086572e-05, "loss": 0.4762, "step": 363 }, { "epoch": 1.942628418945964, "grad_norm": 0.40557866679728677, "learning_rate": 3.066096726999357e-05, "loss": 0.4607, "step": 364 }, { "epoch": 1.9479653102068046, "grad_norm": 0.5268701709396402, "learning_rate": 3.0597681293062187e-05, "loss": 0.4389, "step": 365 }, { "epoch": 1.953302201467645, "grad_norm": 0.4027710666772095, "learning_rate": 3.053424743318146e-05, "loss": 0.4517, "step": 366 }, { "epoch": 1.9586390927284856, "grad_norm": 0.44216756582628347, "learning_rate": 3.047066657552484e-05, "loss": 0.4616, "step": 367 }, { "epoch": 1.9639759839893263, "grad_norm": 0.4117419874422281, "learning_rate": 3.040693960731704e-05, "loss": 0.4517, "step": 368 }, { "epoch": 1.9693128752501667, "grad_norm": 0.4044754379711903, "learning_rate": 3.034306741782166e-05, "loss": 0.4719, "step": 369 }, { "epoch": 1.9746497665110074, "grad_norm": 0.3728722340841331, "learning_rate": 3.0279050898328716e-05, "loss": 0.4168, "step": 370 }, { "epoch": 1.979986657771848, "grad_norm": 0.4162488452438543, "learning_rate": 3.021489094214228e-05, "loss": 0.4688, "step": 371 }, { "epoch": 1.9853235490326884, "grad_norm": 0.43351004658705333, "learning_rate": 3.0150588444567962e-05, "loss": 0.4519, "step": 372 }, { "epoch": 1.990660440293529, "grad_norm": 0.3852293267610788, "learning_rate": 3.0086144302900425e-05, "loss": 0.4819, "step": 373 }, { "epoch": 1.9959973315543695, "grad_norm": 0.36569663935905455, "learning_rate": 3.002155941641091e-05, "loss": 0.4535, "step": 374 }, { "epoch": 2.0013342228152102, "grad_norm": 0.608016827751094, "learning_rate": 2.99568346863346e-05, "loss": 0.7085, "step": 375 }, { "epoch": 2.0066711140760507, "grad_norm": 0.6591571174677568, "learning_rate": 2.989197101585813e-05, "loss": 0.3487, "step": 376 }, { "epoch": 2.012008005336891, "grad_norm": 0.6684266661006703, "learning_rate": 2.9826969310106927e-05, "loss": 0.3407, "step": 377 }, { "epoch": 2.0173448965977316, "grad_norm": 0.6399522965663798, "learning_rate": 2.976183047613262e-05, "loss": 0.3667, "step": 378 }, { "epoch": 2.0226817878585726, "grad_norm": 0.5520199671519408, "learning_rate": 2.9696555422900352e-05, "loss": 0.3039, "step": 379 }, { "epoch": 2.028018679119413, "grad_norm": 0.5699997898314517, "learning_rate": 2.9631145061276093e-05, "loss": 0.3339, "step": 380 }, { "epoch": 2.0333555703802535, "grad_norm": 0.6020462454607703, "learning_rate": 2.956560030401397e-05, "loss": 0.3557, "step": 381 }, { "epoch": 2.038692461641094, "grad_norm": 0.5721714900378834, "learning_rate": 2.949992206574348e-05, "loss": 0.3475, "step": 382 }, { "epoch": 2.0440293529019344, "grad_norm": 0.49943187780464754, "learning_rate": 2.9434111262956767e-05, "loss": 0.3062, "step": 383 }, { "epoch": 2.0493662441627754, "grad_norm": 0.6557048215847663, "learning_rate": 2.9368168813995806e-05, "loss": 0.3715, "step": 384 }, { "epoch": 2.054703135423616, "grad_norm": 0.5041985226572085, "learning_rate": 2.9302095639039607e-05, "loss": 0.3342, "step": 385 }, { "epoch": 2.0600400266844563, "grad_norm": 0.5594849349537587, "learning_rate": 2.923589266009136e-05, "loss": 0.3116, "step": 386 }, { "epoch": 2.0653769179452968, "grad_norm": 0.4290186723298561, "learning_rate": 2.9169560800965583e-05, "loss": 0.3593, "step": 387 }, { "epoch": 2.0707138092061372, "grad_norm": 0.526856097744535, "learning_rate": 2.910310098727521e-05, "loss": 0.3534, "step": 388 }, { "epoch": 2.076050700466978, "grad_norm": 0.4372849060681286, "learning_rate": 2.9036514146418705e-05, "loss": 0.3224, "step": 389 }, { "epoch": 2.0813875917278186, "grad_norm": 0.4365535359116469, "learning_rate": 2.896980120756709e-05, "loss": 0.3381, "step": 390 }, { "epoch": 2.086724482988659, "grad_norm": 0.4371195260504785, "learning_rate": 2.8902963101651004e-05, "loss": 0.3337, "step": 391 }, { "epoch": 2.0920613742494996, "grad_norm": 0.4119069627388638, "learning_rate": 2.883600076134768e-05, "loss": 0.3396, "step": 392 }, { "epoch": 2.09739826551034, "grad_norm": 0.4389727835717101, "learning_rate": 2.8768915121067987e-05, "loss": 0.3544, "step": 393 }, { "epoch": 2.102735156771181, "grad_norm": 0.39839899451373395, "learning_rate": 2.870170711694333e-05, "loss": 0.3258, "step": 394 }, { "epoch": 2.1080720480320214, "grad_norm": 0.42893629704061353, "learning_rate": 2.8634377686812608e-05, "loss": 0.3257, "step": 395 }, { "epoch": 2.113408939292862, "grad_norm": 0.41063083754083984, "learning_rate": 2.8566927770209153e-05, "loss": 0.335, "step": 396 }, { "epoch": 2.1187458305537024, "grad_norm": 0.48368270464573715, "learning_rate": 2.8499358308347595e-05, "loss": 0.3707, "step": 397 }, { "epoch": 2.124082721814543, "grad_norm": 0.3961965622304182, "learning_rate": 2.843167024411071e-05, "loss": 0.3395, "step": 398 }, { "epoch": 2.1294196130753837, "grad_norm": 0.4082398648072262, "learning_rate": 2.8363864522036298e-05, "loss": 0.3116, "step": 399 }, { "epoch": 2.134756504336224, "grad_norm": 0.39241622775035234, "learning_rate": 2.8295942088304004e-05, "loss": 0.3331, "step": 400 }, { "epoch": 2.1400933955970647, "grad_norm": 0.43575155152898126, "learning_rate": 2.822790389072207e-05, "loss": 0.3516, "step": 401 }, { "epoch": 2.145430286857905, "grad_norm": 0.34607848217775505, "learning_rate": 2.815975087871416e-05, "loss": 0.2954, "step": 402 }, { "epoch": 2.1507671781187456, "grad_norm": 0.40910730417092234, "learning_rate": 2.8091484003306074e-05, "loss": 0.3335, "step": 403 }, { "epoch": 2.1561040693795865, "grad_norm": 0.3834996888062665, "learning_rate": 2.802310421711252e-05, "loss": 0.3293, "step": 404 }, { "epoch": 2.161440960640427, "grad_norm": 0.39804942929539155, "learning_rate": 2.7954612474323754e-05, "loss": 0.35, "step": 405 }, { "epoch": 2.1667778519012675, "grad_norm": 0.40500969458813774, "learning_rate": 2.788600973069234e-05, "loss": 0.3375, "step": 406 }, { "epoch": 2.172114743162108, "grad_norm": 0.38166833304508063, "learning_rate": 2.781729694351976e-05, "loss": 0.3614, "step": 407 }, { "epoch": 2.1774516344229484, "grad_norm": 0.39321975173629375, "learning_rate": 2.7748475071643085e-05, "loss": 0.3361, "step": 408 }, { "epoch": 2.1827885256837893, "grad_norm": 0.4264354527637513, "learning_rate": 2.7679545075421573e-05, "loss": 0.3403, "step": 409 }, { "epoch": 2.18812541694463, "grad_norm": 0.4009398961332454, "learning_rate": 2.7610507916723283e-05, "loss": 0.3494, "step": 410 }, { "epoch": 2.1934623082054703, "grad_norm": 0.44735512944431943, "learning_rate": 2.754136455891165e-05, "loss": 0.3381, "step": 411 }, { "epoch": 2.1987991994663107, "grad_norm": 0.40873569246310604, "learning_rate": 2.7472115966832044e-05, "loss": 0.32, "step": 412 }, { "epoch": 2.204136090727151, "grad_norm": 0.451038278060695, "learning_rate": 2.7402763106798295e-05, "loss": 0.3537, "step": 413 }, { "epoch": 2.209472981987992, "grad_norm": 0.43597576256513326, "learning_rate": 2.733330694657921e-05, "loss": 0.3341, "step": 414 }, { "epoch": 2.2148098732488326, "grad_norm": 0.4216053478782035, "learning_rate": 2.7263748455385098e-05, "loss": 0.3496, "step": 415 }, { "epoch": 2.220146764509673, "grad_norm": 0.4741703909383378, "learning_rate": 2.719408860385421e-05, "loss": 0.3387, "step": 416 }, { "epoch": 2.2254836557705135, "grad_norm": 0.396307019176774, "learning_rate": 2.7124328364039203e-05, "loss": 0.3447, "step": 417 }, { "epoch": 2.230820547031354, "grad_norm": 0.44800881081287386, "learning_rate": 2.7054468709393575e-05, "loss": 0.3513, "step": 418 }, { "epoch": 2.236157438292195, "grad_norm": 0.3929341967270722, "learning_rate": 2.6984510614758112e-05, "loss": 0.3298, "step": 419 }, { "epoch": 2.2414943295530354, "grad_norm": 0.4067007662384614, "learning_rate": 2.6914455056347225e-05, "loss": 0.3325, "step": 420 }, { "epoch": 2.246831220813876, "grad_norm": 0.43892260225513113, "learning_rate": 2.6844303011735385e-05, "loss": 0.3449, "step": 421 }, { "epoch": 2.2521681120747163, "grad_norm": 0.41729138409748806, "learning_rate": 2.677405545984344e-05, "loss": 0.3414, "step": 422 }, { "epoch": 2.257505003335557, "grad_norm": 0.38213817747852563, "learning_rate": 2.6703713380924993e-05, "loss": 0.343, "step": 423 }, { "epoch": 2.2628418945963977, "grad_norm": 0.38560789077218444, "learning_rate": 2.6633277756552683e-05, "loss": 0.3193, "step": 424 }, { "epoch": 2.268178785857238, "grad_norm": 0.40286245157177797, "learning_rate": 2.6562749569604527e-05, "loss": 0.3133, "step": 425 }, { "epoch": 2.2735156771180787, "grad_norm": 0.41596719722020303, "learning_rate": 2.6492129804250173e-05, "loss": 0.3523, "step": 426 }, { "epoch": 2.278852568378919, "grad_norm": 0.4153442559638167, "learning_rate": 2.642141944593718e-05, "loss": 0.3541, "step": 427 }, { "epoch": 2.2841894596397596, "grad_norm": 0.40946075262767867, "learning_rate": 2.635061948137727e-05, "loss": 0.3321, "step": 428 }, { "epoch": 2.2895263509006005, "grad_norm": 0.3585516295174825, "learning_rate": 2.6279730898532548e-05, "loss": 0.3568, "step": 429 }, { "epoch": 2.294863242161441, "grad_norm": 0.4126272108188889, "learning_rate": 2.6208754686601735e-05, "loss": 0.3522, "step": 430 }, { "epoch": 2.3002001334222815, "grad_norm": 0.339353916561967, "learning_rate": 2.613769183600634e-05, "loss": 0.2944, "step": 431 }, { "epoch": 2.305537024683122, "grad_norm": 0.4282861919791616, "learning_rate": 2.6066543338376865e-05, "loss": 0.3318, "step": 432 }, { "epoch": 2.3108739159439624, "grad_norm": 0.35897497473677303, "learning_rate": 2.599531018653893e-05, "loss": 0.3378, "step": 433 }, { "epoch": 2.3162108072048033, "grad_norm": 0.47411097531132296, "learning_rate": 2.5923993374499475e-05, "loss": 0.3662, "step": 434 }, { "epoch": 2.321547698465644, "grad_norm": 0.3568620059406491, "learning_rate": 2.585259389743284e-05, "loss": 0.3143, "step": 435 }, { "epoch": 2.3268845897264843, "grad_norm": 0.4473037089763921, "learning_rate": 2.5781112751666886e-05, "loss": 0.3753, "step": 436 }, { "epoch": 2.3322214809873247, "grad_norm": 0.3799705626691493, "learning_rate": 2.5709550934669123e-05, "loss": 0.3361, "step": 437 }, { "epoch": 2.337558372248165, "grad_norm": 0.39700940455368583, "learning_rate": 2.5637909445032752e-05, "loss": 0.3174, "step": 438 }, { "epoch": 2.342895263509006, "grad_norm": 0.4519495883374496, "learning_rate": 2.5566189282462766e-05, "loss": 0.356, "step": 439 }, { "epoch": 2.3482321547698466, "grad_norm": 0.3870700641532921, "learning_rate": 2.549439144776195e-05, "loss": 0.3332, "step": 440 }, { "epoch": 2.353569046030687, "grad_norm": 0.4068180001517662, "learning_rate": 2.542251694281699e-05, "loss": 0.3342, "step": 441 }, { "epoch": 2.3589059372915275, "grad_norm": 0.3931784325165116, "learning_rate": 2.5350566770584423e-05, "loss": 0.3316, "step": 442 }, { "epoch": 2.364242828552368, "grad_norm": 0.44083065881755534, "learning_rate": 2.5278541935076656e-05, "loss": 0.3493, "step": 443 }, { "epoch": 2.369579719813209, "grad_norm": 0.41781820434477857, "learning_rate": 2.5206443441347995e-05, "loss": 0.3334, "step": 444 }, { "epoch": 2.3749166110740494, "grad_norm": 0.4211075137727358, "learning_rate": 2.5134272295480587e-05, "loss": 0.353, "step": 445 }, { "epoch": 2.38025350233489, "grad_norm": 0.4276719699292506, "learning_rate": 2.506202950457038e-05, "loss": 0.3194, "step": 446 }, { "epoch": 2.3855903935957303, "grad_norm": 0.3891436905777496, "learning_rate": 2.4989716076713063e-05, "loss": 0.3298, "step": 447 }, { "epoch": 2.390927284856571, "grad_norm": 0.4136388035611674, "learning_rate": 2.4917333020990045e-05, "loss": 0.3316, "step": 448 }, { "epoch": 2.3962641761174117, "grad_norm": 0.46318938516966707, "learning_rate": 2.4844881347454326e-05, "loss": 0.3561, "step": 449 }, { "epoch": 2.401601067378252, "grad_norm": 0.4361718813430768, "learning_rate": 2.477236206711641e-05, "loss": 0.3353, "step": 450 }, { "epoch": 2.4069379586390927, "grad_norm": 0.4231633561528134, "learning_rate": 2.46997761919302e-05, "loss": 0.3528, "step": 451 }, { "epoch": 2.412274849899933, "grad_norm": 0.44841625308699595, "learning_rate": 2.4627124734778905e-05, "loss": 0.3407, "step": 452 }, { "epoch": 2.417611741160774, "grad_norm": 0.368806895133852, "learning_rate": 2.4554408709460873e-05, "loss": 0.3171, "step": 453 }, { "epoch": 2.4229486324216145, "grad_norm": 0.46212577157467083, "learning_rate": 2.4481629130675444e-05, "loss": 0.3593, "step": 454 }, { "epoch": 2.428285523682455, "grad_norm": 0.4396967709166417, "learning_rate": 2.4408787014008807e-05, "loss": 0.3441, "step": 455 }, { "epoch": 2.4336224149432955, "grad_norm": 0.47046726193724747, "learning_rate": 2.4335883375919828e-05, "loss": 0.354, "step": 456 }, { "epoch": 2.438959306204136, "grad_norm": 0.4453837386684812, "learning_rate": 2.4262919233725853e-05, "loss": 0.3545, "step": 457 }, { "epoch": 2.444296197464977, "grad_norm": 0.4077861792439572, "learning_rate": 2.418989560558852e-05, "loss": 0.3307, "step": 458 }, { "epoch": 2.4496330887258173, "grad_norm": 0.4120092551712366, "learning_rate": 2.411681351049954e-05, "loss": 0.3594, "step": 459 }, { "epoch": 2.454969979986658, "grad_norm": 0.3903774406478128, "learning_rate": 2.404367396826651e-05, "loss": 0.3511, "step": 460 }, { "epoch": 2.4603068712474983, "grad_norm": 0.3537187224774452, "learning_rate": 2.3970477999498648e-05, "loss": 0.3413, "step": 461 }, { "epoch": 2.4656437625083387, "grad_norm": 0.37223078801129084, "learning_rate": 2.3897226625592555e-05, "loss": 0.3289, "step": 462 }, { "epoch": 2.4709806537691796, "grad_norm": 0.3724310146397289, "learning_rate": 2.3823920868717982e-05, "loss": 0.333, "step": 463 }, { "epoch": 2.47631754503002, "grad_norm": 0.37821163484897535, "learning_rate": 2.3750561751803563e-05, "loss": 0.3397, "step": 464 }, { "epoch": 2.4816544362908606, "grad_norm": 0.3963343238925445, "learning_rate": 2.3677150298522513e-05, "loss": 0.3728, "step": 465 }, { "epoch": 2.486991327551701, "grad_norm": 0.36709987148577683, "learning_rate": 2.3603687533278364e-05, "loss": 0.3222, "step": 466 }, { "epoch": 2.4923282188125415, "grad_norm": 0.37561130929516573, "learning_rate": 2.3530174481190692e-05, "loss": 0.3266, "step": 467 }, { "epoch": 2.4976651100733824, "grad_norm": 0.3478504823013864, "learning_rate": 2.3456612168080764e-05, "loss": 0.3445, "step": 468 }, { "epoch": 2.503002001334223, "grad_norm": 0.43376543680900886, "learning_rate": 2.338300162045726e-05, "loss": 0.3578, "step": 469 }, { "epoch": 2.5083388925950634, "grad_norm": 0.3467248206638372, "learning_rate": 2.330934386550194e-05, "loss": 0.3527, "step": 470 }, { "epoch": 2.513675783855904, "grad_norm": 0.3703030631269327, "learning_rate": 2.32356399310553e-05, "loss": 0.3372, "step": 471 }, { "epoch": 2.5190126751167448, "grad_norm": 0.36617067977815226, "learning_rate": 2.316189084560224e-05, "loss": 0.3212, "step": 472 }, { "epoch": 2.524349566377585, "grad_norm": 0.4109293782714893, "learning_rate": 2.3088097638257722e-05, "loss": 0.3669, "step": 473 }, { "epoch": 2.5296864576384257, "grad_norm": 0.40880336771156084, "learning_rate": 2.3014261338752376e-05, "loss": 0.3373, "step": 474 }, { "epoch": 2.535023348899266, "grad_norm": 0.41531532752412315, "learning_rate": 2.294038297741817e-05, "loss": 0.3412, "step": 475 }, { "epoch": 2.5403602401601066, "grad_norm": 0.4088747291326209, "learning_rate": 2.2866463585174007e-05, "loss": 0.3429, "step": 476 }, { "epoch": 2.5456971314209476, "grad_norm": 0.3575193305915961, "learning_rate": 2.2792504193511338e-05, "loss": 0.3681, "step": 477 }, { "epoch": 2.5510340226817876, "grad_norm": 0.4035090318465378, "learning_rate": 2.2718505834479787e-05, "loss": 0.335, "step": 478 }, { "epoch": 2.5563709139426285, "grad_norm": 0.34216276945535873, "learning_rate": 2.2644469540672736e-05, "loss": 0.3387, "step": 479 }, { "epoch": 2.561707805203469, "grad_norm": 0.37240018289403937, "learning_rate": 2.2570396345212932e-05, "loss": 0.3565, "step": 480 }, { "epoch": 2.5670446964643094, "grad_norm": 0.33808417077793806, "learning_rate": 2.2496287281738033e-05, "loss": 0.3349, "step": 481 }, { "epoch": 2.5723815877251504, "grad_norm": 0.3746743257239068, "learning_rate": 2.2422143384386222e-05, "loss": 0.3495, "step": 482 }, { "epoch": 2.577718478985991, "grad_norm": 0.3494328211369956, "learning_rate": 2.234796568778178e-05, "loss": 0.3434, "step": 483 }, { "epoch": 2.5830553702468313, "grad_norm": 0.38271781562914736, "learning_rate": 2.22737552270206e-05, "loss": 0.3177, "step": 484 }, { "epoch": 2.5883922615076718, "grad_norm": 0.37885165545443145, "learning_rate": 2.219951303765579e-05, "loss": 0.345, "step": 485 }, { "epoch": 2.5937291527685122, "grad_norm": 0.417558398659155, "learning_rate": 2.212524015568322e-05, "loss": 0.3384, "step": 486 }, { "epoch": 2.599066044029353, "grad_norm": 0.33797987587981193, "learning_rate": 2.205093761752704e-05, "loss": 0.3424, "step": 487 }, { "epoch": 2.6044029352901936, "grad_norm": 0.3436020891272047, "learning_rate": 2.197660646002523e-05, "loss": 0.3098, "step": 488 }, { "epoch": 2.609739826551034, "grad_norm": 0.36972858197550945, "learning_rate": 2.190224772041512e-05, "loss": 0.3573, "step": 489 }, { "epoch": 2.6150767178118746, "grad_norm": 0.3473912972172743, "learning_rate": 2.1827862436318964e-05, "loss": 0.3435, "step": 490 }, { "epoch": 2.620413609072715, "grad_norm": 0.355276880649726, "learning_rate": 2.175345164572939e-05, "loss": 0.3507, "step": 491 }, { "epoch": 2.625750500333556, "grad_norm": 0.36822265084446165, "learning_rate": 2.1679016386994972e-05, "loss": 0.3484, "step": 492 }, { "epoch": 2.6310873915943964, "grad_norm": 0.34120947707028404, "learning_rate": 2.1604557698805707e-05, "loss": 0.3581, "step": 493 }, { "epoch": 2.636424282855237, "grad_norm": 0.3625974151488874, "learning_rate": 2.153007662017854e-05, "loss": 0.3486, "step": 494 }, { "epoch": 2.6417611741160774, "grad_norm": 0.3810454285560133, "learning_rate": 2.145557419044286e-05, "loss": 0.3262, "step": 495 }, { "epoch": 2.647098065376918, "grad_norm": 0.3567767494033327, "learning_rate": 2.1381051449225977e-05, "loss": 0.3523, "step": 496 }, { "epoch": 2.6524349566377587, "grad_norm": 0.33557909084194115, "learning_rate": 2.130650943643866e-05, "loss": 0.3318, "step": 497 }, { "epoch": 2.657771847898599, "grad_norm": 0.3835013976581898, "learning_rate": 2.123194919226058e-05, "loss": 0.3419, "step": 498 }, { "epoch": 2.6631087391594397, "grad_norm": 0.3425374794771174, "learning_rate": 2.1157371757125827e-05, "loss": 0.3314, "step": 499 }, { "epoch": 2.66844563042028, "grad_norm": 0.3613222705329777, "learning_rate": 2.1082778171708355e-05, "loss": 0.3367, "step": 500 }, { "epoch": 2.6737825216811206, "grad_norm": 0.39030975504768284, "learning_rate": 2.100816947690751e-05, "loss": 0.3633, "step": 501 }, { "epoch": 2.6791194129419615, "grad_norm": 0.34775441424791625, "learning_rate": 2.0933546713833474e-05, "loss": 0.3261, "step": 502 }, { "epoch": 2.684456304202802, "grad_norm": 0.35619640082486737, "learning_rate": 2.0858910923792725e-05, "loss": 0.3468, "step": 503 }, { "epoch": 2.6897931954636425, "grad_norm": 0.36749874498504337, "learning_rate": 2.0784263148273537e-05, "loss": 0.3222, "step": 504 }, { "epoch": 2.695130086724483, "grad_norm": 0.32371066166225, "learning_rate": 2.070960442893143e-05, "loss": 0.3295, "step": 505 }, { "epoch": 2.7004669779853234, "grad_norm": 0.35511531727159107, "learning_rate": 2.0634935807574633e-05, "loss": 0.3412, "step": 506 }, { "epoch": 2.7058038692461643, "grad_norm": 0.36830566559856426, "learning_rate": 2.0560258326149557e-05, "loss": 0.3419, "step": 507 }, { "epoch": 2.711140760507005, "grad_norm": 0.34429904324010957, "learning_rate": 2.0485573026726243e-05, "loss": 0.3339, "step": 508 }, { "epoch": 2.7164776517678453, "grad_norm": 0.3237344051103606, "learning_rate": 2.041088095148383e-05, "loss": 0.3129, "step": 509 }, { "epoch": 2.7218145430286858, "grad_norm": 0.34535776524886275, "learning_rate": 2.0336183142696006e-05, "loss": 0.3493, "step": 510 }, { "epoch": 2.727151434289526, "grad_norm": 0.37790756994617064, "learning_rate": 2.0261480642716462e-05, "loss": 0.3668, "step": 511 }, { "epoch": 2.732488325550367, "grad_norm": 0.31766118054327325, "learning_rate": 2.018677449396437e-05, "loss": 0.3307, "step": 512 }, { "epoch": 2.7378252168112076, "grad_norm": 0.36231855507183786, "learning_rate": 2.01120657389098e-05, "loss": 0.3419, "step": 513 }, { "epoch": 2.743162108072048, "grad_norm": 0.3366225474904379, "learning_rate": 2.0037355420059193e-05, "loss": 0.3281, "step": 514 }, { "epoch": 2.7484989993328885, "grad_norm": 0.3288307627791964, "learning_rate": 1.9962644579940814e-05, "loss": 0.3446, "step": 515 }, { "epoch": 2.753835890593729, "grad_norm": 0.4002909383646017, "learning_rate": 1.988793426109021e-05, "loss": 0.3407, "step": 516 }, { "epoch": 2.75917278185457, "grad_norm": 0.361890263436206, "learning_rate": 1.9813225506035637e-05, "loss": 0.3573, "step": 517 }, { "epoch": 2.7645096731154104, "grad_norm": 0.36972409412445767, "learning_rate": 1.973851935728354e-05, "loss": 0.3386, "step": 518 }, { "epoch": 2.769846564376251, "grad_norm": 0.3890677812342347, "learning_rate": 1.9663816857304005e-05, "loss": 0.3526, "step": 519 }, { "epoch": 2.7751834556370913, "grad_norm": 0.3515870789484941, "learning_rate": 1.9589119048516177e-05, "loss": 0.327, "step": 520 }, { "epoch": 2.780520346897932, "grad_norm": 0.33110583319058595, "learning_rate": 1.951442697327376e-05, "loss": 0.3114, "step": 521 }, { "epoch": 2.7858572381587727, "grad_norm": 0.3644018882302513, "learning_rate": 1.943974167385045e-05, "loss": 0.3222, "step": 522 }, { "epoch": 2.791194129419613, "grad_norm": 0.3435318930149615, "learning_rate": 1.936506419242537e-05, "loss": 0.3289, "step": 523 }, { "epoch": 2.7965310206804537, "grad_norm": 0.3927187524254616, "learning_rate": 1.9290395571068573e-05, "loss": 0.3519, "step": 524 }, { "epoch": 2.801867911941294, "grad_norm": 0.3388044424262462, "learning_rate": 1.921573685172647e-05, "loss": 0.3167, "step": 525 }, { "epoch": 2.8072048032021346, "grad_norm": 0.3587054307128672, "learning_rate": 1.914108907620728e-05, "loss": 0.3389, "step": 526 }, { "epoch": 2.8125416944629755, "grad_norm": 0.3485342273967904, "learning_rate": 1.9066453286166536e-05, "loss": 0.3412, "step": 527 }, { "epoch": 2.817878585723816, "grad_norm": 0.3341586078617526, "learning_rate": 1.8991830523092497e-05, "loss": 0.3216, "step": 528 }, { "epoch": 2.8232154769846565, "grad_norm": 0.37102695451617046, "learning_rate": 1.8917221828291652e-05, "loss": 0.356, "step": 529 }, { "epoch": 2.828552368245497, "grad_norm": 0.3137005027651722, "learning_rate": 1.8842628242874187e-05, "loss": 0.3256, "step": 530 }, { "epoch": 2.8338892595063374, "grad_norm": 0.39850109158745595, "learning_rate": 1.8768050807739425e-05, "loss": 0.3387, "step": 531 }, { "epoch": 2.8392261507671783, "grad_norm": 0.36284113759820136, "learning_rate": 1.8693490563561343e-05, "loss": 0.3234, "step": 532 }, { "epoch": 2.844563042028019, "grad_norm": 0.3420275126150477, "learning_rate": 1.8618948550774033e-05, "loss": 0.3297, "step": 533 }, { "epoch": 2.8498999332888593, "grad_norm": 0.39257170087476245, "learning_rate": 1.854442580955715e-05, "loss": 0.3321, "step": 534 }, { "epoch": 2.8552368245496997, "grad_norm": 0.3523635968791173, "learning_rate": 1.846992337982147e-05, "loss": 0.3585, "step": 535 }, { "epoch": 2.86057371581054, "grad_norm": 0.337680075516661, "learning_rate": 1.83954423011943e-05, "loss": 0.3269, "step": 536 }, { "epoch": 2.865910607071381, "grad_norm": 0.3968837701031737, "learning_rate": 1.832098361300503e-05, "loss": 0.3644, "step": 537 }, { "epoch": 2.8712474983322216, "grad_norm": 0.35722398722368337, "learning_rate": 1.8246548354270616e-05, "loss": 0.335, "step": 538 }, { "epoch": 2.876584389593062, "grad_norm": 0.3572447721264452, "learning_rate": 1.8172137563681042e-05, "loss": 0.3513, "step": 539 }, { "epoch": 2.8819212808539025, "grad_norm": 0.3689075039010707, "learning_rate": 1.809775227958488e-05, "loss": 0.3479, "step": 540 }, { "epoch": 2.887258172114743, "grad_norm": 0.4157671175592236, "learning_rate": 1.802339353997478e-05, "loss": 0.3629, "step": 541 }, { "epoch": 2.892595063375584, "grad_norm": 0.3447307730764763, "learning_rate": 1.7949062382472967e-05, "loss": 0.3172, "step": 542 }, { "epoch": 2.8979319546364244, "grad_norm": 0.3990421667679026, "learning_rate": 1.787475984431678e-05, "loss": 0.3568, "step": 543 }, { "epoch": 2.903268845897265, "grad_norm": 0.3284492948137503, "learning_rate": 1.7800486962344213e-05, "loss": 0.3232, "step": 544 }, { "epoch": 2.9086057371581053, "grad_norm": 0.3564409700384773, "learning_rate": 1.7726244772979408e-05, "loss": 0.3262, "step": 545 }, { "epoch": 2.913942628418946, "grad_norm": 0.38056209814773034, "learning_rate": 1.7652034312218234e-05, "loss": 0.3504, "step": 546 }, { "epoch": 2.9192795196797867, "grad_norm": 0.33330941463187824, "learning_rate": 1.757785661561378e-05, "loss": 0.3503, "step": 547 }, { "epoch": 2.924616410940627, "grad_norm": 0.32533655359519337, "learning_rate": 1.7503712718261977e-05, "loss": 0.3306, "step": 548 }, { "epoch": 2.9299533022014677, "grad_norm": 0.34762836109457435, "learning_rate": 1.7429603654787078e-05, "loss": 0.3464, "step": 549 }, { "epoch": 2.935290193462308, "grad_norm": 0.355505370708806, "learning_rate": 1.7355530459327267e-05, "loss": 0.3355, "step": 550 }, { "epoch": 2.9406270847231486, "grad_norm": 0.34476430444950434, "learning_rate": 1.7281494165520217e-05, "loss": 0.3215, "step": 551 }, { "epoch": 2.9459639759839895, "grad_norm": 0.34463395158247007, "learning_rate": 1.7207495806488672e-05, "loss": 0.3286, "step": 552 }, { "epoch": 2.95130086724483, "grad_norm": 0.39694696141098895, "learning_rate": 1.7133536414826e-05, "loss": 0.348, "step": 553 }, { "epoch": 2.9566377585056705, "grad_norm": 0.3414356307623428, "learning_rate": 1.705961702258183e-05, "loss": 0.3266, "step": 554 }, { "epoch": 2.961974649766511, "grad_norm": 0.34062712306187115, "learning_rate": 1.6985738661247627e-05, "loss": 0.3245, "step": 555 }, { "epoch": 2.9673115410273514, "grad_norm": 0.35420373579360254, "learning_rate": 1.691190236174228e-05, "loss": 0.3094, "step": 556 }, { "epoch": 2.9726484322881923, "grad_norm": 0.35393965464453037, "learning_rate": 1.6838109154397764e-05, "loss": 0.3636, "step": 557 }, { "epoch": 2.977985323549033, "grad_norm": 0.3398430528475795, "learning_rate": 1.6764360068944706e-05, "loss": 0.3489, "step": 558 }, { "epoch": 2.9833222148098733, "grad_norm": 0.36959208816642103, "learning_rate": 1.6690656134498063e-05, "loss": 0.3545, "step": 559 }, { "epoch": 2.9886591060707137, "grad_norm": 0.3244411298894109, "learning_rate": 1.661699837954275e-05, "loss": 0.3024, "step": 560 }, { "epoch": 2.993995997331554, "grad_norm": 0.31714532814318064, "learning_rate": 1.6543387831919243e-05, "loss": 0.3196, "step": 561 }, { "epoch": 2.999332888592395, "grad_norm": 0.49216959263692955, "learning_rate": 1.646982551880931e-05, "loss": 0.4825, "step": 562 }, { "epoch": 3.0046697798532356, "grad_norm": 0.6375873638749311, "learning_rate": 1.639631246672164e-05, "loss": 0.3264, "step": 563 }, { "epoch": 3.010006671114076, "grad_norm": 0.4465753271154956, "learning_rate": 1.632284970147749e-05, "loss": 0.2328, "step": 564 }, { "epoch": 3.0153435623749165, "grad_norm": 0.4868378113812574, "learning_rate": 1.6249438248196437e-05, "loss": 0.2209, "step": 565 }, { "epoch": 3.020680453635757, "grad_norm": 0.7954769693611728, "learning_rate": 1.617607913128202e-05, "loss": 0.2305, "step": 566 }, { "epoch": 3.026017344896598, "grad_norm": 0.4733054296578894, "learning_rate": 1.610277337440745e-05, "loss": 0.231, "step": 567 }, { "epoch": 3.0313542361574384, "grad_norm": 0.44905546773789157, "learning_rate": 1.6029522000501362e-05, "loss": 0.2129, "step": 568 }, { "epoch": 3.036691127418279, "grad_norm": 0.46572310673573625, "learning_rate": 1.5956326031733496e-05, "loss": 0.2092, "step": 569 }, { "epoch": 3.0420280186791193, "grad_norm": 0.46263530375828704, "learning_rate": 1.5883186489500465e-05, "loss": 0.2214, "step": 570 }, { "epoch": 3.04736490993996, "grad_norm": 0.4153475909168177, "learning_rate": 1.5810104394411494e-05, "loss": 0.2374, "step": 571 }, { "epoch": 3.0527018012008007, "grad_norm": 0.3926765936070903, "learning_rate": 1.5737080766274154e-05, "loss": 0.2181, "step": 572 }, { "epoch": 3.058038692461641, "grad_norm": 0.4423780999916496, "learning_rate": 1.5664116624080176e-05, "loss": 0.2263, "step": 573 }, { "epoch": 3.0633755837224816, "grad_norm": 0.48104075830395865, "learning_rate": 1.55912129859912e-05, "loss": 0.2299, "step": 574 }, { "epoch": 3.068712474983322, "grad_norm": 0.34802553512115314, "learning_rate": 1.5518370869324562e-05, "loss": 0.2038, "step": 575 }, { "epoch": 3.0740493662441626, "grad_norm": 0.4024921353804397, "learning_rate": 1.5445591290539133e-05, "loss": 0.2306, "step": 576 }, { "epoch": 3.0793862575050035, "grad_norm": 0.39223703505556246, "learning_rate": 1.5372875265221098e-05, "loss": 0.2146, "step": 577 }, { "epoch": 3.084723148765844, "grad_norm": 0.37616914852450856, "learning_rate": 1.53002238080698e-05, "loss": 0.2215, "step": 578 }, { "epoch": 3.0900600400266844, "grad_norm": 0.38195507124837264, "learning_rate": 1.5227637932883603e-05, "loss": 0.2008, "step": 579 }, { "epoch": 3.095396931287525, "grad_norm": 0.37714457548841984, "learning_rate": 1.515511865254568e-05, "loss": 0.2257, "step": 580 }, { "epoch": 3.1007338225483654, "grad_norm": 0.36734917441929993, "learning_rate": 1.5082666979009953e-05, "loss": 0.2081, "step": 581 }, { "epoch": 3.1060707138092063, "grad_norm": 0.38769476025178157, "learning_rate": 1.5010283923286944e-05, "loss": 0.2271, "step": 582 }, { "epoch": 3.1114076050700468, "grad_norm": 0.3586607196800494, "learning_rate": 1.493797049542963e-05, "loss": 0.2021, "step": 583 }, { "epoch": 3.1167444963308872, "grad_norm": 0.37496022310615634, "learning_rate": 1.4865727704519416e-05, "loss": 0.2443, "step": 584 }, { "epoch": 3.1220813875917277, "grad_norm": 0.34669037639542094, "learning_rate": 1.4793556558652012e-05, "loss": 0.2221, "step": 585 }, { "epoch": 3.127418278852568, "grad_norm": 0.33087385453332585, "learning_rate": 1.472145806492335e-05, "loss": 0.2068, "step": 586 }, { "epoch": 3.132755170113409, "grad_norm": 0.3375224847458593, "learning_rate": 1.4649433229415588e-05, "loss": 0.2167, "step": 587 }, { "epoch": 3.1380920613742496, "grad_norm": 0.34607209711469494, "learning_rate": 1.457748305718301e-05, "loss": 0.2175, "step": 588 }, { "epoch": 3.14342895263509, "grad_norm": 0.330025155691137, "learning_rate": 1.4505608552238047e-05, "loss": 0.2341, "step": 589 }, { "epoch": 3.1487658438959305, "grad_norm": 0.34147253971194746, "learning_rate": 1.4433810717537244e-05, "loss": 0.2146, "step": 590 }, { "epoch": 3.154102735156771, "grad_norm": 0.3453098452736006, "learning_rate": 1.436209055496725e-05, "loss": 0.2024, "step": 591 }, { "epoch": 3.159439626417612, "grad_norm": 0.3466365634852416, "learning_rate": 1.429044906533088e-05, "loss": 0.242, "step": 592 }, { "epoch": 3.1647765176784524, "grad_norm": 0.3290618234863436, "learning_rate": 1.4218887248333123e-05, "loss": 0.2111, "step": 593 }, { "epoch": 3.170113408939293, "grad_norm": 0.351656000947437, "learning_rate": 1.414740610256717e-05, "loss": 0.225, "step": 594 }, { "epoch": 3.1754503002001333, "grad_norm": 0.3176619140403144, "learning_rate": 1.4076006625500526e-05, "loss": 0.2234, "step": 595 }, { "epoch": 3.1807871914609738, "grad_norm": 0.3355617376830507, "learning_rate": 1.4004689813461072e-05, "loss": 0.2105, "step": 596 }, { "epoch": 3.1861240827218147, "grad_norm": 0.3361984337758312, "learning_rate": 1.3933456661623142e-05, "loss": 0.2243, "step": 597 }, { "epoch": 3.191460973982655, "grad_norm": 0.3271843509477646, "learning_rate": 1.3862308163993667e-05, "loss": 0.2094, "step": 598 }, { "epoch": 3.1967978652434956, "grad_norm": 0.3571117855172697, "learning_rate": 1.379124531339827e-05, "loss": 0.2162, "step": 599 }, { "epoch": 3.202134756504336, "grad_norm": 0.3273165917601703, "learning_rate": 1.3720269101467454e-05, "loss": 0.2061, "step": 600 }, { "epoch": 3.2074716477651766, "grad_norm": 0.3284542388193666, "learning_rate": 1.364938051862274e-05, "loss": 0.2066, "step": 601 }, { "epoch": 3.2128085390260175, "grad_norm": 0.36447675615077985, "learning_rate": 1.3578580554062826e-05, "loss": 0.2281, "step": 602 }, { "epoch": 3.218145430286858, "grad_norm": 0.3262380092074158, "learning_rate": 1.3507870195749829e-05, "loss": 0.203, "step": 603 }, { "epoch": 3.2234823215476984, "grad_norm": 0.3457313320643053, "learning_rate": 1.3437250430395478e-05, "loss": 0.2379, "step": 604 }, { "epoch": 3.228819212808539, "grad_norm": 0.3441380900113131, "learning_rate": 1.336672224344732e-05, "loss": 0.2245, "step": 605 }, { "epoch": 3.2341561040693794, "grad_norm": 0.32996002379601147, "learning_rate": 1.3296286619075016e-05, "loss": 0.2227, "step": 606 }, { "epoch": 3.2394929953302203, "grad_norm": 0.3282894261484981, "learning_rate": 1.3225944540156565e-05, "loss": 0.2046, "step": 607 }, { "epoch": 3.2448298865910608, "grad_norm": 0.32710033563774793, "learning_rate": 1.3155696988264621e-05, "loss": 0.1997, "step": 608 }, { "epoch": 3.2501667778519012, "grad_norm": 0.3762040570145167, "learning_rate": 1.3085544943652783e-05, "loss": 0.235, "step": 609 }, { "epoch": 3.2555036691127417, "grad_norm": 0.3440879224425546, "learning_rate": 1.3015489385241895e-05, "loss": 0.2028, "step": 610 }, { "epoch": 3.260840560373582, "grad_norm": 0.38071227441207245, "learning_rate": 1.2945531290606423e-05, "loss": 0.2346, "step": 611 }, { "epoch": 3.266177451634423, "grad_norm": 0.31980915182543196, "learning_rate": 1.2875671635960807e-05, "loss": 0.214, "step": 612 }, { "epoch": 3.2715143428952635, "grad_norm": 0.35716629327114985, "learning_rate": 1.2805911396145794e-05, "loss": 0.2117, "step": 613 }, { "epoch": 3.276851234156104, "grad_norm": 0.34571819244635243, "learning_rate": 1.2736251544614903e-05, "loss": 0.2304, "step": 614 }, { "epoch": 3.2821881254169445, "grad_norm": 0.3208986289612815, "learning_rate": 1.2666693053420795e-05, "loss": 0.218, "step": 615 }, { "epoch": 3.287525016677785, "grad_norm": 0.3567717531275436, "learning_rate": 1.2597236893201712e-05, "loss": 0.2036, "step": 616 }, { "epoch": 3.292861907938626, "grad_norm": 0.32882485699097, "learning_rate": 1.2527884033167966e-05, "loss": 0.2248, "step": 617 }, { "epoch": 3.2981987991994663, "grad_norm": 0.33037504825082553, "learning_rate": 1.2458635441088354e-05, "loss": 0.2136, "step": 618 }, { "epoch": 3.303535690460307, "grad_norm": 0.3715971284418149, "learning_rate": 1.2389492083276719e-05, "loss": 0.2321, "step": 619 }, { "epoch": 3.3088725817211473, "grad_norm": 0.33125816705438366, "learning_rate": 1.2320454924578435e-05, "loss": 0.2364, "step": 620 }, { "epoch": 3.3142094729819878, "grad_norm": 0.33161675743461216, "learning_rate": 1.225152492835692e-05, "loss": 0.2092, "step": 621 }, { "epoch": 3.3195463642428287, "grad_norm": 0.34978398552096673, "learning_rate": 1.2182703056480243e-05, "loss": 0.2293, "step": 622 }, { "epoch": 3.324883255503669, "grad_norm": 0.32364794551815557, "learning_rate": 1.211399026930767e-05, "loss": 0.2204, "step": 623 }, { "epoch": 3.3302201467645096, "grad_norm": 0.3499702901876252, "learning_rate": 1.2045387525676253e-05, "loss": 0.2159, "step": 624 }, { "epoch": 3.33555703802535, "grad_norm": 0.29554206232950536, "learning_rate": 1.1976895782887488e-05, "loss": 0.199, "step": 625 }, { "epoch": 3.3408939292861906, "grad_norm": 0.3556138967561671, "learning_rate": 1.1908515996693927e-05, "loss": 0.2231, "step": 626 }, { "epoch": 3.3462308205470315, "grad_norm": 0.3363266594913331, "learning_rate": 1.1840249121285843e-05, "loss": 0.2252, "step": 627 }, { "epoch": 3.351567711807872, "grad_norm": 0.29662341791312835, "learning_rate": 1.1772096109277937e-05, "loss": 0.2, "step": 628 }, { "epoch": 3.3569046030687124, "grad_norm": 0.32468090780131365, "learning_rate": 1.1704057911696003e-05, "loss": 0.2124, "step": 629 }, { "epoch": 3.362241494329553, "grad_norm": 0.33364365893577863, "learning_rate": 1.1636135477963702e-05, "loss": 0.2418, "step": 630 }, { "epoch": 3.3675783855903934, "grad_norm": 0.32925700089942483, "learning_rate": 1.15683297558893e-05, "loss": 0.2139, "step": 631 }, { "epoch": 3.3729152768512343, "grad_norm": 0.3328549235968737, "learning_rate": 1.1500641691652412e-05, "loss": 0.2165, "step": 632 }, { "epoch": 3.3782521681120747, "grad_norm": 0.30882570277397, "learning_rate": 1.1433072229790847e-05, "loss": 0.2128, "step": 633 }, { "epoch": 3.383589059372915, "grad_norm": 0.3370088932960149, "learning_rate": 1.1365622313187402e-05, "loss": 0.2289, "step": 634 }, { "epoch": 3.3889259506337557, "grad_norm": 0.3419578297439491, "learning_rate": 1.1298292883056682e-05, "loss": 0.2295, "step": 635 }, { "epoch": 3.394262841894596, "grad_norm": 0.310821271131648, "learning_rate": 1.1231084878932018e-05, "loss": 0.1937, "step": 636 }, { "epoch": 3.399599733155437, "grad_norm": 0.32752165763324353, "learning_rate": 1.1163999238652328e-05, "loss": 0.2342, "step": 637 }, { "epoch": 3.4049366244162775, "grad_norm": 0.33007003787949457, "learning_rate": 1.109703689834901e-05, "loss": 0.2068, "step": 638 }, { "epoch": 3.410273515677118, "grad_norm": 0.34735756204554885, "learning_rate": 1.1030198792432915e-05, "loss": 0.2414, "step": 639 }, { "epoch": 3.4156104069379585, "grad_norm": 0.305560108048619, "learning_rate": 1.09634858535813e-05, "loss": 0.2097, "step": 640 }, { "epoch": 3.4209472981987994, "grad_norm": 0.3389861898800808, "learning_rate": 1.089689901272479e-05, "loss": 0.2127, "step": 641 }, { "epoch": 3.42628418945964, "grad_norm": 0.3207807998603898, "learning_rate": 1.0830439199034424e-05, "loss": 0.2226, "step": 642 }, { "epoch": 3.4316210807204803, "grad_norm": 0.3264054440024789, "learning_rate": 1.0764107339908643e-05, "loss": 0.222, "step": 643 }, { "epoch": 3.436957971981321, "grad_norm": 0.32600813384811467, "learning_rate": 1.0697904360960392e-05, "loss": 0.209, "step": 644 }, { "epoch": 3.4422948632421613, "grad_norm": 0.3177047337904919, "learning_rate": 1.06318311860042e-05, "loss": 0.2089, "step": 645 }, { "epoch": 3.447631754503002, "grad_norm": 0.3210423679681483, "learning_rate": 1.0565888737043238e-05, "loss": 0.2274, "step": 646 }, { "epoch": 3.4529686457638427, "grad_norm": 0.3142756745691945, "learning_rate": 1.050007793425653e-05, "loss": 0.208, "step": 647 }, { "epoch": 3.458305537024683, "grad_norm": 0.3189823143454847, "learning_rate": 1.0434399695986038e-05, "loss": 0.237, "step": 648 }, { "epoch": 3.4636424282855236, "grad_norm": 0.35446119159135153, "learning_rate": 1.0368854938723909e-05, "loss": 0.2257, "step": 649 }, { "epoch": 3.468979319546364, "grad_norm": 0.30379218539151415, "learning_rate": 1.0303444577099657e-05, "loss": 0.2055, "step": 650 }, { "epoch": 3.474316210807205, "grad_norm": 0.3334793611412552, "learning_rate": 1.023816952386738e-05, "loss": 0.2259, "step": 651 }, { "epoch": 3.4796531020680455, "grad_norm": 0.31245098736700866, "learning_rate": 1.0173030689893073e-05, "loss": 0.217, "step": 652 }, { "epoch": 3.484989993328886, "grad_norm": 0.298270066885541, "learning_rate": 1.010802898414188e-05, "loss": 0.1971, "step": 653 }, { "epoch": 3.4903268845897264, "grad_norm": 0.3270591833255489, "learning_rate": 1.0043165313665408e-05, "loss": 0.2278, "step": 654 }, { "epoch": 3.495663775850567, "grad_norm": 0.305730303527883, "learning_rate": 9.978440583589097e-06, "loss": 0.2177, "step": 655 }, { "epoch": 3.5010006671114073, "grad_norm": 0.3144303533888383, "learning_rate": 9.913855697099581e-06, "loss": 0.2244, "step": 656 }, { "epoch": 3.5063375583722483, "grad_norm": 0.3186316247887544, "learning_rate": 9.84941155543205e-06, "loss": 0.2117, "step": 657 }, { "epoch": 3.5116744496330887, "grad_norm": 0.3338362484473223, "learning_rate": 9.785109057857724e-06, "loss": 0.2175, "step": 658 }, { "epoch": 3.517011340893929, "grad_norm": 0.33877190549699376, "learning_rate": 9.720949101671283e-06, "loss": 0.2072, "step": 659 }, { "epoch": 3.52234823215477, "grad_norm": 0.2878815210471794, "learning_rate": 9.65693258217834e-06, "loss": 0.2024, "step": 660 }, { "epoch": 3.52768512341561, "grad_norm": 0.3229015014259361, "learning_rate": 9.59306039268296e-06, "loss": 0.2309, "step": 661 }, { "epoch": 3.533022014676451, "grad_norm": 0.329774980311716, "learning_rate": 9.529333424475165e-06, "loss": 0.21, "step": 662 }, { "epoch": 3.5383589059372915, "grad_norm": 0.3254821489847073, "learning_rate": 9.465752566818545e-06, "loss": 0.2255, "step": 663 }, { "epoch": 3.543695797198132, "grad_norm": 0.3318586960366466, "learning_rate": 9.402318706937818e-06, "loss": 0.2298, "step": 664 }, { "epoch": 3.549032688458973, "grad_norm": 0.3116201867034196, "learning_rate": 9.33903273000644e-06, "loss": 0.2054, "step": 665 }, { "epoch": 3.554369579719813, "grad_norm": 0.31694579049650856, "learning_rate": 9.275895519134284e-06, "loss": 0.222, "step": 666 }, { "epoch": 3.559706470980654, "grad_norm": 0.32312031003526426, "learning_rate": 9.212907955355302e-06, "loss": 0.2173, "step": 667 }, { "epoch": 3.5650433622414943, "grad_norm": 0.32535627286627056, "learning_rate": 9.150070917615209e-06, "loss": 0.224, "step": 668 }, { "epoch": 3.570380253502335, "grad_norm": 0.32754134685676567, "learning_rate": 9.087385282759262e-06, "loss": 0.2056, "step": 669 }, { "epoch": 3.5757171447631757, "grad_norm": 0.32097631798875437, "learning_rate": 9.024851925519984e-06, "loss": 0.2167, "step": 670 }, { "epoch": 3.581054036024016, "grad_norm": 0.3224914181755297, "learning_rate": 8.962471718504981e-06, "loss": 0.2318, "step": 671 }, { "epoch": 3.5863909272848566, "grad_norm": 0.3069463307574391, "learning_rate": 8.90024553218477e-06, "loss": 0.2147, "step": 672 }, { "epoch": 3.591727818545697, "grad_norm": 0.34503320829284617, "learning_rate": 8.838174234880595e-06, "loss": 0.2115, "step": 673 }, { "epoch": 3.5970647098065376, "grad_norm": 0.33721115686062114, "learning_rate": 8.776258692752355e-06, "loss": 0.2125, "step": 674 }, { "epoch": 3.6024016010673785, "grad_norm": 0.2988266857938936, "learning_rate": 8.714499769786504e-06, "loss": 0.2068, "step": 675 }, { "epoch": 3.607738492328219, "grad_norm": 0.33052375619924823, "learning_rate": 8.652898327783966e-06, "loss": 0.2081, "step": 676 }, { "epoch": 3.6130753835890594, "grad_norm": 0.2972205217005894, "learning_rate": 8.591455226348153e-06, "loss": 0.2165, "step": 677 }, { "epoch": 3.6184122748499, "grad_norm": 0.29969511000575444, "learning_rate": 8.530171322872943e-06, "loss": 0.2013, "step": 678 }, { "epoch": 3.6237491661107404, "grad_norm": 0.3304499281322153, "learning_rate": 8.469047472530721e-06, "loss": 0.2355, "step": 679 }, { "epoch": 3.6290860573715813, "grad_norm": 0.290277414481082, "learning_rate": 8.408084528260454e-06, "loss": 0.2072, "step": 680 }, { "epoch": 3.6344229486324218, "grad_norm": 0.33406571248792316, "learning_rate": 8.347283340755762e-06, "loss": 0.2103, "step": 681 }, { "epoch": 3.6397598398932622, "grad_norm": 0.33772777521240266, "learning_rate": 8.286644758453084e-06, "loss": 0.2277, "step": 682 }, { "epoch": 3.6450967311541027, "grad_norm": 0.32580165533637756, "learning_rate": 8.226169627519829e-06, "loss": 0.1972, "step": 683 }, { "epoch": 3.650433622414943, "grad_norm": 0.327604705269643, "learning_rate": 8.165858791842531e-06, "loss": 0.2301, "step": 684 }, { "epoch": 3.655770513675784, "grad_norm": 0.30673340194726156, "learning_rate": 8.10571309301513e-06, "loss": 0.2214, "step": 685 }, { "epoch": 3.6611074049366246, "grad_norm": 0.3082993998431756, "learning_rate": 8.045733370327197e-06, "loss": 0.2142, "step": 686 }, { "epoch": 3.666444296197465, "grad_norm": 0.3264898640096142, "learning_rate": 7.98592046075221e-06, "loss": 0.2239, "step": 687 }, { "epoch": 3.6717811874583055, "grad_norm": 0.3267139343805825, "learning_rate": 7.926275198935915e-06, "loss": 0.2174, "step": 688 }, { "epoch": 3.677118078719146, "grad_norm": 0.3038869171361146, "learning_rate": 7.866798417184631e-06, "loss": 0.2167, "step": 689 }, { "epoch": 3.682454969979987, "grad_norm": 0.3153766070982165, "learning_rate": 7.807490945453675e-06, "loss": 0.2214, "step": 690 }, { "epoch": 3.6877918612408274, "grad_norm": 0.3359093682690147, "learning_rate": 7.748353611335772e-06, "loss": 0.2282, "step": 691 }, { "epoch": 3.693128752501668, "grad_norm": 0.3235870658057771, "learning_rate": 7.689387240049475e-06, "loss": 0.215, "step": 692 }, { "epoch": 3.6984656437625083, "grad_norm": 0.32596212481215736, "learning_rate": 7.6305926544277e-06, "loss": 0.2204, "step": 693 }, { "epoch": 3.7038025350233488, "grad_norm": 0.3089795878726115, "learning_rate": 7.571970674906212e-06, "loss": 0.2013, "step": 694 }, { "epoch": 3.7091394262841897, "grad_norm": 0.3036096075117797, "learning_rate": 7.513522119512171e-06, "loss": 0.2128, "step": 695 }, { "epoch": 3.71447631754503, "grad_norm": 0.31370142673701046, "learning_rate": 7.455247803852741e-06, "loss": 0.2153, "step": 696 }, { "epoch": 3.7198132088058706, "grad_norm": 0.3164732161874329, "learning_rate": 7.397148541103698e-06, "loss": 0.215, "step": 697 }, { "epoch": 3.725150100066711, "grad_norm": 0.3143756963158667, "learning_rate": 7.339225141998076e-06, "loss": 0.2274, "step": 698 }, { "epoch": 3.7304869913275516, "grad_norm": 0.3259467908831874, "learning_rate": 7.281478414814869e-06, "loss": 0.2248, "step": 699 }, { "epoch": 3.7358238825883925, "grad_norm": 0.3243759701523797, "learning_rate": 7.223909165367722e-06, "loss": 0.2267, "step": 700 }, { "epoch": 3.741160773849233, "grad_norm": 0.29306669061610235, "learning_rate": 7.166518196993726e-06, "loss": 0.2039, "step": 701 }, { "epoch": 3.7464976651100734, "grad_norm": 0.3122606127392118, "learning_rate": 7.109306310542193e-06, "loss": 0.2247, "step": 702 }, { "epoch": 3.751834556370914, "grad_norm": 0.3084720774983147, "learning_rate": 7.052274304363449e-06, "loss": 0.2263, "step": 703 }, { "epoch": 3.7571714476317544, "grad_norm": 0.32807197455446596, "learning_rate": 6.995422974297748e-06, "loss": 0.2182, "step": 704 }, { "epoch": 3.7625083388925953, "grad_norm": 0.29627151938725177, "learning_rate": 6.938753113664138e-06, "loss": 0.223, "step": 705 }, { "epoch": 3.7678452301534358, "grad_norm": 0.30481388666511067, "learning_rate": 6.882265513249376e-06, "loss": 0.2153, "step": 706 }, { "epoch": 3.7731821214142762, "grad_norm": 0.30367279516217394, "learning_rate": 6.8259609612969245e-06, "loss": 0.1961, "step": 707 }, { "epoch": 3.7785190126751167, "grad_norm": 0.3311032509813633, "learning_rate": 6.769840243495937e-06, "loss": 0.2451, "step": 708 }, { "epoch": 3.783855903935957, "grad_norm": 0.3124678571976269, "learning_rate": 6.713904142970282e-06, "loss": 0.215, "step": 709 }, { "epoch": 3.789192795196798, "grad_norm": 0.31743166594564953, "learning_rate": 6.658153440267649e-06, "loss": 0.2176, "step": 710 }, { "epoch": 3.7945296864576386, "grad_norm": 0.28782289424591967, "learning_rate": 6.602588913348611e-06, "loss": 0.2094, "step": 711 }, { "epoch": 3.799866577718479, "grad_norm": 0.3124172224100049, "learning_rate": 6.547211337575812e-06, "loss": 0.2083, "step": 712 }, { "epoch": 3.8052034689793195, "grad_norm": 0.3173760507861447, "learning_rate": 6.4920214857031286e-06, "loss": 0.2181, "step": 713 }, { "epoch": 3.81054036024016, "grad_norm": 0.3191329518696444, "learning_rate": 6.437020127864863e-06, "loss": 0.2134, "step": 714 }, { "epoch": 3.815877251501001, "grad_norm": 0.3051391344384358, "learning_rate": 6.382208031565051e-06, "loss": 0.2204, "step": 715 }, { "epoch": 3.8212141427618413, "grad_norm": 0.31850920002112115, "learning_rate": 6.327585961666703e-06, "loss": 0.2253, "step": 716 }, { "epoch": 3.826551034022682, "grad_norm": 0.29793389352549826, "learning_rate": 6.273154680381152e-06, "loss": 0.2078, "step": 717 }, { "epoch": 3.8318879252835223, "grad_norm": 0.3003506055760915, "learning_rate": 6.218914947257424e-06, "loss": 0.231, "step": 718 }, { "epoch": 3.8372248165443628, "grad_norm": 0.320501388188613, "learning_rate": 6.164867519171609e-06, "loss": 0.2244, "step": 719 }, { "epoch": 3.8425617078052037, "grad_norm": 0.3141510603372196, "learning_rate": 6.111013150316336e-06, "loss": 0.2145, "step": 720 }, { "epoch": 3.847898599066044, "grad_norm": 0.28759406096829065, "learning_rate": 6.057352592190233e-06, "loss": 0.2059, "step": 721 }, { "epoch": 3.8532354903268846, "grad_norm": 0.2965011224320686, "learning_rate": 6.003886593587429e-06, "loss": 0.1994, "step": 722 }, { "epoch": 3.858572381587725, "grad_norm": 0.32309929809226157, "learning_rate": 5.9506159005871225e-06, "loss": 0.2223, "step": 723 }, { "epoch": 3.8639092728485656, "grad_norm": 0.3064344176543871, "learning_rate": 5.897541256543171e-06, "loss": 0.2165, "step": 724 }, { "epoch": 3.8692461641094065, "grad_norm": 0.30830702220241546, "learning_rate": 5.844663402073696e-06, "loss": 0.222, "step": 725 }, { "epoch": 3.874583055370247, "grad_norm": 0.29076332143502975, "learning_rate": 5.791983075050773e-06, "loss": 0.2048, "step": 726 }, { "epoch": 3.8799199466310874, "grad_norm": 0.300114275064821, "learning_rate": 5.739501010590132e-06, "loss": 0.2084, "step": 727 }, { "epoch": 3.885256837891928, "grad_norm": 0.3045488309986525, "learning_rate": 5.68721794104087e-06, "loss": 0.2197, "step": 728 }, { "epoch": 3.8905937291527684, "grad_norm": 0.327552950910515, "learning_rate": 5.635134595975285e-06, "loss": 0.2175, "step": 729 }, { "epoch": 3.8959306204136093, "grad_norm": 0.30484919275231337, "learning_rate": 5.583251702178634e-06, "loss": 0.2093, "step": 730 }, { "epoch": 3.9012675116744497, "grad_norm": 0.2921364622612547, "learning_rate": 5.531569983639045e-06, "loss": 0.2058, "step": 731 }, { "epoch": 3.90660440293529, "grad_norm": 0.30049818877171114, "learning_rate": 5.480090161537388e-06, "loss": 0.2259, "step": 732 }, { "epoch": 3.9119412941961307, "grad_norm": 0.3139671791644443, "learning_rate": 5.4288129542371995e-06, "loss": 0.2197, "step": 733 }, { "epoch": 3.917278185456971, "grad_norm": 0.31085024950173556, "learning_rate": 5.377739077274688e-06, "loss": 0.2223, "step": 734 }, { "epoch": 3.922615076717812, "grad_norm": 0.27795634676595493, "learning_rate": 5.326869243348734e-06, "loss": 0.2087, "step": 735 }, { "epoch": 3.9279519679786525, "grad_norm": 0.292718813818914, "learning_rate": 5.276204162310938e-06, "loss": 0.2138, "step": 736 }, { "epoch": 3.933288859239493, "grad_norm": 0.3124029055450099, "learning_rate": 5.225744541155731e-06, "loss": 0.2202, "step": 737 }, { "epoch": 3.9386257505003335, "grad_norm": 0.3056001516045311, "learning_rate": 5.1754910840105e-06, "loss": 0.2203, "step": 738 }, { "epoch": 3.943962641761174, "grad_norm": 0.28861734955419716, "learning_rate": 5.125444492125748e-06, "loss": 0.2118, "step": 739 }, { "epoch": 3.949299533022015, "grad_norm": 0.2944862914827709, "learning_rate": 5.075605463865348e-06, "loss": 0.215, "step": 740 }, { "epoch": 3.9546364242828553, "grad_norm": 0.3018187959209579, "learning_rate": 5.025974694696747e-06, "loss": 0.2121, "step": 741 }, { "epoch": 3.959973315543696, "grad_norm": 0.32968530294516596, "learning_rate": 4.9765528771813065e-06, "loss": 0.2194, "step": 742 }, { "epoch": 3.9653102068045363, "grad_norm": 0.2895487447372504, "learning_rate": 4.92734070096462e-06, "loss": 0.1945, "step": 743 }, { "epoch": 3.9706470980653767, "grad_norm": 0.30516795101430666, "learning_rate": 4.878338852766871e-06, "loss": 0.2218, "step": 744 }, { "epoch": 3.9759839893262177, "grad_norm": 0.30604355308841524, "learning_rate": 4.829548016373285e-06, "loss": 0.2166, "step": 745 }, { "epoch": 3.981320880587058, "grad_norm": 0.2907648161711581, "learning_rate": 4.780968872624569e-06, "loss": 0.2155, "step": 746 }, { "epoch": 3.9866577718478986, "grad_norm": 0.2981568120720774, "learning_rate": 4.732602099407402e-06, "loss": 0.2271, "step": 747 }, { "epoch": 3.991994663108739, "grad_norm": 0.30885152124963694, "learning_rate": 4.684448371645003e-06, "loss": 0.2183, "step": 748 }, { "epoch": 3.9973315543695795, "grad_norm": 0.29884575186957585, "learning_rate": 4.636508361287675e-06, "loss": 0.2158, "step": 749 }, { "epoch": 4.0026684456304205, "grad_norm": 0.601890871189883, "learning_rate": 4.58878273730347e-06, "loss": 0.3212, "step": 750 }, { "epoch": 4.0080053368912605, "grad_norm": 0.5298412615002074, "learning_rate": 4.541272165668829e-06, "loss": 0.1575, "step": 751 }, { "epoch": 4.013342228152101, "grad_norm": 0.42841188304774436, "learning_rate": 4.493977309359279e-06, "loss": 0.1504, "step": 752 }, { "epoch": 4.018679119412942, "grad_norm": 0.3120373077214368, "learning_rate": 4.4468988283402135e-06, "loss": 0.1287, "step": 753 }, { "epoch": 4.024016010673782, "grad_norm": 0.33027912692579064, "learning_rate": 4.40003737955766e-06, "loss": 0.161, "step": 754 }, { "epoch": 4.029352901934623, "grad_norm": 0.39662465830617283, "learning_rate": 4.353393616929118e-06, "loss": 0.15, "step": 755 }, { "epoch": 4.034689793195463, "grad_norm": 0.4875405375103197, "learning_rate": 4.306968191334437e-06, "loss": 0.1542, "step": 756 }, { "epoch": 4.040026684456304, "grad_norm": 0.4647132436994288, "learning_rate": 4.260761750606734e-06, "loss": 0.1372, "step": 757 }, { "epoch": 4.045363575717145, "grad_norm": 0.4316859008197165, "learning_rate": 4.2147749395233365e-06, "loss": 0.1507, "step": 758 }, { "epoch": 4.050700466977985, "grad_norm": 0.36991769149322723, "learning_rate": 4.1690083997968216e-06, "loss": 0.1399, "step": 759 }, { "epoch": 4.056037358238826, "grad_norm": 0.3081151075975586, "learning_rate": 4.123462770066013e-06, "loss": 0.1358, "step": 760 }, { "epoch": 4.061374249499666, "grad_norm": 0.33035087289352366, "learning_rate": 4.078138685887125e-06, "loss": 0.137, "step": 761 }, { "epoch": 4.066711140760507, "grad_norm": 0.35282903465510224, "learning_rate": 4.033036779724848e-06, "loss": 0.1423, "step": 762 }, { "epoch": 4.072048032021348, "grad_norm": 0.35728767536135847, "learning_rate": 3.988157680943536e-06, "loss": 0.1471, "step": 763 }, { "epoch": 4.077384923282188, "grad_norm": 0.35082302549758365, "learning_rate": 3.943502015798437e-06, "loss": 0.1385, "step": 764 }, { "epoch": 4.082721814543029, "grad_norm": 0.30763129510502485, "learning_rate": 3.899070407426948e-06, "loss": 0.1308, "step": 765 }, { "epoch": 4.088058705803869, "grad_norm": 0.31052345237501106, "learning_rate": 3.854863475839898e-06, "loss": 0.1508, "step": 766 }, { "epoch": 4.09339559706471, "grad_norm": 0.29117416131590373, "learning_rate": 3.810881837912934e-06, "loss": 0.1359, "step": 767 }, { "epoch": 4.098732488325551, "grad_norm": 0.32738573833211637, "learning_rate": 3.7671261073778875e-06, "loss": 0.1551, "step": 768 }, { "epoch": 4.104069379586391, "grad_norm": 0.2879978040339806, "learning_rate": 3.7235968948142098e-06, "loss": 0.1285, "step": 769 }, { "epoch": 4.109406270847232, "grad_norm": 0.3297634034251823, "learning_rate": 3.6802948076404675e-06, "loss": 0.1476, "step": 770 }, { "epoch": 4.114743162108072, "grad_norm": 0.30016892407880663, "learning_rate": 3.6372204501058494e-06, "loss": 0.1295, "step": 771 }, { "epoch": 4.120080053368913, "grad_norm": 0.3145248211219936, "learning_rate": 3.5943744232817455e-06, "loss": 0.1296, "step": 772 }, { "epoch": 4.1254169446297535, "grad_norm": 0.31318694134932973, "learning_rate": 3.551757325053362e-06, "loss": 0.1482, "step": 773 }, { "epoch": 4.1307538358905935, "grad_norm": 0.2948784037632883, "learning_rate": 3.5093697501113645e-06, "loss": 0.1422, "step": 774 }, { "epoch": 4.136090727151434, "grad_norm": 0.2801444303260538, "learning_rate": 3.4672122899435935e-06, "loss": 0.1352, "step": 775 }, { "epoch": 4.1414276184122745, "grad_norm": 0.2824012703736762, "learning_rate": 3.4252855328268055e-06, "loss": 0.1505, "step": 776 }, { "epoch": 4.146764509673115, "grad_norm": 0.2877569800912583, "learning_rate": 3.3835900638184538e-06, "loss": 0.1336, "step": 777 }, { "epoch": 4.152101400933956, "grad_norm": 0.2922681146418443, "learning_rate": 3.3421264647485476e-06, "loss": 0.1403, "step": 778 }, { "epoch": 4.157438292194796, "grad_norm": 0.29571418661558607, "learning_rate": 3.300895314211503e-06, "loss": 0.1417, "step": 779 }, { "epoch": 4.162775183455637, "grad_norm": 0.2807197693531769, "learning_rate": 3.259897187558101e-06, "loss": 0.1321, "step": 780 }, { "epoch": 4.168112074716477, "grad_norm": 0.26947880551875214, "learning_rate": 3.219132656887445e-06, "loss": 0.1393, "step": 781 }, { "epoch": 4.173448965977318, "grad_norm": 0.30694317413028555, "learning_rate": 3.1786022910389524e-06, "loss": 0.1541, "step": 782 }, { "epoch": 4.178785857238159, "grad_norm": 0.2702053773322514, "learning_rate": 3.1383066555844686e-06, "loss": 0.1245, "step": 783 }, { "epoch": 4.184122748498999, "grad_norm": 0.3036431279445982, "learning_rate": 3.0982463128203346e-06, "loss": 0.1378, "step": 784 }, { "epoch": 4.18945963975984, "grad_norm": 0.2903132944155927, "learning_rate": 3.058421821759545e-06, "loss": 0.1446, "step": 785 }, { "epoch": 4.19479653102068, "grad_norm": 0.2693686823614546, "learning_rate": 3.0188337381239696e-06, "loss": 0.1388, "step": 786 }, { "epoch": 4.200133422281521, "grad_norm": 0.2547143693355037, "learning_rate": 2.9794826143365794e-06, "loss": 0.1284, "step": 787 }, { "epoch": 4.205470313542362, "grad_norm": 0.2751535551881633, "learning_rate": 2.940368999513734e-06, "loss": 0.1375, "step": 788 }, { "epoch": 4.210807204803202, "grad_norm": 0.2778666347500376, "learning_rate": 2.901493439457543e-06, "loss": 0.1303, "step": 789 }, { "epoch": 4.216144096064043, "grad_norm": 0.2846619856811341, "learning_rate": 2.8628564766482193e-06, "loss": 0.1492, "step": 790 }, { "epoch": 4.221480987324883, "grad_norm": 0.281092186868078, "learning_rate": 2.824458650236532e-06, "loss": 0.1414, "step": 791 }, { "epoch": 4.226817878585724, "grad_norm": 0.2832353643098496, "learning_rate": 2.7863004960362784e-06, "loss": 0.141, "step": 792 }, { "epoch": 4.232154769846565, "grad_norm": 0.26863830308187436, "learning_rate": 2.748382546516799e-06, "loss": 0.1276, "step": 793 }, { "epoch": 4.237491661107405, "grad_norm": 0.2837290905016847, "learning_rate": 2.7107053307955535e-06, "loss": 0.1464, "step": 794 }, { "epoch": 4.242828552368246, "grad_norm": 0.30003856403259976, "learning_rate": 2.6732693746307405e-06, "loss": 0.1467, "step": 795 }, { "epoch": 4.248165443629086, "grad_norm": 0.3031551246837749, "learning_rate": 2.6360752004139457e-06, "loss": 0.141, "step": 796 }, { "epoch": 4.253502334889927, "grad_norm": 0.2839203101647867, "learning_rate": 2.599123327162876e-06, "loss": 0.128, "step": 797 }, { "epoch": 4.2588392261507675, "grad_norm": 0.28309288050123305, "learning_rate": 2.5624142705140974e-06, "loss": 0.1424, "step": 798 }, { "epoch": 4.2641761174116075, "grad_norm": 0.3055488535217384, "learning_rate": 2.5259485427158436e-06, "loss": 0.1489, "step": 799 }, { "epoch": 4.269513008672448, "grad_norm": 0.300540177885969, "learning_rate": 2.489726652620883e-06, "loss": 0.1362, "step": 800 }, { "epoch": 4.2748498999332885, "grad_norm": 0.29347736303425087, "learning_rate": 2.453749105679386e-06, "loss": 0.1465, "step": 801 }, { "epoch": 4.280186791194129, "grad_norm": 0.27707210287753226, "learning_rate": 2.418016403931909e-06, "loss": 0.1329, "step": 802 }, { "epoch": 4.28552368245497, "grad_norm": 0.2920650228372569, "learning_rate": 2.382529046002371e-06, "loss": 0.1527, "step": 803 }, { "epoch": 4.29086057371581, "grad_norm": 0.287631363925361, "learning_rate": 2.347287527091082e-06, "loss": 0.1422, "step": 804 }, { "epoch": 4.296197464976651, "grad_norm": 0.28709688346306034, "learning_rate": 2.3122923389678607e-06, "loss": 0.1285, "step": 805 }, { "epoch": 4.301534356237491, "grad_norm": 0.27342773854893104, "learning_rate": 2.2775439699651567e-06, "loss": 0.1388, "step": 806 }, { "epoch": 4.306871247498332, "grad_norm": 0.2666822866123337, "learning_rate": 2.2430429049712268e-06, "loss": 0.1336, "step": 807 }, { "epoch": 4.312208138759173, "grad_norm": 0.2983890987441143, "learning_rate": 2.208789625423391e-06, "loss": 0.1508, "step": 808 }, { "epoch": 4.317545030020013, "grad_norm": 0.2800353859335473, "learning_rate": 2.174784609301306e-06, "loss": 0.1319, "step": 809 }, { "epoch": 4.322881921280854, "grad_norm": 0.29623766050646816, "learning_rate": 2.141028331120276e-06, "loss": 0.1513, "step": 810 }, { "epoch": 4.328218812541694, "grad_norm": 0.27571067910906955, "learning_rate": 2.107521261924668e-06, "loss": 0.1358, "step": 811 }, { "epoch": 4.333555703802535, "grad_norm": 0.29103395140128885, "learning_rate": 2.0742638692813033e-06, "loss": 0.1309, "step": 812 }, { "epoch": 4.338892595063376, "grad_norm": 0.2933371021781572, "learning_rate": 2.0412566172729554e-06, "loss": 0.1351, "step": 813 }, { "epoch": 4.344229486324216, "grad_norm": 0.27215608874364955, "learning_rate": 2.0084999664918725e-06, "loss": 0.1475, "step": 814 }, { "epoch": 4.349566377585057, "grad_norm": 0.2883150380634738, "learning_rate": 1.9759943740333256e-06, "loss": 0.1475, "step": 815 }, { "epoch": 4.354903268845897, "grad_norm": 0.2777208010579726, "learning_rate": 1.943740293489267e-06, "loss": 0.1479, "step": 816 }, { "epoch": 4.360240160106738, "grad_norm": 0.2824439004885457, "learning_rate": 1.9117381749419794e-06, "loss": 0.1471, "step": 817 }, { "epoch": 4.365577051367579, "grad_norm": 0.28150269206830153, "learning_rate": 1.8799884649577915e-06, "loss": 0.132, "step": 818 }, { "epoch": 4.370913942628419, "grad_norm": 0.29174288272730886, "learning_rate": 1.8484916065808622e-06, "loss": 0.1482, "step": 819 }, { "epoch": 4.37625083388926, "grad_norm": 0.27656705304849727, "learning_rate": 1.8172480393269797e-06, "loss": 0.137, "step": 820 }, { "epoch": 4.3815877251501, "grad_norm": 0.2757754977908488, "learning_rate": 1.7862581991774486e-06, "loss": 0.1377, "step": 821 }, { "epoch": 4.386924616410941, "grad_norm": 0.2750527107134215, "learning_rate": 1.755522518572994e-06, "loss": 0.1207, "step": 822 }, { "epoch": 4.3922615076717815, "grad_norm": 0.29276192591791234, "learning_rate": 1.725041426407723e-06, "loss": 0.1401, "step": 823 }, { "epoch": 4.3975983989326215, "grad_norm": 0.29627925099149893, "learning_rate": 1.6948153480231511e-06, "loss": 0.1403, "step": 824 }, { "epoch": 4.402935290193462, "grad_norm": 0.2781883617488326, "learning_rate": 1.6648447052022643e-06, "loss": 0.1266, "step": 825 }, { "epoch": 4.408272181454302, "grad_norm": 0.2996634586484779, "learning_rate": 1.6351299161636202e-06, "loss": 0.1599, "step": 826 }, { "epoch": 4.413609072715143, "grad_norm": 0.26162944731844284, "learning_rate": 1.6056713955555349e-06, "loss": 0.1365, "step": 827 }, { "epoch": 4.418945963975984, "grad_norm": 0.30091733604376597, "learning_rate": 1.5764695544502774e-06, "loss": 0.1536, "step": 828 }, { "epoch": 4.424282855236824, "grad_norm": 0.28660862503859624, "learning_rate": 1.5475248003383382e-06, "loss": 0.1343, "step": 829 }, { "epoch": 4.429619746497665, "grad_norm": 0.2913443376429063, "learning_rate": 1.5188375371227525e-06, "loss": 0.1519, "step": 830 }, { "epoch": 4.434956637758505, "grad_norm": 0.2701467713064408, "learning_rate": 1.4904081651134527e-06, "loss": 0.1262, "step": 831 }, { "epoch": 4.440293529019346, "grad_norm": 0.2852211039949671, "learning_rate": 1.462237081021689e-06, "loss": 0.1501, "step": 832 }, { "epoch": 4.445630420280187, "grad_norm": 0.28582152725735244, "learning_rate": 1.4343246779544929e-06, "loss": 0.1482, "step": 833 }, { "epoch": 4.450967311541027, "grad_norm": 0.2763605840804222, "learning_rate": 1.4066713454091808e-06, "loss": 0.1375, "step": 834 }, { "epoch": 4.456304202801868, "grad_norm": 0.2918688271426615, "learning_rate": 1.3792774692679366e-06, "loss": 0.1432, "step": 835 }, { "epoch": 4.461641094062708, "grad_norm": 0.3054006010342176, "learning_rate": 1.3521434317924186e-06, "loss": 0.1491, "step": 836 }, { "epoch": 4.466977985323549, "grad_norm": 0.2824524390265842, "learning_rate": 1.3252696116184184e-06, "loss": 0.13, "step": 837 }, { "epoch": 4.47231487658439, "grad_norm": 0.2892331408612295, "learning_rate": 1.2986563837505894e-06, "loss": 0.1433, "step": 838 }, { "epoch": 4.47765176784523, "grad_norm": 0.27746574091084786, "learning_rate": 1.2723041195572106e-06, "loss": 0.1375, "step": 839 }, { "epoch": 4.482988659106071, "grad_norm": 0.2907412475565416, "learning_rate": 1.246213186764995e-06, "loss": 0.1578, "step": 840 }, { "epoch": 4.488325550366911, "grad_norm": 0.27648095040345927, "learning_rate": 1.2203839494539738e-06, "loss": 0.1415, "step": 841 }, { "epoch": 4.493662441627752, "grad_norm": 0.2821554850140892, "learning_rate": 1.1948167680523981e-06, "loss": 0.1416, "step": 842 }, { "epoch": 4.498999332888593, "grad_norm": 0.2659555704630584, "learning_rate": 1.1695119993317271e-06, "loss": 0.1262, "step": 843 }, { "epoch": 4.504336224149433, "grad_norm": 0.27101074307755924, "learning_rate": 1.1444699964016448e-06, "loss": 0.1339, "step": 844 }, { "epoch": 4.509673115410274, "grad_norm": 0.27920563868852216, "learning_rate": 1.1196911087051143e-06, "loss": 0.144, "step": 845 }, { "epoch": 4.515010006671114, "grad_norm": 0.2917517252311828, "learning_rate": 1.0951756820135294e-06, "loss": 0.1562, "step": 846 }, { "epoch": 4.5203468979319545, "grad_norm": 0.2719082360193771, "learning_rate": 1.070924058421876e-06, "loss": 0.1393, "step": 847 }, { "epoch": 4.5256837891927955, "grad_norm": 0.255901572414045, "learning_rate": 1.0469365763439532e-06, "loss": 0.1277, "step": 848 }, { "epoch": 4.5310206804536355, "grad_norm": 0.2763364181685816, "learning_rate": 1.0232135705076596e-06, "loss": 0.1551, "step": 849 }, { "epoch": 4.536357571714476, "grad_norm": 0.2962740172540047, "learning_rate": 9.997553719503239e-07, "loss": 0.1561, "step": 850 }, { "epoch": 4.541694462975316, "grad_norm": 0.268054629742994, "learning_rate": 9.765623080140774e-07, "loss": 0.1234, "step": 851 }, { "epoch": 4.547031354236157, "grad_norm": 0.3035670976166526, "learning_rate": 9.536347023412928e-07, "loss": 0.1636, "step": 852 }, { "epoch": 4.552368245496998, "grad_norm": 0.2793525064851917, "learning_rate": 9.309728748700574e-07, "loss": 0.1398, "step": 853 }, { "epoch": 4.557705136757838, "grad_norm": 0.28016641299104156, "learning_rate": 9.085771418297274e-07, "loss": 0.1292, "step": 854 }, { "epoch": 4.563042028018679, "grad_norm": 0.27699022892058145, "learning_rate": 8.864478157364997e-07, "loss": 0.1435, "step": 855 }, { "epoch": 4.568378919279519, "grad_norm": 0.2720294929447123, "learning_rate": 8.645852053890547e-07, "loss": 0.1324, "step": 856 }, { "epoch": 4.57371581054036, "grad_norm": 0.28532423639550997, "learning_rate": 8.429896158642492e-07, "loss": 0.1472, "step": 857 }, { "epoch": 4.579052701801201, "grad_norm": 0.26985015917853045, "learning_rate": 8.216613485128611e-07, "loss": 0.141, "step": 858 }, { "epoch": 4.584389593062041, "grad_norm": 0.2704957723845787, "learning_rate": 8.00600700955374e-07, "loss": 0.1361, "step": 859 }, { "epoch": 4.589726484322882, "grad_norm": 0.28012140457224527, "learning_rate": 7.798079670778391e-07, "loss": 0.1282, "step": 860 }, { "epoch": 4.595063375583722, "grad_norm": 0.2782972818576599, "learning_rate": 7.592834370277624e-07, "loss": 0.1243, "step": 861 }, { "epoch": 4.600400266844563, "grad_norm": 0.29307525545180607, "learning_rate": 7.390273972100614e-07, "loss": 0.1442, "step": 862 }, { "epoch": 4.605737158105404, "grad_norm": 0.2702797863957165, "learning_rate": 7.190401302830729e-07, "loss": 0.1313, "step": 863 }, { "epoch": 4.611074049366244, "grad_norm": 0.2884103769485427, "learning_rate": 6.993219151545871e-07, "loss": 0.1507, "step": 864 }, { "epoch": 4.616410940627085, "grad_norm": 0.3042376630394671, "learning_rate": 6.798730269779907e-07, "loss": 0.1413, "step": 865 }, { "epoch": 4.621747831887925, "grad_norm": 0.30142009513304885, "learning_rate": 6.60693737148399e-07, "loss": 0.1411, "step": 866 }, { "epoch": 4.627084723148766, "grad_norm": 0.28563889877580295, "learning_rate": 6.417843132988744e-07, "loss": 0.138, "step": 867 }, { "epoch": 4.632421614409607, "grad_norm": 0.2656629603197653, "learning_rate": 6.231450192967048e-07, "loss": 0.1421, "step": 868 }, { "epoch": 4.637758505670447, "grad_norm": 0.26399752051623476, "learning_rate": 6.047761152397025e-07, "loss": 0.1325, "step": 869 }, { "epoch": 4.643095396931288, "grad_norm": 0.27939357858366015, "learning_rate": 5.866778574525933e-07, "loss": 0.1346, "step": 870 }, { "epoch": 4.648432288192128, "grad_norm": 0.268759877392077, "learning_rate": 5.688504984834287e-07, "loss": 0.1367, "step": 871 }, { "epoch": 4.6537691794529685, "grad_norm": 0.2898319032741956, "learning_rate": 5.512942871000549e-07, "loss": 0.1516, "step": 872 }, { "epoch": 4.6591060707138094, "grad_norm": 0.28063437477691194, "learning_rate": 5.340094682866603e-07, "loss": 0.1278, "step": 873 }, { "epoch": 4.6644429619746495, "grad_norm": 0.2872370132188477, "learning_rate": 5.169962832403475e-07, "loss": 0.1502, "step": 874 }, { "epoch": 4.66977985323549, "grad_norm": 0.2712192364166597, "learning_rate": 5.002549693677594e-07, "loss": 0.1337, "step": 875 }, { "epoch": 4.67511674449633, "grad_norm": 0.29491089886365257, "learning_rate": 4.837857602817808e-07, "loss": 0.1503, "step": 876 }, { "epoch": 4.680453635757171, "grad_norm": 0.27587942631382745, "learning_rate": 4.675888857982669e-07, "loss": 0.1327, "step": 877 }, { "epoch": 4.685790527018012, "grad_norm": 0.29332732966215386, "learning_rate": 4.5166457193284386e-07, "loss": 0.1458, "step": 878 }, { "epoch": 4.691127418278852, "grad_norm": 0.28714021402000245, "learning_rate": 4.3601304089775366e-07, "loss": 0.1466, "step": 879 }, { "epoch": 4.696464309539693, "grad_norm": 0.26784688798952216, "learning_rate": 4.2063451109874756e-07, "loss": 0.1274, "step": 880 }, { "epoch": 4.701801200800533, "grad_norm": 0.2797049205351308, "learning_rate": 4.055291971320485e-07, "loss": 0.142, "step": 881 }, { "epoch": 4.707138092061374, "grad_norm": 0.26674171880102343, "learning_rate": 3.906973097813449e-07, "loss": 0.1293, "step": 882 }, { "epoch": 4.712474983322215, "grad_norm": 0.25308546573318846, "learning_rate": 3.76139056014857e-07, "loss": 0.1315, "step": 883 }, { "epoch": 4.717811874583055, "grad_norm": 0.2710104168106361, "learning_rate": 3.6185463898245066e-07, "loss": 0.1489, "step": 884 }, { "epoch": 4.723148765843896, "grad_norm": 0.28048882254908447, "learning_rate": 3.478442580127972e-07, "loss": 0.144, "step": 885 }, { "epoch": 4.728485657104736, "grad_norm": 0.26485229244166536, "learning_rate": 3.341081086105891e-07, "loss": 0.1255, "step": 886 }, { "epoch": 4.733822548365577, "grad_norm": 0.2759331791232634, "learning_rate": 3.2064638245382194e-07, "loss": 0.1568, "step": 887 }, { "epoch": 4.739159439626418, "grad_norm": 0.26413539339616676, "learning_rate": 3.0745926739111033e-07, "loss": 0.1342, "step": 888 }, { "epoch": 4.744496330887258, "grad_norm": 0.28475596898962846, "learning_rate": 2.9454694743907386e-07, "loss": 0.1398, "step": 889 }, { "epoch": 4.749833222148099, "grad_norm": 0.277687235094201, "learning_rate": 2.819096027797641e-07, "loss": 0.1342, "step": 890 }, { "epoch": 4.755170113408939, "grad_norm": 0.2957077369519711, "learning_rate": 2.6954740975815076e-07, "loss": 0.1464, "step": 891 }, { "epoch": 4.76050700466978, "grad_norm": 0.2907912947380036, "learning_rate": 2.57460540879666e-07, "loss": 0.154, "step": 892 }, { "epoch": 4.765843895930621, "grad_norm": 0.2801914453007158, "learning_rate": 2.4564916480778855e-07, "loss": 0.1468, "step": 893 }, { "epoch": 4.771180787191461, "grad_norm": 0.2711625198266653, "learning_rate": 2.3411344636169898e-07, "loss": 0.1301, "step": 894 }, { "epoch": 4.776517678452302, "grad_norm": 0.2839350340429628, "learning_rate": 2.228535465139703e-07, "loss": 0.1495, "step": 895 }, { "epoch": 4.781854569713142, "grad_norm": 0.27595704547443245, "learning_rate": 2.1186962238833653e-07, "loss": 0.1238, "step": 896 }, { "epoch": 4.7871914609739825, "grad_norm": 0.28199603087443903, "learning_rate": 2.0116182725748334e-07, "loss": 0.1334, "step": 897 }, { "epoch": 4.792528352234823, "grad_norm": 0.2743312329373981, "learning_rate": 1.907303105409164e-07, "loss": 0.146, "step": 898 }, { "epoch": 4.7978652434956635, "grad_norm": 0.2780483555703688, "learning_rate": 1.80575217802883e-07, "loss": 0.1459, "step": 899 }, { "epoch": 4.803202134756504, "grad_norm": 0.2805564993027487, "learning_rate": 1.7069669075032492e-07, "loss": 0.1393, "step": 900 }, { "epoch": 4.808539026017344, "grad_norm": 0.2703447762812398, "learning_rate": 1.6109486723092426e-07, "loss": 0.1315, "step": 901 }, { "epoch": 4.813875917278185, "grad_norm": 0.2847007738527403, "learning_rate": 1.5176988123114966e-07, "loss": 0.1495, "step": 902 }, { "epoch": 4.819212808539026, "grad_norm": 0.2653003442730712, "learning_rate": 1.4272186287441535e-07, "loss": 0.1355, "step": 903 }, { "epoch": 4.824549699799866, "grad_norm": 0.2822152085841712, "learning_rate": 1.3395093841925166e-07, "loss": 0.1576, "step": 904 }, { "epoch": 4.829886591060707, "grad_norm": 0.27304693619702214, "learning_rate": 1.2545723025753743e-07, "loss": 0.1316, "step": 905 }, { "epoch": 4.835223482321548, "grad_norm": 0.27468533143866714, "learning_rate": 1.1724085691280806e-07, "loss": 0.132, "step": 906 }, { "epoch": 4.840560373582388, "grad_norm": 0.27963914069565776, "learning_rate": 1.0930193303858805e-07, "loss": 0.1461, "step": 907 }, { "epoch": 4.845897264843229, "grad_norm": 0.2565432498525225, "learning_rate": 1.0164056941679657e-07, "loss": 0.1246, "step": 908 }, { "epoch": 4.851234156104069, "grad_norm": 0.2811061477351414, "learning_rate": 9.42568729561999e-08, "loss": 0.1443, "step": 909 }, { "epoch": 4.85657104736491, "grad_norm": 0.2857439707607634, "learning_rate": 8.715094669092816e-08, "loss": 0.1338, "step": 910 }, { "epoch": 4.861907938625751, "grad_norm": 0.2767918923747537, "learning_rate": 8.032288977901647e-08, "loss": 0.1317, "step": 911 }, { "epoch": 4.867244829886591, "grad_norm": 0.27736070767280885, "learning_rate": 7.377279750104605e-08, "loss": 0.1332, "step": 912 }, { "epoch": 4.872581721147432, "grad_norm": 0.27702046457250123, "learning_rate": 6.750076125880079e-08, "loss": 0.1403, "step": 913 }, { "epoch": 4.877918612408272, "grad_norm": 0.2726767818543548, "learning_rate": 6.150686857399057e-08, "loss": 0.1375, "step": 914 }, { "epoch": 4.883255503669113, "grad_norm": 0.2770778486020138, "learning_rate": 5.5791203087041114e-08, "loss": 0.1448, "step": 915 }, { "epoch": 4.888592394929954, "grad_norm": 0.2673740108848184, "learning_rate": 5.0353844555910415e-08, "loss": 0.1401, "step": 916 }, { "epoch": 4.893929286190794, "grad_norm": 0.2773633602325955, "learning_rate": 4.5194868854991913e-08, "loss": 0.1408, "step": 917 }, { "epoch": 4.899266177451635, "grad_norm": 0.29556441642236714, "learning_rate": 4.031434797404421e-08, "loss": 0.1381, "step": 918 }, { "epoch": 4.904603068712475, "grad_norm": 0.2597939381066577, "learning_rate": 3.571235001719853e-08, "loss": 0.1283, "step": 919 }, { "epoch": 4.909939959973316, "grad_norm": 0.26747252132965876, "learning_rate": 3.13889392019906e-08, "loss": 0.1424, "step": 920 }, { "epoch": 4.9152768512341565, "grad_norm": 0.28253875872873746, "learning_rate": 2.734417585848137e-08, "loss": 0.1428, "step": 921 }, { "epoch": 4.9206137424949965, "grad_norm": 0.28810571816428815, "learning_rate": 2.3578116428408792e-08, "loss": 0.1478, "step": 922 }, { "epoch": 4.925950633755837, "grad_norm": 0.2585384085401356, "learning_rate": 2.0090813464395122e-08, "loss": 0.1388, "step": 923 }, { "epoch": 4.931287525016677, "grad_norm": 0.2820464252629273, "learning_rate": 1.6882315629225267e-08, "loss": 0.1495, "step": 924 }, { "epoch": 4.936624416277518, "grad_norm": 0.283069152205124, "learning_rate": 1.3952667695156241e-08, "loss": 0.1303, "step": 925 }, { "epoch": 4.941961307538359, "grad_norm": 0.27731020890016556, "learning_rate": 1.1301910543295436e-08, "loss": 0.1329, "step": 926 }, { "epoch": 4.947298198799199, "grad_norm": 0.2906776181838218, "learning_rate": 8.93008116303884e-09, "loss": 0.1623, "step": 927 }, { "epoch": 4.95263509006004, "grad_norm": 0.27016994542838946, "learning_rate": 6.8372126515403594e-09, "loss": 0.1419, "step": 928 }, { "epoch": 4.95797198132088, "grad_norm": 0.268976173834872, "learning_rate": 5.0233342132632865e-09, "loss": 0.1335, "step": 929 }, { "epoch": 4.963308872581721, "grad_norm": 0.2831312656649367, "learning_rate": 3.4884711595650765e-09, "loss": 0.1481, "step": 930 }, { "epoch": 4.968645763842562, "grad_norm": 0.2723703468394432, "learning_rate": 2.2326449083420745e-09, "loss": 0.1325, "step": 931 }, { "epoch": 4.973982655103402, "grad_norm": 0.2875352034119992, "learning_rate": 1.255872983740858e-09, "loss": 0.1477, "step": 932 }, { "epoch": 4.979319546364243, "grad_norm": 0.2737232836278831, "learning_rate": 5.581690159006669e-10, "loss": 0.1447, "step": 933 }, { "epoch": 4.984656437625083, "grad_norm": 0.2956259263292112, "learning_rate": 1.3954274078020748e-10, "loss": 0.1562, "step": 934 }, { "epoch": 4.989993328885924, "grad_norm": 0.285955394980644, "learning_rate": 0.0, "loss": 0.1344, "step": 935 }, { "epoch": 4.989993328885924, "step": 935, "total_flos": 1.946622601061204e+18, "train_loss": 0.3608587793966028, "train_runtime": 57464.5588, "train_samples_per_second": 2.087, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 935, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.946622601061204e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }