{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9983155530600785, "eval_steps": 500, "global_step": 1335, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022459292532285235, "grad_norm": 1.9819730520248413, "learning_rate": 1.1940298507462686e-06, "loss": 0.8684, "step": 1 }, { "epoch": 0.004491858506457047, "grad_norm": 2.0259573459625244, "learning_rate": 2.3880597014925373e-06, "loss": 0.8821, "step": 2 }, { "epoch": 0.00673778775968557, "grad_norm": 2.085845947265625, "learning_rate": 3.582089552238806e-06, "loss": 0.9416, "step": 3 }, { "epoch": 0.008983717012914094, "grad_norm": 1.9758139848709106, "learning_rate": 4.7761194029850745e-06, "loss": 0.9074, "step": 4 }, { "epoch": 0.011229646266142616, "grad_norm": 1.7997628450393677, "learning_rate": 5.970149253731343e-06, "loss": 0.9069, "step": 5 }, { "epoch": 0.01347557551937114, "grad_norm": 1.4710899591445923, "learning_rate": 7.164179104477612e-06, "loss": 0.8754, "step": 6 }, { "epoch": 0.015721504772599662, "grad_norm": 1.4286643266677856, "learning_rate": 8.35820895522388e-06, "loss": 0.8896, "step": 7 }, { "epoch": 0.017967434025828188, "grad_norm": 1.2477623224258423, "learning_rate": 9.552238805970149e-06, "loss": 0.8545, "step": 8 }, { "epoch": 0.02021336327905671, "grad_norm": 1.2966960668563843, "learning_rate": 1.074626865671642e-05, "loss": 0.839, "step": 9 }, { "epoch": 0.022459292532285232, "grad_norm": 1.4356369972229004, "learning_rate": 1.1940298507462686e-05, "loss": 0.8639, "step": 10 }, { "epoch": 0.024705221785513758, "grad_norm": 1.1653496026992798, "learning_rate": 1.3134328358208957e-05, "loss": 0.8142, "step": 11 }, { "epoch": 0.02695115103874228, "grad_norm": 0.9282035231590271, "learning_rate": 1.4328358208955224e-05, "loss": 0.8022, "step": 12 }, { "epoch": 0.029197080291970802, "grad_norm": 1.086950421333313, "learning_rate": 1.5522388059701494e-05, "loss": 0.7908, "step": 13 }, { "epoch": 0.031443009545199324, "grad_norm": 0.7987905144691467, "learning_rate": 1.671641791044776e-05, "loss": 0.7838, "step": 14 }, { "epoch": 0.033688938798427846, "grad_norm": 0.7030066847801208, "learning_rate": 1.791044776119403e-05, "loss": 0.7856, "step": 15 }, { "epoch": 0.035934868051656375, "grad_norm": 0.7216812372207642, "learning_rate": 1.9104477611940298e-05, "loss": 0.7666, "step": 16 }, { "epoch": 0.0381807973048849, "grad_norm": 0.7004393935203552, "learning_rate": 2.029850746268657e-05, "loss": 0.7602, "step": 17 }, { "epoch": 0.04042672655811342, "grad_norm": 0.5651209950447083, "learning_rate": 2.149253731343284e-05, "loss": 0.7637, "step": 18 }, { "epoch": 0.04267265581134194, "grad_norm": 0.5799914598464966, "learning_rate": 2.2686567164179106e-05, "loss": 0.7357, "step": 19 }, { "epoch": 0.044918585064570464, "grad_norm": 0.531233549118042, "learning_rate": 2.3880597014925373e-05, "loss": 0.7552, "step": 20 }, { "epoch": 0.047164514317798986, "grad_norm": 0.5684418678283691, "learning_rate": 2.5074626865671646e-05, "loss": 0.7671, "step": 21 }, { "epoch": 0.049410443571027515, "grad_norm": 0.4873054623603821, "learning_rate": 2.6268656716417913e-05, "loss": 0.737, "step": 22 }, { "epoch": 0.05165637282425604, "grad_norm": 0.49275314807891846, "learning_rate": 2.746268656716418e-05, "loss": 0.7362, "step": 23 }, { "epoch": 0.05390230207748456, "grad_norm": 0.47075843811035156, "learning_rate": 2.8656716417910447e-05, "loss": 0.7234, "step": 24 }, { "epoch": 0.05614823133071308, "grad_norm": 0.3865251839160919, "learning_rate": 2.985074626865672e-05, "loss": 0.7169, "step": 25 }, { "epoch": 0.058394160583941604, "grad_norm": 0.4154004156589508, "learning_rate": 3.104477611940299e-05, "loss": 0.7119, "step": 26 }, { "epoch": 0.060640089837170126, "grad_norm": 0.37092125415802, "learning_rate": 3.2238805970149255e-05, "loss": 0.7138, "step": 27 }, { "epoch": 0.06288601909039865, "grad_norm": 0.3488411605358124, "learning_rate": 3.343283582089552e-05, "loss": 0.7216, "step": 28 }, { "epoch": 0.06513194834362718, "grad_norm": 0.32693928480148315, "learning_rate": 3.462686567164179e-05, "loss": 0.6925, "step": 29 }, { "epoch": 0.06737787759685569, "grad_norm": 0.34904295206069946, "learning_rate": 3.582089552238806e-05, "loss": 0.7105, "step": 30 }, { "epoch": 0.06962380685008422, "grad_norm": 0.32673367857933044, "learning_rate": 3.701492537313433e-05, "loss": 0.696, "step": 31 }, { "epoch": 0.07186973610331275, "grad_norm": 0.32177790999412537, "learning_rate": 3.8208955223880596e-05, "loss": 0.7064, "step": 32 }, { "epoch": 0.07411566535654127, "grad_norm": 0.3286134600639343, "learning_rate": 3.940298507462687e-05, "loss": 0.7091, "step": 33 }, { "epoch": 0.0763615946097698, "grad_norm": 0.3438747525215149, "learning_rate": 4.059701492537314e-05, "loss": 0.7149, "step": 34 }, { "epoch": 0.07860752386299831, "grad_norm": 0.29362648725509644, "learning_rate": 4.1791044776119404e-05, "loss": 0.685, "step": 35 }, { "epoch": 0.08085345311622684, "grad_norm": 0.30074256658554077, "learning_rate": 4.298507462686568e-05, "loss": 0.7011, "step": 36 }, { "epoch": 0.08309938236945537, "grad_norm": 0.3120618462562561, "learning_rate": 4.4179104477611944e-05, "loss": 0.684, "step": 37 }, { "epoch": 0.08534531162268388, "grad_norm": 0.2569892406463623, "learning_rate": 4.537313432835821e-05, "loss": 0.684, "step": 38 }, { "epoch": 0.08759124087591241, "grad_norm": 0.28327882289886475, "learning_rate": 4.6567164179104485e-05, "loss": 0.6968, "step": 39 }, { "epoch": 0.08983717012914093, "grad_norm": 0.26424843072891235, "learning_rate": 4.7761194029850745e-05, "loss": 0.6915, "step": 40 }, { "epoch": 0.09208309938236946, "grad_norm": 0.2620261609554291, "learning_rate": 4.895522388059702e-05, "loss": 0.6744, "step": 41 }, { "epoch": 0.09432902863559797, "grad_norm": 0.32121092081069946, "learning_rate": 5.014925373134329e-05, "loss": 0.6746, "step": 42 }, { "epoch": 0.0965749578888265, "grad_norm": 0.3997937738895416, "learning_rate": 5.134328358208955e-05, "loss": 0.6806, "step": 43 }, { "epoch": 0.09882088714205503, "grad_norm": 0.3264799416065216, "learning_rate": 5.2537313432835826e-05, "loss": 0.6729, "step": 44 }, { "epoch": 0.10106681639528355, "grad_norm": 0.33052176237106323, "learning_rate": 5.37313432835821e-05, "loss": 0.6758, "step": 45 }, { "epoch": 0.10331274564851207, "grad_norm": 0.43345314264297485, "learning_rate": 5.492537313432836e-05, "loss": 0.6767, "step": 46 }, { "epoch": 0.10555867490174059, "grad_norm": 0.37080681324005127, "learning_rate": 5.6119402985074634e-05, "loss": 0.6526, "step": 47 }, { "epoch": 0.10780460415496912, "grad_norm": 0.381356418132782, "learning_rate": 5.7313432835820894e-05, "loss": 0.6739, "step": 48 }, { "epoch": 0.11005053340819765, "grad_norm": 0.36677348613739014, "learning_rate": 5.850746268656717e-05, "loss": 0.6782, "step": 49 }, { "epoch": 0.11229646266142616, "grad_norm": 0.40393349528312683, "learning_rate": 5.970149253731344e-05, "loss": 0.6528, "step": 50 }, { "epoch": 0.11454239191465469, "grad_norm": 0.5078914165496826, "learning_rate": 6.08955223880597e-05, "loss": 0.6697, "step": 51 }, { "epoch": 0.11678832116788321, "grad_norm": 0.742857813835144, "learning_rate": 6.208955223880598e-05, "loss": 0.6341, "step": 52 }, { "epoch": 0.11903425042111174, "grad_norm": 0.8604367971420288, "learning_rate": 6.328358208955224e-05, "loss": 0.662, "step": 53 }, { "epoch": 0.12128017967434025, "grad_norm": 0.7391287684440613, "learning_rate": 6.447761194029851e-05, "loss": 0.6696, "step": 54 }, { "epoch": 0.12352610892756878, "grad_norm": 0.5703966617584229, "learning_rate": 6.567164179104479e-05, "loss": 0.6619, "step": 55 }, { "epoch": 0.1257720381807973, "grad_norm": 0.7210264801979065, "learning_rate": 6.686567164179104e-05, "loss": 0.6647, "step": 56 }, { "epoch": 0.12801796743402583, "grad_norm": 0.8133912682533264, "learning_rate": 6.805970149253732e-05, "loss": 0.658, "step": 57 }, { "epoch": 0.13026389668725435, "grad_norm": 0.9062953591346741, "learning_rate": 6.925373134328358e-05, "loss": 0.6732, "step": 58 }, { "epoch": 0.13250982594048288, "grad_norm": 0.9497516751289368, "learning_rate": 7.044776119402986e-05, "loss": 0.6743, "step": 59 }, { "epoch": 0.13475575519371139, "grad_norm": 0.5923281908035278, "learning_rate": 7.164179104477612e-05, "loss": 0.6609, "step": 60 }, { "epoch": 0.13700168444693991, "grad_norm": 0.839241087436676, "learning_rate": 7.283582089552239e-05, "loss": 0.6673, "step": 61 }, { "epoch": 0.13924761370016844, "grad_norm": 0.9110313653945923, "learning_rate": 7.402985074626866e-05, "loss": 0.6795, "step": 62 }, { "epoch": 0.14149354295339697, "grad_norm": 0.6465680599212646, "learning_rate": 7.522388059701494e-05, "loss": 0.6634, "step": 63 }, { "epoch": 0.1437394722066255, "grad_norm": 0.5419987440109253, "learning_rate": 7.641791044776119e-05, "loss": 0.6489, "step": 64 }, { "epoch": 0.145985401459854, "grad_norm": 0.6124593019485474, "learning_rate": 7.761194029850747e-05, "loss": 0.6617, "step": 65 }, { "epoch": 0.14823133071308253, "grad_norm": 0.5836852788925171, "learning_rate": 7.880597014925374e-05, "loss": 0.6319, "step": 66 }, { "epoch": 0.15047725996631106, "grad_norm": 0.6319289207458496, "learning_rate": 8e-05, "loss": 0.6504, "step": 67 }, { "epoch": 0.1527231892195396, "grad_norm": 0.6081493496894836, "learning_rate": 8.119402985074627e-05, "loss": 0.65, "step": 68 }, { "epoch": 0.15496911847276812, "grad_norm": 0.5973412394523621, "learning_rate": 8.238805970149255e-05, "loss": 0.6449, "step": 69 }, { "epoch": 0.15721504772599662, "grad_norm": 0.6423139572143555, "learning_rate": 8.358208955223881e-05, "loss": 0.6584, "step": 70 }, { "epoch": 0.15946097697922515, "grad_norm": 0.7579260468482971, "learning_rate": 8.477611940298507e-05, "loss": 0.6472, "step": 71 }, { "epoch": 0.16170690623245368, "grad_norm": 0.8475743532180786, "learning_rate": 8.597014925373135e-05, "loss": 0.6405, "step": 72 }, { "epoch": 0.1639528354856822, "grad_norm": 0.5964512228965759, "learning_rate": 8.716417910447762e-05, "loss": 0.633, "step": 73 }, { "epoch": 0.16619876473891074, "grad_norm": 0.47265729308128357, "learning_rate": 8.835820895522389e-05, "loss": 0.6453, "step": 74 }, { "epoch": 0.16844469399213924, "grad_norm": 0.7188097238540649, "learning_rate": 8.955223880597014e-05, "loss": 0.6603, "step": 75 }, { "epoch": 0.17069062324536777, "grad_norm": 0.49939826130867004, "learning_rate": 9.074626865671642e-05, "loss": 0.6339, "step": 76 }, { "epoch": 0.1729365524985963, "grad_norm": 0.5468081831932068, "learning_rate": 9.194029850746269e-05, "loss": 0.639, "step": 77 }, { "epoch": 0.17518248175182483, "grad_norm": 0.6105530858039856, "learning_rate": 9.313432835820897e-05, "loss": 0.6537, "step": 78 }, { "epoch": 0.17742841100505333, "grad_norm": 0.48114606738090515, "learning_rate": 9.432835820895524e-05, "loss": 0.6579, "step": 79 }, { "epoch": 0.17967434025828186, "grad_norm": 0.6263488531112671, "learning_rate": 9.552238805970149e-05, "loss": 0.6292, "step": 80 }, { "epoch": 0.18192026951151039, "grad_norm": 0.5369325280189514, "learning_rate": 9.671641791044777e-05, "loss": 0.6608, "step": 81 }, { "epoch": 0.18416619876473891, "grad_norm": 0.7140039801597595, "learning_rate": 9.791044776119404e-05, "loss": 0.6339, "step": 82 }, { "epoch": 0.18641212801796744, "grad_norm": 0.9011125564575195, "learning_rate": 9.91044776119403e-05, "loss": 0.6342, "step": 83 }, { "epoch": 0.18865805727119594, "grad_norm": 1.1369616985321045, "learning_rate": 0.00010029850746268659, "loss": 0.6442, "step": 84 }, { "epoch": 0.19090398652442447, "grad_norm": 1.0306285619735718, "learning_rate": 0.00010149253731343285, "loss": 0.6419, "step": 85 }, { "epoch": 0.193149915777653, "grad_norm": 0.8979660272598267, "learning_rate": 0.0001026865671641791, "loss": 0.632, "step": 86 }, { "epoch": 0.19539584503088153, "grad_norm": 0.6676183342933655, "learning_rate": 0.00010388059701492539, "loss": 0.6386, "step": 87 }, { "epoch": 0.19764177428411006, "grad_norm": 0.7217721939086914, "learning_rate": 0.00010507462686567165, "loss": 0.6546, "step": 88 }, { "epoch": 0.19988770353733856, "grad_norm": 0.7290446162223816, "learning_rate": 0.00010626865671641792, "loss": 0.6328, "step": 89 }, { "epoch": 0.2021336327905671, "grad_norm": 0.8381432890892029, "learning_rate": 0.0001074626865671642, "loss": 0.6311, "step": 90 }, { "epoch": 0.20437956204379562, "grad_norm": 1.0938982963562012, "learning_rate": 0.00010865671641791045, "loss": 0.6559, "step": 91 }, { "epoch": 0.20662549129702415, "grad_norm": 0.8039063215255737, "learning_rate": 0.00010985074626865672, "loss": 0.636, "step": 92 }, { "epoch": 0.20887142055025268, "grad_norm": 0.7171061635017395, "learning_rate": 0.000111044776119403, "loss": 0.6456, "step": 93 }, { "epoch": 0.21111734980348118, "grad_norm": 0.7186174988746643, "learning_rate": 0.00011223880597014927, "loss": 0.6285, "step": 94 }, { "epoch": 0.2133632790567097, "grad_norm": 0.6290779113769531, "learning_rate": 0.00011343283582089553, "loss": 0.6336, "step": 95 }, { "epoch": 0.21560920830993824, "grad_norm": 0.7359249591827393, "learning_rate": 0.00011462686567164179, "loss": 0.6542, "step": 96 }, { "epoch": 0.21785513756316677, "grad_norm": 0.775365948677063, "learning_rate": 0.00011582089552238807, "loss": 0.6369, "step": 97 }, { "epoch": 0.2201010668163953, "grad_norm": 0.8260976076126099, "learning_rate": 0.00011701492537313434, "loss": 0.6142, "step": 98 }, { "epoch": 0.2223469960696238, "grad_norm": 0.704872727394104, "learning_rate": 0.00011820895522388062, "loss": 0.6473, "step": 99 }, { "epoch": 0.22459292532285233, "grad_norm": 0.5987293124198914, "learning_rate": 0.00011940298507462688, "loss": 0.6458, "step": 100 }, { "epoch": 0.22683885457608086, "grad_norm": 0.7472802400588989, "learning_rate": 0.00012059701492537314, "loss": 0.6235, "step": 101 }, { "epoch": 0.22908478382930939, "grad_norm": 0.7303177118301392, "learning_rate": 0.0001217910447761194, "loss": 0.6432, "step": 102 }, { "epoch": 0.2313307130825379, "grad_norm": 0.5669957995414734, "learning_rate": 0.00012298507462686568, "loss": 0.6276, "step": 103 }, { "epoch": 0.23357664233576642, "grad_norm": 0.47117286920547485, "learning_rate": 0.00012417910447761195, "loss": 0.6429, "step": 104 }, { "epoch": 0.23582257158899494, "grad_norm": 0.6563988327980042, "learning_rate": 0.00012537313432835822, "loss": 0.6276, "step": 105 }, { "epoch": 0.23806850084222347, "grad_norm": 0.5849066972732544, "learning_rate": 0.00012656716417910448, "loss": 0.6309, "step": 106 }, { "epoch": 0.240314430095452, "grad_norm": 0.7347849607467651, "learning_rate": 0.00012776119402985075, "loss": 0.6382, "step": 107 }, { "epoch": 0.2425603593486805, "grad_norm": 0.6520137190818787, "learning_rate": 0.00012895522388059702, "loss": 0.6386, "step": 108 }, { "epoch": 0.24480628860190903, "grad_norm": 0.60540372133255, "learning_rate": 0.00013014925373134329, "loss": 0.613, "step": 109 }, { "epoch": 0.24705221785513756, "grad_norm": 0.7710558176040649, "learning_rate": 0.00013134328358208958, "loss": 0.6104, "step": 110 }, { "epoch": 0.2492981471083661, "grad_norm": 0.6582499742507935, "learning_rate": 0.00013253731343283582, "loss": 0.628, "step": 111 }, { "epoch": 0.2515440763615946, "grad_norm": 0.6089588403701782, "learning_rate": 0.00013373134328358209, "loss": 0.6313, "step": 112 }, { "epoch": 0.2537900056148231, "grad_norm": 0.5754179358482361, "learning_rate": 0.00013492537313432838, "loss": 0.6283, "step": 113 }, { "epoch": 0.25603593486805165, "grad_norm": 0.617273211479187, "learning_rate": 0.00013611940298507465, "loss": 0.6187, "step": 114 }, { "epoch": 0.2582818641212802, "grad_norm": 0.6104961037635803, "learning_rate": 0.00013731343283582091, "loss": 0.6267, "step": 115 }, { "epoch": 0.2605277933745087, "grad_norm": 0.691856861114502, "learning_rate": 0.00013850746268656715, "loss": 0.6202, "step": 116 }, { "epoch": 0.26277372262773724, "grad_norm": 0.8089864253997803, "learning_rate": 0.00013970149253731345, "loss": 0.635, "step": 117 }, { "epoch": 0.26501965188096577, "grad_norm": 1.1346023082733154, "learning_rate": 0.00014089552238805972, "loss": 0.6462, "step": 118 }, { "epoch": 0.2672655811341943, "grad_norm": 0.8319297432899475, "learning_rate": 0.00014208955223880598, "loss": 0.6179, "step": 119 }, { "epoch": 0.26951151038742277, "grad_norm": 0.5904942154884338, "learning_rate": 0.00014328358208955225, "loss": 0.629, "step": 120 }, { "epoch": 0.2717574396406513, "grad_norm": 0.5950160026550293, "learning_rate": 0.00014447761194029852, "loss": 0.6245, "step": 121 }, { "epoch": 0.27400336889387983, "grad_norm": 0.6426451802253723, "learning_rate": 0.00014567164179104478, "loss": 0.614, "step": 122 }, { "epoch": 0.27624929814710836, "grad_norm": 0.6028596758842468, "learning_rate": 0.00014686567164179105, "loss": 0.6127, "step": 123 }, { "epoch": 0.2784952274003369, "grad_norm": 0.6075330972671509, "learning_rate": 0.00014805970149253732, "loss": 0.6283, "step": 124 }, { "epoch": 0.2807411566535654, "grad_norm": 0.6084921360015869, "learning_rate": 0.0001492537313432836, "loss": 0.6351, "step": 125 }, { "epoch": 0.28298708590679394, "grad_norm": 0.627112865447998, "learning_rate": 0.00015044776119402988, "loss": 0.6393, "step": 126 }, { "epoch": 0.2852330151600225, "grad_norm": 0.6501988172531128, "learning_rate": 0.00015164179104477612, "loss": 0.6097, "step": 127 }, { "epoch": 0.287478944413251, "grad_norm": 0.6280235648155212, "learning_rate": 0.00015283582089552238, "loss": 0.6281, "step": 128 }, { "epoch": 0.28972487366647953, "grad_norm": 0.49232786893844604, "learning_rate": 0.00015402985074626868, "loss": 0.6341, "step": 129 }, { "epoch": 0.291970802919708, "grad_norm": 0.5303974747657776, "learning_rate": 0.00015522388059701495, "loss": 0.6098, "step": 130 }, { "epoch": 0.29421673217293653, "grad_norm": 0.5729207992553711, "learning_rate": 0.0001564179104477612, "loss": 0.617, "step": 131 }, { "epoch": 0.29646266142616506, "grad_norm": 0.6265519857406616, "learning_rate": 0.00015761194029850748, "loss": 0.5968, "step": 132 }, { "epoch": 0.2987085906793936, "grad_norm": 0.6463232636451721, "learning_rate": 0.00015880597014925375, "loss": 0.6391, "step": 133 }, { "epoch": 0.3009545199326221, "grad_norm": 0.593257486820221, "learning_rate": 0.00016, "loss": 0.6189, "step": 134 }, { "epoch": 0.30320044918585065, "grad_norm": 0.5925970077514648, "learning_rate": 0.00015999972630083387, "loss": 0.6139, "step": 135 }, { "epoch": 0.3054463784390792, "grad_norm": 0.6394967436790466, "learning_rate": 0.00015999890520520824, "loss": 0.6038, "step": 136 }, { "epoch": 0.3076923076923077, "grad_norm": 0.649044394493103, "learning_rate": 0.00015999753671874147, "loss": 0.614, "step": 137 }, { "epoch": 0.30993823694553624, "grad_norm": 0.6179019808769226, "learning_rate": 0.00015999562085079733, "loss": 0.6171, "step": 138 }, { "epoch": 0.3121841661987647, "grad_norm": 0.5114040374755859, "learning_rate": 0.0001599931576144852, "loss": 0.6076, "step": 139 }, { "epoch": 0.31443009545199324, "grad_norm": 0.5721436142921448, "learning_rate": 0.00015999014702665964, "loss": 0.6173, "step": 140 }, { "epoch": 0.31667602470522177, "grad_norm": 0.7164266109466553, "learning_rate": 0.00015998658910792058, "loss": 0.611, "step": 141 }, { "epoch": 0.3189219539584503, "grad_norm": 0.8134217858314514, "learning_rate": 0.00015998248388261302, "loss": 0.6296, "step": 142 }, { "epoch": 0.32116788321167883, "grad_norm": 0.828131377696991, "learning_rate": 0.00015997783137882682, "loss": 0.6331, "step": 143 }, { "epoch": 0.32341381246490736, "grad_norm": 0.7628505825996399, "learning_rate": 0.00015997263162839667, "loss": 0.6524, "step": 144 }, { "epoch": 0.3256597417181359, "grad_norm": 0.6403250098228455, "learning_rate": 0.0001599668846669018, "loss": 0.6097, "step": 145 }, { "epoch": 0.3279056709713644, "grad_norm": 0.5496403574943542, "learning_rate": 0.00015996059053366562, "loss": 0.6187, "step": 146 }, { "epoch": 0.33015160022459294, "grad_norm": 0.6352928876876831, "learning_rate": 0.0001599537492717556, "loss": 0.619, "step": 147 }, { "epoch": 0.3323975294778215, "grad_norm": 0.6073532104492188, "learning_rate": 0.00015994636092798295, "loss": 0.6218, "step": 148 }, { "epoch": 0.33464345873104995, "grad_norm": 0.40914225578308105, "learning_rate": 0.00015993842555290226, "loss": 0.6161, "step": 149 }, { "epoch": 0.3368893879842785, "grad_norm": 0.4364437758922577, "learning_rate": 0.0001599299432008112, "loss": 0.637, "step": 150 }, { "epoch": 0.339135317237507, "grad_norm": 0.5311095118522644, "learning_rate": 0.00015992091392975002, "loss": 0.5972, "step": 151 }, { "epoch": 0.34138124649073553, "grad_norm": 0.545671284198761, "learning_rate": 0.00015991133780150136, "loss": 0.6103, "step": 152 }, { "epoch": 0.34362717574396406, "grad_norm": 0.4276280105113983, "learning_rate": 0.00015990121488158968, "loss": 0.6148, "step": 153 }, { "epoch": 0.3458731049971926, "grad_norm": 0.4059518575668335, "learning_rate": 0.00015989054523928085, "loss": 0.6332, "step": 154 }, { "epoch": 0.3481190342504211, "grad_norm": 0.42028188705444336, "learning_rate": 0.00015987932894758164, "loss": 0.5972, "step": 155 }, { "epoch": 0.35036496350364965, "grad_norm": 0.44919151067733765, "learning_rate": 0.00015986756608323932, "loss": 0.6017, "step": 156 }, { "epoch": 0.3526108927568782, "grad_norm": 0.3990235924720764, "learning_rate": 0.00015985525672674103, "loss": 0.6146, "step": 157 }, { "epoch": 0.35485682201010665, "grad_norm": 0.42787206172943115, "learning_rate": 0.0001598424009623133, "loss": 0.6199, "step": 158 }, { "epoch": 0.3571027512633352, "grad_norm": 0.44234439730644226, "learning_rate": 0.00015982899887792145, "loss": 0.6279, "step": 159 }, { "epoch": 0.3593486805165637, "grad_norm": 0.4152110815048218, "learning_rate": 0.00015981505056526893, "loss": 0.6032, "step": 160 }, { "epoch": 0.36159460976979224, "grad_norm": 0.36194872856140137, "learning_rate": 0.0001598005561197968, "loss": 0.6286, "step": 161 }, { "epoch": 0.36384053902302077, "grad_norm": 0.4214819669723511, "learning_rate": 0.00015978551564068295, "loss": 0.6006, "step": 162 }, { "epoch": 0.3660864682762493, "grad_norm": 0.41948559880256653, "learning_rate": 0.00015976992923084161, "loss": 0.615, "step": 163 }, { "epoch": 0.36833239752947783, "grad_norm": 0.44141775369644165, "learning_rate": 0.00015975379699692245, "loss": 0.6236, "step": 164 }, { "epoch": 0.37057832678270636, "grad_norm": 0.47903239727020264, "learning_rate": 0.00015973711904930993, "loss": 0.5979, "step": 165 }, { "epoch": 0.3728242560359349, "grad_norm": 0.45099982619285583, "learning_rate": 0.00015971989550212255, "loss": 0.6229, "step": 166 }, { "epoch": 0.3750701852891634, "grad_norm": 0.4214828312397003, "learning_rate": 0.00015970212647321207, "loss": 0.6146, "step": 167 }, { "epoch": 0.3773161145423919, "grad_norm": 0.43835896253585815, "learning_rate": 0.00015968381208416273, "loss": 0.6162, "step": 168 }, { "epoch": 0.3795620437956204, "grad_norm": 0.4372192621231079, "learning_rate": 0.00015966495246029033, "loss": 0.6152, "step": 169 }, { "epoch": 0.38180797304884895, "grad_norm": 0.45570138096809387, "learning_rate": 0.00015964554773064148, "loss": 0.6107, "step": 170 }, { "epoch": 0.3840539023020775, "grad_norm": 0.5014758706092834, "learning_rate": 0.0001596255980279926, "loss": 0.5861, "step": 171 }, { "epoch": 0.386299831555306, "grad_norm": 0.46727222204208374, "learning_rate": 0.00015960510348884914, "loss": 0.6104, "step": 172 }, { "epoch": 0.38854576080853453, "grad_norm": 0.5081140398979187, "learning_rate": 0.00015958406425344455, "loss": 0.5948, "step": 173 }, { "epoch": 0.39079169006176306, "grad_norm": 0.4470350742340088, "learning_rate": 0.00015956248046573938, "loss": 0.5924, "step": 174 }, { "epoch": 0.3930376193149916, "grad_norm": 0.36340662837028503, "learning_rate": 0.00015954035227342019, "loss": 0.5972, "step": 175 }, { "epoch": 0.3952835485682201, "grad_norm": 0.34276771545410156, "learning_rate": 0.00015951767982789875, "loss": 0.5955, "step": 176 }, { "epoch": 0.39752947782144865, "grad_norm": 0.35867977142333984, "learning_rate": 0.00015949446328431075, "loss": 0.611, "step": 177 }, { "epoch": 0.3997754070746771, "grad_norm": 0.3728366792201996, "learning_rate": 0.00015947070280151492, "loss": 0.6117, "step": 178 }, { "epoch": 0.40202133632790565, "grad_norm": 0.32302939891815186, "learning_rate": 0.00015944639854209184, "loss": 0.6225, "step": 179 }, { "epoch": 0.4042672655811342, "grad_norm": 0.33579641580581665, "learning_rate": 0.00015942155067234293, "loss": 0.5915, "step": 180 }, { "epoch": 0.4065131948343627, "grad_norm": 0.29243412613868713, "learning_rate": 0.00015939615936228922, "loss": 0.5915, "step": 181 }, { "epoch": 0.40875912408759124, "grad_norm": 0.32980793714523315, "learning_rate": 0.00015937022478567023, "loss": 0.6172, "step": 182 }, { "epoch": 0.41100505334081977, "grad_norm": 0.30575114488601685, "learning_rate": 0.0001593437471199427, "loss": 0.5958, "step": 183 }, { "epoch": 0.4132509825940483, "grad_norm": 0.3298383951187134, "learning_rate": 0.00015931672654627958, "loss": 0.5949, "step": 184 }, { "epoch": 0.41549691184727683, "grad_norm": 0.330642431974411, "learning_rate": 0.00015928916324956855, "loss": 0.5929, "step": 185 }, { "epoch": 0.41774284110050536, "grad_norm": 0.33059626817703247, "learning_rate": 0.00015926105741841088, "loss": 0.609, "step": 186 }, { "epoch": 0.41998877035373383, "grad_norm": 0.3915445804595947, "learning_rate": 0.00015923240924512014, "loss": 0.6045, "step": 187 }, { "epoch": 0.42223469960696236, "grad_norm": 0.3589101731777191, "learning_rate": 0.00015920321892572088, "loss": 0.6175, "step": 188 }, { "epoch": 0.4244806288601909, "grad_norm": 0.399964302778244, "learning_rate": 0.00015917348665994723, "loss": 0.6157, "step": 189 }, { "epoch": 0.4267265581134194, "grad_norm": 0.3923908770084381, "learning_rate": 0.0001591432126512416, "loss": 0.6041, "step": 190 }, { "epoch": 0.42897248736664795, "grad_norm": 0.38844165205955505, "learning_rate": 0.0001591123971067533, "loss": 0.5865, "step": 191 }, { "epoch": 0.4312184166198765, "grad_norm": 0.41744178533554077, "learning_rate": 0.00015908104023733697, "loss": 0.5823, "step": 192 }, { "epoch": 0.433464345873105, "grad_norm": 0.3478281795978546, "learning_rate": 0.0001590491422575514, "loss": 0.6064, "step": 193 }, { "epoch": 0.43571027512633353, "grad_norm": 0.38580065965652466, "learning_rate": 0.00015901670338565785, "loss": 0.6119, "step": 194 }, { "epoch": 0.43795620437956206, "grad_norm": 0.5283933877944946, "learning_rate": 0.0001589837238436186, "loss": 0.5945, "step": 195 }, { "epoch": 0.4402021336327906, "grad_norm": 0.48087194561958313, "learning_rate": 0.00015895020385709553, "loss": 0.6058, "step": 196 }, { "epoch": 0.44244806288601907, "grad_norm": 0.35071608424186707, "learning_rate": 0.00015891614365544837, "loss": 0.5672, "step": 197 }, { "epoch": 0.4446939921392476, "grad_norm": 0.3820844888687134, "learning_rate": 0.0001588815434717334, "loss": 0.5898, "step": 198 }, { "epoch": 0.4469399213924761, "grad_norm": 0.3622789680957794, "learning_rate": 0.0001588464035427016, "loss": 0.5842, "step": 199 }, { "epoch": 0.44918585064570465, "grad_norm": 0.348568856716156, "learning_rate": 0.00015881072410879726, "loss": 0.6025, "step": 200 }, { "epoch": 0.4514317798989332, "grad_norm": 0.36718496680259705, "learning_rate": 0.00015877450541415615, "loss": 0.5888, "step": 201 }, { "epoch": 0.4536777091521617, "grad_norm": 0.39695170521736145, "learning_rate": 0.0001587377477066039, "loss": 0.6159, "step": 202 }, { "epoch": 0.45592363840539024, "grad_norm": 0.4380107522010803, "learning_rate": 0.0001587004512376544, "loss": 0.6001, "step": 203 }, { "epoch": 0.45816956765861877, "grad_norm": 0.40494075417518616, "learning_rate": 0.00015866261626250794, "loss": 0.6016, "step": 204 }, { "epoch": 0.4604154969118473, "grad_norm": 0.3275372385978699, "learning_rate": 0.00015862424304004954, "loss": 0.5918, "step": 205 }, { "epoch": 0.4626614261650758, "grad_norm": 0.3288284242153168, "learning_rate": 0.00015858533183284718, "loss": 0.608, "step": 206 }, { "epoch": 0.4649073554183043, "grad_norm": 0.32171040773391724, "learning_rate": 0.00015854588290714999, "loss": 0.5816, "step": 207 }, { "epoch": 0.46715328467153283, "grad_norm": 0.3992040157318115, "learning_rate": 0.00015850589653288642, "loss": 0.591, "step": 208 }, { "epoch": 0.46939921392476136, "grad_norm": 0.38158226013183594, "learning_rate": 0.00015846537298366242, "loss": 0.5831, "step": 209 }, { "epoch": 0.4716451431779899, "grad_norm": 0.32366326451301575, "learning_rate": 0.0001584243125367595, "loss": 0.5822, "step": 210 }, { "epoch": 0.4738910724312184, "grad_norm": 0.41187676787376404, "learning_rate": 0.00015838271547313293, "loss": 0.6027, "step": 211 }, { "epoch": 0.47613700168444695, "grad_norm": 0.48473531007766724, "learning_rate": 0.00015834058207740974, "loss": 0.5819, "step": 212 }, { "epoch": 0.4783829309376755, "grad_norm": 0.3934939205646515, "learning_rate": 0.00015829791263788682, "loss": 0.6042, "step": 213 }, { "epoch": 0.480628860190904, "grad_norm": 0.32344624400138855, "learning_rate": 0.00015825470744652894, "loss": 0.5717, "step": 214 }, { "epoch": 0.48287478944413254, "grad_norm": 0.27189725637435913, "learning_rate": 0.0001582109667989667, "loss": 0.6015, "step": 215 }, { "epoch": 0.485120718697361, "grad_norm": 0.349128395318985, "learning_rate": 0.00015816669099449454, "loss": 0.6037, "step": 216 }, { "epoch": 0.48736664795058954, "grad_norm": 0.3456957936286926, "learning_rate": 0.00015812188033606877, "loss": 0.5974, "step": 217 }, { "epoch": 0.48961257720381807, "grad_norm": 0.29926273226737976, "learning_rate": 0.00015807653513030538, "loss": 0.6, "step": 218 }, { "epoch": 0.4918585064570466, "grad_norm": 0.3260749280452728, "learning_rate": 0.00015803065568747798, "loss": 0.5955, "step": 219 }, { "epoch": 0.4941044357102751, "grad_norm": 0.4071785509586334, "learning_rate": 0.00015798424232151573, "loss": 0.5899, "step": 220 }, { "epoch": 0.49635036496350365, "grad_norm": 0.37568220496177673, "learning_rate": 0.00015793729535000108, "loss": 0.6008, "step": 221 }, { "epoch": 0.4985962942167322, "grad_norm": 0.4158768355846405, "learning_rate": 0.00015788981509416773, "loss": 0.5897, "step": 222 }, { "epoch": 0.5008422234699607, "grad_norm": 0.44514065980911255, "learning_rate": 0.00015784180187889833, "loss": 0.5807, "step": 223 }, { "epoch": 0.5030881527231892, "grad_norm": 0.37475013732910156, "learning_rate": 0.00015779325603272232, "loss": 0.586, "step": 224 }, { "epoch": 0.5053340819764177, "grad_norm": 0.4093579649925232, "learning_rate": 0.0001577441778878136, "loss": 0.5966, "step": 225 }, { "epoch": 0.5075800112296462, "grad_norm": 0.4048860967159271, "learning_rate": 0.00015769456777998842, "loss": 0.6107, "step": 226 }, { "epoch": 0.5098259404828748, "grad_norm": 0.31557515263557434, "learning_rate": 0.00015764442604870285, "loss": 0.609, "step": 227 }, { "epoch": 0.5120718697361033, "grad_norm": 0.33514130115509033, "learning_rate": 0.0001575937530370507, "loss": 0.5866, "step": 228 }, { "epoch": 0.5143177989893318, "grad_norm": 0.3601367771625519, "learning_rate": 0.0001575425490917609, "loss": 0.586, "step": 229 }, { "epoch": 0.5165637282425604, "grad_norm": 0.3701965808868408, "learning_rate": 0.00015749081456319544, "loss": 0.5755, "step": 230 }, { "epoch": 0.5188096574957889, "grad_norm": 0.3042786419391632, "learning_rate": 0.0001574385498053468, "loss": 0.5978, "step": 231 }, { "epoch": 0.5210555867490174, "grad_norm": 0.33692997694015503, "learning_rate": 0.00015738575517583542, "loss": 0.6078, "step": 232 }, { "epoch": 0.523301516002246, "grad_norm": 0.36644524335861206, "learning_rate": 0.00015733243103590748, "loss": 0.575, "step": 233 }, { "epoch": 0.5255474452554745, "grad_norm": 0.3802548050880432, "learning_rate": 0.00015727857775043227, "loss": 0.6041, "step": 234 }, { "epoch": 0.527793374508703, "grad_norm": 0.37767213582992554, "learning_rate": 0.00015722419568789983, "loss": 0.591, "step": 235 }, { "epoch": 0.5300393037619315, "grad_norm": 0.38524535298347473, "learning_rate": 0.00015716928522041825, "loss": 0.601, "step": 236 }, { "epoch": 0.5322852330151601, "grad_norm": 0.4806888699531555, "learning_rate": 0.00015711384672371126, "loss": 0.5935, "step": 237 }, { "epoch": 0.5345311622683886, "grad_norm": 0.3960248827934265, "learning_rate": 0.0001570578805771156, "loss": 0.5789, "step": 238 }, { "epoch": 0.5367770915216171, "grad_norm": 0.29831525683403015, "learning_rate": 0.00015700138716357852, "loss": 0.5917, "step": 239 }, { "epoch": 0.5390230207748455, "grad_norm": 0.30690401792526245, "learning_rate": 0.00015694436686965497, "loss": 0.5819, "step": 240 }, { "epoch": 0.5412689500280741, "grad_norm": 0.30107825994491577, "learning_rate": 0.00015688682008550514, "loss": 0.5965, "step": 241 }, { "epoch": 0.5435148792813026, "grad_norm": 0.30696406960487366, "learning_rate": 0.0001568287472048917, "loss": 0.6025, "step": 242 }, { "epoch": 0.5457608085345311, "grad_norm": 0.32280731201171875, "learning_rate": 0.00015677014862517714, "loss": 0.5868, "step": 243 }, { "epoch": 0.5480067377877597, "grad_norm": 0.31739377975463867, "learning_rate": 0.000156711024747321, "loss": 0.5898, "step": 244 }, { "epoch": 0.5502526670409882, "grad_norm": 0.3620510995388031, "learning_rate": 0.0001566513759758772, "loss": 0.5621, "step": 245 }, { "epoch": 0.5524985962942167, "grad_norm": 0.26646366715431213, "learning_rate": 0.00015659120271899118, "loss": 0.5731, "step": 246 }, { "epoch": 0.5547445255474452, "grad_norm": 0.3814524710178375, "learning_rate": 0.00015653050538839722, "loss": 0.5947, "step": 247 }, { "epoch": 0.5569904548006738, "grad_norm": 0.4031396210193634, "learning_rate": 0.00015646928439941557, "loss": 0.612, "step": 248 }, { "epoch": 0.5592363840539023, "grad_norm": 0.38268253207206726, "learning_rate": 0.00015640754017094954, "loss": 0.5792, "step": 249 }, { "epoch": 0.5614823133071308, "grad_norm": 0.37941139936447144, "learning_rate": 0.0001563452731254827, "loss": 0.6071, "step": 250 }, { "epoch": 0.5637282425603594, "grad_norm": 0.3618276119232178, "learning_rate": 0.00015628248368907603, "loss": 0.5776, "step": 251 }, { "epoch": 0.5659741718135879, "grad_norm": 0.3906313180923462, "learning_rate": 0.000156219172291365, "loss": 0.5732, "step": 252 }, { "epoch": 0.5682201010668164, "grad_norm": 0.4234972894191742, "learning_rate": 0.0001561553393655564, "loss": 0.5674, "step": 253 }, { "epoch": 0.570466030320045, "grad_norm": 0.4400922954082489, "learning_rate": 0.00015609098534842582, "loss": 0.5894, "step": 254 }, { "epoch": 0.5727119595732735, "grad_norm": 0.38799750804901123, "learning_rate": 0.0001560261106803142, "loss": 0.5833, "step": 255 }, { "epoch": 0.574957888826502, "grad_norm": 0.31524044275283813, "learning_rate": 0.00015596071580512515, "loss": 0.5841, "step": 256 }, { "epoch": 0.5772038180797305, "grad_norm": 0.3451038599014282, "learning_rate": 0.00015589480117032174, "loss": 0.6003, "step": 257 }, { "epoch": 0.5794497473329591, "grad_norm": 0.3648560047149658, "learning_rate": 0.00015582836722692346, "loss": 0.5787, "step": 258 }, { "epoch": 0.5816956765861875, "grad_norm": 0.37476226687431335, "learning_rate": 0.00015576141442950317, "loss": 0.5719, "step": 259 }, { "epoch": 0.583941605839416, "grad_norm": 0.33187106251716614, "learning_rate": 0.00015569394323618403, "loss": 0.5785, "step": 260 }, { "epoch": 0.5861875350926445, "grad_norm": 0.36073818802833557, "learning_rate": 0.00015562595410863626, "loss": 0.5965, "step": 261 }, { "epoch": 0.5884334643458731, "grad_norm": 0.3586486577987671, "learning_rate": 0.00015555744751207404, "loss": 0.5857, "step": 262 }, { "epoch": 0.5906793935991016, "grad_norm": 0.44820883870124817, "learning_rate": 0.0001554884239152523, "loss": 0.5804, "step": 263 }, { "epoch": 0.5929253228523301, "grad_norm": 0.43128344416618347, "learning_rate": 0.00015541888379046366, "loss": 0.5613, "step": 264 }, { "epoch": 0.5951712521055587, "grad_norm": 0.38606396317481995, "learning_rate": 0.0001553488276135349, "loss": 0.5958, "step": 265 }, { "epoch": 0.5974171813587872, "grad_norm": 0.36493563652038574, "learning_rate": 0.0001552782558638239, "loss": 0.5663, "step": 266 }, { "epoch": 0.5996631106120157, "grad_norm": 0.40545809268951416, "learning_rate": 0.00015520716902421648, "loss": 0.5934, "step": 267 }, { "epoch": 0.6019090398652442, "grad_norm": 0.42288488149642944, "learning_rate": 0.00015513556758112282, "loss": 0.5729, "step": 268 }, { "epoch": 0.6041549691184728, "grad_norm": 0.2895568311214447, "learning_rate": 0.00015506345202447432, "loss": 0.6046, "step": 269 }, { "epoch": 0.6064008983717013, "grad_norm": 0.3440837562084198, "learning_rate": 0.00015499082284772017, "loss": 0.5654, "step": 270 }, { "epoch": 0.6086468276249298, "grad_norm": 0.36002352833747864, "learning_rate": 0.00015491768054782395, "loss": 0.5923, "step": 271 }, { "epoch": 0.6108927568781584, "grad_norm": 0.28700196743011475, "learning_rate": 0.00015484402562526036, "loss": 0.5826, "step": 272 }, { "epoch": 0.6131386861313869, "grad_norm": 0.32599133253097534, "learning_rate": 0.0001547698585840117, "loss": 0.5783, "step": 273 }, { "epoch": 0.6153846153846154, "grad_norm": 0.37215182185173035, "learning_rate": 0.00015469517993156435, "loss": 0.583, "step": 274 }, { "epoch": 0.617630544637844, "grad_norm": 0.3325370252132416, "learning_rate": 0.0001546199901789055, "loss": 0.582, "step": 275 }, { "epoch": 0.6198764738910725, "grad_norm": 0.3477807939052582, "learning_rate": 0.00015454428984051937, "loss": 0.5726, "step": 276 }, { "epoch": 0.622122403144301, "grad_norm": 0.37678489089012146, "learning_rate": 0.000154468079434384, "loss": 0.5786, "step": 277 }, { "epoch": 0.6243683323975294, "grad_norm": 0.3045758008956909, "learning_rate": 0.00015439135948196756, "loss": 0.5829, "step": 278 }, { "epoch": 0.626614261650758, "grad_norm": 0.3221797049045563, "learning_rate": 0.0001543141305082246, "loss": 0.5811, "step": 279 }, { "epoch": 0.6288601909039865, "grad_norm": 0.35202842950820923, "learning_rate": 0.00015423639304159288, "loss": 0.5655, "step": 280 }, { "epoch": 0.631106120157215, "grad_norm": 0.2838123142719269, "learning_rate": 0.00015415814761398936, "loss": 0.5991, "step": 281 }, { "epoch": 0.6333520494104435, "grad_norm": 0.33944493532180786, "learning_rate": 0.0001540793947608067, "loss": 0.5764, "step": 282 }, { "epoch": 0.6355979786636721, "grad_norm": 0.29667678475379944, "learning_rate": 0.0001540001350209097, "loss": 0.5745, "step": 283 }, { "epoch": 0.6378439079169006, "grad_norm": 0.37716934084892273, "learning_rate": 0.00015392036893663148, "loss": 0.5739, "step": 284 }, { "epoch": 0.6400898371701291, "grad_norm": 0.3955274522304535, "learning_rate": 0.00015384009705376978, "loss": 0.574, "step": 285 }, { "epoch": 0.6423357664233577, "grad_norm": 0.29740408062934875, "learning_rate": 0.00015375931992158331, "loss": 0.567, "step": 286 }, { "epoch": 0.6445816956765862, "grad_norm": 0.3198919892311096, "learning_rate": 0.0001536780380927879, "loss": 0.5672, "step": 287 }, { "epoch": 0.6468276249298147, "grad_norm": 0.3355892598628998, "learning_rate": 0.0001535962521235528, "loss": 0.57, "step": 288 }, { "epoch": 0.6490735541830432, "grad_norm": 0.32803425192832947, "learning_rate": 0.00015351396257349675, "loss": 0.5839, "step": 289 }, { "epoch": 0.6513194834362718, "grad_norm": 0.3538999557495117, "learning_rate": 0.00015343117000568432, "loss": 0.5864, "step": 290 }, { "epoch": 0.6535654126895003, "grad_norm": 0.3156984746456146, "learning_rate": 0.00015334787498662192, "loss": 0.5872, "step": 291 }, { "epoch": 0.6558113419427288, "grad_norm": 0.336056113243103, "learning_rate": 0.00015326407808625395, "loss": 0.578, "step": 292 }, { "epoch": 0.6580572711959574, "grad_norm": 0.3894708454608917, "learning_rate": 0.00015317977987795898, "loss": 0.5682, "step": 293 }, { "epoch": 0.6603032004491859, "grad_norm": 0.3500683605670929, "learning_rate": 0.00015309498093854577, "loss": 0.5934, "step": 294 }, { "epoch": 0.6625491297024144, "grad_norm": 0.331767201423645, "learning_rate": 0.00015300968184824926, "loss": 0.5781, "step": 295 }, { "epoch": 0.664795058955643, "grad_norm": 0.4042721092700958, "learning_rate": 0.0001529238831907267, "loss": 0.5811, "step": 296 }, { "epoch": 0.6670409882088714, "grad_norm": 0.2907451093196869, "learning_rate": 0.00015283758555305362, "loss": 0.5925, "step": 297 }, { "epoch": 0.6692869174620999, "grad_norm": 0.28044381737709045, "learning_rate": 0.0001527507895257198, "loss": 0.5717, "step": 298 }, { "epoch": 0.6715328467153284, "grad_norm": 0.2812747359275818, "learning_rate": 0.00015266349570262528, "loss": 0.5796, "step": 299 }, { "epoch": 0.673778775968557, "grad_norm": 0.28039273619651794, "learning_rate": 0.00015257570468107617, "loss": 0.5682, "step": 300 }, { "epoch": 0.6760247052217855, "grad_norm": 0.2821033000946045, "learning_rate": 0.00015248741706178073, "loss": 0.5939, "step": 301 }, { "epoch": 0.678270634475014, "grad_norm": 0.31085771322250366, "learning_rate": 0.0001523986334488452, "loss": 0.5829, "step": 302 }, { "epoch": 0.6805165637282425, "grad_norm": 0.31658798456192017, "learning_rate": 0.00015230935444976955, "loss": 0.6073, "step": 303 }, { "epoch": 0.6827624929814711, "grad_norm": 0.28057488799095154, "learning_rate": 0.00015221958067544348, "loss": 0.5888, "step": 304 }, { "epoch": 0.6850084222346996, "grad_norm": 0.29499179124832153, "learning_rate": 0.00015212931274014214, "loss": 0.5713, "step": 305 }, { "epoch": 0.6872543514879281, "grad_norm": 0.31696656346321106, "learning_rate": 0.00015203855126152204, "loss": 0.5956, "step": 306 }, { "epoch": 0.6895002807411567, "grad_norm": 0.2905656695365906, "learning_rate": 0.00015194729686061672, "loss": 0.56, "step": 307 }, { "epoch": 0.6917462099943852, "grad_norm": 0.33711618185043335, "learning_rate": 0.00015185555016183246, "loss": 0.5816, "step": 308 }, { "epoch": 0.6939921392476137, "grad_norm": 0.3962436616420746, "learning_rate": 0.00015176331179294416, "loss": 0.5933, "step": 309 }, { "epoch": 0.6962380685008422, "grad_norm": 0.2827875316143036, "learning_rate": 0.00015167058238509093, "loss": 0.5529, "step": 310 }, { "epoch": 0.6984839977540708, "grad_norm": 0.252986878156662, "learning_rate": 0.00015157736257277182, "loss": 0.5915, "step": 311 }, { "epoch": 0.7007299270072993, "grad_norm": 0.28363773226737976, "learning_rate": 0.00015148365299384145, "loss": 0.5621, "step": 312 }, { "epoch": 0.7029758562605278, "grad_norm": 0.26527139544487, "learning_rate": 0.00015138945428950566, "loss": 0.5791, "step": 313 }, { "epoch": 0.7052217855137564, "grad_norm": 0.24393455684185028, "learning_rate": 0.0001512947671043171, "loss": 0.5549, "step": 314 }, { "epoch": 0.7074677147669849, "grad_norm": 0.24904131889343262, "learning_rate": 0.00015119959208617092, "loss": 0.5627, "step": 315 }, { "epoch": 0.7097136440202133, "grad_norm": 0.3018868863582611, "learning_rate": 0.00015110392988630016, "loss": 0.5802, "step": 316 }, { "epoch": 0.7119595732734418, "grad_norm": 0.34517163038253784, "learning_rate": 0.0001510077811592714, "loss": 0.5831, "step": 317 }, { "epoch": 0.7142055025266704, "grad_norm": 0.3295687437057495, "learning_rate": 0.00015091114656298033, "loss": 0.5978, "step": 318 }, { "epoch": 0.7164514317798989, "grad_norm": 0.3116067945957184, "learning_rate": 0.00015081402675864717, "loss": 0.58, "step": 319 }, { "epoch": 0.7186973610331274, "grad_norm": 0.2843012809753418, "learning_rate": 0.00015071642241081212, "loss": 0.5837, "step": 320 }, { "epoch": 0.720943290286356, "grad_norm": 0.27185961604118347, "learning_rate": 0.00015061833418733095, "loss": 0.5746, "step": 321 }, { "epoch": 0.7231892195395845, "grad_norm": 0.26890790462493896, "learning_rate": 0.00015051976275937023, "loss": 0.5642, "step": 322 }, { "epoch": 0.725435148792813, "grad_norm": 0.29379114508628845, "learning_rate": 0.00015042070880140292, "loss": 0.5796, "step": 323 }, { "epoch": 0.7276810780460415, "grad_norm": 0.2906297743320465, "learning_rate": 0.0001503211729912037, "loss": 0.5666, "step": 324 }, { "epoch": 0.7299270072992701, "grad_norm": 0.2815559506416321, "learning_rate": 0.00015022115600984423, "loss": 0.5582, "step": 325 }, { "epoch": 0.7321729365524986, "grad_norm": 0.3286380469799042, "learning_rate": 0.0001501206585416886, "loss": 0.5462, "step": 326 }, { "epoch": 0.7344188658057271, "grad_norm": 0.3522341549396515, "learning_rate": 0.00015001968127438872, "loss": 0.5654, "step": 327 }, { "epoch": 0.7366647950589557, "grad_norm": 0.33905208110809326, "learning_rate": 0.00014991822489887938, "loss": 0.5606, "step": 328 }, { "epoch": 0.7389107243121842, "grad_norm": 0.29921072721481323, "learning_rate": 0.00014981629010937372, "loss": 0.5772, "step": 329 }, { "epoch": 0.7411566535654127, "grad_norm": 0.2822812497615814, "learning_rate": 0.00014971387760335841, "loss": 0.5772, "step": 330 }, { "epoch": 0.7434025828186412, "grad_norm": 0.3244154155254364, "learning_rate": 0.0001496109880815889, "loss": 0.5736, "step": 331 }, { "epoch": 0.7456485120718698, "grad_norm": 0.3305480480194092, "learning_rate": 0.0001495076222480846, "loss": 0.586, "step": 332 }, { "epoch": 0.7478944413250983, "grad_norm": 0.3018239140510559, "learning_rate": 0.00014940378081012407, "loss": 0.579, "step": 333 }, { "epoch": 0.7501403705783268, "grad_norm": 0.3692958652973175, "learning_rate": 0.00014929946447824014, "loss": 0.5767, "step": 334 }, { "epoch": 0.7523862998315554, "grad_norm": 0.3724178373813629, "learning_rate": 0.00014919467396621523, "loss": 0.5721, "step": 335 }, { "epoch": 0.7546322290847838, "grad_norm": 0.3226647973060608, "learning_rate": 0.00014908940999107615, "loss": 0.553, "step": 336 }, { "epoch": 0.7568781583380123, "grad_norm": 0.28518086671829224, "learning_rate": 0.00014898367327308945, "loss": 0.566, "step": 337 }, { "epoch": 0.7591240875912408, "grad_norm": 0.2642190158367157, "learning_rate": 0.0001488774645357565, "loss": 0.5732, "step": 338 }, { "epoch": 0.7613700168444694, "grad_norm": 0.2713199555873871, "learning_rate": 0.0001487707845058083, "loss": 0.5679, "step": 339 }, { "epoch": 0.7636159460976979, "grad_norm": 0.28339532017707825, "learning_rate": 0.00014866363391320076, "loss": 0.5664, "step": 340 }, { "epoch": 0.7658618753509264, "grad_norm": 0.26976078748703003, "learning_rate": 0.0001485560134911096, "loss": 0.5917, "step": 341 }, { "epoch": 0.768107804604155, "grad_norm": 0.31055644154548645, "learning_rate": 0.00014844792397592524, "loss": 0.5609, "step": 342 }, { "epoch": 0.7703537338573835, "grad_norm": 0.28089481592178345, "learning_rate": 0.000148339366107248, "loss": 0.5553, "step": 343 }, { "epoch": 0.772599663110612, "grad_norm": 0.3059735894203186, "learning_rate": 0.00014823034062788282, "loss": 0.5827, "step": 344 }, { "epoch": 0.7748455923638405, "grad_norm": 0.3540654480457306, "learning_rate": 0.00014812084828383425, "loss": 0.5417, "step": 345 }, { "epoch": 0.7770915216170691, "grad_norm": 0.3125968277454376, "learning_rate": 0.0001480108898243014, "loss": 0.5676, "step": 346 }, { "epoch": 0.7793374508702976, "grad_norm": 0.2534315884113312, "learning_rate": 0.0001479004660016727, "loss": 0.5724, "step": 347 }, { "epoch": 0.7815833801235261, "grad_norm": 0.30985814332962036, "learning_rate": 0.0001477895775715209, "loss": 0.5682, "step": 348 }, { "epoch": 0.7838293093767547, "grad_norm": 0.334831178188324, "learning_rate": 0.00014767822529259772, "loss": 0.5653, "step": 349 }, { "epoch": 0.7860752386299832, "grad_norm": 0.29639920592308044, "learning_rate": 0.00014756640992682883, "loss": 0.5959, "step": 350 }, { "epoch": 0.7883211678832117, "grad_norm": 0.33278346061706543, "learning_rate": 0.00014745413223930858, "loss": 0.57, "step": 351 }, { "epoch": 0.7905670971364402, "grad_norm": 0.26434555649757385, "learning_rate": 0.00014734139299829466, "loss": 0.5847, "step": 352 }, { "epoch": 0.7928130263896688, "grad_norm": 0.295564204454422, "learning_rate": 0.00014722819297520296, "loss": 0.5345, "step": 353 }, { "epoch": 0.7950589556428973, "grad_norm": 0.32043787837028503, "learning_rate": 0.00014711453294460235, "loss": 0.5751, "step": 354 }, { "epoch": 0.7973048848961257, "grad_norm": 0.35145339369773865, "learning_rate": 0.00014700041368420914, "loss": 0.5782, "step": 355 }, { "epoch": 0.7995508141493542, "grad_norm": 0.2663813531398773, "learning_rate": 0.00014688583597488204, "loss": 0.5457, "step": 356 }, { "epoch": 0.8017967434025828, "grad_norm": 0.3394940197467804, "learning_rate": 0.00014677080060061662, "loss": 0.5669, "step": 357 }, { "epoch": 0.8040426726558113, "grad_norm": 0.28702473640441895, "learning_rate": 0.00014665530834854002, "loss": 0.5715, "step": 358 }, { "epoch": 0.8062886019090398, "grad_norm": 0.3419654071331024, "learning_rate": 0.0001465393600089056, "loss": 0.5804, "step": 359 }, { "epoch": 0.8085345311622684, "grad_norm": 0.35292762517929077, "learning_rate": 0.00014642295637508742, "loss": 0.5666, "step": 360 }, { "epoch": 0.8107804604154969, "grad_norm": 0.31325843930244446, "learning_rate": 0.00014630609824357494, "loss": 0.5857, "step": 361 }, { "epoch": 0.8130263896687254, "grad_norm": 0.27262774109840393, "learning_rate": 0.00014618878641396748, "loss": 0.5797, "step": 362 }, { "epoch": 0.815272318921954, "grad_norm": 0.2780674397945404, "learning_rate": 0.00014607102168896882, "loss": 0.5552, "step": 363 }, { "epoch": 0.8175182481751825, "grad_norm": 0.2732245922088623, "learning_rate": 0.00014595280487438158, "loss": 0.5716, "step": 364 }, { "epoch": 0.819764177428411, "grad_norm": 0.33612555265426636, "learning_rate": 0.0001458341367791019, "loss": 0.5756, "step": 365 }, { "epoch": 0.8220101066816395, "grad_norm": 0.267904669046402, "learning_rate": 0.0001457150182151137, "loss": 0.5694, "step": 366 }, { "epoch": 0.8242560359348681, "grad_norm": 0.2547987401485443, "learning_rate": 0.0001455954499974833, "loss": 0.5718, "step": 367 }, { "epoch": 0.8265019651880966, "grad_norm": 0.2813619375228882, "learning_rate": 0.00014547543294435376, "loss": 0.5521, "step": 368 }, { "epoch": 0.8287478944413251, "grad_norm": 0.2692398428916931, "learning_rate": 0.0001453549678769392, "loss": 0.5644, "step": 369 }, { "epoch": 0.8309938236945537, "grad_norm": 0.24875199794769287, "learning_rate": 0.0001452340556195194, "loss": 0.5563, "step": 370 }, { "epoch": 0.8332397529477822, "grad_norm": 0.24863849580287933, "learning_rate": 0.00014511269699943392, "loss": 0.5479, "step": 371 }, { "epoch": 0.8354856822010107, "grad_norm": 0.2492000311613083, "learning_rate": 0.00014499089284707658, "loss": 0.5742, "step": 372 }, { "epoch": 0.8377316114542392, "grad_norm": 0.2373623251914978, "learning_rate": 0.0001448686439958898, "loss": 0.5688, "step": 373 }, { "epoch": 0.8399775407074677, "grad_norm": 0.265248566865921, "learning_rate": 0.00014474595128235876, "loss": 0.5616, "step": 374 }, { "epoch": 0.8422234699606962, "grad_norm": 0.2871013879776001, "learning_rate": 0.00014462281554600577, "loss": 0.556, "step": 375 }, { "epoch": 0.8444693992139247, "grad_norm": 0.31418806314468384, "learning_rate": 0.00014449923762938462, "loss": 0.5644, "step": 376 }, { "epoch": 0.8467153284671532, "grad_norm": 0.3332020044326782, "learning_rate": 0.00014437521837807455, "loss": 0.5611, "step": 377 }, { "epoch": 0.8489612577203818, "grad_norm": 0.2672823965549469, "learning_rate": 0.00014425075864067473, "loss": 0.5575, "step": 378 }, { "epoch": 0.8512071869736103, "grad_norm": 0.23632559180259705, "learning_rate": 0.00014412585926879833, "loss": 0.578, "step": 379 }, { "epoch": 0.8534531162268388, "grad_norm": 0.31967830657958984, "learning_rate": 0.00014400052111706668, "loss": 0.5738, "step": 380 }, { "epoch": 0.8556990454800674, "grad_norm": 0.3274000287055969, "learning_rate": 0.0001438747450431035, "loss": 0.5606, "step": 381 }, { "epoch": 0.8579449747332959, "grad_norm": 0.32115650177001953, "learning_rate": 0.00014374853190752892, "loss": 0.601, "step": 382 }, { "epoch": 0.8601909039865244, "grad_norm": 0.3195722997188568, "learning_rate": 0.00014362188257395367, "loss": 0.5794, "step": 383 }, { "epoch": 0.862436833239753, "grad_norm": 0.32217174768447876, "learning_rate": 0.00014349479790897325, "loss": 0.5687, "step": 384 }, { "epoch": 0.8646827624929815, "grad_norm": 0.3338417410850525, "learning_rate": 0.00014336727878216178, "loss": 0.5513, "step": 385 }, { "epoch": 0.86692869174621, "grad_norm": 0.2939014732837677, "learning_rate": 0.00014323932606606624, "loss": 0.5845, "step": 386 }, { "epoch": 0.8691746209994385, "grad_norm": 0.34269392490386963, "learning_rate": 0.00014311094063620036, "loss": 0.5721, "step": 387 }, { "epoch": 0.8714205502526671, "grad_norm": 0.3684992492198944, "learning_rate": 0.00014298212337103888, "loss": 0.5924, "step": 388 }, { "epoch": 0.8736664795058956, "grad_norm": 0.27671441435813904, "learning_rate": 0.0001428528751520112, "loss": 0.5536, "step": 389 }, { "epoch": 0.8759124087591241, "grad_norm": 0.3508271276950836, "learning_rate": 0.0001427231968634955, "loss": 0.5499, "step": 390 }, { "epoch": 0.8781583380123527, "grad_norm": 0.3735405504703522, "learning_rate": 0.00014259308939281292, "loss": 0.5472, "step": 391 }, { "epoch": 0.8804042672655812, "grad_norm": 0.30313754081726074, "learning_rate": 0.00014246255363022095, "loss": 0.5598, "step": 392 }, { "epoch": 0.8826501965188096, "grad_norm": 0.28613924980163574, "learning_rate": 0.00014233159046890792, "loss": 0.5589, "step": 393 }, { "epoch": 0.8848961257720381, "grad_norm": 0.3396552503108978, "learning_rate": 0.00014220020080498648, "loss": 0.5722, "step": 394 }, { "epoch": 0.8871420550252667, "grad_norm": 0.24562208354473114, "learning_rate": 0.00014206838553748773, "loss": 0.5617, "step": 395 }, { "epoch": 0.8893879842784952, "grad_norm": 0.26731806993484497, "learning_rate": 0.00014193614556835482, "loss": 0.5876, "step": 396 }, { "epoch": 0.8916339135317237, "grad_norm": 0.30024391412734985, "learning_rate": 0.00014180348180243706, "loss": 0.5457, "step": 397 }, { "epoch": 0.8938798427849522, "grad_norm": 0.23074807226657867, "learning_rate": 0.0001416703951474834, "loss": 0.5767, "step": 398 }, { "epoch": 0.8961257720381808, "grad_norm": 0.2882399260997772, "learning_rate": 0.00014153688651413662, "loss": 0.548, "step": 399 }, { "epoch": 0.8983717012914093, "grad_norm": 0.30070793628692627, "learning_rate": 0.00014140295681592667, "loss": 0.5483, "step": 400 }, { "epoch": 0.9006176305446378, "grad_norm": 0.261349081993103, "learning_rate": 0.00014126860696926473, "loss": 0.5568, "step": 401 }, { "epoch": 0.9028635597978664, "grad_norm": 0.2514486610889435, "learning_rate": 0.00014113383789343686, "loss": 0.5656, "step": 402 }, { "epoch": 0.9051094890510949, "grad_norm": 0.28470492362976074, "learning_rate": 0.00014099865051059765, "loss": 0.5877, "step": 403 }, { "epoch": 0.9073554183043234, "grad_norm": 0.28581055998802185, "learning_rate": 0.00014086304574576394, "loss": 0.5703, "step": 404 }, { "epoch": 0.909601347557552, "grad_norm": 0.22891870141029358, "learning_rate": 0.00014072702452680848, "loss": 0.5631, "step": 405 }, { "epoch": 0.9118472768107805, "grad_norm": 0.27686670422554016, "learning_rate": 0.00014059058778445363, "loss": 0.542, "step": 406 }, { "epoch": 0.914093206064009, "grad_norm": 0.27244243025779724, "learning_rate": 0.000140453736452265, "loss": 0.5444, "step": 407 }, { "epoch": 0.9163391353172375, "grad_norm": 0.2376582771539688, "learning_rate": 0.00014031647146664494, "loss": 0.5624, "step": 408 }, { "epoch": 0.9185850645704661, "grad_norm": 0.29739177227020264, "learning_rate": 0.00014017879376682627, "loss": 0.5579, "step": 409 }, { "epoch": 0.9208309938236946, "grad_norm": 0.24522463977336884, "learning_rate": 0.00014004070429486575, "loss": 0.5778, "step": 410 }, { "epoch": 0.9230769230769231, "grad_norm": 0.28420501947402954, "learning_rate": 0.00013990220399563775, "loss": 0.582, "step": 411 }, { "epoch": 0.9253228523301515, "grad_norm": 0.3128701150417328, "learning_rate": 0.0001397632938168277, "loss": 0.5597, "step": 412 }, { "epoch": 0.9275687815833801, "grad_norm": 0.2503584921360016, "learning_rate": 0.0001396239747089255, "loss": 0.557, "step": 413 }, { "epoch": 0.9298147108366086, "grad_norm": 0.2346629947423935, "learning_rate": 0.00013948424762521937, "loss": 0.5567, "step": 414 }, { "epoch": 0.9320606400898371, "grad_norm": 0.26330509781837463, "learning_rate": 0.00013934411352178888, "loss": 0.5556, "step": 415 }, { "epoch": 0.9343065693430657, "grad_norm": 0.2683373689651489, "learning_rate": 0.00013920357335749873, "loss": 0.5585, "step": 416 }, { "epoch": 0.9365524985962942, "grad_norm": 0.21568100154399872, "learning_rate": 0.0001390626280939921, "loss": 0.5837, "step": 417 }, { "epoch": 0.9387984278495227, "grad_norm": 0.2531348168849945, "learning_rate": 0.00013892127869568396, "loss": 0.5505, "step": 418 }, { "epoch": 0.9410443571027512, "grad_norm": 0.2542068362236023, "learning_rate": 0.00013877952612975465, "loss": 0.5834, "step": 419 }, { "epoch": 0.9432902863559798, "grad_norm": 0.22806115448474884, "learning_rate": 0.00013863737136614318, "loss": 0.5648, "step": 420 }, { "epoch": 0.9455362156092083, "grad_norm": 0.3068370819091797, "learning_rate": 0.00013849481537754054, "loss": 0.5488, "step": 421 }, { "epoch": 0.9477821448624368, "grad_norm": 0.31988707184791565, "learning_rate": 0.00013835185913938305, "loss": 0.5679, "step": 422 }, { "epoch": 0.9500280741156654, "grad_norm": 0.2480153888463974, "learning_rate": 0.00013820850362984585, "loss": 0.5481, "step": 423 }, { "epoch": 0.9522740033688939, "grad_norm": 0.2835778295993805, "learning_rate": 0.00013806474982983602, "loss": 0.5575, "step": 424 }, { "epoch": 0.9545199326221224, "grad_norm": 0.28375929594039917, "learning_rate": 0.0001379205987229859, "loss": 0.5522, "step": 425 }, { "epoch": 0.956765861875351, "grad_norm": 0.32451605796813965, "learning_rate": 0.00013777605129564649, "loss": 0.5531, "step": 426 }, { "epoch": 0.9590117911285795, "grad_norm": 0.2671431005001068, "learning_rate": 0.00013763110853688053, "loss": 0.5597, "step": 427 }, { "epoch": 0.961257720381808, "grad_norm": 0.31597959995269775, "learning_rate": 0.0001374857714384558, "loss": 0.5668, "step": 428 }, { "epoch": 0.9635036496350365, "grad_norm": 0.3307490050792694, "learning_rate": 0.00013734004099483842, "loss": 0.5412, "step": 429 }, { "epoch": 0.9657495788882651, "grad_norm": 0.2594424784183502, "learning_rate": 0.00013719391820318585, "loss": 0.534, "step": 430 }, { "epoch": 0.9679955081414935, "grad_norm": 0.24068522453308105, "learning_rate": 0.00013704740406334027, "loss": 0.567, "step": 431 }, { "epoch": 0.970241437394722, "grad_norm": 0.27227339148521423, "learning_rate": 0.00013690049957782162, "loss": 0.55, "step": 432 }, { "epoch": 0.9724873666479505, "grad_norm": 0.2331283837556839, "learning_rate": 0.0001367532057518208, "loss": 0.5296, "step": 433 }, { "epoch": 0.9747332959011791, "grad_norm": 0.25519710779190063, "learning_rate": 0.00013660552359319274, "loss": 0.5759, "step": 434 }, { "epoch": 0.9769792251544076, "grad_norm": 0.25783583521842957, "learning_rate": 0.0001364574541124495, "loss": 0.5642, "step": 435 }, { "epoch": 0.9792251544076361, "grad_norm": 0.2404668927192688, "learning_rate": 0.00013630899832275348, "loss": 0.5566, "step": 436 }, { "epoch": 0.9814710836608647, "grad_norm": 0.29142558574676514, "learning_rate": 0.00013616015723991027, "loss": 0.5666, "step": 437 }, { "epoch": 0.9837170129140932, "grad_norm": 0.2782052755355835, "learning_rate": 0.00013601093188236188, "loss": 0.5507, "step": 438 }, { "epoch": 0.9859629421673217, "grad_norm": 0.21326802670955658, "learning_rate": 0.00013586132327117974, "loss": 0.5685, "step": 439 }, { "epoch": 0.9882088714205502, "grad_norm": 0.25155940651893616, "learning_rate": 0.00013571133243005763, "loss": 0.5803, "step": 440 }, { "epoch": 0.9904548006737788, "grad_norm": 0.2218320518732071, "learning_rate": 0.00013556096038530474, "loss": 0.5488, "step": 441 }, { "epoch": 0.9927007299270073, "grad_norm": 0.27737680077552795, "learning_rate": 0.00013541020816583869, "loss": 0.5651, "step": 442 }, { "epoch": 0.9949466591802358, "grad_norm": 0.2509002387523651, "learning_rate": 0.00013525907680317836, "loss": 0.5525, "step": 443 }, { "epoch": 0.9971925884334644, "grad_norm": 0.25884488224983215, "learning_rate": 0.000135107567331437, "loss": 0.567, "step": 444 }, { "epoch": 0.9994385176866929, "grad_norm": 0.2978728711605072, "learning_rate": 0.00013495568078731495, "loss": 0.5405, "step": 445 }, { "epoch": 1.0016844469399213, "grad_norm": 0.31027480959892273, "learning_rate": 0.00013480341821009277, "loss": 0.5251, "step": 446 }, { "epoch": 1.00393037619315, "grad_norm": 0.3249771296977997, "learning_rate": 0.00013465078064162393, "loss": 0.5197, "step": 447 }, { "epoch": 1.0061763054463784, "grad_norm": 0.330244243144989, "learning_rate": 0.00013449776912632784, "loss": 0.5177, "step": 448 }, { "epoch": 1.008422234699607, "grad_norm": 0.31560900807380676, "learning_rate": 0.00013434438471118262, "loss": 0.5108, "step": 449 }, { "epoch": 1.0106681639528354, "grad_norm": 0.32275712490081787, "learning_rate": 0.00013419062844571784, "loss": 0.498, "step": 450 }, { "epoch": 1.012914093206064, "grad_norm": 0.30424079298973083, "learning_rate": 0.0001340365013820077, "loss": 0.5394, "step": 451 }, { "epoch": 1.0151600224592925, "grad_norm": 0.26794448494911194, "learning_rate": 0.00013388200457466326, "loss": 0.4944, "step": 452 }, { "epoch": 1.0174059517125211, "grad_norm": 0.31360936164855957, "learning_rate": 0.00013372713908082578, "loss": 0.5062, "step": 453 }, { "epoch": 1.0196518809657495, "grad_norm": 0.33009976148605347, "learning_rate": 0.00013357190596015919, "loss": 0.5105, "step": 454 }, { "epoch": 1.0218978102189782, "grad_norm": 0.2470821887254715, "learning_rate": 0.00013341630627484286, "loss": 0.5185, "step": 455 }, { "epoch": 1.0241437394722066, "grad_norm": 0.304426908493042, "learning_rate": 0.00013326034108956437, "loss": 0.5292, "step": 456 }, { "epoch": 1.0263896687254352, "grad_norm": 0.3242713510990143, "learning_rate": 0.0001331040114715123, "loss": 0.5214, "step": 457 }, { "epoch": 1.0286355979786637, "grad_norm": 0.31412094831466675, "learning_rate": 0.00013294731849036875, "loss": 0.5106, "step": 458 }, { "epoch": 1.0308815272318923, "grad_norm": 0.27217480540275574, "learning_rate": 0.0001327902632183022, "loss": 0.5344, "step": 459 }, { "epoch": 1.0331274564851207, "grad_norm": 0.2789839208126068, "learning_rate": 0.00013263284672996009, "loss": 0.521, "step": 460 }, { "epoch": 1.0353733857383491, "grad_norm": 0.27859795093536377, "learning_rate": 0.00013247507010246144, "loss": 0.5316, "step": 461 }, { "epoch": 1.0376193149915778, "grad_norm": 0.30018481612205505, "learning_rate": 0.00013231693441538952, "loss": 0.5083, "step": 462 }, { "epoch": 1.0398652442448062, "grad_norm": 0.2683006525039673, "learning_rate": 0.0001321584407507845, "loss": 0.5378, "step": 463 }, { "epoch": 1.0421111734980348, "grad_norm": 0.27185767889022827, "learning_rate": 0.000131999590193136, "loss": 0.5117, "step": 464 }, { "epoch": 1.0443571027512633, "grad_norm": 0.2839741110801697, "learning_rate": 0.0001318403838293756, "loss": 0.5282, "step": 465 }, { "epoch": 1.046603032004492, "grad_norm": 0.2537892460823059, "learning_rate": 0.00013168082274886953, "loss": 0.5096, "step": 466 }, { "epoch": 1.0488489612577203, "grad_norm": 0.2625972032546997, "learning_rate": 0.00013152090804341118, "loss": 0.5188, "step": 467 }, { "epoch": 1.051094890510949, "grad_norm": 0.3052925169467926, "learning_rate": 0.00013136064080721354, "loss": 0.5409, "step": 468 }, { "epoch": 1.0533408197641774, "grad_norm": 0.2866557538509369, "learning_rate": 0.00013120002213690192, "loss": 0.5101, "step": 469 }, { "epoch": 1.055586749017406, "grad_norm": 0.26804205775260925, "learning_rate": 0.00013103905313150617, "loss": 0.5221, "step": 470 }, { "epoch": 1.0578326782706344, "grad_norm": 0.2677738070487976, "learning_rate": 0.00013087773489245334, "loss": 0.5203, "step": 471 }, { "epoch": 1.060078607523863, "grad_norm": 0.273448646068573, "learning_rate": 0.00013071606852356013, "loss": 0.5349, "step": 472 }, { "epoch": 1.0623245367770915, "grad_norm": 0.27046024799346924, "learning_rate": 0.00013055405513102533, "loss": 0.5132, "step": 473 }, { "epoch": 1.0645704660303201, "grad_norm": 0.25829020142555237, "learning_rate": 0.00013039169582342215, "loss": 0.4968, "step": 474 }, { "epoch": 1.0668163952835485, "grad_norm": 0.27012374997138977, "learning_rate": 0.0001302289917116908, "loss": 0.5166, "step": 475 }, { "epoch": 1.0690623245367772, "grad_norm": 0.2819938063621521, "learning_rate": 0.00013006594390913077, "loss": 0.5238, "step": 476 }, { "epoch": 1.0713082537900056, "grad_norm": 0.24958448112010956, "learning_rate": 0.00012990255353139324, "loss": 0.5031, "step": 477 }, { "epoch": 1.073554183043234, "grad_norm": 0.23778881132602692, "learning_rate": 0.0001297388216964735, "loss": 0.5297, "step": 478 }, { "epoch": 1.0758001122964627, "grad_norm": 0.25948163866996765, "learning_rate": 0.00012957474952470313, "loss": 0.5146, "step": 479 }, { "epoch": 1.078046041549691, "grad_norm": 0.22898133099079132, "learning_rate": 0.00012941033813874264, "loss": 0.5137, "step": 480 }, { "epoch": 1.0802919708029197, "grad_norm": 0.2507185637950897, "learning_rate": 0.00012924558866357343, "loss": 0.5241, "step": 481 }, { "epoch": 1.0825379000561481, "grad_norm": 0.2403927892446518, "learning_rate": 0.00012908050222649043, "loss": 0.5036, "step": 482 }, { "epoch": 1.0847838293093768, "grad_norm": 0.23922879993915558, "learning_rate": 0.00012891507995709412, "loss": 0.528, "step": 483 }, { "epoch": 1.0870297585626052, "grad_norm": 0.2286342829465866, "learning_rate": 0.00012874932298728286, "loss": 0.5202, "step": 484 }, { "epoch": 1.0892756878158338, "grad_norm": 0.258478045463562, "learning_rate": 0.00012858323245124538, "loss": 0.5041, "step": 485 }, { "epoch": 1.0915216170690623, "grad_norm": 0.27987441420555115, "learning_rate": 0.0001284168094854526, "loss": 0.5021, "step": 486 }, { "epoch": 1.093767546322291, "grad_norm": 0.22872576117515564, "learning_rate": 0.00012825005522865027, "loss": 0.5243, "step": 487 }, { "epoch": 1.0960134755755193, "grad_norm": 0.22990728914737701, "learning_rate": 0.00012808297082185087, "loss": 0.5186, "step": 488 }, { "epoch": 1.098259404828748, "grad_norm": 0.21057239174842834, "learning_rate": 0.000127915557408326, "loss": 0.5074, "step": 489 }, { "epoch": 1.1005053340819764, "grad_norm": 0.2562633752822876, "learning_rate": 0.00012774781613359841, "loss": 0.5205, "step": 490 }, { "epoch": 1.102751263335205, "grad_norm": 0.23108799755573273, "learning_rate": 0.0001275797481454343, "loss": 0.5289, "step": 491 }, { "epoch": 1.1049971925884334, "grad_norm": 0.2631300389766693, "learning_rate": 0.00012741135459383543, "loss": 0.5198, "step": 492 }, { "epoch": 1.107243121841662, "grad_norm": 0.2443421483039856, "learning_rate": 0.00012724263663103108, "loss": 0.535, "step": 493 }, { "epoch": 1.1094890510948905, "grad_norm": 0.22926633059978485, "learning_rate": 0.00012707359541147043, "loss": 0.4935, "step": 494 }, { "epoch": 1.1117349803481191, "grad_norm": 0.25909942388534546, "learning_rate": 0.00012690423209181452, "loss": 0.4998, "step": 495 }, { "epoch": 1.1139809096013475, "grad_norm": 0.24831925332546234, "learning_rate": 0.0001267345478309283, "loss": 0.5246, "step": 496 }, { "epoch": 1.1162268388545762, "grad_norm": 0.26700034737586975, "learning_rate": 0.00012656454378987282, "loss": 0.5276, "step": 497 }, { "epoch": 1.1184727681078046, "grad_norm": 0.24582357704639435, "learning_rate": 0.00012639422113189713, "loss": 0.5274, "step": 498 }, { "epoch": 1.120718697361033, "grad_norm": 0.2464480996131897, "learning_rate": 0.00012622358102243054, "loss": 0.514, "step": 499 }, { "epoch": 1.1229646266142617, "grad_norm": 0.28942957520484924, "learning_rate": 0.0001260526246290744, "loss": 0.5216, "step": 500 }, { "epoch": 1.12521055586749, "grad_norm": 0.29417484998703003, "learning_rate": 0.00012588135312159427, "loss": 0.5214, "step": 501 }, { "epoch": 1.1274564851207187, "grad_norm": 0.27026209235191345, "learning_rate": 0.00012570976767191188, "loss": 0.5206, "step": 502 }, { "epoch": 1.1297024143739471, "grad_norm": 0.2554686963558197, "learning_rate": 0.0001255378694540971, "loss": 0.5285, "step": 503 }, { "epoch": 1.1319483436271758, "grad_norm": 0.28773826360702515, "learning_rate": 0.00012536565964435986, "loss": 0.4933, "step": 504 }, { "epoch": 1.1341942728804042, "grad_norm": 0.28885528445243835, "learning_rate": 0.00012519313942104224, "loss": 0.5392, "step": 505 }, { "epoch": 1.1364402021336328, "grad_norm": 0.31166213750839233, "learning_rate": 0.00012502030996461023, "loss": 0.5333, "step": 506 }, { "epoch": 1.1386861313868613, "grad_norm": 0.3064601719379425, "learning_rate": 0.00012484717245764585, "loss": 0.5261, "step": 507 }, { "epoch": 1.14093206064009, "grad_norm": 0.3036741018295288, "learning_rate": 0.00012467372808483882, "loss": 0.5309, "step": 508 }, { "epoch": 1.1431779898933183, "grad_norm": 0.2402871996164322, "learning_rate": 0.00012449997803297866, "loss": 0.4906, "step": 509 }, { "epoch": 1.145423919146547, "grad_norm": 0.26572084426879883, "learning_rate": 0.0001243259234909465, "loss": 0.5152, "step": 510 }, { "epoch": 1.1476698483997754, "grad_norm": 0.26166555285453796, "learning_rate": 0.00012415156564970687, "loss": 0.5266, "step": 511 }, { "epoch": 1.149915777653004, "grad_norm": 0.26020121574401855, "learning_rate": 0.0001239769057022997, "loss": 0.5063, "step": 512 }, { "epoch": 1.1521617069062324, "grad_norm": 0.2840318977832794, "learning_rate": 0.00012380194484383201, "loss": 0.5301, "step": 513 }, { "epoch": 1.154407636159461, "grad_norm": 0.2320166826248169, "learning_rate": 0.00012362668427146986, "loss": 0.5074, "step": 514 }, { "epoch": 1.1566535654126895, "grad_norm": 0.26712101697921753, "learning_rate": 0.00012345112518443008, "loss": 0.5247, "step": 515 }, { "epoch": 1.158899494665918, "grad_norm": 0.2772868871688843, "learning_rate": 0.000123275268783972, "loss": 0.5113, "step": 516 }, { "epoch": 1.1611454239191465, "grad_norm": 0.23757833242416382, "learning_rate": 0.00012309911627338943, "loss": 0.5383, "step": 517 }, { "epoch": 1.1633913531723752, "grad_norm": 0.24388740956783295, "learning_rate": 0.00012292266885800221, "loss": 0.5404, "step": 518 }, { "epoch": 1.1656372824256036, "grad_norm": 0.32931777834892273, "learning_rate": 0.00012274592774514812, "loss": 0.5304, "step": 519 }, { "epoch": 1.167883211678832, "grad_norm": 0.2616422176361084, "learning_rate": 0.00012256889414417456, "loss": 0.5111, "step": 520 }, { "epoch": 1.1701291409320607, "grad_norm": 0.20813870429992676, "learning_rate": 0.0001223915692664302, "loss": 0.4817, "step": 521 }, { "epoch": 1.172375070185289, "grad_norm": 0.2631247639656067, "learning_rate": 0.00012221395432525687, "loss": 0.5119, "step": 522 }, { "epoch": 1.1746209994385177, "grad_norm": 0.22986264526844025, "learning_rate": 0.0001220360505359811, "loss": 0.5136, "step": 523 }, { "epoch": 1.1768669286917461, "grad_norm": 0.23806849122047424, "learning_rate": 0.00012185785911590583, "loss": 0.5247, "step": 524 }, { "epoch": 1.1791128579449748, "grad_norm": 0.2917364537715912, "learning_rate": 0.00012167938128430216, "loss": 0.5286, "step": 525 }, { "epoch": 1.1813587871982032, "grad_norm": 0.24546997249126434, "learning_rate": 0.00012150061826240091, "loss": 0.5197, "step": 526 }, { "epoch": 1.1836047164514318, "grad_norm": 0.22644369304180145, "learning_rate": 0.00012132157127338435, "loss": 0.5369, "step": 527 }, { "epoch": 1.1858506457046603, "grad_norm": 0.2547290623188019, "learning_rate": 0.00012114224154237777, "loss": 0.5108, "step": 528 }, { "epoch": 1.188096574957889, "grad_norm": 0.2384437471628189, "learning_rate": 0.00012096263029644112, "loss": 0.528, "step": 529 }, { "epoch": 1.1903425042111173, "grad_norm": 0.2654406726360321, "learning_rate": 0.0001207827387645606, "loss": 0.5179, "step": 530 }, { "epoch": 1.192588433464346, "grad_norm": 0.19757139682769775, "learning_rate": 0.00012060256817764025, "loss": 0.5126, "step": 531 }, { "epoch": 1.1948343627175744, "grad_norm": 0.21663667261600494, "learning_rate": 0.00012042211976849356, "loss": 0.5136, "step": 532 }, { "epoch": 1.197080291970803, "grad_norm": 0.21993404626846313, "learning_rate": 0.00012024139477183504, "loss": 0.5185, "step": 533 }, { "epoch": 1.1993262212240314, "grad_norm": 0.2317759096622467, "learning_rate": 0.00012006039442427167, "loss": 0.5139, "step": 534 }, { "epoch": 1.20157215047726, "grad_norm": 0.21483832597732544, "learning_rate": 0.0001198791199642946, "loss": 0.5231, "step": 535 }, { "epoch": 1.2038180797304885, "grad_norm": 0.2653373181819916, "learning_rate": 0.0001196975726322705, "loss": 0.5177, "step": 536 }, { "epoch": 1.206064008983717, "grad_norm": 0.19980397820472717, "learning_rate": 0.00011951575367043321, "loss": 0.5081, "step": 537 }, { "epoch": 1.2083099382369455, "grad_norm": 0.2335788607597351, "learning_rate": 0.00011933366432287522, "loss": 0.5283, "step": 538 }, { "epoch": 1.210555867490174, "grad_norm": 0.20896217226982117, "learning_rate": 0.00011915130583553906, "loss": 0.5009, "step": 539 }, { "epoch": 1.2128017967434026, "grad_norm": 0.2064492404460907, "learning_rate": 0.00011896867945620891, "loss": 0.5072, "step": 540 }, { "epoch": 1.215047725996631, "grad_norm": 0.22994771599769592, "learning_rate": 0.00011878578643450191, "loss": 0.506, "step": 541 }, { "epoch": 1.2172936552498597, "grad_norm": 0.21593116223812103, "learning_rate": 0.00011860262802185982, "loss": 0.5304, "step": 542 }, { "epoch": 1.219539584503088, "grad_norm": 0.21689918637275696, "learning_rate": 0.0001184192054715402, "loss": 0.5163, "step": 543 }, { "epoch": 1.2217855137563167, "grad_norm": 0.20837046205997467, "learning_rate": 0.00011823552003860805, "loss": 0.5247, "step": 544 }, { "epoch": 1.2240314430095451, "grad_norm": 0.2125036120414734, "learning_rate": 0.00011805157297992715, "loss": 0.5118, "step": 545 }, { "epoch": 1.2262773722627738, "grad_norm": 0.21233297884464264, "learning_rate": 0.00011786736555415134, "loss": 0.5091, "step": 546 }, { "epoch": 1.2285233015160022, "grad_norm": 0.2236490547657013, "learning_rate": 0.00011768289902171612, "loss": 0.5168, "step": 547 }, { "epoch": 1.2307692307692308, "grad_norm": 0.2149861603975296, "learning_rate": 0.00011749817464482995, "loss": 0.5221, "step": 548 }, { "epoch": 1.2330151600224593, "grad_norm": 0.23652967810630798, "learning_rate": 0.00011731319368746545, "loss": 0.5132, "step": 549 }, { "epoch": 1.235261089275688, "grad_norm": 0.2397671788930893, "learning_rate": 0.00011712795741535098, "loss": 0.5085, "step": 550 }, { "epoch": 1.2375070185289163, "grad_norm": 0.1940278857946396, "learning_rate": 0.00011694246709596195, "loss": 0.5429, "step": 551 }, { "epoch": 1.239752947782145, "grad_norm": 0.24372558295726776, "learning_rate": 0.00011675672399851188, "loss": 0.5091, "step": 552 }, { "epoch": 1.2419988770353734, "grad_norm": 0.21898634731769562, "learning_rate": 0.00011657072939394413, "loss": 0.5164, "step": 553 }, { "epoch": 1.2442448062886018, "grad_norm": 0.2210114300251007, "learning_rate": 0.00011638448455492287, "loss": 0.5133, "step": 554 }, { "epoch": 1.2464907355418304, "grad_norm": 0.2156367301940918, "learning_rate": 0.00011619799075582452, "loss": 0.5109, "step": 555 }, { "epoch": 1.248736664795059, "grad_norm": 0.1969204545021057, "learning_rate": 0.00011601124927272906, "loss": 0.5143, "step": 556 }, { "epoch": 1.2509825940482875, "grad_norm": 0.19980621337890625, "learning_rate": 0.00011582426138341111, "loss": 0.5087, "step": 557 }, { "epoch": 1.253228523301516, "grad_norm": 0.2064458578824997, "learning_rate": 0.00011563702836733152, "loss": 0.505, "step": 558 }, { "epoch": 1.2554744525547445, "grad_norm": 0.24166250228881836, "learning_rate": 0.00011544955150562819, "loss": 0.5204, "step": 559 }, { "epoch": 1.2577203818079732, "grad_norm": 0.251028448343277, "learning_rate": 0.0001152618320811077, "loss": 0.5071, "step": 560 }, { "epoch": 1.2599663110612016, "grad_norm": 0.1982237845659256, "learning_rate": 0.0001150738713782363, "loss": 0.5059, "step": 561 }, { "epoch": 1.26221224031443, "grad_norm": 0.26162639260292053, "learning_rate": 0.00011488567068313114, "loss": 0.5172, "step": 562 }, { "epoch": 1.2644581695676587, "grad_norm": 0.2098427712917328, "learning_rate": 0.0001146972312835516, "loss": 0.5135, "step": 563 }, { "epoch": 1.266704098820887, "grad_norm": 0.2430814802646637, "learning_rate": 0.00011450855446889031, "loss": 0.5125, "step": 564 }, { "epoch": 1.2689500280741157, "grad_norm": 0.21262916922569275, "learning_rate": 0.00011431964153016444, "loss": 0.5114, "step": 565 }, { "epoch": 1.2711959573273441, "grad_norm": 0.20545636117458344, "learning_rate": 0.00011413049376000686, "loss": 0.5095, "step": 566 }, { "epoch": 1.2734418865805728, "grad_norm": 0.23621973395347595, "learning_rate": 0.00011394111245265724, "loss": 0.5231, "step": 567 }, { "epoch": 1.2756878158338012, "grad_norm": 0.21574462950229645, "learning_rate": 0.00011375149890395321, "loss": 0.5292, "step": 568 }, { "epoch": 1.2779337450870298, "grad_norm": 0.22070422768592834, "learning_rate": 0.00011356165441132152, "loss": 0.5157, "step": 569 }, { "epoch": 1.2801796743402583, "grad_norm": 0.19420836865901947, "learning_rate": 0.00011337158027376918, "loss": 0.5179, "step": 570 }, { "epoch": 1.2824256035934867, "grad_norm": 0.26924458146095276, "learning_rate": 0.0001131812777918745, "loss": 0.5408, "step": 571 }, { "epoch": 1.2846715328467153, "grad_norm": 0.22928448021411896, "learning_rate": 0.00011299074826777824, "loss": 0.5146, "step": 572 }, { "epoch": 1.286917462099944, "grad_norm": 0.24480290710926056, "learning_rate": 0.00011279999300517471, "loss": 0.5151, "step": 573 }, { "epoch": 1.2891633913531724, "grad_norm": 0.2365870326757431, "learning_rate": 0.0001126090133093028, "loss": 0.5088, "step": 574 }, { "epoch": 1.2914093206064008, "grad_norm": 0.2634016275405884, "learning_rate": 0.0001124178104869371, "loss": 0.519, "step": 575 }, { "epoch": 1.2936552498596294, "grad_norm": 0.275654673576355, "learning_rate": 0.00011222638584637897, "loss": 0.5276, "step": 576 }, { "epoch": 1.295901179112858, "grad_norm": 0.2414851039648056, "learning_rate": 0.00011203474069744747, "loss": 0.4996, "step": 577 }, { "epoch": 1.2981471083660865, "grad_norm": 0.23619700968265533, "learning_rate": 0.00011184287635147058, "loss": 0.5116, "step": 578 }, { "epoch": 1.300393037619315, "grad_norm": 0.25254112482070923, "learning_rate": 0.00011165079412127607, "loss": 0.5133, "step": 579 }, { "epoch": 1.3026389668725435, "grad_norm": 0.21320711076259613, "learning_rate": 0.00011145849532118258, "loss": 0.5049, "step": 580 }, { "epoch": 1.304884896125772, "grad_norm": 0.24191851913928986, "learning_rate": 0.00011126598126699068, "loss": 0.5226, "step": 581 }, { "epoch": 1.3071308253790006, "grad_norm": 0.20547953248023987, "learning_rate": 0.00011107325327597372, "loss": 0.5196, "step": 582 }, { "epoch": 1.309376754632229, "grad_norm": 0.2211044281721115, "learning_rate": 0.00011088031266686902, "loss": 0.5135, "step": 583 }, { "epoch": 1.3116226838854577, "grad_norm": 0.226315438747406, "learning_rate": 0.00011068716075986863, "loss": 0.5155, "step": 584 }, { "epoch": 1.313868613138686, "grad_norm": 0.1992364525794983, "learning_rate": 0.00011049379887661044, "loss": 0.5135, "step": 585 }, { "epoch": 1.3161145423919147, "grad_norm": 0.20736606419086456, "learning_rate": 0.00011030022834016916, "loss": 0.5107, "step": 586 }, { "epoch": 1.3183604716451431, "grad_norm": 0.20780953764915466, "learning_rate": 0.00011010645047504712, "loss": 0.5072, "step": 587 }, { "epoch": 1.3206064008983718, "grad_norm": 0.20156902074813843, "learning_rate": 0.0001099124666071653, "loss": 0.5037, "step": 588 }, { "epoch": 1.3228523301516002, "grad_norm": 0.18280163407325745, "learning_rate": 0.00010971827806385431, "loss": 0.5308, "step": 589 }, { "epoch": 1.3250982594048288, "grad_norm": 0.20286300778388977, "learning_rate": 0.00010952388617384519, "loss": 0.5239, "step": 590 }, { "epoch": 1.3273441886580573, "grad_norm": 0.20476078987121582, "learning_rate": 0.00010932929226726041, "loss": 0.5339, "step": 591 }, { "epoch": 1.3295901179112857, "grad_norm": 0.19983462989330292, "learning_rate": 0.00010913449767560468, "loss": 0.5166, "step": 592 }, { "epoch": 1.3318360471645143, "grad_norm": 0.22195865213871002, "learning_rate": 0.00010893950373175597, "loss": 0.514, "step": 593 }, { "epoch": 1.334081976417743, "grad_norm": 0.20715545117855072, "learning_rate": 0.00010874431176995627, "loss": 0.5296, "step": 594 }, { "epoch": 1.3363279056709714, "grad_norm": 0.21173766255378723, "learning_rate": 0.00010854892312580249, "loss": 0.4918, "step": 595 }, { "epoch": 1.3385738349241998, "grad_norm": 0.2034001350402832, "learning_rate": 0.0001083533391362374, "loss": 0.5176, "step": 596 }, { "epoch": 1.3408197641774284, "grad_norm": 0.23540934920310974, "learning_rate": 0.00010815756113954031, "loss": 0.5145, "step": 597 }, { "epoch": 1.343065693430657, "grad_norm": 0.19440345466136932, "learning_rate": 0.00010796159047531811, "loss": 0.5167, "step": 598 }, { "epoch": 1.3453116226838855, "grad_norm": 0.2172805666923523, "learning_rate": 0.00010776542848449602, "loss": 0.5235, "step": 599 }, { "epoch": 1.347557551937114, "grad_norm": 0.19153092801570892, "learning_rate": 0.00010756907650930831, "loss": 0.4961, "step": 600 }, { "epoch": 1.3498034811903425, "grad_norm": 0.2150796353816986, "learning_rate": 0.00010737253589328933, "loss": 0.5154, "step": 601 }, { "epoch": 1.352049410443571, "grad_norm": 0.21939396858215332, "learning_rate": 0.0001071758079812641, "loss": 0.5387, "step": 602 }, { "epoch": 1.3542953396967996, "grad_norm": 0.20470492541790009, "learning_rate": 0.00010697889411933928, "loss": 0.4978, "step": 603 }, { "epoch": 1.356541268950028, "grad_norm": 0.21058504283428192, "learning_rate": 0.00010678179565489388, "loss": 0.5096, "step": 604 }, { "epoch": 1.3587871982032567, "grad_norm": 0.1950283795595169, "learning_rate": 0.00010658451393656999, "loss": 0.5089, "step": 605 }, { "epoch": 1.361033127456485, "grad_norm": 0.21830430626869202, "learning_rate": 0.00010638705031426371, "loss": 0.4892, "step": 606 }, { "epoch": 1.3632790567097137, "grad_norm": 0.19007915258407593, "learning_rate": 0.00010618940613911576, "loss": 0.5309, "step": 607 }, { "epoch": 1.3655249859629421, "grad_norm": 0.20983009040355682, "learning_rate": 0.0001059915827635022, "loss": 0.5171, "step": 608 }, { "epoch": 1.3677709152161706, "grad_norm": 0.20747217535972595, "learning_rate": 0.00010579358154102548, "loss": 0.4915, "step": 609 }, { "epoch": 1.3700168444693992, "grad_norm": 0.20381350815296173, "learning_rate": 0.00010559540382650474, "loss": 0.503, "step": 610 }, { "epoch": 1.3722627737226278, "grad_norm": 0.2014596313238144, "learning_rate": 0.00010539705097596689, "loss": 0.5124, "step": 611 }, { "epoch": 1.3745087029758563, "grad_norm": 0.2117050141096115, "learning_rate": 0.00010519852434663721, "loss": 0.4996, "step": 612 }, { "epoch": 1.3767546322290847, "grad_norm": 0.21098558604717255, "learning_rate": 0.00010499982529692996, "loss": 0.492, "step": 613 }, { "epoch": 1.3790005614823133, "grad_norm": 0.22107858955860138, "learning_rate": 0.00010480095518643929, "loss": 0.5165, "step": 614 }, { "epoch": 1.381246490735542, "grad_norm": 0.22238287329673767, "learning_rate": 0.00010460191537592977, "loss": 0.5095, "step": 615 }, { "epoch": 1.3834924199887704, "grad_norm": 0.20342691242694855, "learning_rate": 0.00010440270722732714, "loss": 0.5141, "step": 616 }, { "epoch": 1.3857383492419988, "grad_norm": 0.22299018502235413, "learning_rate": 0.00010420333210370903, "loss": 0.5133, "step": 617 }, { "epoch": 1.3879842784952274, "grad_norm": 0.20717273652553558, "learning_rate": 0.00010400379136929557, "loss": 0.5143, "step": 618 }, { "epoch": 1.3902302077484558, "grad_norm": 0.20377473533153534, "learning_rate": 0.00010380408638944007, "loss": 0.4835, "step": 619 }, { "epoch": 1.3924761370016845, "grad_norm": 0.22891288995742798, "learning_rate": 0.00010360421853061966, "loss": 0.5122, "step": 620 }, { "epoch": 1.394722066254913, "grad_norm": 0.19375132024288177, "learning_rate": 0.00010340418916042603, "loss": 0.5052, "step": 621 }, { "epoch": 1.3969679955081415, "grad_norm": 0.191814586520195, "learning_rate": 0.00010320399964755596, "loss": 0.4988, "step": 622 }, { "epoch": 1.39921392476137, "grad_norm": 0.1985396444797516, "learning_rate": 0.00010300365136180201, "loss": 0.5049, "step": 623 }, { "epoch": 1.4014598540145986, "grad_norm": 0.18780378997325897, "learning_rate": 0.0001028031456740432, "loss": 0.5002, "step": 624 }, { "epoch": 1.403705783267827, "grad_norm": 0.21660645306110382, "learning_rate": 0.00010260248395623548, "loss": 0.5184, "step": 625 }, { "epoch": 1.4059517125210557, "grad_norm": 0.19068920612335205, "learning_rate": 0.00010240166758140245, "loss": 0.5032, "step": 626 }, { "epoch": 1.408197641774284, "grad_norm": 0.2113179713487625, "learning_rate": 0.00010220069792362601, "loss": 0.5152, "step": 627 }, { "epoch": 1.4104435710275127, "grad_norm": 0.18784399330615997, "learning_rate": 0.00010199957635803684, "loss": 0.5261, "step": 628 }, { "epoch": 1.4126895002807411, "grad_norm": 0.1969737708568573, "learning_rate": 0.00010179830426080504, "loss": 0.5152, "step": 629 }, { "epoch": 1.4149354295339696, "grad_norm": 0.18799488246440887, "learning_rate": 0.00010159688300913076, "loss": 0.5111, "step": 630 }, { "epoch": 1.4171813587871982, "grad_norm": 0.18792767822742462, "learning_rate": 0.0001013953139812347, "loss": 0.5092, "step": 631 }, { "epoch": 1.4194272880404268, "grad_norm": 0.21675904095172882, "learning_rate": 0.00010119359855634876, "loss": 0.5076, "step": 632 }, { "epoch": 1.4216732172936553, "grad_norm": 0.19109146296977997, "learning_rate": 0.00010099173811470652, "loss": 0.507, "step": 633 }, { "epoch": 1.4239191465468837, "grad_norm": 0.1930873841047287, "learning_rate": 0.00010078973403753383, "loss": 0.5195, "step": 634 }, { "epoch": 1.4261650758001123, "grad_norm": 0.18737006187438965, "learning_rate": 0.00010058758770703938, "loss": 0.5233, "step": 635 }, { "epoch": 1.428411005053341, "grad_norm": 0.1958773285150528, "learning_rate": 0.00010038530050640522, "loss": 0.5031, "step": 636 }, { "epoch": 1.4306569343065694, "grad_norm": 0.18015055358409882, "learning_rate": 0.00010018287381977732, "loss": 0.5138, "step": 637 }, { "epoch": 1.4329028635597978, "grad_norm": 0.18713940680027008, "learning_rate": 9.998030903225603e-05, "loss": 0.5084, "step": 638 }, { "epoch": 1.4351487928130264, "grad_norm": 0.20459598302841187, "learning_rate": 9.977760752988671e-05, "loss": 0.5409, "step": 639 }, { "epoch": 1.4373947220662548, "grad_norm": 0.17716822028160095, "learning_rate": 9.957477069965018e-05, "loss": 0.509, "step": 640 }, { "epoch": 1.4396406513194835, "grad_norm": 0.1981070339679718, "learning_rate": 9.93717999294532e-05, "loss": 0.4953, "step": 641 }, { "epoch": 1.441886580572712, "grad_norm": 0.19121180474758148, "learning_rate": 9.916869660811906e-05, "loss": 0.5109, "step": 642 }, { "epoch": 1.4441325098259405, "grad_norm": 0.20929452776908875, "learning_rate": 9.896546212537793e-05, "loss": 0.517, "step": 643 }, { "epoch": 1.446378439079169, "grad_norm": 0.19593368470668793, "learning_rate": 9.87620978718576e-05, "loss": 0.5071, "step": 644 }, { "epoch": 1.4486243683323976, "grad_norm": 0.21035808324813843, "learning_rate": 9.855860523907372e-05, "loss": 0.5198, "step": 645 }, { "epoch": 1.450870297585626, "grad_norm": 0.19853971898555756, "learning_rate": 9.835498561942036e-05, "loss": 0.5437, "step": 646 }, { "epoch": 1.4531162268388544, "grad_norm": 0.1949443370103836, "learning_rate": 9.815124040616056e-05, "loss": 0.5076, "step": 647 }, { "epoch": 1.455362156092083, "grad_norm": 0.20280544459819794, "learning_rate": 9.794737099341664e-05, "loss": 0.5093, "step": 648 }, { "epoch": 1.4576080853453117, "grad_norm": 0.21078361570835114, "learning_rate": 9.774337877616083e-05, "loss": 0.5081, "step": 649 }, { "epoch": 1.4598540145985401, "grad_norm": 0.1961338371038437, "learning_rate": 9.753926515020567e-05, "loss": 0.5096, "step": 650 }, { "epoch": 1.4620999438517686, "grad_norm": 0.19009891152381897, "learning_rate": 9.733503151219433e-05, "loss": 0.4999, "step": 651 }, { "epoch": 1.4643458731049972, "grad_norm": 0.18627040088176727, "learning_rate": 9.713067925959126e-05, "loss": 0.5056, "step": 652 }, { "epoch": 1.4665918023582258, "grad_norm": 0.1938895285129547, "learning_rate": 9.692620979067245e-05, "loss": 0.5137, "step": 653 }, { "epoch": 1.4688377316114543, "grad_norm": 0.2050761729478836, "learning_rate": 9.672162450451602e-05, "loss": 0.5051, "step": 654 }, { "epoch": 1.4710836608646827, "grad_norm": 0.19880592823028564, "learning_rate": 9.651692480099251e-05, "loss": 0.5055, "step": 655 }, { "epoch": 1.4733295901179113, "grad_norm": 0.18447960913181305, "learning_rate": 9.631211208075534e-05, "loss": 0.5296, "step": 656 }, { "epoch": 1.4755755193711397, "grad_norm": 0.19004195928573608, "learning_rate": 9.610718774523137e-05, "loss": 0.5258, "step": 657 }, { "epoch": 1.4778214486243684, "grad_norm": 0.19954320788383484, "learning_rate": 9.590215319661097e-05, "loss": 0.5011, "step": 658 }, { "epoch": 1.4800673778775968, "grad_norm": 0.17005719244480133, "learning_rate": 9.569700983783885e-05, "loss": 0.5062, "step": 659 }, { "epoch": 1.4823133071308254, "grad_norm": 0.21068550646305084, "learning_rate": 9.549175907260415e-05, "loss": 0.5044, "step": 660 }, { "epoch": 1.4845592363840538, "grad_norm": 0.18736523389816284, "learning_rate": 9.528640230533093e-05, "loss": 0.521, "step": 661 }, { "epoch": 1.4868051656372825, "grad_norm": 0.19477304816246033, "learning_rate": 9.508094094116863e-05, "loss": 0.5065, "step": 662 }, { "epoch": 1.489051094890511, "grad_norm": 0.20427975058555603, "learning_rate": 9.48753763859823e-05, "loss": 0.5208, "step": 663 }, { "epoch": 1.4912970241437395, "grad_norm": 0.20408067107200623, "learning_rate": 9.466971004634316e-05, "loss": 0.4917, "step": 664 }, { "epoch": 1.493542953396968, "grad_norm": 0.22063596546649933, "learning_rate": 9.446394332951885e-05, "loss": 0.5097, "step": 665 }, { "epoch": 1.4957888826501966, "grad_norm": 0.20878678560256958, "learning_rate": 9.425807764346383e-05, "loss": 0.505, "step": 666 }, { "epoch": 1.498034811903425, "grad_norm": 0.21228721737861633, "learning_rate": 9.405211439680975e-05, "loss": 0.5249, "step": 667 }, { "epoch": 1.5002807411566534, "grad_norm": 0.21478019654750824, "learning_rate": 9.384605499885586e-05, "loss": 0.516, "step": 668 }, { "epoch": 1.502526670409882, "grad_norm": 0.23727190494537354, "learning_rate": 9.363990085955929e-05, "loss": 0.5128, "step": 669 }, { "epoch": 1.5047725996631107, "grad_norm": 0.211452454328537, "learning_rate": 9.343365338952544e-05, "loss": 0.5141, "step": 670 }, { "epoch": 1.5070185289163391, "grad_norm": 0.24813149869441986, "learning_rate": 9.322731399999829e-05, "loss": 0.5286, "step": 671 }, { "epoch": 1.5092644581695676, "grad_norm": 0.19929581880569458, "learning_rate": 9.302088410285084e-05, "loss": 0.5065, "step": 672 }, { "epoch": 1.5115103874227962, "grad_norm": 0.23539748787879944, "learning_rate": 9.281436511057538e-05, "loss": 0.5045, "step": 673 }, { "epoch": 1.5137563166760248, "grad_norm": 0.18617475032806396, "learning_rate": 9.260775843627378e-05, "loss": 0.4943, "step": 674 }, { "epoch": 1.5160022459292533, "grad_norm": 0.22366289794445038, "learning_rate": 9.24010654936479e-05, "loss": 0.5136, "step": 675 }, { "epoch": 1.5182481751824817, "grad_norm": 0.21610277891159058, "learning_rate": 9.219428769698991e-05, "loss": 0.4968, "step": 676 }, { "epoch": 1.5204941044357103, "grad_norm": 0.19368857145309448, "learning_rate": 9.198742646117254e-05, "loss": 0.5129, "step": 677 }, { "epoch": 1.522740033688939, "grad_norm": 0.20865383744239807, "learning_rate": 9.178048320163954e-05, "loss": 0.5136, "step": 678 }, { "epoch": 1.5249859629421674, "grad_norm": 0.18743731081485748, "learning_rate": 9.15734593343958e-05, "loss": 0.5149, "step": 679 }, { "epoch": 1.5272318921953958, "grad_norm": 0.22473086416721344, "learning_rate": 9.136635627599783e-05, "loss": 0.5155, "step": 680 }, { "epoch": 1.5294778214486242, "grad_norm": 0.1838371306657791, "learning_rate": 9.115917544354398e-05, "loss": 0.5102, "step": 681 }, { "epoch": 1.5317237507018528, "grad_norm": 0.19203968346118927, "learning_rate": 9.095191825466481e-05, "loss": 0.5225, "step": 682 }, { "epoch": 1.5339696799550815, "grad_norm": 0.21374920010566711, "learning_rate": 9.074458612751329e-05, "loss": 0.5165, "step": 683 }, { "epoch": 1.53621560920831, "grad_norm": 0.19073887169361115, "learning_rate": 9.053718048075516e-05, "loss": 0.5082, "step": 684 }, { "epoch": 1.5384615384615383, "grad_norm": 0.21084338426589966, "learning_rate": 9.032970273355926e-05, "loss": 0.4975, "step": 685 }, { "epoch": 1.540707467714767, "grad_norm": 0.20061564445495605, "learning_rate": 9.012215430558776e-05, "loss": 0.5048, "step": 686 }, { "epoch": 1.5429533969679956, "grad_norm": 0.17530708014965057, "learning_rate": 8.991453661698641e-05, "loss": 0.51, "step": 687 }, { "epoch": 1.545199326221224, "grad_norm": 0.2152005285024643, "learning_rate": 8.970685108837497e-05, "loss": 0.5224, "step": 688 }, { "epoch": 1.5474452554744524, "grad_norm": 0.1882491558790207, "learning_rate": 8.949909914083732e-05, "loss": 0.5271, "step": 689 }, { "epoch": 1.549691184727681, "grad_norm": 0.21567484736442566, "learning_rate": 8.92912821959118e-05, "loss": 0.5156, "step": 690 }, { "epoch": 1.5519371139809097, "grad_norm": 0.19783969223499298, "learning_rate": 8.908340167558154e-05, "loss": 0.4966, "step": 691 }, { "epoch": 1.5541830432341381, "grad_norm": 0.20946729183197021, "learning_rate": 8.88754590022647e-05, "loss": 0.4923, "step": 692 }, { "epoch": 1.5564289724873666, "grad_norm": 0.19118967652320862, "learning_rate": 8.866745559880464e-05, "loss": 0.5136, "step": 693 }, { "epoch": 1.5586749017405952, "grad_norm": 0.2122071534395218, "learning_rate": 8.845939288846032e-05, "loss": 0.5155, "step": 694 }, { "epoch": 1.5609208309938238, "grad_norm": 0.1733548641204834, "learning_rate": 8.825127229489653e-05, "loss": 0.4971, "step": 695 }, { "epoch": 1.5631667602470523, "grad_norm": 0.2194015234708786, "learning_rate": 8.804309524217408e-05, "loss": 0.4942, "step": 696 }, { "epoch": 1.5654126895002807, "grad_norm": 0.1795753836631775, "learning_rate": 8.783486315474008e-05, "loss": 0.5032, "step": 697 }, { "epoch": 1.5676586187535093, "grad_norm": 0.21514686942100525, "learning_rate": 8.762657745741831e-05, "loss": 0.5036, "step": 698 }, { "epoch": 1.5699045480067377, "grad_norm": 0.20286062359809875, "learning_rate": 8.741823957539926e-05, "loss": 0.5097, "step": 699 }, { "epoch": 1.5721504772599664, "grad_norm": 0.19607621431350708, "learning_rate": 8.720985093423053e-05, "loss": 0.498, "step": 700 }, { "epoch": 1.5743964065131948, "grad_norm": 0.23368516564369202, "learning_rate": 8.700141295980711e-05, "loss": 0.529, "step": 701 }, { "epoch": 1.5766423357664232, "grad_norm": 0.21203581988811493, "learning_rate": 8.679292707836149e-05, "loss": 0.4959, "step": 702 }, { "epoch": 1.5788882650196518, "grad_norm": 0.26587924361228943, "learning_rate": 8.658439471645391e-05, "loss": 0.5201, "step": 703 }, { "epoch": 1.5811341942728805, "grad_norm": 0.1834084540605545, "learning_rate": 8.637581730096275e-05, "loss": 0.504, "step": 704 }, { "epoch": 1.583380123526109, "grad_norm": 0.24840541183948517, "learning_rate": 8.616719625907463e-05, "loss": 0.5149, "step": 705 }, { "epoch": 1.5856260527793373, "grad_norm": 0.18650217354297638, "learning_rate": 8.595853301827469e-05, "loss": 0.4866, "step": 706 }, { "epoch": 1.587871982032566, "grad_norm": 0.21472761034965515, "learning_rate": 8.574982900633676e-05, "loss": 0.513, "step": 707 }, { "epoch": 1.5901179112857946, "grad_norm": 0.20243674516677856, "learning_rate": 8.554108565131373e-05, "loss": 0.5073, "step": 708 }, { "epoch": 1.592363840539023, "grad_norm": 0.18156473338603973, "learning_rate": 8.533230438152765e-05, "loss": 0.5117, "step": 709 }, { "epoch": 1.5946097697922514, "grad_norm": 0.18785932660102844, "learning_rate": 8.512348662555996e-05, "loss": 0.5184, "step": 710 }, { "epoch": 1.59685569904548, "grad_norm": 0.19026771187782288, "learning_rate": 8.49146338122419e-05, "loss": 0.493, "step": 711 }, { "epoch": 1.5991016282987087, "grad_norm": 0.1765296906232834, "learning_rate": 8.47057473706444e-05, "loss": 0.4921, "step": 712 }, { "epoch": 1.6013475575519371, "grad_norm": 0.18513350188732147, "learning_rate": 8.449682873006862e-05, "loss": 0.5043, "step": 713 }, { "epoch": 1.6035934868051656, "grad_norm": 0.1919069141149521, "learning_rate": 8.4287879320036e-05, "loss": 0.4893, "step": 714 }, { "epoch": 1.6058394160583942, "grad_norm": 0.18348896503448486, "learning_rate": 8.40789005702785e-05, "loss": 0.5287, "step": 715 }, { "epoch": 1.6080853453116228, "grad_norm": 0.19792461395263672, "learning_rate": 8.386989391072892e-05, "loss": 0.518, "step": 716 }, { "epoch": 1.6103312745648513, "grad_norm": 0.2027343064546585, "learning_rate": 8.366086077151091e-05, "loss": 0.5109, "step": 717 }, { "epoch": 1.6125772038180797, "grad_norm": 0.2016996443271637, "learning_rate": 8.34518025829294e-05, "loss": 0.5169, "step": 718 }, { "epoch": 1.614823133071308, "grad_norm": 0.20013925433158875, "learning_rate": 8.324272077546064e-05, "loss": 0.4997, "step": 719 }, { "epoch": 1.6170690623245367, "grad_norm": 0.18940746784210205, "learning_rate": 8.30336167797426e-05, "loss": 0.4962, "step": 720 }, { "epoch": 1.6193149915777654, "grad_norm": 0.20259737968444824, "learning_rate": 8.282449202656496e-05, "loss": 0.524, "step": 721 }, { "epoch": 1.6215609208309938, "grad_norm": 0.22202381491661072, "learning_rate": 8.261534794685952e-05, "loss": 0.4966, "step": 722 }, { "epoch": 1.6238068500842222, "grad_norm": 0.19881968200206757, "learning_rate": 8.240618597169029e-05, "loss": 0.5065, "step": 723 }, { "epoch": 1.6260527793374508, "grad_norm": 0.1963961273431778, "learning_rate": 8.219700753224371e-05, "loss": 0.5027, "step": 724 }, { "epoch": 1.6282987085906795, "grad_norm": 0.20289023220539093, "learning_rate": 8.198781405981888e-05, "loss": 0.5123, "step": 725 }, { "epoch": 1.630544637843908, "grad_norm": 0.20166555047035217, "learning_rate": 8.177860698581778e-05, "loss": 0.4844, "step": 726 }, { "epoch": 1.6327905670971363, "grad_norm": 0.21527273952960968, "learning_rate": 8.156938774173548e-05, "loss": 0.4884, "step": 727 }, { "epoch": 1.635036496350365, "grad_norm": 0.19657008349895477, "learning_rate": 8.136015775915025e-05, "loss": 0.5046, "step": 728 }, { "epoch": 1.6372824256035936, "grad_norm": 0.1984531283378601, "learning_rate": 8.11509184697139e-05, "loss": 0.5075, "step": 729 }, { "epoch": 1.639528354856822, "grad_norm": 0.18290367722511292, "learning_rate": 8.094167130514195e-05, "loss": 0.5094, "step": 730 }, { "epoch": 1.6417742841100504, "grad_norm": 0.18201418220996857, "learning_rate": 8.073241769720371e-05, "loss": 0.4916, "step": 731 }, { "epoch": 1.644020213363279, "grad_norm": 0.17987395823001862, "learning_rate": 8.052315907771262e-05, "loss": 0.5107, "step": 732 }, { "epoch": 1.6462661426165077, "grad_norm": 0.17415151000022888, "learning_rate": 8.031389687851647e-05, "loss": 0.4787, "step": 733 }, { "epoch": 1.6485120718697361, "grad_norm": 0.18529638648033142, "learning_rate": 8.010463253148746e-05, "loss": 0.4942, "step": 734 }, { "epoch": 1.6507580011229646, "grad_norm": 0.18021097779273987, "learning_rate": 7.989536746851255e-05, "loss": 0.5244, "step": 735 }, { "epoch": 1.6530039303761932, "grad_norm": 0.18884895741939545, "learning_rate": 7.968610312148354e-05, "loss": 0.5067, "step": 736 }, { "epoch": 1.6552498596294218, "grad_norm": 0.17446008324623108, "learning_rate": 7.94768409222874e-05, "loss": 0.4919, "step": 737 }, { "epoch": 1.6574957888826503, "grad_norm": 0.16754934191703796, "learning_rate": 7.926758230279634e-05, "loss": 0.504, "step": 738 }, { "epoch": 1.6597417181358787, "grad_norm": 0.17202447354793549, "learning_rate": 7.905832869485808e-05, "loss": 0.5118, "step": 739 }, { "epoch": 1.661987647389107, "grad_norm": 0.17612679302692413, "learning_rate": 7.88490815302861e-05, "loss": 0.4997, "step": 740 }, { "epoch": 1.6642335766423357, "grad_norm": 0.1580231636762619, "learning_rate": 7.863984224084977e-05, "loss": 0.477, "step": 741 }, { "epoch": 1.6664795058955644, "grad_norm": 0.1829080730676651, "learning_rate": 7.843061225826455e-05, "loss": 0.5091, "step": 742 }, { "epoch": 1.6687254351487928, "grad_norm": 0.17909185588359833, "learning_rate": 7.822139301418226e-05, "loss": 0.5197, "step": 743 }, { "epoch": 1.6709713644020212, "grad_norm": 0.18631631135940552, "learning_rate": 7.801218594018115e-05, "loss": 0.5069, "step": 744 }, { "epoch": 1.6732172936552498, "grad_norm": 0.17326535284519196, "learning_rate": 7.78029924677563e-05, "loss": 0.5088, "step": 745 }, { "epoch": 1.6754632229084785, "grad_norm": 0.20143157243728638, "learning_rate": 7.759381402830973e-05, "loss": 0.528, "step": 746 }, { "epoch": 1.677709152161707, "grad_norm": 0.1783144623041153, "learning_rate": 7.738465205314048e-05, "loss": 0.4956, "step": 747 }, { "epoch": 1.6799550814149353, "grad_norm": 0.19444549083709717, "learning_rate": 7.717550797343506e-05, "loss": 0.4859, "step": 748 }, { "epoch": 1.682201010668164, "grad_norm": 0.18391017615795135, "learning_rate": 7.696638322025744e-05, "loss": 0.5036, "step": 749 }, { "epoch": 1.6844469399213926, "grad_norm": 0.2030087262392044, "learning_rate": 7.675727922453939e-05, "loss": 0.5032, "step": 750 }, { "epoch": 1.686692869174621, "grad_norm": 0.17419691383838654, "learning_rate": 7.654819741707065e-05, "loss": 0.5055, "step": 751 }, { "epoch": 1.6889387984278494, "grad_norm": 0.1854201853275299, "learning_rate": 7.633913922848912e-05, "loss": 0.5, "step": 752 }, { "epoch": 1.691184727681078, "grad_norm": 0.19161422550678253, "learning_rate": 7.613010608927113e-05, "loss": 0.4888, "step": 753 }, { "epoch": 1.6934306569343067, "grad_norm": 0.1729954481124878, "learning_rate": 7.592109942972152e-05, "loss": 0.5028, "step": 754 }, { "epoch": 1.6956765861875351, "grad_norm": 0.19286830723285675, "learning_rate": 7.571212067996402e-05, "loss": 0.5133, "step": 755 }, { "epoch": 1.6979225154407636, "grad_norm": 0.17671585083007812, "learning_rate": 7.550317126993141e-05, "loss": 0.5035, "step": 756 }, { "epoch": 1.700168444693992, "grad_norm": 0.1909675896167755, "learning_rate": 7.529425262935561e-05, "loss": 0.5147, "step": 757 }, { "epoch": 1.7024143739472206, "grad_norm": 0.1676298975944519, "learning_rate": 7.508536618775814e-05, "loss": 0.488, "step": 758 }, { "epoch": 1.7046603032004493, "grad_norm": 0.1871660202741623, "learning_rate": 7.487651337444005e-05, "loss": 0.4986, "step": 759 }, { "epoch": 1.7069062324536777, "grad_norm": 0.17889705300331116, "learning_rate": 7.466769561847239e-05, "loss": 0.5103, "step": 760 }, { "epoch": 1.709152161706906, "grad_norm": 0.18187767267227173, "learning_rate": 7.445891434868628e-05, "loss": 0.477, "step": 761 }, { "epoch": 1.7113980909601347, "grad_norm": 0.17818237841129303, "learning_rate": 7.425017099366326e-05, "loss": 0.5143, "step": 762 }, { "epoch": 1.7136440202133634, "grad_norm": 0.1854383796453476, "learning_rate": 7.404146698172536e-05, "loss": 0.5286, "step": 763 }, { "epoch": 1.7158899494665918, "grad_norm": 0.1802191585302353, "learning_rate": 7.383280374092538e-05, "loss": 0.493, "step": 764 }, { "epoch": 1.7181358787198202, "grad_norm": 0.17232070863246918, "learning_rate": 7.362418269903728e-05, "loss": 0.5124, "step": 765 }, { "epoch": 1.7203818079730488, "grad_norm": 0.2103428691625595, "learning_rate": 7.34156052835461e-05, "loss": 0.5372, "step": 766 }, { "epoch": 1.7226277372262775, "grad_norm": 0.1758391559123993, "learning_rate": 7.320707292163853e-05, "loss": 0.5019, "step": 767 }, { "epoch": 1.724873666479506, "grad_norm": 0.19223737716674805, "learning_rate": 7.299858704019291e-05, "loss": 0.4956, "step": 768 }, { "epoch": 1.7271195957327343, "grad_norm": 0.17237992584705353, "learning_rate": 7.279014906576949e-05, "loss": 0.4991, "step": 769 }, { "epoch": 1.729365524985963, "grad_norm": 0.17996814846992493, "learning_rate": 7.258176042460077e-05, "loss": 0.4882, "step": 770 }, { "epoch": 1.7316114542391916, "grad_norm": 0.17651812732219696, "learning_rate": 7.237342254258173e-05, "loss": 0.5167, "step": 771 }, { "epoch": 1.73385738349242, "grad_norm": 0.19715122878551483, "learning_rate": 7.216513684525992e-05, "loss": 0.516, "step": 772 }, { "epoch": 1.7361033127456484, "grad_norm": 0.16534049808979034, "learning_rate": 7.195690475782596e-05, "loss": 0.5241, "step": 773 }, { "epoch": 1.738349241998877, "grad_norm": 0.20934666693210602, "learning_rate": 7.174872770510348e-05, "loss": 0.4848, "step": 774 }, { "epoch": 1.7405951712521057, "grad_norm": 0.17493613064289093, "learning_rate": 7.15406071115397e-05, "loss": 0.509, "step": 775 }, { "epoch": 1.7428411005053341, "grad_norm": 0.19224363565444946, "learning_rate": 7.133254440119538e-05, "loss": 0.5166, "step": 776 }, { "epoch": 1.7450870297585626, "grad_norm": 0.17673024535179138, "learning_rate": 7.11245409977353e-05, "loss": 0.4919, "step": 777 }, { "epoch": 1.747332959011791, "grad_norm": 0.17207755148410797, "learning_rate": 7.091659832441848e-05, "loss": 0.5325, "step": 778 }, { "epoch": 1.7495788882650196, "grad_norm": 0.17099009454250336, "learning_rate": 7.070871780408824e-05, "loss": 0.4918, "step": 779 }, { "epoch": 1.7518248175182483, "grad_norm": 0.16904598474502563, "learning_rate": 7.05009008591627e-05, "loss": 0.4883, "step": 780 }, { "epoch": 1.7540707467714767, "grad_norm": 0.17518097162246704, "learning_rate": 7.029314891162504e-05, "loss": 0.5112, "step": 781 }, { "epoch": 1.756316676024705, "grad_norm": 0.1848541796207428, "learning_rate": 7.008546338301358e-05, "loss": 0.522, "step": 782 }, { "epoch": 1.7585626052779337, "grad_norm": 0.18024159967899323, "learning_rate": 6.987784569441228e-05, "loss": 0.5163, "step": 783 }, { "epoch": 1.7608085345311624, "grad_norm": 0.16730569303035736, "learning_rate": 6.967029726644075e-05, "loss": 0.4693, "step": 784 }, { "epoch": 1.7630544637843908, "grad_norm": 0.18763582408428192, "learning_rate": 6.946281951924487e-05, "loss": 0.5143, "step": 785 }, { "epoch": 1.7653003930376192, "grad_norm": 0.16916576027870178, "learning_rate": 6.925541387248674e-05, "loss": 0.5188, "step": 786 }, { "epoch": 1.7675463222908478, "grad_norm": 0.19620057940483093, "learning_rate": 6.904808174533521e-05, "loss": 0.5024, "step": 787 }, { "epoch": 1.7697922515440765, "grad_norm": 0.16816137731075287, "learning_rate": 6.884082455645606e-05, "loss": 0.4878, "step": 788 }, { "epoch": 1.772038180797305, "grad_norm": 0.1925499141216278, "learning_rate": 6.863364372400221e-05, "loss": 0.4922, "step": 789 }, { "epoch": 1.7742841100505333, "grad_norm": 0.15602745115756989, "learning_rate": 6.842654066560422e-05, "loss": 0.4888, "step": 790 }, { "epoch": 1.776530039303762, "grad_norm": 0.17124199867248535, "learning_rate": 6.821951679836049e-05, "loss": 0.4795, "step": 791 }, { "epoch": 1.7787759685569906, "grad_norm": 0.17022277414798737, "learning_rate": 6.801257353882746e-05, "loss": 0.4966, "step": 792 }, { "epoch": 1.781021897810219, "grad_norm": 0.15725384652614594, "learning_rate": 6.78057123030101e-05, "loss": 0.4905, "step": 793 }, { "epoch": 1.7832678270634474, "grad_norm": 0.17000839114189148, "learning_rate": 6.759893450635213e-05, "loss": 0.498, "step": 794 }, { "epoch": 1.7855137563166759, "grad_norm": 0.15647220611572266, "learning_rate": 6.739224156372625e-05, "loss": 0.4948, "step": 795 }, { "epoch": 1.7877596855699045, "grad_norm": 0.17224030196666718, "learning_rate": 6.718563488942463e-05, "loss": 0.4995, "step": 796 }, { "epoch": 1.7900056148231331, "grad_norm": 0.17135286331176758, "learning_rate": 6.697911589714917e-05, "loss": 0.5028, "step": 797 }, { "epoch": 1.7922515440763616, "grad_norm": 0.1629776656627655, "learning_rate": 6.677268600000172e-05, "loss": 0.5004, "step": 798 }, { "epoch": 1.79449747332959, "grad_norm": 0.19575197994709015, "learning_rate": 6.656634661047461e-05, "loss": 0.5112, "step": 799 }, { "epoch": 1.7967434025828186, "grad_norm": 0.15462997555732727, "learning_rate": 6.636009914044074e-05, "loss": 0.5036, "step": 800 }, { "epoch": 1.7989893318360473, "grad_norm": 0.19468659162521362, "learning_rate": 6.615394500114417e-05, "loss": 0.5062, "step": 801 }, { "epoch": 1.8012352610892757, "grad_norm": 0.15850648283958435, "learning_rate": 6.594788560319025e-05, "loss": 0.5103, "step": 802 }, { "epoch": 1.803481190342504, "grad_norm": 0.16901031136512756, "learning_rate": 6.574192235653619e-05, "loss": 0.4964, "step": 803 }, { "epoch": 1.8057271195957327, "grad_norm": 0.16941389441490173, "learning_rate": 6.553605667048119e-05, "loss": 0.4956, "step": 804 }, { "epoch": 1.8079730488489614, "grad_norm": 0.1633678376674652, "learning_rate": 6.533028995365687e-05, "loss": 0.4844, "step": 805 }, { "epoch": 1.8102189781021898, "grad_norm": 0.16450218856334686, "learning_rate": 6.51246236140177e-05, "loss": 0.5039, "step": 806 }, { "epoch": 1.8124649073554182, "grad_norm": 0.1649266928434372, "learning_rate": 6.49190590588314e-05, "loss": 0.5237, "step": 807 }, { "epoch": 1.8147108366086468, "grad_norm": 0.17138883471488953, "learning_rate": 6.471359769466907e-05, "loss": 0.5086, "step": 808 }, { "epoch": 1.8169567658618755, "grad_norm": 0.17378132045269012, "learning_rate": 6.450824092739589e-05, "loss": 0.5091, "step": 809 }, { "epoch": 1.819202695115104, "grad_norm": 0.17285092175006866, "learning_rate": 6.430299016216119e-05, "loss": 0.5055, "step": 810 }, { "epoch": 1.8214486243683323, "grad_norm": 0.1718919575214386, "learning_rate": 6.409784680338905e-05, "loss": 0.4842, "step": 811 }, { "epoch": 1.823694553621561, "grad_norm": 0.16790670156478882, "learning_rate": 6.389281225476867e-05, "loss": 0.5004, "step": 812 }, { "epoch": 1.8259404828747896, "grad_norm": 0.1849760264158249, "learning_rate": 6.368788791924467e-05, "loss": 0.4939, "step": 813 }, { "epoch": 1.828186412128018, "grad_norm": 0.16113969683647156, "learning_rate": 6.348307519900753e-05, "loss": 0.5024, "step": 814 }, { "epoch": 1.8304323413812464, "grad_norm": 0.1709127277135849, "learning_rate": 6.3278375495484e-05, "loss": 0.4977, "step": 815 }, { "epoch": 1.8326782706344749, "grad_norm": 0.1758309006690979, "learning_rate": 6.307379020932758e-05, "loss": 0.4689, "step": 816 }, { "epoch": 1.8349241998877035, "grad_norm": 0.16264449059963226, "learning_rate": 6.286932074040876e-05, "loss": 0.4974, "step": 817 }, { "epoch": 1.8371701291409321, "grad_norm": 0.17811472713947296, "learning_rate": 6.266496848780567e-05, "loss": 0.4987, "step": 818 }, { "epoch": 1.8394160583941606, "grad_norm": 0.17399878799915314, "learning_rate": 6.246073484979436e-05, "loss": 0.4867, "step": 819 }, { "epoch": 1.841661987647389, "grad_norm": 0.17421691119670868, "learning_rate": 6.225662122383918e-05, "loss": 0.5162, "step": 820 }, { "epoch": 1.8439079169006176, "grad_norm": 0.17920304834842682, "learning_rate": 6.205262900658339e-05, "loss": 0.5058, "step": 821 }, { "epoch": 1.8461538461538463, "grad_norm": 0.16607870161533356, "learning_rate": 6.184875959383947e-05, "loss": 0.5063, "step": 822 }, { "epoch": 1.8483997754070747, "grad_norm": 0.19281108677387238, "learning_rate": 6.164501438057965e-05, "loss": 0.4936, "step": 823 }, { "epoch": 1.850645704660303, "grad_norm": 0.16037489473819733, "learning_rate": 6.144139476092631e-05, "loss": 0.4949, "step": 824 }, { "epoch": 1.8528916339135317, "grad_norm": 0.19559049606323242, "learning_rate": 6.123790212814241e-05, "loss": 0.4981, "step": 825 }, { "epoch": 1.8551375631667604, "grad_norm": 0.15469707548618317, "learning_rate": 6.1034537874622085e-05, "loss": 0.5021, "step": 826 }, { "epoch": 1.8573834924199888, "grad_norm": 0.18738782405853271, "learning_rate": 6.0831303391880975e-05, "loss": 0.4846, "step": 827 }, { "epoch": 1.8596294216732172, "grad_norm": 0.16658605635166168, "learning_rate": 6.0628200070546796e-05, "loss": 0.4945, "step": 828 }, { "epoch": 1.8618753509264458, "grad_norm": 0.16776609420776367, "learning_rate": 6.042522930034984e-05, "loss": 0.4992, "step": 829 }, { "epoch": 1.8641212801796745, "grad_norm": 0.17124858498573303, "learning_rate": 6.022239247011331e-05, "loss": 0.4915, "step": 830 }, { "epoch": 1.866367209432903, "grad_norm": 0.15521185100078583, "learning_rate": 6.001969096774399e-05, "loss": 0.5134, "step": 831 }, { "epoch": 1.8686131386861313, "grad_norm": 0.1691064089536667, "learning_rate": 5.981712618022272e-05, "loss": 0.5018, "step": 832 }, { "epoch": 1.87085906793936, "grad_norm": 0.15585891902446747, "learning_rate": 5.96146994935948e-05, "loss": 0.5071, "step": 833 }, { "epoch": 1.8731049971925884, "grad_norm": 0.1678674966096878, "learning_rate": 5.9412412292960656e-05, "loss": 0.5123, "step": 834 }, { "epoch": 1.875350926445817, "grad_norm": 0.15515373647212982, "learning_rate": 5.92102659624662e-05, "loss": 0.495, "step": 835 }, { "epoch": 1.8775968556990454, "grad_norm": 0.17066361010074615, "learning_rate": 5.900826188529351e-05, "loss": 0.4982, "step": 836 }, { "epoch": 1.8798427849522739, "grad_norm": 0.14784766733646393, "learning_rate": 5.880640144365124e-05, "loss": 0.492, "step": 837 }, { "epoch": 1.8820887142055025, "grad_norm": 0.1624741405248642, "learning_rate": 5.86046860187653e-05, "loss": 0.4985, "step": 838 }, { "epoch": 1.8843346434587311, "grad_norm": 0.14903901517391205, "learning_rate": 5.840311699086928e-05, "loss": 0.4887, "step": 839 }, { "epoch": 1.8865805727119596, "grad_norm": 0.16569632291793823, "learning_rate": 5.820169573919499e-05, "loss": 0.5031, "step": 840 }, { "epoch": 1.888826501965188, "grad_norm": 0.15516996383666992, "learning_rate": 5.800042364196319e-05, "loss": 0.4974, "step": 841 }, { "epoch": 1.8910724312184166, "grad_norm": 0.1705656498670578, "learning_rate": 5.779930207637401e-05, "loss": 0.5064, "step": 842 }, { "epoch": 1.8933183604716453, "grad_norm": 0.16612909734249115, "learning_rate": 5.759833241859755e-05, "loss": 0.4928, "step": 843 }, { "epoch": 1.8955642897248737, "grad_norm": 0.15691480040550232, "learning_rate": 5.7397516043764564e-05, "loss": 0.4992, "step": 844 }, { "epoch": 1.897810218978102, "grad_norm": 0.15925776958465576, "learning_rate": 5.719685432595681e-05, "loss": 0.503, "step": 845 }, { "epoch": 1.9000561482313307, "grad_norm": 0.1793777197599411, "learning_rate": 5.6996348638198e-05, "loss": 0.5015, "step": 846 }, { "epoch": 1.9023020774845594, "grad_norm": 0.15224167704582214, "learning_rate": 5.6796000352444056e-05, "loss": 0.4791, "step": 847 }, { "epoch": 1.9045480067377878, "grad_norm": 0.17081177234649658, "learning_rate": 5.6595810839574e-05, "loss": 0.4925, "step": 848 }, { "epoch": 1.9067939359910162, "grad_norm": 0.1512937992811203, "learning_rate": 5.6395781469380354e-05, "loss": 0.4901, "step": 849 }, { "epoch": 1.9090398652442448, "grad_norm": 0.15645167231559753, "learning_rate": 5.619591361055998e-05, "loss": 0.5001, "step": 850 }, { "epoch": 1.9112857944974735, "grad_norm": 0.17164252698421478, "learning_rate": 5.5996208630704445e-05, "loss": 0.4956, "step": 851 }, { "epoch": 1.913531723750702, "grad_norm": 0.15667004883289337, "learning_rate": 5.579666789629098e-05, "loss": 0.4906, "step": 852 }, { "epoch": 1.9157776530039303, "grad_norm": 0.16768649220466614, "learning_rate": 5.559729277267286e-05, "loss": 0.5099, "step": 853 }, { "epoch": 1.9180235822571587, "grad_norm": 0.16727498173713684, "learning_rate": 5.539808462407026e-05, "loss": 0.503, "step": 854 }, { "epoch": 1.9202695115103874, "grad_norm": 0.16755451261997223, "learning_rate": 5.519904481356076e-05, "loss": 0.5099, "step": 855 }, { "epoch": 1.922515440763616, "grad_norm": 0.16387148201465607, "learning_rate": 5.500017470307007e-05, "loss": 0.4957, "step": 856 }, { "epoch": 1.9247613700168444, "grad_norm": 0.15775729715824127, "learning_rate": 5.480147565336282e-05, "loss": 0.4976, "step": 857 }, { "epoch": 1.9270072992700729, "grad_norm": 0.1581815481185913, "learning_rate": 5.4602949024033116e-05, "loss": 0.4949, "step": 858 }, { "epoch": 1.9292532285233015, "grad_norm": 0.15002784132957458, "learning_rate": 5.4404596173495265e-05, "loss": 0.5099, "step": 859 }, { "epoch": 1.9314991577765301, "grad_norm": 0.15235161781311035, "learning_rate": 5.420641845897455e-05, "loss": 0.4809, "step": 860 }, { "epoch": 1.9337450870297586, "grad_norm": 0.16005192697048187, "learning_rate": 5.4008417236497815e-05, "loss": 0.493, "step": 861 }, { "epoch": 1.935991016282987, "grad_norm": 0.15347884595394135, "learning_rate": 5.381059386088428e-05, "loss": 0.5071, "step": 862 }, { "epoch": 1.9382369455362156, "grad_norm": 0.15472036600112915, "learning_rate": 5.361294968573629e-05, "loss": 0.4924, "step": 863 }, { "epoch": 1.9404828747894443, "grad_norm": 0.17055299878120422, "learning_rate": 5.341548606343001e-05, "loss": 0.5057, "step": 864 }, { "epoch": 1.9427288040426727, "grad_norm": 0.15424910187721252, "learning_rate": 5.321820434510617e-05, "loss": 0.5041, "step": 865 }, { "epoch": 1.944974733295901, "grad_norm": 0.15976421535015106, "learning_rate": 5.302110588066075e-05, "loss": 0.4742, "step": 866 }, { "epoch": 1.9472206625491297, "grad_norm": 0.15673977136611938, "learning_rate": 5.282419201873593e-05, "loss": 0.49, "step": 867 }, { "epoch": 1.9494665918023584, "grad_norm": 0.14829935133457184, "learning_rate": 5.262746410671071e-05, "loss": 0.5017, "step": 868 }, { "epoch": 1.9517125210555868, "grad_norm": 0.14897191524505615, "learning_rate": 5.243092349069169e-05, "loss": 0.4803, "step": 869 }, { "epoch": 1.9539584503088152, "grad_norm": 0.15609802305698395, "learning_rate": 5.223457151550402e-05, "loss": 0.4961, "step": 870 }, { "epoch": 1.9562043795620438, "grad_norm": 0.15764057636260986, "learning_rate": 5.203840952468191e-05, "loss": 0.5003, "step": 871 }, { "epoch": 1.9584503088152723, "grad_norm": 0.16121333837509155, "learning_rate": 5.184243886045971e-05, "loss": 0.5054, "step": 872 }, { "epoch": 1.960696238068501, "grad_norm": 0.15507447719573975, "learning_rate": 5.164666086376262e-05, "loss": 0.4954, "step": 873 }, { "epoch": 1.9629421673217293, "grad_norm": 0.16584189236164093, "learning_rate": 5.145107687419751e-05, "loss": 0.4924, "step": 874 }, { "epoch": 1.9651880965749577, "grad_norm": 0.15702944993972778, "learning_rate": 5.1255688230043766e-05, "loss": 0.5004, "step": 875 }, { "epoch": 1.9674340258281864, "grad_norm": 0.17031584680080414, "learning_rate": 5.106049626824405e-05, "loss": 0.5139, "step": 876 }, { "epoch": 1.969679955081415, "grad_norm": 0.16193878650665283, "learning_rate": 5.0865502324395345e-05, "loss": 0.4849, "step": 877 }, { "epoch": 1.9719258843346434, "grad_norm": 0.16150209307670593, "learning_rate": 5.067070773273962e-05, "loss": 0.4719, "step": 878 }, { "epoch": 1.9741718135878719, "grad_norm": 0.1520845890045166, "learning_rate": 5.047611382615481e-05, "loss": 0.4995, "step": 879 }, { "epoch": 1.9764177428411005, "grad_norm": 0.16827571392059326, "learning_rate": 5.0281721936145713e-05, "loss": 0.4908, "step": 880 }, { "epoch": 1.9786636720943291, "grad_norm": 0.15770889818668365, "learning_rate": 5.008753339283471e-05, "loss": 0.5116, "step": 881 }, { "epoch": 1.9809096013475576, "grad_norm": 0.1623336225748062, "learning_rate": 4.98935495249529e-05, "loss": 0.492, "step": 882 }, { "epoch": 1.983155530600786, "grad_norm": 0.16279038786888123, "learning_rate": 4.9699771659830855e-05, "loss": 0.5021, "step": 883 }, { "epoch": 1.9854014598540146, "grad_norm": 0.16874343156814575, "learning_rate": 4.950620112338955e-05, "loss": 0.4876, "step": 884 }, { "epoch": 1.9876473891072433, "grad_norm": 0.15390436351299286, "learning_rate": 4.931283924013141e-05, "loss": 0.4879, "step": 885 }, { "epoch": 1.9898933183604717, "grad_norm": 0.17372553050518036, "learning_rate": 4.911968733313101e-05, "loss": 0.4876, "step": 886 }, { "epoch": 1.9921392476137, "grad_norm": 0.15854312479496002, "learning_rate": 4.892674672402631e-05, "loss": 0.5128, "step": 887 }, { "epoch": 1.9943851768669287, "grad_norm": 0.1635546237230301, "learning_rate": 4.873401873300934e-05, "loss": 0.4946, "step": 888 }, { "epoch": 1.9966311061201574, "grad_norm": 0.15970109403133392, "learning_rate": 4.8541504678817435e-05, "loss": 0.501, "step": 889 }, { "epoch": 1.9988770353733858, "grad_norm": 0.1637182980775833, "learning_rate": 4.834920587872397e-05, "loss": 0.4807, "step": 890 }, { "epoch": 2.001122964626614, "grad_norm": 0.17495502531528473, "learning_rate": 4.815712364852945e-05, "loss": 0.4725, "step": 891 }, { "epoch": 2.0033688938798426, "grad_norm": 0.20412583649158478, "learning_rate": 4.7965259302552546e-05, "loss": 0.4545, "step": 892 }, { "epoch": 2.0056148231330715, "grad_norm": 0.1694943606853485, "learning_rate": 4.777361415362106e-05, "loss": 0.4561, "step": 893 }, { "epoch": 2.0078607523863, "grad_norm": 0.20532694458961487, "learning_rate": 4.75821895130629e-05, "loss": 0.4585, "step": 894 }, { "epoch": 2.0101066816395283, "grad_norm": 0.21771076321601868, "learning_rate": 4.739098669069723e-05, "loss": 0.4609, "step": 895 }, { "epoch": 2.0123526108927567, "grad_norm": 0.19157661497592926, "learning_rate": 4.7200006994825314e-05, "loss": 0.4533, "step": 896 }, { "epoch": 2.0145985401459856, "grad_norm": 0.1829356700181961, "learning_rate": 4.700925173222178e-05, "loss": 0.4401, "step": 897 }, { "epoch": 2.016844469399214, "grad_norm": 0.1815447062253952, "learning_rate": 4.681872220812551e-05, "loss": 0.4497, "step": 898 }, { "epoch": 2.0190903986524424, "grad_norm": 0.16822156310081482, "learning_rate": 4.662841972623084e-05, "loss": 0.4573, "step": 899 }, { "epoch": 2.021336327905671, "grad_norm": 0.18054281175136566, "learning_rate": 4.643834558867852e-05, "loss": 0.4589, "step": 900 }, { "epoch": 2.0235822571588993, "grad_norm": 0.18319673836231232, "learning_rate": 4.6248501096046827e-05, "loss": 0.4376, "step": 901 }, { "epoch": 2.025828186412128, "grad_norm": 0.1645708829164505, "learning_rate": 4.605888754734278e-05, "loss": 0.4304, "step": 902 }, { "epoch": 2.0280741156653566, "grad_norm": 0.17893430590629578, "learning_rate": 4.586950623999314e-05, "loss": 0.4526, "step": 903 }, { "epoch": 2.030320044918585, "grad_norm": 0.17927826941013336, "learning_rate": 4.568035846983558e-05, "loss": 0.4616, "step": 904 }, { "epoch": 2.0325659741718134, "grad_norm": 0.1680602878332138, "learning_rate": 4.549144553110974e-05, "loss": 0.4611, "step": 905 }, { "epoch": 2.0348119034250423, "grad_norm": 0.1612085998058319, "learning_rate": 4.5302768716448434e-05, "loss": 0.4567, "step": 906 }, { "epoch": 2.0370578326782707, "grad_norm": 0.1724167913198471, "learning_rate": 4.5114329316868875e-05, "loss": 0.4666, "step": 907 }, { "epoch": 2.039303761931499, "grad_norm": 0.15838028490543365, "learning_rate": 4.492612862176371e-05, "loss": 0.4529, "step": 908 }, { "epoch": 2.0415496911847275, "grad_norm": 0.15649183094501495, "learning_rate": 4.473816791889228e-05, "loss": 0.4462, "step": 909 }, { "epoch": 2.0437956204379564, "grad_norm": 0.16123028099536896, "learning_rate": 4.455044849437182e-05, "loss": 0.4345, "step": 910 }, { "epoch": 2.046041549691185, "grad_norm": 0.16162772476673126, "learning_rate": 4.436297163266853e-05, "loss": 0.4585, "step": 911 }, { "epoch": 2.048287478944413, "grad_norm": 0.1522200107574463, "learning_rate": 4.4175738616588894e-05, "loss": 0.4614, "step": 912 }, { "epoch": 2.0505334081976416, "grad_norm": 0.16501305997371674, "learning_rate": 4.398875072727097e-05, "loss": 0.4486, "step": 913 }, { "epoch": 2.0527793374508705, "grad_norm": 0.15927040576934814, "learning_rate": 4.380200924417548e-05, "loss": 0.4574, "step": 914 }, { "epoch": 2.055025266704099, "grad_norm": 0.1553938090801239, "learning_rate": 4.361551544507713e-05, "loss": 0.4446, "step": 915 }, { "epoch": 2.0572711959573273, "grad_norm": 0.16552136838436127, "learning_rate": 4.3429270606055895e-05, "loss": 0.4583, "step": 916 }, { "epoch": 2.0595171252105557, "grad_norm": 0.1564835011959076, "learning_rate": 4.3243276001488156e-05, "loss": 0.4476, "step": 917 }, { "epoch": 2.0617630544637846, "grad_norm": 0.1577308475971222, "learning_rate": 4.305753290403809e-05, "loss": 0.4632, "step": 918 }, { "epoch": 2.064008983717013, "grad_norm": 0.15984061360359192, "learning_rate": 4.2872042584649015e-05, "loss": 0.4624, "step": 919 }, { "epoch": 2.0662549129702414, "grad_norm": 0.16448809206485748, "learning_rate": 4.268680631253455e-05, "loss": 0.4436, "step": 920 }, { "epoch": 2.06850084222347, "grad_norm": 0.16196516156196594, "learning_rate": 4.250182535517008e-05, "loss": 0.4375, "step": 921 }, { "epoch": 2.0707467714766983, "grad_norm": 0.15193282067775726, "learning_rate": 4.231710097828388e-05, "loss": 0.4287, "step": 922 }, { "epoch": 2.072992700729927, "grad_norm": 0.16018415987491608, "learning_rate": 4.2132634445848704e-05, "loss": 0.4543, "step": 923 }, { "epoch": 2.0752386299831556, "grad_norm": 0.16128796339035034, "learning_rate": 4.194842702007289e-05, "loss": 0.4621, "step": 924 }, { "epoch": 2.077484559236384, "grad_norm": 0.15342706441879272, "learning_rate": 4.176447996139196e-05, "loss": 0.4355, "step": 925 }, { "epoch": 2.0797304884896124, "grad_norm": 0.1577060967683792, "learning_rate": 4.1580794528459834e-05, "loss": 0.4521, "step": 926 }, { "epoch": 2.0819764177428413, "grad_norm": 0.16120131313800812, "learning_rate": 4.13973719781402e-05, "loss": 0.4501, "step": 927 }, { "epoch": 2.0842223469960697, "grad_norm": 0.16163085401058197, "learning_rate": 4.1214213565498086e-05, "loss": 0.4518, "step": 928 }, { "epoch": 2.086468276249298, "grad_norm": 0.1605272889137268, "learning_rate": 4.10313205437911e-05, "loss": 0.4334, "step": 929 }, { "epoch": 2.0887142055025265, "grad_norm": 0.16757291555404663, "learning_rate": 4.084869416446095e-05, "loss": 0.4579, "step": 930 }, { "epoch": 2.0909601347557554, "grad_norm": 0.1572689265012741, "learning_rate": 4.0666335677124816e-05, "loss": 0.4462, "step": 931 }, { "epoch": 2.093206064008984, "grad_norm": 0.1841953992843628, "learning_rate": 4.048424632956681e-05, "loss": 0.4241, "step": 932 }, { "epoch": 2.095451993262212, "grad_norm": 0.1640552282333374, "learning_rate": 4.030242736772952e-05, "loss": 0.4495, "step": 933 }, { "epoch": 2.0976979225154406, "grad_norm": 0.15904076397418976, "learning_rate": 4.0120880035705416e-05, "loss": 0.4513, "step": 934 }, { "epoch": 2.0999438517686695, "grad_norm": 0.17605750262737274, "learning_rate": 3.9939605575728315e-05, "loss": 0.4444, "step": 935 }, { "epoch": 2.102189781021898, "grad_norm": 0.15149734914302826, "learning_rate": 3.975860522816497e-05, "loss": 0.4423, "step": 936 }, { "epoch": 2.1044357102751263, "grad_norm": 0.15931731462478638, "learning_rate": 3.957788023150647e-05, "loss": 0.4558, "step": 937 }, { "epoch": 2.1066816395283547, "grad_norm": 0.1513037383556366, "learning_rate": 3.939743182235978e-05, "loss": 0.4451, "step": 938 }, { "epoch": 2.108927568781583, "grad_norm": 0.1563446819782257, "learning_rate": 3.921726123543942e-05, "loss": 0.4438, "step": 939 }, { "epoch": 2.111173498034812, "grad_norm": 0.14871710538864136, "learning_rate": 3.9037369703558876e-05, "loss": 0.449, "step": 940 }, { "epoch": 2.1134194272880404, "grad_norm": 0.14909642934799194, "learning_rate": 3.8857758457622246e-05, "loss": 0.4643, "step": 941 }, { "epoch": 2.115665356541269, "grad_norm": 0.15018634498119354, "learning_rate": 3.867842872661565e-05, "loss": 0.4483, "step": 942 }, { "epoch": 2.1179112857944973, "grad_norm": 0.16879071295261383, "learning_rate": 3.8499381737599124e-05, "loss": 0.4726, "step": 943 }, { "epoch": 2.120157215047726, "grad_norm": 0.1683071106672287, "learning_rate": 3.832061871569787e-05, "loss": 0.4499, "step": 944 }, { "epoch": 2.1224031443009546, "grad_norm": 0.15678516030311584, "learning_rate": 3.814214088409419e-05, "loss": 0.4484, "step": 945 }, { "epoch": 2.124649073554183, "grad_norm": 0.1773703545331955, "learning_rate": 3.7963949464018945e-05, "loss": 0.4605, "step": 946 }, { "epoch": 2.1268950028074114, "grad_norm": 0.1767614483833313, "learning_rate": 3.778604567474314e-05, "loss": 0.4574, "step": 947 }, { "epoch": 2.1291409320606403, "grad_norm": 0.15908308327198029, "learning_rate": 3.760843073356981e-05, "loss": 0.4357, "step": 948 }, { "epoch": 2.1313868613138687, "grad_norm": 0.1637633740901947, "learning_rate": 3.743110585582549e-05, "loss": 0.4566, "step": 949 }, { "epoch": 2.133632790567097, "grad_norm": 0.1657618135213852, "learning_rate": 3.725407225485191e-05, "loss": 0.4497, "step": 950 }, { "epoch": 2.1358787198203255, "grad_norm": 0.15281249582767487, "learning_rate": 3.707733114199783e-05, "loss": 0.4494, "step": 951 }, { "epoch": 2.1381246490735544, "grad_norm": 0.16828225553035736, "learning_rate": 3.690088372661061e-05, "loss": 0.4412, "step": 952 }, { "epoch": 2.140370578326783, "grad_norm": 0.16215671598911285, "learning_rate": 3.672473121602801e-05, "loss": 0.449, "step": 953 }, { "epoch": 2.142616507580011, "grad_norm": 0.14198768138885498, "learning_rate": 3.654887481556993e-05, "loss": 0.4556, "step": 954 }, { "epoch": 2.1448624368332396, "grad_norm": 0.1703426092863083, "learning_rate": 3.6373315728530145e-05, "loss": 0.4456, "step": 955 }, { "epoch": 2.147108366086468, "grad_norm": 0.15878015756607056, "learning_rate": 3.6198055156168025e-05, "loss": 0.4593, "step": 956 }, { "epoch": 2.149354295339697, "grad_norm": 0.15779636800289154, "learning_rate": 3.602309429770034e-05, "loss": 0.4543, "step": 957 }, { "epoch": 2.1516002245929253, "grad_norm": 0.15963739156723022, "learning_rate": 3.584843435029316e-05, "loss": 0.4363, "step": 958 }, { "epoch": 2.1538461538461537, "grad_norm": 0.15662063658237457, "learning_rate": 3.567407650905353e-05, "loss": 0.458, "step": 959 }, { "epoch": 2.156092083099382, "grad_norm": 0.14531637728214264, "learning_rate": 3.5500021967021344e-05, "loss": 0.4474, "step": 960 }, { "epoch": 2.158338012352611, "grad_norm": 0.15317556262016296, "learning_rate": 3.5326271915161205e-05, "loss": 0.439, "step": 961 }, { "epoch": 2.1605839416058394, "grad_norm": 0.15082910656929016, "learning_rate": 3.515282754235419e-05, "loss": 0.4497, "step": 962 }, { "epoch": 2.162829870859068, "grad_norm": 0.14299066364765167, "learning_rate": 3.4979690035389774e-05, "loss": 0.4468, "step": 963 }, { "epoch": 2.1650758001122963, "grad_norm": 0.1458815485239029, "learning_rate": 3.480686057895778e-05, "loss": 0.453, "step": 964 }, { "epoch": 2.167321729365525, "grad_norm": 0.1518121361732483, "learning_rate": 3.4634340355640136e-05, "loss": 0.4393, "step": 965 }, { "epoch": 2.1695676586187536, "grad_norm": 0.14630930125713348, "learning_rate": 3.446213054590291e-05, "loss": 0.4617, "step": 966 }, { "epoch": 2.171813587871982, "grad_norm": 0.15554536879062653, "learning_rate": 3.4290232328088136e-05, "loss": 0.4555, "step": 967 }, { "epoch": 2.1740595171252104, "grad_norm": 0.1420973539352417, "learning_rate": 3.4118646878405755e-05, "loss": 0.4575, "step": 968 }, { "epoch": 2.1763054463784393, "grad_norm": 0.15307992696762085, "learning_rate": 3.394737537092562e-05, "loss": 0.466, "step": 969 }, { "epoch": 2.1785513756316677, "grad_norm": 0.14762836694717407, "learning_rate": 3.377641897756947e-05, "loss": 0.4653, "step": 970 }, { "epoch": 2.180797304884896, "grad_norm": 0.14197176694869995, "learning_rate": 3.360577886810286e-05, "loss": 0.4534, "step": 971 }, { "epoch": 2.1830432341381245, "grad_norm": 0.14574755728244781, "learning_rate": 3.343545621012721e-05, "loss": 0.4436, "step": 972 }, { "epoch": 2.1852891633913534, "grad_norm": 0.1501995027065277, "learning_rate": 3.326545216907171e-05, "loss": 0.4551, "step": 973 }, { "epoch": 2.187535092644582, "grad_norm": 0.15226097404956818, "learning_rate": 3.309576790818551e-05, "loss": 0.4458, "step": 974 }, { "epoch": 2.18978102189781, "grad_norm": 0.14684434235095978, "learning_rate": 3.292640458852958e-05, "loss": 0.4494, "step": 975 }, { "epoch": 2.1920269511510386, "grad_norm": 0.14523442089557648, "learning_rate": 3.275736336896893e-05, "loss": 0.4445, "step": 976 }, { "epoch": 2.1942728804042675, "grad_norm": 0.1518959403038025, "learning_rate": 3.25886454061646e-05, "loss": 0.4649, "step": 977 }, { "epoch": 2.196518809657496, "grad_norm": 0.1398971676826477, "learning_rate": 3.2420251854565704e-05, "loss": 0.4563, "step": 978 }, { "epoch": 2.1987647389107243, "grad_norm": 0.13926076889038086, "learning_rate": 3.22521838664016e-05, "loss": 0.4479, "step": 979 }, { "epoch": 2.2010106681639527, "grad_norm": 0.14644260704517365, "learning_rate": 3.2084442591674024e-05, "loss": 0.4349, "step": 980 }, { "epoch": 2.203256597417181, "grad_norm": 0.14670224487781525, "learning_rate": 3.191702917814916e-05, "loss": 0.4532, "step": 981 }, { "epoch": 2.20550252667041, "grad_norm": 0.13720498979091644, "learning_rate": 3.174994477134978e-05, "loss": 0.431, "step": 982 }, { "epoch": 2.2077484559236384, "grad_norm": 0.13734634220600128, "learning_rate": 3.158319051454743e-05, "loss": 0.437, "step": 983 }, { "epoch": 2.209994385176867, "grad_norm": 0.14033032953739166, "learning_rate": 3.141676754875465e-05, "loss": 0.4487, "step": 984 }, { "epoch": 2.2122403144300953, "grad_norm": 0.1471083164215088, "learning_rate": 3.1250677012717135e-05, "loss": 0.4544, "step": 985 }, { "epoch": 2.214486243683324, "grad_norm": 0.13971002399921417, "learning_rate": 3.10849200429059e-05, "loss": 0.4535, "step": 986 }, { "epoch": 2.2167321729365526, "grad_norm": 0.1465609073638916, "learning_rate": 3.091949777350958e-05, "loss": 0.4482, "step": 987 }, { "epoch": 2.218978102189781, "grad_norm": 0.14760175347328186, "learning_rate": 3.075441133642659e-05, "loss": 0.4461, "step": 988 }, { "epoch": 2.2212240314430094, "grad_norm": 0.1456819474697113, "learning_rate": 3.05896618612574e-05, "loss": 0.4468, "step": 989 }, { "epoch": 2.2234699606962383, "grad_norm": 0.14734943211078644, "learning_rate": 3.0425250475296883e-05, "loss": 0.433, "step": 990 }, { "epoch": 2.2257158899494667, "grad_norm": 0.13213606178760529, "learning_rate": 3.0261178303526536e-05, "loss": 0.4395, "step": 991 }, { "epoch": 2.227961819202695, "grad_norm": 0.14420166611671448, "learning_rate": 3.0097446468606785e-05, "loss": 0.4391, "step": 992 }, { "epoch": 2.2302077484559235, "grad_norm": 0.14115062355995178, "learning_rate": 2.9934056090869242e-05, "loss": 0.4371, "step": 993 }, { "epoch": 2.2324536777091524, "grad_norm": 0.14169073104858398, "learning_rate": 2.9771008288309224e-05, "loss": 0.4334, "step": 994 }, { "epoch": 2.234699606962381, "grad_norm": 0.14184604585170746, "learning_rate": 2.9608304176577872e-05, "loss": 0.4442, "step": 995 }, { "epoch": 2.236945536215609, "grad_norm": 0.14200329780578613, "learning_rate": 2.9445944868974688e-05, "loss": 0.465, "step": 996 }, { "epoch": 2.2391914654688376, "grad_norm": 0.14416737854480743, "learning_rate": 2.9283931476439886e-05, "loss": 0.4423, "step": 997 }, { "epoch": 2.241437394722066, "grad_norm": 0.14188611507415771, "learning_rate": 2.9122265107546677e-05, "loss": 0.4647, "step": 998 }, { "epoch": 2.243683323975295, "grad_norm": 0.14122439920902252, "learning_rate": 2.8960946868493843e-05, "loss": 0.4317, "step": 999 }, { "epoch": 2.2459292532285233, "grad_norm": 0.14019352197647095, "learning_rate": 2.87999778630981e-05, "loss": 0.4415, "step": 1000 }, { "epoch": 2.2481751824817517, "grad_norm": 0.1378793567419052, "learning_rate": 2.863935919278645e-05, "loss": 0.4537, "step": 1001 }, { "epoch": 2.25042111173498, "grad_norm": 0.14002038538455963, "learning_rate": 2.847909195658886e-05, "loss": 0.4427, "step": 1002 }, { "epoch": 2.252667040988209, "grad_norm": 0.1482112854719162, "learning_rate": 2.8319177251130495e-05, "loss": 0.4465, "step": 1003 }, { "epoch": 2.2549129702414374, "grad_norm": 0.1393243372440338, "learning_rate": 2.815961617062442e-05, "loss": 0.4405, "step": 1004 }, { "epoch": 2.257158899494666, "grad_norm": 0.14361439645290375, "learning_rate": 2.8000409806864007e-05, "loss": 0.4672, "step": 1005 }, { "epoch": 2.2594048287478943, "grad_norm": 0.13548092544078827, "learning_rate": 2.7841559249215503e-05, "loss": 0.4557, "step": 1006 }, { "epoch": 2.261650758001123, "grad_norm": 0.13999567925930023, "learning_rate": 2.768306558461051e-05, "loss": 0.4577, "step": 1007 }, { "epoch": 2.2638966872543516, "grad_norm": 0.14704839885234833, "learning_rate": 2.75249298975386e-05, "loss": 0.4556, "step": 1008 }, { "epoch": 2.26614261650758, "grad_norm": 0.1454869657754898, "learning_rate": 2.7367153270039934e-05, "loss": 0.4656, "step": 1009 }, { "epoch": 2.2683885457608084, "grad_norm": 0.14805535972118378, "learning_rate": 2.720973678169781e-05, "loss": 0.4463, "step": 1010 }, { "epoch": 2.2706344750140373, "grad_norm": 0.14422546327114105, "learning_rate": 2.705268150963125e-05, "loss": 0.4463, "step": 1011 }, { "epoch": 2.2728804042672657, "grad_norm": 0.14471085369586945, "learning_rate": 2.6895988528487724e-05, "loss": 0.4499, "step": 1012 }, { "epoch": 2.275126333520494, "grad_norm": 0.14727704226970673, "learning_rate": 2.6739658910435663e-05, "loss": 0.4498, "step": 1013 }, { "epoch": 2.2773722627737225, "grad_norm": 0.13678747415542603, "learning_rate": 2.6583693725157176e-05, "loss": 0.4396, "step": 1014 }, { "epoch": 2.279618192026951, "grad_norm": 0.14493557810783386, "learning_rate": 2.6428094039840827e-05, "loss": 0.4493, "step": 1015 }, { "epoch": 2.28186412128018, "grad_norm": 0.14464671909809113, "learning_rate": 2.6272860919174223e-05, "loss": 0.4586, "step": 1016 }, { "epoch": 2.284110050533408, "grad_norm": 0.13754825294017792, "learning_rate": 2.6117995425336774e-05, "loss": 0.4587, "step": 1017 }, { "epoch": 2.2863559797866366, "grad_norm": 0.14128117263317108, "learning_rate": 2.596349861799235e-05, "loss": 0.4578, "step": 1018 }, { "epoch": 2.2886019090398655, "grad_norm": 0.14357365667819977, "learning_rate": 2.5809371554282177e-05, "loss": 0.4492, "step": 1019 }, { "epoch": 2.290847838293094, "grad_norm": 0.1328091323375702, "learning_rate": 2.565561528881744e-05, "loss": 0.4526, "step": 1020 }, { "epoch": 2.2930937675463223, "grad_norm": 0.13385091722011566, "learning_rate": 2.5502230873672177e-05, "loss": 0.4692, "step": 1021 }, { "epoch": 2.2953396967995507, "grad_norm": 0.13780003786087036, "learning_rate": 2.5349219358376082e-05, "loss": 0.4652, "step": 1022 }, { "epoch": 2.297585626052779, "grad_norm": 0.1325894445180893, "learning_rate": 2.519658178990727e-05, "loss": 0.4384, "step": 1023 }, { "epoch": 2.299831555306008, "grad_norm": 0.13235574960708618, "learning_rate": 2.5044319212685066e-05, "loss": 0.454, "step": 1024 }, { "epoch": 2.3020774845592364, "grad_norm": 0.13442382216453552, "learning_rate": 2.4892432668563017e-05, "loss": 0.4449, "step": 1025 }, { "epoch": 2.304323413812465, "grad_norm": 0.1442955881357193, "learning_rate": 2.4740923196821653e-05, "loss": 0.4764, "step": 1026 }, { "epoch": 2.3065693430656933, "grad_norm": 0.13242414593696594, "learning_rate": 2.4589791834161324e-05, "loss": 0.44, "step": 1027 }, { "epoch": 2.308815272318922, "grad_norm": 0.1390787959098816, "learning_rate": 2.443903961469528e-05, "loss": 0.4671, "step": 1028 }, { "epoch": 2.3110612015721506, "grad_norm": 0.14238110184669495, "learning_rate": 2.4288667569942402e-05, "loss": 0.4375, "step": 1029 }, { "epoch": 2.313307130825379, "grad_norm": 0.14821192622184753, "learning_rate": 2.4138676728820274e-05, "loss": 0.4575, "step": 1030 }, { "epoch": 2.3155530600786074, "grad_norm": 0.1424325704574585, "learning_rate": 2.3989068117638114e-05, "loss": 0.4418, "step": 1031 }, { "epoch": 2.317798989331836, "grad_norm": 0.1394152194261551, "learning_rate": 2.383984276008975e-05, "loss": 0.4298, "step": 1032 }, { "epoch": 2.3200449185850647, "grad_norm": 0.1432042270898819, "learning_rate": 2.3691001677246552e-05, "loss": 0.4409, "step": 1033 }, { "epoch": 2.322290847838293, "grad_norm": 0.14173389971256256, "learning_rate": 2.354254588755051e-05, "loss": 0.4557, "step": 1034 }, { "epoch": 2.3245367770915215, "grad_norm": 0.1387631595134735, "learning_rate": 2.339447640680728e-05, "loss": 0.4562, "step": 1035 }, { "epoch": 2.3267827063447504, "grad_norm": 0.14601486921310425, "learning_rate": 2.3246794248179203e-05, "loss": 0.4496, "step": 1036 }, { "epoch": 2.329028635597979, "grad_norm": 0.13562379777431488, "learning_rate": 2.309950042217838e-05, "loss": 0.4385, "step": 1037 }, { "epoch": 2.331274564851207, "grad_norm": 0.14119566977024078, "learning_rate": 2.2952595936659757e-05, "loss": 0.4468, "step": 1038 }, { "epoch": 2.3335204941044356, "grad_norm": 0.13435381650924683, "learning_rate": 2.2806081796814193e-05, "loss": 0.4479, "step": 1039 }, { "epoch": 2.335766423357664, "grad_norm": 0.14311861991882324, "learning_rate": 2.2659959005161617e-05, "loss": 0.4466, "step": 1040 }, { "epoch": 2.338012352610893, "grad_norm": 0.13565625250339508, "learning_rate": 2.25142285615442e-05, "loss": 0.4656, "step": 1041 }, { "epoch": 2.3402582818641213, "grad_norm": 0.1413930356502533, "learning_rate": 2.2368891463119473e-05, "loss": 0.4426, "step": 1042 }, { "epoch": 2.3425042111173497, "grad_norm": 0.14812184870243073, "learning_rate": 2.222394870435352e-05, "loss": 0.4617, "step": 1043 }, { "epoch": 2.344750140370578, "grad_norm": 0.1381373107433319, "learning_rate": 2.2079401277014102e-05, "loss": 0.4506, "step": 1044 }, { "epoch": 2.346996069623807, "grad_norm": 0.1399037092924118, "learning_rate": 2.193525017016402e-05, "loss": 0.4427, "step": 1045 }, { "epoch": 2.3492419988770354, "grad_norm": 0.14365847408771515, "learning_rate": 2.1791496370154173e-05, "loss": 0.4575, "step": 1046 }, { "epoch": 2.351487928130264, "grad_norm": 0.13773076236248016, "learning_rate": 2.1648140860616974e-05, "loss": 0.4501, "step": 1047 }, { "epoch": 2.3537338573834923, "grad_norm": 0.13768814504146576, "learning_rate": 2.1505184622459517e-05, "loss": 0.4754, "step": 1048 }, { "epoch": 2.3559797866367207, "grad_norm": 0.13707469403743744, "learning_rate": 2.1362628633856836e-05, "loss": 0.4243, "step": 1049 }, { "epoch": 2.3582257158899496, "grad_norm": 0.1411537230014801, "learning_rate": 2.1220473870245347e-05, "loss": 0.463, "step": 1050 }, { "epoch": 2.360471645143178, "grad_norm": 0.1276266723871231, "learning_rate": 2.1078721304316064e-05, "loss": 0.4492, "step": 1051 }, { "epoch": 2.3627175743964064, "grad_norm": 0.13482601940631866, "learning_rate": 2.093737190600793e-05, "loss": 0.451, "step": 1052 }, { "epoch": 2.3649635036496353, "grad_norm": 0.13639169931411743, "learning_rate": 2.0796426642501305e-05, "loss": 0.4458, "step": 1053 }, { "epoch": 2.3672094329028637, "grad_norm": 0.128794863820076, "learning_rate": 2.065588647821116e-05, "loss": 0.452, "step": 1054 }, { "epoch": 2.369455362156092, "grad_norm": 0.13202716410160065, "learning_rate": 2.0515752374780664e-05, "loss": 0.4405, "step": 1055 }, { "epoch": 2.3717012914093205, "grad_norm": 0.15147733688354492, "learning_rate": 2.03760252910745e-05, "loss": 0.451, "step": 1056 }, { "epoch": 2.373947220662549, "grad_norm": 0.13587650656700134, "learning_rate": 2.023670618317235e-05, "loss": 0.4373, "step": 1057 }, { "epoch": 2.376193149915778, "grad_norm": 0.1358175277709961, "learning_rate": 2.009779600436228e-05, "loss": 0.4628, "step": 1058 }, { "epoch": 2.378439079169006, "grad_norm": 0.13308054208755493, "learning_rate": 1.995929570513427e-05, "loss": 0.4517, "step": 1059 }, { "epoch": 2.3806850084222346, "grad_norm": 0.14447179436683655, "learning_rate": 1.9821206233173756e-05, "loss": 0.464, "step": 1060 }, { "epoch": 2.382930937675463, "grad_norm": 0.1535249650478363, "learning_rate": 1.9683528533355077e-05, "loss": 0.4783, "step": 1061 }, { "epoch": 2.385176866928692, "grad_norm": 0.13172586262226105, "learning_rate": 1.9546263547735006e-05, "loss": 0.4451, "step": 1062 }, { "epoch": 2.3874227961819203, "grad_norm": 0.13454264402389526, "learning_rate": 1.9409412215546385e-05, "loss": 0.4326, "step": 1063 }, { "epoch": 2.3896687254351487, "grad_norm": 0.13548077642917633, "learning_rate": 1.9272975473191566e-05, "loss": 0.4725, "step": 1064 }, { "epoch": 2.391914654688377, "grad_norm": 0.1396332085132599, "learning_rate": 1.91369542542361e-05, "loss": 0.4433, "step": 1065 }, { "epoch": 2.394160583941606, "grad_norm": 0.13676691055297852, "learning_rate": 1.9001349489402374e-05, "loss": 0.4533, "step": 1066 }, { "epoch": 2.3964065131948344, "grad_norm": 0.138559028506279, "learning_rate": 1.886616210656314e-05, "loss": 0.4546, "step": 1067 }, { "epoch": 2.398652442448063, "grad_norm": 0.14537115395069122, "learning_rate": 1.873139303073529e-05, "loss": 0.4505, "step": 1068 }, { "epoch": 2.4008983717012913, "grad_norm": 0.14567793905735016, "learning_rate": 1.859704318407336e-05, "loss": 0.4494, "step": 1069 }, { "epoch": 2.40314430095452, "grad_norm": 0.16292881965637207, "learning_rate": 1.8463113485863423e-05, "loss": 0.4493, "step": 1070 }, { "epoch": 2.4053902302077486, "grad_norm": 0.1402868777513504, "learning_rate": 1.832960485251661e-05, "loss": 0.4546, "step": 1071 }, { "epoch": 2.407636159460977, "grad_norm": 0.13375958800315857, "learning_rate": 1.819651819756297e-05, "loss": 0.4469, "step": 1072 }, { "epoch": 2.4098820887142054, "grad_norm": 0.14132662117481232, "learning_rate": 1.80638544316452e-05, "loss": 0.4505, "step": 1073 }, { "epoch": 2.412128017967434, "grad_norm": 0.13755889236927032, "learning_rate": 1.7931614462512293e-05, "loss": 0.4704, "step": 1074 }, { "epoch": 2.4143739472206627, "grad_norm": 0.13184499740600586, "learning_rate": 1.7799799195013526e-05, "loss": 0.4369, "step": 1075 }, { "epoch": 2.416619876473891, "grad_norm": 0.13104869425296783, "learning_rate": 1.7668409531092097e-05, "loss": 0.4521, "step": 1076 }, { "epoch": 2.4188658057271195, "grad_norm": 0.135769784450531, "learning_rate": 1.7537446369779072e-05, "loss": 0.4674, "step": 1077 }, { "epoch": 2.421111734980348, "grad_norm": 0.13897131383419037, "learning_rate": 1.740691060718712e-05, "loss": 0.4401, "step": 1078 }, { "epoch": 2.423357664233577, "grad_norm": 0.12773634493350983, "learning_rate": 1.72768031365045e-05, "loss": 0.4339, "step": 1079 }, { "epoch": 2.425603593486805, "grad_norm": 0.13083034753799438, "learning_rate": 1.7147124847988834e-05, "loss": 0.451, "step": 1080 }, { "epoch": 2.4278495227400336, "grad_norm": 0.13339859247207642, "learning_rate": 1.7017876628961126e-05, "loss": 0.4495, "step": 1081 }, { "epoch": 2.430095451993262, "grad_norm": 0.13018065690994263, "learning_rate": 1.6889059363799623e-05, "loss": 0.4483, "step": 1082 }, { "epoch": 2.432341381246491, "grad_norm": 0.13034923374652863, "learning_rate": 1.67606739339338e-05, "loss": 0.4381, "step": 1083 }, { "epoch": 2.4345873104997193, "grad_norm": 0.1323402225971222, "learning_rate": 1.6632721217838258e-05, "loss": 0.4414, "step": 1084 }, { "epoch": 2.4368332397529477, "grad_norm": 0.13824905455112457, "learning_rate": 1.650520209102677e-05, "loss": 0.4469, "step": 1085 }, { "epoch": 2.439079169006176, "grad_norm": 0.12723715603351593, "learning_rate": 1.6378117426046332e-05, "loss": 0.4551, "step": 1086 }, { "epoch": 2.441325098259405, "grad_norm": 0.12957409024238586, "learning_rate": 1.6251468092471093e-05, "loss": 0.4435, "step": 1087 }, { "epoch": 2.4435710275126334, "grad_norm": 0.13387183845043182, "learning_rate": 1.612525495689651e-05, "loss": 0.4321, "step": 1088 }, { "epoch": 2.445816956765862, "grad_norm": 0.13002759218215942, "learning_rate": 1.5999478882933325e-05, "loss": 0.4461, "step": 1089 }, { "epoch": 2.4480628860190903, "grad_norm": 0.13771192729473114, "learning_rate": 1.5874140731201694e-05, "loss": 0.4337, "step": 1090 }, { "epoch": 2.4503088152723187, "grad_norm": 0.13762550055980682, "learning_rate": 1.574924135932529e-05, "loss": 0.4435, "step": 1091 }, { "epoch": 2.4525547445255476, "grad_norm": 0.13518671691417694, "learning_rate": 1.5624781621925462e-05, "loss": 0.4457, "step": 1092 }, { "epoch": 2.454800673778776, "grad_norm": 0.13244876265525818, "learning_rate": 1.5500762370615392e-05, "loss": 0.4466, "step": 1093 }, { "epoch": 2.4570466030320044, "grad_norm": 0.1363506317138672, "learning_rate": 1.5377184453994232e-05, "loss": 0.4397, "step": 1094 }, { "epoch": 2.4592925322852333, "grad_norm": 0.13642770051956177, "learning_rate": 1.5254048717641268e-05, "loss": 0.4525, "step": 1095 }, { "epoch": 2.4615384615384617, "grad_norm": 0.129640594124794, "learning_rate": 1.5131356004110234e-05, "loss": 0.4743, "step": 1096 }, { "epoch": 2.46378439079169, "grad_norm": 0.12901091575622559, "learning_rate": 1.500910715292343e-05, "loss": 0.4579, "step": 1097 }, { "epoch": 2.4660303200449185, "grad_norm": 0.1358920782804489, "learning_rate": 1.4887303000566103e-05, "loss": 0.4218, "step": 1098 }, { "epoch": 2.468276249298147, "grad_norm": 0.13251328468322754, "learning_rate": 1.4765944380480633e-05, "loss": 0.454, "step": 1099 }, { "epoch": 2.470522178551376, "grad_norm": 0.13257341086864471, "learning_rate": 1.464503212306081e-05, "loss": 0.4534, "step": 1100 }, { "epoch": 2.472768107804604, "grad_norm": 0.135364830493927, "learning_rate": 1.4524567055646261e-05, "loss": 0.4535, "step": 1101 }, { "epoch": 2.4750140370578326, "grad_norm": 0.13053563237190247, "learning_rate": 1.4404550002516709e-05, "loss": 0.469, "step": 1102 }, { "epoch": 2.477259966311061, "grad_norm": 0.12724533677101135, "learning_rate": 1.4284981784886314e-05, "loss": 0.4409, "step": 1103 }, { "epoch": 2.47950589556429, "grad_norm": 0.13512974977493286, "learning_rate": 1.4165863220898132e-05, "loss": 0.4644, "step": 1104 }, { "epoch": 2.4817518248175183, "grad_norm": 0.1417611837387085, "learning_rate": 1.404719512561843e-05, "loss": 0.4507, "step": 1105 }, { "epoch": 2.4839977540707467, "grad_norm": 0.13797731697559357, "learning_rate": 1.3928978311031194e-05, "loss": 0.4427, "step": 1106 }, { "epoch": 2.486243683323975, "grad_norm": 0.13513045012950897, "learning_rate": 1.3811213586032506e-05, "loss": 0.4495, "step": 1107 }, { "epoch": 2.4884896125772036, "grad_norm": 0.13863462209701538, "learning_rate": 1.369390175642507e-05, "loss": 0.4447, "step": 1108 }, { "epoch": 2.4907355418304324, "grad_norm": 0.13138817250728607, "learning_rate": 1.3577043624912602e-05, "loss": 0.4433, "step": 1109 }, { "epoch": 2.492981471083661, "grad_norm": 0.13766634464263916, "learning_rate": 1.3460639991094423e-05, "loss": 0.4569, "step": 1110 }, { "epoch": 2.4952274003368893, "grad_norm": 0.13439221680164337, "learning_rate": 1.3344691651459987e-05, "loss": 0.4527, "step": 1111 }, { "epoch": 2.497473329590118, "grad_norm": 0.12861117720603943, "learning_rate": 1.3229199399383395e-05, "loss": 0.4226, "step": 1112 }, { "epoch": 2.4997192588433466, "grad_norm": 0.135506734251976, "learning_rate": 1.3114164025117968e-05, "loss": 0.4355, "step": 1113 }, { "epoch": 2.501965188096575, "grad_norm": 0.13466140627861023, "learning_rate": 1.299958631579088e-05, "loss": 0.4613, "step": 1114 }, { "epoch": 2.5042111173498034, "grad_norm": 0.131247416138649, "learning_rate": 1.2885467055397691e-05, "loss": 0.4421, "step": 1115 }, { "epoch": 2.506457046603032, "grad_norm": 0.13447698950767517, "learning_rate": 1.2771807024797052e-05, "loss": 0.438, "step": 1116 }, { "epoch": 2.5087029758562607, "grad_norm": 0.14003418385982513, "learning_rate": 1.2658607001705359e-05, "loss": 0.4327, "step": 1117 }, { "epoch": 2.510948905109489, "grad_norm": 0.13097427785396576, "learning_rate": 1.254586776069143e-05, "loss": 0.4427, "step": 1118 }, { "epoch": 2.5131948343627175, "grad_norm": 0.1318497210741043, "learning_rate": 1.2433590073171175e-05, "loss": 0.4516, "step": 1119 }, { "epoch": 2.5154407636159464, "grad_norm": 0.12292584031820297, "learning_rate": 1.23217747074023e-05, "loss": 0.4355, "step": 1120 }, { "epoch": 2.517686692869175, "grad_norm": 0.12714707851409912, "learning_rate": 1.2210422428479122e-05, "loss": 0.4457, "step": 1121 }, { "epoch": 2.519932622122403, "grad_norm": 0.13449381291866302, "learning_rate": 1.2099533998327328e-05, "loss": 0.443, "step": 1122 }, { "epoch": 2.5221785513756316, "grad_norm": 0.1288016140460968, "learning_rate": 1.1989110175698629e-05, "loss": 0.4488, "step": 1123 }, { "epoch": 2.52442448062886, "grad_norm": 0.12953847646713257, "learning_rate": 1.1879151716165782e-05, "loss": 0.4327, "step": 1124 }, { "epoch": 2.5266704098820885, "grad_norm": 0.1303713619709015, "learning_rate": 1.1769659372117208e-05, "loss": 0.4452, "step": 1125 }, { "epoch": 2.5289163391353173, "grad_norm": 0.12560470402240753, "learning_rate": 1.1660633892752018e-05, "loss": 0.453, "step": 1126 }, { "epoch": 2.5311622683885457, "grad_norm": 0.1277565062046051, "learning_rate": 1.1552076024074767e-05, "loss": 0.4342, "step": 1127 }, { "epoch": 2.533408197641774, "grad_norm": 0.13078947365283966, "learning_rate": 1.1443986508890438e-05, "loss": 0.4529, "step": 1128 }, { "epoch": 2.535654126895003, "grad_norm": 0.13425932824611664, "learning_rate": 1.1336366086799262e-05, "loss": 0.4608, "step": 1129 }, { "epoch": 2.5379000561482314, "grad_norm": 0.12628474831581116, "learning_rate": 1.1229215494191724e-05, "loss": 0.4679, "step": 1130 }, { "epoch": 2.54014598540146, "grad_norm": 0.12629267573356628, "learning_rate": 1.112253546424352e-05, "loss": 0.4525, "step": 1131 }, { "epoch": 2.5423919146546883, "grad_norm": 0.1339423656463623, "learning_rate": 1.1016326726910554e-05, "loss": 0.4601, "step": 1132 }, { "epoch": 2.5446378439079167, "grad_norm": 0.12335141748189926, "learning_rate": 1.0910590008923871e-05, "loss": 0.444, "step": 1133 }, { "epoch": 2.5468837731611456, "grad_norm": 0.12865717709064484, "learning_rate": 1.0805326033784804e-05, "loss": 0.4384, "step": 1134 }, { "epoch": 2.549129702414374, "grad_norm": 0.13087087869644165, "learning_rate": 1.0700535521759874e-05, "loss": 0.4367, "step": 1135 }, { "epoch": 2.5513756316676024, "grad_norm": 0.12297067791223526, "learning_rate": 1.0596219189875963e-05, "loss": 0.4431, "step": 1136 }, { "epoch": 2.5536215609208313, "grad_norm": 0.13361698389053345, "learning_rate": 1.049237775191542e-05, "loss": 0.4345, "step": 1137 }, { "epoch": 2.5558674901740597, "grad_norm": 0.1307375282049179, "learning_rate": 1.0389011918411103e-05, "loss": 0.469, "step": 1138 }, { "epoch": 2.558113419427288, "grad_norm": 0.13051824271678925, "learning_rate": 1.0286122396641587e-05, "loss": 0.464, "step": 1139 }, { "epoch": 2.5603593486805165, "grad_norm": 0.13012929260730743, "learning_rate": 1.0183709890626301e-05, "loss": 0.4517, "step": 1140 }, { "epoch": 2.562605277933745, "grad_norm": 0.13006287813186646, "learning_rate": 1.0081775101120645e-05, "loss": 0.4565, "step": 1141 }, { "epoch": 2.5648512071869733, "grad_norm": 0.12601535022258759, "learning_rate": 9.980318725611294e-06, "loss": 0.4355, "step": 1142 }, { "epoch": 2.567097136440202, "grad_norm": 0.13367784023284912, "learning_rate": 9.879341458311394e-06, "loss": 0.459, "step": 1143 }, { "epoch": 2.5693430656934306, "grad_norm": 0.13120903074741364, "learning_rate": 9.778843990155784e-06, "loss": 0.4516, "step": 1144 }, { "epoch": 2.571588994946659, "grad_norm": 0.12156583368778229, "learning_rate": 9.67882700879632e-06, "loss": 0.4366, "step": 1145 }, { "epoch": 2.573834924199888, "grad_norm": 0.12496156245470047, "learning_rate": 9.57929119859708e-06, "loss": 0.4503, "step": 1146 }, { "epoch": 2.5760808534531163, "grad_norm": 0.1285991668701172, "learning_rate": 9.480237240629794e-06, "loss": 0.4546, "step": 1147 }, { "epoch": 2.5783267827063447, "grad_norm": 0.12715794146060944, "learning_rate": 9.381665812669074e-06, "loss": 0.4353, "step": 1148 }, { "epoch": 2.580572711959573, "grad_norm": 0.12791746854782104, "learning_rate": 9.283577589187884e-06, "loss": 0.4783, "step": 1149 }, { "epoch": 2.5828186412128016, "grad_norm": 0.12204549461603165, "learning_rate": 9.185973241352859e-06, "loss": 0.4475, "step": 1150 }, { "epoch": 2.5850645704660304, "grad_norm": 0.12769286334514618, "learning_rate": 9.088853437019688e-06, "loss": 0.44, "step": 1151 }, { "epoch": 2.587310499719259, "grad_norm": 0.12649452686309814, "learning_rate": 8.99221884072862e-06, "loss": 0.44, "step": 1152 }, { "epoch": 2.5895564289724873, "grad_norm": 0.12873002886772156, "learning_rate": 8.896070113699874e-06, "loss": 0.4356, "step": 1153 }, { "epoch": 2.591802358225716, "grad_norm": 0.12493567168712616, "learning_rate": 8.800407913829088e-06, "loss": 0.456, "step": 1154 }, { "epoch": 2.5940482874789446, "grad_norm": 0.12773042917251587, "learning_rate": 8.705232895682906e-06, "loss": 0.4502, "step": 1155 }, { "epoch": 2.596294216732173, "grad_norm": 0.1301664263010025, "learning_rate": 8.610545710494356e-06, "loss": 0.441, "step": 1156 }, { "epoch": 2.5985401459854014, "grad_norm": 0.136691614985466, "learning_rate": 8.516347006158567e-06, "loss": 0.4451, "step": 1157 }, { "epoch": 2.60078607523863, "grad_norm": 0.12582361698150635, "learning_rate": 8.422637427228193e-06, "loss": 0.4477, "step": 1158 }, { "epoch": 2.6030320044918582, "grad_norm": 0.12166401743888855, "learning_rate": 8.329417614909094e-06, "loss": 0.4402, "step": 1159 }, { "epoch": 2.605277933745087, "grad_norm": 0.12802627682685852, "learning_rate": 8.236688207055885e-06, "loss": 0.4545, "step": 1160 }, { "epoch": 2.6075238629983155, "grad_norm": 0.1304531693458557, "learning_rate": 8.144449838167579e-06, "loss": 0.4655, "step": 1161 }, { "epoch": 2.609769792251544, "grad_norm": 0.12477454543113708, "learning_rate": 8.052703139383315e-06, "loss": 0.4568, "step": 1162 }, { "epoch": 2.612015721504773, "grad_norm": 0.12605507671833038, "learning_rate": 7.96144873847796e-06, "loss": 0.4558, "step": 1163 }, { "epoch": 2.614261650758001, "grad_norm": 0.12706461548805237, "learning_rate": 7.870687259857858e-06, "loss": 0.4343, "step": 1164 }, { "epoch": 2.6165075800112296, "grad_norm": 0.12751144170761108, "learning_rate": 7.78041932455655e-06, "loss": 0.4554, "step": 1165 }, { "epoch": 2.618753509264458, "grad_norm": 0.12677204608917236, "learning_rate": 7.690645550230482e-06, "loss": 0.4587, "step": 1166 }, { "epoch": 2.6209994385176865, "grad_norm": 0.12588229775428772, "learning_rate": 7.6013665511548114e-06, "loss": 0.4358, "step": 1167 }, { "epoch": 2.6232453677709153, "grad_norm": 0.12063749879598618, "learning_rate": 7.512582938219259e-06, "loss": 0.4384, "step": 1168 }, { "epoch": 2.6254912970241437, "grad_norm": 0.12080162763595581, "learning_rate": 7.424295318923831e-06, "loss": 0.4542, "step": 1169 }, { "epoch": 2.627737226277372, "grad_norm": 0.12560433149337769, "learning_rate": 7.336504297374749e-06, "loss": 0.4493, "step": 1170 }, { "epoch": 2.629983155530601, "grad_norm": 9.130139350891113, "learning_rate": 7.249210474280208e-06, "loss": 0.4636, "step": 1171 }, { "epoch": 2.6322290847838294, "grad_norm": 0.12350396066904068, "learning_rate": 7.162414446946395e-06, "loss": 0.4543, "step": 1172 }, { "epoch": 2.634475014037058, "grad_norm": 0.12666672468185425, "learning_rate": 7.076116809273323e-06, "loss": 0.4633, "step": 1173 }, { "epoch": 2.6367209432902863, "grad_norm": 0.12505994737148285, "learning_rate": 6.990318151750757e-06, "loss": 0.4401, "step": 1174 }, { "epoch": 2.6389668725435147, "grad_norm": 0.1194506362080574, "learning_rate": 6.9050190614542565e-06, "loss": 0.4625, "step": 1175 }, { "epoch": 2.6412128017967436, "grad_norm": 0.12401262670755386, "learning_rate": 6.8202201220410255e-06, "loss": 0.4357, "step": 1176 }, { "epoch": 2.643458731049972, "grad_norm": 0.12455414235591888, "learning_rate": 6.73592191374607e-06, "loss": 0.4494, "step": 1177 }, { "epoch": 2.6457046603032004, "grad_norm": 0.12066637724637985, "learning_rate": 6.652125013378108e-06, "loss": 0.4565, "step": 1178 }, { "epoch": 2.647950589556429, "grad_norm": 0.12697719037532806, "learning_rate": 6.5688299943157e-06, "loss": 0.4434, "step": 1179 }, { "epoch": 2.6501965188096577, "grad_norm": 0.12216756492853165, "learning_rate": 6.486037426503276e-06, "loss": 0.4461, "step": 1180 }, { "epoch": 2.652442448062886, "grad_norm": 0.12145403027534485, "learning_rate": 6.403747876447232e-06, "loss": 0.4506, "step": 1181 }, { "epoch": 2.6546883773161145, "grad_norm": 0.11756281554698944, "learning_rate": 6.321961907212109e-06, "loss": 0.463, "step": 1182 }, { "epoch": 2.656934306569343, "grad_norm": 0.12291593104600906, "learning_rate": 6.240680078416699e-06, "loss": 0.4538, "step": 1183 }, { "epoch": 2.6591802358225713, "grad_norm": 0.12477383762598038, "learning_rate": 6.15990294623023e-06, "loss": 0.456, "step": 1184 }, { "epoch": 2.6614261650758, "grad_norm": 0.12275049090385437, "learning_rate": 6.079631063368547e-06, "loss": 0.4443, "step": 1185 }, { "epoch": 2.6636720943290286, "grad_norm": 0.12498319894075394, "learning_rate": 5.999864979090326e-06, "loss": 0.4487, "step": 1186 }, { "epoch": 2.665918023582257, "grad_norm": 0.11939443647861481, "learning_rate": 5.92060523919332e-06, "loss": 0.4285, "step": 1187 }, { "epoch": 2.668163952835486, "grad_norm": 0.12449135631322861, "learning_rate": 5.8418523860106665e-06, "loss": 0.4609, "step": 1188 }, { "epoch": 2.6704098820887143, "grad_norm": 0.12374921143054962, "learning_rate": 5.763606958407116e-06, "loss": 0.4441, "step": 1189 }, { "epoch": 2.6726558113419427, "grad_norm": 0.11954803764820099, "learning_rate": 5.6858694917754e-06, "loss": 0.4566, "step": 1190 }, { "epoch": 2.674901740595171, "grad_norm": 0.12245208770036697, "learning_rate": 5.6086405180324665e-06, "loss": 0.4519, "step": 1191 }, { "epoch": 2.6771476698483996, "grad_norm": 0.1250237375497818, "learning_rate": 5.531920565616e-06, "loss": 0.4364, "step": 1192 }, { "epoch": 2.6793935991016284, "grad_norm": 0.12335599958896637, "learning_rate": 5.455710159480649e-06, "loss": 0.4513, "step": 1193 }, { "epoch": 2.681639528354857, "grad_norm": 0.12619943916797638, "learning_rate": 5.380009821094536e-06, "loss": 0.4531, "step": 1194 }, { "epoch": 2.6838854576080853, "grad_norm": 0.1240544244647026, "learning_rate": 5.30482006843565e-06, "loss": 0.4396, "step": 1195 }, { "epoch": 2.686131386861314, "grad_norm": 0.12158697843551636, "learning_rate": 5.230141415988312e-06, "loss": 0.4426, "step": 1196 }, { "epoch": 2.6883773161145426, "grad_norm": 0.12433162331581116, "learning_rate": 5.155974374739634e-06, "loss": 0.447, "step": 1197 }, { "epoch": 2.690623245367771, "grad_norm": 0.12310656160116196, "learning_rate": 5.082319452176068e-06, "loss": 0.4359, "step": 1198 }, { "epoch": 2.6928691746209994, "grad_norm": 0.11813896149396896, "learning_rate": 5.009177152279865e-06, "loss": 0.4538, "step": 1199 }, { "epoch": 2.695115103874228, "grad_norm": 0.12028888612985611, "learning_rate": 4.936547975525692e-06, "loss": 0.4334, "step": 1200 }, { "epoch": 2.6973610331274562, "grad_norm": 0.1224963515996933, "learning_rate": 4.864432418877192e-06, "loss": 0.4454, "step": 1201 }, { "epoch": 2.699606962380685, "grad_norm": 0.12296409159898758, "learning_rate": 4.792830975783531e-06, "loss": 0.4439, "step": 1202 }, { "epoch": 2.7018528916339135, "grad_norm": 0.11706443876028061, "learning_rate": 4.721744136176103e-06, "loss": 0.4288, "step": 1203 }, { "epoch": 2.704098820887142, "grad_norm": 0.12277070432901382, "learning_rate": 4.651172386465152e-06, "loss": 0.454, "step": 1204 }, { "epoch": 2.706344750140371, "grad_norm": 0.12013454735279083, "learning_rate": 4.581116209536358e-06, "loss": 0.4405, "step": 1205 }, { "epoch": 2.708590679393599, "grad_norm": 0.12198374420404434, "learning_rate": 4.511576084747696e-06, "loss": 0.4646, "step": 1206 }, { "epoch": 2.7108366086468276, "grad_norm": 0.11776817589998245, "learning_rate": 4.442552487925982e-06, "loss": 0.4494, "step": 1207 }, { "epoch": 2.713082537900056, "grad_norm": 0.12356902658939362, "learning_rate": 4.3740458913637605e-06, "loss": 0.4578, "step": 1208 }, { "epoch": 2.7153284671532845, "grad_norm": 0.11953306198120117, "learning_rate": 4.3060567638159775e-06, "loss": 0.4379, "step": 1209 }, { "epoch": 2.7175743964065133, "grad_norm": 0.12432871758937836, "learning_rate": 4.238585570496847e-06, "loss": 0.4441, "step": 1210 }, { "epoch": 2.7198203256597417, "grad_norm": 0.11917420476675034, "learning_rate": 4.171632773076581e-06, "loss": 0.4477, "step": 1211 }, { "epoch": 2.72206625491297, "grad_norm": 0.11728362739086151, "learning_rate": 4.105198829678285e-06, "loss": 0.4638, "step": 1212 }, { "epoch": 2.724312184166199, "grad_norm": 0.1192561611533165, "learning_rate": 4.039284194874862e-06, "loss": 0.427, "step": 1213 }, { "epoch": 2.7265581134194274, "grad_norm": 0.11842131614685059, "learning_rate": 3.973889319685809e-06, "loss": 0.4321, "step": 1214 }, { "epoch": 2.728804042672656, "grad_norm": 0.11767691373825073, "learning_rate": 3.909014651574197e-06, "loss": 0.4407, "step": 1215 }, { "epoch": 2.7310499719258843, "grad_norm": 0.1151251420378685, "learning_rate": 3.844660634443616e-06, "loss": 0.4472, "step": 1216 }, { "epoch": 2.7332959011791127, "grad_norm": 0.1207621842622757, "learning_rate": 3.7808277086350464e-06, "loss": 0.4326, "step": 1217 }, { "epoch": 2.735541830432341, "grad_norm": 0.11696569621562958, "learning_rate": 3.7175163109239855e-06, "loss": 0.4421, "step": 1218 }, { "epoch": 2.73778775968557, "grad_norm": 0.11953862756490707, "learning_rate": 3.6547268745173247e-06, "loss": 0.4382, "step": 1219 }, { "epoch": 2.7400336889387984, "grad_norm": 0.12477323412895203, "learning_rate": 3.5924598290504855e-06, "loss": 0.4477, "step": 1220 }, { "epoch": 2.742279618192027, "grad_norm": 0.11988485604524612, "learning_rate": 3.530715600584449e-06, "loss": 0.4432, "step": 1221 }, { "epoch": 2.7445255474452557, "grad_norm": 0.12258612364530563, "learning_rate": 3.469494611602775e-06, "loss": 0.4575, "step": 1222 }, { "epoch": 2.746771476698484, "grad_norm": 0.125362828373909, "learning_rate": 3.4087972810088287e-06, "loss": 0.4358, "step": 1223 }, { "epoch": 2.7490174059517125, "grad_norm": 0.11876025050878525, "learning_rate": 3.348624024122824e-06, "loss": 0.4377, "step": 1224 }, { "epoch": 2.751263335204941, "grad_norm": 0.1166381686925888, "learning_rate": 3.2889752526790165e-06, "loss": 0.4348, "step": 1225 }, { "epoch": 2.7535092644581693, "grad_norm": 0.1194562315940857, "learning_rate": 3.2298513748228787e-06, "loss": 0.4443, "step": 1226 }, { "epoch": 2.755755193711398, "grad_norm": 0.11869972944259644, "learning_rate": 3.1712527951083126e-06, "loss": 0.4479, "step": 1227 }, { "epoch": 2.7580011229646266, "grad_norm": 0.11969739198684692, "learning_rate": 3.1131799144948683e-06, "loss": 0.454, "step": 1228 }, { "epoch": 2.760247052217855, "grad_norm": 0.12087547779083252, "learning_rate": 3.0556331303450437e-06, "loss": 0.4341, "step": 1229 }, { "epoch": 2.762492981471084, "grad_norm": 0.12332521378993988, "learning_rate": 2.998612836421506e-06, "loss": 0.4513, "step": 1230 }, { "epoch": 2.7647389107243123, "grad_norm": 0.12205971032381058, "learning_rate": 2.9421194228844084e-06, "loss": 0.4501, "step": 1231 }, { "epoch": 2.7669848399775407, "grad_norm": 0.12263938784599304, "learning_rate": 2.88615327628877e-06, "loss": 0.4504, "step": 1232 }, { "epoch": 2.769230769230769, "grad_norm": 0.12588439881801605, "learning_rate": 2.830714779581776e-06, "loss": 0.4397, "step": 1233 }, { "epoch": 2.7714766984839976, "grad_norm": 0.12059302628040314, "learning_rate": 2.7758043121001834e-06, "loss": 0.4354, "step": 1234 }, { "epoch": 2.7737226277372264, "grad_norm": 0.11515524238348007, "learning_rate": 2.721422249567729e-06, "loss": 0.4495, "step": 1235 }, { "epoch": 2.775968556990455, "grad_norm": 0.11858617514371872, "learning_rate": 2.667568964092544e-06, "loss": 0.4572, "step": 1236 }, { "epoch": 2.7782144862436833, "grad_norm": 0.1132800504565239, "learning_rate": 2.6142448241646046e-06, "loss": 0.4492, "step": 1237 }, { "epoch": 2.7804604154969117, "grad_norm": 0.12041954696178436, "learning_rate": 2.561450194653219e-06, "loss": 0.444, "step": 1238 }, { "epoch": 2.7827063447501406, "grad_norm": 0.1182764321565628, "learning_rate": 2.509185436804549e-06, "loss": 0.4338, "step": 1239 }, { "epoch": 2.784952274003369, "grad_norm": 0.13408203423023224, "learning_rate": 2.457450908239114e-06, "loss": 0.4487, "step": 1240 }, { "epoch": 2.7871982032565974, "grad_norm": 0.12381100654602051, "learning_rate": 2.4062469629493412e-06, "loss": 0.4364, "step": 1241 }, { "epoch": 2.789444132509826, "grad_norm": 0.12140806019306183, "learning_rate": 2.3555739512971565e-06, "loss": 0.4296, "step": 1242 }, { "epoch": 2.7916900617630542, "grad_norm": 0.12192130833864212, "learning_rate": 2.3054322200115963e-06, "loss": 0.4537, "step": 1243 }, { "epoch": 2.793935991016283, "grad_norm": 0.11975108832120895, "learning_rate": 2.255822112186401e-06, "loss": 0.4735, "step": 1244 }, { "epoch": 2.7961819202695115, "grad_norm": 0.12235341221094131, "learning_rate": 2.2067439672777047e-06, "loss": 0.451, "step": 1245 }, { "epoch": 2.79842784952274, "grad_norm": 0.12043313682079315, "learning_rate": 2.158198121101691e-06, "loss": 0.4381, "step": 1246 }, { "epoch": 2.800673778775969, "grad_norm": 0.12066707760095596, "learning_rate": 2.1101849058322932e-06, "loss": 0.4435, "step": 1247 }, { "epoch": 2.802919708029197, "grad_norm": 0.1135956272482872, "learning_rate": 2.062704649998937e-06, "loss": 0.4406, "step": 1248 }, { "epoch": 2.8051656372824256, "grad_norm": 0.12277340143918991, "learning_rate": 2.0157576784843024e-06, "loss": 0.4661, "step": 1249 }, { "epoch": 2.807411566535654, "grad_norm": 0.11728162318468094, "learning_rate": 1.9693443125220346e-06, "loss": 0.4431, "step": 1250 }, { "epoch": 2.8096574957888825, "grad_norm": 0.11668264865875244, "learning_rate": 1.9234648696946354e-06, "loss": 0.4388, "step": 1251 }, { "epoch": 2.8119034250421113, "grad_norm": 0.11695986986160278, "learning_rate": 1.878119663931246e-06, "loss": 0.4494, "step": 1252 }, { "epoch": 2.8141493542953397, "grad_norm": 0.1159198209643364, "learning_rate": 1.833309005505477e-06, "loss": 0.4311, "step": 1253 }, { "epoch": 2.816395283548568, "grad_norm": 0.1172918975353241, "learning_rate": 1.7890332010333233e-06, "loss": 0.4621, "step": 1254 }, { "epoch": 2.8186412128017966, "grad_norm": 0.12139487266540527, "learning_rate": 1.7452925534710763e-06, "loss": 0.4393, "step": 1255 }, { "epoch": 2.8208871420550254, "grad_norm": 0.11837179213762283, "learning_rate": 1.7020873621131738e-06, "loss": 0.447, "step": 1256 }, { "epoch": 2.823133071308254, "grad_norm": 0.12008003145456314, "learning_rate": 1.6594179225902652e-06, "loss": 0.4516, "step": 1257 }, { "epoch": 2.8253790005614823, "grad_norm": 0.11927176266908646, "learning_rate": 1.617284526867078e-06, "loss": 0.4404, "step": 1258 }, { "epoch": 2.8276249298147107, "grad_norm": 0.11693605035543442, "learning_rate": 1.5756874632405095e-06, "loss": 0.4438, "step": 1259 }, { "epoch": 2.829870859067939, "grad_norm": 0.11941110342741013, "learning_rate": 1.534627016337593e-06, "loss": 0.4426, "step": 1260 }, { "epoch": 2.832116788321168, "grad_norm": 0.11750718951225281, "learning_rate": 1.494103467113588e-06, "loss": 0.4322, "step": 1261 }, { "epoch": 2.8343627175743964, "grad_norm": 0.116007000207901, "learning_rate": 1.4541170928500248e-06, "loss": 0.4621, "step": 1262 }, { "epoch": 2.836608646827625, "grad_norm": 0.11818964034318924, "learning_rate": 1.4146681671528418e-06, "loss": 0.4638, "step": 1263 }, { "epoch": 2.8388545760808537, "grad_norm": 0.11916031688451767, "learning_rate": 1.3757569599504917e-06, "loss": 0.4425, "step": 1264 }, { "epoch": 2.841100505334082, "grad_norm": 0.11497969180345535, "learning_rate": 1.3373837374920862e-06, "loss": 0.4425, "step": 1265 }, { "epoch": 2.8433464345873105, "grad_norm": 0.11767168343067169, "learning_rate": 1.2995487623456194e-06, "loss": 0.4532, "step": 1266 }, { "epoch": 2.845592363840539, "grad_norm": 0.115963876247406, "learning_rate": 1.2622522933961112e-06, "loss": 0.4344, "step": 1267 }, { "epoch": 2.8478382930937673, "grad_norm": 0.11714527010917664, "learning_rate": 1.225494585843876e-06, "loss": 0.4678, "step": 1268 }, { "epoch": 2.850084222346996, "grad_norm": 0.11749914288520813, "learning_rate": 1.1892758912027546e-06, "loss": 0.4445, "step": 1269 }, { "epoch": 2.8523301516002246, "grad_norm": 0.11820235848426819, "learning_rate": 1.1535964572984093e-06, "loss": 0.4659, "step": 1270 }, { "epoch": 2.854576080853453, "grad_norm": 0.11582965403795242, "learning_rate": 1.118456528266636e-06, "loss": 0.4441, "step": 1271 }, { "epoch": 2.856822010106682, "grad_norm": 0.11765659600496292, "learning_rate": 1.0838563445516503e-06, "loss": 0.4441, "step": 1272 }, { "epoch": 2.8590679393599103, "grad_norm": 0.11495634913444519, "learning_rate": 1.0497961429044979e-06, "loss": 0.4397, "step": 1273 }, { "epoch": 2.8613138686131387, "grad_norm": 0.13958555459976196, "learning_rate": 1.0162761563813927e-06, "loss": 0.4332, "step": 1274 }, { "epoch": 2.863559797866367, "grad_norm": 0.11957214772701263, "learning_rate": 9.832966143421551e-07, "loss": 0.4476, "step": 1275 }, { "epoch": 2.8658057271195956, "grad_norm": 0.12185267359018326, "learning_rate": 9.508577424486031e-07, "loss": 0.4571, "step": 1276 }, { "epoch": 2.868051656372824, "grad_norm": 0.1512320339679718, "learning_rate": 9.18959762663043e-07, "loss": 0.4322, "step": 1277 }, { "epoch": 2.870297585626053, "grad_norm": 0.11999038606882095, "learning_rate": 8.876028932467417e-07, "loss": 0.4399, "step": 1278 }, { "epoch": 2.8725435148792813, "grad_norm": 0.11745017766952515, "learning_rate": 8.567873487584077e-07, "loss": 0.444, "step": 1279 }, { "epoch": 2.8747894441325097, "grad_norm": 0.12399045377969742, "learning_rate": 8.265133400527881e-07, "loss": 0.4421, "step": 1280 }, { "epoch": 2.8770353733857386, "grad_norm": 0.11552898585796356, "learning_rate": 7.967810742791404e-07, "loss": 0.4369, "step": 1281 }, { "epoch": 2.879281302638967, "grad_norm": 0.1216784194111824, "learning_rate": 7.675907548798744e-07, "loss": 0.4597, "step": 1282 }, { "epoch": 2.8815272318921954, "grad_norm": 0.12029793858528137, "learning_rate": 7.389425815891394e-07, "loss": 0.4455, "step": 1283 }, { "epoch": 2.883773161145424, "grad_norm": 0.11763288825750351, "learning_rate": 7.108367504314651e-07, "loss": 0.4422, "step": 1284 }, { "epoch": 2.8860190903986522, "grad_norm": 0.11679881066083908, "learning_rate": 6.832734537204299e-07, "loss": 0.4525, "step": 1285 }, { "epoch": 2.888265019651881, "grad_norm": 0.11851628869771957, "learning_rate": 6.562528800572931e-07, "loss": 0.4435, "step": 1286 }, { "epoch": 2.8905109489051095, "grad_norm": 0.11480539292097092, "learning_rate": 6.297752143297864e-07, "loss": 0.4484, "step": 1287 }, { "epoch": 2.892756878158338, "grad_norm": 0.12258218973875046, "learning_rate": 6.03840637710782e-07, "loss": 0.4504, "step": 1288 }, { "epoch": 2.895002807411567, "grad_norm": 0.12160119414329529, "learning_rate": 5.784493276570669e-07, "loss": 0.4401, "step": 1289 }, { "epoch": 2.897248736664795, "grad_norm": 0.1183420866727829, "learning_rate": 5.536014579081617e-07, "loss": 0.4523, "step": 1290 }, { "epoch": 2.8994946659180236, "grad_norm": 0.1185230165719986, "learning_rate": 5.292971984850948e-07, "loss": 0.4497, "step": 1291 }, { "epoch": 2.901740595171252, "grad_norm": 0.11411769688129425, "learning_rate": 5.055367156892654e-07, "loss": 0.4436, "step": 1292 }, { "epoch": 2.9039865244244805, "grad_norm": 0.11810418963432312, "learning_rate": 4.823201721012538e-07, "loss": 0.4435, "step": 1293 }, { "epoch": 2.906232453677709, "grad_norm": 0.11871050298213959, "learning_rate": 4.5964772657980827e-07, "loss": 0.4512, "step": 1294 }, { "epoch": 2.9084783829309377, "grad_norm": 0.12631046772003174, "learning_rate": 4.375195342606464e-07, "loss": 0.4352, "step": 1295 }, { "epoch": 2.910724312184166, "grad_norm": 0.11332812160253525, "learning_rate": 4.159357465554603e-07, "loss": 0.4344, "step": 1296 }, { "epoch": 2.9129702414373946, "grad_norm": 0.11570383608341217, "learning_rate": 3.9489651115087734e-07, "loss": 0.4491, "step": 1297 }, { "epoch": 2.9152161706906234, "grad_norm": 0.1170554980635643, "learning_rate": 3.7440197200741214e-07, "loss": 0.4314, "step": 1298 }, { "epoch": 2.917462099943852, "grad_norm": 0.11701026558876038, "learning_rate": 3.544522693585428e-07, "loss": 0.449, "step": 1299 }, { "epoch": 2.9197080291970803, "grad_norm": 0.11610274761915207, "learning_rate": 3.3504753970968083e-07, "loss": 0.4493, "step": 1300 }, { "epoch": 2.9219539584503087, "grad_norm": 0.1187182143330574, "learning_rate": 3.1618791583729157e-07, "loss": 0.4714, "step": 1301 }, { "epoch": 2.924199887703537, "grad_norm": 0.11808615922927856, "learning_rate": 2.97873526787944e-07, "loss": 0.4494, "step": 1302 }, { "epoch": 2.926445816956766, "grad_norm": 0.11943615227937698, "learning_rate": 2.801044978774758e-07, "loss": 0.444, "step": 1303 }, { "epoch": 2.9286917462099944, "grad_norm": 0.11159630864858627, "learning_rate": 2.6288095069009647e-07, "loss": 0.4365, "step": 1304 }, { "epoch": 2.930937675463223, "grad_norm": 0.11759793758392334, "learning_rate": 2.4620300307756975e-07, "loss": 0.4449, "step": 1305 }, { "epoch": 2.9331836047164517, "grad_norm": 0.11761987954378128, "learning_rate": 2.30070769158397e-07, "loss": 0.4392, "step": 1306 }, { "epoch": 2.93542953396968, "grad_norm": 0.11657937616109848, "learning_rate": 2.1448435931705315e-07, "loss": 0.4361, "step": 1307 }, { "epoch": 2.9376754632229085, "grad_norm": 0.11725448071956635, "learning_rate": 1.994438802032228e-07, "loss": 0.4267, "step": 1308 }, { "epoch": 2.939921392476137, "grad_norm": 0.12075719982385635, "learning_rate": 1.8494943473108095e-07, "loss": 0.4495, "step": 1309 }, { "epoch": 2.9421673217293653, "grad_norm": 0.11142679303884506, "learning_rate": 1.710011220785557e-07, "loss": 0.4275, "step": 1310 }, { "epoch": 2.944413250982594, "grad_norm": 0.1148485466837883, "learning_rate": 1.575990376866976e-07, "loss": 0.4362, "step": 1311 }, { "epoch": 2.9466591802358226, "grad_norm": 0.11960410326719284, "learning_rate": 1.4474327325897818e-07, "loss": 0.4507, "step": 1312 }, { "epoch": 2.948905109489051, "grad_norm": 0.11774080991744995, "learning_rate": 1.324339167607036e-07, "loss": 0.4571, "step": 1313 }, { "epoch": 2.9511510387422795, "grad_norm": 0.11617586016654968, "learning_rate": 1.2067105241839294e-07, "loss": 0.4501, "step": 1314 }, { "epoch": 2.9533969679955083, "grad_norm": 0.11817507445812225, "learning_rate": 1.0945476071918316e-07, "loss": 0.4471, "step": 1315 }, { "epoch": 2.9556428972487367, "grad_norm": 0.12056715786457062, "learning_rate": 9.878511841034056e-08, "loss": 0.4382, "step": 1316 }, { "epoch": 2.957888826501965, "grad_norm": 0.11966580897569656, "learning_rate": 8.866219849864799e-08, "loss": 0.4471, "step": 1317 }, { "epoch": 2.9601347557551936, "grad_norm": 0.11672661453485489, "learning_rate": 7.908607024999626e-08, "loss": 0.4636, "step": 1318 }, { "epoch": 2.962380685008422, "grad_norm": 0.12275572121143341, "learning_rate": 7.005679918882457e-08, "loss": 0.4388, "step": 1319 }, { "epoch": 2.964626614261651, "grad_norm": 0.12186376005411148, "learning_rate": 6.157444709773863e-08, "loss": 0.4538, "step": 1320 }, { "epoch": 2.9668725435148793, "grad_norm": 0.11584927141666412, "learning_rate": 5.3639072017057647e-08, "loss": 0.4464, "step": 1321 }, { "epoch": 2.9691184727681077, "grad_norm": 0.11662715673446655, "learning_rate": 4.625072824441468e-08, "loss": 0.439, "step": 1322 }, { "epoch": 2.9713644020213366, "grad_norm": 0.11534745246171951, "learning_rate": 3.940946633440135e-08, "loss": 0.4496, "step": 1323 }, { "epoch": 2.973610331274565, "grad_norm": 0.11853344738483429, "learning_rate": 3.3115333098212576e-08, "loss": 0.4498, "step": 1324 }, { "epoch": 2.9758562605277934, "grad_norm": 0.11632394790649414, "learning_rate": 2.7368371603326838e-08, "loss": 0.4311, "step": 1325 }, { "epoch": 2.978102189781022, "grad_norm": 0.11567545682191849, "learning_rate": 2.216862117319529e-08, "loss": 0.4427, "step": 1326 }, { "epoch": 2.9803481190342502, "grad_norm": 0.11598379909992218, "learning_rate": 1.7516117387010866e-08, "loss": 0.4452, "step": 1327 }, { "epoch": 2.982594048287479, "grad_norm": 0.12003415077924728, "learning_rate": 1.3410892079432914e-08, "loss": 0.4408, "step": 1328 }, { "epoch": 2.9848399775407075, "grad_norm": 0.11351985484361649, "learning_rate": 9.85297334037405e-09, "loss": 0.4529, "step": 1329 }, { "epoch": 2.987085906793936, "grad_norm": 0.11411769688129425, "learning_rate": 6.842385514831407e-09, "loss": 0.434, "step": 1330 }, { "epoch": 2.9893318360471643, "grad_norm": 0.12261338531970978, "learning_rate": 4.3791492026734604e-09, "loss": 0.4534, "step": 1331 }, { "epoch": 2.991577765300393, "grad_norm": 0.11999525874853134, "learning_rate": 2.463281258560102e-09, "loss": 0.448, "step": 1332 }, { "epoch": 2.9938236945536216, "grad_norm": 0.11168470978736877, "learning_rate": 1.094794791764997e-09, "loss": 0.4408, "step": 1333 }, { "epoch": 2.99606962380685, "grad_norm": 0.11737479269504547, "learning_rate": 2.736991661400623e-10, "loss": 0.4357, "step": 1334 }, { "epoch": 2.9983155530600785, "grad_norm": 0.11935008317232132, "learning_rate": 0.0, "loss": 0.4366, "step": 1335 }, { "epoch": 2.9983155530600785, "step": 1335, "total_flos": 4.209303851158733e+19, "train_loss": 0.5228641195690141, "train_runtime": 86845.4923, "train_samples_per_second": 3.937, "train_steps_per_second": 0.015 } ], "logging_steps": 1, "max_steps": 1335, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.209303851158733e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }