{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 3475, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014388489208633094, "grad_norm": 5.232170786665442, "learning_rate": 2.2988505747126437e-07, "loss": 0.8858, "step": 1 }, { "epoch": 0.0028776978417266188, "grad_norm": 5.189003189324628, "learning_rate": 4.5977011494252875e-07, "loss": 0.8824, "step": 2 }, { "epoch": 0.004316546762589928, "grad_norm": 5.307893904128877, "learning_rate": 6.896551724137931e-07, "loss": 0.8989, "step": 3 }, { "epoch": 0.0057553956834532375, "grad_norm": 5.24498684344306, "learning_rate": 9.195402298850575e-07, "loss": 0.8875, "step": 4 }, { "epoch": 0.007194244604316547, "grad_norm": 5.139649808396179, "learning_rate": 1.1494252873563219e-06, "loss": 0.8715, "step": 5 }, { "epoch": 0.008633093525179856, "grad_norm": 4.94061782589568, "learning_rate": 1.3793103448275862e-06, "loss": 0.8736, "step": 6 }, { "epoch": 0.010071942446043165, "grad_norm": 4.757209414249211, "learning_rate": 1.6091954022988506e-06, "loss": 0.8562, "step": 7 }, { "epoch": 0.011510791366906475, "grad_norm": 3.9956540897026795, "learning_rate": 1.839080459770115e-06, "loss": 0.8307, "step": 8 }, { "epoch": 0.012949640287769784, "grad_norm": 3.7568289996074418, "learning_rate": 2.0689655172413796e-06, "loss": 0.8259, "step": 9 }, { "epoch": 0.014388489208633094, "grad_norm": 2.192756078161156, "learning_rate": 2.2988505747126437e-06, "loss": 0.7865, "step": 10 }, { "epoch": 0.015827338129496403, "grad_norm": 2.1187206799876277, "learning_rate": 2.5287356321839083e-06, "loss": 0.7909, "step": 11 }, { "epoch": 0.017266187050359712, "grad_norm": 1.8660742674953652, "learning_rate": 2.7586206896551725e-06, "loss": 0.7736, "step": 12 }, { "epoch": 0.01870503597122302, "grad_norm": 1.5692999774699359, "learning_rate": 2.988505747126437e-06, "loss": 0.7577, "step": 13 }, { "epoch": 0.02014388489208633, "grad_norm": 3.0415774169771965, "learning_rate": 3.2183908045977012e-06, "loss": 0.7518, "step": 14 }, { "epoch": 0.02158273381294964, "grad_norm": 3.216180658900517, "learning_rate": 3.448275862068966e-06, "loss": 0.748, "step": 15 }, { "epoch": 0.02302158273381295, "grad_norm": 3.1235913271233002, "learning_rate": 3.67816091954023e-06, "loss": 0.7518, "step": 16 }, { "epoch": 0.02446043165467626, "grad_norm": 2.9049126603009823, "learning_rate": 3.908045977011495e-06, "loss": 0.7397, "step": 17 }, { "epoch": 0.025899280575539568, "grad_norm": 2.1000702530912303, "learning_rate": 4.137931034482759e-06, "loss": 0.6992, "step": 18 }, { "epoch": 0.027338129496402876, "grad_norm": 1.8334607763736972, "learning_rate": 4.367816091954023e-06, "loss": 0.6987, "step": 19 }, { "epoch": 0.02877697841726619, "grad_norm": 1.5129650569733848, "learning_rate": 4.5977011494252875e-06, "loss": 0.6819, "step": 20 }, { "epoch": 0.030215827338129497, "grad_norm": 1.2049975073314985, "learning_rate": 4.8275862068965525e-06, "loss": 0.678, "step": 21 }, { "epoch": 0.031654676258992806, "grad_norm": 1.1334124431583827, "learning_rate": 5.057471264367817e-06, "loss": 0.6714, "step": 22 }, { "epoch": 0.033093525179856115, "grad_norm": 1.2261346776502773, "learning_rate": 5.287356321839081e-06, "loss": 0.6587, "step": 23 }, { "epoch": 0.034532374100719423, "grad_norm": 1.1277895897758023, "learning_rate": 5.517241379310345e-06, "loss": 0.6492, "step": 24 }, { "epoch": 0.03597122302158273, "grad_norm": 1.1047080413701726, "learning_rate": 5.747126436781609e-06, "loss": 0.639, "step": 25 }, { "epoch": 0.03741007194244604, "grad_norm": 0.9207307510687444, "learning_rate": 5.977011494252874e-06, "loss": 0.6345, "step": 26 }, { "epoch": 0.03884892086330935, "grad_norm": 0.7894647228308851, "learning_rate": 6.206896551724138e-06, "loss": 0.6311, "step": 27 }, { "epoch": 0.04028776978417266, "grad_norm": 0.8631101241424584, "learning_rate": 6.4367816091954025e-06, "loss": 0.6275, "step": 28 }, { "epoch": 0.041726618705035974, "grad_norm": 0.8182259655769767, "learning_rate": 6.666666666666667e-06, "loss": 0.6156, "step": 29 }, { "epoch": 0.04316546762589928, "grad_norm": 0.646368696077582, "learning_rate": 6.896551724137932e-06, "loss": 0.6257, "step": 30 }, { "epoch": 0.04460431654676259, "grad_norm": 0.49976707011608257, "learning_rate": 7.126436781609196e-06, "loss": 0.6176, "step": 31 }, { "epoch": 0.0460431654676259, "grad_norm": 0.6799614136172442, "learning_rate": 7.35632183908046e-06, "loss": 0.6103, "step": 32 }, { "epoch": 0.04748201438848921, "grad_norm": 0.6347758272440364, "learning_rate": 7.586206896551724e-06, "loss": 0.5904, "step": 33 }, { "epoch": 0.04892086330935252, "grad_norm": 0.44059215896413373, "learning_rate": 7.81609195402299e-06, "loss": 0.6114, "step": 34 }, { "epoch": 0.050359712230215826, "grad_norm": 0.4568865835639603, "learning_rate": 8.045977011494253e-06, "loss": 0.5947, "step": 35 }, { "epoch": 0.051798561151079135, "grad_norm": 0.5224557606359925, "learning_rate": 8.275862068965518e-06, "loss": 0.5908, "step": 36 }, { "epoch": 0.053237410071942444, "grad_norm": 0.5031264697052017, "learning_rate": 8.505747126436782e-06, "loss": 0.5873, "step": 37 }, { "epoch": 0.05467625899280575, "grad_norm": 0.3916284756805459, "learning_rate": 8.735632183908047e-06, "loss": 0.5903, "step": 38 }, { "epoch": 0.05611510791366906, "grad_norm": 0.3204316996379746, "learning_rate": 8.965517241379312e-06, "loss": 0.5723, "step": 39 }, { "epoch": 0.05755395683453238, "grad_norm": 0.42456684891862123, "learning_rate": 9.195402298850575e-06, "loss": 0.6004, "step": 40 }, { "epoch": 0.058992805755395686, "grad_norm": 0.42093007381873226, "learning_rate": 9.42528735632184e-06, "loss": 0.5726, "step": 41 }, { "epoch": 0.060431654676258995, "grad_norm": 0.32619760323201125, "learning_rate": 9.655172413793105e-06, "loss": 0.5743, "step": 42 }, { "epoch": 0.0618705035971223, "grad_norm": 0.36955472140670037, "learning_rate": 9.885057471264368e-06, "loss": 0.5657, "step": 43 }, { "epoch": 0.06330935251798561, "grad_norm": 0.41646173590921515, "learning_rate": 1.0114942528735633e-05, "loss": 0.5826, "step": 44 }, { "epoch": 0.06474820143884892, "grad_norm": 0.40117142453928417, "learning_rate": 1.0344827586206898e-05, "loss": 0.5789, "step": 45 }, { "epoch": 0.06618705035971223, "grad_norm": 0.34911727085364064, "learning_rate": 1.0574712643678162e-05, "loss": 0.5632, "step": 46 }, { "epoch": 0.06762589928057554, "grad_norm": 0.42852395741759985, "learning_rate": 1.0804597701149427e-05, "loss": 0.5581, "step": 47 }, { "epoch": 0.06906474820143885, "grad_norm": 0.3203905738896688, "learning_rate": 1.103448275862069e-05, "loss": 0.559, "step": 48 }, { "epoch": 0.07050359712230216, "grad_norm": 0.3396044368467807, "learning_rate": 1.1264367816091955e-05, "loss": 0.563, "step": 49 }, { "epoch": 0.07194244604316546, "grad_norm": 0.3961432162322185, "learning_rate": 1.1494252873563218e-05, "loss": 0.5598, "step": 50 }, { "epoch": 0.07338129496402877, "grad_norm": 0.24444535453022656, "learning_rate": 1.1724137931034483e-05, "loss": 0.5568, "step": 51 }, { "epoch": 0.07482014388489208, "grad_norm": 0.34947999235158883, "learning_rate": 1.1954022988505748e-05, "loss": 0.5533, "step": 52 }, { "epoch": 0.07625899280575539, "grad_norm": 0.3065382163330684, "learning_rate": 1.2183908045977013e-05, "loss": 0.551, "step": 53 }, { "epoch": 0.0776978417266187, "grad_norm": 0.304266662513688, "learning_rate": 1.2413793103448277e-05, "loss": 0.5457, "step": 54 }, { "epoch": 0.07913669064748201, "grad_norm": 0.23181630612110501, "learning_rate": 1.2643678160919542e-05, "loss": 0.5485, "step": 55 }, { "epoch": 0.08057553956834532, "grad_norm": 0.2608490814351373, "learning_rate": 1.2873563218390805e-05, "loss": 0.5391, "step": 56 }, { "epoch": 0.08201438848920864, "grad_norm": 0.22861678964733734, "learning_rate": 1.310344827586207e-05, "loss": 0.5488, "step": 57 }, { "epoch": 0.08345323741007195, "grad_norm": 0.2387132398893568, "learning_rate": 1.3333333333333333e-05, "loss": 0.5497, "step": 58 }, { "epoch": 0.08489208633093526, "grad_norm": 0.23731641935350856, "learning_rate": 1.3563218390804598e-05, "loss": 0.5351, "step": 59 }, { "epoch": 0.08633093525179857, "grad_norm": 0.2517381403070896, "learning_rate": 1.3793103448275863e-05, "loss": 0.5499, "step": 60 }, { "epoch": 0.08776978417266187, "grad_norm": 0.2490594423576649, "learning_rate": 1.4022988505747128e-05, "loss": 0.5339, "step": 61 }, { "epoch": 0.08920863309352518, "grad_norm": 0.20571335373889837, "learning_rate": 1.4252873563218392e-05, "loss": 0.5518, "step": 62 }, { "epoch": 0.09064748201438849, "grad_norm": 0.24952993436192078, "learning_rate": 1.4482758620689657e-05, "loss": 0.5457, "step": 63 }, { "epoch": 0.0920863309352518, "grad_norm": 0.20944495579900255, "learning_rate": 1.471264367816092e-05, "loss": 0.5502, "step": 64 }, { "epoch": 0.09352517985611511, "grad_norm": 0.23265496702724225, "learning_rate": 1.4942528735632185e-05, "loss": 0.5265, "step": 65 }, { "epoch": 0.09496402877697842, "grad_norm": 0.2572714284263536, "learning_rate": 1.5172413793103448e-05, "loss": 0.5322, "step": 66 }, { "epoch": 0.09640287769784173, "grad_norm": 0.2546598700811475, "learning_rate": 1.540229885057471e-05, "loss": 0.5359, "step": 67 }, { "epoch": 0.09784172661870504, "grad_norm": 0.2670212576850694, "learning_rate": 1.563218390804598e-05, "loss": 0.5339, "step": 68 }, { "epoch": 0.09928057553956834, "grad_norm": 0.3097272932505348, "learning_rate": 1.586206896551724e-05, "loss": 0.5363, "step": 69 }, { "epoch": 0.10071942446043165, "grad_norm": 0.22597215512314667, "learning_rate": 1.6091954022988507e-05, "loss": 0.5366, "step": 70 }, { "epoch": 0.10215827338129496, "grad_norm": 0.24896049028910877, "learning_rate": 1.632183908045977e-05, "loss": 0.5188, "step": 71 }, { "epoch": 0.10359712230215827, "grad_norm": 0.22038628808523028, "learning_rate": 1.6551724137931037e-05, "loss": 0.5307, "step": 72 }, { "epoch": 0.10503597122302158, "grad_norm": 0.2642384824187733, "learning_rate": 1.6781609195402298e-05, "loss": 0.517, "step": 73 }, { "epoch": 0.10647482014388489, "grad_norm": 0.31600117407722406, "learning_rate": 1.7011494252873563e-05, "loss": 0.5289, "step": 74 }, { "epoch": 0.1079136690647482, "grad_norm": 0.42116471977610626, "learning_rate": 1.7241379310344828e-05, "loss": 0.5234, "step": 75 }, { "epoch": 0.1093525179856115, "grad_norm": 0.7330810250123087, "learning_rate": 1.7471264367816093e-05, "loss": 0.524, "step": 76 }, { "epoch": 0.11079136690647481, "grad_norm": 1.1433562597711888, "learning_rate": 1.770114942528736e-05, "loss": 0.5339, "step": 77 }, { "epoch": 0.11223021582733812, "grad_norm": 1.0151631179784177, "learning_rate": 1.7931034482758623e-05, "loss": 0.5309, "step": 78 }, { "epoch": 0.11366906474820145, "grad_norm": 0.4383599009940003, "learning_rate": 1.8160919540229885e-05, "loss": 0.5247, "step": 79 }, { "epoch": 0.11510791366906475, "grad_norm": 0.6380284405492358, "learning_rate": 1.839080459770115e-05, "loss": 0.5286, "step": 80 }, { "epoch": 0.11654676258992806, "grad_norm": 0.9433211526077079, "learning_rate": 1.8620689655172415e-05, "loss": 0.526, "step": 81 }, { "epoch": 0.11798561151079137, "grad_norm": 0.6454820817885937, "learning_rate": 1.885057471264368e-05, "loss": 0.5184, "step": 82 }, { "epoch": 0.11942446043165468, "grad_norm": 0.6186938094843576, "learning_rate": 1.908045977011494e-05, "loss": 0.5373, "step": 83 }, { "epoch": 0.12086330935251799, "grad_norm": 0.8222985848657414, "learning_rate": 1.931034482758621e-05, "loss": 0.5133, "step": 84 }, { "epoch": 0.1223021582733813, "grad_norm": 0.6373941213971877, "learning_rate": 1.9540229885057475e-05, "loss": 0.5225, "step": 85 }, { "epoch": 0.1237410071942446, "grad_norm": 0.5803977667901652, "learning_rate": 1.9770114942528737e-05, "loss": 0.5109, "step": 86 }, { "epoch": 0.1251798561151079, "grad_norm": 0.7300312340518113, "learning_rate": 2e-05, "loss": 0.5161, "step": 87 }, { "epoch": 0.12661870503597122, "grad_norm": 0.49245807578355527, "learning_rate": 2.0229885057471267e-05, "loss": 0.5243, "step": 88 }, { "epoch": 0.12805755395683452, "grad_norm": 0.5783075683156449, "learning_rate": 2.0459770114942528e-05, "loss": 0.5143, "step": 89 }, { "epoch": 0.12949640287769784, "grad_norm": 0.5790626254010335, "learning_rate": 2.0689655172413797e-05, "loss": 0.5123, "step": 90 }, { "epoch": 0.13093525179856116, "grad_norm": 0.6617277361345342, "learning_rate": 2.0919540229885058e-05, "loss": 0.5156, "step": 91 }, { "epoch": 0.13237410071942446, "grad_norm": 1.0316999106593459, "learning_rate": 2.1149425287356323e-05, "loss": 0.5135, "step": 92 }, { "epoch": 0.13381294964028778, "grad_norm": 1.1145362643699137, "learning_rate": 2.1379310344827585e-05, "loss": 0.5224, "step": 93 }, { "epoch": 0.13525179856115108, "grad_norm": 0.8549443929767417, "learning_rate": 2.1609195402298853e-05, "loss": 0.5157, "step": 94 }, { "epoch": 0.1366906474820144, "grad_norm": 1.363906719878284, "learning_rate": 2.183908045977012e-05, "loss": 0.5135, "step": 95 }, { "epoch": 0.1381294964028777, "grad_norm": 0.909276356364517, "learning_rate": 2.206896551724138e-05, "loss": 0.518, "step": 96 }, { "epoch": 0.13956834532374102, "grad_norm": 1.306827176055507, "learning_rate": 2.229885057471265e-05, "loss": 0.5133, "step": 97 }, { "epoch": 0.1410071942446043, "grad_norm": 0.7284051970530677, "learning_rate": 2.252873563218391e-05, "loss": 0.5141, "step": 98 }, { "epoch": 0.14244604316546763, "grad_norm": 1.4690961991545168, "learning_rate": 2.2758620689655175e-05, "loss": 0.5155, "step": 99 }, { "epoch": 0.14388489208633093, "grad_norm": 0.8564984095806956, "learning_rate": 2.2988505747126437e-05, "loss": 0.514, "step": 100 }, { "epoch": 0.14532374100719425, "grad_norm": 1.58868035636525, "learning_rate": 2.3218390804597705e-05, "loss": 0.5214, "step": 101 }, { "epoch": 0.14676258992805755, "grad_norm": 1.1821772704261104, "learning_rate": 2.3448275862068967e-05, "loss": 0.5068, "step": 102 }, { "epoch": 0.14820143884892087, "grad_norm": 1.3970183237937182, "learning_rate": 2.367816091954023e-05, "loss": 0.5114, "step": 103 }, { "epoch": 0.14964028776978416, "grad_norm": 1.3856255061438907, "learning_rate": 2.3908045977011497e-05, "loss": 0.5194, "step": 104 }, { "epoch": 0.1510791366906475, "grad_norm": 1.0199674097168947, "learning_rate": 2.413793103448276e-05, "loss": 0.5106, "step": 105 }, { "epoch": 0.15251798561151078, "grad_norm": 1.2300452127883577, "learning_rate": 2.4367816091954027e-05, "loss": 0.5078, "step": 106 }, { "epoch": 0.1539568345323741, "grad_norm": 0.9037488204666687, "learning_rate": 2.4597701149425288e-05, "loss": 0.5114, "step": 107 }, { "epoch": 0.1553956834532374, "grad_norm": 0.9619214892056264, "learning_rate": 2.4827586206896553e-05, "loss": 0.5088, "step": 108 }, { "epoch": 0.15683453237410072, "grad_norm": 0.7461287597995564, "learning_rate": 2.5057471264367815e-05, "loss": 0.4968, "step": 109 }, { "epoch": 0.15827338129496402, "grad_norm": 0.7472151674844343, "learning_rate": 2.5287356321839083e-05, "loss": 0.5027, "step": 110 }, { "epoch": 0.15971223021582734, "grad_norm": 0.7111109009469494, "learning_rate": 2.551724137931035e-05, "loss": 0.5089, "step": 111 }, { "epoch": 0.16115107913669063, "grad_norm": 0.6338520803535057, "learning_rate": 2.574712643678161e-05, "loss": 0.5154, "step": 112 }, { "epoch": 0.16258992805755396, "grad_norm": 0.675370574100815, "learning_rate": 2.597701149425288e-05, "loss": 0.5195, "step": 113 }, { "epoch": 0.16402877697841728, "grad_norm": 0.7298286160968676, "learning_rate": 2.620689655172414e-05, "loss": 0.5111, "step": 114 }, { "epoch": 0.16546762589928057, "grad_norm": 0.4997081048522759, "learning_rate": 2.6436781609195405e-05, "loss": 0.5109, "step": 115 }, { "epoch": 0.1669064748201439, "grad_norm": 0.5721967528687938, "learning_rate": 2.6666666666666667e-05, "loss": 0.4977, "step": 116 }, { "epoch": 0.1683453237410072, "grad_norm": 0.621106185128584, "learning_rate": 2.6896551724137935e-05, "loss": 0.4996, "step": 117 }, { "epoch": 0.1697841726618705, "grad_norm": 0.46702811394454086, "learning_rate": 2.7126436781609197e-05, "loss": 0.5017, "step": 118 }, { "epoch": 0.1712230215827338, "grad_norm": 0.6367448127200948, "learning_rate": 2.735632183908046e-05, "loss": 0.5095, "step": 119 }, { "epoch": 0.17266187050359713, "grad_norm": 0.8388646694034495, "learning_rate": 2.7586206896551727e-05, "loss": 0.5082, "step": 120 }, { "epoch": 0.17410071942446043, "grad_norm": 0.8221549454299817, "learning_rate": 2.781609195402299e-05, "loss": 0.5081, "step": 121 }, { "epoch": 0.17553956834532375, "grad_norm": 1.0718933842162548, "learning_rate": 2.8045977011494257e-05, "loss": 0.5076, "step": 122 }, { "epoch": 0.17697841726618704, "grad_norm": 1.1573932937368792, "learning_rate": 2.8275862068965518e-05, "loss": 0.5006, "step": 123 }, { "epoch": 0.17841726618705037, "grad_norm": 0.8462900690961672, "learning_rate": 2.8505747126436783e-05, "loss": 0.4994, "step": 124 }, { "epoch": 0.17985611510791366, "grad_norm": 0.7461924145603112, "learning_rate": 2.8735632183908045e-05, "loss": 0.4955, "step": 125 }, { "epoch": 0.18129496402877698, "grad_norm": 0.903532518218726, "learning_rate": 2.8965517241379313e-05, "loss": 0.4967, "step": 126 }, { "epoch": 0.18273381294964028, "grad_norm": 1.1408053895067267, "learning_rate": 2.919540229885058e-05, "loss": 0.5088, "step": 127 }, { "epoch": 0.1841726618705036, "grad_norm": 1.0551621554288855, "learning_rate": 2.942528735632184e-05, "loss": 0.5017, "step": 128 }, { "epoch": 0.1856115107913669, "grad_norm": 1.0963160001943373, "learning_rate": 2.965517241379311e-05, "loss": 0.5044, "step": 129 }, { "epoch": 0.18705035971223022, "grad_norm": 0.9073606740947865, "learning_rate": 2.988505747126437e-05, "loss": 0.5071, "step": 130 }, { "epoch": 0.1884892086330935, "grad_norm": 0.9264114085091071, "learning_rate": 3.0114942528735635e-05, "loss": 0.5064, "step": 131 }, { "epoch": 0.18992805755395684, "grad_norm": 1.0371785802095712, "learning_rate": 3.0344827586206897e-05, "loss": 0.497, "step": 132 }, { "epoch": 0.19136690647482013, "grad_norm": 0.9909533495251422, "learning_rate": 3.057471264367816e-05, "loss": 0.4964, "step": 133 }, { "epoch": 0.19280575539568345, "grad_norm": 1.1174578586845554, "learning_rate": 3.080459770114942e-05, "loss": 0.5089, "step": 134 }, { "epoch": 0.19424460431654678, "grad_norm": 0.9005843732910414, "learning_rate": 3.103448275862069e-05, "loss": 0.5, "step": 135 }, { "epoch": 0.19568345323741007, "grad_norm": 0.8643748894738493, "learning_rate": 3.126436781609196e-05, "loss": 0.4979, "step": 136 }, { "epoch": 0.1971223021582734, "grad_norm": 0.7844564240356566, "learning_rate": 3.149425287356322e-05, "loss": 0.4914, "step": 137 }, { "epoch": 0.1985611510791367, "grad_norm": 0.8420107305727663, "learning_rate": 3.172413793103448e-05, "loss": 0.4989, "step": 138 }, { "epoch": 0.2, "grad_norm": 0.8010607454745727, "learning_rate": 3.195402298850575e-05, "loss": 0.4972, "step": 139 }, { "epoch": 0.2014388489208633, "grad_norm": 0.7536572747354274, "learning_rate": 3.218390804597701e-05, "loss": 0.4947, "step": 140 }, { "epoch": 0.20287769784172663, "grad_norm": 0.7581207765273857, "learning_rate": 3.2413793103448275e-05, "loss": 0.4917, "step": 141 }, { "epoch": 0.20431654676258992, "grad_norm": 0.9658796512775292, "learning_rate": 3.264367816091954e-05, "loss": 0.491, "step": 142 }, { "epoch": 0.20575539568345325, "grad_norm": 1.37702441636452, "learning_rate": 3.287356321839081e-05, "loss": 0.493, "step": 143 }, { "epoch": 0.20719424460431654, "grad_norm": 0.5256931079204877, "learning_rate": 3.310344827586207e-05, "loss": 0.4887, "step": 144 }, { "epoch": 0.20863309352517986, "grad_norm": 0.9428348881298956, "learning_rate": 3.3333333333333335e-05, "loss": 0.4957, "step": 145 }, { "epoch": 0.21007194244604316, "grad_norm": 1.3016157308761458, "learning_rate": 3.3563218390804597e-05, "loss": 0.4998, "step": 146 }, { "epoch": 0.21151079136690648, "grad_norm": 0.613818512646567, "learning_rate": 3.3793103448275865e-05, "loss": 0.4979, "step": 147 }, { "epoch": 0.21294964028776978, "grad_norm": 1.009800309668571, "learning_rate": 3.4022988505747127e-05, "loss": 0.4947, "step": 148 }, { "epoch": 0.2143884892086331, "grad_norm": 0.9624925516955802, "learning_rate": 3.4252873563218395e-05, "loss": 0.4969, "step": 149 }, { "epoch": 0.2158273381294964, "grad_norm": 0.8825107931109423, "learning_rate": 3.4482758620689657e-05, "loss": 0.4992, "step": 150 }, { "epoch": 0.21726618705035972, "grad_norm": 1.1975991844403961, "learning_rate": 3.4712643678160925e-05, "loss": 0.5014, "step": 151 }, { "epoch": 0.218705035971223, "grad_norm": 0.9023368561489841, "learning_rate": 3.4942528735632187e-05, "loss": 0.504, "step": 152 }, { "epoch": 0.22014388489208633, "grad_norm": 1.4831068778936636, "learning_rate": 3.517241379310345e-05, "loss": 0.4937, "step": 153 }, { "epoch": 0.22158273381294963, "grad_norm": 0.7609441164303112, "learning_rate": 3.540229885057472e-05, "loss": 0.4942, "step": 154 }, { "epoch": 0.22302158273381295, "grad_norm": 1.199593587875097, "learning_rate": 3.563218390804598e-05, "loss": 0.4845, "step": 155 }, { "epoch": 0.22446043165467625, "grad_norm": 0.7396075741999119, "learning_rate": 3.586206896551725e-05, "loss": 0.4966, "step": 156 }, { "epoch": 0.22589928057553957, "grad_norm": 0.9778734659781795, "learning_rate": 3.609195402298851e-05, "loss": 0.498, "step": 157 }, { "epoch": 0.2273381294964029, "grad_norm": 1.185683620829216, "learning_rate": 3.632183908045977e-05, "loss": 0.4945, "step": 158 }, { "epoch": 0.22877697841726619, "grad_norm": 0.6786828186846136, "learning_rate": 3.655172413793104e-05, "loss": 0.5011, "step": 159 }, { "epoch": 0.2302158273381295, "grad_norm": 0.9352379928924268, "learning_rate": 3.67816091954023e-05, "loss": 0.4918, "step": 160 }, { "epoch": 0.2316546762589928, "grad_norm": 0.8893597905763662, "learning_rate": 3.701149425287357e-05, "loss": 0.4989, "step": 161 }, { "epoch": 0.23309352517985613, "grad_norm": 0.7629180786629474, "learning_rate": 3.724137931034483e-05, "loss": 0.4937, "step": 162 }, { "epoch": 0.23453237410071942, "grad_norm": 0.9092757267696973, "learning_rate": 3.74712643678161e-05, "loss": 0.5025, "step": 163 }, { "epoch": 0.23597122302158274, "grad_norm": 1.0300863068218389, "learning_rate": 3.770114942528736e-05, "loss": 0.4894, "step": 164 }, { "epoch": 0.23741007194244604, "grad_norm": 1.4373845829394083, "learning_rate": 3.793103448275862e-05, "loss": 0.5019, "step": 165 }, { "epoch": 0.23884892086330936, "grad_norm": 1.1369279239361678, "learning_rate": 3.816091954022988e-05, "loss": 0.4941, "step": 166 }, { "epoch": 0.24028776978417266, "grad_norm": 1.2368590685738565, "learning_rate": 3.839080459770115e-05, "loss": 0.5106, "step": 167 }, { "epoch": 0.24172661870503598, "grad_norm": 1.2470456300859118, "learning_rate": 3.862068965517242e-05, "loss": 0.4916, "step": 168 }, { "epoch": 0.24316546762589927, "grad_norm": 1.4542567342620654, "learning_rate": 3.885057471264368e-05, "loss": 0.4971, "step": 169 }, { "epoch": 0.2446043165467626, "grad_norm": 0.778853934850899, "learning_rate": 3.908045977011495e-05, "loss": 0.4923, "step": 170 }, { "epoch": 0.2460431654676259, "grad_norm": 1.7833309124005472, "learning_rate": 3.931034482758621e-05, "loss": 0.4958, "step": 171 }, { "epoch": 0.2474820143884892, "grad_norm": 0.93242302095893, "learning_rate": 3.954022988505747e-05, "loss": 0.4912, "step": 172 }, { "epoch": 0.2489208633093525, "grad_norm": 2.12123455399472, "learning_rate": 3.9770114942528735e-05, "loss": 0.4964, "step": 173 }, { "epoch": 0.2503597122302158, "grad_norm": 1.7833482803959417, "learning_rate": 4e-05, "loss": 0.4945, "step": 174 }, { "epoch": 0.2517985611510791, "grad_norm": 1.3701247359533977, "learning_rate": 4.022988505747127e-05, "loss": 0.498, "step": 175 }, { "epoch": 0.25323741007194245, "grad_norm": 1.5349346547437832, "learning_rate": 4.045977011494253e-05, "loss": 0.4927, "step": 176 }, { "epoch": 0.25467625899280577, "grad_norm": 1.1074123983244828, "learning_rate": 4.0689655172413795e-05, "loss": 0.502, "step": 177 }, { "epoch": 0.25611510791366904, "grad_norm": 1.7093858086487201, "learning_rate": 4.0919540229885057e-05, "loss": 0.4954, "step": 178 }, { "epoch": 0.25755395683453236, "grad_norm": 1.1468166105078421, "learning_rate": 4.1149425287356325e-05, "loss": 0.5036, "step": 179 }, { "epoch": 0.2589928057553957, "grad_norm": 1.771636805784584, "learning_rate": 4.137931034482759e-05, "loss": 0.4949, "step": 180 }, { "epoch": 0.260431654676259, "grad_norm": 1.2476683208712824, "learning_rate": 4.160919540229885e-05, "loss": 0.4957, "step": 181 }, { "epoch": 0.26187050359712233, "grad_norm": 1.8081462170443292, "learning_rate": 4.1839080459770117e-05, "loss": 0.5002, "step": 182 }, { "epoch": 0.2633093525179856, "grad_norm": 1.1150500115400306, "learning_rate": 4.2068965517241385e-05, "loss": 0.4933, "step": 183 }, { "epoch": 0.2647482014388489, "grad_norm": 2.093915191575719, "learning_rate": 4.2298850574712647e-05, "loss": 0.5128, "step": 184 }, { "epoch": 0.26618705035971224, "grad_norm": 1.600106959664035, "learning_rate": 4.2528735632183915e-05, "loss": 0.5009, "step": 185 }, { "epoch": 0.26762589928057556, "grad_norm": 1.8336012966254829, "learning_rate": 4.275862068965517e-05, "loss": 0.4964, "step": 186 }, { "epoch": 0.26906474820143883, "grad_norm": 1.5245854964800294, "learning_rate": 4.298850574712644e-05, "loss": 0.5077, "step": 187 }, { "epoch": 0.27050359712230215, "grad_norm": 1.767595562772753, "learning_rate": 4.321839080459771e-05, "loss": 0.5076, "step": 188 }, { "epoch": 0.2719424460431655, "grad_norm": 1.5139363631737155, "learning_rate": 4.344827586206897e-05, "loss": 0.5029, "step": 189 }, { "epoch": 0.2733812949640288, "grad_norm": 1.542045683799105, "learning_rate": 4.367816091954024e-05, "loss": 0.5013, "step": 190 }, { "epoch": 0.27482014388489207, "grad_norm": 1.3381200993387352, "learning_rate": 4.39080459770115e-05, "loss": 0.4874, "step": 191 }, { "epoch": 0.2762589928057554, "grad_norm": 1.5390253779260872, "learning_rate": 4.413793103448276e-05, "loss": 0.494, "step": 192 }, { "epoch": 0.2776978417266187, "grad_norm": 1.0552446418090449, "learning_rate": 4.436781609195403e-05, "loss": 0.4863, "step": 193 }, { "epoch": 0.27913669064748203, "grad_norm": 1.77100245437121, "learning_rate": 4.45977011494253e-05, "loss": 0.5031, "step": 194 }, { "epoch": 0.2805755395683453, "grad_norm": 1.4851400864755933, "learning_rate": 4.482758620689655e-05, "loss": 0.4962, "step": 195 }, { "epoch": 0.2820143884892086, "grad_norm": 1.4291720970659931, "learning_rate": 4.505747126436782e-05, "loss": 0.49, "step": 196 }, { "epoch": 0.28345323741007195, "grad_norm": 1.1639239859473705, "learning_rate": 4.528735632183908e-05, "loss": 0.5018, "step": 197 }, { "epoch": 0.28489208633093527, "grad_norm": 1.413061827929404, "learning_rate": 4.551724137931035e-05, "loss": 0.4887, "step": 198 }, { "epoch": 0.28633093525179854, "grad_norm": 0.998532759658418, "learning_rate": 4.574712643678162e-05, "loss": 0.488, "step": 199 }, { "epoch": 0.28776978417266186, "grad_norm": 1.7513659096171872, "learning_rate": 4.597701149425287e-05, "loss": 0.4903, "step": 200 }, { "epoch": 0.2892086330935252, "grad_norm": 1.3216333801978293, "learning_rate": 4.620689655172414e-05, "loss": 0.4855, "step": 201 }, { "epoch": 0.2906474820143885, "grad_norm": 1.4684021935280545, "learning_rate": 4.643678160919541e-05, "loss": 0.4872, "step": 202 }, { "epoch": 0.2920863309352518, "grad_norm": 1.2952384457839685, "learning_rate": 4.666666666666667e-05, "loss": 0.4888, "step": 203 }, { "epoch": 0.2935251798561151, "grad_norm": 1.3990150221114375, "learning_rate": 4.689655172413793e-05, "loss": 0.4974, "step": 204 }, { "epoch": 0.2949640287769784, "grad_norm": 0.9876775520718227, "learning_rate": 4.7126436781609195e-05, "loss": 0.488, "step": 205 }, { "epoch": 0.29640287769784174, "grad_norm": 1.36524312409901, "learning_rate": 4.735632183908046e-05, "loss": 0.4879, "step": 206 }, { "epoch": 0.29784172661870506, "grad_norm": 1.1149550671155792, "learning_rate": 4.758620689655173e-05, "loss": 0.5, "step": 207 }, { "epoch": 0.2992805755395683, "grad_norm": 1.4692418153023588, "learning_rate": 4.781609195402299e-05, "loss": 0.4978, "step": 208 }, { "epoch": 0.30071942446043165, "grad_norm": 1.195808176437362, "learning_rate": 4.8045977011494255e-05, "loss": 0.5006, "step": 209 }, { "epoch": 0.302158273381295, "grad_norm": 1.3168347564869889, "learning_rate": 4.827586206896552e-05, "loss": 0.4878, "step": 210 }, { "epoch": 0.3035971223021583, "grad_norm": 1.0935788325080322, "learning_rate": 4.8505747126436785e-05, "loss": 0.4933, "step": 211 }, { "epoch": 0.30503597122302156, "grad_norm": 1.1314020714687931, "learning_rate": 4.873563218390805e-05, "loss": 0.478, "step": 212 }, { "epoch": 0.3064748201438849, "grad_norm": 0.9368152176738559, "learning_rate": 4.896551724137931e-05, "loss": 0.4969, "step": 213 }, { "epoch": 0.3079136690647482, "grad_norm": 1.0341123301704138, "learning_rate": 4.9195402298850577e-05, "loss": 0.4874, "step": 214 }, { "epoch": 0.30935251798561153, "grad_norm": 0.8357582951933543, "learning_rate": 4.9425287356321845e-05, "loss": 0.4847, "step": 215 }, { "epoch": 0.3107913669064748, "grad_norm": 0.7732084800276675, "learning_rate": 4.9655172413793107e-05, "loss": 0.4867, "step": 216 }, { "epoch": 0.3122302158273381, "grad_norm": 0.7720204405423883, "learning_rate": 4.9885057471264375e-05, "loss": 0.4852, "step": 217 }, { "epoch": 0.31366906474820144, "grad_norm": 0.7119262037670632, "learning_rate": 5.011494252873563e-05, "loss": 0.4851, "step": 218 }, { "epoch": 0.31510791366906477, "grad_norm": 0.7681186415638412, "learning_rate": 5.03448275862069e-05, "loss": 0.4795, "step": 219 }, { "epoch": 0.31654676258992803, "grad_norm": 1.010887539368242, "learning_rate": 5.057471264367817e-05, "loss": 0.4876, "step": 220 }, { "epoch": 0.31798561151079136, "grad_norm": 1.1684469481139526, "learning_rate": 5.0804597701149435e-05, "loss": 0.4816, "step": 221 }, { "epoch": 0.3194244604316547, "grad_norm": 1.104313919572175, "learning_rate": 5.10344827586207e-05, "loss": 0.4789, "step": 222 }, { "epoch": 0.320863309352518, "grad_norm": 1.259432298938836, "learning_rate": 5.126436781609196e-05, "loss": 0.4852, "step": 223 }, { "epoch": 0.32230215827338127, "grad_norm": 0.9975875091199469, "learning_rate": 5.149425287356322e-05, "loss": 0.4795, "step": 224 }, { "epoch": 0.3237410071942446, "grad_norm": 1.179464731042854, "learning_rate": 5.172413793103449e-05, "loss": 0.4797, "step": 225 }, { "epoch": 0.3251798561151079, "grad_norm": 1.3336680054277188, "learning_rate": 5.195402298850576e-05, "loss": 0.4709, "step": 226 }, { "epoch": 0.32661870503597124, "grad_norm": 0.6182311428543251, "learning_rate": 5.218390804597701e-05, "loss": 0.48, "step": 227 }, { "epoch": 0.32805755395683456, "grad_norm": 1.5772180116039807, "learning_rate": 5.241379310344828e-05, "loss": 0.4933, "step": 228 }, { "epoch": 0.3294964028776978, "grad_norm": 0.8301808960896024, "learning_rate": 5.264367816091954e-05, "loss": 0.4851, "step": 229 }, { "epoch": 0.33093525179856115, "grad_norm": 1.287879247394346, "learning_rate": 5.287356321839081e-05, "loss": 0.4882, "step": 230 }, { "epoch": 0.33237410071942447, "grad_norm": 0.7930256285204332, "learning_rate": 5.310344827586208e-05, "loss": 0.4724, "step": 231 }, { "epoch": 0.3338129496402878, "grad_norm": 0.8685183431577114, "learning_rate": 5.333333333333333e-05, "loss": 0.4913, "step": 232 }, { "epoch": 0.33525179856115106, "grad_norm": 1.1066482481846536, "learning_rate": 5.35632183908046e-05, "loss": 0.4848, "step": 233 }, { "epoch": 0.3366906474820144, "grad_norm": 0.8998014785047385, "learning_rate": 5.379310344827587e-05, "loss": 0.4811, "step": 234 }, { "epoch": 0.3381294964028777, "grad_norm": 1.1251664212166712, "learning_rate": 5.402298850574713e-05, "loss": 0.4744, "step": 235 }, { "epoch": 0.339568345323741, "grad_norm": 0.797966347020963, "learning_rate": 5.425287356321839e-05, "loss": 0.473, "step": 236 }, { "epoch": 0.3410071942446043, "grad_norm": 1.4078158818561126, "learning_rate": 5.4482758620689655e-05, "loss": 0.4877, "step": 237 }, { "epoch": 0.3424460431654676, "grad_norm": 0.5758988831861516, "learning_rate": 5.471264367816092e-05, "loss": 0.4842, "step": 238 }, { "epoch": 0.34388489208633094, "grad_norm": 1.04982528972906, "learning_rate": 5.494252873563219e-05, "loss": 0.4849, "step": 239 }, { "epoch": 0.34532374100719426, "grad_norm": 1.0248462844064845, "learning_rate": 5.517241379310345e-05, "loss": 0.4749, "step": 240 }, { "epoch": 0.34676258992805753, "grad_norm": 0.788697716625114, "learning_rate": 5.5402298850574715e-05, "loss": 0.4837, "step": 241 }, { "epoch": 0.34820143884892085, "grad_norm": 1.103753220461005, "learning_rate": 5.563218390804598e-05, "loss": 0.4812, "step": 242 }, { "epoch": 0.3496402877697842, "grad_norm": 1.1125950371481492, "learning_rate": 5.5862068965517245e-05, "loss": 0.4853, "step": 243 }, { "epoch": 0.3510791366906475, "grad_norm": 0.7511131774634014, "learning_rate": 5.609195402298851e-05, "loss": 0.4708, "step": 244 }, { "epoch": 0.35251798561151076, "grad_norm": 0.984133060801353, "learning_rate": 5.632183908045977e-05, "loss": 0.4779, "step": 245 }, { "epoch": 0.3539568345323741, "grad_norm": 1.3331164516977596, "learning_rate": 5.6551724137931037e-05, "loss": 0.4825, "step": 246 }, { "epoch": 0.3553956834532374, "grad_norm": 1.070100828780393, "learning_rate": 5.6781609195402305e-05, "loss": 0.4829, "step": 247 }, { "epoch": 0.35683453237410073, "grad_norm": 1.0753477967632834, "learning_rate": 5.7011494252873567e-05, "loss": 0.4806, "step": 248 }, { "epoch": 0.35827338129496406, "grad_norm": 1.0086916569998965, "learning_rate": 5.7241379310344835e-05, "loss": 0.479, "step": 249 }, { "epoch": 0.3597122302158273, "grad_norm": 1.000129764716916, "learning_rate": 5.747126436781609e-05, "loss": 0.4776, "step": 250 }, { "epoch": 0.36115107913669064, "grad_norm": 1.363231863410886, "learning_rate": 5.770114942528736e-05, "loss": 0.4708, "step": 251 }, { "epoch": 0.36258992805755397, "grad_norm": 0.9754682903695154, "learning_rate": 5.7931034482758627e-05, "loss": 0.4708, "step": 252 }, { "epoch": 0.3640287769784173, "grad_norm": 1.1626305469845395, "learning_rate": 5.8160919540229895e-05, "loss": 0.4819, "step": 253 }, { "epoch": 0.36546762589928056, "grad_norm": 1.0281653908967707, "learning_rate": 5.839080459770116e-05, "loss": 0.4862, "step": 254 }, { "epoch": 0.3669064748201439, "grad_norm": 1.5241614994146868, "learning_rate": 5.862068965517242e-05, "loss": 0.4755, "step": 255 }, { "epoch": 0.3683453237410072, "grad_norm": 0.6054701794166305, "learning_rate": 5.885057471264368e-05, "loss": 0.4836, "step": 256 }, { "epoch": 0.3697841726618705, "grad_norm": 1.597009972339469, "learning_rate": 5.908045977011495e-05, "loss": 0.4903, "step": 257 }, { "epoch": 0.3712230215827338, "grad_norm": 0.898775265650471, "learning_rate": 5.931034482758622e-05, "loss": 0.4752, "step": 258 }, { "epoch": 0.3726618705035971, "grad_norm": 1.2510005303008964, "learning_rate": 5.954022988505747e-05, "loss": 0.492, "step": 259 }, { "epoch": 0.37410071942446044, "grad_norm": 1.263106758218865, "learning_rate": 5.977011494252874e-05, "loss": 0.4839, "step": 260 }, { "epoch": 0.37553956834532376, "grad_norm": 0.8391195279641513, "learning_rate": 6.000000000000001e-05, "loss": 0.4799, "step": 261 }, { "epoch": 0.376978417266187, "grad_norm": 1.2992976420589124, "learning_rate": 6.022988505747127e-05, "loss": 0.4918, "step": 262 }, { "epoch": 0.37841726618705035, "grad_norm": 0.9180286078429746, "learning_rate": 6.045977011494254e-05, "loss": 0.4831, "step": 263 }, { "epoch": 0.37985611510791367, "grad_norm": 1.1975745180339628, "learning_rate": 6.068965517241379e-05, "loss": 0.4739, "step": 264 }, { "epoch": 0.381294964028777, "grad_norm": 0.8871909141965262, "learning_rate": 6.091954022988506e-05, "loss": 0.4734, "step": 265 }, { "epoch": 0.38273381294964026, "grad_norm": 0.7935115442549551, "learning_rate": 6.114942528735632e-05, "loss": 0.4723, "step": 266 }, { "epoch": 0.3841726618705036, "grad_norm": 0.7815236006599882, "learning_rate": 6.137931034482759e-05, "loss": 0.4715, "step": 267 }, { "epoch": 0.3856115107913669, "grad_norm": 0.9352283862124354, "learning_rate": 6.160919540229885e-05, "loss": 0.4815, "step": 268 }, { "epoch": 0.38705035971223023, "grad_norm": 1.4559923722177637, "learning_rate": 6.183908045977011e-05, "loss": 0.4815, "step": 269 }, { "epoch": 0.38848920863309355, "grad_norm": 0.6876046733921063, "learning_rate": 6.206896551724138e-05, "loss": 0.4745, "step": 270 }, { "epoch": 0.3899280575539568, "grad_norm": 0.833844240162271, "learning_rate": 6.229885057471265e-05, "loss": 0.4763, "step": 271 }, { "epoch": 0.39136690647482014, "grad_norm": 1.2650849654272054, "learning_rate": 6.252873563218392e-05, "loss": 0.4822, "step": 272 }, { "epoch": 0.39280575539568346, "grad_norm": 1.0858926640637128, "learning_rate": 6.275862068965517e-05, "loss": 0.4664, "step": 273 }, { "epoch": 0.3942446043165468, "grad_norm": 1.019011478452704, "learning_rate": 6.298850574712644e-05, "loss": 0.474, "step": 274 }, { "epoch": 0.39568345323741005, "grad_norm": 1.0938566637945895, "learning_rate": 6.321839080459771e-05, "loss": 0.4774, "step": 275 }, { "epoch": 0.3971223021582734, "grad_norm": 0.729489325137161, "learning_rate": 6.344827586206897e-05, "loss": 0.4747, "step": 276 }, { "epoch": 0.3985611510791367, "grad_norm": 0.9699927603048148, "learning_rate": 6.367816091954023e-05, "loss": 0.4725, "step": 277 }, { "epoch": 0.4, "grad_norm": 0.842114037888034, "learning_rate": 6.39080459770115e-05, "loss": 0.4774, "step": 278 }, { "epoch": 0.4014388489208633, "grad_norm": 1.2681756784337148, "learning_rate": 6.413793103448276e-05, "loss": 0.4776, "step": 279 }, { "epoch": 0.4028776978417266, "grad_norm": 1.0074650909265712, "learning_rate": 6.436781609195403e-05, "loss": 0.4707, "step": 280 }, { "epoch": 0.40431654676258993, "grad_norm": 0.8006996946486058, "learning_rate": 6.45977011494253e-05, "loss": 0.4759, "step": 281 }, { "epoch": 0.40575539568345326, "grad_norm": 0.8574536311516484, "learning_rate": 6.482758620689655e-05, "loss": 0.4737, "step": 282 }, { "epoch": 0.4071942446043165, "grad_norm": 0.9260422341892245, "learning_rate": 6.505747126436782e-05, "loss": 0.4772, "step": 283 }, { "epoch": 0.40863309352517985, "grad_norm": 1.0937426163706612, "learning_rate": 6.528735632183909e-05, "loss": 0.4734, "step": 284 }, { "epoch": 0.41007194244604317, "grad_norm": 1.1852915257823429, "learning_rate": 6.551724137931035e-05, "loss": 0.4743, "step": 285 }, { "epoch": 0.4115107913669065, "grad_norm": 0.8483014350505912, "learning_rate": 6.574712643678162e-05, "loss": 0.4762, "step": 286 }, { "epoch": 0.41294964028776976, "grad_norm": 0.7821533734039084, "learning_rate": 6.597701149425288e-05, "loss": 0.4712, "step": 287 }, { "epoch": 0.4143884892086331, "grad_norm": 0.8098544578006053, "learning_rate": 6.620689655172415e-05, "loss": 0.4704, "step": 288 }, { "epoch": 0.4158273381294964, "grad_norm": 1.0002003064812475, "learning_rate": 6.643678160919542e-05, "loss": 0.4828, "step": 289 }, { "epoch": 0.4172661870503597, "grad_norm": 1.0166549231116344, "learning_rate": 6.666666666666667e-05, "loss": 0.473, "step": 290 }, { "epoch": 0.418705035971223, "grad_norm": 0.8173620976657725, "learning_rate": 6.689655172413794e-05, "loss": 0.4666, "step": 291 }, { "epoch": 0.4201438848920863, "grad_norm": 0.8176969242130069, "learning_rate": 6.712643678160919e-05, "loss": 0.466, "step": 292 }, { "epoch": 0.42158273381294964, "grad_norm": 1.067242351763639, "learning_rate": 6.735632183908046e-05, "loss": 0.4743, "step": 293 }, { "epoch": 0.42302158273381296, "grad_norm": 1.1373720540518368, "learning_rate": 6.758620689655173e-05, "loss": 0.4769, "step": 294 }, { "epoch": 0.4244604316546763, "grad_norm": 1.2091834939605204, "learning_rate": 6.7816091954023e-05, "loss": 0.4723, "step": 295 }, { "epoch": 0.42589928057553955, "grad_norm": 0.8553948982776225, "learning_rate": 6.804597701149425e-05, "loss": 0.4688, "step": 296 }, { "epoch": 0.4273381294964029, "grad_norm": 0.9063903596659111, "learning_rate": 6.827586206896552e-05, "loss": 0.472, "step": 297 }, { "epoch": 0.4287769784172662, "grad_norm": 0.8888596110092511, "learning_rate": 6.850574712643679e-05, "loss": 0.4724, "step": 298 }, { "epoch": 0.4302158273381295, "grad_norm": 1.283932813683222, "learning_rate": 6.873563218390806e-05, "loss": 0.4701, "step": 299 }, { "epoch": 0.4316546762589928, "grad_norm": 1.221149892015976, "learning_rate": 6.896551724137931e-05, "loss": 0.4729, "step": 300 }, { "epoch": 0.4330935251798561, "grad_norm": 0.9176746013485364, "learning_rate": 6.919540229885058e-05, "loss": 0.4693, "step": 301 }, { "epoch": 0.43453237410071943, "grad_norm": 0.7027300631739719, "learning_rate": 6.942528735632185e-05, "loss": 0.4664, "step": 302 }, { "epoch": 0.43597122302158275, "grad_norm": 0.7020387618730545, "learning_rate": 6.96551724137931e-05, "loss": 0.4673, "step": 303 }, { "epoch": 0.437410071942446, "grad_norm": 0.6808651310429936, "learning_rate": 6.988505747126437e-05, "loss": 0.4683, "step": 304 }, { "epoch": 0.43884892086330934, "grad_norm": 0.8424811250488698, "learning_rate": 7.011494252873563e-05, "loss": 0.457, "step": 305 }, { "epoch": 0.44028776978417267, "grad_norm": 0.9438233740449851, "learning_rate": 7.03448275862069e-05, "loss": 0.4771, "step": 306 }, { "epoch": 0.441726618705036, "grad_norm": 0.9851213333724348, "learning_rate": 7.057471264367816e-05, "loss": 0.4681, "step": 307 }, { "epoch": 0.44316546762589926, "grad_norm": 1.289642954389415, "learning_rate": 7.080459770114943e-05, "loss": 0.4785, "step": 308 }, { "epoch": 0.4446043165467626, "grad_norm": 1.0544507950814246, "learning_rate": 7.10344827586207e-05, "loss": 0.4724, "step": 309 }, { "epoch": 0.4460431654676259, "grad_norm": 1.3215957113890155, "learning_rate": 7.126436781609196e-05, "loss": 0.4757, "step": 310 }, { "epoch": 0.4474820143884892, "grad_norm": 0.7316608520379191, "learning_rate": 7.149425287356322e-05, "loss": 0.4684, "step": 311 }, { "epoch": 0.4489208633093525, "grad_norm": 0.969696808950938, "learning_rate": 7.17241379310345e-05, "loss": 0.479, "step": 312 }, { "epoch": 0.4503597122302158, "grad_norm": 0.9477963717668171, "learning_rate": 7.195402298850576e-05, "loss": 0.4741, "step": 313 }, { "epoch": 0.45179856115107914, "grad_norm": 0.9206502919147358, "learning_rate": 7.218390804597702e-05, "loss": 0.4748, "step": 314 }, { "epoch": 0.45323741007194246, "grad_norm": 1.1130093138557773, "learning_rate": 7.241379310344828e-05, "loss": 0.4694, "step": 315 }, { "epoch": 0.4546762589928058, "grad_norm": 1.1388074164156727, "learning_rate": 7.264367816091954e-05, "loss": 0.4692, "step": 316 }, { "epoch": 0.45611510791366905, "grad_norm": 1.1164661442567994, "learning_rate": 7.287356321839081e-05, "loss": 0.4741, "step": 317 }, { "epoch": 0.45755395683453237, "grad_norm": 1.0654010909802574, "learning_rate": 7.310344827586208e-05, "loss": 0.4683, "step": 318 }, { "epoch": 0.4589928057553957, "grad_norm": 0.8053385009779973, "learning_rate": 7.333333333333333e-05, "loss": 0.4666, "step": 319 }, { "epoch": 0.460431654676259, "grad_norm": 0.7282412960155227, "learning_rate": 7.35632183908046e-05, "loss": 0.463, "step": 320 }, { "epoch": 0.4618705035971223, "grad_norm": 0.942774932234221, "learning_rate": 7.379310344827587e-05, "loss": 0.4667, "step": 321 }, { "epoch": 0.4633093525179856, "grad_norm": 1.2102092661091306, "learning_rate": 7.402298850574714e-05, "loss": 0.4753, "step": 322 }, { "epoch": 0.46474820143884893, "grad_norm": 1.0870046005139904, "learning_rate": 7.425287356321839e-05, "loss": 0.472, "step": 323 }, { "epoch": 0.46618705035971225, "grad_norm": 1.1085973189399043, "learning_rate": 7.448275862068966e-05, "loss": 0.4696, "step": 324 }, { "epoch": 0.4676258992805755, "grad_norm": 1.0038842187549621, "learning_rate": 7.471264367816093e-05, "loss": 0.4615, "step": 325 }, { "epoch": 0.46906474820143884, "grad_norm": 1.0359164440235704, "learning_rate": 7.49425287356322e-05, "loss": 0.4694, "step": 326 }, { "epoch": 0.47050359712230216, "grad_norm": 0.9837789064434185, "learning_rate": 7.517241379310345e-05, "loss": 0.4713, "step": 327 }, { "epoch": 0.4719424460431655, "grad_norm": 0.9620898511324295, "learning_rate": 7.540229885057472e-05, "loss": 0.4688, "step": 328 }, { "epoch": 0.47338129496402875, "grad_norm": 1.1430319437453274, "learning_rate": 7.563218390804599e-05, "loss": 0.472, "step": 329 }, { "epoch": 0.4748201438848921, "grad_norm": 1.007227411189436, "learning_rate": 7.586206896551724e-05, "loss": 0.4648, "step": 330 }, { "epoch": 0.4762589928057554, "grad_norm": 1.0606046089785117, "learning_rate": 7.609195402298851e-05, "loss": 0.4743, "step": 331 }, { "epoch": 0.4776978417266187, "grad_norm": 0.8960210040501575, "learning_rate": 7.632183908045977e-05, "loss": 0.4628, "step": 332 }, { "epoch": 0.479136690647482, "grad_norm": 1.1241387696547236, "learning_rate": 7.655172413793103e-05, "loss": 0.4702, "step": 333 }, { "epoch": 0.4805755395683453, "grad_norm": 1.3163055988137928, "learning_rate": 7.67816091954023e-05, "loss": 0.4706, "step": 334 }, { "epoch": 0.48201438848920863, "grad_norm": 0.8647862396783775, "learning_rate": 7.701149425287357e-05, "loss": 0.469, "step": 335 }, { "epoch": 0.48345323741007196, "grad_norm": 0.7643381038686257, "learning_rate": 7.724137931034484e-05, "loss": 0.4728, "step": 336 }, { "epoch": 0.4848920863309353, "grad_norm": 0.8778154724393922, "learning_rate": 7.74712643678161e-05, "loss": 0.4625, "step": 337 }, { "epoch": 0.48633093525179855, "grad_norm": 1.114347449968458, "learning_rate": 7.770114942528736e-05, "loss": 0.471, "step": 338 }, { "epoch": 0.48776978417266187, "grad_norm": 0.7707670307497948, "learning_rate": 7.793103448275863e-05, "loss": 0.4725, "step": 339 }, { "epoch": 0.4892086330935252, "grad_norm": 0.9295505229318217, "learning_rate": 7.81609195402299e-05, "loss": 0.4583, "step": 340 }, { "epoch": 0.4906474820143885, "grad_norm": 1.3350454744024955, "learning_rate": 7.839080459770115e-05, "loss": 0.468, "step": 341 }, { "epoch": 0.4920863309352518, "grad_norm": 0.8539494522690838, "learning_rate": 7.862068965517242e-05, "loss": 0.4654, "step": 342 }, { "epoch": 0.4935251798561151, "grad_norm": 0.851472197165697, "learning_rate": 7.885057471264368e-05, "loss": 0.4592, "step": 343 }, { "epoch": 0.4949640287769784, "grad_norm": 0.7949738359780544, "learning_rate": 7.908045977011495e-05, "loss": 0.4674, "step": 344 }, { "epoch": 0.49640287769784175, "grad_norm": 0.6553751988750726, "learning_rate": 7.931034482758621e-05, "loss": 0.4653, "step": 345 }, { "epoch": 0.497841726618705, "grad_norm": 0.9279428864744551, "learning_rate": 7.954022988505747e-05, "loss": 0.4661, "step": 346 }, { "epoch": 0.49928057553956834, "grad_norm": 1.081074146306807, "learning_rate": 7.977011494252874e-05, "loss": 0.4613, "step": 347 }, { "epoch": 0.5007194244604316, "grad_norm": 0.8701513845004177, "learning_rate": 8e-05, "loss": 0.4624, "step": 348 }, { "epoch": 0.5021582733812949, "grad_norm": 1.2328271579662156, "learning_rate": 7.999997981289966e-05, "loss": 0.4798, "step": 349 }, { "epoch": 0.5035971223021583, "grad_norm": 0.8508168481221876, "learning_rate": 7.999991925161896e-05, "loss": 0.4626, "step": 350 }, { "epoch": 0.5050359712230216, "grad_norm": 0.8463101325573931, "learning_rate": 7.999981831621906e-05, "loss": 0.4758, "step": 351 }, { "epoch": 0.5064748201438849, "grad_norm": 1.1928759234173607, "learning_rate": 7.999967700680183e-05, "loss": 0.4797, "step": 352 }, { "epoch": 0.5079136690647482, "grad_norm": 1.4347979499250567, "learning_rate": 7.99994953235099e-05, "loss": 0.4592, "step": 353 }, { "epoch": 0.5093525179856115, "grad_norm": 0.8032557148584375, "learning_rate": 7.999927326652667e-05, "loss": 0.4683, "step": 354 }, { "epoch": 0.5107913669064749, "grad_norm": 1.0275144868037598, "learning_rate": 7.999901083607624e-05, "loss": 0.4708, "step": 355 }, { "epoch": 0.5122302158273381, "grad_norm": 1.3566895504819316, "learning_rate": 7.99987080324235e-05, "loss": 0.4681, "step": 356 }, { "epoch": 0.5136690647482014, "grad_norm": 1.0626475712542294, "learning_rate": 7.999836485587415e-05, "loss": 0.4619, "step": 357 }, { "epoch": 0.5151079136690647, "grad_norm": 0.9998118581806145, "learning_rate": 7.99979813067745e-05, "loss": 0.4576, "step": 358 }, { "epoch": 0.516546762589928, "grad_norm": 0.9557268904806338, "learning_rate": 7.999755738551171e-05, "loss": 0.4668, "step": 359 }, { "epoch": 0.5179856115107914, "grad_norm": 1.0796775023064598, "learning_rate": 7.999709309251368e-05, "loss": 0.4574, "step": 360 }, { "epoch": 0.5194244604316547, "grad_norm": 1.0500619364793755, "learning_rate": 7.999658842824904e-05, "loss": 0.4577, "step": 361 }, { "epoch": 0.520863309352518, "grad_norm": 0.9744110414015235, "learning_rate": 7.999604339322717e-05, "loss": 0.4604, "step": 362 }, { "epoch": 0.5223021582733813, "grad_norm": 0.6004652084768644, "learning_rate": 7.999545798799823e-05, "loss": 0.4583, "step": 363 }, { "epoch": 0.5237410071942447, "grad_norm": 0.4987568342275504, "learning_rate": 7.999483221315307e-05, "loss": 0.4656, "step": 364 }, { "epoch": 0.5251798561151079, "grad_norm": 0.6004131997500541, "learning_rate": 7.999416606932331e-05, "loss": 0.4534, "step": 365 }, { "epoch": 0.5266187050359712, "grad_norm": 0.6149304879668175, "learning_rate": 7.999345955718136e-05, "loss": 0.4613, "step": 366 }, { "epoch": 0.5280575539568345, "grad_norm": 0.8500344008908939, "learning_rate": 7.999271267744033e-05, "loss": 0.4652, "step": 367 }, { "epoch": 0.5294964028776978, "grad_norm": 0.9413054629306632, "learning_rate": 7.999192543085407e-05, "loss": 0.4687, "step": 368 }, { "epoch": 0.5309352517985612, "grad_norm": 1.2121156729309888, "learning_rate": 7.999109781821722e-05, "loss": 0.4625, "step": 369 }, { "epoch": 0.5323741007194245, "grad_norm": 0.7421231375424578, "learning_rate": 7.999022984036512e-05, "loss": 0.4623, "step": 370 }, { "epoch": 0.5338129496402878, "grad_norm": 0.6597146546987727, "learning_rate": 7.998932149817386e-05, "loss": 0.457, "step": 371 }, { "epoch": 0.5352517985611511, "grad_norm": 0.9338747126537402, "learning_rate": 7.998837279256028e-05, "loss": 0.4741, "step": 372 }, { "epoch": 0.5366906474820143, "grad_norm": 1.1172196724004102, "learning_rate": 7.998738372448196e-05, "loss": 0.4587, "step": 373 }, { "epoch": 0.5381294964028777, "grad_norm": 0.7812818117620776, "learning_rate": 7.998635429493726e-05, "loss": 0.4728, "step": 374 }, { "epoch": 0.539568345323741, "grad_norm": 0.6787927858149476, "learning_rate": 7.998528450496519e-05, "loss": 0.4578, "step": 375 }, { "epoch": 0.5410071942446043, "grad_norm": 0.714091642204277, "learning_rate": 7.998417435564557e-05, "loss": 0.4722, "step": 376 }, { "epoch": 0.5424460431654676, "grad_norm": 0.8504914934981742, "learning_rate": 7.998302384809893e-05, "loss": 0.4537, "step": 377 }, { "epoch": 0.543884892086331, "grad_norm": 0.9898476948452237, "learning_rate": 7.998183298348654e-05, "loss": 0.4664, "step": 378 }, { "epoch": 0.5453237410071943, "grad_norm": 1.2535233391949137, "learning_rate": 7.998060176301041e-05, "loss": 0.4541, "step": 379 }, { "epoch": 0.5467625899280576, "grad_norm": 0.6997883291597523, "learning_rate": 7.997933018791327e-05, "loss": 0.4583, "step": 380 }, { "epoch": 0.5482014388489208, "grad_norm": 0.7736047534054794, "learning_rate": 7.99780182594786e-05, "loss": 0.4607, "step": 381 }, { "epoch": 0.5496402877697841, "grad_norm": 0.849423061431514, "learning_rate": 7.99766659790306e-05, "loss": 0.4664, "step": 382 }, { "epoch": 0.5510791366906475, "grad_norm": 0.8499243646515073, "learning_rate": 7.997527334793419e-05, "loss": 0.4532, "step": 383 }, { "epoch": 0.5525179856115108, "grad_norm": 0.9507351440178784, "learning_rate": 7.997384036759505e-05, "loss": 0.4644, "step": 384 }, { "epoch": 0.5539568345323741, "grad_norm": 0.7880806752546511, "learning_rate": 7.997236703945955e-05, "loss": 0.4565, "step": 385 }, { "epoch": 0.5553956834532374, "grad_norm": 0.8235336874532355, "learning_rate": 7.99708533650148e-05, "loss": 0.4602, "step": 386 }, { "epoch": 0.5568345323741007, "grad_norm": 1.1406064704681915, "learning_rate": 7.996929934578864e-05, "loss": 0.4631, "step": 387 }, { "epoch": 0.5582733812949641, "grad_norm": 1.0936145617683148, "learning_rate": 7.996770498334963e-05, "loss": 0.4665, "step": 388 }, { "epoch": 0.5597122302158274, "grad_norm": 0.8845721786169036, "learning_rate": 7.996607027930705e-05, "loss": 0.4534, "step": 389 }, { "epoch": 0.5611510791366906, "grad_norm": 0.8780922124552242, "learning_rate": 7.996439523531088e-05, "loss": 0.4624, "step": 390 }, { "epoch": 0.5625899280575539, "grad_norm": 1.004467673049218, "learning_rate": 7.996267985305186e-05, "loss": 0.4589, "step": 391 }, { "epoch": 0.5640287769784172, "grad_norm": 1.0826878157187065, "learning_rate": 7.99609241342614e-05, "loss": 0.4558, "step": 392 }, { "epoch": 0.5654676258992806, "grad_norm": 0.8097477147009582, "learning_rate": 7.995912808071164e-05, "loss": 0.4619, "step": 393 }, { "epoch": 0.5669064748201439, "grad_norm": 0.693965038621037, "learning_rate": 7.995729169421545e-05, "loss": 0.4616, "step": 394 }, { "epoch": 0.5683453237410072, "grad_norm": 0.9308338290598528, "learning_rate": 7.99554149766264e-05, "loss": 0.4622, "step": 395 }, { "epoch": 0.5697841726618705, "grad_norm": 1.1817576406269814, "learning_rate": 7.995349792983874e-05, "loss": 0.4614, "step": 396 }, { "epoch": 0.5712230215827339, "grad_norm": 0.6528770424649769, "learning_rate": 7.995154055578748e-05, "loss": 0.459, "step": 397 }, { "epoch": 0.5726618705035971, "grad_norm": 0.45150970577996347, "learning_rate": 7.994954285644827e-05, "loss": 0.4591, "step": 398 }, { "epoch": 0.5741007194244604, "grad_norm": 0.5556005129157767, "learning_rate": 7.994750483383753e-05, "loss": 0.4613, "step": 399 }, { "epoch": 0.5755395683453237, "grad_norm": 0.6912244983340633, "learning_rate": 7.994542649001235e-05, "loss": 0.4553, "step": 400 }, { "epoch": 0.576978417266187, "grad_norm": 0.8960022596299475, "learning_rate": 7.994330782707048e-05, "loss": 0.4597, "step": 401 }, { "epoch": 0.5784172661870504, "grad_norm": 1.025870832226272, "learning_rate": 7.994114884715045e-05, "loss": 0.451, "step": 402 }, { "epoch": 0.5798561151079137, "grad_norm": 0.8911346100337705, "learning_rate": 7.99389495524314e-05, "loss": 0.4627, "step": 403 }, { "epoch": 0.581294964028777, "grad_norm": 0.7554954071013955, "learning_rate": 7.993670994513321e-05, "loss": 0.4526, "step": 404 }, { "epoch": 0.5827338129496403, "grad_norm": 0.6689831752367191, "learning_rate": 7.993443002751646e-05, "loss": 0.4594, "step": 405 }, { "epoch": 0.5841726618705037, "grad_norm": 0.5578468632676835, "learning_rate": 7.993210980188236e-05, "loss": 0.4526, "step": 406 }, { "epoch": 0.5856115107913669, "grad_norm": 0.5803040347753762, "learning_rate": 7.992974927057287e-05, "loss": 0.4643, "step": 407 }, { "epoch": 0.5870503597122302, "grad_norm": 0.6577126675911862, "learning_rate": 7.992734843597058e-05, "loss": 0.4604, "step": 408 }, { "epoch": 0.5884892086330935, "grad_norm": 0.6586582167853312, "learning_rate": 7.992490730049881e-05, "loss": 0.4506, "step": 409 }, { "epoch": 0.5899280575539568, "grad_norm": 0.5099795931765497, "learning_rate": 7.992242586662152e-05, "loss": 0.4527, "step": 410 }, { "epoch": 0.5913669064748202, "grad_norm": 0.43969451590829833, "learning_rate": 7.991990413684336e-05, "loss": 0.4408, "step": 411 }, { "epoch": 0.5928057553956835, "grad_norm": 0.48368464104887693, "learning_rate": 7.991734211370965e-05, "loss": 0.4543, "step": 412 }, { "epoch": 0.5942446043165468, "grad_norm": 0.6189368627099892, "learning_rate": 7.991473979980637e-05, "loss": 0.4501, "step": 413 }, { "epoch": 0.5956834532374101, "grad_norm": 0.7712221623764791, "learning_rate": 7.99120971977602e-05, "loss": 0.4439, "step": 414 }, { "epoch": 0.5971223021582733, "grad_norm": 1.1017419802815924, "learning_rate": 7.990941431023844e-05, "loss": 0.4692, "step": 415 }, { "epoch": 0.5985611510791367, "grad_norm": 1.3463982426436771, "learning_rate": 7.990669113994911e-05, "loss": 0.4535, "step": 416 }, { "epoch": 0.6, "grad_norm": 0.5313608173987471, "learning_rate": 7.99039276896408e-05, "loss": 0.4552, "step": 417 }, { "epoch": 0.6014388489208633, "grad_norm": 0.8384027425355336, "learning_rate": 7.990112396210288e-05, "loss": 0.4498, "step": 418 }, { "epoch": 0.6028776978417266, "grad_norm": 1.5419005868846274, "learning_rate": 7.989827996016525e-05, "loss": 0.4541, "step": 419 }, { "epoch": 0.60431654676259, "grad_norm": 0.5236106532851287, "learning_rate": 7.989539568669856e-05, "loss": 0.4547, "step": 420 }, { "epoch": 0.6057553956834533, "grad_norm": 1.5495871475389187, "learning_rate": 7.989247114461403e-05, "loss": 0.4727, "step": 421 }, { "epoch": 0.6071942446043166, "grad_norm": 0.6710377853395758, "learning_rate": 7.988950633686358e-05, "loss": 0.4629, "step": 422 }, { "epoch": 0.6086330935251798, "grad_norm": 1.1495687379614778, "learning_rate": 7.988650126643976e-05, "loss": 0.4643, "step": 423 }, { "epoch": 0.6100719424460431, "grad_norm": 0.7728837622561019, "learning_rate": 7.988345593637572e-05, "loss": 0.4505, "step": 424 }, { "epoch": 0.6115107913669064, "grad_norm": 0.7153271121647338, "learning_rate": 7.988037034974532e-05, "loss": 0.454, "step": 425 }, { "epoch": 0.6129496402877698, "grad_norm": 0.8556877038559139, "learning_rate": 7.9877244509663e-05, "loss": 0.4501, "step": 426 }, { "epoch": 0.6143884892086331, "grad_norm": 0.5822077080483781, "learning_rate": 7.987407841928384e-05, "loss": 0.4555, "step": 427 }, { "epoch": 0.6158273381294964, "grad_norm": 0.7502935198956667, "learning_rate": 7.987087208180355e-05, "loss": 0.4597, "step": 428 }, { "epoch": 0.6172661870503597, "grad_norm": 0.894659880332655, "learning_rate": 7.986762550045844e-05, "loss": 0.4425, "step": 429 }, { "epoch": 0.6187050359712231, "grad_norm": 0.8428743370334983, "learning_rate": 7.98643386785255e-05, "loss": 0.4614, "step": 430 }, { "epoch": 0.6201438848920864, "grad_norm": 1.0150305538139375, "learning_rate": 7.986101161932227e-05, "loss": 0.4542, "step": 431 }, { "epoch": 0.6215827338129496, "grad_norm": 1.1915793912859454, "learning_rate": 7.985764432620695e-05, "loss": 0.4591, "step": 432 }, { "epoch": 0.6230215827338129, "grad_norm": 0.9500995658899973, "learning_rate": 7.985423680257833e-05, "loss": 0.4557, "step": 433 }, { "epoch": 0.6244604316546762, "grad_norm": 0.8504572576195092, "learning_rate": 7.985078905187582e-05, "loss": 0.4489, "step": 434 }, { "epoch": 0.6258992805755396, "grad_norm": 0.6552243752639753, "learning_rate": 7.984730107757942e-05, "loss": 0.4535, "step": 435 }, { "epoch": 0.6273381294964029, "grad_norm": 0.9486300093548566, "learning_rate": 7.984377288320973e-05, "loss": 0.4709, "step": 436 }, { "epoch": 0.6287769784172662, "grad_norm": 1.2450786856279674, "learning_rate": 7.984020447232795e-05, "loss": 0.4603, "step": 437 }, { "epoch": 0.6302158273381295, "grad_norm": 0.7081913514292113, "learning_rate": 7.983659584853586e-05, "loss": 0.4555, "step": 438 }, { "epoch": 0.6316546762589929, "grad_norm": 0.5133251754462216, "learning_rate": 7.983294701547588e-05, "loss": 0.457, "step": 439 }, { "epoch": 0.6330935251798561, "grad_norm": 0.6949146616899369, "learning_rate": 7.982925797683095e-05, "loss": 0.4654, "step": 440 }, { "epoch": 0.6345323741007194, "grad_norm": 0.7996453947268342, "learning_rate": 7.982552873632461e-05, "loss": 0.4503, "step": 441 }, { "epoch": 0.6359712230215827, "grad_norm": 0.7979003739605092, "learning_rate": 7.982175929772102e-05, "loss": 0.4614, "step": 442 }, { "epoch": 0.637410071942446, "grad_norm": 0.7773994652608701, "learning_rate": 7.981794966482486e-05, "loss": 0.4603, "step": 443 }, { "epoch": 0.6388489208633094, "grad_norm": 0.7595360865967253, "learning_rate": 7.98140998414814e-05, "loss": 0.4578, "step": 444 }, { "epoch": 0.6402877697841727, "grad_norm": 0.7520837510292663, "learning_rate": 7.98102098315765e-05, "loss": 0.4502, "step": 445 }, { "epoch": 0.641726618705036, "grad_norm": 0.8160841775970298, "learning_rate": 7.980627963903654e-05, "loss": 0.4485, "step": 446 }, { "epoch": 0.6431654676258993, "grad_norm": 1.0464193362779555, "learning_rate": 7.980230926782848e-05, "loss": 0.4477, "step": 447 }, { "epoch": 0.6446043165467625, "grad_norm": 1.0091826922037084, "learning_rate": 7.979829872195984e-05, "loss": 0.4538, "step": 448 }, { "epoch": 0.6460431654676259, "grad_norm": 0.8990157367734588, "learning_rate": 7.979424800547869e-05, "loss": 0.4557, "step": 449 }, { "epoch": 0.6474820143884892, "grad_norm": 0.8300047632795456, "learning_rate": 7.979015712247365e-05, "loss": 0.458, "step": 450 }, { "epoch": 0.6489208633093525, "grad_norm": 0.7149509540074934, "learning_rate": 7.978602607707383e-05, "loss": 0.47, "step": 451 }, { "epoch": 0.6503597122302158, "grad_norm": 0.5970009144007832, "learning_rate": 7.978185487344897e-05, "loss": 0.4519, "step": 452 }, { "epoch": 0.6517985611510791, "grad_norm": 0.4967763910687283, "learning_rate": 7.977764351580928e-05, "loss": 0.4439, "step": 453 }, { "epoch": 0.6532374100719425, "grad_norm": 0.5092650465666206, "learning_rate": 7.97733920084055e-05, "loss": 0.4511, "step": 454 }, { "epoch": 0.6546762589928058, "grad_norm": 0.6185513764677956, "learning_rate": 7.976910035552892e-05, "loss": 0.4513, "step": 455 }, { "epoch": 0.6561151079136691, "grad_norm": 0.6398423442285093, "learning_rate": 7.976476856151134e-05, "loss": 0.4493, "step": 456 }, { "epoch": 0.6575539568345323, "grad_norm": 0.7664721097324098, "learning_rate": 7.976039663072509e-05, "loss": 0.4441, "step": 457 }, { "epoch": 0.6589928057553956, "grad_norm": 0.827523466225571, "learning_rate": 7.975598456758298e-05, "loss": 0.4593, "step": 458 }, { "epoch": 0.660431654676259, "grad_norm": 0.9182022459615838, "learning_rate": 7.975153237653836e-05, "loss": 0.4487, "step": 459 }, { "epoch": 0.6618705035971223, "grad_norm": 0.9735790118953446, "learning_rate": 7.974704006208509e-05, "loss": 0.4479, "step": 460 }, { "epoch": 0.6633093525179856, "grad_norm": 0.9415118087459532, "learning_rate": 7.974250762875747e-05, "loss": 0.4533, "step": 461 }, { "epoch": 0.6647482014388489, "grad_norm": 0.7848209637914124, "learning_rate": 7.973793508113035e-05, "loss": 0.4423, "step": 462 }, { "epoch": 0.6661870503597123, "grad_norm": 0.5158906455961678, "learning_rate": 7.973332242381908e-05, "loss": 0.4514, "step": 463 }, { "epoch": 0.6676258992805756, "grad_norm": 0.41797571499127906, "learning_rate": 7.972866966147942e-05, "loss": 0.4475, "step": 464 }, { "epoch": 0.6690647482014388, "grad_norm": 0.47340136151923096, "learning_rate": 7.972397679880771e-05, "loss": 0.4517, "step": 465 }, { "epoch": 0.6705035971223021, "grad_norm": 0.5787418072919882, "learning_rate": 7.971924384054068e-05, "loss": 0.4573, "step": 466 }, { "epoch": 0.6719424460431654, "grad_norm": 0.6523824054062983, "learning_rate": 7.971447079145557e-05, "loss": 0.4542, "step": 467 }, { "epoch": 0.6733812949640288, "grad_norm": 0.7105356243378426, "learning_rate": 7.970965765637011e-05, "loss": 0.4577, "step": 468 }, { "epoch": 0.6748201438848921, "grad_norm": 0.692146495435072, "learning_rate": 7.970480444014244e-05, "loss": 0.4499, "step": 469 }, { "epoch": 0.6762589928057554, "grad_norm": 0.6645295929196283, "learning_rate": 7.969991114767114e-05, "loss": 0.465, "step": 470 }, { "epoch": 0.6776978417266187, "grad_norm": 0.6729940725627133, "learning_rate": 7.969497778389534e-05, "loss": 0.4536, "step": 471 }, { "epoch": 0.679136690647482, "grad_norm": 0.6771506293473587, "learning_rate": 7.969000435379454e-05, "loss": 0.4508, "step": 472 }, { "epoch": 0.6805755395683454, "grad_norm": 0.6495567644754584, "learning_rate": 7.968499086238867e-05, "loss": 0.443, "step": 473 }, { "epoch": 0.6820143884892086, "grad_norm": 0.6882929173622521, "learning_rate": 7.967993731473815e-05, "loss": 0.448, "step": 474 }, { "epoch": 0.6834532374100719, "grad_norm": 0.7237880211855974, "learning_rate": 7.96748437159438e-05, "loss": 0.449, "step": 475 }, { "epoch": 0.6848920863309352, "grad_norm": 0.8347328822613261, "learning_rate": 7.966971007114686e-05, "loss": 0.4523, "step": 476 }, { "epoch": 0.6863309352517986, "grad_norm": 0.9719198065761699, "learning_rate": 7.966453638552901e-05, "loss": 0.4591, "step": 477 }, { "epoch": 0.6877697841726619, "grad_norm": 1.040787964834932, "learning_rate": 7.965932266431232e-05, "loss": 0.4499, "step": 478 }, { "epoch": 0.6892086330935252, "grad_norm": 0.9334315562534466, "learning_rate": 7.96540689127593e-05, "loss": 0.4545, "step": 479 }, { "epoch": 0.6906474820143885, "grad_norm": 0.845279321333729, "learning_rate": 7.964877513617285e-05, "loss": 0.4475, "step": 480 }, { "epoch": 0.6920863309352518, "grad_norm": 0.7979150737125542, "learning_rate": 7.964344133989627e-05, "loss": 0.4508, "step": 481 }, { "epoch": 0.6935251798561151, "grad_norm": 0.8147584264372341, "learning_rate": 7.963806752931324e-05, "loss": 0.4572, "step": 482 }, { "epoch": 0.6949640287769784, "grad_norm": 0.7981752284101542, "learning_rate": 7.963265370984786e-05, "loss": 0.4459, "step": 483 }, { "epoch": 0.6964028776978417, "grad_norm": 0.5970863793585182, "learning_rate": 7.962719988696458e-05, "loss": 0.4515, "step": 484 }, { "epoch": 0.697841726618705, "grad_norm": 0.4620326784454222, "learning_rate": 7.962170606616826e-05, "loss": 0.4476, "step": 485 }, { "epoch": 0.6992805755395683, "grad_norm": 0.6131933586653694, "learning_rate": 7.96161722530041e-05, "loss": 0.4539, "step": 486 }, { "epoch": 0.7007194244604317, "grad_norm": 0.6436228395416139, "learning_rate": 7.96105984530577e-05, "loss": 0.4441, "step": 487 }, { "epoch": 0.702158273381295, "grad_norm": 0.6941374919640537, "learning_rate": 7.9604984671955e-05, "loss": 0.4598, "step": 488 }, { "epoch": 0.7035971223021583, "grad_norm": 0.8557240788774572, "learning_rate": 7.959933091536227e-05, "loss": 0.4587, "step": 489 }, { "epoch": 0.7050359712230215, "grad_norm": 0.8250817581367761, "learning_rate": 7.95936371889862e-05, "loss": 0.4522, "step": 490 }, { "epoch": 0.7064748201438849, "grad_norm": 0.7243783703842612, "learning_rate": 7.958790349857375e-05, "loss": 0.446, "step": 491 }, { "epoch": 0.7079136690647482, "grad_norm": 0.6328054932553656, "learning_rate": 7.958212984991226e-05, "loss": 0.4511, "step": 492 }, { "epoch": 0.7093525179856115, "grad_norm": 0.6050868740033291, "learning_rate": 7.957631624882938e-05, "loss": 0.4478, "step": 493 }, { "epoch": 0.7107913669064748, "grad_norm": 0.6145153359274808, "learning_rate": 7.957046270119313e-05, "loss": 0.4511, "step": 494 }, { "epoch": 0.7122302158273381, "grad_norm": 0.4955750148447891, "learning_rate": 7.956456921291178e-05, "loss": 0.4437, "step": 495 }, { "epoch": 0.7136690647482015, "grad_norm": 0.4658103703985101, "learning_rate": 7.955863578993396e-05, "loss": 0.4485, "step": 496 }, { "epoch": 0.7151079136690648, "grad_norm": 0.4789906846468974, "learning_rate": 7.955266243824864e-05, "loss": 0.4555, "step": 497 }, { "epoch": 0.7165467625899281, "grad_norm": 0.3628724330812125, "learning_rate": 7.954664916388499e-05, "loss": 0.4484, "step": 498 }, { "epoch": 0.7179856115107913, "grad_norm": 0.38832507873774125, "learning_rate": 7.954059597291257e-05, "loss": 0.4471, "step": 499 }, { "epoch": 0.7194244604316546, "grad_norm": 0.4895740944847101, "learning_rate": 7.953450287144121e-05, "loss": 0.4527, "step": 500 }, { "epoch": 0.720863309352518, "grad_norm": 0.560521301328584, "learning_rate": 7.952836986562099e-05, "loss": 0.4527, "step": 501 }, { "epoch": 0.7223021582733813, "grad_norm": 0.5655012201487319, "learning_rate": 7.952219696164231e-05, "loss": 0.446, "step": 502 }, { "epoch": 0.7237410071942446, "grad_norm": 0.6256450052744676, "learning_rate": 7.95159841657358e-05, "loss": 0.4578, "step": 503 }, { "epoch": 0.7251798561151079, "grad_norm": 0.843383844614636, "learning_rate": 7.950973148417239e-05, "loss": 0.456, "step": 504 }, { "epoch": 0.7266187050359713, "grad_norm": 1.2252756875756765, "learning_rate": 7.950343892326327e-05, "loss": 0.4481, "step": 505 }, { "epoch": 0.7280575539568346, "grad_norm": 0.81689007020895, "learning_rate": 7.949710648935984e-05, "loss": 0.4444, "step": 506 }, { "epoch": 0.7294964028776978, "grad_norm": 0.5386696162694277, "learning_rate": 7.949073418885378e-05, "loss": 0.439, "step": 507 }, { "epoch": 0.7309352517985611, "grad_norm": 0.4174766191896591, "learning_rate": 7.948432202817703e-05, "loss": 0.452, "step": 508 }, { "epoch": 0.7323741007194244, "grad_norm": 0.6394814206881526, "learning_rate": 7.94778700138017e-05, "loss": 0.4457, "step": 509 }, { "epoch": 0.7338129496402878, "grad_norm": 0.7941586105291152, "learning_rate": 7.947137815224018e-05, "loss": 0.4562, "step": 510 }, { "epoch": 0.7352517985611511, "grad_norm": 0.8189576364572564, "learning_rate": 7.946484645004508e-05, "loss": 0.4598, "step": 511 }, { "epoch": 0.7366906474820144, "grad_norm": 0.740740940198565, "learning_rate": 7.945827491380916e-05, "loss": 0.4477, "step": 512 }, { "epoch": 0.7381294964028777, "grad_norm": 0.6823905757803396, "learning_rate": 7.945166355016548e-05, "loss": 0.4514, "step": 513 }, { "epoch": 0.739568345323741, "grad_norm": 0.6897531675747028, "learning_rate": 7.944501236578722e-05, "loss": 0.4437, "step": 514 }, { "epoch": 0.7410071942446043, "grad_norm": 0.6769501230737464, "learning_rate": 7.943832136738783e-05, "loss": 0.4474, "step": 515 }, { "epoch": 0.7424460431654676, "grad_norm": 0.6940470611142417, "learning_rate": 7.943159056172084e-05, "loss": 0.4568, "step": 516 }, { "epoch": 0.7438848920863309, "grad_norm": 0.6391637366081594, "learning_rate": 7.942481995558007e-05, "loss": 0.4407, "step": 517 }, { "epoch": 0.7453237410071942, "grad_norm": 0.7123591035777244, "learning_rate": 7.941800955579946e-05, "loss": 0.4617, "step": 518 }, { "epoch": 0.7467625899280576, "grad_norm": 0.8389132097956593, "learning_rate": 7.941115936925311e-05, "loss": 0.4465, "step": 519 }, { "epoch": 0.7482014388489209, "grad_norm": 0.8571734890511532, "learning_rate": 7.940426940285529e-05, "loss": 0.4475, "step": 520 }, { "epoch": 0.7496402877697842, "grad_norm": 0.8845916711288304, "learning_rate": 7.939733966356042e-05, "loss": 0.4404, "step": 521 }, { "epoch": 0.7510791366906475, "grad_norm": 0.8603567525751223, "learning_rate": 7.939037015836308e-05, "loss": 0.4549, "step": 522 }, { "epoch": 0.7525179856115108, "grad_norm": 0.6545784514861386, "learning_rate": 7.938336089429796e-05, "loss": 0.4436, "step": 523 }, { "epoch": 0.753956834532374, "grad_norm": 0.46746127864891357, "learning_rate": 7.937631187843991e-05, "loss": 0.4346, "step": 524 }, { "epoch": 0.7553956834532374, "grad_norm": 0.6535704298488452, "learning_rate": 7.936922311790388e-05, "loss": 0.4457, "step": 525 }, { "epoch": 0.7568345323741007, "grad_norm": 0.8604580580713901, "learning_rate": 7.936209461984495e-05, "loss": 0.4554, "step": 526 }, { "epoch": 0.758273381294964, "grad_norm": 0.8596757460031823, "learning_rate": 7.935492639145831e-05, "loss": 0.4547, "step": 527 }, { "epoch": 0.7597122302158273, "grad_norm": 0.6855811230205074, "learning_rate": 7.934771843997922e-05, "loss": 0.4527, "step": 528 }, { "epoch": 0.7611510791366907, "grad_norm": 0.4247306668156656, "learning_rate": 7.934047077268311e-05, "loss": 0.4529, "step": 529 }, { "epoch": 0.762589928057554, "grad_norm": 0.5480303698838035, "learning_rate": 7.93331833968854e-05, "loss": 0.4467, "step": 530 }, { "epoch": 0.7640287769784173, "grad_norm": 0.7585712280153364, "learning_rate": 7.932585631994168e-05, "loss": 0.4381, "step": 531 }, { "epoch": 0.7654676258992805, "grad_norm": 0.8302009985547316, "learning_rate": 7.931848954924754e-05, "loss": 0.449, "step": 532 }, { "epoch": 0.7669064748201438, "grad_norm": 0.7819737375461054, "learning_rate": 7.931108309223868e-05, "loss": 0.4474, "step": 533 }, { "epoch": 0.7683453237410072, "grad_norm": 0.6155414299458252, "learning_rate": 7.930363695639085e-05, "loss": 0.4444, "step": 534 }, { "epoch": 0.7697841726618705, "grad_norm": 0.6867692350658405, "learning_rate": 7.929615114921984e-05, "loss": 0.4431, "step": 535 }, { "epoch": 0.7712230215827338, "grad_norm": 0.8520333131629818, "learning_rate": 7.92886256782815e-05, "loss": 0.4495, "step": 536 }, { "epoch": 0.7726618705035971, "grad_norm": 0.928867857208261, "learning_rate": 7.928106055117168e-05, "loss": 0.4447, "step": 537 }, { "epoch": 0.7741007194244605, "grad_norm": 0.8792475479227084, "learning_rate": 7.927345577552627e-05, "loss": 0.4361, "step": 538 }, { "epoch": 0.7755395683453238, "grad_norm": 0.8503296097274649, "learning_rate": 7.926581135902122e-05, "loss": 0.4404, "step": 539 }, { "epoch": 0.7769784172661871, "grad_norm": 0.8781035666975411, "learning_rate": 7.925812730937245e-05, "loss": 0.4395, "step": 540 }, { "epoch": 0.7784172661870503, "grad_norm": 0.7640793827684093, "learning_rate": 7.92504036343359e-05, "loss": 0.4418, "step": 541 }, { "epoch": 0.7798561151079136, "grad_norm": 0.5452920148681172, "learning_rate": 7.924264034170747e-05, "loss": 0.439, "step": 542 }, { "epoch": 0.781294964028777, "grad_norm": 0.4078470063626378, "learning_rate": 7.923483743932311e-05, "loss": 0.438, "step": 543 }, { "epoch": 0.7827338129496403, "grad_norm": 0.5268504728636281, "learning_rate": 7.922699493505871e-05, "loss": 0.4445, "step": 544 }, { "epoch": 0.7841726618705036, "grad_norm": 0.5668670210410232, "learning_rate": 7.921911283683013e-05, "loss": 0.4481, "step": 545 }, { "epoch": 0.7856115107913669, "grad_norm": 0.5041645002269407, "learning_rate": 7.921119115259322e-05, "loss": 0.4447, "step": 546 }, { "epoch": 0.7870503597122303, "grad_norm": 0.5076344748214742, "learning_rate": 7.920322989034377e-05, "loss": 0.4565, "step": 547 }, { "epoch": 0.7884892086330936, "grad_norm": 0.6064613677143496, "learning_rate": 7.919522905811752e-05, "loss": 0.4321, "step": 548 }, { "epoch": 0.7899280575539568, "grad_norm": 0.7411255790888717, "learning_rate": 7.918718866399012e-05, "loss": 0.4495, "step": 549 }, { "epoch": 0.7913669064748201, "grad_norm": 0.8797687912923331, "learning_rate": 7.917910871607723e-05, "loss": 0.4509, "step": 550 }, { "epoch": 0.7928057553956834, "grad_norm": 0.9548767462700447, "learning_rate": 7.917098922253436e-05, "loss": 0.4423, "step": 551 }, { "epoch": 0.7942446043165468, "grad_norm": 0.9972894572478253, "learning_rate": 7.916283019155696e-05, "loss": 0.4503, "step": 552 }, { "epoch": 0.7956834532374101, "grad_norm": 0.9893577525224617, "learning_rate": 7.915463163138041e-05, "loss": 0.4595, "step": 553 }, { "epoch": 0.7971223021582734, "grad_norm": 0.8972701855596109, "learning_rate": 7.914639355027995e-05, "loss": 0.4391, "step": 554 }, { "epoch": 0.7985611510791367, "grad_norm": 0.7835065881851512, "learning_rate": 7.913811595657072e-05, "loss": 0.4372, "step": 555 }, { "epoch": 0.8, "grad_norm": 0.6394121885285287, "learning_rate": 7.912979885860776e-05, "loss": 0.452, "step": 556 }, { "epoch": 0.8014388489208633, "grad_norm": 0.6801870197183523, "learning_rate": 7.912144226478598e-05, "loss": 0.447, "step": 557 }, { "epoch": 0.8028776978417266, "grad_norm": 0.7980152451628971, "learning_rate": 7.911304618354015e-05, "loss": 0.4487, "step": 558 }, { "epoch": 0.8043165467625899, "grad_norm": 0.7279067435889713, "learning_rate": 7.910461062334488e-05, "loss": 0.4328, "step": 559 }, { "epoch": 0.8057553956834532, "grad_norm": 0.6438312702522009, "learning_rate": 7.909613559271467e-05, "loss": 0.4527, "step": 560 }, { "epoch": 0.8071942446043165, "grad_norm": 0.5888117923156623, "learning_rate": 7.908762110020382e-05, "loss": 0.4484, "step": 561 }, { "epoch": 0.8086330935251799, "grad_norm": 0.5027698438536533, "learning_rate": 7.907906715440649e-05, "loss": 0.441, "step": 562 }, { "epoch": 0.8100719424460432, "grad_norm": 0.33582201172495263, "learning_rate": 7.907047376395661e-05, "loss": 0.448, "step": 563 }, { "epoch": 0.8115107913669065, "grad_norm": 0.37382020125050747, "learning_rate": 7.906184093752801e-05, "loss": 0.447, "step": 564 }, { "epoch": 0.8129496402877698, "grad_norm": 0.535153938679081, "learning_rate": 7.905316868383425e-05, "loss": 0.4549, "step": 565 }, { "epoch": 0.814388489208633, "grad_norm": 0.6398796518984677, "learning_rate": 7.904445701162872e-05, "loss": 0.4415, "step": 566 }, { "epoch": 0.8158273381294964, "grad_norm": 0.7020604505602379, "learning_rate": 7.903570592970458e-05, "loss": 0.4401, "step": 567 }, { "epoch": 0.8172661870503597, "grad_norm": 0.6924476831387525, "learning_rate": 7.902691544689479e-05, "loss": 0.4373, "step": 568 }, { "epoch": 0.818705035971223, "grad_norm": 0.720414887360444, "learning_rate": 7.901808557207206e-05, "loss": 0.4413, "step": 569 }, { "epoch": 0.8201438848920863, "grad_norm": 0.795139580874903, "learning_rate": 7.900921631414887e-05, "loss": 0.4531, "step": 570 }, { "epoch": 0.8215827338129497, "grad_norm": 0.8747103811649327, "learning_rate": 7.900030768207746e-05, "loss": 0.4496, "step": 571 }, { "epoch": 0.823021582733813, "grad_norm": 0.961793877132737, "learning_rate": 7.899135968484979e-05, "loss": 0.4431, "step": 572 }, { "epoch": 0.8244604316546763, "grad_norm": 0.9721856253540773, "learning_rate": 7.898237233149758e-05, "loss": 0.4417, "step": 573 }, { "epoch": 0.8258992805755395, "grad_norm": 0.8357185389253394, "learning_rate": 7.897334563109225e-05, "loss": 0.4418, "step": 574 }, { "epoch": 0.8273381294964028, "grad_norm": 0.6615605307941537, "learning_rate": 7.896427959274494e-05, "loss": 0.4419, "step": 575 }, { "epoch": 0.8287769784172662, "grad_norm": 0.48424946567746674, "learning_rate": 7.895517422560651e-05, "loss": 0.4401, "step": 576 }, { "epoch": 0.8302158273381295, "grad_norm": 0.42481938118484464, "learning_rate": 7.89460295388675e-05, "loss": 0.4467, "step": 577 }, { "epoch": 0.8316546762589928, "grad_norm": 0.567836520865301, "learning_rate": 7.893684554175817e-05, "loss": 0.4403, "step": 578 }, { "epoch": 0.8330935251798561, "grad_norm": 0.6493035442382297, "learning_rate": 7.892762224354839e-05, "loss": 0.4487, "step": 579 }, { "epoch": 0.8345323741007195, "grad_norm": 0.6415333815536135, "learning_rate": 7.891835965354778e-05, "loss": 0.4423, "step": 580 }, { "epoch": 0.8359712230215828, "grad_norm": 0.5888206024793224, "learning_rate": 7.890905778110557e-05, "loss": 0.4371, "step": 581 }, { "epoch": 0.837410071942446, "grad_norm": 0.4362844457550126, "learning_rate": 7.889971663561065e-05, "loss": 0.441, "step": 582 }, { "epoch": 0.8388489208633093, "grad_norm": 0.516045485925217, "learning_rate": 7.889033622649155e-05, "loss": 0.442, "step": 583 }, { "epoch": 0.8402877697841726, "grad_norm": 0.6938260293020249, "learning_rate": 7.888091656321644e-05, "loss": 0.4439, "step": 584 }, { "epoch": 0.841726618705036, "grad_norm": 0.8866903216895299, "learning_rate": 7.88714576552931e-05, "loss": 0.4445, "step": 585 }, { "epoch": 0.8431654676258993, "grad_norm": 0.9571652867714048, "learning_rate": 7.886195951226892e-05, "loss": 0.4561, "step": 586 }, { "epoch": 0.8446043165467626, "grad_norm": 0.9639686794717786, "learning_rate": 7.885242214373091e-05, "loss": 0.4479, "step": 587 }, { "epoch": 0.8460431654676259, "grad_norm": 0.9415290789476168, "learning_rate": 7.884284555930564e-05, "loss": 0.433, "step": 588 }, { "epoch": 0.8474820143884892, "grad_norm": 0.8602482684889639, "learning_rate": 7.883322976865932e-05, "loss": 0.4436, "step": 589 }, { "epoch": 0.8489208633093526, "grad_norm": 0.7968024067108611, "learning_rate": 7.882357478149767e-05, "loss": 0.443, "step": 590 }, { "epoch": 0.8503597122302158, "grad_norm": 0.6837876068072184, "learning_rate": 7.8813880607566e-05, "loss": 0.4454, "step": 591 }, { "epoch": 0.8517985611510791, "grad_norm": 0.6487747345944853, "learning_rate": 7.880414725664918e-05, "loss": 0.439, "step": 592 }, { "epoch": 0.8532374100719424, "grad_norm": 0.6613059271967316, "learning_rate": 7.879437473857161e-05, "loss": 0.4384, "step": 593 }, { "epoch": 0.8546762589928057, "grad_norm": 0.7117235558673315, "learning_rate": 7.878456306319723e-05, "loss": 0.434, "step": 594 }, { "epoch": 0.8561151079136691, "grad_norm": 0.7068550862253427, "learning_rate": 7.877471224042952e-05, "loss": 0.4433, "step": 595 }, { "epoch": 0.8575539568345324, "grad_norm": 0.5607720123908527, "learning_rate": 7.876482228021144e-05, "loss": 0.4498, "step": 596 }, { "epoch": 0.8589928057553957, "grad_norm": 0.4156934120702612, "learning_rate": 7.875489319252549e-05, "loss": 0.4403, "step": 597 }, { "epoch": 0.860431654676259, "grad_norm": 0.5178057413217351, "learning_rate": 7.874492498739362e-05, "loss": 0.4391, "step": 598 }, { "epoch": 0.8618705035971223, "grad_norm": 0.612546753914525, "learning_rate": 7.87349176748773e-05, "loss": 0.4448, "step": 599 }, { "epoch": 0.8633093525179856, "grad_norm": 0.554902923548074, "learning_rate": 7.872487126507747e-05, "loss": 0.4418, "step": 600 }, { "epoch": 0.8647482014388489, "grad_norm": 0.46737793048588583, "learning_rate": 7.87147857681345e-05, "loss": 0.4471, "step": 601 }, { "epoch": 0.8661870503597122, "grad_norm": 0.5002389683481634, "learning_rate": 7.870466119422826e-05, "loss": 0.4473, "step": 602 }, { "epoch": 0.8676258992805755, "grad_norm": 0.4864787553709012, "learning_rate": 7.869449755357803e-05, "loss": 0.4337, "step": 603 }, { "epoch": 0.8690647482014389, "grad_norm": 0.4529141193587548, "learning_rate": 7.868429485644252e-05, "loss": 0.443, "step": 604 }, { "epoch": 0.8705035971223022, "grad_norm": 0.5213553189981291, "learning_rate": 7.86740531131199e-05, "loss": 0.4453, "step": 605 }, { "epoch": 0.8719424460431655, "grad_norm": 0.614473071399125, "learning_rate": 7.866377233394771e-05, "loss": 0.4323, "step": 606 }, { "epoch": 0.8733812949640288, "grad_norm": 0.5923931578535838, "learning_rate": 7.865345252930291e-05, "loss": 0.4405, "step": 607 }, { "epoch": 0.874820143884892, "grad_norm": 0.4877696955164087, "learning_rate": 7.864309370960184e-05, "loss": 0.4445, "step": 608 }, { "epoch": 0.8762589928057554, "grad_norm": 0.4759437715506379, "learning_rate": 7.863269588530023e-05, "loss": 0.4424, "step": 609 }, { "epoch": 0.8776978417266187, "grad_norm": 0.5031051834295212, "learning_rate": 7.862225906689319e-05, "loss": 0.4451, "step": 610 }, { "epoch": 0.879136690647482, "grad_norm": 0.6237107752983813, "learning_rate": 7.861178326491514e-05, "loss": 0.442, "step": 611 }, { "epoch": 0.8805755395683453, "grad_norm": 0.8022485843670729, "learning_rate": 7.860126848993992e-05, "loss": 0.434, "step": 612 }, { "epoch": 0.8820143884892087, "grad_norm": 0.8512996518757874, "learning_rate": 7.859071475258065e-05, "loss": 0.4496, "step": 613 }, { "epoch": 0.883453237410072, "grad_norm": 0.8482540481368318, "learning_rate": 7.85801220634898e-05, "loss": 0.4328, "step": 614 }, { "epoch": 0.8848920863309353, "grad_norm": 0.8197679584683998, "learning_rate": 7.856949043335917e-05, "loss": 0.4369, "step": 615 }, { "epoch": 0.8863309352517985, "grad_norm": 0.9063289810672892, "learning_rate": 7.855881987291983e-05, "loss": 0.4383, "step": 616 }, { "epoch": 0.8877697841726618, "grad_norm": 1.0126671490109374, "learning_rate": 7.854811039294216e-05, "loss": 0.4439, "step": 617 }, { "epoch": 0.8892086330935252, "grad_norm": 0.9523707023332784, "learning_rate": 7.853736200423584e-05, "loss": 0.4485, "step": 618 }, { "epoch": 0.8906474820143885, "grad_norm": 0.726533354436605, "learning_rate": 7.852657471764983e-05, "loss": 0.4396, "step": 619 }, { "epoch": 0.8920863309352518, "grad_norm": 0.5317978393118742, "learning_rate": 7.851574854407228e-05, "loss": 0.436, "step": 620 }, { "epoch": 0.8935251798561151, "grad_norm": 0.5176552199820211, "learning_rate": 7.85048834944307e-05, "loss": 0.4426, "step": 621 }, { "epoch": 0.8949640287769784, "grad_norm": 0.5084489605107904, "learning_rate": 7.849397957969173e-05, "loss": 0.4477, "step": 622 }, { "epoch": 0.8964028776978418, "grad_norm": 0.581415487862338, "learning_rate": 7.848303681086134e-05, "loss": 0.4391, "step": 623 }, { "epoch": 0.897841726618705, "grad_norm": 0.5787490361092864, "learning_rate": 7.847205519898461e-05, "loss": 0.4428, "step": 624 }, { "epoch": 0.8992805755395683, "grad_norm": 0.4641397927785643, "learning_rate": 7.846103475514595e-05, "loss": 0.4428, "step": 625 }, { "epoch": 0.9007194244604316, "grad_norm": 0.4145065936872774, "learning_rate": 7.844997549046886e-05, "loss": 0.4384, "step": 626 }, { "epoch": 0.902158273381295, "grad_norm": 0.4596652659908233, "learning_rate": 7.843887741611608e-05, "loss": 0.4437, "step": 627 }, { "epoch": 0.9035971223021583, "grad_norm": 0.5582975704307552, "learning_rate": 7.842774054328949e-05, "loss": 0.4351, "step": 628 }, { "epoch": 0.9050359712230216, "grad_norm": 0.6206476700620335, "learning_rate": 7.841656488323017e-05, "loss": 0.4373, "step": 629 }, { "epoch": 0.9064748201438849, "grad_norm": 0.5875705133755592, "learning_rate": 7.840535044721832e-05, "loss": 0.4406, "step": 630 }, { "epoch": 0.9079136690647482, "grad_norm": 0.6055808749949936, "learning_rate": 7.839409724657327e-05, "loss": 0.439, "step": 631 }, { "epoch": 0.9093525179856116, "grad_norm": 0.5157141811047572, "learning_rate": 7.838280529265353e-05, "loss": 0.4503, "step": 632 }, { "epoch": 0.9107913669064748, "grad_norm": 0.3784451801413552, "learning_rate": 7.837147459685666e-05, "loss": 0.4359, "step": 633 }, { "epoch": 0.9122302158273381, "grad_norm": 0.42682466417211756, "learning_rate": 7.836010517061937e-05, "loss": 0.4371, "step": 634 }, { "epoch": 0.9136690647482014, "grad_norm": 0.483018868022784, "learning_rate": 7.834869702541742e-05, "loss": 0.4485, "step": 635 }, { "epoch": 0.9151079136690647, "grad_norm": 0.373002785791148, "learning_rate": 7.833725017276573e-05, "loss": 0.431, "step": 636 }, { "epoch": 0.9165467625899281, "grad_norm": 0.5226798778671032, "learning_rate": 7.83257646242182e-05, "loss": 0.4411, "step": 637 }, { "epoch": 0.9179856115107914, "grad_norm": 0.6245402002062002, "learning_rate": 7.831424039136783e-05, "loss": 0.4352, "step": 638 }, { "epoch": 0.9194244604316547, "grad_norm": 0.7298061828081333, "learning_rate": 7.830267748584666e-05, "loss": 0.4375, "step": 639 }, { "epoch": 0.920863309352518, "grad_norm": 0.8633694543394981, "learning_rate": 7.829107591932578e-05, "loss": 0.4351, "step": 640 }, { "epoch": 0.9223021582733812, "grad_norm": 1.1393206979864197, "learning_rate": 7.82794357035153e-05, "loss": 0.4522, "step": 641 }, { "epoch": 0.9237410071942446, "grad_norm": 0.6438360260021782, "learning_rate": 7.82677568501643e-05, "loss": 0.4421, "step": 642 }, { "epoch": 0.9251798561151079, "grad_norm": 0.42681014197527684, "learning_rate": 7.82560393710609e-05, "loss": 0.4405, "step": 643 }, { "epoch": 0.9266187050359712, "grad_norm": 0.47523960417453875, "learning_rate": 7.824428327803221e-05, "loss": 0.4307, "step": 644 }, { "epoch": 0.9280575539568345, "grad_norm": 0.5290286468933811, "learning_rate": 7.823248858294428e-05, "loss": 0.4381, "step": 645 }, { "epoch": 0.9294964028776979, "grad_norm": 0.5520431122422456, "learning_rate": 7.822065529770216e-05, "loss": 0.4359, "step": 646 }, { "epoch": 0.9309352517985612, "grad_norm": 0.5624293793113554, "learning_rate": 7.820878343424984e-05, "loss": 0.4288, "step": 647 }, { "epoch": 0.9323741007194245, "grad_norm": 0.5790328927418427, "learning_rate": 7.819687300457021e-05, "loss": 0.4414, "step": 648 }, { "epoch": 0.9338129496402877, "grad_norm": 0.6663729371991175, "learning_rate": 7.818492402068517e-05, "loss": 0.4552, "step": 649 }, { "epoch": 0.935251798561151, "grad_norm": 0.7082272101827812, "learning_rate": 7.817293649465546e-05, "loss": 0.4541, "step": 650 }, { "epoch": 0.9366906474820144, "grad_norm": 0.7769959445586613, "learning_rate": 7.816091043858076e-05, "loss": 0.4321, "step": 651 }, { "epoch": 0.9381294964028777, "grad_norm": 0.9309288447563863, "learning_rate": 7.814884586459962e-05, "loss": 0.4442, "step": 652 }, { "epoch": 0.939568345323741, "grad_norm": 1.0193886938186758, "learning_rate": 7.813674278488949e-05, "loss": 0.4408, "step": 653 }, { "epoch": 0.9410071942446043, "grad_norm": 0.8100948835729215, "learning_rate": 7.812460121166666e-05, "loss": 0.4373, "step": 654 }, { "epoch": 0.9424460431654677, "grad_norm": 0.8449929536774763, "learning_rate": 7.81124211571863e-05, "loss": 0.4393, "step": 655 }, { "epoch": 0.943884892086331, "grad_norm": 0.9862265081898043, "learning_rate": 7.810020263374239e-05, "loss": 0.4304, "step": 656 }, { "epoch": 0.9453237410071943, "grad_norm": 0.8462187956035074, "learning_rate": 7.808794565366778e-05, "loss": 0.4437, "step": 657 }, { "epoch": 0.9467625899280575, "grad_norm": 0.6678706409869629, "learning_rate": 7.807565022933412e-05, "loss": 0.437, "step": 658 }, { "epoch": 0.9482014388489208, "grad_norm": 0.542640476887084, "learning_rate": 7.806331637315183e-05, "loss": 0.4426, "step": 659 }, { "epoch": 0.9496402877697842, "grad_norm": 0.532435359619077, "learning_rate": 7.805094409757017e-05, "loss": 0.4308, "step": 660 }, { "epoch": 0.9510791366906475, "grad_norm": 0.6982411367119742, "learning_rate": 7.803853341507715e-05, "loss": 0.445, "step": 661 }, { "epoch": 0.9525179856115108, "grad_norm": 0.6172363235937145, "learning_rate": 7.802608433819957e-05, "loss": 0.4469, "step": 662 }, { "epoch": 0.9539568345323741, "grad_norm": 0.4425610817255528, "learning_rate": 7.801359687950292e-05, "loss": 0.4443, "step": 663 }, { "epoch": 0.9553956834532374, "grad_norm": 0.4900095133638887, "learning_rate": 7.800107105159155e-05, "loss": 0.4307, "step": 664 }, { "epoch": 0.9568345323741008, "grad_norm": 0.4187982557155431, "learning_rate": 7.798850686710841e-05, "loss": 0.444, "step": 665 }, { "epoch": 0.958273381294964, "grad_norm": 0.4096966639403132, "learning_rate": 7.797590433873526e-05, "loss": 0.4375, "step": 666 }, { "epoch": 0.9597122302158273, "grad_norm": 0.467720980260205, "learning_rate": 7.79632634791925e-05, "loss": 0.4436, "step": 667 }, { "epoch": 0.9611510791366906, "grad_norm": 0.47391654471052125, "learning_rate": 7.795058430123925e-05, "loss": 0.4428, "step": 668 }, { "epoch": 0.962589928057554, "grad_norm": 0.4792805071275589, "learning_rate": 7.793786681767333e-05, "loss": 0.4389, "step": 669 }, { "epoch": 0.9640287769784173, "grad_norm": 0.36928231507310566, "learning_rate": 7.792511104133117e-05, "loss": 0.4476, "step": 670 }, { "epoch": 0.9654676258992806, "grad_norm": 0.316373392013736, "learning_rate": 7.791231698508786e-05, "loss": 0.4457, "step": 671 }, { "epoch": 0.9669064748201439, "grad_norm": 0.345313584756319, "learning_rate": 7.789948466185718e-05, "loss": 0.4406, "step": 672 }, { "epoch": 0.9683453237410072, "grad_norm": 0.3893150006205188, "learning_rate": 7.788661408459146e-05, "loss": 0.4281, "step": 673 }, { "epoch": 0.9697841726618706, "grad_norm": 0.3901165937334357, "learning_rate": 7.787370526628173e-05, "loss": 0.4304, "step": 674 }, { "epoch": 0.9712230215827338, "grad_norm": 0.3949496609169628, "learning_rate": 7.786075821995754e-05, "loss": 0.4421, "step": 675 }, { "epoch": 0.9726618705035971, "grad_norm": 0.4315872922327187, "learning_rate": 7.784777295868706e-05, "loss": 0.439, "step": 676 }, { "epoch": 0.9741007194244604, "grad_norm": 0.4446444112401096, "learning_rate": 7.783474949557704e-05, "loss": 0.4344, "step": 677 }, { "epoch": 0.9755395683453237, "grad_norm": 0.4579936400268822, "learning_rate": 7.782168784377276e-05, "loss": 0.4311, "step": 678 }, { "epoch": 0.9769784172661871, "grad_norm": 0.5336441400695344, "learning_rate": 7.780858801645806e-05, "loss": 0.4442, "step": 679 }, { "epoch": 0.9784172661870504, "grad_norm": 0.6129966638581394, "learning_rate": 7.779545002685535e-05, "loss": 0.4348, "step": 680 }, { "epoch": 0.9798561151079137, "grad_norm": 0.6015585936775667, "learning_rate": 7.778227388822552e-05, "loss": 0.4497, "step": 681 }, { "epoch": 0.981294964028777, "grad_norm": 0.6238991019639683, "learning_rate": 7.776905961386793e-05, "loss": 0.4394, "step": 682 }, { "epoch": 0.9827338129496402, "grad_norm": 0.637066953593209, "learning_rate": 7.77558072171205e-05, "loss": 0.4462, "step": 683 }, { "epoch": 0.9841726618705036, "grad_norm": 0.6920493127521262, "learning_rate": 7.774251671135961e-05, "loss": 0.442, "step": 684 }, { "epoch": 0.9856115107913669, "grad_norm": 0.902123798026066, "learning_rate": 7.77291881100001e-05, "loss": 0.4422, "step": 685 }, { "epoch": 0.9870503597122302, "grad_norm": 1.080075310815761, "learning_rate": 7.771582142649523e-05, "loss": 0.4361, "step": 686 }, { "epoch": 0.9884892086330935, "grad_norm": 0.7868523641587457, "learning_rate": 7.770241667433677e-05, "loss": 0.4437, "step": 687 }, { "epoch": 0.9899280575539569, "grad_norm": 0.6176988692747292, "learning_rate": 7.768897386705488e-05, "loss": 0.4344, "step": 688 }, { "epoch": 0.9913669064748202, "grad_norm": 0.6170658867591173, "learning_rate": 7.767549301821807e-05, "loss": 0.4364, "step": 689 }, { "epoch": 0.9928057553956835, "grad_norm": 0.48431512071645794, "learning_rate": 7.766197414143333e-05, "loss": 0.4316, "step": 690 }, { "epoch": 0.9942446043165467, "grad_norm": 0.3441788556944156, "learning_rate": 7.764841725034602e-05, "loss": 0.4369, "step": 691 }, { "epoch": 0.99568345323741, "grad_norm": 0.36446005498861855, "learning_rate": 7.763482235863985e-05, "loss": 0.439, "step": 692 }, { "epoch": 0.9971223021582734, "grad_norm": 0.45740062246008695, "learning_rate": 7.762118948003688e-05, "loss": 0.4381, "step": 693 }, { "epoch": 0.9985611510791367, "grad_norm": 0.4857706549122722, "learning_rate": 7.760751862829754e-05, "loss": 0.4279, "step": 694 }, { "epoch": 1.0, "grad_norm": 0.5428761936290282, "learning_rate": 7.759380981722055e-05, "loss": 0.4366, "step": 695 }, { "epoch": 1.0014388489208632, "grad_norm": 0.5992478587237964, "learning_rate": 7.758006306064301e-05, "loss": 0.4209, "step": 696 }, { "epoch": 1.0028776978417266, "grad_norm": 0.5650891647222237, "learning_rate": 7.756627837244023e-05, "loss": 0.416, "step": 697 }, { "epoch": 1.0043165467625899, "grad_norm": 0.568430795185666, "learning_rate": 7.755245576652588e-05, "loss": 0.415, "step": 698 }, { "epoch": 1.0057553956834533, "grad_norm": 0.6422965686402028, "learning_rate": 7.753859525685187e-05, "loss": 0.4222, "step": 699 }, { "epoch": 1.0071942446043165, "grad_norm": 0.6090204933635286, "learning_rate": 7.752469685740838e-05, "loss": 0.4233, "step": 700 }, { "epoch": 1.00863309352518, "grad_norm": 0.6244656974243654, "learning_rate": 7.751076058222381e-05, "loss": 0.4394, "step": 701 }, { "epoch": 1.0100719424460431, "grad_norm": 0.6321414492283441, "learning_rate": 7.749678644536485e-05, "loss": 0.4154, "step": 702 }, { "epoch": 1.0115107913669066, "grad_norm": 0.7184162771194458, "learning_rate": 7.748277446093631e-05, "loss": 0.4215, "step": 703 }, { "epoch": 1.0129496402877698, "grad_norm": 0.7578461572480496, "learning_rate": 7.746872464308131e-05, "loss": 0.4165, "step": 704 }, { "epoch": 1.014388489208633, "grad_norm": 0.6677024168677679, "learning_rate": 7.745463700598108e-05, "loss": 0.4174, "step": 705 }, { "epoch": 1.0158273381294964, "grad_norm": 0.6107255829884857, "learning_rate": 7.744051156385503e-05, "loss": 0.4253, "step": 706 }, { "epoch": 1.0172661870503596, "grad_norm": 0.6250570156008007, "learning_rate": 7.742634833096077e-05, "loss": 0.4188, "step": 707 }, { "epoch": 1.018705035971223, "grad_norm": 0.6380171143766701, "learning_rate": 7.741214732159403e-05, "loss": 0.4146, "step": 708 }, { "epoch": 1.0201438848920863, "grad_norm": 0.6168839916378758, "learning_rate": 7.739790855008867e-05, "loss": 0.4248, "step": 709 }, { "epoch": 1.0215827338129497, "grad_norm": 0.5726456992958739, "learning_rate": 7.738363203081664e-05, "loss": 0.4205, "step": 710 }, { "epoch": 1.023021582733813, "grad_norm": 0.5481407715702845, "learning_rate": 7.736931777818805e-05, "loss": 0.4219, "step": 711 }, { "epoch": 1.0244604316546762, "grad_norm": 0.519092810724968, "learning_rate": 7.735496580665105e-05, "loss": 0.4238, "step": 712 }, { "epoch": 1.0258992805755396, "grad_norm": 0.5406096531900932, "learning_rate": 7.734057613069188e-05, "loss": 0.4135, "step": 713 }, { "epoch": 1.0273381294964028, "grad_norm": 0.633851534977381, "learning_rate": 7.73261487648348e-05, "loss": 0.4198, "step": 714 }, { "epoch": 1.0287769784172662, "grad_norm": 0.6490670741748399, "learning_rate": 7.731168372364219e-05, "loss": 0.4226, "step": 715 }, { "epoch": 1.0302158273381294, "grad_norm": 0.7422510055372743, "learning_rate": 7.729718102171438e-05, "loss": 0.417, "step": 716 }, { "epoch": 1.0316546762589929, "grad_norm": 0.9312794652767561, "learning_rate": 7.728264067368976e-05, "loss": 0.4163, "step": 717 }, { "epoch": 1.033093525179856, "grad_norm": 1.3856090088417783, "learning_rate": 7.726806269424469e-05, "loss": 0.4273, "step": 718 }, { "epoch": 1.0345323741007195, "grad_norm": 0.41077893968222584, "learning_rate": 7.725344709809355e-05, "loss": 0.4127, "step": 719 }, { "epoch": 1.0359712230215827, "grad_norm": 1.254621053277421, "learning_rate": 7.723879389998864e-05, "loss": 0.4197, "step": 720 }, { "epoch": 1.037410071942446, "grad_norm": 0.8999150862991432, "learning_rate": 7.722410311472026e-05, "loss": 0.4232, "step": 721 }, { "epoch": 1.0388489208633094, "grad_norm": 0.6094735034533252, "learning_rate": 7.72093747571166e-05, "loss": 0.4232, "step": 722 }, { "epoch": 1.0402877697841726, "grad_norm": 0.561578298258745, "learning_rate": 7.719460884204383e-05, "loss": 0.4186, "step": 723 }, { "epoch": 1.041726618705036, "grad_norm": 0.6265469575259921, "learning_rate": 7.717980538440599e-05, "loss": 0.4242, "step": 724 }, { "epoch": 1.0431654676258992, "grad_norm": 0.6957086180987356, "learning_rate": 7.716496439914502e-05, "loss": 0.4263, "step": 725 }, { "epoch": 1.0446043165467627, "grad_norm": 0.6804151719159521, "learning_rate": 7.715008590124076e-05, "loss": 0.4141, "step": 726 }, { "epoch": 1.0460431654676259, "grad_norm": 0.6133997418818746, "learning_rate": 7.713516990571088e-05, "loss": 0.4212, "step": 727 }, { "epoch": 1.0474820143884893, "grad_norm": 0.5238363700426827, "learning_rate": 7.71202164276109e-05, "loss": 0.4196, "step": 728 }, { "epoch": 1.0489208633093525, "grad_norm": 0.3610851256412559, "learning_rate": 7.710522548203424e-05, "loss": 0.4128, "step": 729 }, { "epoch": 1.0503597122302157, "grad_norm": 0.370798257580522, "learning_rate": 7.709019708411202e-05, "loss": 0.4215, "step": 730 }, { "epoch": 1.0517985611510792, "grad_norm": 0.45825791549468775, "learning_rate": 7.707513124901327e-05, "loss": 0.4154, "step": 731 }, { "epoch": 1.0532374100719424, "grad_norm": 0.4493226973620425, "learning_rate": 7.706002799194476e-05, "loss": 0.4295, "step": 732 }, { "epoch": 1.0546762589928058, "grad_norm": 0.4377297934543004, "learning_rate": 7.704488732815105e-05, "loss": 0.4115, "step": 733 }, { "epoch": 1.056115107913669, "grad_norm": 0.40999934507501834, "learning_rate": 7.702970927291442e-05, "loss": 0.4174, "step": 734 }, { "epoch": 1.0575539568345325, "grad_norm": 0.3422549763398815, "learning_rate": 7.701449384155492e-05, "loss": 0.4242, "step": 735 }, { "epoch": 1.0589928057553957, "grad_norm": 0.3160845755661669, "learning_rate": 7.699924104943033e-05, "loss": 0.422, "step": 736 }, { "epoch": 1.0604316546762589, "grad_norm": 0.3725745991616709, "learning_rate": 7.698395091193615e-05, "loss": 0.4181, "step": 737 }, { "epoch": 1.0618705035971223, "grad_norm": 0.3846927289474282, "learning_rate": 7.696862344450553e-05, "loss": 0.4208, "step": 738 }, { "epoch": 1.0633093525179855, "grad_norm": 0.4433253183936695, "learning_rate": 7.695325866260932e-05, "loss": 0.4079, "step": 739 }, { "epoch": 1.064748201438849, "grad_norm": 0.6210118491031278, "learning_rate": 7.693785658175607e-05, "loss": 0.4281, "step": 740 }, { "epoch": 1.0661870503597122, "grad_norm": 0.7680274413581848, "learning_rate": 7.692241721749194e-05, "loss": 0.4148, "step": 741 }, { "epoch": 1.0676258992805756, "grad_norm": 0.8021919763630236, "learning_rate": 7.69069405854007e-05, "loss": 0.4222, "step": 742 }, { "epoch": 1.0690647482014388, "grad_norm": 0.7288764113640167, "learning_rate": 7.68914267011038e-05, "loss": 0.4121, "step": 743 }, { "epoch": 1.0705035971223023, "grad_norm": 0.6378506361039106, "learning_rate": 7.687587558026024e-05, "loss": 0.4138, "step": 744 }, { "epoch": 1.0719424460431655, "grad_norm": 0.5576514753911082, "learning_rate": 7.686028723856664e-05, "loss": 0.4182, "step": 745 }, { "epoch": 1.0733812949640287, "grad_norm": 0.4314748972331098, "learning_rate": 7.684466169175714e-05, "loss": 0.4145, "step": 746 }, { "epoch": 1.074820143884892, "grad_norm": 0.2744023173556825, "learning_rate": 7.68289989556035e-05, "loss": 0.4289, "step": 747 }, { "epoch": 1.0762589928057553, "grad_norm": 0.432284955906597, "learning_rate": 7.681329904591495e-05, "loss": 0.4116, "step": 748 }, { "epoch": 1.0776978417266188, "grad_norm": 0.5768063729704934, "learning_rate": 7.67975619785383e-05, "loss": 0.4281, "step": 749 }, { "epoch": 1.079136690647482, "grad_norm": 0.4962978373730406, "learning_rate": 7.678178776935781e-05, "loss": 0.4094, "step": 750 }, { "epoch": 1.0805755395683454, "grad_norm": 0.41707422752363793, "learning_rate": 7.676597643429528e-05, "loss": 0.4191, "step": 751 }, { "epoch": 1.0820143884892086, "grad_norm": 0.5272821722002808, "learning_rate": 7.675012798930994e-05, "loss": 0.4192, "step": 752 }, { "epoch": 1.083453237410072, "grad_norm": 0.603326470094669, "learning_rate": 7.673424245039852e-05, "loss": 0.4148, "step": 753 }, { "epoch": 1.0848920863309353, "grad_norm": 0.49183314809179646, "learning_rate": 7.671831983359515e-05, "loss": 0.4258, "step": 754 }, { "epoch": 1.0863309352517985, "grad_norm": 0.4918040628996872, "learning_rate": 7.670236015497141e-05, "loss": 0.4221, "step": 755 }, { "epoch": 1.087769784172662, "grad_norm": 0.6446415855495722, "learning_rate": 7.668636343063628e-05, "loss": 0.4122, "step": 756 }, { "epoch": 1.0892086330935251, "grad_norm": 0.7542892767463729, "learning_rate": 7.667032967673614e-05, "loss": 0.4209, "step": 757 }, { "epoch": 1.0906474820143885, "grad_norm": 0.8568636634175414, "learning_rate": 7.665425890945474e-05, "loss": 0.4236, "step": 758 }, { "epoch": 1.0920863309352518, "grad_norm": 0.864668217646376, "learning_rate": 7.663815114501319e-05, "loss": 0.4106, "step": 759 }, { "epoch": 1.0935251798561152, "grad_norm": 0.8526717179777845, "learning_rate": 7.662200639966992e-05, "loss": 0.4176, "step": 760 }, { "epoch": 1.0949640287769784, "grad_norm": 0.7336891682731496, "learning_rate": 7.660582468972074e-05, "loss": 0.4224, "step": 761 }, { "epoch": 1.0964028776978418, "grad_norm": 0.7285535475605878, "learning_rate": 7.658960603149873e-05, "loss": 0.4196, "step": 762 }, { "epoch": 1.097841726618705, "grad_norm": 0.7078631580302459, "learning_rate": 7.657335044137427e-05, "loss": 0.4161, "step": 763 }, { "epoch": 1.0992805755395683, "grad_norm": 0.6430803820273636, "learning_rate": 7.655705793575504e-05, "loss": 0.4244, "step": 764 }, { "epoch": 1.1007194244604317, "grad_norm": 0.5619905072652992, "learning_rate": 7.654072853108592e-05, "loss": 0.4284, "step": 765 }, { "epoch": 1.102158273381295, "grad_norm": 0.5244169515954566, "learning_rate": 7.652436224384911e-05, "loss": 0.4282, "step": 766 }, { "epoch": 1.1035971223021583, "grad_norm": 0.3987451516791701, "learning_rate": 7.6507959090564e-05, "loss": 0.4153, "step": 767 }, { "epoch": 1.1050359712230216, "grad_norm": 0.31705681956288584, "learning_rate": 7.649151908778721e-05, "loss": 0.4207, "step": 768 }, { "epoch": 1.106474820143885, "grad_norm": 0.45003755116239236, "learning_rate": 7.64750422521125e-05, "loss": 0.4248, "step": 769 }, { "epoch": 1.1079136690647482, "grad_norm": 0.51467358904595, "learning_rate": 7.645852860017086e-05, "loss": 0.4263, "step": 770 }, { "epoch": 1.1093525179856114, "grad_norm": 0.43555140409683163, "learning_rate": 7.644197814863045e-05, "loss": 0.4303, "step": 771 }, { "epoch": 1.1107913669064748, "grad_norm": 0.41737442204483916, "learning_rate": 7.642539091419654e-05, "loss": 0.4257, "step": 772 }, { "epoch": 1.112230215827338, "grad_norm": 0.4252050622071493, "learning_rate": 7.640876691361152e-05, "loss": 0.4122, "step": 773 }, { "epoch": 1.1136690647482015, "grad_norm": 0.421240589158121, "learning_rate": 7.639210616365494e-05, "loss": 0.4179, "step": 774 }, { "epoch": 1.1151079136690647, "grad_norm": 0.41311441086953393, "learning_rate": 7.637540868114338e-05, "loss": 0.4198, "step": 775 }, { "epoch": 1.1165467625899281, "grad_norm": 0.5074729502685691, "learning_rate": 7.635867448293056e-05, "loss": 0.4237, "step": 776 }, { "epoch": 1.1179856115107913, "grad_norm": 0.6085863862933002, "learning_rate": 7.63419035859072e-05, "loss": 0.4153, "step": 777 }, { "epoch": 1.1194244604316548, "grad_norm": 0.49446882458967484, "learning_rate": 7.63250960070011e-05, "loss": 0.4146, "step": 778 }, { "epoch": 1.120863309352518, "grad_norm": 0.31521512846895183, "learning_rate": 7.630825176317707e-05, "loss": 0.4167, "step": 779 }, { "epoch": 1.1223021582733812, "grad_norm": 0.5936739993914748, "learning_rate": 7.629137087143693e-05, "loss": 0.4168, "step": 780 }, { "epoch": 1.1237410071942446, "grad_norm": 0.7538350242414342, "learning_rate": 7.627445334881951e-05, "loss": 0.418, "step": 781 }, { "epoch": 1.1251798561151078, "grad_norm": 0.75065201309353, "learning_rate": 7.625749921240058e-05, "loss": 0.4153, "step": 782 }, { "epoch": 1.1266187050359713, "grad_norm": 0.6423560489394579, "learning_rate": 7.62405084792929e-05, "loss": 0.4176, "step": 783 }, { "epoch": 1.1280575539568345, "grad_norm": 0.5168027732967486, "learning_rate": 7.622348116664611e-05, "loss": 0.4279, "step": 784 }, { "epoch": 1.129496402877698, "grad_norm": 0.5891227033433204, "learning_rate": 7.620641729164686e-05, "loss": 0.4103, "step": 785 }, { "epoch": 1.1309352517985611, "grad_norm": 0.7134438221688847, "learning_rate": 7.618931687151863e-05, "loss": 0.4185, "step": 786 }, { "epoch": 1.1323741007194243, "grad_norm": 0.7213046385941655, "learning_rate": 7.617217992352183e-05, "loss": 0.4252, "step": 787 }, { "epoch": 1.1338129496402878, "grad_norm": 0.5705780566060814, "learning_rate": 7.615500646495373e-05, "loss": 0.4204, "step": 788 }, { "epoch": 1.135251798561151, "grad_norm": 0.4302245388899498, "learning_rate": 7.613779651314841e-05, "loss": 0.4154, "step": 789 }, { "epoch": 1.1366906474820144, "grad_norm": 0.37727734368717947, "learning_rate": 7.612055008547688e-05, "loss": 0.4243, "step": 790 }, { "epoch": 1.1381294964028776, "grad_norm": 0.3739177488902291, "learning_rate": 7.610326719934685e-05, "loss": 0.4252, "step": 791 }, { "epoch": 1.139568345323741, "grad_norm": 0.3263271090762003, "learning_rate": 7.608594787220292e-05, "loss": 0.4173, "step": 792 }, { "epoch": 1.1410071942446043, "grad_norm": 0.37994810884379415, "learning_rate": 7.606859212152644e-05, "loss": 0.4173, "step": 793 }, { "epoch": 1.1424460431654677, "grad_norm": 0.5773190952778903, "learning_rate": 7.605119996483551e-05, "loss": 0.418, "step": 794 }, { "epoch": 1.143884892086331, "grad_norm": 0.6208254448365393, "learning_rate": 7.6033771419685e-05, "loss": 0.4177, "step": 795 }, { "epoch": 1.1453237410071941, "grad_norm": 0.6584362582594144, "learning_rate": 7.601630650366648e-05, "loss": 0.4222, "step": 796 }, { "epoch": 1.1467625899280576, "grad_norm": 0.6752585542994677, "learning_rate": 7.59988052344083e-05, "loss": 0.415, "step": 797 }, { "epoch": 1.1482014388489208, "grad_norm": 0.760537285794366, "learning_rate": 7.59812676295754e-05, "loss": 0.4167, "step": 798 }, { "epoch": 1.1496402877697842, "grad_norm": 0.8781635828536211, "learning_rate": 7.596369370686947e-05, "loss": 0.4351, "step": 799 }, { "epoch": 1.1510791366906474, "grad_norm": 0.9415873644042007, "learning_rate": 7.594608348402885e-05, "loss": 0.4256, "step": 800 }, { "epoch": 1.1525179856115109, "grad_norm": 0.8338541290612863, "learning_rate": 7.592843697882848e-05, "loss": 0.4165, "step": 801 }, { "epoch": 1.153956834532374, "grad_norm": 0.6683626268938538, "learning_rate": 7.591075420907997e-05, "loss": 0.4259, "step": 802 }, { "epoch": 1.1553956834532375, "grad_norm": 0.4844470964570286, "learning_rate": 7.589303519263151e-05, "loss": 0.423, "step": 803 }, { "epoch": 1.1568345323741007, "grad_norm": 0.35350010087738093, "learning_rate": 7.587527994736787e-05, "loss": 0.4179, "step": 804 }, { "epoch": 1.158273381294964, "grad_norm": 0.44582792317039727, "learning_rate": 7.58574884912104e-05, "loss": 0.4072, "step": 805 }, { "epoch": 1.1597122302158274, "grad_norm": 0.48742184625135226, "learning_rate": 7.5839660842117e-05, "loss": 0.4232, "step": 806 }, { "epoch": 1.1611510791366906, "grad_norm": 0.504469321517267, "learning_rate": 7.582179701808208e-05, "loss": 0.4104, "step": 807 }, { "epoch": 1.162589928057554, "grad_norm": 0.4152839795687809, "learning_rate": 7.580389703713661e-05, "loss": 0.4162, "step": 808 }, { "epoch": 1.1640287769784172, "grad_norm": 0.34815113916988333, "learning_rate": 7.5785960917348e-05, "loss": 0.4226, "step": 809 }, { "epoch": 1.1654676258992807, "grad_norm": 0.4819757121959254, "learning_rate": 7.576798867682018e-05, "loss": 0.4214, "step": 810 }, { "epoch": 1.1669064748201439, "grad_norm": 0.5178763620456481, "learning_rate": 7.574998033369349e-05, "loss": 0.4171, "step": 811 }, { "epoch": 1.1683453237410073, "grad_norm": 0.4154321963183835, "learning_rate": 7.573193590614479e-05, "loss": 0.4215, "step": 812 }, { "epoch": 1.1697841726618705, "grad_norm": 0.29377641830878887, "learning_rate": 7.571385541238727e-05, "loss": 0.415, "step": 813 }, { "epoch": 1.1712230215827337, "grad_norm": 0.3873091669226385, "learning_rate": 7.569573887067059e-05, "loss": 0.4301, "step": 814 }, { "epoch": 1.1726618705035972, "grad_norm": 0.4440531679666098, "learning_rate": 7.567758629928076e-05, "loss": 0.4269, "step": 815 }, { "epoch": 1.1741007194244604, "grad_norm": 0.4238674797228725, "learning_rate": 7.565939771654018e-05, "loss": 0.412, "step": 816 }, { "epoch": 1.1755395683453238, "grad_norm": 0.46840086425383703, "learning_rate": 7.564117314080758e-05, "loss": 0.4124, "step": 817 }, { "epoch": 1.176978417266187, "grad_norm": 0.4391720956518648, "learning_rate": 7.562291259047804e-05, "loss": 0.4301, "step": 818 }, { "epoch": 1.1784172661870504, "grad_norm": 0.34956265214346327, "learning_rate": 7.560461608398292e-05, "loss": 0.4182, "step": 819 }, { "epoch": 1.1798561151079137, "grad_norm": 0.36980676294878867, "learning_rate": 7.558628363978991e-05, "loss": 0.4123, "step": 820 }, { "epoch": 1.181294964028777, "grad_norm": 0.46513685237089797, "learning_rate": 7.556791527640292e-05, "loss": 0.4176, "step": 821 }, { "epoch": 1.1827338129496403, "grad_norm": 0.5231382758166871, "learning_rate": 7.554951101236219e-05, "loss": 0.4201, "step": 822 }, { "epoch": 1.1841726618705035, "grad_norm": 0.5160947629933547, "learning_rate": 7.553107086624413e-05, "loss": 0.4147, "step": 823 }, { "epoch": 1.185611510791367, "grad_norm": 0.45194829521755847, "learning_rate": 7.551259485666141e-05, "loss": 0.4174, "step": 824 }, { "epoch": 1.1870503597122302, "grad_norm": 0.4063881919431981, "learning_rate": 7.549408300226287e-05, "loss": 0.4161, "step": 825 }, { "epoch": 1.1884892086330936, "grad_norm": 0.4188773062037776, "learning_rate": 7.547553532173356e-05, "loss": 0.4292, "step": 826 }, { "epoch": 1.1899280575539568, "grad_norm": 0.43758049905108026, "learning_rate": 7.545695183379465e-05, "loss": 0.4283, "step": 827 }, { "epoch": 1.19136690647482, "grad_norm": 0.4549818928421618, "learning_rate": 7.54383325572035e-05, "loss": 0.4188, "step": 828 }, { "epoch": 1.1928057553956835, "grad_norm": 0.4509482466305469, "learning_rate": 7.541967751075354e-05, "loss": 0.4224, "step": 829 }, { "epoch": 1.1942446043165469, "grad_norm": 0.5444101749600369, "learning_rate": 7.540098671327438e-05, "loss": 0.4249, "step": 830 }, { "epoch": 1.19568345323741, "grad_norm": 0.626516592937173, "learning_rate": 7.538226018363164e-05, "loss": 0.4171, "step": 831 }, { "epoch": 1.1971223021582733, "grad_norm": 0.6904870109865596, "learning_rate": 7.536349794072705e-05, "loss": 0.4193, "step": 832 }, { "epoch": 1.1985611510791367, "grad_norm": 0.7503121033224123, "learning_rate": 7.534470000349835e-05, "loss": 0.4234, "step": 833 }, { "epoch": 1.2, "grad_norm": 0.8111243569118842, "learning_rate": 7.532586639091936e-05, "loss": 0.4173, "step": 834 }, { "epoch": 1.2014388489208634, "grad_norm": 0.8836389030724152, "learning_rate": 7.530699712199985e-05, "loss": 0.4256, "step": 835 }, { "epoch": 1.2028776978417266, "grad_norm": 0.8013048465813984, "learning_rate": 7.528809221578565e-05, "loss": 0.4184, "step": 836 }, { "epoch": 1.2043165467625898, "grad_norm": 0.5653848901027986, "learning_rate": 7.52691516913585e-05, "loss": 0.4318, "step": 837 }, { "epoch": 1.2057553956834532, "grad_norm": 0.4638018529487915, "learning_rate": 7.525017556783612e-05, "loss": 0.4174, "step": 838 }, { "epoch": 1.2071942446043165, "grad_norm": 0.6313017471518908, "learning_rate": 7.523116386437216e-05, "loss": 0.4166, "step": 839 }, { "epoch": 1.20863309352518, "grad_norm": 0.65247948493221, "learning_rate": 7.521211660015615e-05, "loss": 0.4162, "step": 840 }, { "epoch": 1.210071942446043, "grad_norm": 0.48412170832446494, "learning_rate": 7.519303379441357e-05, "loss": 0.4226, "step": 841 }, { "epoch": 1.2115107913669065, "grad_norm": 0.3894002356186535, "learning_rate": 7.517391546640573e-05, "loss": 0.4259, "step": 842 }, { "epoch": 1.2129496402877697, "grad_norm": 0.4743786697065132, "learning_rate": 7.515476163542982e-05, "loss": 0.4189, "step": 843 }, { "epoch": 1.2143884892086332, "grad_norm": 0.44692095971218454, "learning_rate": 7.513557232081887e-05, "loss": 0.4217, "step": 844 }, { "epoch": 1.2158273381294964, "grad_norm": 0.30137146063183906, "learning_rate": 7.511634754194168e-05, "loss": 0.4205, "step": 845 }, { "epoch": 1.2172661870503596, "grad_norm": 0.4219746836134125, "learning_rate": 7.50970873182029e-05, "loss": 0.4186, "step": 846 }, { "epoch": 1.218705035971223, "grad_norm": 0.47854173151744644, "learning_rate": 7.507779166904292e-05, "loss": 0.4204, "step": 847 }, { "epoch": 1.2201438848920863, "grad_norm": 0.3758110059976876, "learning_rate": 7.50584606139379e-05, "loss": 0.4228, "step": 848 }, { "epoch": 1.2215827338129497, "grad_norm": 0.35428654478853616, "learning_rate": 7.503909417239975e-05, "loss": 0.4218, "step": 849 }, { "epoch": 1.223021582733813, "grad_norm": 0.5034960728391591, "learning_rate": 7.501969236397607e-05, "loss": 0.4128, "step": 850 }, { "epoch": 1.2244604316546763, "grad_norm": 0.6410823859828029, "learning_rate": 7.500025520825018e-05, "loss": 0.4323, "step": 851 }, { "epoch": 1.2258992805755395, "grad_norm": 0.8171577583438102, "learning_rate": 7.498078272484108e-05, "loss": 0.4206, "step": 852 }, { "epoch": 1.227338129496403, "grad_norm": 0.9905863473480702, "learning_rate": 7.496127493340341e-05, "loss": 0.4265, "step": 853 }, { "epoch": 1.2287769784172662, "grad_norm": 1.1019278429398947, "learning_rate": 7.494173185362745e-05, "loss": 0.4254, "step": 854 }, { "epoch": 1.2302158273381294, "grad_norm": 0.7783355621185797, "learning_rate": 7.492215350523913e-05, "loss": 0.4335, "step": 855 }, { "epoch": 1.2316546762589928, "grad_norm": 0.5123283474072974, "learning_rate": 7.490253990799991e-05, "loss": 0.4123, "step": 856 }, { "epoch": 1.233093525179856, "grad_norm": 0.4051449414915292, "learning_rate": 7.488289108170692e-05, "loss": 0.4207, "step": 857 }, { "epoch": 1.2345323741007195, "grad_norm": 0.45377464565373454, "learning_rate": 7.486320704619276e-05, "loss": 0.4146, "step": 858 }, { "epoch": 1.2359712230215827, "grad_norm": 0.5245555215627437, "learning_rate": 7.484348782132565e-05, "loss": 0.4162, "step": 859 }, { "epoch": 1.2374100719424461, "grad_norm": 0.5327932452184712, "learning_rate": 7.482373342700927e-05, "loss": 0.4075, "step": 860 }, { "epoch": 1.2388489208633093, "grad_norm": 0.5541213309539225, "learning_rate": 7.48039438831828e-05, "loss": 0.4159, "step": 861 }, { "epoch": 1.2402877697841728, "grad_norm": 0.5643639432637277, "learning_rate": 7.478411920982095e-05, "loss": 0.4157, "step": 862 }, { "epoch": 1.241726618705036, "grad_norm": 0.580061380544043, "learning_rate": 7.476425942693382e-05, "loss": 0.4202, "step": 863 }, { "epoch": 1.2431654676258992, "grad_norm": 0.5220355777656684, "learning_rate": 7.474436455456701e-05, "loss": 0.4204, "step": 864 }, { "epoch": 1.2446043165467626, "grad_norm": 0.38702531956157415, "learning_rate": 7.472443461280149e-05, "loss": 0.4176, "step": 865 }, { "epoch": 1.2460431654676258, "grad_norm": 0.41202934508706474, "learning_rate": 7.470446962175367e-05, "loss": 0.4267, "step": 866 }, { "epoch": 1.2474820143884893, "grad_norm": 0.5035311974061664, "learning_rate": 7.468446960157527e-05, "loss": 0.4265, "step": 867 }, { "epoch": 1.2489208633093525, "grad_norm": 0.569434393610722, "learning_rate": 7.466443457245344e-05, "loss": 0.4172, "step": 868 }, { "epoch": 1.2503597122302157, "grad_norm": 0.6006260739620486, "learning_rate": 7.464436455461066e-05, "loss": 0.4218, "step": 869 }, { "epoch": 1.2517985611510791, "grad_norm": 0.5455962473135081, "learning_rate": 7.462425956830466e-05, "loss": 0.4152, "step": 870 }, { "epoch": 1.2532374100719426, "grad_norm": 0.3809920362148773, "learning_rate": 7.460411963382853e-05, "loss": 0.4178, "step": 871 }, { "epoch": 1.2546762589928058, "grad_norm": 0.41925821122076273, "learning_rate": 7.45839447715106e-05, "loss": 0.4165, "step": 872 }, { "epoch": 1.256115107913669, "grad_norm": 0.4823757639947033, "learning_rate": 7.456373500171449e-05, "loss": 0.4111, "step": 873 }, { "epoch": 1.2575539568345324, "grad_norm": 0.4346303434108943, "learning_rate": 7.454349034483903e-05, "loss": 0.4185, "step": 874 }, { "epoch": 1.2589928057553956, "grad_norm": 0.44014311113204413, "learning_rate": 7.452321082131824e-05, "loss": 0.4159, "step": 875 }, { "epoch": 1.260431654676259, "grad_norm": 0.4565120265001911, "learning_rate": 7.450289645162138e-05, "loss": 0.4185, "step": 876 }, { "epoch": 1.2618705035971223, "grad_norm": 0.46322718110655864, "learning_rate": 7.448254725625287e-05, "loss": 0.4185, "step": 877 }, { "epoch": 1.2633093525179855, "grad_norm": 0.4851681984355067, "learning_rate": 7.446216325575225e-05, "loss": 0.4182, "step": 878 }, { "epoch": 1.264748201438849, "grad_norm": 0.5717465825487871, "learning_rate": 7.444174447069423e-05, "loss": 0.4102, "step": 879 }, { "epoch": 1.2661870503597124, "grad_norm": 0.5430731629551574, "learning_rate": 7.442129092168859e-05, "loss": 0.4139, "step": 880 }, { "epoch": 1.2676258992805756, "grad_norm": 0.4706433407685034, "learning_rate": 7.440080262938026e-05, "loss": 0.4198, "step": 881 }, { "epoch": 1.2690647482014388, "grad_norm": 0.5170788910681468, "learning_rate": 7.438027961444916e-05, "loss": 0.4237, "step": 882 }, { "epoch": 1.2705035971223022, "grad_norm": 0.5067616291813265, "learning_rate": 7.435972189761033e-05, "loss": 0.4265, "step": 883 }, { "epoch": 1.2719424460431654, "grad_norm": 0.4597445039030618, "learning_rate": 7.43391294996138e-05, "loss": 0.4171, "step": 884 }, { "epoch": 1.2733812949640289, "grad_norm": 0.42939256207205173, "learning_rate": 7.431850244124459e-05, "loss": 0.4161, "step": 885 }, { "epoch": 1.274820143884892, "grad_norm": 0.46502457984089807, "learning_rate": 7.429784074332274e-05, "loss": 0.416, "step": 886 }, { "epoch": 1.2762589928057553, "grad_norm": 0.5397594990198463, "learning_rate": 7.427714442670324e-05, "loss": 0.4199, "step": 887 }, { "epoch": 1.2776978417266187, "grad_norm": 0.6916813186652401, "learning_rate": 7.425641351227602e-05, "loss": 0.4127, "step": 888 }, { "epoch": 1.2791366906474821, "grad_norm": 0.8740038338577955, "learning_rate": 7.423564802096592e-05, "loss": 0.416, "step": 889 }, { "epoch": 1.2805755395683454, "grad_norm": 1.082921425091471, "learning_rate": 7.42148479737327e-05, "loss": 0.4121, "step": 890 }, { "epoch": 1.2820143884892086, "grad_norm": 0.8147796616381295, "learning_rate": 7.419401339157099e-05, "loss": 0.4086, "step": 891 }, { "epoch": 1.283453237410072, "grad_norm": 0.5109372878426125, "learning_rate": 7.41731442955103e-05, "loss": 0.4181, "step": 892 }, { "epoch": 1.2848920863309352, "grad_norm": 0.5118049349727969, "learning_rate": 7.415224070661492e-05, "loss": 0.4276, "step": 893 }, { "epoch": 1.2863309352517986, "grad_norm": 0.7298639025862309, "learning_rate": 7.413130264598404e-05, "loss": 0.4234, "step": 894 }, { "epoch": 1.2877697841726619, "grad_norm": 1.1217691855762015, "learning_rate": 7.411033013475156e-05, "loss": 0.4246, "step": 895 }, { "epoch": 1.289208633093525, "grad_norm": 0.3163488848249356, "learning_rate": 7.408932319408619e-05, "loss": 0.4061, "step": 896 }, { "epoch": 1.2906474820143885, "grad_norm": 0.7430713251309599, "learning_rate": 7.406828184519141e-05, "loss": 0.4217, "step": 897 }, { "epoch": 1.292086330935252, "grad_norm": 0.8800106116979265, "learning_rate": 7.40472061093054e-05, "loss": 0.4162, "step": 898 }, { "epoch": 1.2935251798561151, "grad_norm": 0.7060110576639796, "learning_rate": 7.402609600770104e-05, "loss": 0.4271, "step": 899 }, { "epoch": 1.2949640287769784, "grad_norm": 0.5071115772502598, "learning_rate": 7.400495156168596e-05, "loss": 0.4215, "step": 900 }, { "epoch": 1.2964028776978418, "grad_norm": 0.6015817039371135, "learning_rate": 7.39837727926024e-05, "loss": 0.4132, "step": 901 }, { "epoch": 1.297841726618705, "grad_norm": 0.6836388340295115, "learning_rate": 7.396255972182723e-05, "loss": 0.4236, "step": 902 }, { "epoch": 1.2992805755395684, "grad_norm": 0.5118641074100791, "learning_rate": 7.394131237077199e-05, "loss": 0.4148, "step": 903 }, { "epoch": 1.3007194244604317, "grad_norm": 0.4947960018961066, "learning_rate": 7.39200307608828e-05, "loss": 0.4228, "step": 904 }, { "epoch": 1.3021582733812949, "grad_norm": 2.809075742801845, "learning_rate": 7.389871491364036e-05, "loss": 0.4854, "step": 905 }, { "epoch": 1.3035971223021583, "grad_norm": 3.1902703322169135, "learning_rate": 7.387736485055993e-05, "loss": 0.4914, "step": 906 }, { "epoch": 1.3050359712230215, "grad_norm": 3.169079308937887, "learning_rate": 7.385598059319129e-05, "loss": 0.515, "step": 907 }, { "epoch": 1.306474820143885, "grad_norm": 2.296508230631993, "learning_rate": 7.383456216311875e-05, "loss": 0.4804, "step": 908 }, { "epoch": 1.3079136690647482, "grad_norm": 25.565188899774473, "learning_rate": 7.381310958196112e-05, "loss": 0.7105, "step": 909 }, { "epoch": 1.3093525179856116, "grad_norm": 544.9258123420445, "learning_rate": 7.379162287137167e-05, "loss": 6.2232, "step": 910 }, { "epoch": 1.3107913669064748, "grad_norm": 20.044170581073192, "learning_rate": 7.37701020530381e-05, "loss": 1.0189, "step": 911 }, { "epoch": 1.3122302158273382, "grad_norm": 32.03230660800528, "learning_rate": 7.374854714868259e-05, "loss": 1.0518, "step": 912 }, { "epoch": 1.3136690647482014, "grad_norm": 12.129176440633575, "learning_rate": 7.372695818006167e-05, "loss": 0.8048, "step": 913 }, { "epoch": 1.3151079136690647, "grad_norm": 20.22774171993354, "learning_rate": 7.370533516896627e-05, "loss": 1.0387, "step": 914 }, { "epoch": 1.316546762589928, "grad_norm": 4.675031924475641, "learning_rate": 7.368367813722169e-05, "loss": 0.6672, "step": 915 }, { "epoch": 1.3179856115107913, "grad_norm": 3.3403459553688046, "learning_rate": 7.366198710668755e-05, "loss": 0.6201, "step": 916 }, { "epoch": 1.3194244604316547, "grad_norm": 0.9943048642973623, "learning_rate": 7.364026209925783e-05, "loss": 0.5459, "step": 917 }, { "epoch": 1.320863309352518, "grad_norm": 91.42623205140953, "learning_rate": 7.361850313686076e-05, "loss": 0.9093, "step": 918 }, { "epoch": 1.3223021582733812, "grad_norm": 247.10257252231983, "learning_rate": 7.359671024145886e-05, "loss": 5.0509, "step": 919 }, { "epoch": 1.3237410071942446, "grad_norm": 18.58663410845094, "learning_rate": 7.35748834350489e-05, "loss": 1.4504, "step": 920 }, { "epoch": 1.325179856115108, "grad_norm": 5.159800355671047, "learning_rate": 7.355302273966186e-05, "loss": 0.9351, "step": 921 }, { "epoch": 1.3266187050359712, "grad_norm": 3.3547307250861187, "learning_rate": 7.353112817736295e-05, "loss": 0.7155, "step": 922 }, { "epoch": 1.3280575539568344, "grad_norm": 1.5664564817666657, "learning_rate": 7.350919977025157e-05, "loss": 0.5998, "step": 923 }, { "epoch": 1.3294964028776979, "grad_norm": 57.02363649654408, "learning_rate": 7.348723754046127e-05, "loss": 0.8424, "step": 924 }, { "epoch": 1.330935251798561, "grad_norm": 4.901340781457194, "learning_rate": 7.34652415101597e-05, "loss": 0.6654, "step": 925 }, { "epoch": 1.3323741007194245, "grad_norm": 2.5937845788317055, "learning_rate": 7.344321170154871e-05, "loss": 0.6026, "step": 926 }, { "epoch": 1.3338129496402877, "grad_norm": 0.9994241678902555, "learning_rate": 7.342114813686419e-05, "loss": 0.5567, "step": 927 }, { "epoch": 1.335251798561151, "grad_norm": 1.1829428406464586, "learning_rate": 7.339905083837608e-05, "loss": 0.5524, "step": 928 }, { "epoch": 1.3366906474820144, "grad_norm": 0.9102676284435843, "learning_rate": 7.337691982838841e-05, "loss": 0.5277, "step": 929 }, { "epoch": 1.3381294964028778, "grad_norm": 1.1130004246164928, "learning_rate": 7.335475512923924e-05, "loss": 0.525, "step": 930 }, { "epoch": 1.339568345323741, "grad_norm": 0.6353478530329731, "learning_rate": 7.33325567633006e-05, "loss": 0.5075, "step": 931 }, { "epoch": 1.3410071942446042, "grad_norm": 0.7154586227162724, "learning_rate": 7.331032475297855e-05, "loss": 0.4959, "step": 932 }, { "epoch": 1.3424460431654677, "grad_norm": 0.7249413976258889, "learning_rate": 7.328805912071307e-05, "loss": 0.4797, "step": 933 }, { "epoch": 1.3438848920863309, "grad_norm": 0.517053575901379, "learning_rate": 7.326575988897807e-05, "loss": 0.4764, "step": 934 }, { "epoch": 1.3453237410071943, "grad_norm": 0.5874905496337369, "learning_rate": 7.324342708028141e-05, "loss": 0.469, "step": 935 }, { "epoch": 1.3467625899280575, "grad_norm": 0.43744070662731827, "learning_rate": 7.322106071716483e-05, "loss": 0.4717, "step": 936 }, { "epoch": 1.3482014388489207, "grad_norm": 0.5453893982842335, "learning_rate": 7.319866082220388e-05, "loss": 0.4569, "step": 937 }, { "epoch": 1.3496402877697842, "grad_norm": 0.5200274639447857, "learning_rate": 7.317622741800808e-05, "loss": 0.4547, "step": 938 }, { "epoch": 1.3510791366906476, "grad_norm": 0.4821080932076026, "learning_rate": 7.315376052722065e-05, "loss": 0.4615, "step": 939 }, { "epoch": 1.3525179856115108, "grad_norm": 0.4078836036589005, "learning_rate": 7.313126017251868e-05, "loss": 0.4521, "step": 940 }, { "epoch": 1.353956834532374, "grad_norm": 0.3473670552596463, "learning_rate": 7.3108726376613e-05, "loss": 0.4379, "step": 941 }, { "epoch": 1.3553956834532375, "grad_norm": 0.3806100761112652, "learning_rate": 7.308615916224823e-05, "loss": 0.4441, "step": 942 }, { "epoch": 1.3568345323741007, "grad_norm": 0.410225859601859, "learning_rate": 7.306355855220267e-05, "loss": 0.4445, "step": 943 }, { "epoch": 1.358273381294964, "grad_norm": 0.31067952080530287, "learning_rate": 7.30409245692884e-05, "loss": 0.4376, "step": 944 }, { "epoch": 1.3597122302158273, "grad_norm": 0.36147055056886784, "learning_rate": 7.301825723635111e-05, "loss": 0.4518, "step": 945 }, { "epoch": 1.3611510791366905, "grad_norm": 0.4580320309729214, "learning_rate": 7.299555657627021e-05, "loss": 0.4368, "step": 946 }, { "epoch": 1.362589928057554, "grad_norm": 0.4679993897648468, "learning_rate": 7.29728226119587e-05, "loss": 0.4509, "step": 947 }, { "epoch": 1.3640287769784174, "grad_norm": 0.6932528279206558, "learning_rate": 7.295005536636325e-05, "loss": 0.4418, "step": 948 }, { "epoch": 1.3654676258992806, "grad_norm": 0.7804774459738667, "learning_rate": 7.292725486246407e-05, "loss": 0.4427, "step": 949 }, { "epoch": 1.3669064748201438, "grad_norm": 0.49179197252689205, "learning_rate": 7.290442112327498e-05, "loss": 0.4367, "step": 950 }, { "epoch": 1.3683453237410073, "grad_norm": 0.5982458676214871, "learning_rate": 7.288155417184331e-05, "loss": 0.4351, "step": 951 }, { "epoch": 1.3697841726618705, "grad_norm": 0.6881110302017649, "learning_rate": 7.285865403124995e-05, "loss": 0.4361, "step": 952 }, { "epoch": 1.371223021582734, "grad_norm": 0.9825511136783236, "learning_rate": 7.283572072460927e-05, "loss": 0.4451, "step": 953 }, { "epoch": 1.3726618705035971, "grad_norm": 1.1179334698800347, "learning_rate": 7.28127542750691e-05, "loss": 0.4385, "step": 954 }, { "epoch": 1.3741007194244603, "grad_norm": 0.43264156063929776, "learning_rate": 7.278975470581076e-05, "loss": 0.433, "step": 955 }, { "epoch": 1.3755395683453238, "grad_norm": 0.7286896948512599, "learning_rate": 7.276672204004898e-05, "loss": 0.4349, "step": 956 }, { "epoch": 1.376978417266187, "grad_norm": 0.8825376334064328, "learning_rate": 7.274365630103189e-05, "loss": 0.4338, "step": 957 }, { "epoch": 1.3784172661870504, "grad_norm": 0.6639218424283293, "learning_rate": 7.2720557512041e-05, "loss": 0.4345, "step": 958 }, { "epoch": 1.3798561151079136, "grad_norm": 0.43365041814785904, "learning_rate": 7.269742569639121e-05, "loss": 0.4357, "step": 959 }, { "epoch": 1.381294964028777, "grad_norm": 0.6429578440689165, "learning_rate": 7.267426087743073e-05, "loss": 0.4226, "step": 960 }, { "epoch": 1.3827338129496403, "grad_norm": 0.6408192107115253, "learning_rate": 7.265106307854107e-05, "loss": 0.4308, "step": 961 }, { "epoch": 1.3841726618705037, "grad_norm": 0.6000714524229904, "learning_rate": 7.262783232313706e-05, "loss": 0.4336, "step": 962 }, { "epoch": 1.385611510791367, "grad_norm": 0.5423233672454112, "learning_rate": 7.260456863466676e-05, "loss": 0.4234, "step": 963 }, { "epoch": 1.3870503597122301, "grad_norm": 0.623914979251738, "learning_rate": 7.258127203661153e-05, "loss": 0.4255, "step": 964 }, { "epoch": 1.3884892086330936, "grad_norm": 0.529835304290532, "learning_rate": 7.255794255248587e-05, "loss": 0.431, "step": 965 }, { "epoch": 1.3899280575539568, "grad_norm": 0.4873371283273517, "learning_rate": 7.253458020583752e-05, "loss": 0.4352, "step": 966 }, { "epoch": 1.3913669064748202, "grad_norm": 0.47874618876157593, "learning_rate": 7.25111850202474e-05, "loss": 0.423, "step": 967 }, { "epoch": 1.3928057553956834, "grad_norm": 0.44584845046178967, "learning_rate": 7.248775701932953e-05, "loss": 0.421, "step": 968 }, { "epoch": 1.3942446043165468, "grad_norm": 0.44820331929916596, "learning_rate": 7.246429622673111e-05, "loss": 0.4259, "step": 969 }, { "epoch": 1.39568345323741, "grad_norm": 0.3470987235749387, "learning_rate": 7.244080266613238e-05, "loss": 0.4237, "step": 970 }, { "epoch": 1.3971223021582735, "grad_norm": 0.4200217431351872, "learning_rate": 7.241727636124671e-05, "loss": 0.4169, "step": 971 }, { "epoch": 1.3985611510791367, "grad_norm": 0.47456172127286683, "learning_rate": 7.239371733582047e-05, "loss": 0.4245, "step": 972 }, { "epoch": 1.4, "grad_norm": 0.37151128289895746, "learning_rate": 7.23701256136331e-05, "loss": 0.4287, "step": 973 }, { "epoch": 1.4014388489208633, "grad_norm": 0.4077295931874118, "learning_rate": 7.2346501218497e-05, "loss": 0.4256, "step": 974 }, { "epoch": 1.4028776978417266, "grad_norm": 0.29297584698969287, "learning_rate": 7.23228441742576e-05, "loss": 0.4262, "step": 975 }, { "epoch": 1.40431654676259, "grad_norm": 0.38241491031733416, "learning_rate": 7.229915450479324e-05, "loss": 0.4242, "step": 976 }, { "epoch": 1.4057553956834532, "grad_norm": 0.37225834692829823, "learning_rate": 7.227543223401522e-05, "loss": 0.4224, "step": 977 }, { "epoch": 1.4071942446043164, "grad_norm": 0.2942808197297915, "learning_rate": 7.225167738586772e-05, "loss": 0.4209, "step": 978 }, { "epoch": 1.4086330935251798, "grad_norm": 0.4121809107133147, "learning_rate": 7.22278899843278e-05, "loss": 0.4283, "step": 979 }, { "epoch": 1.4100719424460433, "grad_norm": 0.35849148544830317, "learning_rate": 7.220407005340542e-05, "loss": 0.4249, "step": 980 }, { "epoch": 1.4115107913669065, "grad_norm": 0.37478659814507403, "learning_rate": 7.218021761714336e-05, "loss": 0.4251, "step": 981 }, { "epoch": 1.4129496402877697, "grad_norm": 0.34362680740178464, "learning_rate": 7.215633269961714e-05, "loss": 0.4192, "step": 982 }, { "epoch": 1.4143884892086331, "grad_norm": 0.4142071899637734, "learning_rate": 7.213241532493516e-05, "loss": 0.4252, "step": 983 }, { "epoch": 1.4158273381294963, "grad_norm": 0.3842983010726522, "learning_rate": 7.210846551723855e-05, "loss": 0.4182, "step": 984 }, { "epoch": 1.4172661870503598, "grad_norm": 0.303550811942089, "learning_rate": 7.208448330070116e-05, "loss": 0.4207, "step": 985 }, { "epoch": 1.418705035971223, "grad_norm": 0.31739587700139277, "learning_rate": 7.206046869952954e-05, "loss": 0.4225, "step": 986 }, { "epoch": 1.4201438848920862, "grad_norm": 0.33225743068915725, "learning_rate": 7.203642173796298e-05, "loss": 0.42, "step": 987 }, { "epoch": 1.4215827338129496, "grad_norm": 0.23677363095088855, "learning_rate": 7.201234244027338e-05, "loss": 0.4161, "step": 988 }, { "epoch": 1.423021582733813, "grad_norm": 0.26592553747289, "learning_rate": 7.19882308307653e-05, "loss": 0.4144, "step": 989 }, { "epoch": 1.4244604316546763, "grad_norm": 0.3655310161495709, "learning_rate": 7.196408693377594e-05, "loss": 0.4075, "step": 990 }, { "epoch": 1.4258992805755395, "grad_norm": 0.3103060704669727, "learning_rate": 7.193991077367501e-05, "loss": 0.4206, "step": 991 }, { "epoch": 1.427338129496403, "grad_norm": 0.30395533893193416, "learning_rate": 7.19157023748649e-05, "loss": 0.4205, "step": 992 }, { "epoch": 1.4287769784172661, "grad_norm": 0.29152986731845987, "learning_rate": 7.189146176178044e-05, "loss": 0.4339, "step": 993 }, { "epoch": 1.4302158273381296, "grad_norm": 0.2859730011717553, "learning_rate": 7.186718895888904e-05, "loss": 0.4176, "step": 994 }, { "epoch": 1.4316546762589928, "grad_norm": 0.39314266580090007, "learning_rate": 7.184288399069054e-05, "loss": 0.4129, "step": 995 }, { "epoch": 1.433093525179856, "grad_norm": 0.37407860657904163, "learning_rate": 7.181854688171732e-05, "loss": 0.4226, "step": 996 }, { "epoch": 1.4345323741007194, "grad_norm": 0.33658296177937513, "learning_rate": 7.179417765653413e-05, "loss": 0.4248, "step": 997 }, { "epoch": 1.4359712230215829, "grad_norm": 0.3651105251554801, "learning_rate": 7.17697763397382e-05, "loss": 0.4183, "step": 998 }, { "epoch": 1.437410071942446, "grad_norm": 0.37810065568343026, "learning_rate": 7.174534295595911e-05, "loss": 0.4219, "step": 999 }, { "epoch": 1.4388489208633093, "grad_norm": 0.35778324901029257, "learning_rate": 7.17208775298588e-05, "loss": 0.4241, "step": 1000 }, { "epoch": 1.4402877697841727, "grad_norm": 0.296505444912392, "learning_rate": 7.169638008613158e-05, "loss": 0.4189, "step": 1001 }, { "epoch": 1.441726618705036, "grad_norm": 0.29440226960781385, "learning_rate": 7.16718506495041e-05, "loss": 0.4264, "step": 1002 }, { "epoch": 1.4431654676258994, "grad_norm": 0.37142554533913175, "learning_rate": 7.164728924473522e-05, "loss": 0.4132, "step": 1003 }, { "epoch": 1.4446043165467626, "grad_norm": 0.47661380552179405, "learning_rate": 7.162269589661614e-05, "loss": 0.4065, "step": 1004 }, { "epoch": 1.4460431654676258, "grad_norm": 0.593874208360848, "learning_rate": 7.15980706299703e-05, "loss": 0.4155, "step": 1005 }, { "epoch": 1.4474820143884892, "grad_norm": 0.7149350346081539, "learning_rate": 7.15734134696533e-05, "loss": 0.4255, "step": 1006 }, { "epoch": 1.4489208633093524, "grad_norm": 0.7287849487118199, "learning_rate": 7.1548724440553e-05, "loss": 0.4238, "step": 1007 }, { "epoch": 1.4503597122302159, "grad_norm": 0.5985591154146628, "learning_rate": 7.152400356758937e-05, "loss": 0.422, "step": 1008 }, { "epoch": 1.451798561151079, "grad_norm": 0.5260381396690487, "learning_rate": 7.149925087571456e-05, "loss": 0.42, "step": 1009 }, { "epoch": 1.4532374100719425, "grad_norm": 0.5131418767923352, "learning_rate": 7.147446638991283e-05, "loss": 0.4221, "step": 1010 }, { "epoch": 1.4546762589928057, "grad_norm": 0.43675175574186137, "learning_rate": 7.14496501352005e-05, "loss": 0.4243, "step": 1011 }, { "epoch": 1.4561151079136692, "grad_norm": 0.3857976039087925, "learning_rate": 7.1424802136626e-05, "loss": 0.4235, "step": 1012 }, { "epoch": 1.4575539568345324, "grad_norm": 0.40581206323597546, "learning_rate": 7.139992241926978e-05, "loss": 0.4161, "step": 1013 }, { "epoch": 1.4589928057553956, "grad_norm": 0.4172114133159447, "learning_rate": 7.137501100824432e-05, "loss": 0.4157, "step": 1014 }, { "epoch": 1.460431654676259, "grad_norm": 0.3775952175793599, "learning_rate": 7.135006792869405e-05, "loss": 0.4154, "step": 1015 }, { "epoch": 1.4618705035971222, "grad_norm": 0.3341118295254195, "learning_rate": 7.132509320579542e-05, "loss": 0.4262, "step": 1016 }, { "epoch": 1.4633093525179857, "grad_norm": 0.3204508808850505, "learning_rate": 7.130008686475677e-05, "loss": 0.411, "step": 1017 }, { "epoch": 1.4647482014388489, "grad_norm": 0.4070976049771881, "learning_rate": 7.127504893081839e-05, "loss": 0.4189, "step": 1018 }, { "epoch": 1.4661870503597123, "grad_norm": 0.4921448353070135, "learning_rate": 7.124997942925244e-05, "loss": 0.4171, "step": 1019 }, { "epoch": 1.4676258992805755, "grad_norm": 0.545224514417452, "learning_rate": 7.122487838536295e-05, "loss": 0.4181, "step": 1020 }, { "epoch": 1.469064748201439, "grad_norm": 0.48394482335755673, "learning_rate": 7.119974582448577e-05, "loss": 0.423, "step": 1021 }, { "epoch": 1.4705035971223022, "grad_norm": 0.43772370368974317, "learning_rate": 7.11745817719886e-05, "loss": 0.4181, "step": 1022 }, { "epoch": 1.4719424460431654, "grad_norm": 0.3427269934601975, "learning_rate": 7.114938625327088e-05, "loss": 0.4063, "step": 1023 }, { "epoch": 1.4733812949640288, "grad_norm": 0.2924781802740147, "learning_rate": 7.112415929376385e-05, "loss": 0.4113, "step": 1024 }, { "epoch": 1.474820143884892, "grad_norm": 0.32198837101413197, "learning_rate": 7.109890091893047e-05, "loss": 0.4263, "step": 1025 }, { "epoch": 1.4762589928057555, "grad_norm": 0.282277936977763, "learning_rate": 7.107361115426537e-05, "loss": 0.4194, "step": 1026 }, { "epoch": 1.4776978417266187, "grad_norm": 0.25104791660310116, "learning_rate": 7.104829002529496e-05, "loss": 0.4229, "step": 1027 }, { "epoch": 1.4791366906474819, "grad_norm": 0.27270860838716215, "learning_rate": 7.102293755757721e-05, "loss": 0.4223, "step": 1028 }, { "epoch": 1.4805755395683453, "grad_norm": 0.26112130161175445, "learning_rate": 7.099755377670177e-05, "loss": 0.4196, "step": 1029 }, { "epoch": 1.4820143884892087, "grad_norm": 0.2616174582828157, "learning_rate": 7.097213870828989e-05, "loss": 0.4147, "step": 1030 }, { "epoch": 1.483453237410072, "grad_norm": 0.26475123111466153, "learning_rate": 7.094669237799437e-05, "loss": 0.4068, "step": 1031 }, { "epoch": 1.4848920863309352, "grad_norm": 0.30854399100120633, "learning_rate": 7.092121481149964e-05, "loss": 0.4211, "step": 1032 }, { "epoch": 1.4863309352517986, "grad_norm": 0.4252541911845257, "learning_rate": 7.089570603452157e-05, "loss": 0.4093, "step": 1033 }, { "epoch": 1.4877697841726618, "grad_norm": 0.41391174764328076, "learning_rate": 7.087016607280758e-05, "loss": 0.422, "step": 1034 }, { "epoch": 1.4892086330935252, "grad_norm": 0.31922865280493273, "learning_rate": 7.084459495213658e-05, "loss": 0.4265, "step": 1035 }, { "epoch": 1.4906474820143885, "grad_norm": 0.2818316267301246, "learning_rate": 7.081899269831888e-05, "loss": 0.4161, "step": 1036 }, { "epoch": 1.4920863309352517, "grad_norm": 0.2884472885297439, "learning_rate": 7.079335933719625e-05, "loss": 0.4057, "step": 1037 }, { "epoch": 1.493525179856115, "grad_norm": 0.2823559617033364, "learning_rate": 7.076769489464188e-05, "loss": 0.4228, "step": 1038 }, { "epoch": 1.4949640287769785, "grad_norm": 0.27447245904460427, "learning_rate": 7.074199939656027e-05, "loss": 0.4281, "step": 1039 }, { "epoch": 1.4964028776978417, "grad_norm": 0.30292815899900255, "learning_rate": 7.071627286888731e-05, "loss": 0.4293, "step": 1040 }, { "epoch": 1.497841726618705, "grad_norm": 0.3060320514171228, "learning_rate": 7.06905153375902e-05, "loss": 0.4198, "step": 1041 }, { "epoch": 1.4992805755395684, "grad_norm": 0.27322000525441553, "learning_rate": 7.066472682866744e-05, "loss": 0.42, "step": 1042 }, { "epoch": 1.5007194244604316, "grad_norm": 0.23469857234845679, "learning_rate": 7.063890736814878e-05, "loss": 0.4274, "step": 1043 }, { "epoch": 1.502158273381295, "grad_norm": 0.2623497692808711, "learning_rate": 7.061305698209524e-05, "loss": 0.4152, "step": 1044 }, { "epoch": 1.5035971223021583, "grad_norm": 0.3600783635419875, "learning_rate": 7.058717569659901e-05, "loss": 0.4085, "step": 1045 }, { "epoch": 1.5050359712230215, "grad_norm": 0.43471456079737636, "learning_rate": 7.05612635377835e-05, "loss": 0.4191, "step": 1046 }, { "epoch": 1.506474820143885, "grad_norm": 0.5194537867825783, "learning_rate": 7.053532053180332e-05, "loss": 0.4097, "step": 1047 }, { "epoch": 1.5079136690647483, "grad_norm": 0.5496556685894735, "learning_rate": 7.050934670484413e-05, "loss": 0.4121, "step": 1048 }, { "epoch": 1.5093525179856115, "grad_norm": 0.673626835157798, "learning_rate": 7.048334208312273e-05, "loss": 0.414, "step": 1049 }, { "epoch": 1.5107913669064748, "grad_norm": 0.8602181463655494, "learning_rate": 7.045730669288706e-05, "loss": 0.4102, "step": 1050 }, { "epoch": 1.512230215827338, "grad_norm": 0.8986188065364306, "learning_rate": 7.043124056041606e-05, "loss": 0.4251, "step": 1051 }, { "epoch": 1.5136690647482014, "grad_norm": 0.907388183546702, "learning_rate": 7.040514371201969e-05, "loss": 0.4181, "step": 1052 }, { "epoch": 1.5151079136690648, "grad_norm": 0.8893936713510535, "learning_rate": 7.037901617403894e-05, "loss": 0.4218, "step": 1053 }, { "epoch": 1.516546762589928, "grad_norm": 0.8161310640612155, "learning_rate": 7.035285797284578e-05, "loss": 0.4195, "step": 1054 }, { "epoch": 1.5179856115107913, "grad_norm": 0.7155314514536278, "learning_rate": 7.032666913484313e-05, "loss": 0.4168, "step": 1055 }, { "epoch": 1.5194244604316547, "grad_norm": 0.5447772835854228, "learning_rate": 7.030044968646481e-05, "loss": 0.4081, "step": 1056 }, { "epoch": 1.5208633093525181, "grad_norm": 0.30566260478398155, "learning_rate": 7.027419965417556e-05, "loss": 0.413, "step": 1057 }, { "epoch": 1.5223021582733813, "grad_norm": 0.3168915642410363, "learning_rate": 7.024791906447098e-05, "loss": 0.4231, "step": 1058 }, { "epoch": 1.5237410071942445, "grad_norm": 0.47337167935734187, "learning_rate": 7.022160794387751e-05, "loss": 0.4106, "step": 1059 }, { "epoch": 1.5251798561151078, "grad_norm": 0.5600565641871665, "learning_rate": 7.019526631895242e-05, "loss": 0.4145, "step": 1060 }, { "epoch": 1.5266187050359712, "grad_norm": 0.5347195703673441, "learning_rate": 7.016889421628374e-05, "loss": 0.414, "step": 1061 }, { "epoch": 1.5280575539568346, "grad_norm": 0.5109593840142974, "learning_rate": 7.014249166249032e-05, "loss": 0.429, "step": 1062 }, { "epoch": 1.5294964028776978, "grad_norm": 0.3632853891620355, "learning_rate": 7.011605868422168e-05, "loss": 0.4174, "step": 1063 }, { "epoch": 1.530935251798561, "grad_norm": 0.43564028553592854, "learning_rate": 7.00895953081581e-05, "loss": 0.4224, "step": 1064 }, { "epoch": 1.5323741007194245, "grad_norm": 0.4735520908422807, "learning_rate": 7.00631015610105e-05, "loss": 0.4201, "step": 1065 }, { "epoch": 1.533812949640288, "grad_norm": 0.4633117691574936, "learning_rate": 7.00365774695205e-05, "loss": 0.4219, "step": 1066 }, { "epoch": 1.5352517985611511, "grad_norm": 0.3744894751669038, "learning_rate": 7.001002306046031e-05, "loss": 0.4277, "step": 1067 }, { "epoch": 1.5366906474820143, "grad_norm": 0.3909096065367158, "learning_rate": 6.998343836063276e-05, "loss": 0.42, "step": 1068 }, { "epoch": 1.5381294964028775, "grad_norm": 0.35574327706934766, "learning_rate": 6.995682339687125e-05, "loss": 0.4203, "step": 1069 }, { "epoch": 1.539568345323741, "grad_norm": 0.475660686372296, "learning_rate": 6.993017819603973e-05, "loss": 0.4198, "step": 1070 }, { "epoch": 1.5410071942446044, "grad_norm": 0.5436500042351103, "learning_rate": 6.990350278503267e-05, "loss": 0.4149, "step": 1071 }, { "epoch": 1.5424460431654676, "grad_norm": 0.5620520336725127, "learning_rate": 6.9876797190775e-05, "loss": 0.413, "step": 1072 }, { "epoch": 1.5438848920863308, "grad_norm": 0.5336709442212036, "learning_rate": 6.985006144022219e-05, "loss": 0.4277, "step": 1073 }, { "epoch": 1.5453237410071943, "grad_norm": 0.5000331369952123, "learning_rate": 6.982329556036007e-05, "loss": 0.4179, "step": 1074 }, { "epoch": 1.5467625899280577, "grad_norm": 0.4747773119476601, "learning_rate": 6.979649957820494e-05, "loss": 0.4215, "step": 1075 }, { "epoch": 1.548201438848921, "grad_norm": 0.3942932302309888, "learning_rate": 6.976967352080345e-05, "loss": 0.417, "step": 1076 }, { "epoch": 1.5496402877697841, "grad_norm": 0.3524259398548682, "learning_rate": 6.974281741523259e-05, "loss": 0.4082, "step": 1077 }, { "epoch": 1.5510791366906473, "grad_norm": 0.33503081944705626, "learning_rate": 6.971593128859974e-05, "loss": 0.4135, "step": 1078 }, { "epoch": 1.5525179856115108, "grad_norm": 0.294686683623946, "learning_rate": 6.968901516804254e-05, "loss": 0.4099, "step": 1079 }, { "epoch": 1.5539568345323742, "grad_norm": 0.3048950261303681, "learning_rate": 6.966206908072891e-05, "loss": 0.4171, "step": 1080 }, { "epoch": 1.5553956834532374, "grad_norm": 0.40859414851177817, "learning_rate": 6.963509305385701e-05, "loss": 0.4209, "step": 1081 }, { "epoch": 1.5568345323741006, "grad_norm": 0.4126086910978001, "learning_rate": 6.960808711465524e-05, "loss": 0.4225, "step": 1082 }, { "epoch": 1.558273381294964, "grad_norm": 0.3863251061315781, "learning_rate": 6.958105129038216e-05, "loss": 0.4119, "step": 1083 }, { "epoch": 1.5597122302158275, "grad_norm": 0.32697010940880766, "learning_rate": 6.955398560832654e-05, "loss": 0.4126, "step": 1084 }, { "epoch": 1.5611510791366907, "grad_norm": 0.30920484763692224, "learning_rate": 6.952689009580724e-05, "loss": 0.4145, "step": 1085 }, { "epoch": 1.562589928057554, "grad_norm": 0.3355958190960155, "learning_rate": 6.949976478017327e-05, "loss": 0.429, "step": 1086 }, { "epoch": 1.5640287769784171, "grad_norm": 0.33720997246138995, "learning_rate": 6.947260968880369e-05, "loss": 0.4061, "step": 1087 }, { "epoch": 1.5654676258992806, "grad_norm": 0.365706540668333, "learning_rate": 6.944542484910763e-05, "loss": 0.4196, "step": 1088 }, { "epoch": 1.566906474820144, "grad_norm": 0.3191029511156109, "learning_rate": 6.941821028852424e-05, "loss": 0.4143, "step": 1089 }, { "epoch": 1.5683453237410072, "grad_norm": 0.3337584496700208, "learning_rate": 6.939096603452269e-05, "loss": 0.4056, "step": 1090 }, { "epoch": 1.5697841726618704, "grad_norm": 0.382575244198341, "learning_rate": 6.93636921146021e-05, "loss": 0.421, "step": 1091 }, { "epoch": 1.5712230215827339, "grad_norm": 0.3603814261628815, "learning_rate": 6.933638855629153e-05, "loss": 0.4229, "step": 1092 }, { "epoch": 1.572661870503597, "grad_norm": 0.33198688932968207, "learning_rate": 6.930905538714995e-05, "loss": 0.4063, "step": 1093 }, { "epoch": 1.5741007194244605, "grad_norm": 0.3319118654900316, "learning_rate": 6.928169263476628e-05, "loss": 0.4122, "step": 1094 }, { "epoch": 1.5755395683453237, "grad_norm": 0.24612081962950533, "learning_rate": 6.92543003267592e-05, "loss": 0.4256, "step": 1095 }, { "epoch": 1.576978417266187, "grad_norm": 0.24578792236490904, "learning_rate": 6.922687849077729e-05, "loss": 0.4127, "step": 1096 }, { "epoch": 1.5784172661870504, "grad_norm": 0.3620980186429764, "learning_rate": 6.919942715449893e-05, "loss": 0.417, "step": 1097 }, { "epoch": 1.5798561151079138, "grad_norm": 0.4056281865573301, "learning_rate": 6.917194634563225e-05, "loss": 0.4171, "step": 1098 }, { "epoch": 1.581294964028777, "grad_norm": 0.4151741231993419, "learning_rate": 6.914443609191514e-05, "loss": 0.4202, "step": 1099 }, { "epoch": 1.5827338129496402, "grad_norm": 0.42372551750913784, "learning_rate": 6.911689642111523e-05, "loss": 0.4258, "step": 1100 }, { "epoch": 1.5841726618705037, "grad_norm": 0.33954014694508594, "learning_rate": 6.90893273610298e-05, "loss": 0.4084, "step": 1101 }, { "epoch": 1.5856115107913669, "grad_norm": 0.3472411886216252, "learning_rate": 6.906172893948585e-05, "loss": 0.4279, "step": 1102 }, { "epoch": 1.5870503597122303, "grad_norm": 0.28252822327431176, "learning_rate": 6.903410118433996e-05, "loss": 0.4113, "step": 1103 }, { "epoch": 1.5884892086330935, "grad_norm": 0.21511848522747434, "learning_rate": 6.900644412347836e-05, "loss": 0.4162, "step": 1104 }, { "epoch": 1.5899280575539567, "grad_norm": 0.3377842110574489, "learning_rate": 6.897875778481682e-05, "loss": 0.4183, "step": 1105 }, { "epoch": 1.5913669064748202, "grad_norm": 0.413068015031045, "learning_rate": 6.89510421963007e-05, "loss": 0.418, "step": 1106 }, { "epoch": 1.5928057553956836, "grad_norm": 0.4520641357977242, "learning_rate": 6.892329738590489e-05, "loss": 0.4089, "step": 1107 }, { "epoch": 1.5942446043165468, "grad_norm": 0.5190681917869406, "learning_rate": 6.889552338163372e-05, "loss": 0.4165, "step": 1108 }, { "epoch": 1.59568345323741, "grad_norm": 0.5658167151358489, "learning_rate": 6.886772021152104e-05, "loss": 0.4077, "step": 1109 }, { "epoch": 1.5971223021582732, "grad_norm": 0.6251081781208279, "learning_rate": 6.883988790363009e-05, "loss": 0.4187, "step": 1110 }, { "epoch": 1.5985611510791367, "grad_norm": 0.6977729292257839, "learning_rate": 6.881202648605359e-05, "loss": 0.4192, "step": 1111 }, { "epoch": 1.6, "grad_norm": 0.6784368627526026, "learning_rate": 6.878413598691358e-05, "loss": 0.4081, "step": 1112 }, { "epoch": 1.6014388489208633, "grad_norm": 0.611175666696122, "learning_rate": 6.875621643436147e-05, "loss": 0.4271, "step": 1113 }, { "epoch": 1.6028776978417265, "grad_norm": 0.5275664308141039, "learning_rate": 6.872826785657802e-05, "loss": 0.4148, "step": 1114 }, { "epoch": 1.60431654676259, "grad_norm": 0.44921141035285606, "learning_rate": 6.870029028177324e-05, "loss": 0.4087, "step": 1115 }, { "epoch": 1.6057553956834534, "grad_norm": 0.4219492984920089, "learning_rate": 6.867228373818648e-05, "loss": 0.4096, "step": 1116 }, { "epoch": 1.6071942446043166, "grad_norm": 0.4661276903722583, "learning_rate": 6.864424825408624e-05, "loss": 0.4155, "step": 1117 }, { "epoch": 1.6086330935251798, "grad_norm": 0.45069842468613974, "learning_rate": 6.861618385777028e-05, "loss": 0.4221, "step": 1118 }, { "epoch": 1.610071942446043, "grad_norm": 0.4113236965059679, "learning_rate": 6.858809057756558e-05, "loss": 0.4117, "step": 1119 }, { "epoch": 1.6115107913669064, "grad_norm": 0.4019565134513667, "learning_rate": 6.855996844182819e-05, "loss": 0.4102, "step": 1120 }, { "epoch": 1.6129496402877699, "grad_norm": 0.46925268431368816, "learning_rate": 6.853181747894334e-05, "loss": 0.4158, "step": 1121 }, { "epoch": 1.614388489208633, "grad_norm": 0.5308935521837395, "learning_rate": 6.850363771732536e-05, "loss": 0.411, "step": 1122 }, { "epoch": 1.6158273381294963, "grad_norm": 0.5644359644344518, "learning_rate": 6.847542918541762e-05, "loss": 0.4185, "step": 1123 }, { "epoch": 1.6172661870503597, "grad_norm": 0.5485119854230217, "learning_rate": 6.844719191169254e-05, "loss": 0.4187, "step": 1124 }, { "epoch": 1.6187050359712232, "grad_norm": 0.46044082739927455, "learning_rate": 6.841892592465158e-05, "loss": 0.4059, "step": 1125 }, { "epoch": 1.6201438848920864, "grad_norm": 0.30868193509334346, "learning_rate": 6.839063125282512e-05, "loss": 0.4138, "step": 1126 }, { "epoch": 1.6215827338129496, "grad_norm": 0.28786051831720627, "learning_rate": 6.836230792477256e-05, "loss": 0.4069, "step": 1127 }, { "epoch": 1.6230215827338128, "grad_norm": 0.4336899650080154, "learning_rate": 6.833395596908217e-05, "loss": 0.4225, "step": 1128 }, { "epoch": 1.6244604316546762, "grad_norm": 0.4552106798230225, "learning_rate": 6.830557541437114e-05, "loss": 0.3948, "step": 1129 }, { "epoch": 1.6258992805755397, "grad_norm": 0.3344228802154637, "learning_rate": 6.827716628928556e-05, "loss": 0.4183, "step": 1130 }, { "epoch": 1.6273381294964029, "grad_norm": 0.2719259797015168, "learning_rate": 6.824872862250028e-05, "loss": 0.4107, "step": 1131 }, { "epoch": 1.628776978417266, "grad_norm": 0.38746826794473005, "learning_rate": 6.822026244271903e-05, "loss": 0.4127, "step": 1132 }, { "epoch": 1.6302158273381295, "grad_norm": 0.40749598860469377, "learning_rate": 6.819176777867425e-05, "loss": 0.4161, "step": 1133 }, { "epoch": 1.631654676258993, "grad_norm": 0.28357078247401757, "learning_rate": 6.816324465912723e-05, "loss": 0.4173, "step": 1134 }, { "epoch": 1.6330935251798562, "grad_norm": 0.2497149857828995, "learning_rate": 6.813469311286789e-05, "loss": 0.4053, "step": 1135 }, { "epoch": 1.6345323741007194, "grad_norm": 0.32384723529170906, "learning_rate": 6.810611316871488e-05, "loss": 0.4203, "step": 1136 }, { "epoch": 1.6359712230215826, "grad_norm": 0.4087164952132661, "learning_rate": 6.80775048555155e-05, "loss": 0.4228, "step": 1137 }, { "epoch": 1.637410071942446, "grad_norm": 0.36329025160015227, "learning_rate": 6.804886820214572e-05, "loss": 0.4136, "step": 1138 }, { "epoch": 1.6388489208633095, "grad_norm": 0.32032361616204535, "learning_rate": 6.802020323751008e-05, "loss": 0.4148, "step": 1139 }, { "epoch": 1.6402877697841727, "grad_norm": 0.3441846476219566, "learning_rate": 6.799150999054169e-05, "loss": 0.4168, "step": 1140 }, { "epoch": 1.641726618705036, "grad_norm": 0.76682242723555, "learning_rate": 6.796278849020225e-05, "loss": 0.4102, "step": 1141 }, { "epoch": 1.6431654676258993, "grad_norm": 0.33620190524925814, "learning_rate": 6.79340387654819e-05, "loss": 0.4211, "step": 1142 }, { "epoch": 1.6446043165467625, "grad_norm": 0.43533432543301903, "learning_rate": 6.790526084539939e-05, "loss": 0.4143, "step": 1143 }, { "epoch": 1.646043165467626, "grad_norm": 0.503682323862886, "learning_rate": 6.787645475900182e-05, "loss": 0.4121, "step": 1144 }, { "epoch": 1.6474820143884892, "grad_norm": 0.4407492323966843, "learning_rate": 6.784762053536475e-05, "loss": 0.4061, "step": 1145 }, { "epoch": 1.6489208633093524, "grad_norm": 0.3870577172781102, "learning_rate": 6.781875820359216e-05, "loss": 0.4261, "step": 1146 }, { "epoch": 1.6503597122302158, "grad_norm": 0.34174235825815463, "learning_rate": 6.778986779281639e-05, "loss": 0.4285, "step": 1147 }, { "epoch": 1.6517985611510793, "grad_norm": 0.34649866044879724, "learning_rate": 6.776094933219811e-05, "loss": 0.4176, "step": 1148 }, { "epoch": 1.6532374100719425, "grad_norm": 0.33122534094477013, "learning_rate": 6.773200285092633e-05, "loss": 0.4269, "step": 1149 }, { "epoch": 1.6546762589928057, "grad_norm": 0.38469081599419475, "learning_rate": 6.770302837821833e-05, "loss": 0.4193, "step": 1150 }, { "epoch": 1.6561151079136691, "grad_norm": 0.40768423839818896, "learning_rate": 6.767402594331961e-05, "loss": 0.4228, "step": 1151 }, { "epoch": 1.6575539568345323, "grad_norm": 0.45279080893685925, "learning_rate": 6.764499557550396e-05, "loss": 0.4164, "step": 1152 }, { "epoch": 1.6589928057553958, "grad_norm": 0.4903719398809837, "learning_rate": 6.761593730407329e-05, "loss": 0.4085, "step": 1153 }, { "epoch": 1.660431654676259, "grad_norm": 0.487492750001549, "learning_rate": 6.758685115835776e-05, "loss": 0.4168, "step": 1154 }, { "epoch": 1.6618705035971222, "grad_norm": 0.5583506134292617, "learning_rate": 6.755773716771555e-05, "loss": 0.4145, "step": 1155 }, { "epoch": 1.6633093525179856, "grad_norm": 0.5956179425579616, "learning_rate": 6.752859536153306e-05, "loss": 0.4223, "step": 1156 }, { "epoch": 1.664748201438849, "grad_norm": 0.6033217069177361, "learning_rate": 6.749942576922473e-05, "loss": 0.4039, "step": 1157 }, { "epoch": 1.6661870503597123, "grad_norm": 0.6583228089216405, "learning_rate": 6.7470228420233e-05, "loss": 0.4217, "step": 1158 }, { "epoch": 1.6676258992805755, "grad_norm": 0.6680499004987813, "learning_rate": 6.744100334402836e-05, "loss": 0.4115, "step": 1159 }, { "epoch": 1.6690647482014387, "grad_norm": 0.5380725649234539, "learning_rate": 6.741175057010932e-05, "loss": 0.4228, "step": 1160 }, { "epoch": 1.6705035971223021, "grad_norm": 0.5426590107448707, "learning_rate": 6.738247012800228e-05, "loss": 0.4263, "step": 1161 }, { "epoch": 1.6719424460431656, "grad_norm": 0.3446250027390038, "learning_rate": 6.735316204726163e-05, "loss": 0.406, "step": 1162 }, { "epoch": 1.6733812949640288, "grad_norm": 0.363875910280198, "learning_rate": 6.732382635746961e-05, "loss": 0.4086, "step": 1163 }, { "epoch": 1.674820143884892, "grad_norm": 0.3644991778739921, "learning_rate": 6.729446308823635e-05, "loss": 0.4203, "step": 1164 }, { "epoch": 1.6762589928057554, "grad_norm": 0.349716535674518, "learning_rate": 6.72650722691998e-05, "loss": 0.4159, "step": 1165 }, { "epoch": 1.6776978417266188, "grad_norm": 0.3514867098428374, "learning_rate": 6.723565393002576e-05, "loss": 0.405, "step": 1166 }, { "epoch": 1.679136690647482, "grad_norm": 0.4165407906592057, "learning_rate": 6.720620810040776e-05, "loss": 0.4055, "step": 1167 }, { "epoch": 1.6805755395683453, "grad_norm": 0.5052046649138455, "learning_rate": 6.717673481006709e-05, "loss": 0.4189, "step": 1168 }, { "epoch": 1.6820143884892085, "grad_norm": 0.5142496765109894, "learning_rate": 6.714723408875279e-05, "loss": 0.4089, "step": 1169 }, { "epoch": 1.683453237410072, "grad_norm": 0.4553648017174242, "learning_rate": 6.711770596624153e-05, "loss": 0.417, "step": 1170 }, { "epoch": 1.6848920863309353, "grad_norm": 0.3730268976826378, "learning_rate": 6.708815047233768e-05, "loss": 0.4156, "step": 1171 }, { "epoch": 1.6863309352517986, "grad_norm": 0.37168971826069924, "learning_rate": 6.705856763687324e-05, "loss": 0.4131, "step": 1172 }, { "epoch": 1.6877697841726618, "grad_norm": 0.3092228093700726, "learning_rate": 6.702895748970776e-05, "loss": 0.4152, "step": 1173 }, { "epoch": 1.6892086330935252, "grad_norm": 0.38955015639100576, "learning_rate": 6.699932006072842e-05, "loss": 0.4162, "step": 1174 }, { "epoch": 1.6906474820143886, "grad_norm": 0.5080735895636728, "learning_rate": 6.69696553798499e-05, "loss": 0.4114, "step": 1175 }, { "epoch": 1.6920863309352518, "grad_norm": 0.4620152095782709, "learning_rate": 6.693996347701442e-05, "loss": 0.4149, "step": 1176 }, { "epoch": 1.693525179856115, "grad_norm": 0.47857199036868725, "learning_rate": 6.691024438219159e-05, "loss": 0.4141, "step": 1177 }, { "epoch": 1.6949640287769783, "grad_norm": 0.48308800583112116, "learning_rate": 6.688049812537857e-05, "loss": 0.4176, "step": 1178 }, { "epoch": 1.6964028776978417, "grad_norm": 0.48259699395769545, "learning_rate": 6.685072473659989e-05, "loss": 0.4157, "step": 1179 }, { "epoch": 1.6978417266187051, "grad_norm": 0.36203361838439907, "learning_rate": 6.682092424590747e-05, "loss": 0.4083, "step": 1180 }, { "epoch": 1.6992805755395683, "grad_norm": 0.31495479128819703, "learning_rate": 6.679109668338057e-05, "loss": 0.4087, "step": 1181 }, { "epoch": 1.7007194244604316, "grad_norm": 0.3318572949891459, "learning_rate": 6.676124207912582e-05, "loss": 0.4079, "step": 1182 }, { "epoch": 1.702158273381295, "grad_norm": 0.41650781673916587, "learning_rate": 6.673136046327707e-05, "loss": 0.4184, "step": 1183 }, { "epoch": 1.7035971223021584, "grad_norm": 0.5589325931977038, "learning_rate": 6.670145186599552e-05, "loss": 0.4135, "step": 1184 }, { "epoch": 1.7050359712230216, "grad_norm": 0.6913999906311661, "learning_rate": 6.667151631746953e-05, "loss": 0.413, "step": 1185 }, { "epoch": 1.7064748201438849, "grad_norm": 0.6701838285934596, "learning_rate": 6.664155384791473e-05, "loss": 0.4095, "step": 1186 }, { "epoch": 1.707913669064748, "grad_norm": 0.5579192511025082, "learning_rate": 6.661156448757386e-05, "loss": 0.4234, "step": 1187 }, { "epoch": 1.7093525179856115, "grad_norm": 0.57084696911057, "learning_rate": 6.658154826671685e-05, "loss": 0.4214, "step": 1188 }, { "epoch": 1.710791366906475, "grad_norm": 0.538521220072237, "learning_rate": 6.655150521564072e-05, "loss": 0.4177, "step": 1189 }, { "epoch": 1.7122302158273381, "grad_norm": 0.4459896651146677, "learning_rate": 6.652143536466955e-05, "loss": 0.4151, "step": 1190 }, { "epoch": 1.7136690647482014, "grad_norm": 0.4425563287727174, "learning_rate": 6.649133874415454e-05, "loss": 0.4261, "step": 1191 }, { "epoch": 1.7151079136690648, "grad_norm": 0.5044467565268774, "learning_rate": 6.646121538447382e-05, "loss": 0.4216, "step": 1192 }, { "epoch": 1.7165467625899282, "grad_norm": 0.3354306043962459, "learning_rate": 6.643106531603259e-05, "loss": 0.4219, "step": 1193 }, { "epoch": 1.7179856115107914, "grad_norm": 0.39357004259393435, "learning_rate": 6.640088856926294e-05, "loss": 0.4076, "step": 1194 }, { "epoch": 1.7194244604316546, "grad_norm": 0.5273217051181752, "learning_rate": 6.637068517462395e-05, "loss": 0.4218, "step": 1195 }, { "epoch": 1.7208633093525179, "grad_norm": 0.6087745983365078, "learning_rate": 6.634045516260156e-05, "loss": 0.4126, "step": 1196 }, { "epoch": 1.7223021582733813, "grad_norm": 0.47047396262886537, "learning_rate": 6.631019856370856e-05, "loss": 0.4115, "step": 1197 }, { "epoch": 1.7237410071942447, "grad_norm": 0.33245740608012014, "learning_rate": 6.627991540848464e-05, "loss": 0.4201, "step": 1198 }, { "epoch": 1.725179856115108, "grad_norm": 0.41922774163862975, "learning_rate": 6.624960572749622e-05, "loss": 0.4126, "step": 1199 }, { "epoch": 1.7266187050359711, "grad_norm": 0.4384331345863592, "learning_rate": 6.621926955133657e-05, "loss": 0.4251, "step": 1200 }, { "epoch": 1.7280575539568346, "grad_norm": 0.4079618292679597, "learning_rate": 6.618890691062561e-05, "loss": 0.4146, "step": 1201 }, { "epoch": 1.7294964028776978, "grad_norm": 0.36251729181268255, "learning_rate": 6.615851783601006e-05, "loss": 0.4125, "step": 1202 }, { "epoch": 1.7309352517985612, "grad_norm": 0.3201921174826245, "learning_rate": 6.612810235816326e-05, "loss": 0.4177, "step": 1203 }, { "epoch": 1.7323741007194244, "grad_norm": 0.30717496698971825, "learning_rate": 6.609766050778525e-05, "loss": 0.4134, "step": 1204 }, { "epoch": 1.7338129496402876, "grad_norm": 0.34144771508129956, "learning_rate": 6.606719231560265e-05, "loss": 0.4103, "step": 1205 }, { "epoch": 1.735251798561151, "grad_norm": 0.3399546603031398, "learning_rate": 6.60366978123687e-05, "loss": 0.4168, "step": 1206 }, { "epoch": 1.7366906474820145, "grad_norm": 0.3136693953734484, "learning_rate": 6.600617702886314e-05, "loss": 0.4127, "step": 1207 }, { "epoch": 1.7381294964028777, "grad_norm": 0.31926857746325077, "learning_rate": 6.597562999589233e-05, "loss": 0.4101, "step": 1208 }, { "epoch": 1.739568345323741, "grad_norm": 0.358660556486896, "learning_rate": 6.594505674428903e-05, "loss": 0.4203, "step": 1209 }, { "epoch": 1.7410071942446042, "grad_norm": 0.3659716126309705, "learning_rate": 6.59144573049125e-05, "loss": 0.4167, "step": 1210 }, { "epoch": 1.7424460431654676, "grad_norm": 0.4052981700072071, "learning_rate": 6.588383170864849e-05, "loss": 0.4084, "step": 1211 }, { "epoch": 1.743884892086331, "grad_norm": 0.37102819892247557, "learning_rate": 6.585317998640903e-05, "loss": 0.4114, "step": 1212 }, { "epoch": 1.7453237410071942, "grad_norm": 0.37890137130457685, "learning_rate": 6.582250216913265e-05, "loss": 0.4049, "step": 1213 }, { "epoch": 1.7467625899280574, "grad_norm": 0.47660635938468554, "learning_rate": 6.579179828778414e-05, "loss": 0.4317, "step": 1214 }, { "epoch": 1.7482014388489209, "grad_norm": 0.5403075083072826, "learning_rate": 6.576106837335458e-05, "loss": 0.4058, "step": 1215 }, { "epoch": 1.7496402877697843, "grad_norm": 0.5866191248034582, "learning_rate": 6.573031245686142e-05, "loss": 0.4115, "step": 1216 }, { "epoch": 1.7510791366906475, "grad_norm": 0.504537377126103, "learning_rate": 6.569953056934826e-05, "loss": 0.4177, "step": 1217 }, { "epoch": 1.7525179856115107, "grad_norm": 0.43781511223917, "learning_rate": 6.566872274188496e-05, "loss": 0.4208, "step": 1218 }, { "epoch": 1.753956834532374, "grad_norm": 0.4636740890078483, "learning_rate": 6.563788900556756e-05, "loss": 0.4162, "step": 1219 }, { "epoch": 1.7553956834532374, "grad_norm": 0.4134745031820477, "learning_rate": 6.560702939151826e-05, "loss": 0.4032, "step": 1220 }, { "epoch": 1.7568345323741008, "grad_norm": 0.44431900170972155, "learning_rate": 6.557614393088534e-05, "loss": 0.4132, "step": 1221 }, { "epoch": 1.758273381294964, "grad_norm": 0.43696209712924794, "learning_rate": 6.554523265484321e-05, "loss": 0.4109, "step": 1222 }, { "epoch": 1.7597122302158272, "grad_norm": 0.3936897209104171, "learning_rate": 6.551429559459231e-05, "loss": 0.4185, "step": 1223 }, { "epoch": 1.7611510791366907, "grad_norm": 0.3723578226039111, "learning_rate": 6.548333278135915e-05, "loss": 0.4089, "step": 1224 }, { "epoch": 1.762589928057554, "grad_norm": 0.3569593718648685, "learning_rate": 6.545234424639616e-05, "loss": 0.3998, "step": 1225 }, { "epoch": 1.7640287769784173, "grad_norm": 0.3178602355572999, "learning_rate": 6.542133002098178e-05, "loss": 0.4038, "step": 1226 }, { "epoch": 1.7654676258992805, "grad_norm": 0.3429199290938974, "learning_rate": 6.53902901364204e-05, "loss": 0.4046, "step": 1227 }, { "epoch": 1.7669064748201437, "grad_norm": 0.35686322312114677, "learning_rate": 6.535922462404226e-05, "loss": 0.409, "step": 1228 }, { "epoch": 1.7683453237410072, "grad_norm": 0.33994382692535474, "learning_rate": 6.53281335152035e-05, "loss": 0.4175, "step": 1229 }, { "epoch": 1.7697841726618706, "grad_norm": 0.27286996566385857, "learning_rate": 6.529701684128608e-05, "loss": 0.4114, "step": 1230 }, { "epoch": 1.7712230215827338, "grad_norm": 0.3375280762047218, "learning_rate": 6.526587463369779e-05, "loss": 0.4111, "step": 1231 }, { "epoch": 1.772661870503597, "grad_norm": 0.32409851741711043, "learning_rate": 6.523470692387215e-05, "loss": 0.4067, "step": 1232 }, { "epoch": 1.7741007194244605, "grad_norm": 0.32465639257449525, "learning_rate": 6.520351374326846e-05, "loss": 0.425, "step": 1233 }, { "epoch": 1.775539568345324, "grad_norm": 0.3072756505982714, "learning_rate": 6.51722951233717e-05, "loss": 0.4048, "step": 1234 }, { "epoch": 1.776978417266187, "grad_norm": 0.296270829699046, "learning_rate": 6.514105109569254e-05, "loss": 0.4134, "step": 1235 }, { "epoch": 1.7784172661870503, "grad_norm": 0.3815748221438414, "learning_rate": 6.510978169176731e-05, "loss": 0.4147, "step": 1236 }, { "epoch": 1.7798561151079135, "grad_norm": 0.4231476871569944, "learning_rate": 6.507848694315794e-05, "loss": 0.4042, "step": 1237 }, { "epoch": 1.781294964028777, "grad_norm": 0.38830569275115767, "learning_rate": 6.504716688145192e-05, "loss": 0.4198, "step": 1238 }, { "epoch": 1.7827338129496404, "grad_norm": 0.4021387335238997, "learning_rate": 6.501582153826235e-05, "loss": 0.4188, "step": 1239 }, { "epoch": 1.7841726618705036, "grad_norm": 0.33591783933274055, "learning_rate": 6.498445094522776e-05, "loss": 0.4141, "step": 1240 }, { "epoch": 1.7856115107913668, "grad_norm": 0.3400651749476804, "learning_rate": 6.495305513401226e-05, "loss": 0.4161, "step": 1241 }, { "epoch": 1.7870503597122303, "grad_norm": 0.420865484735002, "learning_rate": 6.492163413630534e-05, "loss": 0.4099, "step": 1242 }, { "epoch": 1.7884892086330937, "grad_norm": 0.4085444891570115, "learning_rate": 6.489018798382195e-05, "loss": 0.4144, "step": 1243 }, { "epoch": 1.789928057553957, "grad_norm": 0.45628227242989433, "learning_rate": 6.485871670830243e-05, "loss": 0.4097, "step": 1244 }, { "epoch": 1.79136690647482, "grad_norm": 0.43906513069837294, "learning_rate": 6.482722034151247e-05, "loss": 0.4063, "step": 1245 }, { "epoch": 1.7928057553956833, "grad_norm": 0.3609264245556682, "learning_rate": 6.479569891524307e-05, "loss": 0.4124, "step": 1246 }, { "epoch": 1.7942446043165468, "grad_norm": 0.3209187946231822, "learning_rate": 6.476415246131056e-05, "loss": 0.4152, "step": 1247 }, { "epoch": 1.7956834532374102, "grad_norm": 0.3604096044404803, "learning_rate": 6.47325810115565e-05, "loss": 0.4058, "step": 1248 }, { "epoch": 1.7971223021582734, "grad_norm": 0.4160391262347793, "learning_rate": 6.470098459784768e-05, "loss": 0.4142, "step": 1249 }, { "epoch": 1.7985611510791366, "grad_norm": 0.33336614192768277, "learning_rate": 6.466936325207612e-05, "loss": 0.4195, "step": 1250 }, { "epoch": 1.8, "grad_norm": 0.2750037566170555, "learning_rate": 6.463771700615898e-05, "loss": 0.4019, "step": 1251 }, { "epoch": 1.8014388489208633, "grad_norm": 0.28172172333368006, "learning_rate": 6.460604589203854e-05, "loss": 0.41, "step": 1252 }, { "epoch": 1.8028776978417267, "grad_norm": 0.29998294888495797, "learning_rate": 6.457434994168224e-05, "loss": 0.4124, "step": 1253 }, { "epoch": 1.80431654676259, "grad_norm": 0.3017781858392867, "learning_rate": 6.454262918708247e-05, "loss": 0.4159, "step": 1254 }, { "epoch": 1.8057553956834531, "grad_norm": 0.39356494672954967, "learning_rate": 6.451088366025682e-05, "loss": 0.4126, "step": 1255 }, { "epoch": 1.8071942446043165, "grad_norm": 0.40728281276855766, "learning_rate": 6.447911339324773e-05, "loss": 0.4142, "step": 1256 }, { "epoch": 1.80863309352518, "grad_norm": 0.35974633943904694, "learning_rate": 6.444731841812274e-05, "loss": 0.4118, "step": 1257 }, { "epoch": 1.8100719424460432, "grad_norm": 0.35993558626760885, "learning_rate": 6.44154987669742e-05, "loss": 0.401, "step": 1258 }, { "epoch": 1.8115107913669064, "grad_norm": 0.36454535373259755, "learning_rate": 6.438365447191947e-05, "loss": 0.412, "step": 1259 }, { "epoch": 1.8129496402877698, "grad_norm": 0.4577783792211193, "learning_rate": 6.435178556510076e-05, "loss": 0.4172, "step": 1260 }, { "epoch": 1.814388489208633, "grad_norm": 0.4624565308209469, "learning_rate": 6.431989207868508e-05, "loss": 0.4134, "step": 1261 }, { "epoch": 1.8158273381294965, "grad_norm": 0.5594542203277918, "learning_rate": 6.428797404486431e-05, "loss": 0.4183, "step": 1262 }, { "epoch": 1.8172661870503597, "grad_norm": 0.7594171209916067, "learning_rate": 6.425603149585507e-05, "loss": 0.4208, "step": 1263 }, { "epoch": 1.818705035971223, "grad_norm": 0.8261431524291456, "learning_rate": 6.422406446389872e-05, "loss": 0.4088, "step": 1264 }, { "epoch": 1.8201438848920863, "grad_norm": 0.8291601036807382, "learning_rate": 6.419207298126135e-05, "loss": 0.427, "step": 1265 }, { "epoch": 1.8215827338129498, "grad_norm": 0.8404790471917353, "learning_rate": 6.416005708023372e-05, "loss": 0.4177, "step": 1266 }, { "epoch": 1.823021582733813, "grad_norm": 0.6143417437150636, "learning_rate": 6.412801679313125e-05, "loss": 0.4157, "step": 1267 }, { "epoch": 1.8244604316546762, "grad_norm": 0.34204692176509743, "learning_rate": 6.409595215229397e-05, "loss": 0.4191, "step": 1268 }, { "epoch": 1.8258992805755394, "grad_norm": 0.40360132906785, "learning_rate": 6.406386319008647e-05, "loss": 0.4093, "step": 1269 }, { "epoch": 1.8273381294964028, "grad_norm": 0.5509190671616278, "learning_rate": 6.403174993889791e-05, "loss": 0.4149, "step": 1270 }, { "epoch": 1.8287769784172663, "grad_norm": 0.6010830771352358, "learning_rate": 6.399961243114197e-05, "loss": 0.4054, "step": 1271 }, { "epoch": 1.8302158273381295, "grad_norm": 0.4913176518711791, "learning_rate": 6.39674506992568e-05, "loss": 0.4156, "step": 1272 }, { "epoch": 1.8316546762589927, "grad_norm": 0.41656157572707764, "learning_rate": 6.393526477570499e-05, "loss": 0.4214, "step": 1273 }, { "epoch": 1.8330935251798561, "grad_norm": 0.2891390865807047, "learning_rate": 6.390305469297357e-05, "loss": 0.4125, "step": 1274 }, { "epoch": 1.8345323741007196, "grad_norm": 0.30863018534875003, "learning_rate": 6.387082048357397e-05, "loss": 0.415, "step": 1275 }, { "epoch": 1.8359712230215828, "grad_norm": 0.37043296322044944, "learning_rate": 6.383856218004193e-05, "loss": 0.4004, "step": 1276 }, { "epoch": 1.837410071942446, "grad_norm": 0.3284467353673348, "learning_rate": 6.380627981493753e-05, "loss": 0.4065, "step": 1277 }, { "epoch": 1.8388489208633092, "grad_norm": 0.26346469726495136, "learning_rate": 6.377397342084514e-05, "loss": 0.4137, "step": 1278 }, { "epoch": 1.8402877697841726, "grad_norm": 0.275682686605177, "learning_rate": 6.37416430303734e-05, "loss": 0.4062, "step": 1279 }, { "epoch": 1.841726618705036, "grad_norm": 0.3188723729080881, "learning_rate": 6.370928867615513e-05, "loss": 0.4074, "step": 1280 }, { "epoch": 1.8431654676258993, "grad_norm": 0.34449166633354666, "learning_rate": 6.367691039084736e-05, "loss": 0.4129, "step": 1281 }, { "epoch": 1.8446043165467625, "grad_norm": 0.32534814011808155, "learning_rate": 6.36445082071313e-05, "loss": 0.4045, "step": 1282 }, { "epoch": 1.846043165467626, "grad_norm": 0.3034647114568148, "learning_rate": 6.361208215771222e-05, "loss": 0.4122, "step": 1283 }, { "epoch": 1.8474820143884894, "grad_norm": 0.3144141267128642, "learning_rate": 6.357963227531954e-05, "loss": 0.415, "step": 1284 }, { "epoch": 1.8489208633093526, "grad_norm": 0.3560605725734302, "learning_rate": 6.35471585927067e-05, "loss": 0.4146, "step": 1285 }, { "epoch": 1.8503597122302158, "grad_norm": 0.31395779215893044, "learning_rate": 6.351466114265118e-05, "loss": 0.4107, "step": 1286 }, { "epoch": 1.851798561151079, "grad_norm": 0.2759705546226354, "learning_rate": 6.348213995795445e-05, "loss": 0.414, "step": 1287 }, { "epoch": 1.8532374100719424, "grad_norm": 0.3899655847777587, "learning_rate": 6.344959507144192e-05, "loss": 0.4116, "step": 1288 }, { "epoch": 1.8546762589928059, "grad_norm": 0.3814047970832645, "learning_rate": 6.341702651596293e-05, "loss": 0.4173, "step": 1289 }, { "epoch": 1.856115107913669, "grad_norm": 0.37854424120396857, "learning_rate": 6.338443432439074e-05, "loss": 0.4188, "step": 1290 }, { "epoch": 1.8575539568345323, "grad_norm": 0.39368141971937126, "learning_rate": 6.335181852962242e-05, "loss": 0.4162, "step": 1291 }, { "epoch": 1.8589928057553957, "grad_norm": 0.8658673346387007, "learning_rate": 6.331917916457889e-05, "loss": 0.4206, "step": 1292 }, { "epoch": 1.8604316546762591, "grad_norm": 0.49994881402323965, "learning_rate": 6.328651626220485e-05, "loss": 0.4059, "step": 1293 }, { "epoch": 1.8618705035971224, "grad_norm": 0.5250618927656368, "learning_rate": 6.325382985546879e-05, "loss": 0.4082, "step": 1294 }, { "epoch": 1.8633093525179856, "grad_norm": 0.6803949679147394, "learning_rate": 6.322111997736288e-05, "loss": 0.4158, "step": 1295 }, { "epoch": 1.8647482014388488, "grad_norm": 0.8362950445425676, "learning_rate": 6.3188386660903e-05, "loss": 0.419, "step": 1296 }, { "epoch": 1.8661870503597122, "grad_norm": 0.823545918156688, "learning_rate": 6.315562993912869e-05, "loss": 0.4124, "step": 1297 }, { "epoch": 1.8676258992805757, "grad_norm": 0.801474049829953, "learning_rate": 6.31228498451031e-05, "loss": 0.4289, "step": 1298 }, { "epoch": 1.8690647482014389, "grad_norm": 0.6485961538539604, "learning_rate": 6.309004641191299e-05, "loss": 0.416, "step": 1299 }, { "epoch": 1.870503597122302, "grad_norm": 0.4779049998944919, "learning_rate": 6.305721967266869e-05, "loss": 0.4081, "step": 1300 }, { "epoch": 1.8719424460431655, "grad_norm": 0.40011164454277504, "learning_rate": 6.302436966050401e-05, "loss": 0.4164, "step": 1301 }, { "epoch": 1.873381294964029, "grad_norm": 0.41896670092009536, "learning_rate": 6.29914964085763e-05, "loss": 0.4083, "step": 1302 }, { "epoch": 1.8748201438848922, "grad_norm": 0.49824582360359393, "learning_rate": 6.295859995006629e-05, "loss": 0.4031, "step": 1303 }, { "epoch": 1.8762589928057554, "grad_norm": 0.41962534284540975, "learning_rate": 6.292568031817823e-05, "loss": 0.417, "step": 1304 }, { "epoch": 1.8776978417266186, "grad_norm": 0.34665690081232475, "learning_rate": 6.28927375461397e-05, "loss": 0.4083, "step": 1305 }, { "epoch": 1.879136690647482, "grad_norm": 0.44765233076906147, "learning_rate": 6.285977166720166e-05, "loss": 0.4172, "step": 1306 }, { "epoch": 1.8805755395683454, "grad_norm": 0.46476012579703596, "learning_rate": 6.28267827146384e-05, "loss": 0.4137, "step": 1307 }, { "epoch": 1.8820143884892087, "grad_norm": 0.38037507101938134, "learning_rate": 6.279377072174744e-05, "loss": 0.4109, "step": 1308 }, { "epoch": 1.8834532374100719, "grad_norm": 0.32169622312544566, "learning_rate": 6.276073572184964e-05, "loss": 0.4055, "step": 1309 }, { "epoch": 1.8848920863309353, "grad_norm": 0.36816131402540186, "learning_rate": 6.272767774828903e-05, "loss": 0.4077, "step": 1310 }, { "epoch": 1.8863309352517985, "grad_norm": 0.3700201083539994, "learning_rate": 6.269459683443283e-05, "loss": 0.4069, "step": 1311 }, { "epoch": 1.887769784172662, "grad_norm": 0.35744457741260083, "learning_rate": 6.266149301367146e-05, "loss": 0.4161, "step": 1312 }, { "epoch": 1.8892086330935252, "grad_norm": 0.3581449937273382, "learning_rate": 6.262836631941839e-05, "loss": 0.415, "step": 1313 }, { "epoch": 1.8906474820143884, "grad_norm": 0.364350376748174, "learning_rate": 6.259521678511023e-05, "loss": 0.4088, "step": 1314 }, { "epoch": 1.8920863309352518, "grad_norm": 0.3099762299417227, "learning_rate": 6.256204444420663e-05, "loss": 0.4036, "step": 1315 }, { "epoch": 1.8935251798561152, "grad_norm": 0.34894540444737093, "learning_rate": 6.252884933019028e-05, "loss": 0.4089, "step": 1316 }, { "epoch": 1.8949640287769784, "grad_norm": 0.44353539498483013, "learning_rate": 6.249563147656679e-05, "loss": 0.4061, "step": 1317 }, { "epoch": 1.8964028776978417, "grad_norm": 0.4498233764462982, "learning_rate": 6.24623909168648e-05, "loss": 0.4165, "step": 1318 }, { "epoch": 1.8978417266187049, "grad_norm": 0.41749822238022266, "learning_rate": 6.242912768463581e-05, "loss": 0.3984, "step": 1319 }, { "epoch": 1.8992805755395683, "grad_norm": 0.34484560899709454, "learning_rate": 6.239584181345426e-05, "loss": 0.4125, "step": 1320 }, { "epoch": 1.9007194244604317, "grad_norm": 0.427987432108927, "learning_rate": 6.236253333691739e-05, "loss": 0.4023, "step": 1321 }, { "epoch": 1.902158273381295, "grad_norm": 0.4823641444906322, "learning_rate": 6.23292022886453e-05, "loss": 0.4042, "step": 1322 }, { "epoch": 1.9035971223021582, "grad_norm": 0.3651976602578544, "learning_rate": 6.229584870228083e-05, "loss": 0.4121, "step": 1323 }, { "epoch": 1.9050359712230216, "grad_norm": 0.2263944366922672, "learning_rate": 6.226247261148958e-05, "loss": 0.4203, "step": 1324 }, { "epoch": 1.906474820143885, "grad_norm": 0.33145983822020075, "learning_rate": 6.22290740499599e-05, "loss": 0.4184, "step": 1325 }, { "epoch": 1.9079136690647482, "grad_norm": 0.35964447853254644, "learning_rate": 6.21956530514028e-05, "loss": 0.4169, "step": 1326 }, { "epoch": 1.9093525179856115, "grad_norm": 0.3066675728373096, "learning_rate": 6.216220964955192e-05, "loss": 0.4094, "step": 1327 }, { "epoch": 1.9107913669064747, "grad_norm": 0.32985730307035266, "learning_rate": 6.21287438781635e-05, "loss": 0.4087, "step": 1328 }, { "epoch": 1.912230215827338, "grad_norm": 0.31748231267418137, "learning_rate": 6.209525577101642e-05, "loss": 0.4121, "step": 1329 }, { "epoch": 1.9136690647482015, "grad_norm": 0.333443294448134, "learning_rate": 6.206174536191207e-05, "loss": 0.4144, "step": 1330 }, { "epoch": 1.9151079136690647, "grad_norm": 0.30500839535193813, "learning_rate": 6.202821268467433e-05, "loss": 0.3967, "step": 1331 }, { "epoch": 1.916546762589928, "grad_norm": 0.3146013067290611, "learning_rate": 6.199465777314958e-05, "loss": 0.4176, "step": 1332 }, { "epoch": 1.9179856115107914, "grad_norm": 0.2769375621970634, "learning_rate": 6.196108066120663e-05, "loss": 0.4074, "step": 1333 }, { "epoch": 1.9194244604316548, "grad_norm": 0.27574004683235964, "learning_rate": 6.192748138273674e-05, "loss": 0.4035, "step": 1334 }, { "epoch": 1.920863309352518, "grad_norm": 0.2912381975508216, "learning_rate": 6.189385997165348e-05, "loss": 0.4105, "step": 1335 }, { "epoch": 1.9223021582733812, "grad_norm": 0.23670494436860218, "learning_rate": 6.186021646189281e-05, "loss": 0.4124, "step": 1336 }, { "epoch": 1.9237410071942445, "grad_norm": 0.20344126310126784, "learning_rate": 6.182655088741294e-05, "loss": 0.4027, "step": 1337 }, { "epoch": 1.925179856115108, "grad_norm": 0.23852188447780012, "learning_rate": 6.179286328219442e-05, "loss": 0.4148, "step": 1338 }, { "epoch": 1.9266187050359713, "grad_norm": 0.2920784841505559, "learning_rate": 6.175915368024e-05, "loss": 0.4069, "step": 1339 }, { "epoch": 1.9280575539568345, "grad_norm": 0.31369463060257147, "learning_rate": 6.172542211557463e-05, "loss": 0.4124, "step": 1340 }, { "epoch": 1.9294964028776977, "grad_norm": 0.37988212667817745, "learning_rate": 6.169166862224542e-05, "loss": 0.4189, "step": 1341 }, { "epoch": 1.9309352517985612, "grad_norm": 0.40533514154650313, "learning_rate": 6.165789323432166e-05, "loss": 0.4003, "step": 1342 }, { "epoch": 1.9323741007194246, "grad_norm": 0.38002459364623914, "learning_rate": 6.162409598589467e-05, "loss": 0.4098, "step": 1343 }, { "epoch": 1.9338129496402878, "grad_norm": 0.5121102181780426, "learning_rate": 6.159027691107791e-05, "loss": 0.4208, "step": 1344 }, { "epoch": 1.935251798561151, "grad_norm": 0.6506184503662952, "learning_rate": 6.15564360440068e-05, "loss": 0.4082, "step": 1345 }, { "epoch": 1.9366906474820142, "grad_norm": 0.5736472588450692, "learning_rate": 6.15225734188388e-05, "loss": 0.4098, "step": 1346 }, { "epoch": 1.9381294964028777, "grad_norm": 0.35704831088633465, "learning_rate": 6.148868906975334e-05, "loss": 0.4118, "step": 1347 }, { "epoch": 1.9395683453237411, "grad_norm": 0.33181184996259766, "learning_rate": 6.145478303095174e-05, "loss": 0.4067, "step": 1348 }, { "epoch": 1.9410071942446043, "grad_norm": 0.4069463361425132, "learning_rate": 6.142085533665722e-05, "loss": 0.4072, "step": 1349 }, { "epoch": 1.9424460431654675, "grad_norm": 0.37388758221848784, "learning_rate": 6.138690602111487e-05, "loss": 0.4173, "step": 1350 }, { "epoch": 1.943884892086331, "grad_norm": 0.3542153370672022, "learning_rate": 6.135293511859164e-05, "loss": 0.4215, "step": 1351 }, { "epoch": 1.9453237410071944, "grad_norm": 0.3454628285076372, "learning_rate": 6.131894266337618e-05, "loss": 0.4132, "step": 1352 }, { "epoch": 1.9467625899280576, "grad_norm": 0.2864504777225727, "learning_rate": 6.128492868977897e-05, "loss": 0.4063, "step": 1353 }, { "epoch": 1.9482014388489208, "grad_norm": 0.25629228963906786, "learning_rate": 6.12508932321322e-05, "loss": 0.4064, "step": 1354 }, { "epoch": 1.949640287769784, "grad_norm": 0.3641256393384321, "learning_rate": 6.12168363247897e-05, "loss": 0.4021, "step": 1355 }, { "epoch": 1.9510791366906475, "grad_norm": 0.395733173150441, "learning_rate": 6.1182758002127e-05, "loss": 0.4067, "step": 1356 }, { "epoch": 1.952517985611511, "grad_norm": 0.3740525490367782, "learning_rate": 6.114865829854123e-05, "loss": 0.4062, "step": 1357 }, { "epoch": 1.9539568345323741, "grad_norm": 0.3948848350037882, "learning_rate": 6.111453724845106e-05, "loss": 0.4035, "step": 1358 }, { "epoch": 1.9553956834532373, "grad_norm": 0.3025584688762319, "learning_rate": 6.108039488629679e-05, "loss": 0.4088, "step": 1359 }, { "epoch": 1.9568345323741008, "grad_norm": 0.3029961956579855, "learning_rate": 6.104623124654016e-05, "loss": 0.4088, "step": 1360 }, { "epoch": 1.958273381294964, "grad_norm": 0.39285369871901754, "learning_rate": 6.101204636366441e-05, "loss": 0.4041, "step": 1361 }, { "epoch": 1.9597122302158274, "grad_norm": 0.2922226995225387, "learning_rate": 6.0977840272174224e-05, "loss": 0.3992, "step": 1362 }, { "epoch": 1.9611510791366906, "grad_norm": 0.26267833177425903, "learning_rate": 6.094361300659571e-05, "loss": 0.4107, "step": 1363 }, { "epoch": 1.9625899280575538, "grad_norm": 0.2617780818706576, "learning_rate": 6.090936460147632e-05, "loss": 0.4123, "step": 1364 }, { "epoch": 1.9640287769784173, "grad_norm": 0.19966697740668615, "learning_rate": 6.087509509138483e-05, "loss": 0.3975, "step": 1365 }, { "epoch": 1.9654676258992807, "grad_norm": 0.2453492732586692, "learning_rate": 6.0840804510911374e-05, "loss": 0.4101, "step": 1366 }, { "epoch": 1.966906474820144, "grad_norm": 0.2776952871988876, "learning_rate": 6.0806492894667315e-05, "loss": 0.4095, "step": 1367 }, { "epoch": 1.9683453237410071, "grad_norm": 0.2837398029906124, "learning_rate": 6.077216027728524e-05, "loss": 0.4064, "step": 1368 }, { "epoch": 1.9697841726618706, "grad_norm": 0.2943359821815254, "learning_rate": 6.073780669341896e-05, "loss": 0.4111, "step": 1369 }, { "epoch": 1.9712230215827338, "grad_norm": 0.3090899888025719, "learning_rate": 6.070343217774343e-05, "loss": 0.4088, "step": 1370 }, { "epoch": 1.9726618705035972, "grad_norm": 0.3297382622058121, "learning_rate": 6.066903676495477e-05, "loss": 0.4038, "step": 1371 }, { "epoch": 1.9741007194244604, "grad_norm": 0.32365191273040594, "learning_rate": 6.063462048977011e-05, "loss": 0.4175, "step": 1372 }, { "epoch": 1.9755395683453236, "grad_norm": 0.36847282941476367, "learning_rate": 6.060018338692774e-05, "loss": 0.4206, "step": 1373 }, { "epoch": 1.976978417266187, "grad_norm": 0.38276412372212654, "learning_rate": 6.056572549118688e-05, "loss": 0.4063, "step": 1374 }, { "epoch": 1.9784172661870505, "grad_norm": 0.36046848840628315, "learning_rate": 6.053124683732781e-05, "loss": 0.4059, "step": 1375 }, { "epoch": 1.9798561151079137, "grad_norm": 0.3460277018852855, "learning_rate": 6.049674746015172e-05, "loss": 0.4052, "step": 1376 }, { "epoch": 1.981294964028777, "grad_norm": 0.4554861210413628, "learning_rate": 6.046222739448075e-05, "loss": 0.4091, "step": 1377 }, { "epoch": 1.9827338129496401, "grad_norm": 0.6046812360447316, "learning_rate": 6.042768667515786e-05, "loss": 0.4126, "step": 1378 }, { "epoch": 1.9841726618705036, "grad_norm": 0.6805215423686101, "learning_rate": 6.039312533704692e-05, "loss": 0.4138, "step": 1379 }, { "epoch": 1.985611510791367, "grad_norm": 0.6521568279627475, "learning_rate": 6.0358543415032625e-05, "loss": 0.4138, "step": 1380 }, { "epoch": 1.9870503597122302, "grad_norm": 0.590040105509639, "learning_rate": 6.032394094402035e-05, "loss": 0.4109, "step": 1381 }, { "epoch": 1.9884892086330934, "grad_norm": 0.5507782563828758, "learning_rate": 6.0289317958936305e-05, "loss": 0.4056, "step": 1382 }, { "epoch": 1.9899280575539569, "grad_norm": 0.46989797761590174, "learning_rate": 6.0254674494727374e-05, "loss": 0.4054, "step": 1383 }, { "epoch": 1.9913669064748203, "grad_norm": 0.3689791897607179, "learning_rate": 6.022001058636111e-05, "loss": 0.4014, "step": 1384 }, { "epoch": 1.9928057553956835, "grad_norm": 0.23556075043882688, "learning_rate": 6.01853262688257e-05, "loss": 0.4125, "step": 1385 }, { "epoch": 1.9942446043165467, "grad_norm": 0.27317519191530143, "learning_rate": 6.0150621577129934e-05, "loss": 0.4248, "step": 1386 }, { "epoch": 1.99568345323741, "grad_norm": 0.32185748101615674, "learning_rate": 6.011589654630318e-05, "loss": 0.3986, "step": 1387 }, { "epoch": 1.9971223021582734, "grad_norm": 0.329612866758165, "learning_rate": 6.008115121139528e-05, "loss": 0.4133, "step": 1388 }, { "epoch": 1.9985611510791368, "grad_norm": 0.32417942052255694, "learning_rate": 6.0046385607476655e-05, "loss": 0.4109, "step": 1389 }, { "epoch": 2.0, "grad_norm": 0.29654712665293514, "learning_rate": 6.001159976963814e-05, "loss": 0.4193, "step": 1390 }, { "epoch": 2.001438848920863, "grad_norm": 0.3459667004954266, "learning_rate": 5.9976793732990965e-05, "loss": 0.3824, "step": 1391 }, { "epoch": 2.0028776978417264, "grad_norm": 0.37518647892071594, "learning_rate": 5.9941967532666806e-05, "loss": 0.3865, "step": 1392 }, { "epoch": 2.00431654676259, "grad_norm": 0.3279409839377487, "learning_rate": 5.990712120381766e-05, "loss": 0.3785, "step": 1393 }, { "epoch": 2.0057553956834533, "grad_norm": 0.37998707272526827, "learning_rate": 5.987225478161583e-05, "loss": 0.3886, "step": 1394 }, { "epoch": 2.0071942446043165, "grad_norm": 0.33163850726507343, "learning_rate": 5.9837368301253905e-05, "loss": 0.3825, "step": 1395 }, { "epoch": 2.0086330935251797, "grad_norm": 0.3461474484939308, "learning_rate": 5.980246179794476e-05, "loss": 0.384, "step": 1396 }, { "epoch": 2.0100719424460434, "grad_norm": 0.3302823572490198, "learning_rate": 5.976753530692144e-05, "loss": 0.3927, "step": 1397 }, { "epoch": 2.0115107913669066, "grad_norm": 0.5977889055298922, "learning_rate": 5.9732588863437155e-05, "loss": 0.3819, "step": 1398 }, { "epoch": 2.01294964028777, "grad_norm": 0.3635705965151427, "learning_rate": 5.96976225027653e-05, "loss": 0.3849, "step": 1399 }, { "epoch": 2.014388489208633, "grad_norm": 0.395481424837781, "learning_rate": 5.966263626019932e-05, "loss": 0.3861, "step": 1400 }, { "epoch": 2.015827338129496, "grad_norm": 0.42318411480225626, "learning_rate": 5.9627630171052774e-05, "loss": 0.3916, "step": 1401 }, { "epoch": 2.01726618705036, "grad_norm": 0.36316339747330917, "learning_rate": 5.9592604270659234e-05, "loss": 0.3827, "step": 1402 }, { "epoch": 2.018705035971223, "grad_norm": 0.37143452419933953, "learning_rate": 5.955755859437225e-05, "loss": 0.3793, "step": 1403 }, { "epoch": 2.0201438848920863, "grad_norm": 0.43755692064098567, "learning_rate": 5.9522493177565366e-05, "loss": 0.3887, "step": 1404 }, { "epoch": 2.0215827338129495, "grad_norm": 0.35195856639014117, "learning_rate": 5.948740805563203e-05, "loss": 0.3867, "step": 1405 }, { "epoch": 2.023021582733813, "grad_norm": 0.2924243151297277, "learning_rate": 5.94523032639856e-05, "loss": 0.3853, "step": 1406 }, { "epoch": 2.0244604316546764, "grad_norm": 0.27236408606934676, "learning_rate": 5.9417178838059254e-05, "loss": 0.3844, "step": 1407 }, { "epoch": 2.0258992805755396, "grad_norm": 0.2808506053100754, "learning_rate": 5.9382034813306014e-05, "loss": 0.3933, "step": 1408 }, { "epoch": 2.027338129496403, "grad_norm": 0.3249046985300833, "learning_rate": 5.934687122519868e-05, "loss": 0.3879, "step": 1409 }, { "epoch": 2.028776978417266, "grad_norm": 0.30255528141566884, "learning_rate": 5.93116881092298e-05, "loss": 0.3933, "step": 1410 }, { "epoch": 2.0302158273381297, "grad_norm": 0.25697615522420125, "learning_rate": 5.927648550091162e-05, "loss": 0.385, "step": 1411 }, { "epoch": 2.031654676258993, "grad_norm": 0.2933321125736272, "learning_rate": 5.9241263435776087e-05, "loss": 0.3742, "step": 1412 }, { "epoch": 2.033093525179856, "grad_norm": 0.2871683640914002, "learning_rate": 5.920602194937474e-05, "loss": 0.382, "step": 1413 }, { "epoch": 2.0345323741007193, "grad_norm": 0.274468606150787, "learning_rate": 5.9170761077278766e-05, "loss": 0.3927, "step": 1414 }, { "epoch": 2.0359712230215825, "grad_norm": 0.22310469914268924, "learning_rate": 5.9135480855078915e-05, "loss": 0.3952, "step": 1415 }, { "epoch": 2.037410071942446, "grad_norm": 0.27311522093486645, "learning_rate": 5.910018131838544e-05, "loss": 0.3837, "step": 1416 }, { "epoch": 2.0388489208633094, "grad_norm": 0.2930255558309253, "learning_rate": 5.906486250282811e-05, "loss": 0.3927, "step": 1417 }, { "epoch": 2.0402877697841726, "grad_norm": 0.2949923450998013, "learning_rate": 5.902952444405615e-05, "loss": 0.3865, "step": 1418 }, { "epoch": 2.041726618705036, "grad_norm": 0.367148447428638, "learning_rate": 5.899416717773822e-05, "loss": 0.3823, "step": 1419 }, { "epoch": 2.0431654676258995, "grad_norm": 0.40861101175486775, "learning_rate": 5.8958790739562316e-05, "loss": 0.3777, "step": 1420 }, { "epoch": 2.0446043165467627, "grad_norm": 0.3336076746347345, "learning_rate": 5.892339516523586e-05, "loss": 0.3872, "step": 1421 }, { "epoch": 2.046043165467626, "grad_norm": 0.31298154003525624, "learning_rate": 5.8887980490485536e-05, "loss": 0.3924, "step": 1422 }, { "epoch": 2.047482014388489, "grad_norm": 0.24082912268403683, "learning_rate": 5.8852546751057337e-05, "loss": 0.3827, "step": 1423 }, { "epoch": 2.0489208633093523, "grad_norm": 0.2438905924001371, "learning_rate": 5.8817093982716455e-05, "loss": 0.38, "step": 1424 }, { "epoch": 2.050359712230216, "grad_norm": 0.2625708850715305, "learning_rate": 5.878162222124735e-05, "loss": 0.38, "step": 1425 }, { "epoch": 2.051798561151079, "grad_norm": 0.24961055276235888, "learning_rate": 5.8746131502453623e-05, "loss": 0.3878, "step": 1426 }, { "epoch": 2.0532374100719424, "grad_norm": 0.22308771158231927, "learning_rate": 5.871062186215799e-05, "loss": 0.3759, "step": 1427 }, { "epoch": 2.0546762589928056, "grad_norm": 0.2840986003311144, "learning_rate": 5.867509333620231e-05, "loss": 0.3843, "step": 1428 }, { "epoch": 2.0561151079136692, "grad_norm": 0.3127898412044105, "learning_rate": 5.863954596044744e-05, "loss": 0.3793, "step": 1429 }, { "epoch": 2.0575539568345325, "grad_norm": 0.3313591176294113, "learning_rate": 5.8603979770773344e-05, "loss": 0.3825, "step": 1430 }, { "epoch": 2.0589928057553957, "grad_norm": 0.3385757526260623, "learning_rate": 5.85683948030789e-05, "loss": 0.3995, "step": 1431 }, { "epoch": 2.060431654676259, "grad_norm": 0.2887325144460221, "learning_rate": 5.8532791093282e-05, "loss": 0.3879, "step": 1432 }, { "epoch": 2.061870503597122, "grad_norm": 0.2978522332390748, "learning_rate": 5.849716867731941e-05, "loss": 0.3917, "step": 1433 }, { "epoch": 2.0633093525179858, "grad_norm": 0.2716509667278568, "learning_rate": 5.84615275911468e-05, "loss": 0.3819, "step": 1434 }, { "epoch": 2.064748201438849, "grad_norm": 0.24739912138242007, "learning_rate": 5.8425867870738684e-05, "loss": 0.3844, "step": 1435 }, { "epoch": 2.066187050359712, "grad_norm": 0.2742167550579385, "learning_rate": 5.839018955208838e-05, "loss": 0.3743, "step": 1436 }, { "epoch": 2.0676258992805754, "grad_norm": 0.3253999143230146, "learning_rate": 5.835449267120796e-05, "loss": 0.3867, "step": 1437 }, { "epoch": 2.069064748201439, "grad_norm": 0.2619192587544168, "learning_rate": 5.831877726412827e-05, "loss": 0.3811, "step": 1438 }, { "epoch": 2.0705035971223023, "grad_norm": 0.23391498232949504, "learning_rate": 5.828304336689883e-05, "loss": 0.3841, "step": 1439 }, { "epoch": 2.0719424460431655, "grad_norm": 0.27588999427877353, "learning_rate": 5.824729101558781e-05, "loss": 0.3887, "step": 1440 }, { "epoch": 2.0733812949640287, "grad_norm": 0.20580107415618934, "learning_rate": 5.821152024628207e-05, "loss": 0.3952, "step": 1441 }, { "epoch": 2.074820143884892, "grad_norm": 0.25656023537508604, "learning_rate": 5.8175731095086974e-05, "loss": 0.3859, "step": 1442 }, { "epoch": 2.0762589928057555, "grad_norm": 0.3102205957999117, "learning_rate": 5.813992359812649e-05, "loss": 0.3808, "step": 1443 }, { "epoch": 2.0776978417266188, "grad_norm": 0.3170056077742382, "learning_rate": 5.8104097791543104e-05, "loss": 0.3843, "step": 1444 }, { "epoch": 2.079136690647482, "grad_norm": 0.264046971425198, "learning_rate": 5.806825371149778e-05, "loss": 0.3891, "step": 1445 }, { "epoch": 2.080575539568345, "grad_norm": 0.27391192188464647, "learning_rate": 5.803239139416989e-05, "loss": 0.3876, "step": 1446 }, { "epoch": 2.082014388489209, "grad_norm": 0.34098754315012436, "learning_rate": 5.799651087575728e-05, "loss": 0.3914, "step": 1447 }, { "epoch": 2.083453237410072, "grad_norm": 0.35068952798880154, "learning_rate": 5.7960612192476096e-05, "loss": 0.3879, "step": 1448 }, { "epoch": 2.0848920863309353, "grad_norm": 0.2949671254441889, "learning_rate": 5.792469538056089e-05, "loss": 0.3943, "step": 1449 }, { "epoch": 2.0863309352517985, "grad_norm": 0.30766133986982047, "learning_rate": 5.7888760476264445e-05, "loss": 0.3803, "step": 1450 }, { "epoch": 2.0877697841726617, "grad_norm": 1.0803950671766844, "learning_rate": 5.785280751585785e-05, "loss": 0.4023, "step": 1451 }, { "epoch": 2.0892086330935253, "grad_norm": 0.39452949530176423, "learning_rate": 5.7816836535630436e-05, "loss": 0.3894, "step": 1452 }, { "epoch": 2.0906474820143885, "grad_norm": 0.34118233214173016, "learning_rate": 5.7780847571889625e-05, "loss": 0.3901, "step": 1453 }, { "epoch": 2.0920863309352518, "grad_norm": 0.3018886581079717, "learning_rate": 5.7744840660961126e-05, "loss": 0.39, "step": 1454 }, { "epoch": 2.093525179856115, "grad_norm": 0.4524051858154562, "learning_rate": 5.770881583918865e-05, "loss": 0.3915, "step": 1455 }, { "epoch": 2.0949640287769786, "grad_norm": 0.42544552361250226, "learning_rate": 5.767277314293404e-05, "loss": 0.39, "step": 1456 }, { "epoch": 2.096402877697842, "grad_norm": 0.37257336494331905, "learning_rate": 5.76367126085772e-05, "loss": 0.3922, "step": 1457 }, { "epoch": 2.097841726618705, "grad_norm": 0.4693117389719594, "learning_rate": 5.760063427251599e-05, "loss": 0.3943, "step": 1458 }, { "epoch": 2.0992805755395683, "grad_norm": 0.45114686143866667, "learning_rate": 5.756453817116624e-05, "loss": 0.3939, "step": 1459 }, { "epoch": 2.1007194244604315, "grad_norm": 0.4839684373567707, "learning_rate": 5.752842434096176e-05, "loss": 0.3882, "step": 1460 }, { "epoch": 2.102158273381295, "grad_norm": 0.5213357084405652, "learning_rate": 5.7492292818354224e-05, "loss": 0.3889, "step": 1461 }, { "epoch": 2.1035971223021583, "grad_norm": 0.4748272524025722, "learning_rate": 5.745614363981316e-05, "loss": 0.3935, "step": 1462 }, { "epoch": 2.1050359712230216, "grad_norm": 0.36349426117437394, "learning_rate": 5.741997684182591e-05, "loss": 0.3856, "step": 1463 }, { "epoch": 2.1064748201438848, "grad_norm": 0.30830762113025034, "learning_rate": 5.7383792460897626e-05, "loss": 0.3944, "step": 1464 }, { "epoch": 2.1079136690647484, "grad_norm": 0.3019268807772761, "learning_rate": 5.73475905335512e-05, "loss": 0.3865, "step": 1465 }, { "epoch": 2.1093525179856116, "grad_norm": 0.3514724192590477, "learning_rate": 5.731137109632722e-05, "loss": 0.3863, "step": 1466 }, { "epoch": 2.110791366906475, "grad_norm": 0.3490486941049694, "learning_rate": 5.727513418578397e-05, "loss": 0.3872, "step": 1467 }, { "epoch": 2.112230215827338, "grad_norm": 0.3145147820518404, "learning_rate": 5.723887983849732e-05, "loss": 0.3891, "step": 1468 }, { "epoch": 2.1136690647482013, "grad_norm": 0.2514407045031779, "learning_rate": 5.720260809106083e-05, "loss": 0.3917, "step": 1469 }, { "epoch": 2.115107913669065, "grad_norm": 0.28621965565971635, "learning_rate": 5.716631898008553e-05, "loss": 0.385, "step": 1470 }, { "epoch": 2.116546762589928, "grad_norm": 0.29628756710605814, "learning_rate": 5.713001254220002e-05, "loss": 0.3846, "step": 1471 }, { "epoch": 2.1179856115107913, "grad_norm": 0.2442164832622535, "learning_rate": 5.7093688814050425e-05, "loss": 0.3889, "step": 1472 }, { "epoch": 2.1194244604316546, "grad_norm": 0.2412959429087779, "learning_rate": 5.705734783230022e-05, "loss": 0.3826, "step": 1473 }, { "epoch": 2.1208633093525178, "grad_norm": 0.2810101099676454, "learning_rate": 5.7020989633630414e-05, "loss": 0.3905, "step": 1474 }, { "epoch": 2.1223021582733814, "grad_norm": 0.25445474818311004, "learning_rate": 5.6984614254739306e-05, "loss": 0.3873, "step": 1475 }, { "epoch": 2.1237410071942446, "grad_norm": 0.25557649490833445, "learning_rate": 5.694822173234257e-05, "loss": 0.3844, "step": 1476 }, { "epoch": 2.125179856115108, "grad_norm": 0.2777684812774492, "learning_rate": 5.691181210317319e-05, "loss": 0.3857, "step": 1477 }, { "epoch": 2.126618705035971, "grad_norm": 0.22345516855286213, "learning_rate": 5.687538540398141e-05, "loss": 0.3872, "step": 1478 }, { "epoch": 2.1280575539568347, "grad_norm": 0.21060976344773377, "learning_rate": 5.683894167153468e-05, "loss": 0.3795, "step": 1479 }, { "epoch": 2.129496402877698, "grad_norm": 0.2861691465626582, "learning_rate": 5.680248094261769e-05, "loss": 0.3922, "step": 1480 }, { "epoch": 2.130935251798561, "grad_norm": 0.3107365211538534, "learning_rate": 5.676600325403224e-05, "loss": 0.3793, "step": 1481 }, { "epoch": 2.1323741007194243, "grad_norm": 0.2748030294623659, "learning_rate": 5.672950864259729e-05, "loss": 0.3832, "step": 1482 }, { "epoch": 2.133812949640288, "grad_norm": 0.27990029682683215, "learning_rate": 5.669299714514884e-05, "loss": 0.388, "step": 1483 }, { "epoch": 2.135251798561151, "grad_norm": 0.22121964580171558, "learning_rate": 5.665646879853995e-05, "loss": 0.388, "step": 1484 }, { "epoch": 2.1366906474820144, "grad_norm": 0.21626914568347483, "learning_rate": 5.661992363964072e-05, "loss": 0.3831, "step": 1485 }, { "epoch": 2.1381294964028776, "grad_norm": 0.29919711192071885, "learning_rate": 5.658336170533814e-05, "loss": 0.3812, "step": 1486 }, { "epoch": 2.139568345323741, "grad_norm": 0.3158627972498642, "learning_rate": 5.654678303253624e-05, "loss": 0.3876, "step": 1487 }, { "epoch": 2.1410071942446045, "grad_norm": 0.28145261964361495, "learning_rate": 5.6510187658155846e-05, "loss": 0.3907, "step": 1488 }, { "epoch": 2.1424460431654677, "grad_norm": 0.342743346803727, "learning_rate": 5.6473575619134686e-05, "loss": 0.3812, "step": 1489 }, { "epoch": 2.143884892086331, "grad_norm": 0.374531993468505, "learning_rate": 5.643694695242731e-05, "loss": 0.3818, "step": 1490 }, { "epoch": 2.145323741007194, "grad_norm": 0.27133829796780834, "learning_rate": 5.640030169500508e-05, "loss": 0.3836, "step": 1491 }, { "epoch": 2.1467625899280574, "grad_norm": 0.2351914169210999, "learning_rate": 5.636363988385601e-05, "loss": 0.396, "step": 1492 }, { "epoch": 2.148201438848921, "grad_norm": 0.24933413811162175, "learning_rate": 5.632696155598493e-05, "loss": 0.3902, "step": 1493 }, { "epoch": 2.149640287769784, "grad_norm": 0.22425796164616213, "learning_rate": 5.6290266748413266e-05, "loss": 0.3867, "step": 1494 }, { "epoch": 2.1510791366906474, "grad_norm": 0.2305010005410816, "learning_rate": 5.6253555498179124e-05, "loss": 0.382, "step": 1495 }, { "epoch": 2.1525179856115106, "grad_norm": 0.23705255037178416, "learning_rate": 5.621682784233718e-05, "loss": 0.3878, "step": 1496 }, { "epoch": 2.1539568345323743, "grad_norm": 0.1768755293978743, "learning_rate": 5.618008381795868e-05, "loss": 0.3903, "step": 1497 }, { "epoch": 2.1553956834532375, "grad_norm": 0.20575095776656013, "learning_rate": 5.61433234621314e-05, "loss": 0.393, "step": 1498 }, { "epoch": 2.1568345323741007, "grad_norm": 0.19699655048659204, "learning_rate": 5.610654681195957e-05, "loss": 0.3875, "step": 1499 }, { "epoch": 2.158273381294964, "grad_norm": 0.19827610098763424, "learning_rate": 5.606975390456391e-05, "loss": 0.3909, "step": 1500 }, { "epoch": 2.159712230215827, "grad_norm": 0.21305747803672276, "learning_rate": 5.603294477708149e-05, "loss": 0.3893, "step": 1501 }, { "epoch": 2.161151079136691, "grad_norm": 0.2459941053050217, "learning_rate": 5.599611946666581e-05, "loss": 0.3942, "step": 1502 }, { "epoch": 2.162589928057554, "grad_norm": 0.23071666194309015, "learning_rate": 5.595927801048669e-05, "loss": 0.3834, "step": 1503 }, { "epoch": 2.1640287769784172, "grad_norm": 0.2305452784564125, "learning_rate": 5.5922420445730245e-05, "loss": 0.3885, "step": 1504 }, { "epoch": 2.1654676258992804, "grad_norm": 0.3170557603384182, "learning_rate": 5.5885546809598805e-05, "loss": 0.3857, "step": 1505 }, { "epoch": 2.166906474820144, "grad_norm": 0.3885810828081808, "learning_rate": 5.584865713931098e-05, "loss": 0.3892, "step": 1506 }, { "epoch": 2.1683453237410073, "grad_norm": 0.3686007865701777, "learning_rate": 5.5811751472101564e-05, "loss": 0.3856, "step": 1507 }, { "epoch": 2.1697841726618705, "grad_norm": 0.3356733847553759, "learning_rate": 5.577482984522145e-05, "loss": 0.3844, "step": 1508 }, { "epoch": 2.1712230215827337, "grad_norm": 0.3917999468495738, "learning_rate": 5.573789229593767e-05, "loss": 0.3839, "step": 1509 }, { "epoch": 2.172661870503597, "grad_norm": 0.47448224083035506, "learning_rate": 5.570093886153334e-05, "loss": 0.3857, "step": 1510 }, { "epoch": 2.1741007194244606, "grad_norm": 0.5088636398067714, "learning_rate": 5.5663969579307594e-05, "loss": 0.3913, "step": 1511 }, { "epoch": 2.175539568345324, "grad_norm": 0.5098767681725841, "learning_rate": 5.562698448657553e-05, "loss": 0.3743, "step": 1512 }, { "epoch": 2.176978417266187, "grad_norm": 0.44226504165942554, "learning_rate": 5.5589983620668286e-05, "loss": 0.3868, "step": 1513 }, { "epoch": 2.1784172661870502, "grad_norm": 0.3701485993906192, "learning_rate": 5.555296701893284e-05, "loss": 0.3891, "step": 1514 }, { "epoch": 2.1798561151079134, "grad_norm": 0.32659810006018575, "learning_rate": 5.551593471873208e-05, "loss": 0.3865, "step": 1515 }, { "epoch": 2.181294964028777, "grad_norm": 0.2739322729567986, "learning_rate": 5.547888675744476e-05, "loss": 0.3904, "step": 1516 }, { "epoch": 2.1827338129496403, "grad_norm": 1.1514120414314388, "learning_rate": 5.5441823172465427e-05, "loss": 0.3816, "step": 1517 }, { "epoch": 2.1841726618705035, "grad_norm": 0.5094085249815662, "learning_rate": 5.540474400120438e-05, "loss": 0.3961, "step": 1518 }, { "epoch": 2.1856115107913667, "grad_norm": 0.3689605263551239, "learning_rate": 5.536764928108769e-05, "loss": 0.3925, "step": 1519 }, { "epoch": 2.1870503597122304, "grad_norm": 0.6710103294342745, "learning_rate": 5.533053904955709e-05, "loss": 0.4006, "step": 1520 }, { "epoch": 2.1884892086330936, "grad_norm": 0.589844121232794, "learning_rate": 5.5293413344069964e-05, "loss": 0.4039, "step": 1521 }, { "epoch": 2.189928057553957, "grad_norm": 0.4465240610476147, "learning_rate": 5.525627220209934e-05, "loss": 0.4048, "step": 1522 }, { "epoch": 2.19136690647482, "grad_norm": 0.5119028970967803, "learning_rate": 5.5219115661133815e-05, "loss": 0.3916, "step": 1523 }, { "epoch": 2.1928057553956837, "grad_norm": 0.5617122367035737, "learning_rate": 5.518194375867754e-05, "loss": 0.3961, "step": 1524 }, { "epoch": 2.194244604316547, "grad_norm": 0.4690244242416867, "learning_rate": 5.514475653225014e-05, "loss": 0.3922, "step": 1525 }, { "epoch": 2.19568345323741, "grad_norm": 0.4015231614518023, "learning_rate": 5.510755401938676e-05, "loss": 0.392, "step": 1526 }, { "epoch": 2.1971223021582733, "grad_norm": 0.5097283725286479, "learning_rate": 5.5070336257637904e-05, "loss": 0.3964, "step": 1527 }, { "epoch": 2.1985611510791365, "grad_norm": 0.435515690531513, "learning_rate": 5.503310328456953e-05, "loss": 0.3974, "step": 1528 }, { "epoch": 2.2, "grad_norm": 0.3806838811922357, "learning_rate": 5.4995855137762926e-05, "loss": 0.3954, "step": 1529 }, { "epoch": 2.2014388489208634, "grad_norm": 0.39291846470544906, "learning_rate": 5.4958591854814695e-05, "loss": 0.3892, "step": 1530 }, { "epoch": 2.2028776978417266, "grad_norm": 0.397331489863752, "learning_rate": 5.492131347333671e-05, "loss": 0.3892, "step": 1531 }, { "epoch": 2.20431654676259, "grad_norm": 0.3429131255958608, "learning_rate": 5.48840200309561e-05, "loss": 0.4013, "step": 1532 }, { "epoch": 2.205755395683453, "grad_norm": 0.34498430285961723, "learning_rate": 5.484671156531519e-05, "loss": 0.3905, "step": 1533 }, { "epoch": 2.2071942446043167, "grad_norm": 0.3627701636779315, "learning_rate": 5.480938811407146e-05, "loss": 0.3866, "step": 1534 }, { "epoch": 2.20863309352518, "grad_norm": 0.32607719730695583, "learning_rate": 5.477204971489753e-05, "loss": 0.3926, "step": 1535 }, { "epoch": 2.210071942446043, "grad_norm": 0.32683828106525564, "learning_rate": 5.473469640548109e-05, "loss": 0.4002, "step": 1536 }, { "epoch": 2.2115107913669063, "grad_norm": 0.3622966949282371, "learning_rate": 5.469732822352491e-05, "loss": 0.3828, "step": 1537 }, { "epoch": 2.21294964028777, "grad_norm": 0.2366794753501399, "learning_rate": 5.465994520674672e-05, "loss": 0.3907, "step": 1538 }, { "epoch": 2.214388489208633, "grad_norm": 0.28809503041902335, "learning_rate": 5.4622547392879295e-05, "loss": 0.3873, "step": 1539 }, { "epoch": 2.2158273381294964, "grad_norm": 0.3237451617454751, "learning_rate": 5.458513481967027e-05, "loss": 0.3916, "step": 1540 }, { "epoch": 2.2172661870503596, "grad_norm": 0.2730893600150766, "learning_rate": 5.454770752488223e-05, "loss": 0.3941, "step": 1541 }, { "epoch": 2.218705035971223, "grad_norm": 0.25974636320336986, "learning_rate": 5.4510265546292615e-05, "loss": 0.3931, "step": 1542 }, { "epoch": 2.2201438848920865, "grad_norm": 0.34671179580477074, "learning_rate": 5.4472808921693657e-05, "loss": 0.3913, "step": 1543 }, { "epoch": 2.2215827338129497, "grad_norm": 0.3535548881559468, "learning_rate": 5.4435337688892396e-05, "loss": 0.3968, "step": 1544 }, { "epoch": 2.223021582733813, "grad_norm": 0.2636642875839242, "learning_rate": 5.4397851885710595e-05, "loss": 0.3848, "step": 1545 }, { "epoch": 2.224460431654676, "grad_norm": 0.3133545904069258, "learning_rate": 5.4360351549984755e-05, "loss": 0.3912, "step": 1546 }, { "epoch": 2.2258992805755398, "grad_norm": 0.3808123551337556, "learning_rate": 5.432283671956601e-05, "loss": 0.3888, "step": 1547 }, { "epoch": 2.227338129496403, "grad_norm": 0.35583536953867234, "learning_rate": 5.428530743232016e-05, "loss": 0.3838, "step": 1548 }, { "epoch": 2.228776978417266, "grad_norm": 0.219318701997273, "learning_rate": 5.4247763726127564e-05, "loss": 0.3751, "step": 1549 }, { "epoch": 2.2302158273381294, "grad_norm": 0.22318535359522693, "learning_rate": 5.421020563888317e-05, "loss": 0.3909, "step": 1550 }, { "epoch": 2.2316546762589926, "grad_norm": 0.24960667305651438, "learning_rate": 5.417263320849641e-05, "loss": 0.3893, "step": 1551 }, { "epoch": 2.2330935251798563, "grad_norm": 0.2615472186059933, "learning_rate": 5.4135046472891205e-05, "loss": 0.3965, "step": 1552 }, { "epoch": 2.2345323741007195, "grad_norm": 0.20597246340467404, "learning_rate": 5.409744547000591e-05, "loss": 0.3926, "step": 1553 }, { "epoch": 2.2359712230215827, "grad_norm": 0.23479074496220995, "learning_rate": 5.405983023779328e-05, "loss": 0.3975, "step": 1554 }, { "epoch": 2.237410071942446, "grad_norm": 0.30908876136663427, "learning_rate": 5.402220081422048e-05, "loss": 0.3794, "step": 1555 }, { "epoch": 2.2388489208633096, "grad_norm": 0.23958569016002487, "learning_rate": 5.3984557237268905e-05, "loss": 0.397, "step": 1556 }, { "epoch": 2.2402877697841728, "grad_norm": 0.231453892673959, "learning_rate": 5.394689954493432e-05, "loss": 0.3874, "step": 1557 }, { "epoch": 2.241726618705036, "grad_norm": 0.2836005114568376, "learning_rate": 5.390922777522669e-05, "loss": 0.3897, "step": 1558 }, { "epoch": 2.243165467625899, "grad_norm": 0.3062651354015222, "learning_rate": 5.3871541966170225e-05, "loss": 0.3971, "step": 1559 }, { "epoch": 2.2446043165467624, "grad_norm": 0.2636891794944903, "learning_rate": 5.383384215580326e-05, "loss": 0.3882, "step": 1560 }, { "epoch": 2.246043165467626, "grad_norm": 0.22758009767252638, "learning_rate": 5.37961283821783e-05, "loss": 0.3924, "step": 1561 }, { "epoch": 2.2474820143884893, "grad_norm": 0.25066092725129, "learning_rate": 5.3758400683361926e-05, "loss": 0.3997, "step": 1562 }, { "epoch": 2.2489208633093525, "grad_norm": 0.2424680271461611, "learning_rate": 5.372065909743479e-05, "loss": 0.383, "step": 1563 }, { "epoch": 2.2503597122302157, "grad_norm": 0.20431411327869473, "learning_rate": 5.368290366249155e-05, "loss": 0.3752, "step": 1564 }, { "epoch": 2.2517985611510793, "grad_norm": 0.20934677671259003, "learning_rate": 5.364513441664084e-05, "loss": 0.3886, "step": 1565 }, { "epoch": 2.2532374100719426, "grad_norm": 0.23967597244584293, "learning_rate": 5.3607351398005234e-05, "loss": 0.39, "step": 1566 }, { "epoch": 2.2546762589928058, "grad_norm": 0.2657640992169594, "learning_rate": 5.356955464472121e-05, "loss": 0.3994, "step": 1567 }, { "epoch": 2.256115107913669, "grad_norm": 0.2618534462802217, "learning_rate": 5.353174419493913e-05, "loss": 0.377, "step": 1568 }, { "epoch": 2.257553956834532, "grad_norm": 0.24253080028381546, "learning_rate": 5.349392008682314e-05, "loss": 0.3921, "step": 1569 }, { "epoch": 2.258992805755396, "grad_norm": 0.34307249380050653, "learning_rate": 5.3456082358551204e-05, "loss": 0.3899, "step": 1570 }, { "epoch": 2.260431654676259, "grad_norm": 0.3409052578002414, "learning_rate": 5.341823104831501e-05, "loss": 0.3903, "step": 1571 }, { "epoch": 2.2618705035971223, "grad_norm": 0.2339187186412542, "learning_rate": 5.338036619431999e-05, "loss": 0.3909, "step": 1572 }, { "epoch": 2.2633093525179855, "grad_norm": 0.21239871731333118, "learning_rate": 5.33424878347852e-05, "loss": 0.3823, "step": 1573 }, { "epoch": 2.2647482014388487, "grad_norm": 0.2929779081662266, "learning_rate": 5.330459600794337e-05, "loss": 0.3921, "step": 1574 }, { "epoch": 2.2661870503597124, "grad_norm": 0.22381918976927967, "learning_rate": 5.32666907520408e-05, "loss": 0.3781, "step": 1575 }, { "epoch": 2.2676258992805756, "grad_norm": 0.23426147068490208, "learning_rate": 5.322877210533735e-05, "loss": 0.3852, "step": 1576 }, { "epoch": 2.2690647482014388, "grad_norm": 0.2719058724374197, "learning_rate": 5.319084010610638e-05, "loss": 0.387, "step": 1577 }, { "epoch": 2.270503597122302, "grad_norm": 0.26897588687119994, "learning_rate": 5.3152894792634785e-05, "loss": 0.3866, "step": 1578 }, { "epoch": 2.2719424460431656, "grad_norm": 0.32395317237830024, "learning_rate": 5.311493620322282e-05, "loss": 0.3819, "step": 1579 }, { "epoch": 2.273381294964029, "grad_norm": 0.33028302802465415, "learning_rate": 5.3076964376184186e-05, "loss": 0.3881, "step": 1580 }, { "epoch": 2.274820143884892, "grad_norm": 0.35374219168145615, "learning_rate": 5.303897934984595e-05, "loss": 0.3833, "step": 1581 }, { "epoch": 2.2762589928057553, "grad_norm": 0.3192560554107421, "learning_rate": 5.300098116254848e-05, "loss": 0.3933, "step": 1582 }, { "epoch": 2.277697841726619, "grad_norm": 0.3020262943029512, "learning_rate": 5.296296985264543e-05, "loss": 0.389, "step": 1583 }, { "epoch": 2.279136690647482, "grad_norm": 0.3224059430495137, "learning_rate": 5.2924945458503713e-05, "loss": 0.3992, "step": 1584 }, { "epoch": 2.2805755395683454, "grad_norm": 0.25799963699770995, "learning_rate": 5.2886908018503454e-05, "loss": 0.3862, "step": 1585 }, { "epoch": 2.2820143884892086, "grad_norm": 0.21127475643898685, "learning_rate": 5.284885757103792e-05, "loss": 0.3994, "step": 1586 }, { "epoch": 2.283453237410072, "grad_norm": 0.2333998104861042, "learning_rate": 5.2810794154513503e-05, "loss": 0.3894, "step": 1587 }, { "epoch": 2.2848920863309354, "grad_norm": 0.26324442876196735, "learning_rate": 5.277271780734975e-05, "loss": 0.3855, "step": 1588 }, { "epoch": 2.2863309352517986, "grad_norm": 0.22322544371503758, "learning_rate": 5.273462856797918e-05, "loss": 0.3931, "step": 1589 }, { "epoch": 2.287769784172662, "grad_norm": 0.22903560510895377, "learning_rate": 5.269652647484735e-05, "loss": 0.3834, "step": 1590 }, { "epoch": 2.289208633093525, "grad_norm": 0.24076772553209855, "learning_rate": 5.2658411566412837e-05, "loss": 0.3968, "step": 1591 }, { "epoch": 2.2906474820143883, "grad_norm": 0.2496121201066887, "learning_rate": 5.262028388114708e-05, "loss": 0.3942, "step": 1592 }, { "epoch": 2.292086330935252, "grad_norm": 0.22160483679343942, "learning_rate": 5.258214345753446e-05, "loss": 0.3858, "step": 1593 }, { "epoch": 2.293525179856115, "grad_norm": 0.2633714450095719, "learning_rate": 5.254399033407221e-05, "loss": 0.3869, "step": 1594 }, { "epoch": 2.2949640287769784, "grad_norm": 0.2665266751171906, "learning_rate": 5.250582454927037e-05, "loss": 0.3863, "step": 1595 }, { "epoch": 2.2964028776978416, "grad_norm": 0.30336682615969185, "learning_rate": 5.2467646141651764e-05, "loss": 0.3944, "step": 1596 }, { "epoch": 2.2978417266187052, "grad_norm": 0.28219159817093825, "learning_rate": 5.2429455149751976e-05, "loss": 0.3869, "step": 1597 }, { "epoch": 2.2992805755395684, "grad_norm": 0.315586024026505, "learning_rate": 5.2391251612119256e-05, "loss": 0.3891, "step": 1598 }, { "epoch": 2.3007194244604317, "grad_norm": 0.34645587224096863, "learning_rate": 5.235303556731456e-05, "loss": 0.3815, "step": 1599 }, { "epoch": 2.302158273381295, "grad_norm": 0.32416702210472587, "learning_rate": 5.23148070539114e-05, "loss": 0.3943, "step": 1600 }, { "epoch": 2.3035971223021585, "grad_norm": 0.34359351373970476, "learning_rate": 5.227656611049598e-05, "loss": 0.3797, "step": 1601 }, { "epoch": 2.3050359712230217, "grad_norm": 0.28629848486179393, "learning_rate": 5.2238312775666935e-05, "loss": 0.3944, "step": 1602 }, { "epoch": 2.306474820143885, "grad_norm": 0.19655450800377178, "learning_rate": 5.220004708803548e-05, "loss": 0.3869, "step": 1603 }, { "epoch": 2.307913669064748, "grad_norm": 0.31199316853355963, "learning_rate": 5.216176908622528e-05, "loss": 0.392, "step": 1604 }, { "epoch": 2.3093525179856114, "grad_norm": 0.31184479529560166, "learning_rate": 5.2123478808872436e-05, "loss": 0.3934, "step": 1605 }, { "epoch": 2.310791366906475, "grad_norm": 0.2672326157880456, "learning_rate": 5.208517629462541e-05, "loss": 0.3896, "step": 1606 }, { "epoch": 2.3122302158273382, "grad_norm": 0.21122181565761514, "learning_rate": 5.204686158214507e-05, "loss": 0.3923, "step": 1607 }, { "epoch": 2.3136690647482014, "grad_norm": 0.2278103420446257, "learning_rate": 5.200853471010453e-05, "loss": 0.3894, "step": 1608 }, { "epoch": 2.3151079136690647, "grad_norm": 0.24873073464908046, "learning_rate": 5.197019571718921e-05, "loss": 0.376, "step": 1609 }, { "epoch": 2.316546762589928, "grad_norm": 0.21658729968501075, "learning_rate": 5.19318446420968e-05, "loss": 0.3855, "step": 1610 }, { "epoch": 2.3179856115107915, "grad_norm": 0.2529075081894275, "learning_rate": 5.189348152353712e-05, "loss": 0.3783, "step": 1611 }, { "epoch": 2.3194244604316547, "grad_norm": 0.2153775315337548, "learning_rate": 5.1855106400232196e-05, "loss": 0.3905, "step": 1612 }, { "epoch": 2.320863309352518, "grad_norm": 0.269495226521631, "learning_rate": 5.181671931091612e-05, "loss": 0.3879, "step": 1613 }, { "epoch": 2.322302158273381, "grad_norm": 0.2707722450348019, "learning_rate": 5.1778320294335126e-05, "loss": 0.3759, "step": 1614 }, { "epoch": 2.3237410071942444, "grad_norm": 0.28244783707110455, "learning_rate": 5.1739909389247445e-05, "loss": 0.3916, "step": 1615 }, { "epoch": 2.325179856115108, "grad_norm": 0.28701297443418267, "learning_rate": 5.17014866344233e-05, "loss": 0.3859, "step": 1616 }, { "epoch": 2.3266187050359712, "grad_norm": 0.18644053030705382, "learning_rate": 5.166305206864492e-05, "loss": 0.3904, "step": 1617 }, { "epoch": 2.3280575539568344, "grad_norm": 0.2709891480835394, "learning_rate": 5.162460573070642e-05, "loss": 0.3871, "step": 1618 }, { "epoch": 2.3294964028776977, "grad_norm": 0.2617322592172387, "learning_rate": 5.158614765941376e-05, "loss": 0.3841, "step": 1619 }, { "epoch": 2.3309352517985613, "grad_norm": 0.26174736986859515, "learning_rate": 5.1547677893584846e-05, "loss": 0.3859, "step": 1620 }, { "epoch": 2.3323741007194245, "grad_norm": 0.2307642654355334, "learning_rate": 5.15091964720493e-05, "loss": 0.3916, "step": 1621 }, { "epoch": 2.3338129496402877, "grad_norm": 0.21977774824301788, "learning_rate": 5.1470703433648556e-05, "loss": 0.3842, "step": 1622 }, { "epoch": 2.335251798561151, "grad_norm": 0.22971288228127282, "learning_rate": 5.143219881723573e-05, "loss": 0.4007, "step": 1623 }, { "epoch": 2.3366906474820146, "grad_norm": 0.28261283623252514, "learning_rate": 5.139368266167567e-05, "loss": 0.3981, "step": 1624 }, { "epoch": 2.338129496402878, "grad_norm": 0.28051381072839177, "learning_rate": 5.135515500584484e-05, "loss": 0.3906, "step": 1625 }, { "epoch": 2.339568345323741, "grad_norm": 0.2813911820987458, "learning_rate": 5.131661588863132e-05, "loss": 0.3875, "step": 1626 }, { "epoch": 2.3410071942446042, "grad_norm": 0.2604237435520954, "learning_rate": 5.1278065348934786e-05, "loss": 0.3827, "step": 1627 }, { "epoch": 2.3424460431654675, "grad_norm": 0.24605378870914793, "learning_rate": 5.123950342566639e-05, "loss": 0.3857, "step": 1628 }, { "epoch": 2.343884892086331, "grad_norm": 0.2820459068205453, "learning_rate": 5.120093015774882e-05, "loss": 0.3891, "step": 1629 }, { "epoch": 2.3453237410071943, "grad_norm": 0.29832490331226674, "learning_rate": 5.116234558411618e-05, "loss": 0.3954, "step": 1630 }, { "epoch": 2.3467625899280575, "grad_norm": 0.27682515377856187, "learning_rate": 5.1123749743714024e-05, "loss": 0.3845, "step": 1631 }, { "epoch": 2.3482014388489207, "grad_norm": 0.27644616429449503, "learning_rate": 5.1085142675499246e-05, "loss": 0.3836, "step": 1632 }, { "epoch": 2.349640287769784, "grad_norm": 0.2686334074934592, "learning_rate": 5.1046524418440075e-05, "loss": 0.3899, "step": 1633 }, { "epoch": 2.3510791366906476, "grad_norm": 0.25101187988966867, "learning_rate": 5.100789501151607e-05, "loss": 0.3919, "step": 1634 }, { "epoch": 2.352517985611511, "grad_norm": 0.25963436453622124, "learning_rate": 5.0969254493717996e-05, "loss": 0.3893, "step": 1635 }, { "epoch": 2.353956834532374, "grad_norm": 0.2857525462609916, "learning_rate": 5.093060290404785e-05, "loss": 0.3838, "step": 1636 }, { "epoch": 2.3553956834532372, "grad_norm": 0.30757391496232317, "learning_rate": 5.089194028151882e-05, "loss": 0.3804, "step": 1637 }, { "epoch": 2.356834532374101, "grad_norm": 0.38990252907497586, "learning_rate": 5.085326666515521e-05, "loss": 0.3904, "step": 1638 }, { "epoch": 2.358273381294964, "grad_norm": 0.3608392668172516, "learning_rate": 5.081458209399243e-05, "loss": 0.3841, "step": 1639 }, { "epoch": 2.3597122302158273, "grad_norm": 0.2955583789377134, "learning_rate": 5.0775886607076954e-05, "loss": 0.3895, "step": 1640 }, { "epoch": 2.3611510791366905, "grad_norm": 0.2586577242218363, "learning_rate": 5.073718024346626e-05, "loss": 0.3761, "step": 1641 }, { "epoch": 2.362589928057554, "grad_norm": 0.2615000083548353, "learning_rate": 5.06984630422288e-05, "loss": 0.3837, "step": 1642 }, { "epoch": 2.3640287769784174, "grad_norm": 0.3326260309023068, "learning_rate": 5.065973504244399e-05, "loss": 0.3815, "step": 1643 }, { "epoch": 2.3654676258992806, "grad_norm": 0.3193365286232002, "learning_rate": 5.062099628320213e-05, "loss": 0.3911, "step": 1644 }, { "epoch": 2.366906474820144, "grad_norm": 0.2704723034935209, "learning_rate": 5.058224680360438e-05, "loss": 0.3902, "step": 1645 }, { "epoch": 2.368345323741007, "grad_norm": 0.28067033155134913, "learning_rate": 5.054348664276271e-05, "loss": 0.397, "step": 1646 }, { "epoch": 2.3697841726618707, "grad_norm": 0.3350444247767652, "learning_rate": 5.05047158397999e-05, "loss": 0.3905, "step": 1647 }, { "epoch": 2.371223021582734, "grad_norm": 0.2931220556257705, "learning_rate": 5.046593443384945e-05, "loss": 0.3839, "step": 1648 }, { "epoch": 2.372661870503597, "grad_norm": 0.2540958503700094, "learning_rate": 5.042714246405555e-05, "loss": 0.3862, "step": 1649 }, { "epoch": 2.3741007194244603, "grad_norm": 0.23977184211113461, "learning_rate": 5.038833996957309e-05, "loss": 0.3906, "step": 1650 }, { "epoch": 2.3755395683453235, "grad_norm": 0.25432728108805436, "learning_rate": 5.0349526989567546e-05, "loss": 0.3853, "step": 1651 }, { "epoch": 2.376978417266187, "grad_norm": 0.2561709827869189, "learning_rate": 5.0310703563215016e-05, "loss": 0.3846, "step": 1652 }, { "epoch": 2.3784172661870504, "grad_norm": 0.21704652844193725, "learning_rate": 5.027186972970211e-05, "loss": 0.388, "step": 1653 }, { "epoch": 2.3798561151079136, "grad_norm": 0.17061332702750404, "learning_rate": 5.0233025528225934e-05, "loss": 0.3789, "step": 1654 }, { "epoch": 2.381294964028777, "grad_norm": 0.18399449260481843, "learning_rate": 5.01941709979941e-05, "loss": 0.3799, "step": 1655 }, { "epoch": 2.38273381294964, "grad_norm": 0.22162020684974135, "learning_rate": 5.015530617822462e-05, "loss": 0.3837, "step": 1656 }, { "epoch": 2.3841726618705037, "grad_norm": 0.20671573781841068, "learning_rate": 5.011643110814589e-05, "loss": 0.3917, "step": 1657 }, { "epoch": 2.385611510791367, "grad_norm": 0.16593354428047088, "learning_rate": 5.007754582699666e-05, "loss": 0.3845, "step": 1658 }, { "epoch": 2.38705035971223, "grad_norm": 0.22254692306957252, "learning_rate": 5.003865037402598e-05, "loss": 0.394, "step": 1659 }, { "epoch": 2.3884892086330938, "grad_norm": 0.22310726078403112, "learning_rate": 4.999974478849319e-05, "loss": 0.3804, "step": 1660 }, { "epoch": 2.389928057553957, "grad_norm": 0.2619630555973278, "learning_rate": 4.99608291096678e-05, "loss": 0.3868, "step": 1661 }, { "epoch": 2.39136690647482, "grad_norm": 0.2458818523957843, "learning_rate": 4.9921903376829565e-05, "loss": 0.3775, "step": 1662 }, { "epoch": 2.3928057553956834, "grad_norm": 0.2579910384042537, "learning_rate": 4.988296762926838e-05, "loss": 0.3845, "step": 1663 }, { "epoch": 2.3942446043165466, "grad_norm": 0.22595778256110546, "learning_rate": 4.984402190628422e-05, "loss": 0.3779, "step": 1664 }, { "epoch": 2.3956834532374103, "grad_norm": 0.2365438089684538, "learning_rate": 4.980506624718716e-05, "loss": 0.3825, "step": 1665 }, { "epoch": 2.3971223021582735, "grad_norm": 0.2854766742388604, "learning_rate": 4.9766100691297284e-05, "loss": 0.3939, "step": 1666 }, { "epoch": 2.3985611510791367, "grad_norm": 0.362537703997184, "learning_rate": 4.9727125277944675e-05, "loss": 0.378, "step": 1667 }, { "epoch": 2.4, "grad_norm": 0.31454121000332186, "learning_rate": 4.968814004646934e-05, "loss": 0.399, "step": 1668 }, { "epoch": 2.401438848920863, "grad_norm": 0.28207607370827553, "learning_rate": 4.964914503622126e-05, "loss": 0.3851, "step": 1669 }, { "epoch": 2.402877697841727, "grad_norm": 0.25651958753351733, "learning_rate": 4.961014028656021e-05, "loss": 0.3913, "step": 1670 }, { "epoch": 2.40431654676259, "grad_norm": 0.26643824821221995, "learning_rate": 4.9571125836855825e-05, "loss": 0.3855, "step": 1671 }, { "epoch": 2.405755395683453, "grad_norm": 0.38308916207841903, "learning_rate": 4.9532101726487564e-05, "loss": 0.3787, "step": 1672 }, { "epoch": 2.4071942446043164, "grad_norm": 0.4127926515912552, "learning_rate": 4.9493067994844606e-05, "loss": 0.3911, "step": 1673 }, { "epoch": 2.4086330935251796, "grad_norm": 0.40012187170633984, "learning_rate": 4.9454024681325815e-05, "loss": 0.3862, "step": 1674 }, { "epoch": 2.4100719424460433, "grad_norm": 0.3244702493554365, "learning_rate": 4.941497182533978e-05, "loss": 0.386, "step": 1675 }, { "epoch": 2.4115107913669065, "grad_norm": 0.2840241844161442, "learning_rate": 4.937590946630469e-05, "loss": 0.3921, "step": 1676 }, { "epoch": 2.4129496402877697, "grad_norm": 0.313971585605015, "learning_rate": 4.9336837643648335e-05, "loss": 0.3821, "step": 1677 }, { "epoch": 2.414388489208633, "grad_norm": 0.4288719828415872, "learning_rate": 4.929775639680805e-05, "loss": 0.3888, "step": 1678 }, { "epoch": 2.4158273381294966, "grad_norm": 0.44916133701646954, "learning_rate": 4.925866576523069e-05, "loss": 0.3868, "step": 1679 }, { "epoch": 2.41726618705036, "grad_norm": 0.3259177807565223, "learning_rate": 4.921956578837259e-05, "loss": 0.3899, "step": 1680 }, { "epoch": 2.418705035971223, "grad_norm": 0.26965941651128966, "learning_rate": 4.918045650569949e-05, "loss": 0.3851, "step": 1681 }, { "epoch": 2.420143884892086, "grad_norm": 0.29360546025499556, "learning_rate": 4.9141337956686564e-05, "loss": 0.3882, "step": 1682 }, { "epoch": 2.42158273381295, "grad_norm": 0.2907702864259603, "learning_rate": 4.91022101808183e-05, "loss": 0.3859, "step": 1683 }, { "epoch": 2.423021582733813, "grad_norm": 0.26413262410779004, "learning_rate": 4.90630732175885e-05, "loss": 0.3816, "step": 1684 }, { "epoch": 2.4244604316546763, "grad_norm": 0.2031957652912718, "learning_rate": 4.902392710650028e-05, "loss": 0.3852, "step": 1685 }, { "epoch": 2.4258992805755395, "grad_norm": 0.21883823433372213, "learning_rate": 4.898477188706596e-05, "loss": 0.3844, "step": 1686 }, { "epoch": 2.4273381294964027, "grad_norm": 0.24813905857170207, "learning_rate": 4.894560759880705e-05, "loss": 0.386, "step": 1687 }, { "epoch": 2.4287769784172664, "grad_norm": 0.29886426314455694, "learning_rate": 4.8906434281254223e-05, "loss": 0.3849, "step": 1688 }, { "epoch": 2.4302158273381296, "grad_norm": 0.2462432057626903, "learning_rate": 4.886725197394726e-05, "loss": 0.3709, "step": 1689 }, { "epoch": 2.431654676258993, "grad_norm": 0.22984391832958947, "learning_rate": 4.882806071643503e-05, "loss": 0.3844, "step": 1690 }, { "epoch": 2.433093525179856, "grad_norm": 0.20851399760817524, "learning_rate": 4.878886054827541e-05, "loss": 0.3885, "step": 1691 }, { "epoch": 2.434532374100719, "grad_norm": 0.24732415430598192, "learning_rate": 4.874965150903529e-05, "loss": 0.3817, "step": 1692 }, { "epoch": 2.435971223021583, "grad_norm": 0.31324080974370105, "learning_rate": 4.871043363829053e-05, "loss": 0.3915, "step": 1693 }, { "epoch": 2.437410071942446, "grad_norm": 0.2797132571446861, "learning_rate": 4.8671206975625856e-05, "loss": 0.3811, "step": 1694 }, { "epoch": 2.4388489208633093, "grad_norm": 0.26446108791806094, "learning_rate": 4.863197156063492e-05, "loss": 0.3806, "step": 1695 }, { "epoch": 2.4402877697841725, "grad_norm": 0.2231399345001575, "learning_rate": 4.859272743292017e-05, "loss": 0.3875, "step": 1696 }, { "epoch": 2.441726618705036, "grad_norm": 0.28628230910642666, "learning_rate": 4.855347463209287e-05, "loss": 0.4024, "step": 1697 }, { "epoch": 2.4431654676258994, "grad_norm": 0.3078005652030596, "learning_rate": 4.851421319777304e-05, "loss": 0.3926, "step": 1698 }, { "epoch": 2.4446043165467626, "grad_norm": 0.26622692722444935, "learning_rate": 4.847494316958939e-05, "loss": 0.3916, "step": 1699 }, { "epoch": 2.446043165467626, "grad_norm": 0.2158880712975389, "learning_rate": 4.8435664587179315e-05, "loss": 0.3827, "step": 1700 }, { "epoch": 2.4474820143884894, "grad_norm": 0.31400694325498657, "learning_rate": 4.839637749018887e-05, "loss": 0.3931, "step": 1701 }, { "epoch": 2.4489208633093527, "grad_norm": 0.32557854065890673, "learning_rate": 4.835708191827268e-05, "loss": 0.3863, "step": 1702 }, { "epoch": 2.450359712230216, "grad_norm": 0.21232550658716104, "learning_rate": 4.831777791109392e-05, "loss": 0.3868, "step": 1703 }, { "epoch": 2.451798561151079, "grad_norm": 0.3361666495557492, "learning_rate": 4.827846550832428e-05, "loss": 0.3865, "step": 1704 }, { "epoch": 2.4532374100719423, "grad_norm": 0.4299216376230312, "learning_rate": 4.8239144749643936e-05, "loss": 0.3796, "step": 1705 }, { "epoch": 2.454676258992806, "grad_norm": 0.30048479764175423, "learning_rate": 4.819981567474152e-05, "loss": 0.3799, "step": 1706 }, { "epoch": 2.456115107913669, "grad_norm": 0.21691420801560146, "learning_rate": 4.8160478323313974e-05, "loss": 0.3896, "step": 1707 }, { "epoch": 2.4575539568345324, "grad_norm": 0.1941356571231044, "learning_rate": 4.812113273506671e-05, "loss": 0.3776, "step": 1708 }, { "epoch": 2.4589928057553956, "grad_norm": 0.22030413531456672, "learning_rate": 4.808177894971336e-05, "loss": 0.3906, "step": 1709 }, { "epoch": 2.460431654676259, "grad_norm": 0.2471830041613509, "learning_rate": 4.804241700697588e-05, "loss": 0.382, "step": 1710 }, { "epoch": 2.4618705035971225, "grad_norm": 0.2632273217637152, "learning_rate": 4.800304694658443e-05, "loss": 0.3955, "step": 1711 }, { "epoch": 2.4633093525179857, "grad_norm": 0.28868079552476417, "learning_rate": 4.796366880827739e-05, "loss": 0.3771, "step": 1712 }, { "epoch": 2.464748201438849, "grad_norm": 0.21361752591660416, "learning_rate": 4.792428263180128e-05, "loss": 0.3871, "step": 1713 }, { "epoch": 2.466187050359712, "grad_norm": 0.2507688064265132, "learning_rate": 4.7884888456910734e-05, "loss": 0.3901, "step": 1714 }, { "epoch": 2.4676258992805753, "grad_norm": 0.2598408449312428, "learning_rate": 4.784548632336846e-05, "loss": 0.3861, "step": 1715 }, { "epoch": 2.469064748201439, "grad_norm": 0.2296367717457473, "learning_rate": 4.7806076270945197e-05, "loss": 0.3857, "step": 1716 }, { "epoch": 2.470503597122302, "grad_norm": 0.1940649865490585, "learning_rate": 4.776665833941968e-05, "loss": 0.3885, "step": 1717 }, { "epoch": 2.4719424460431654, "grad_norm": 0.19295142640898422, "learning_rate": 4.772723256857859e-05, "loss": 0.3814, "step": 1718 }, { "epoch": 2.4733812949640286, "grad_norm": 0.20467746234339637, "learning_rate": 4.768779899821655e-05, "loss": 0.3819, "step": 1719 }, { "epoch": 2.4748201438848922, "grad_norm": 0.20585922948625396, "learning_rate": 4.7648357668135996e-05, "loss": 0.3956, "step": 1720 }, { "epoch": 2.4762589928057555, "grad_norm": 0.2612823747600825, "learning_rate": 4.760890861814726e-05, "loss": 0.3779, "step": 1721 }, { "epoch": 2.4776978417266187, "grad_norm": 0.2665043843681745, "learning_rate": 4.756945188806843e-05, "loss": 0.3887, "step": 1722 }, { "epoch": 2.479136690647482, "grad_norm": 0.23432511588086502, "learning_rate": 4.752998751772536e-05, "loss": 0.3888, "step": 1723 }, { "epoch": 2.4805755395683455, "grad_norm": 0.2565009983326827, "learning_rate": 4.749051554695159e-05, "loss": 0.3918, "step": 1724 }, { "epoch": 2.4820143884892087, "grad_norm": 0.25325968492801615, "learning_rate": 4.745103601558838e-05, "loss": 0.388, "step": 1725 }, { "epoch": 2.483453237410072, "grad_norm": 0.2182418705061013, "learning_rate": 4.741154896348458e-05, "loss": 0.3837, "step": 1726 }, { "epoch": 2.484892086330935, "grad_norm": 0.236976744544838, "learning_rate": 4.7372054430496636e-05, "loss": 0.3875, "step": 1727 }, { "epoch": 2.4863309352517984, "grad_norm": 0.22964815996595087, "learning_rate": 4.733255245648857e-05, "loss": 0.3829, "step": 1728 }, { "epoch": 2.487769784172662, "grad_norm": 0.1651653371026665, "learning_rate": 4.729304308133189e-05, "loss": 0.3886, "step": 1729 }, { "epoch": 2.4892086330935252, "grad_norm": 0.2218187872751732, "learning_rate": 4.725352634490557e-05, "loss": 0.3959, "step": 1730 }, { "epoch": 2.4906474820143885, "grad_norm": 0.21094480244595293, "learning_rate": 4.7214002287096035e-05, "loss": 0.3888, "step": 1731 }, { "epoch": 2.4920863309352517, "grad_norm": 0.4148620177860389, "learning_rate": 4.7174470947797117e-05, "loss": 0.3932, "step": 1732 }, { "epoch": 2.493525179856115, "grad_norm": 0.30534444944427797, "learning_rate": 4.7134932366909915e-05, "loss": 0.3902, "step": 1733 }, { "epoch": 2.4949640287769785, "grad_norm": 0.2879398340647878, "learning_rate": 4.709538658434294e-05, "loss": 0.3931, "step": 1734 }, { "epoch": 2.4964028776978417, "grad_norm": 0.23892029069087156, "learning_rate": 4.705583364001192e-05, "loss": 0.3841, "step": 1735 }, { "epoch": 2.497841726618705, "grad_norm": 0.19158560944302708, "learning_rate": 4.701627357383981e-05, "loss": 0.3881, "step": 1736 }, { "epoch": 2.499280575539568, "grad_norm": 0.2319804244744238, "learning_rate": 4.697670642575675e-05, "loss": 0.3812, "step": 1737 }, { "epoch": 2.5007194244604314, "grad_norm": 0.23479997824854995, "learning_rate": 4.693713223570006e-05, "loss": 0.3945, "step": 1738 }, { "epoch": 2.502158273381295, "grad_norm": 0.2106359414153762, "learning_rate": 4.689755104361414e-05, "loss": 0.3925, "step": 1739 }, { "epoch": 2.5035971223021583, "grad_norm": 0.22985112128848637, "learning_rate": 4.685796288945046e-05, "loss": 0.3846, "step": 1740 }, { "epoch": 2.5050359712230215, "grad_norm": 0.2506365207244284, "learning_rate": 4.6818367813167535e-05, "loss": 0.3913, "step": 1741 }, { "epoch": 2.506474820143885, "grad_norm": 0.2711726587566118, "learning_rate": 4.6778765854730835e-05, "loss": 0.39, "step": 1742 }, { "epoch": 2.5079136690647483, "grad_norm": 0.2345839460545374, "learning_rate": 4.673915705411281e-05, "loss": 0.3734, "step": 1743 }, { "epoch": 2.5093525179856115, "grad_norm": 0.17855830514222112, "learning_rate": 4.6699541451292786e-05, "loss": 0.3916, "step": 1744 }, { "epoch": 2.5107913669064748, "grad_norm": 0.29951869445533486, "learning_rate": 4.665991908625699e-05, "loss": 0.3902, "step": 1745 }, { "epoch": 2.512230215827338, "grad_norm": 0.36775107518074845, "learning_rate": 4.6620289998998445e-05, "loss": 0.3904, "step": 1746 }, { "epoch": 2.5136690647482016, "grad_norm": 0.3677315970085526, "learning_rate": 4.658065422951697e-05, "loss": 0.391, "step": 1747 }, { "epoch": 2.515107913669065, "grad_norm": 0.302362529995463, "learning_rate": 4.654101181781913e-05, "loss": 0.3822, "step": 1748 }, { "epoch": 2.516546762589928, "grad_norm": 0.21053438895006088, "learning_rate": 4.650136280391818e-05, "loss": 0.3924, "step": 1749 }, { "epoch": 2.5179856115107913, "grad_norm": 0.23268040489983013, "learning_rate": 4.646170722783408e-05, "loss": 0.3914, "step": 1750 }, { "epoch": 2.5194244604316545, "grad_norm": 0.41306004698242743, "learning_rate": 4.6422045129593344e-05, "loss": 0.3776, "step": 1751 }, { "epoch": 2.520863309352518, "grad_norm": 0.33919670732582624, "learning_rate": 4.6382376549229146e-05, "loss": 0.3866, "step": 1752 }, { "epoch": 2.5223021582733813, "grad_norm": 0.24703474471833506, "learning_rate": 4.634270152678115e-05, "loss": 0.3815, "step": 1753 }, { "epoch": 2.5237410071942445, "grad_norm": 0.20434553641822384, "learning_rate": 4.630302010229555e-05, "loss": 0.3827, "step": 1754 }, { "epoch": 2.5251798561151078, "grad_norm": 0.24072794662768396, "learning_rate": 4.6263332315824964e-05, "loss": 0.3894, "step": 1755 }, { "epoch": 2.526618705035971, "grad_norm": 0.2786157444020619, "learning_rate": 4.622363820742848e-05, "loss": 0.3916, "step": 1756 }, { "epoch": 2.5280575539568346, "grad_norm": 0.29326672351755495, "learning_rate": 4.618393781717156e-05, "loss": 0.3824, "step": 1757 }, { "epoch": 2.529496402877698, "grad_norm": 0.2739519540372521, "learning_rate": 4.614423118512595e-05, "loss": 0.3963, "step": 1758 }, { "epoch": 2.530935251798561, "grad_norm": 0.27324386407840145, "learning_rate": 4.610451835136978e-05, "loss": 0.3893, "step": 1759 }, { "epoch": 2.5323741007194247, "grad_norm": 0.2379716174002137, "learning_rate": 4.606479935598738e-05, "loss": 0.3815, "step": 1760 }, { "epoch": 2.533812949640288, "grad_norm": 0.24029502092392346, "learning_rate": 4.602507423906931e-05, "loss": 0.3888, "step": 1761 }, { "epoch": 2.535251798561151, "grad_norm": 0.20765957903574359, "learning_rate": 4.598534304071233e-05, "loss": 0.3935, "step": 1762 }, { "epoch": 2.5366906474820143, "grad_norm": 0.21187887496187346, "learning_rate": 4.5945605801019315e-05, "loss": 0.3865, "step": 1763 }, { "epoch": 2.5381294964028775, "grad_norm": 0.24969578617451754, "learning_rate": 4.5905862560099255e-05, "loss": 0.3842, "step": 1764 }, { "epoch": 2.539568345323741, "grad_norm": 0.2132089086991246, "learning_rate": 4.5866113358067187e-05, "loss": 0.3834, "step": 1765 }, { "epoch": 2.5410071942446044, "grad_norm": 0.23531078200450925, "learning_rate": 4.582635823504416e-05, "loss": 0.3836, "step": 1766 }, { "epoch": 2.5424460431654676, "grad_norm": 0.18302970671707344, "learning_rate": 4.5786597231157214e-05, "loss": 0.3833, "step": 1767 }, { "epoch": 2.543884892086331, "grad_norm": 0.19255100995567193, "learning_rate": 4.574683038653932e-05, "loss": 0.401, "step": 1768 }, { "epoch": 2.545323741007194, "grad_norm": 0.23597284237746652, "learning_rate": 4.5707057741329324e-05, "loss": 0.376, "step": 1769 }, { "epoch": 2.5467625899280577, "grad_norm": 0.2007552121621161, "learning_rate": 4.5667279335671986e-05, "loss": 0.3896, "step": 1770 }, { "epoch": 2.548201438848921, "grad_norm": 0.18058634807160767, "learning_rate": 4.56274952097178e-05, "loss": 0.388, "step": 1771 }, { "epoch": 2.549640287769784, "grad_norm": 0.15739369997263258, "learning_rate": 4.558770540362308e-05, "loss": 0.3872, "step": 1772 }, { "epoch": 2.5510791366906473, "grad_norm": 0.1848617608599081, "learning_rate": 4.554790995754988e-05, "loss": 0.3854, "step": 1773 }, { "epoch": 2.5525179856115106, "grad_norm": 0.24291941314215762, "learning_rate": 4.5508108911665926e-05, "loss": 0.3858, "step": 1774 }, { "epoch": 2.553956834532374, "grad_norm": 0.21020910584334165, "learning_rate": 4.5468302306144594e-05, "loss": 0.3824, "step": 1775 }, { "epoch": 2.5553956834532374, "grad_norm": 0.18853792035606604, "learning_rate": 4.542849018116491e-05, "loss": 0.3861, "step": 1776 }, { "epoch": 2.5568345323741006, "grad_norm": 0.21209069463952884, "learning_rate": 4.538867257691141e-05, "loss": 0.3868, "step": 1777 }, { "epoch": 2.5582733812949643, "grad_norm": 0.24596054016837202, "learning_rate": 4.53488495335742e-05, "loss": 0.3934, "step": 1778 }, { "epoch": 2.5597122302158275, "grad_norm": 0.23147225203726496, "learning_rate": 4.5309021091348885e-05, "loss": 0.3864, "step": 1779 }, { "epoch": 2.5611510791366907, "grad_norm": 0.20286262887801743, "learning_rate": 4.5269187290436486e-05, "loss": 0.3847, "step": 1780 }, { "epoch": 2.562589928057554, "grad_norm": 0.20862433491865667, "learning_rate": 4.5229348171043466e-05, "loss": 0.3844, "step": 1781 }, { "epoch": 2.564028776978417, "grad_norm": 0.2627683575084139, "learning_rate": 4.51895037733816e-05, "loss": 0.3868, "step": 1782 }, { "epoch": 2.565467625899281, "grad_norm": 0.32422091188574925, "learning_rate": 4.5149654137668095e-05, "loss": 0.3817, "step": 1783 }, { "epoch": 2.566906474820144, "grad_norm": 0.3165741282069741, "learning_rate": 4.5109799304125333e-05, "loss": 0.3892, "step": 1784 }, { "epoch": 2.568345323741007, "grad_norm": 0.2567119926854359, "learning_rate": 4.5069939312981e-05, "loss": 0.3878, "step": 1785 }, { "epoch": 2.5697841726618704, "grad_norm": 0.2267977012510537, "learning_rate": 4.503007420446798e-05, "loss": 0.3749, "step": 1786 }, { "epoch": 2.5712230215827336, "grad_norm": 0.34461047558661684, "learning_rate": 4.499020401882433e-05, "loss": 0.3943, "step": 1787 }, { "epoch": 2.5726618705035973, "grad_norm": 0.32960505460472705, "learning_rate": 4.49503287962932e-05, "loss": 0.3867, "step": 1788 }, { "epoch": 2.5741007194244605, "grad_norm": 0.22921860434176516, "learning_rate": 4.491044857712288e-05, "loss": 0.3884, "step": 1789 }, { "epoch": 2.5755395683453237, "grad_norm": 0.22852165220912105, "learning_rate": 4.4870563401566634e-05, "loss": 0.3958, "step": 1790 }, { "epoch": 2.576978417266187, "grad_norm": 0.24099158446341248, "learning_rate": 4.483067330988278e-05, "loss": 0.3795, "step": 1791 }, { "epoch": 2.57841726618705, "grad_norm": 0.23129817552242166, "learning_rate": 4.479077834233458e-05, "loss": 0.3877, "step": 1792 }, { "epoch": 2.579856115107914, "grad_norm": 0.23630958464372778, "learning_rate": 4.475087853919023e-05, "loss": 0.3833, "step": 1793 }, { "epoch": 2.581294964028777, "grad_norm": 0.1994916154967065, "learning_rate": 4.4710973940722786e-05, "loss": 0.389, "step": 1794 }, { "epoch": 2.58273381294964, "grad_norm": 0.25994331315702757, "learning_rate": 4.4671064587210146e-05, "loss": 0.3822, "step": 1795 }, { "epoch": 2.584172661870504, "grad_norm": 0.21802541448223417, "learning_rate": 4.4631150518935044e-05, "loss": 0.3842, "step": 1796 }, { "epoch": 2.5856115107913666, "grad_norm": 0.19238044186831893, "learning_rate": 4.459123177618491e-05, "loss": 0.3781, "step": 1797 }, { "epoch": 2.5870503597122303, "grad_norm": 0.18801893052710708, "learning_rate": 4.455130839925195e-05, "loss": 0.3818, "step": 1798 }, { "epoch": 2.5884892086330935, "grad_norm": 0.21243905334812005, "learning_rate": 4.451138042843302e-05, "loss": 0.3956, "step": 1799 }, { "epoch": 2.5899280575539567, "grad_norm": 0.214458072394087, "learning_rate": 4.447144790402963e-05, "loss": 0.3813, "step": 1800 }, { "epoch": 2.5913669064748204, "grad_norm": 0.2653829259754617, "learning_rate": 4.4431510866347837e-05, "loss": 0.3857, "step": 1801 }, { "epoch": 2.5928057553956836, "grad_norm": 0.26311399803344365, "learning_rate": 4.439156935569833e-05, "loss": 0.3837, "step": 1802 }, { "epoch": 2.594244604316547, "grad_norm": 0.24357443397451756, "learning_rate": 4.435162341239625e-05, "loss": 0.3826, "step": 1803 }, { "epoch": 2.59568345323741, "grad_norm": 0.2329412189965746, "learning_rate": 4.4311673076761254e-05, "loss": 0.3874, "step": 1804 }, { "epoch": 2.597122302158273, "grad_norm": 0.20926413357202217, "learning_rate": 4.42717183891174e-05, "loss": 0.3842, "step": 1805 }, { "epoch": 2.598561151079137, "grad_norm": 0.17769587040566737, "learning_rate": 4.4231759389793144e-05, "loss": 0.3868, "step": 1806 }, { "epoch": 2.6, "grad_norm": 0.21004670713763612, "learning_rate": 4.4191796119121335e-05, "loss": 0.3813, "step": 1807 }, { "epoch": 2.6014388489208633, "grad_norm": 0.2235053760814713, "learning_rate": 4.415182861743906e-05, "loss": 0.3765, "step": 1808 }, { "epoch": 2.6028776978417265, "grad_norm": 0.3040060715703648, "learning_rate": 4.411185692508774e-05, "loss": 0.3812, "step": 1809 }, { "epoch": 2.6043165467625897, "grad_norm": 0.26147948047551334, "learning_rate": 4.4071881082413e-05, "loss": 0.3842, "step": 1810 }, { "epoch": 2.6057553956834534, "grad_norm": 0.1745707951688262, "learning_rate": 4.4031901129764665e-05, "loss": 0.3847, "step": 1811 }, { "epoch": 2.6071942446043166, "grad_norm": 0.20669080261916384, "learning_rate": 4.3991917107496695e-05, "loss": 0.3898, "step": 1812 }, { "epoch": 2.60863309352518, "grad_norm": 0.23227452863695033, "learning_rate": 4.395192905596716e-05, "loss": 0.3717, "step": 1813 }, { "epoch": 2.610071942446043, "grad_norm": 0.22790458417236623, "learning_rate": 4.3911937015538186e-05, "loss": 0.3846, "step": 1814 }, { "epoch": 2.6115107913669062, "grad_norm": 0.18928946784415096, "learning_rate": 4.3871941026575965e-05, "loss": 0.3827, "step": 1815 }, { "epoch": 2.61294964028777, "grad_norm": 0.21757958462658672, "learning_rate": 4.383194112945066e-05, "loss": 0.3935, "step": 1816 }, { "epoch": 2.614388489208633, "grad_norm": 0.2827530676409471, "learning_rate": 4.379193736453633e-05, "loss": 0.3868, "step": 1817 }, { "epoch": 2.6158273381294963, "grad_norm": 0.2630615627721376, "learning_rate": 4.375192977221099e-05, "loss": 0.3888, "step": 1818 }, { "epoch": 2.61726618705036, "grad_norm": 0.2168400391998586, "learning_rate": 4.371191839285651e-05, "loss": 0.3856, "step": 1819 }, { "epoch": 2.618705035971223, "grad_norm": 0.22607905125918135, "learning_rate": 4.367190326685858e-05, "loss": 0.3909, "step": 1820 }, { "epoch": 2.6201438848920864, "grad_norm": 0.19453063079707447, "learning_rate": 4.363188443460666e-05, "loss": 0.3864, "step": 1821 }, { "epoch": 2.6215827338129496, "grad_norm": 0.2657942478306949, "learning_rate": 4.3591861936493964e-05, "loss": 0.379, "step": 1822 }, { "epoch": 2.623021582733813, "grad_norm": 0.27540316986659147, "learning_rate": 4.3551835812917395e-05, "loss": 0.3739, "step": 1823 }, { "epoch": 2.6244604316546765, "grad_norm": 0.24557361664055938, "learning_rate": 4.351180610427754e-05, "loss": 0.3875, "step": 1824 }, { "epoch": 2.6258992805755397, "grad_norm": 0.24206244637264343, "learning_rate": 4.347177285097855e-05, "loss": 0.3835, "step": 1825 }, { "epoch": 2.627338129496403, "grad_norm": 0.19725770112484106, "learning_rate": 4.343173609342822e-05, "loss": 0.3881, "step": 1826 }, { "epoch": 2.628776978417266, "grad_norm": 0.2608118908692465, "learning_rate": 4.339169587203785e-05, "loss": 0.3812, "step": 1827 }, { "epoch": 2.6302158273381293, "grad_norm": 0.29338714121904846, "learning_rate": 4.335165222722222e-05, "loss": 0.389, "step": 1828 }, { "epoch": 2.631654676258993, "grad_norm": 0.2562460674118761, "learning_rate": 4.331160519939962e-05, "loss": 0.3902, "step": 1829 }, { "epoch": 2.633093525179856, "grad_norm": 0.2136119726581698, "learning_rate": 4.327155482899168e-05, "loss": 0.3937, "step": 1830 }, { "epoch": 2.6345323741007194, "grad_norm": 0.18708751197016005, "learning_rate": 4.323150115642346e-05, "loss": 0.3867, "step": 1831 }, { "epoch": 2.6359712230215826, "grad_norm": 0.1915683028324215, "learning_rate": 4.3191444222123326e-05, "loss": 0.382, "step": 1832 }, { "epoch": 2.637410071942446, "grad_norm": 0.19480422569077002, "learning_rate": 4.3151384066522964e-05, "loss": 0.3799, "step": 1833 }, { "epoch": 2.6388489208633095, "grad_norm": 0.21530948884558096, "learning_rate": 4.311132073005727e-05, "loss": 0.3857, "step": 1834 }, { "epoch": 2.6402877697841727, "grad_norm": 0.1864530172996806, "learning_rate": 4.3071254253164395e-05, "loss": 0.3879, "step": 1835 }, { "epoch": 2.641726618705036, "grad_norm": 0.17590300501204492, "learning_rate": 4.3031184676285625e-05, "loss": 0.3983, "step": 1836 }, { "epoch": 2.6431654676258995, "grad_norm": 0.20243039026627202, "learning_rate": 4.299111203986539e-05, "loss": 0.3869, "step": 1837 }, { "epoch": 2.6446043165467623, "grad_norm": 0.3099275802406128, "learning_rate": 4.29510363843512e-05, "loss": 0.3842, "step": 1838 }, { "epoch": 2.646043165467626, "grad_norm": 0.20685838531850764, "learning_rate": 4.291095775019364e-05, "loss": 0.3934, "step": 1839 }, { "epoch": 2.647482014388489, "grad_norm": 0.18001002469842584, "learning_rate": 4.287087617784627e-05, "loss": 0.3838, "step": 1840 }, { "epoch": 2.6489208633093524, "grad_norm": 0.21630775495621132, "learning_rate": 4.283079170776561e-05, "loss": 0.3822, "step": 1841 }, { "epoch": 2.650359712230216, "grad_norm": 0.1882221069360396, "learning_rate": 4.279070438041116e-05, "loss": 0.39, "step": 1842 }, { "epoch": 2.6517985611510793, "grad_norm": 0.23126448519474344, "learning_rate": 4.275061423624522e-05, "loss": 0.3866, "step": 1843 }, { "epoch": 2.6532374100719425, "grad_norm": 0.2024574208546783, "learning_rate": 4.2710521315733e-05, "loss": 0.3862, "step": 1844 }, { "epoch": 2.6546762589928057, "grad_norm": 0.24465296829058134, "learning_rate": 4.26704256593425e-05, "loss": 0.3915, "step": 1845 }, { "epoch": 2.656115107913669, "grad_norm": 0.20156038024269043, "learning_rate": 4.2630327307544454e-05, "loss": 0.382, "step": 1846 }, { "epoch": 2.6575539568345325, "grad_norm": 0.20093878292170558, "learning_rate": 4.2590226300812335e-05, "loss": 0.3937, "step": 1847 }, { "epoch": 2.6589928057553958, "grad_norm": 0.20992811409452886, "learning_rate": 4.255012267962232e-05, "loss": 0.389, "step": 1848 }, { "epoch": 2.660431654676259, "grad_norm": 0.19443184008420034, "learning_rate": 4.251001648445317e-05, "loss": 0.3774, "step": 1849 }, { "epoch": 2.661870503597122, "grad_norm": 0.20508834552588467, "learning_rate": 4.246990775578628e-05, "loss": 0.3916, "step": 1850 }, { "epoch": 2.6633093525179854, "grad_norm": 0.21597683207543966, "learning_rate": 4.242979653410562e-05, "loss": 0.3857, "step": 1851 }, { "epoch": 2.664748201438849, "grad_norm": 0.2314417209320147, "learning_rate": 4.238968285989762e-05, "loss": 0.3874, "step": 1852 }, { "epoch": 2.6661870503597123, "grad_norm": 0.21639164495492177, "learning_rate": 4.2349566773651236e-05, "loss": 0.3811, "step": 1853 }, { "epoch": 2.6676258992805755, "grad_norm": 0.21936968244618701, "learning_rate": 4.2309448315857844e-05, "loss": 0.3876, "step": 1854 }, { "epoch": 2.6690647482014387, "grad_norm": 0.23102840657098456, "learning_rate": 4.226932752701122e-05, "loss": 0.3925, "step": 1855 }, { "epoch": 2.670503597122302, "grad_norm": 0.19722110868434983, "learning_rate": 4.2229204447607456e-05, "loss": 0.383, "step": 1856 }, { "epoch": 2.6719424460431656, "grad_norm": 0.2195835985460399, "learning_rate": 4.2189079118145e-05, "loss": 0.386, "step": 1857 }, { "epoch": 2.6733812949640288, "grad_norm": 0.22270094722650063, "learning_rate": 4.214895157912454e-05, "loss": 0.3807, "step": 1858 }, { "epoch": 2.674820143884892, "grad_norm": 0.19377785620489504, "learning_rate": 4.210882187104904e-05, "loss": 0.3869, "step": 1859 }, { "epoch": 2.6762589928057556, "grad_norm": 0.254485241645623, "learning_rate": 4.206869003442358e-05, "loss": 0.3846, "step": 1860 }, { "epoch": 2.677697841726619, "grad_norm": 0.22236843005196488, "learning_rate": 4.2028556109755465e-05, "loss": 0.3839, "step": 1861 }, { "epoch": 2.679136690647482, "grad_norm": 0.1969134763507888, "learning_rate": 4.198842013755408e-05, "loss": 0.3868, "step": 1862 }, { "epoch": 2.6805755395683453, "grad_norm": 0.26206145325810626, "learning_rate": 4.194828215833082e-05, "loss": 0.3842, "step": 1863 }, { "epoch": 2.6820143884892085, "grad_norm": 0.26348235581142704, "learning_rate": 4.1908142212599206e-05, "loss": 0.3962, "step": 1864 }, { "epoch": 2.683453237410072, "grad_norm": 0.27050117521450223, "learning_rate": 4.1868000340874674e-05, "loss": 0.3947, "step": 1865 }, { "epoch": 2.6848920863309353, "grad_norm": 0.24475766021828424, "learning_rate": 4.182785658367462e-05, "loss": 0.3861, "step": 1866 }, { "epoch": 2.6863309352517986, "grad_norm": 0.22801076642145454, "learning_rate": 4.178771098151835e-05, "loss": 0.3888, "step": 1867 }, { "epoch": 2.6877697841726618, "grad_norm": 0.2739226492353372, "learning_rate": 4.1747563574927034e-05, "loss": 0.3846, "step": 1868 }, { "epoch": 2.689208633093525, "grad_norm": 0.21666453088785986, "learning_rate": 4.170741440442366e-05, "loss": 0.3885, "step": 1869 }, { "epoch": 2.6906474820143886, "grad_norm": 0.1935920292066028, "learning_rate": 4.166726351053299e-05, "loss": 0.3842, "step": 1870 }, { "epoch": 2.692086330935252, "grad_norm": 0.2554502113776128, "learning_rate": 4.1627110933781515e-05, "loss": 0.3802, "step": 1871 }, { "epoch": 2.693525179856115, "grad_norm": 0.25847156484503014, "learning_rate": 4.158695671469746e-05, "loss": 0.3892, "step": 1872 }, { "epoch": 2.6949640287769783, "grad_norm": 0.21120440672036592, "learning_rate": 4.154680089381068e-05, "loss": 0.3948, "step": 1873 }, { "epoch": 2.6964028776978415, "grad_norm": 0.23296719621704934, "learning_rate": 4.150664351165266e-05, "loss": 0.3887, "step": 1874 }, { "epoch": 2.697841726618705, "grad_norm": 0.19948198703925238, "learning_rate": 4.146648460875646e-05, "loss": 0.3841, "step": 1875 }, { "epoch": 2.6992805755395683, "grad_norm": 0.23759199087716618, "learning_rate": 4.1426324225656644e-05, "loss": 0.3865, "step": 1876 }, { "epoch": 2.7007194244604316, "grad_norm": 0.22701961869252213, "learning_rate": 4.138616240288934e-05, "loss": 0.3916, "step": 1877 }, { "epoch": 2.702158273381295, "grad_norm": 0.20588673256139395, "learning_rate": 4.134599918099204e-05, "loss": 0.3875, "step": 1878 }, { "epoch": 2.7035971223021584, "grad_norm": 0.21325481866450907, "learning_rate": 4.130583460050371e-05, "loss": 0.3704, "step": 1879 }, { "epoch": 2.7050359712230216, "grad_norm": 0.16806017671260462, "learning_rate": 4.126566870196468e-05, "loss": 0.389, "step": 1880 }, { "epoch": 2.706474820143885, "grad_norm": 0.22813748694019656, "learning_rate": 4.12255015259166e-05, "loss": 0.3863, "step": 1881 }, { "epoch": 2.707913669064748, "grad_norm": 0.18398201430847969, "learning_rate": 4.1185333112902394e-05, "loss": 0.3843, "step": 1882 }, { "epoch": 2.7093525179856117, "grad_norm": 0.20900623881404212, "learning_rate": 4.114516350346626e-05, "loss": 0.3843, "step": 1883 }, { "epoch": 2.710791366906475, "grad_norm": 0.19209389526656, "learning_rate": 4.1104992738153616e-05, "loss": 0.3899, "step": 1884 }, { "epoch": 2.712230215827338, "grad_norm": 0.17657244957473936, "learning_rate": 4.1064820857511e-05, "loss": 0.3904, "step": 1885 }, { "epoch": 2.7136690647482014, "grad_norm": 0.18002473184963683, "learning_rate": 4.1024647902086107e-05, "loss": 0.3788, "step": 1886 }, { "epoch": 2.7151079136690646, "grad_norm": 0.16759575946391514, "learning_rate": 4.0984473912427706e-05, "loss": 0.3938, "step": 1887 }, { "epoch": 2.716546762589928, "grad_norm": 0.18222342273573017, "learning_rate": 4.0944298929085633e-05, "loss": 0.3858, "step": 1888 }, { "epoch": 2.7179856115107914, "grad_norm": 0.21787089769791332, "learning_rate": 4.090412299261068e-05, "loss": 0.3782, "step": 1889 }, { "epoch": 2.7194244604316546, "grad_norm": 0.2055194052955941, "learning_rate": 4.086394614355467e-05, "loss": 0.3806, "step": 1890 }, { "epoch": 2.720863309352518, "grad_norm": 0.17948724453157378, "learning_rate": 4.082376842247027e-05, "loss": 0.3855, "step": 1891 }, { "epoch": 2.722302158273381, "grad_norm": 0.15385133013252994, "learning_rate": 4.0783589869911074e-05, "loss": 0.3838, "step": 1892 }, { "epoch": 2.7237410071942447, "grad_norm": 0.209723402063529, "learning_rate": 4.074341052643152e-05, "loss": 0.3801, "step": 1893 }, { "epoch": 2.725179856115108, "grad_norm": 0.19691672015146391, "learning_rate": 4.070323043258683e-05, "loss": 0.3846, "step": 1894 }, { "epoch": 2.726618705035971, "grad_norm": 0.16722335973554103, "learning_rate": 4.066304962893297e-05, "loss": 0.3821, "step": 1895 }, { "epoch": 2.728057553956835, "grad_norm": 0.19678366933227714, "learning_rate": 4.062286815602661e-05, "loss": 0.385, "step": 1896 }, { "epoch": 2.7294964028776976, "grad_norm": 0.23986977143401303, "learning_rate": 4.0582686054425196e-05, "loss": 0.3815, "step": 1897 }, { "epoch": 2.7309352517985612, "grad_norm": 0.17856562550021124, "learning_rate": 4.054250336468666e-05, "loss": 0.386, "step": 1898 }, { "epoch": 2.7323741007194244, "grad_norm": 0.24355770904784735, "learning_rate": 4.050232012736964e-05, "loss": 0.3833, "step": 1899 }, { "epoch": 2.7338129496402876, "grad_norm": 0.25168778926362656, "learning_rate": 4.0462136383033285e-05, "loss": 0.3914, "step": 1900 }, { "epoch": 2.7352517985611513, "grad_norm": 0.19058711164039246, "learning_rate": 4.0421952172237254e-05, "loss": 0.393, "step": 1901 }, { "epoch": 2.7366906474820145, "grad_norm": 0.19474399024123062, "learning_rate": 4.038176753554166e-05, "loss": 0.3947, "step": 1902 }, { "epoch": 2.7381294964028777, "grad_norm": 0.18578664144744955, "learning_rate": 4.034158251350711e-05, "loss": 0.392, "step": 1903 }, { "epoch": 2.739568345323741, "grad_norm": 0.18928528480319595, "learning_rate": 4.030139714669453e-05, "loss": 0.3836, "step": 1904 }, { "epoch": 2.741007194244604, "grad_norm": 0.19240981764999657, "learning_rate": 4.026121147566522e-05, "loss": 0.3977, "step": 1905 }, { "epoch": 2.742446043165468, "grad_norm": 0.15538158480519618, "learning_rate": 4.02210255409808e-05, "loss": 0.3806, "step": 1906 }, { "epoch": 2.743884892086331, "grad_norm": 0.19774743810262413, "learning_rate": 4.018083938320314e-05, "loss": 0.3853, "step": 1907 }, { "epoch": 2.7453237410071942, "grad_norm": 0.18394363944089592, "learning_rate": 4.014065304289435e-05, "loss": 0.3868, "step": 1908 }, { "epoch": 2.7467625899280574, "grad_norm": 0.20010680409203171, "learning_rate": 4.010046656061669e-05, "loss": 0.3814, "step": 1909 }, { "epoch": 2.7482014388489207, "grad_norm": 0.27375663615626805, "learning_rate": 4.006027997693262e-05, "loss": 0.3852, "step": 1910 }, { "epoch": 2.7496402877697843, "grad_norm": 0.16225804971406108, "learning_rate": 4.002009333240465e-05, "loss": 0.3888, "step": 1911 }, { "epoch": 2.7510791366906475, "grad_norm": 0.21776413666241295, "learning_rate": 3.997990666759536e-05, "loss": 0.3799, "step": 1912 }, { "epoch": 2.7525179856115107, "grad_norm": 0.24881360037822242, "learning_rate": 3.99397200230674e-05, "loss": 0.3808, "step": 1913 }, { "epoch": 2.753956834532374, "grad_norm": 0.19109611424998146, "learning_rate": 3.989953343938331e-05, "loss": 0.3893, "step": 1914 }, { "epoch": 2.755395683453237, "grad_norm": 0.20445928132697555, "learning_rate": 3.985934695710566e-05, "loss": 0.3828, "step": 1915 }, { "epoch": 2.756834532374101, "grad_norm": 0.23828172071503892, "learning_rate": 3.9819160616796873e-05, "loss": 0.383, "step": 1916 }, { "epoch": 2.758273381294964, "grad_norm": 0.2197593560082593, "learning_rate": 3.977897445901922e-05, "loss": 0.4037, "step": 1917 }, { "epoch": 2.7597122302158272, "grad_norm": 0.1553271737678793, "learning_rate": 3.9738788524334794e-05, "loss": 0.3832, "step": 1918 }, { "epoch": 2.761151079136691, "grad_norm": 0.16500774605695115, "learning_rate": 3.969860285330549e-05, "loss": 0.3746, "step": 1919 }, { "epoch": 2.762589928057554, "grad_norm": 0.19886829293428393, "learning_rate": 3.965841748649291e-05, "loss": 0.3916, "step": 1920 }, { "epoch": 2.7640287769784173, "grad_norm": 0.20717357076093512, "learning_rate": 3.961823246445834e-05, "loss": 0.3855, "step": 1921 }, { "epoch": 2.7654676258992805, "grad_norm": 0.1790361182179321, "learning_rate": 3.957804782776276e-05, "loss": 0.3862, "step": 1922 }, { "epoch": 2.7669064748201437, "grad_norm": 0.19813084858714128, "learning_rate": 3.953786361696673e-05, "loss": 0.3753, "step": 1923 }, { "epoch": 2.7683453237410074, "grad_norm": 0.2068279472551279, "learning_rate": 3.9497679872630366e-05, "loss": 0.3838, "step": 1924 }, { "epoch": 2.7697841726618706, "grad_norm": 0.1831003922469856, "learning_rate": 3.945749663531334e-05, "loss": 0.3796, "step": 1925 }, { "epoch": 2.771223021582734, "grad_norm": 0.1818167482893324, "learning_rate": 3.941731394557482e-05, "loss": 0.387, "step": 1926 }, { "epoch": 2.772661870503597, "grad_norm": 0.20082357390537708, "learning_rate": 3.9377131843973394e-05, "loss": 0.3874, "step": 1927 }, { "epoch": 2.7741007194244602, "grad_norm": 0.20408518807963416, "learning_rate": 3.933695037106705e-05, "loss": 0.3863, "step": 1928 }, { "epoch": 2.775539568345324, "grad_norm": 0.2066669950154263, "learning_rate": 3.9296769567413177e-05, "loss": 0.3892, "step": 1929 }, { "epoch": 2.776978417266187, "grad_norm": 0.19174472735428247, "learning_rate": 3.925658947356849e-05, "loss": 0.3856, "step": 1930 }, { "epoch": 2.7784172661870503, "grad_norm": 0.1987859128921994, "learning_rate": 3.921641013008893e-05, "loss": 0.3874, "step": 1931 }, { "epoch": 2.7798561151079135, "grad_norm": 0.22291685372004874, "learning_rate": 3.9176231577529734e-05, "loss": 0.3887, "step": 1932 }, { "epoch": 2.7812949640287767, "grad_norm": 0.22024270869600057, "learning_rate": 3.913605385644535e-05, "loss": 0.3986, "step": 1933 }, { "epoch": 2.7827338129496404, "grad_norm": 0.1721122392744245, "learning_rate": 3.909587700738933e-05, "loss": 0.3811, "step": 1934 }, { "epoch": 2.7841726618705036, "grad_norm": 0.21030490272757463, "learning_rate": 3.9055701070914393e-05, "loss": 0.3843, "step": 1935 }, { "epoch": 2.785611510791367, "grad_norm": 0.20704087889653908, "learning_rate": 3.90155260875723e-05, "loss": 0.3802, "step": 1936 }, { "epoch": 2.7870503597122305, "grad_norm": 0.17365781856632703, "learning_rate": 3.8975352097913914e-05, "loss": 0.3855, "step": 1937 }, { "epoch": 2.7884892086330937, "grad_norm": 0.17770995901551442, "learning_rate": 3.8935179142489016e-05, "loss": 0.3833, "step": 1938 }, { "epoch": 2.789928057553957, "grad_norm": 0.21779087822273127, "learning_rate": 3.8895007261846404e-05, "loss": 0.3904, "step": 1939 }, { "epoch": 2.79136690647482, "grad_norm": 0.24277937539167369, "learning_rate": 3.885483649653374e-05, "loss": 0.3851, "step": 1940 }, { "epoch": 2.7928057553956833, "grad_norm": 0.2341004136712482, "learning_rate": 3.881466688709761e-05, "loss": 0.3942, "step": 1941 }, { "epoch": 2.794244604316547, "grad_norm": 0.18027649424590186, "learning_rate": 3.877449847408342e-05, "loss": 0.3781, "step": 1942 }, { "epoch": 2.79568345323741, "grad_norm": 0.18717571945485895, "learning_rate": 3.873433129803532e-05, "loss": 0.3884, "step": 1943 }, { "epoch": 2.7971223021582734, "grad_norm": 0.1858550625658424, "learning_rate": 3.86941653994963e-05, "loss": 0.3785, "step": 1944 }, { "epoch": 2.7985611510791366, "grad_norm": 0.2026505538974637, "learning_rate": 3.8654000819007974e-05, "loss": 0.3797, "step": 1945 }, { "epoch": 2.8, "grad_norm": 0.19196092906122655, "learning_rate": 3.8613837597110686e-05, "loss": 0.3867, "step": 1946 }, { "epoch": 2.8014388489208635, "grad_norm": 0.1670074465104803, "learning_rate": 3.8573675774343356e-05, "loss": 0.3961, "step": 1947 }, { "epoch": 2.8028776978417267, "grad_norm": 0.17526592934078594, "learning_rate": 3.853351539124355e-05, "loss": 0.3798, "step": 1948 }, { "epoch": 2.80431654676259, "grad_norm": 0.2017938976837373, "learning_rate": 3.8493356488347345e-05, "loss": 0.3852, "step": 1949 }, { "epoch": 2.805755395683453, "grad_norm": 0.16141891543912168, "learning_rate": 3.845319910618933e-05, "loss": 0.3764, "step": 1950 }, { "epoch": 2.8071942446043163, "grad_norm": 0.19869539544738132, "learning_rate": 3.841304328530254e-05, "loss": 0.3862, "step": 1951 }, { "epoch": 2.80863309352518, "grad_norm": 0.1928193491860682, "learning_rate": 3.83728890662185e-05, "loss": 0.3918, "step": 1952 }, { "epoch": 2.810071942446043, "grad_norm": 0.15205698302835358, "learning_rate": 3.833273648946704e-05, "loss": 0.386, "step": 1953 }, { "epoch": 2.8115107913669064, "grad_norm": 0.16265067065993183, "learning_rate": 3.829258559557635e-05, "loss": 0.3862, "step": 1954 }, { "epoch": 2.81294964028777, "grad_norm": 0.17480379497791992, "learning_rate": 3.825243642507297e-05, "loss": 0.383, "step": 1955 }, { "epoch": 2.814388489208633, "grad_norm": 0.21289773047432814, "learning_rate": 3.8212289018481666e-05, "loss": 0.3857, "step": 1956 }, { "epoch": 2.8158273381294965, "grad_norm": 0.17765426769618362, "learning_rate": 3.817214341632539e-05, "loss": 0.3904, "step": 1957 }, { "epoch": 2.8172661870503597, "grad_norm": 0.16439226286825281, "learning_rate": 3.813199965912533e-05, "loss": 0.3848, "step": 1958 }, { "epoch": 2.818705035971223, "grad_norm": 0.16251635630418615, "learning_rate": 3.80918577874008e-05, "loss": 0.3811, "step": 1959 }, { "epoch": 2.8201438848920866, "grad_norm": 0.18480166519435853, "learning_rate": 3.8051717841669196e-05, "loss": 0.3879, "step": 1960 }, { "epoch": 2.8215827338129498, "grad_norm": 0.1805350012707336, "learning_rate": 3.801157986244595e-05, "loss": 0.384, "step": 1961 }, { "epoch": 2.823021582733813, "grad_norm": 0.21508103500088852, "learning_rate": 3.7971443890244534e-05, "loss": 0.3894, "step": 1962 }, { "epoch": 2.824460431654676, "grad_norm": 0.1903700633733518, "learning_rate": 3.7931309965576426e-05, "loss": 0.3844, "step": 1963 }, { "epoch": 2.8258992805755394, "grad_norm": 0.1894381997836098, "learning_rate": 3.7891178128950975e-05, "loss": 0.3955, "step": 1964 }, { "epoch": 2.827338129496403, "grad_norm": 0.18150584566748393, "learning_rate": 3.785104842087546e-05, "loss": 0.3854, "step": 1965 }, { "epoch": 2.8287769784172663, "grad_norm": 0.1891095814148094, "learning_rate": 3.7810920881855016e-05, "loss": 0.3872, "step": 1966 }, { "epoch": 2.8302158273381295, "grad_norm": 0.23099345204149493, "learning_rate": 3.777079555239255e-05, "loss": 0.3761, "step": 1967 }, { "epoch": 2.8316546762589927, "grad_norm": 0.20368419578794905, "learning_rate": 3.77306724729888e-05, "loss": 0.3952, "step": 1968 }, { "epoch": 2.833093525179856, "grad_norm": 0.2013250517032747, "learning_rate": 3.769055168414215e-05, "loss": 0.3802, "step": 1969 }, { "epoch": 2.8345323741007196, "grad_norm": 0.22040252433501764, "learning_rate": 3.765043322634877e-05, "loss": 0.385, "step": 1970 }, { "epoch": 2.8359712230215828, "grad_norm": 0.22321698921110333, "learning_rate": 3.761031714010239e-05, "loss": 0.3928, "step": 1971 }, { "epoch": 2.837410071942446, "grad_norm": 0.17808466214916618, "learning_rate": 3.75702034658944e-05, "loss": 0.394, "step": 1972 }, { "epoch": 2.838848920863309, "grad_norm": 0.16906371940273435, "learning_rate": 3.753009224421373e-05, "loss": 0.3765, "step": 1973 }, { "epoch": 2.8402877697841724, "grad_norm": 0.18308010487641513, "learning_rate": 3.748998351554684e-05, "loss": 0.3903, "step": 1974 }, { "epoch": 2.841726618705036, "grad_norm": 0.2225548415238196, "learning_rate": 3.74498773203777e-05, "loss": 0.3846, "step": 1975 }, { "epoch": 2.8431654676258993, "grad_norm": 0.16580518917297782, "learning_rate": 3.7409773699187664e-05, "loss": 0.3866, "step": 1976 }, { "epoch": 2.8446043165467625, "grad_norm": 0.18060687743856188, "learning_rate": 3.736967269245555e-05, "loss": 0.3752, "step": 1977 }, { "epoch": 2.846043165467626, "grad_norm": 0.2442856647276992, "learning_rate": 3.732957434065751e-05, "loss": 0.3825, "step": 1978 }, { "epoch": 2.8474820143884894, "grad_norm": 0.17377603588060617, "learning_rate": 3.728947868426701e-05, "loss": 0.3918, "step": 1979 }, { "epoch": 2.8489208633093526, "grad_norm": 0.17366991770930496, "learning_rate": 3.724938576375479e-05, "loss": 0.3872, "step": 1980 }, { "epoch": 2.850359712230216, "grad_norm": 0.16520872352908733, "learning_rate": 3.7209295619588856e-05, "loss": 0.3831, "step": 1981 }, { "epoch": 2.851798561151079, "grad_norm": 0.17358900130450544, "learning_rate": 3.7169208292234395e-05, "loss": 0.3847, "step": 1982 }, { "epoch": 2.8532374100719426, "grad_norm": 0.16055853887610544, "learning_rate": 3.7129123822153746e-05, "loss": 0.3761, "step": 1983 }, { "epoch": 2.854676258992806, "grad_norm": 0.16978866572029805, "learning_rate": 3.708904224980636e-05, "loss": 0.3797, "step": 1984 }, { "epoch": 2.856115107913669, "grad_norm": 0.16168212577426672, "learning_rate": 3.704896361564881e-05, "loss": 0.3857, "step": 1985 }, { "epoch": 2.8575539568345323, "grad_norm": 0.21795965698751302, "learning_rate": 3.700888796013462e-05, "loss": 0.387, "step": 1986 }, { "epoch": 2.8589928057553955, "grad_norm": 0.2072773046432437, "learning_rate": 3.696881532371439e-05, "loss": 0.3751, "step": 1987 }, { "epoch": 2.860431654676259, "grad_norm": 0.17241642786348224, "learning_rate": 3.692874574683562e-05, "loss": 0.3925, "step": 1988 }, { "epoch": 2.8618705035971224, "grad_norm": 0.24022281114246952, "learning_rate": 3.688867926994274e-05, "loss": 0.3833, "step": 1989 }, { "epoch": 2.8633093525179856, "grad_norm": 0.2016307515118801, "learning_rate": 3.684861593347705e-05, "loss": 0.3823, "step": 1990 }, { "epoch": 2.864748201438849, "grad_norm": 0.23297438393509676, "learning_rate": 3.6808555777876673e-05, "loss": 0.3765, "step": 1991 }, { "epoch": 2.866187050359712, "grad_norm": 0.24468893457129012, "learning_rate": 3.676849884357655e-05, "loss": 0.3758, "step": 1992 }, { "epoch": 2.8676258992805757, "grad_norm": 0.19354597620274588, "learning_rate": 3.672844517100833e-05, "loss": 0.3831, "step": 1993 }, { "epoch": 2.869064748201439, "grad_norm": 0.20311587736464948, "learning_rate": 3.66883948006004e-05, "loss": 0.3817, "step": 1994 }, { "epoch": 2.870503597122302, "grad_norm": 0.21481338071322306, "learning_rate": 3.664834777277777e-05, "loss": 0.3875, "step": 1995 }, { "epoch": 2.8719424460431657, "grad_norm": 0.22454309712498646, "learning_rate": 3.6608304127962166e-05, "loss": 0.3809, "step": 1996 }, { "epoch": 2.873381294964029, "grad_norm": 0.21854088471309374, "learning_rate": 3.656826390657179e-05, "loss": 0.3895, "step": 1997 }, { "epoch": 2.874820143884892, "grad_norm": 0.18912763371226035, "learning_rate": 3.6528227149021455e-05, "loss": 0.3736, "step": 1998 }, { "epoch": 2.8762589928057554, "grad_norm": 0.24831039293665552, "learning_rate": 3.648819389572248e-05, "loss": 0.3767, "step": 1999 }, { "epoch": 2.8776978417266186, "grad_norm": 0.21648203780760272, "learning_rate": 3.644816418708261e-05, "loss": 0.3798, "step": 2000 }, { "epoch": 2.8791366906474822, "grad_norm": 0.20509193910389217, "learning_rate": 3.6408138063506057e-05, "loss": 0.389, "step": 2001 }, { "epoch": 2.8805755395683454, "grad_norm": 0.21791850596163106, "learning_rate": 3.636811556539335e-05, "loss": 0.3792, "step": 2002 }, { "epoch": 2.8820143884892087, "grad_norm": 0.2554805546344398, "learning_rate": 3.6328096733141423e-05, "loss": 0.3803, "step": 2003 }, { "epoch": 2.883453237410072, "grad_norm": 0.19046409853189405, "learning_rate": 3.6288081607143496e-05, "loss": 0.3833, "step": 2004 }, { "epoch": 2.884892086330935, "grad_norm": 0.23598509312791277, "learning_rate": 3.6248070227789034e-05, "loss": 0.3741, "step": 2005 }, { "epoch": 2.8863309352517987, "grad_norm": 0.1999608065119113, "learning_rate": 3.620806263546369e-05, "loss": 0.3837, "step": 2006 }, { "epoch": 2.887769784172662, "grad_norm": 0.17771440852964768, "learning_rate": 3.6168058870549355e-05, "loss": 0.3804, "step": 2007 }, { "epoch": 2.889208633093525, "grad_norm": 0.24204378684346625, "learning_rate": 3.612805897342405e-05, "loss": 0.3786, "step": 2008 }, { "epoch": 2.8906474820143884, "grad_norm": 0.1980316580190834, "learning_rate": 3.608806298446182e-05, "loss": 0.3836, "step": 2009 }, { "epoch": 2.8920863309352516, "grad_norm": 0.17968345202327035, "learning_rate": 3.604807094403286e-05, "loss": 0.3846, "step": 2010 }, { "epoch": 2.8935251798561152, "grad_norm": 0.1877683648006211, "learning_rate": 3.6008082892503325e-05, "loss": 0.3847, "step": 2011 }, { "epoch": 2.8949640287769784, "grad_norm": 0.20175307646308607, "learning_rate": 3.596809887023534e-05, "loss": 0.386, "step": 2012 }, { "epoch": 2.8964028776978417, "grad_norm": 0.16055923442945932, "learning_rate": 3.5928118917587e-05, "loss": 0.3871, "step": 2013 }, { "epoch": 2.897841726618705, "grad_norm": 0.19637652624877666, "learning_rate": 3.588814307491227e-05, "loss": 0.3857, "step": 2014 }, { "epoch": 2.899280575539568, "grad_norm": 0.19818332483671994, "learning_rate": 3.584817138256096e-05, "loss": 0.3836, "step": 2015 }, { "epoch": 2.9007194244604317, "grad_norm": 0.1451205820964952, "learning_rate": 3.580820388087869e-05, "loss": 0.3933, "step": 2016 }, { "epoch": 2.902158273381295, "grad_norm": 0.19786506703629175, "learning_rate": 3.5768240610206855e-05, "loss": 0.393, "step": 2017 }, { "epoch": 2.903597122302158, "grad_norm": 0.18083530222742925, "learning_rate": 3.572828161088262e-05, "loss": 0.3824, "step": 2018 }, { "epoch": 2.905035971223022, "grad_norm": 0.181636151918574, "learning_rate": 3.568832692323876e-05, "loss": 0.3828, "step": 2019 }, { "epoch": 2.906474820143885, "grad_norm": 0.23088256110171856, "learning_rate": 3.564837658760376e-05, "loss": 0.3884, "step": 2020 }, { "epoch": 2.9079136690647482, "grad_norm": 0.19929833520067272, "learning_rate": 3.560843064430168e-05, "loss": 0.384, "step": 2021 }, { "epoch": 2.9093525179856115, "grad_norm": 0.19123040805220842, "learning_rate": 3.556848913365218e-05, "loss": 0.3761, "step": 2022 }, { "epoch": 2.9107913669064747, "grad_norm": 0.22589457826263576, "learning_rate": 3.552855209597039e-05, "loss": 0.3931, "step": 2023 }, { "epoch": 2.9122302158273383, "grad_norm": 0.21781504032045054, "learning_rate": 3.548861957156698e-05, "loss": 0.3865, "step": 2024 }, { "epoch": 2.9136690647482015, "grad_norm": 0.1850197179194681, "learning_rate": 3.544869160074806e-05, "loss": 0.3864, "step": 2025 }, { "epoch": 2.9151079136690647, "grad_norm": 0.24238459049549935, "learning_rate": 3.5408768223815105e-05, "loss": 0.3801, "step": 2026 }, { "epoch": 2.916546762589928, "grad_norm": 0.21673031475617144, "learning_rate": 3.536884948106498e-05, "loss": 0.3855, "step": 2027 }, { "epoch": 2.917985611510791, "grad_norm": 0.5998072090981956, "learning_rate": 3.532893541278986e-05, "loss": 0.3929, "step": 2028 }, { "epoch": 2.919424460431655, "grad_norm": 0.2621668968319146, "learning_rate": 3.528902605927722e-05, "loss": 0.3911, "step": 2029 }, { "epoch": 2.920863309352518, "grad_norm": 0.22450894990011058, "learning_rate": 3.524912146080978e-05, "loss": 0.3798, "step": 2030 }, { "epoch": 2.9223021582733812, "grad_norm": 0.20134239652713007, "learning_rate": 3.5209221657665436e-05, "loss": 0.3831, "step": 2031 }, { "epoch": 2.9237410071942445, "grad_norm": 0.186279994472004, "learning_rate": 3.516932669011723e-05, "loss": 0.3839, "step": 2032 }, { "epoch": 2.9251798561151077, "grad_norm": 0.22831410659162382, "learning_rate": 3.512943659843337e-05, "loss": 0.3809, "step": 2033 }, { "epoch": 2.9266187050359713, "grad_norm": 0.20425913332091689, "learning_rate": 3.508955142287714e-05, "loss": 0.3891, "step": 2034 }, { "epoch": 2.9280575539568345, "grad_norm": 0.20480759469192697, "learning_rate": 3.50496712037068e-05, "loss": 0.3879, "step": 2035 }, { "epoch": 2.9294964028776977, "grad_norm": 0.2038244372869383, "learning_rate": 3.5009795981175676e-05, "loss": 0.3919, "step": 2036 }, { "epoch": 2.9309352517985614, "grad_norm": 0.22707319489735453, "learning_rate": 3.496992579553203e-05, "loss": 0.3924, "step": 2037 }, { "epoch": 2.9323741007194246, "grad_norm": 0.245562565118133, "learning_rate": 3.4930060687019015e-05, "loss": 0.3888, "step": 2038 }, { "epoch": 2.933812949640288, "grad_norm": 0.2441632626394093, "learning_rate": 3.489020069587467e-05, "loss": 0.3855, "step": 2039 }, { "epoch": 2.935251798561151, "grad_norm": 0.17745251314180563, "learning_rate": 3.485034586233192e-05, "loss": 0.39, "step": 2040 }, { "epoch": 2.9366906474820142, "grad_norm": 0.22487949386773248, "learning_rate": 3.4810496226618404e-05, "loss": 0.3975, "step": 2041 }, { "epoch": 2.938129496402878, "grad_norm": 0.20002018533834687, "learning_rate": 3.477065182895656e-05, "loss": 0.3837, "step": 2042 }, { "epoch": 2.939568345323741, "grad_norm": 0.230180258512151, "learning_rate": 3.473081270956352e-05, "loss": 0.3877, "step": 2043 }, { "epoch": 2.9410071942446043, "grad_norm": 0.1756588002638614, "learning_rate": 3.469097890865113e-05, "loss": 0.3783, "step": 2044 }, { "epoch": 2.9424460431654675, "grad_norm": 0.26835782252608176, "learning_rate": 3.465115046642581e-05, "loss": 0.3815, "step": 2045 }, { "epoch": 2.9438848920863308, "grad_norm": 0.23997963562866945, "learning_rate": 3.461132742308859e-05, "loss": 0.3771, "step": 2046 }, { "epoch": 2.9453237410071944, "grad_norm": 0.20888633375860216, "learning_rate": 3.45715098188351e-05, "loss": 0.3899, "step": 2047 }, { "epoch": 2.9467625899280576, "grad_norm": 0.20176189346580645, "learning_rate": 3.453169769385541e-05, "loss": 0.3809, "step": 2048 }, { "epoch": 2.948201438848921, "grad_norm": 0.17608840392320604, "learning_rate": 3.449189108833409e-05, "loss": 0.3832, "step": 2049 }, { "epoch": 2.949640287769784, "grad_norm": 0.1854169732719609, "learning_rate": 3.445209004245012e-05, "loss": 0.3821, "step": 2050 }, { "epoch": 2.9510791366906473, "grad_norm": 0.1619286806631231, "learning_rate": 3.441229459637693e-05, "loss": 0.3879, "step": 2051 }, { "epoch": 2.952517985611511, "grad_norm": 0.19130379361566077, "learning_rate": 3.4372504790282215e-05, "loss": 0.3875, "step": 2052 }, { "epoch": 2.953956834532374, "grad_norm": 0.19380190248276075, "learning_rate": 3.4332720664328034e-05, "loss": 0.3839, "step": 2053 }, { "epoch": 2.9553956834532373, "grad_norm": 0.22132673629007027, "learning_rate": 3.4292942258670675e-05, "loss": 0.387, "step": 2054 }, { "epoch": 2.956834532374101, "grad_norm": 0.18534009294543696, "learning_rate": 3.425316961346069e-05, "loss": 0.3953, "step": 2055 }, { "epoch": 2.9582733812949638, "grad_norm": 0.17548642072351006, "learning_rate": 3.42134027688428e-05, "loss": 0.3848, "step": 2056 }, { "epoch": 2.9597122302158274, "grad_norm": 0.20392081088111974, "learning_rate": 3.417364176495585e-05, "loss": 0.3897, "step": 2057 }, { "epoch": 2.9611510791366906, "grad_norm": 0.21337179838526357, "learning_rate": 3.4133886641932834e-05, "loss": 0.3823, "step": 2058 }, { "epoch": 2.962589928057554, "grad_norm": 0.17007119429337325, "learning_rate": 3.409413743990076e-05, "loss": 0.3929, "step": 2059 }, { "epoch": 2.9640287769784175, "grad_norm": 0.16472519586281026, "learning_rate": 3.4054394198980705e-05, "loss": 0.3787, "step": 2060 }, { "epoch": 2.9654676258992807, "grad_norm": 0.1708056234057997, "learning_rate": 3.401465695928768e-05, "loss": 0.3772, "step": 2061 }, { "epoch": 2.966906474820144, "grad_norm": 0.15953734302553965, "learning_rate": 3.3974925760930694e-05, "loss": 0.3886, "step": 2062 }, { "epoch": 2.968345323741007, "grad_norm": 0.15527382940345988, "learning_rate": 3.393520064401264e-05, "loss": 0.3803, "step": 2063 }, { "epoch": 2.9697841726618703, "grad_norm": 0.17300060715215387, "learning_rate": 3.3895481648630234e-05, "loss": 0.3851, "step": 2064 }, { "epoch": 2.971223021582734, "grad_norm": 0.16797967283810863, "learning_rate": 3.385576881487405e-05, "loss": 0.3794, "step": 2065 }, { "epoch": 2.972661870503597, "grad_norm": 0.1754748771386963, "learning_rate": 3.381606218282846e-05, "loss": 0.3903, "step": 2066 }, { "epoch": 2.9741007194244604, "grad_norm": 0.14154374320062357, "learning_rate": 3.377636179257153e-05, "loss": 0.3847, "step": 2067 }, { "epoch": 2.9755395683453236, "grad_norm": 0.19033944623921709, "learning_rate": 3.373666768417505e-05, "loss": 0.3765, "step": 2068 }, { "epoch": 2.976978417266187, "grad_norm": 0.20493110424364452, "learning_rate": 3.3696979897704466e-05, "loss": 0.3899, "step": 2069 }, { "epoch": 2.9784172661870505, "grad_norm": 0.17833206085806985, "learning_rate": 3.3657298473218864e-05, "loss": 0.3952, "step": 2070 }, { "epoch": 2.9798561151079137, "grad_norm": 0.1686821102041315, "learning_rate": 3.361762345077087e-05, "loss": 0.4012, "step": 2071 }, { "epoch": 2.981294964028777, "grad_norm": 0.2436521168045105, "learning_rate": 3.3577954870406656e-05, "loss": 0.3909, "step": 2072 }, { "epoch": 2.98273381294964, "grad_norm": 0.22817825742264522, "learning_rate": 3.3538292772165936e-05, "loss": 0.379, "step": 2073 }, { "epoch": 2.9841726618705033, "grad_norm": 0.19740041368304065, "learning_rate": 3.3498637196081825e-05, "loss": 0.3829, "step": 2074 }, { "epoch": 2.985611510791367, "grad_norm": 0.19622604138848831, "learning_rate": 3.345898818218089e-05, "loss": 0.3927, "step": 2075 }, { "epoch": 2.98705035971223, "grad_norm": 0.150841854712471, "learning_rate": 3.341934577048304e-05, "loss": 0.3838, "step": 2076 }, { "epoch": 2.9884892086330934, "grad_norm": 0.1678703869895141, "learning_rate": 3.337971000100157e-05, "loss": 0.3818, "step": 2077 }, { "epoch": 2.989928057553957, "grad_norm": 0.21269206206655902, "learning_rate": 3.334008091374303e-05, "loss": 0.3919, "step": 2078 }, { "epoch": 2.9913669064748203, "grad_norm": 0.18124149976349824, "learning_rate": 3.3300458548707214e-05, "loss": 0.386, "step": 2079 }, { "epoch": 2.9928057553956835, "grad_norm": 0.1672719747664083, "learning_rate": 3.326084294588721e-05, "loss": 0.389, "step": 2080 }, { "epoch": 2.9942446043165467, "grad_norm": 0.2155189730344991, "learning_rate": 3.322123414526917e-05, "loss": 0.3852, "step": 2081 }, { "epoch": 2.99568345323741, "grad_norm": 0.20505929598971243, "learning_rate": 3.3181632186832485e-05, "loss": 0.3914, "step": 2082 }, { "epoch": 2.9971223021582736, "grad_norm": 0.1771213290261762, "learning_rate": 3.3142037110549546e-05, "loss": 0.3786, "step": 2083 }, { "epoch": 2.998561151079137, "grad_norm": 0.20191658002270133, "learning_rate": 3.310244895638587e-05, "loss": 0.3809, "step": 2084 }, { "epoch": 3.0, "grad_norm": 0.20225356044957576, "learning_rate": 3.306286776429995e-05, "loss": 0.3747, "step": 2085 }, { "epoch": 3.001438848920863, "grad_norm": 0.20569754631248985, "learning_rate": 3.302329357424326e-05, "loss": 0.3707, "step": 2086 }, { "epoch": 3.0028776978417264, "grad_norm": 0.20114942842678957, "learning_rate": 3.2983726426160204e-05, "loss": 0.364, "step": 2087 }, { "epoch": 3.00431654676259, "grad_norm": 0.18723446351048248, "learning_rate": 3.2944166359988083e-05, "loss": 0.3681, "step": 2088 }, { "epoch": 3.0057553956834533, "grad_norm": 0.17220903321630057, "learning_rate": 3.290461341565707e-05, "loss": 0.359, "step": 2089 }, { "epoch": 3.0071942446043165, "grad_norm": 0.23430100610978008, "learning_rate": 3.286506763309009e-05, "loss": 0.3547, "step": 2090 }, { "epoch": 3.0086330935251797, "grad_norm": 0.19688002346513944, "learning_rate": 3.2825529052202904e-05, "loss": 0.3652, "step": 2091 }, { "epoch": 3.0100719424460434, "grad_norm": 0.20321597432510047, "learning_rate": 3.278599771290397e-05, "loss": 0.3519, "step": 2092 }, { "epoch": 3.0115107913669066, "grad_norm": 0.19081182863087742, "learning_rate": 3.274647365509445e-05, "loss": 0.3637, "step": 2093 }, { "epoch": 3.01294964028777, "grad_norm": 0.2328943681815441, "learning_rate": 3.2706956918668126e-05, "loss": 0.3582, "step": 2094 }, { "epoch": 3.014388489208633, "grad_norm": 0.18125767772085483, "learning_rate": 3.266744754351144e-05, "loss": 0.3603, "step": 2095 }, { "epoch": 3.015827338129496, "grad_norm": 0.20943968943457905, "learning_rate": 3.262794556950338e-05, "loss": 0.3665, "step": 2096 }, { "epoch": 3.01726618705036, "grad_norm": 0.2016966526436745, "learning_rate": 3.2588451036515435e-05, "loss": 0.362, "step": 2097 }, { "epoch": 3.018705035971223, "grad_norm": 0.18929307021417738, "learning_rate": 3.2548963984411623e-05, "loss": 0.3628, "step": 2098 }, { "epoch": 3.0201438848920863, "grad_norm": 0.2079651623363538, "learning_rate": 3.2509484453048413e-05, "loss": 0.3718, "step": 2099 }, { "epoch": 3.0215827338129495, "grad_norm": 0.17061447600345955, "learning_rate": 3.247001248227465e-05, "loss": 0.3559, "step": 2100 }, { "epoch": 3.023021582733813, "grad_norm": 0.18547696640719988, "learning_rate": 3.2430548111931574e-05, "loss": 0.3596, "step": 2101 }, { "epoch": 3.0244604316546764, "grad_norm": 0.1653250126196465, "learning_rate": 3.239109138185275e-05, "loss": 0.3607, "step": 2102 }, { "epoch": 3.0258992805755396, "grad_norm": 0.18282052325980813, "learning_rate": 3.2351642331864024e-05, "loss": 0.3606, "step": 2103 }, { "epoch": 3.027338129496403, "grad_norm": 0.17101471749049846, "learning_rate": 3.2312201001783473e-05, "loss": 0.3638, "step": 2104 }, { "epoch": 3.028776978417266, "grad_norm": 0.16413900278688257, "learning_rate": 3.2272767431421416e-05, "loss": 0.3632, "step": 2105 }, { "epoch": 3.0302158273381297, "grad_norm": 0.16625703759551594, "learning_rate": 3.2233341660580335e-05, "loss": 0.3532, "step": 2106 }, { "epoch": 3.031654676258993, "grad_norm": 0.2144615795712448, "learning_rate": 3.219392372905482e-05, "loss": 0.3674, "step": 2107 }, { "epoch": 3.033093525179856, "grad_norm": 0.14936298512309384, "learning_rate": 3.215451367663156e-05, "loss": 0.3636, "step": 2108 }, { "epoch": 3.0345323741007193, "grad_norm": 0.20451775844635806, "learning_rate": 3.211511154308927e-05, "loss": 0.3578, "step": 2109 }, { "epoch": 3.0359712230215825, "grad_norm": 0.16421410694291483, "learning_rate": 3.207571736819873e-05, "loss": 0.3619, "step": 2110 }, { "epoch": 3.037410071942446, "grad_norm": 0.16014427605999473, "learning_rate": 3.203633119172262e-05, "loss": 0.3672, "step": 2111 }, { "epoch": 3.0388489208633094, "grad_norm": 0.16802997831800656, "learning_rate": 3.1996953053415575e-05, "loss": 0.3634, "step": 2112 }, { "epoch": 3.0402877697841726, "grad_norm": 0.19318307562131506, "learning_rate": 3.1957582993024135e-05, "loss": 0.3665, "step": 2113 }, { "epoch": 3.041726618705036, "grad_norm": 0.17388976938204928, "learning_rate": 3.191822105028665e-05, "loss": 0.3561, "step": 2114 }, { "epoch": 3.0431654676258995, "grad_norm": 0.1645764090504961, "learning_rate": 3.1878867264933305e-05, "loss": 0.3542, "step": 2115 }, { "epoch": 3.0446043165467627, "grad_norm": 0.18244817741104855, "learning_rate": 3.1839521676686026e-05, "loss": 0.3561, "step": 2116 }, { "epoch": 3.046043165467626, "grad_norm": 0.16508563331234616, "learning_rate": 3.1800184325258494e-05, "loss": 0.3685, "step": 2117 }, { "epoch": 3.047482014388489, "grad_norm": 0.16473841098624484, "learning_rate": 3.176085525035607e-05, "loss": 0.362, "step": 2118 }, { "epoch": 3.0489208633093523, "grad_norm": 0.15113713787417427, "learning_rate": 3.172153449167574e-05, "loss": 0.3584, "step": 2119 }, { "epoch": 3.050359712230216, "grad_norm": 0.19044107514465825, "learning_rate": 3.1682222088906096e-05, "loss": 0.3588, "step": 2120 }, { "epoch": 3.051798561151079, "grad_norm": 0.1478330796372751, "learning_rate": 3.1642918081727327e-05, "loss": 0.3642, "step": 2121 }, { "epoch": 3.0532374100719424, "grad_norm": 0.19689036313293712, "learning_rate": 3.1603622509811144e-05, "loss": 0.3609, "step": 2122 }, { "epoch": 3.0546762589928056, "grad_norm": 0.20486526790795795, "learning_rate": 3.156433541282069e-05, "loss": 0.3623, "step": 2123 }, { "epoch": 3.0561151079136692, "grad_norm": 0.15617141094534115, "learning_rate": 3.152505683041062e-05, "loss": 0.3593, "step": 2124 }, { "epoch": 3.0575539568345325, "grad_norm": 0.2027148810745762, "learning_rate": 3.1485786802226976e-05, "loss": 0.3646, "step": 2125 }, { "epoch": 3.0589928057553957, "grad_norm": 0.21447667121894384, "learning_rate": 3.1446525367907134e-05, "loss": 0.3602, "step": 2126 }, { "epoch": 3.060431654676259, "grad_norm": 0.15381484589567843, "learning_rate": 3.1407272567079834e-05, "loss": 0.3639, "step": 2127 }, { "epoch": 3.061870503597122, "grad_norm": 0.1872412094336588, "learning_rate": 3.136802843936509e-05, "loss": 0.3581, "step": 2128 }, { "epoch": 3.0633093525179858, "grad_norm": 0.2037870768861435, "learning_rate": 3.132879302437416e-05, "loss": 0.3593, "step": 2129 }, { "epoch": 3.064748201438849, "grad_norm": 0.16557314944365134, "learning_rate": 3.128956636170949e-05, "loss": 0.3676, "step": 2130 }, { "epoch": 3.066187050359712, "grad_norm": 0.21739010073156842, "learning_rate": 3.125034849096471e-05, "loss": 0.3616, "step": 2131 }, { "epoch": 3.0676258992805754, "grad_norm": 0.23829079888615076, "learning_rate": 3.1211139451724605e-05, "loss": 0.3666, "step": 2132 }, { "epoch": 3.069064748201439, "grad_norm": 0.19003741526013926, "learning_rate": 3.1171939283564986e-05, "loss": 0.3624, "step": 2133 }, { "epoch": 3.0705035971223023, "grad_norm": 0.19280640697509915, "learning_rate": 3.113274802605276e-05, "loss": 0.3649, "step": 2134 }, { "epoch": 3.0719424460431655, "grad_norm": 0.17465491343281253, "learning_rate": 3.109356571874579e-05, "loss": 0.3597, "step": 2135 }, { "epoch": 3.0733812949640287, "grad_norm": 0.1575551077356398, "learning_rate": 3.105439240119296e-05, "loss": 0.3601, "step": 2136 }, { "epoch": 3.074820143884892, "grad_norm": 0.1582159815146858, "learning_rate": 3.101522811293405e-05, "loss": 0.354, "step": 2137 }, { "epoch": 3.0762589928057555, "grad_norm": 0.1532665643566935, "learning_rate": 3.0976072893499724e-05, "loss": 0.3493, "step": 2138 }, { "epoch": 3.0776978417266188, "grad_norm": 0.1268319703596362, "learning_rate": 3.093692678241151e-05, "loss": 0.3533, "step": 2139 }, { "epoch": 3.079136690647482, "grad_norm": 0.17721432868865453, "learning_rate": 3.0897789819181715e-05, "loss": 0.3524, "step": 2140 }, { "epoch": 3.080575539568345, "grad_norm": 0.17209923810182967, "learning_rate": 3.0858662043313456e-05, "loss": 0.3543, "step": 2141 }, { "epoch": 3.082014388489209, "grad_norm": 0.1642312010716747, "learning_rate": 3.081954349430051e-05, "loss": 0.3562, "step": 2142 }, { "epoch": 3.083453237410072, "grad_norm": 0.17503639457937903, "learning_rate": 3.0780434211627415e-05, "loss": 0.361, "step": 2143 }, { "epoch": 3.0848920863309353, "grad_norm": 0.16118245417268506, "learning_rate": 3.074133423476932e-05, "loss": 0.3644, "step": 2144 }, { "epoch": 3.0863309352517985, "grad_norm": 0.14723476541237263, "learning_rate": 3.070224360319197e-05, "loss": 0.362, "step": 2145 }, { "epoch": 3.0877697841726617, "grad_norm": 0.14714634472098959, "learning_rate": 3.066316235635168e-05, "loss": 0.3614, "step": 2146 }, { "epoch": 3.0892086330935253, "grad_norm": 0.15193247445136177, "learning_rate": 3.0624090533695324e-05, "loss": 0.352, "step": 2147 }, { "epoch": 3.0906474820143885, "grad_norm": 0.18749179624902063, "learning_rate": 3.0585028174660236e-05, "loss": 0.3642, "step": 2148 }, { "epoch": 3.0920863309352518, "grad_norm": 0.1392666647698655, "learning_rate": 3.054597531867419e-05, "loss": 0.3649, "step": 2149 }, { "epoch": 3.093525179856115, "grad_norm": 0.16501078769019958, "learning_rate": 3.0506932005155407e-05, "loss": 0.3628, "step": 2150 }, { "epoch": 3.0949640287769786, "grad_norm": 0.17065492296996415, "learning_rate": 3.0467898273512446e-05, "loss": 0.3673, "step": 2151 }, { "epoch": 3.096402877697842, "grad_norm": 0.15350854835529376, "learning_rate": 3.042887416314418e-05, "loss": 0.3567, "step": 2152 }, { "epoch": 3.097841726618705, "grad_norm": 0.3506708121375363, "learning_rate": 3.03898597134398e-05, "loss": 0.3618, "step": 2153 }, { "epoch": 3.0992805755395683, "grad_norm": 0.1294663959546018, "learning_rate": 3.0350854963778755e-05, "loss": 0.3574, "step": 2154 }, { "epoch": 3.1007194244604315, "grad_norm": 0.14377101612817697, "learning_rate": 3.0311859953530672e-05, "loss": 0.3537, "step": 2155 }, { "epoch": 3.102158273381295, "grad_norm": 0.13514133077016605, "learning_rate": 3.027287472205535e-05, "loss": 0.3686, "step": 2156 }, { "epoch": 3.1035971223021583, "grad_norm": 0.19111383611662044, "learning_rate": 3.0233899308702722e-05, "loss": 0.36, "step": 2157 }, { "epoch": 3.1050359712230216, "grad_norm": 0.14677733546118896, "learning_rate": 3.0194933752812853e-05, "loss": 0.3648, "step": 2158 }, { "epoch": 3.1064748201438848, "grad_norm": 0.1485521726666662, "learning_rate": 3.0155978093715787e-05, "loss": 0.3561, "step": 2159 }, { "epoch": 3.1079136690647484, "grad_norm": 0.1627671669920901, "learning_rate": 3.011703237073162e-05, "loss": 0.3623, "step": 2160 }, { "epoch": 3.1093525179856116, "grad_norm": 0.16502848496230754, "learning_rate": 3.0078096623170442e-05, "loss": 0.3664, "step": 2161 }, { "epoch": 3.110791366906475, "grad_norm": 0.13162514219343133, "learning_rate": 3.0039170890332214e-05, "loss": 0.3602, "step": 2162 }, { "epoch": 3.112230215827338, "grad_norm": 0.15660829701380588, "learning_rate": 3.0000255211506836e-05, "loss": 0.3653, "step": 2163 }, { "epoch": 3.1136690647482013, "grad_norm": 0.17539034318923397, "learning_rate": 2.9961349625974022e-05, "loss": 0.3618, "step": 2164 }, { "epoch": 3.115107913669065, "grad_norm": 0.14793606029669693, "learning_rate": 2.992245417300335e-05, "loss": 0.355, "step": 2165 }, { "epoch": 3.116546762589928, "grad_norm": 0.15900645500930594, "learning_rate": 2.9883568891854118e-05, "loss": 0.3638, "step": 2166 }, { "epoch": 3.1179856115107913, "grad_norm": 0.1612124230990281, "learning_rate": 2.9844693821775394e-05, "loss": 0.3668, "step": 2167 }, { "epoch": 3.1194244604316546, "grad_norm": 0.1511141171936181, "learning_rate": 2.9805829002005907e-05, "loss": 0.3589, "step": 2168 }, { "epoch": 3.1208633093525178, "grad_norm": 0.16026881604644838, "learning_rate": 2.9766974471774072e-05, "loss": 0.3597, "step": 2169 }, { "epoch": 3.1223021582733814, "grad_norm": 0.1595382668139755, "learning_rate": 2.9728130270297913e-05, "loss": 0.3675, "step": 2170 }, { "epoch": 3.1237410071942446, "grad_norm": 0.15483819300457385, "learning_rate": 2.968929643678499e-05, "loss": 0.3665, "step": 2171 }, { "epoch": 3.125179856115108, "grad_norm": 0.1747356909354723, "learning_rate": 2.965047301043246e-05, "loss": 0.3625, "step": 2172 }, { "epoch": 3.126618705035971, "grad_norm": 0.1577084537740128, "learning_rate": 2.961166003042692e-05, "loss": 0.3563, "step": 2173 }, { "epoch": 3.1280575539568347, "grad_norm": 0.16341455776658428, "learning_rate": 2.9572857535944473e-05, "loss": 0.3745, "step": 2174 }, { "epoch": 3.129496402877698, "grad_norm": 0.1478147632598593, "learning_rate": 2.9534065566150567e-05, "loss": 0.3629, "step": 2175 }, { "epoch": 3.130935251798561, "grad_norm": 0.13978710000013367, "learning_rate": 2.9495284160200105e-05, "loss": 0.3634, "step": 2176 }, { "epoch": 3.1323741007194243, "grad_norm": 0.1372616987172764, "learning_rate": 2.9456513357237305e-05, "loss": 0.3619, "step": 2177 }, { "epoch": 3.133812949640288, "grad_norm": 0.13634600501027924, "learning_rate": 2.9417753196395637e-05, "loss": 0.3569, "step": 2178 }, { "epoch": 3.135251798561151, "grad_norm": 0.13330859445128854, "learning_rate": 2.9379003716797877e-05, "loss": 0.3638, "step": 2179 }, { "epoch": 3.1366906474820144, "grad_norm": 0.1387344689681127, "learning_rate": 2.9340264957556018e-05, "loss": 0.3651, "step": 2180 }, { "epoch": 3.1381294964028776, "grad_norm": 0.11495602812743524, "learning_rate": 2.9301536957771218e-05, "loss": 0.3673, "step": 2181 }, { "epoch": 3.139568345323741, "grad_norm": 0.14156198330485406, "learning_rate": 2.9262819756533754e-05, "loss": 0.3601, "step": 2182 }, { "epoch": 3.1410071942446045, "grad_norm": 0.13409357944157357, "learning_rate": 2.922411339292306e-05, "loss": 0.3566, "step": 2183 }, { "epoch": 3.1424460431654677, "grad_norm": 0.1378045890509274, "learning_rate": 2.9185417906007586e-05, "loss": 0.364, "step": 2184 }, { "epoch": 3.143884892086331, "grad_norm": 0.13291284842051967, "learning_rate": 2.914673333484481e-05, "loss": 0.3656, "step": 2185 }, { "epoch": 3.145323741007194, "grad_norm": 0.13332793776054366, "learning_rate": 2.9108059718481184e-05, "loss": 0.3554, "step": 2186 }, { "epoch": 3.1467625899280574, "grad_norm": 0.15458068216679197, "learning_rate": 2.906939709595216e-05, "loss": 0.3652, "step": 2187 }, { "epoch": 3.148201438848921, "grad_norm": 0.15967056211424555, "learning_rate": 2.9030745506282017e-05, "loss": 0.3563, "step": 2188 }, { "epoch": 3.149640287769784, "grad_norm": 0.1834537720451152, "learning_rate": 2.8992104988483943e-05, "loss": 0.3622, "step": 2189 }, { "epoch": 3.1510791366906474, "grad_norm": 0.15455959102393504, "learning_rate": 2.895347558155992e-05, "loss": 0.3581, "step": 2190 }, { "epoch": 3.1525179856115106, "grad_norm": 0.18274074548645566, "learning_rate": 2.8914857324500767e-05, "loss": 0.3619, "step": 2191 }, { "epoch": 3.1539568345323743, "grad_norm": 0.19042272209457872, "learning_rate": 2.887625025628599e-05, "loss": 0.3605, "step": 2192 }, { "epoch": 3.1553956834532375, "grad_norm": 0.178393476055343, "learning_rate": 2.8837654415883817e-05, "loss": 0.362, "step": 2193 }, { "epoch": 3.1568345323741007, "grad_norm": 0.17452940444183646, "learning_rate": 2.879906984225119e-05, "loss": 0.364, "step": 2194 }, { "epoch": 3.158273381294964, "grad_norm": 0.19848713272935012, "learning_rate": 2.8760496574333613e-05, "loss": 0.3637, "step": 2195 }, { "epoch": 3.159712230215827, "grad_norm": 0.22150541825308392, "learning_rate": 2.8721934651065227e-05, "loss": 0.3602, "step": 2196 }, { "epoch": 3.161151079136691, "grad_norm": 0.1564675995472657, "learning_rate": 2.8683384111368675e-05, "loss": 0.3656, "step": 2197 }, { "epoch": 3.162589928057554, "grad_norm": 0.17609019863597677, "learning_rate": 2.864484499415517e-05, "loss": 0.3566, "step": 2198 }, { "epoch": 3.1640287769784172, "grad_norm": 0.1619486377977057, "learning_rate": 2.8606317338324347e-05, "loss": 0.3605, "step": 2199 }, { "epoch": 3.1654676258992804, "grad_norm": 0.14723851731572016, "learning_rate": 2.856780118276429e-05, "loss": 0.3631, "step": 2200 }, { "epoch": 3.166906474820144, "grad_norm": 0.16034339510914788, "learning_rate": 2.852929656635146e-05, "loss": 0.3668, "step": 2201 }, { "epoch": 3.1683453237410073, "grad_norm": 0.14324836151156242, "learning_rate": 2.8490803527950706e-05, "loss": 0.3573, "step": 2202 }, { "epoch": 3.1697841726618705, "grad_norm": 0.13913473975969853, "learning_rate": 2.845232210641517e-05, "loss": 0.3702, "step": 2203 }, { "epoch": 3.1712230215827337, "grad_norm": 0.13827107183418555, "learning_rate": 2.841385234058624e-05, "loss": 0.3655, "step": 2204 }, { "epoch": 3.172661870503597, "grad_norm": 0.14026787843847674, "learning_rate": 2.83753942692936e-05, "loss": 0.351, "step": 2205 }, { "epoch": 3.1741007194244606, "grad_norm": 0.1392484108958197, "learning_rate": 2.8336947931355096e-05, "loss": 0.36, "step": 2206 }, { "epoch": 3.175539568345324, "grad_norm": 0.15614689262731513, "learning_rate": 2.8298513365576715e-05, "loss": 0.3602, "step": 2207 }, { "epoch": 3.176978417266187, "grad_norm": 0.1614798507847841, "learning_rate": 2.826009061075257e-05, "loss": 0.3571, "step": 2208 }, { "epoch": 3.1784172661870502, "grad_norm": 0.14968532541531165, "learning_rate": 2.822167970566488e-05, "loss": 0.3592, "step": 2209 }, { "epoch": 3.1798561151079134, "grad_norm": 0.14943552936276022, "learning_rate": 2.8183280689083895e-05, "loss": 0.3552, "step": 2210 }, { "epoch": 3.181294964028777, "grad_norm": 0.13942741347585594, "learning_rate": 2.8144893599767828e-05, "loss": 0.3539, "step": 2211 }, { "epoch": 3.1827338129496403, "grad_norm": 0.14109944425966342, "learning_rate": 2.8106518476462886e-05, "loss": 0.3633, "step": 2212 }, { "epoch": 3.1841726618705035, "grad_norm": 0.12957269945337271, "learning_rate": 2.806815535790321e-05, "loss": 0.3558, "step": 2213 }, { "epoch": 3.1856115107913667, "grad_norm": 0.17172783398095212, "learning_rate": 2.8029804282810794e-05, "loss": 0.3644, "step": 2214 }, { "epoch": 3.1870503597122304, "grad_norm": 0.1695904394556663, "learning_rate": 2.7991465289895497e-05, "loss": 0.3689, "step": 2215 }, { "epoch": 3.1884892086330936, "grad_norm": 0.15800796596879413, "learning_rate": 2.7953138417854952e-05, "loss": 0.3641, "step": 2216 }, { "epoch": 3.189928057553957, "grad_norm": 0.13255286416329495, "learning_rate": 2.79148237053746e-05, "loss": 0.3644, "step": 2217 }, { "epoch": 3.19136690647482, "grad_norm": 0.16659802965723186, "learning_rate": 2.787652119112758e-05, "loss": 0.3613, "step": 2218 }, { "epoch": 3.1928057553956837, "grad_norm": 0.13041920241377952, "learning_rate": 2.783823091377472e-05, "loss": 0.3581, "step": 2219 }, { "epoch": 3.194244604316547, "grad_norm": 0.14796305058514558, "learning_rate": 2.7799952911964535e-05, "loss": 0.3626, "step": 2220 }, { "epoch": 3.19568345323741, "grad_norm": 0.15254320307560715, "learning_rate": 2.776168722433308e-05, "loss": 0.3697, "step": 2221 }, { "epoch": 3.1971223021582733, "grad_norm": 0.14452204938283672, "learning_rate": 2.7723433889504046e-05, "loss": 0.3625, "step": 2222 }, { "epoch": 3.1985611510791365, "grad_norm": 0.16913298343868277, "learning_rate": 2.7685192946088597e-05, "loss": 0.355, "step": 2223 }, { "epoch": 3.2, "grad_norm": 0.14654307628150673, "learning_rate": 2.7646964432685456e-05, "loss": 0.3628, "step": 2224 }, { "epoch": 3.2014388489208634, "grad_norm": 0.16711073485909242, "learning_rate": 2.7608748387880754e-05, "loss": 0.3677, "step": 2225 }, { "epoch": 3.2028776978417266, "grad_norm": 0.1553248559515571, "learning_rate": 2.7570544850248047e-05, "loss": 0.3607, "step": 2226 }, { "epoch": 3.20431654676259, "grad_norm": 0.14502630303023836, "learning_rate": 2.753235385834824e-05, "loss": 0.3706, "step": 2227 }, { "epoch": 3.205755395683453, "grad_norm": 0.1722575599110508, "learning_rate": 2.749417545072964e-05, "loss": 0.3615, "step": 2228 }, { "epoch": 3.2071942446043167, "grad_norm": 0.14424325687418108, "learning_rate": 2.7456009665927807e-05, "loss": 0.3625, "step": 2229 }, { "epoch": 3.20863309352518, "grad_norm": 0.14399380358513653, "learning_rate": 2.741785654246555e-05, "loss": 0.3693, "step": 2230 }, { "epoch": 3.210071942446043, "grad_norm": 0.15737465739846349, "learning_rate": 2.7379716118852927e-05, "loss": 0.3584, "step": 2231 }, { "epoch": 3.2115107913669063, "grad_norm": 0.18840367516428955, "learning_rate": 2.734158843358718e-05, "loss": 0.3555, "step": 2232 }, { "epoch": 3.21294964028777, "grad_norm": 0.20377731475847594, "learning_rate": 2.730347352515266e-05, "loss": 0.3488, "step": 2233 }, { "epoch": 3.214388489208633, "grad_norm": 0.13565411926982154, "learning_rate": 2.7265371432020836e-05, "loss": 0.3682, "step": 2234 }, { "epoch": 3.2158273381294964, "grad_norm": 0.2139869802667484, "learning_rate": 2.7227282192650258e-05, "loss": 0.3581, "step": 2235 }, { "epoch": 3.2172661870503596, "grad_norm": 0.17260179079279206, "learning_rate": 2.7189205845486503e-05, "loss": 0.3596, "step": 2236 }, { "epoch": 3.218705035971223, "grad_norm": 0.14636440232485984, "learning_rate": 2.7151142428962103e-05, "loss": 0.3656, "step": 2237 }, { "epoch": 3.2201438848920865, "grad_norm": 0.18774944778882494, "learning_rate": 2.711309198149655e-05, "loss": 0.3597, "step": 2238 }, { "epoch": 3.2215827338129497, "grad_norm": 0.16589078085953027, "learning_rate": 2.7075054541496296e-05, "loss": 0.3607, "step": 2239 }, { "epoch": 3.223021582733813, "grad_norm": 0.12980125257144026, "learning_rate": 2.7037030147354582e-05, "loss": 0.3569, "step": 2240 }, { "epoch": 3.224460431654676, "grad_norm": 0.12139736644831225, "learning_rate": 2.6999018837451523e-05, "loss": 0.3647, "step": 2241 }, { "epoch": 3.2258992805755398, "grad_norm": 0.1492330721671925, "learning_rate": 2.6961020650154057e-05, "loss": 0.3647, "step": 2242 }, { "epoch": 3.227338129496403, "grad_norm": 1.35098016235772, "learning_rate": 2.6923035623815824e-05, "loss": 0.3719, "step": 2243 }, { "epoch": 3.228776978417266, "grad_norm": 0.1585641258903242, "learning_rate": 2.6885063796777195e-05, "loss": 0.3579, "step": 2244 }, { "epoch": 3.2302158273381294, "grad_norm": 0.14264742791391744, "learning_rate": 2.6847105207365225e-05, "loss": 0.3533, "step": 2245 }, { "epoch": 3.2316546762589926, "grad_norm": 0.14495436282817886, "learning_rate": 2.6809159893893624e-05, "loss": 0.361, "step": 2246 }, { "epoch": 3.2330935251798563, "grad_norm": 0.15342747948375662, "learning_rate": 2.6771227894662666e-05, "loss": 0.3547, "step": 2247 }, { "epoch": 3.2345323741007195, "grad_norm": 0.12183175364710477, "learning_rate": 2.6733309247959217e-05, "loss": 0.3525, "step": 2248 }, { "epoch": 3.2359712230215827, "grad_norm": 0.15689188085640454, "learning_rate": 2.669540399205664e-05, "loss": 0.3593, "step": 2249 }, { "epoch": 3.237410071942446, "grad_norm": 0.1332782811429741, "learning_rate": 2.6657512165214806e-05, "loss": 0.3537, "step": 2250 }, { "epoch": 3.2388489208633096, "grad_norm": 0.16238962487909972, "learning_rate": 2.6619633805680028e-05, "loss": 0.3579, "step": 2251 }, { "epoch": 3.2402877697841728, "grad_norm": 0.1522823914204186, "learning_rate": 2.6581768951684992e-05, "loss": 0.364, "step": 2252 }, { "epoch": 3.241726618705036, "grad_norm": 0.14345631035638734, "learning_rate": 2.6543917641448813e-05, "loss": 0.3644, "step": 2253 }, { "epoch": 3.243165467625899, "grad_norm": 0.13918719422262607, "learning_rate": 2.650607991317687e-05, "loss": 0.362, "step": 2254 }, { "epoch": 3.2446043165467624, "grad_norm": 0.1367299718289178, "learning_rate": 2.6468255805060885e-05, "loss": 0.3542, "step": 2255 }, { "epoch": 3.246043165467626, "grad_norm": 0.14431389291588131, "learning_rate": 2.6430445355278788e-05, "loss": 0.3688, "step": 2256 }, { "epoch": 3.2474820143884893, "grad_norm": 0.41771378248386287, "learning_rate": 2.639264860199477e-05, "loss": 0.3616, "step": 2257 }, { "epoch": 3.2489208633093525, "grad_norm": 0.14462195721294618, "learning_rate": 2.6354865583359175e-05, "loss": 0.3607, "step": 2258 }, { "epoch": 3.2503597122302157, "grad_norm": 0.11415729967552433, "learning_rate": 2.631709633750847e-05, "loss": 0.3632, "step": 2259 }, { "epoch": 3.2517985611510793, "grad_norm": 0.15255190313869024, "learning_rate": 2.6279340902565217e-05, "loss": 0.3578, "step": 2260 }, { "epoch": 3.2532374100719426, "grad_norm": 0.13805446285540282, "learning_rate": 2.6241599316638084e-05, "loss": 0.3563, "step": 2261 }, { "epoch": 3.2546762589928058, "grad_norm": 0.1418527357355985, "learning_rate": 2.6203871617821717e-05, "loss": 0.3612, "step": 2262 }, { "epoch": 3.256115107913669, "grad_norm": 0.14013824159648403, "learning_rate": 2.6166157844196755e-05, "loss": 0.3538, "step": 2263 }, { "epoch": 3.257553956834532, "grad_norm": 0.13935395781290988, "learning_rate": 2.6128458033829792e-05, "loss": 0.3582, "step": 2264 }, { "epoch": 3.258992805755396, "grad_norm": 0.14360309972844257, "learning_rate": 2.609077222477332e-05, "loss": 0.361, "step": 2265 }, { "epoch": 3.260431654676259, "grad_norm": 0.17150198840354863, "learning_rate": 2.6053100455065693e-05, "loss": 0.3585, "step": 2266 }, { "epoch": 3.2618705035971223, "grad_norm": 0.13678134536281703, "learning_rate": 2.6015442762731095e-05, "loss": 0.3621, "step": 2267 }, { "epoch": 3.2633093525179855, "grad_norm": 0.18600744624660287, "learning_rate": 2.5977799185779534e-05, "loss": 0.3648, "step": 2268 }, { "epoch": 3.2647482014388487, "grad_norm": 0.20982476736184863, "learning_rate": 2.5940169762206722e-05, "loss": 0.3559, "step": 2269 }, { "epoch": 3.2661870503597124, "grad_norm": 0.21102441705807895, "learning_rate": 2.5902554529994105e-05, "loss": 0.3649, "step": 2270 }, { "epoch": 3.2676258992805756, "grad_norm": 0.15796192676257098, "learning_rate": 2.5864953527108805e-05, "loss": 0.3669, "step": 2271 }, { "epoch": 3.2690647482014388, "grad_norm": 0.2025325960295383, "learning_rate": 2.58273667915036e-05, "loss": 0.3578, "step": 2272 }, { "epoch": 3.270503597122302, "grad_norm": 0.20369588142188552, "learning_rate": 2.578979436111684e-05, "loss": 0.3594, "step": 2273 }, { "epoch": 3.2719424460431656, "grad_norm": 0.15584996538262325, "learning_rate": 2.5752236273872432e-05, "loss": 0.3693, "step": 2274 }, { "epoch": 3.273381294964029, "grad_norm": 0.29022040048565395, "learning_rate": 2.5714692567679853e-05, "loss": 0.358, "step": 2275 }, { "epoch": 3.274820143884892, "grad_norm": 0.22458395920921975, "learning_rate": 2.5677163280433995e-05, "loss": 0.3566, "step": 2276 }, { "epoch": 3.2762589928057553, "grad_norm": 0.18734340823549248, "learning_rate": 2.5639648450015268e-05, "loss": 0.3582, "step": 2277 }, { "epoch": 3.277697841726619, "grad_norm": 0.26602126473058135, "learning_rate": 2.5602148114289415e-05, "loss": 0.3626, "step": 2278 }, { "epoch": 3.279136690647482, "grad_norm": 0.1681330470232265, "learning_rate": 2.556466231110762e-05, "loss": 0.3646, "step": 2279 }, { "epoch": 3.2805755395683454, "grad_norm": 0.1992067827528513, "learning_rate": 2.552719107830635e-05, "loss": 0.3509, "step": 2280 }, { "epoch": 3.2820143884892086, "grad_norm": 0.18565700988937925, "learning_rate": 2.54897344537074e-05, "loss": 0.3724, "step": 2281 }, { "epoch": 3.283453237410072, "grad_norm": 0.1512645675478467, "learning_rate": 2.5452292475117767e-05, "loss": 0.3633, "step": 2282 }, { "epoch": 3.2848920863309354, "grad_norm": 0.1904658977836513, "learning_rate": 2.541486518032973e-05, "loss": 0.3627, "step": 2283 }, { "epoch": 3.2863309352517986, "grad_norm": 0.1931450539981554, "learning_rate": 2.5377452607120722e-05, "loss": 0.3556, "step": 2284 }, { "epoch": 3.287769784172662, "grad_norm": 0.1856206254349257, "learning_rate": 2.5340054793253276e-05, "loss": 0.3728, "step": 2285 }, { "epoch": 3.289208633093525, "grad_norm": 0.13019591341177691, "learning_rate": 2.5302671776475098e-05, "loss": 0.3558, "step": 2286 }, { "epoch": 3.2906474820143883, "grad_norm": 0.21782000590473927, "learning_rate": 2.526530359451892e-05, "loss": 0.3662, "step": 2287 }, { "epoch": 3.292086330935252, "grad_norm": 0.1734671650120193, "learning_rate": 2.522795028510249e-05, "loss": 0.3546, "step": 2288 }, { "epoch": 3.293525179856115, "grad_norm": 0.15642074578337598, "learning_rate": 2.5190611885928547e-05, "loss": 0.3673, "step": 2289 }, { "epoch": 3.2949640287769784, "grad_norm": 0.22968584304270215, "learning_rate": 2.5153288434684816e-05, "loss": 0.3634, "step": 2290 }, { "epoch": 3.2964028776978416, "grad_norm": 0.14230543830002376, "learning_rate": 2.5115979969043914e-05, "loss": 0.3618, "step": 2291 }, { "epoch": 3.2978417266187052, "grad_norm": 0.26943377434593496, "learning_rate": 2.5078686526663304e-05, "loss": 0.3717, "step": 2292 }, { "epoch": 3.2992805755395684, "grad_norm": 0.6120181192293852, "learning_rate": 2.5041408145185312e-05, "loss": 0.3615, "step": 2293 }, { "epoch": 3.3007194244604317, "grad_norm": 0.19122352513352453, "learning_rate": 2.5004144862237084e-05, "loss": 0.3679, "step": 2294 }, { "epoch": 3.302158273381295, "grad_norm": 0.23795571722273903, "learning_rate": 2.4966896715430484e-05, "loss": 0.3607, "step": 2295 }, { "epoch": 3.3035971223021585, "grad_norm": 0.1896858659606159, "learning_rate": 2.4929663742362103e-05, "loss": 0.3653, "step": 2296 }, { "epoch": 3.3050359712230217, "grad_norm": 0.22376447671270214, "learning_rate": 2.4892445980613254e-05, "loss": 0.3627, "step": 2297 }, { "epoch": 3.306474820143885, "grad_norm": 0.18157008784299794, "learning_rate": 2.4855243467749865e-05, "loss": 0.3713, "step": 2298 }, { "epoch": 3.307913669064748, "grad_norm": 0.27676117774412173, "learning_rate": 2.481805624132247e-05, "loss": 0.3618, "step": 2299 }, { "epoch": 3.3093525179856114, "grad_norm": 0.25296348485621617, "learning_rate": 2.478088433886618e-05, "loss": 0.3599, "step": 2300 }, { "epoch": 3.310791366906475, "grad_norm": 0.19493846157171688, "learning_rate": 2.4743727797900668e-05, "loss": 0.3652, "step": 2301 }, { "epoch": 3.3122302158273382, "grad_norm": 0.23870314122074515, "learning_rate": 2.4706586655930042e-05, "loss": 0.3593, "step": 2302 }, { "epoch": 3.3136690647482014, "grad_norm": 0.1788363419814252, "learning_rate": 2.4669460950442926e-05, "loss": 0.3556, "step": 2303 }, { "epoch": 3.3151079136690647, "grad_norm": 0.19801080360218584, "learning_rate": 2.463235071891231e-05, "loss": 0.3579, "step": 2304 }, { "epoch": 3.316546762589928, "grad_norm": 0.16986415708063274, "learning_rate": 2.4595255998795625e-05, "loss": 0.3628, "step": 2305 }, { "epoch": 3.3179856115107915, "grad_norm": 0.14381662338937498, "learning_rate": 2.4558176827534587e-05, "loss": 0.3633, "step": 2306 }, { "epoch": 3.3194244604316547, "grad_norm": 0.14437074326676055, "learning_rate": 2.452111324255524e-05, "loss": 0.3597, "step": 2307 }, { "epoch": 3.320863309352518, "grad_norm": 0.17139115163412483, "learning_rate": 2.448406528126793e-05, "loss": 0.3608, "step": 2308 }, { "epoch": 3.322302158273381, "grad_norm": 0.13817997375385707, "learning_rate": 2.444703298106718e-05, "loss": 0.3694, "step": 2309 }, { "epoch": 3.3237410071942444, "grad_norm": 0.1530764392910559, "learning_rate": 2.441001637933173e-05, "loss": 0.3609, "step": 2310 }, { "epoch": 3.325179856115108, "grad_norm": 0.12399449512004619, "learning_rate": 2.437301551342447e-05, "loss": 0.374, "step": 2311 }, { "epoch": 3.3266187050359712, "grad_norm": 0.1709783821443115, "learning_rate": 2.433603042069242e-05, "loss": 0.3634, "step": 2312 }, { "epoch": 3.3280575539568344, "grad_norm": 0.15145193550435615, "learning_rate": 2.4299061138466667e-05, "loss": 0.3705, "step": 2313 }, { "epoch": 3.3294964028776977, "grad_norm": 0.14118503963320508, "learning_rate": 2.4262107704062343e-05, "loss": 0.3584, "step": 2314 }, { "epoch": 3.3309352517985613, "grad_norm": 0.14488814930761457, "learning_rate": 2.4225170154778562e-05, "loss": 0.364, "step": 2315 }, { "epoch": 3.3323741007194245, "grad_norm": 0.14630912827988185, "learning_rate": 2.4188248527898446e-05, "loss": 0.3607, "step": 2316 }, { "epoch": 3.3338129496402877, "grad_norm": 0.1577597818350434, "learning_rate": 2.415134286068903e-05, "loss": 0.357, "step": 2317 }, { "epoch": 3.335251798561151, "grad_norm": 0.14694587128564138, "learning_rate": 2.411445319040121e-05, "loss": 0.3633, "step": 2318 }, { "epoch": 3.3366906474820146, "grad_norm": 0.14965947001787325, "learning_rate": 2.407757955426977e-05, "loss": 0.3621, "step": 2319 }, { "epoch": 3.338129496402878, "grad_norm": 0.13259739344131605, "learning_rate": 2.4040721989513314e-05, "loss": 0.3648, "step": 2320 }, { "epoch": 3.339568345323741, "grad_norm": 0.13523920153875157, "learning_rate": 2.40038805333342e-05, "loss": 0.3618, "step": 2321 }, { "epoch": 3.3410071942446042, "grad_norm": 0.1311362813996723, "learning_rate": 2.396705522291852e-05, "loss": 0.3586, "step": 2322 }, { "epoch": 3.3424460431654675, "grad_norm": 0.12998381952826785, "learning_rate": 2.393024609543611e-05, "loss": 0.3583, "step": 2323 }, { "epoch": 3.343884892086331, "grad_norm": 0.135675371615169, "learning_rate": 2.3893453188040442e-05, "loss": 0.3611, "step": 2324 }, { "epoch": 3.3453237410071943, "grad_norm": 0.14144602755827798, "learning_rate": 2.3856676537868614e-05, "loss": 0.3582, "step": 2325 }, { "epoch": 3.3467625899280575, "grad_norm": 0.13529652876471696, "learning_rate": 2.3819916182041318e-05, "loss": 0.3544, "step": 2326 }, { "epoch": 3.3482014388489207, "grad_norm": 0.13419199847959343, "learning_rate": 2.378317215766283e-05, "loss": 0.3661, "step": 2327 }, { "epoch": 3.349640287769784, "grad_norm": 0.148272002046631, "learning_rate": 2.3746444501820886e-05, "loss": 0.3547, "step": 2328 }, { "epoch": 3.3510791366906476, "grad_norm": 0.14830889739587663, "learning_rate": 2.370973325158675e-05, "loss": 0.3614, "step": 2329 }, { "epoch": 3.352517985611511, "grad_norm": 0.1300783152214282, "learning_rate": 2.3673038444015087e-05, "loss": 0.3635, "step": 2330 }, { "epoch": 3.353956834532374, "grad_norm": 0.16703248529218837, "learning_rate": 2.363636011614401e-05, "loss": 0.3564, "step": 2331 }, { "epoch": 3.3553956834532372, "grad_norm": 0.13588325260397235, "learning_rate": 2.3599698304994946e-05, "loss": 0.3645, "step": 2332 }, { "epoch": 3.356834532374101, "grad_norm": 0.16336185339863962, "learning_rate": 2.3563053047572683e-05, "loss": 0.3629, "step": 2333 }, { "epoch": 3.358273381294964, "grad_norm": 0.14998556339025992, "learning_rate": 2.352642438086533e-05, "loss": 0.3611, "step": 2334 }, { "epoch": 3.3597122302158273, "grad_norm": 0.13681888822607777, "learning_rate": 2.348981234184417e-05, "loss": 0.3595, "step": 2335 }, { "epoch": 3.3611510791366905, "grad_norm": 0.13572005828339215, "learning_rate": 2.3453216967463785e-05, "loss": 0.3669, "step": 2336 }, { "epoch": 3.362589928057554, "grad_norm": 0.15313958123322324, "learning_rate": 2.3416638294661864e-05, "loss": 0.366, "step": 2337 }, { "epoch": 3.3640287769784174, "grad_norm": 0.14478875784069747, "learning_rate": 2.3380076360359293e-05, "loss": 0.361, "step": 2338 }, { "epoch": 3.3654676258992806, "grad_norm": 0.1277587655150165, "learning_rate": 2.3343531201460067e-05, "loss": 0.3597, "step": 2339 }, { "epoch": 3.366906474820144, "grad_norm": 0.1357502343423544, "learning_rate": 2.3307002854851188e-05, "loss": 0.355, "step": 2340 }, { "epoch": 3.368345323741007, "grad_norm": 0.15983854814101542, "learning_rate": 2.3270491357402715e-05, "loss": 0.3621, "step": 2341 }, { "epoch": 3.3697841726618707, "grad_norm": 0.1421995801195904, "learning_rate": 2.3233996745967772e-05, "loss": 0.3575, "step": 2342 }, { "epoch": 3.371223021582734, "grad_norm": 0.1352167104197795, "learning_rate": 2.3197519057382326e-05, "loss": 0.3626, "step": 2343 }, { "epoch": 3.372661870503597, "grad_norm": 0.12458809922761538, "learning_rate": 2.316105832846532e-05, "loss": 0.3634, "step": 2344 }, { "epoch": 3.3741007194244603, "grad_norm": 0.13950285735057272, "learning_rate": 2.3124614596018606e-05, "loss": 0.3621, "step": 2345 }, { "epoch": 3.3755395683453235, "grad_norm": 0.15166983091892183, "learning_rate": 2.308818789682682e-05, "loss": 0.3568, "step": 2346 }, { "epoch": 3.376978417266187, "grad_norm": 0.13848819446565608, "learning_rate": 2.3051778267657436e-05, "loss": 0.3662, "step": 2347 }, { "epoch": 3.3784172661870504, "grad_norm": 0.17810830096812302, "learning_rate": 2.3015385745260704e-05, "loss": 0.375, "step": 2348 }, { "epoch": 3.3798561151079136, "grad_norm": 0.1250981158644372, "learning_rate": 2.2979010366369595e-05, "loss": 0.3536, "step": 2349 }, { "epoch": 3.381294964028777, "grad_norm": 0.16379848518712722, "learning_rate": 2.294265216769978e-05, "loss": 0.3608, "step": 2350 }, { "epoch": 3.38273381294964, "grad_norm": 0.12056498068075343, "learning_rate": 2.2906311185949605e-05, "loss": 0.363, "step": 2351 }, { "epoch": 3.3841726618705037, "grad_norm": 0.16456533418318436, "learning_rate": 2.2869987457799977e-05, "loss": 0.3605, "step": 2352 }, { "epoch": 3.385611510791367, "grad_norm": 0.12932181191485873, "learning_rate": 2.283368101991448e-05, "loss": 0.3615, "step": 2353 }, { "epoch": 3.38705035971223, "grad_norm": 0.1281926781896969, "learning_rate": 2.2797391908939196e-05, "loss": 0.3522, "step": 2354 }, { "epoch": 3.3884892086330938, "grad_norm": 0.13911342135131524, "learning_rate": 2.2761120161502674e-05, "loss": 0.3695, "step": 2355 }, { "epoch": 3.389928057553957, "grad_norm": 0.1397088332645429, "learning_rate": 2.2724865814216042e-05, "loss": 0.36, "step": 2356 }, { "epoch": 3.39136690647482, "grad_norm": 0.1201278802417132, "learning_rate": 2.2688628903672792e-05, "loss": 0.3578, "step": 2357 }, { "epoch": 3.3928057553956834, "grad_norm": 0.1234892259171623, "learning_rate": 2.265240946644881e-05, "loss": 0.3553, "step": 2358 }, { "epoch": 3.3942446043165466, "grad_norm": 0.11589818426007034, "learning_rate": 2.261620753910238e-05, "loss": 0.3524, "step": 2359 }, { "epoch": 3.3956834532374103, "grad_norm": 0.11636042290314506, "learning_rate": 2.25800231581741e-05, "loss": 0.3572, "step": 2360 }, { "epoch": 3.3971223021582735, "grad_norm": 0.1358301033153014, "learning_rate": 2.254385636018686e-05, "loss": 0.3633, "step": 2361 }, { "epoch": 3.3985611510791367, "grad_norm": 0.11423064523067442, "learning_rate": 2.250770718164579e-05, "loss": 0.3648, "step": 2362 }, { "epoch": 3.4, "grad_norm": 0.13761042159576356, "learning_rate": 2.247157565903825e-05, "loss": 0.3539, "step": 2363 }, { "epoch": 3.401438848920863, "grad_norm": 0.13704512777709393, "learning_rate": 2.243546182883377e-05, "loss": 0.3571, "step": 2364 }, { "epoch": 3.402877697841727, "grad_norm": 0.1600033975665896, "learning_rate": 2.2399365727484047e-05, "loss": 0.3756, "step": 2365 }, { "epoch": 3.40431654676259, "grad_norm": 0.13893941806247376, "learning_rate": 2.2363287391422806e-05, "loss": 0.3534, "step": 2366 }, { "epoch": 3.405755395683453, "grad_norm": 0.17822170749507893, "learning_rate": 2.2327226857065954e-05, "loss": 0.357, "step": 2367 }, { "epoch": 3.4071942446043164, "grad_norm": 0.13088365758010723, "learning_rate": 2.2291184160811374e-05, "loss": 0.3592, "step": 2368 }, { "epoch": 3.4086330935251796, "grad_norm": 0.13233694762747514, "learning_rate": 2.22551593390389e-05, "loss": 0.3701, "step": 2369 }, { "epoch": 3.4100719424460433, "grad_norm": 0.1395773832633985, "learning_rate": 2.2219152428110368e-05, "loss": 0.361, "step": 2370 }, { "epoch": 3.4115107913669065, "grad_norm": 0.13488363597766204, "learning_rate": 2.218316346436959e-05, "loss": 0.3654, "step": 2371 }, { "epoch": 3.4129496402877697, "grad_norm": 0.13112663562211102, "learning_rate": 2.2147192484142154e-05, "loss": 0.3655, "step": 2372 }, { "epoch": 3.414388489208633, "grad_norm": 0.15989586902235292, "learning_rate": 2.2111239523735568e-05, "loss": 0.3633, "step": 2373 }, { "epoch": 3.4158273381294966, "grad_norm": 0.12999622257849563, "learning_rate": 2.2075304619439127e-05, "loss": 0.3603, "step": 2374 }, { "epoch": 3.41726618705036, "grad_norm": 0.17371115054607864, "learning_rate": 2.2039387807523914e-05, "loss": 0.3713, "step": 2375 }, { "epoch": 3.418705035971223, "grad_norm": 0.13595918088936446, "learning_rate": 2.2003489124242742e-05, "loss": 0.3545, "step": 2376 }, { "epoch": 3.420143884892086, "grad_norm": 0.1667334733541783, "learning_rate": 2.1967608605830115e-05, "loss": 0.364, "step": 2377 }, { "epoch": 3.42158273381295, "grad_norm": 0.17163831435716062, "learning_rate": 2.1931746288502235e-05, "loss": 0.367, "step": 2378 }, { "epoch": 3.423021582733813, "grad_norm": 0.1660405582847028, "learning_rate": 2.1895902208456903e-05, "loss": 0.3719, "step": 2379 }, { "epoch": 3.4244604316546763, "grad_norm": 0.17241463487185166, "learning_rate": 2.186007640187353e-05, "loss": 0.3548, "step": 2380 }, { "epoch": 3.4258992805755395, "grad_norm": 0.14327969394563383, "learning_rate": 2.1824268904913036e-05, "loss": 0.3528, "step": 2381 }, { "epoch": 3.4273381294964027, "grad_norm": 0.1685271143607362, "learning_rate": 2.1788479753717935e-05, "loss": 0.3724, "step": 2382 }, { "epoch": 3.4287769784172664, "grad_norm": 0.126245263397103, "learning_rate": 2.1752708984412196e-05, "loss": 0.364, "step": 2383 }, { "epoch": 3.4302158273381296, "grad_norm": 0.1350036452412881, "learning_rate": 2.171695663310119e-05, "loss": 0.3616, "step": 2384 }, { "epoch": 3.431654676258993, "grad_norm": 0.12641814167693138, "learning_rate": 2.1681222735871747e-05, "loss": 0.3587, "step": 2385 }, { "epoch": 3.433093525179856, "grad_norm": 0.14682365408094103, "learning_rate": 2.1645507328792058e-05, "loss": 0.364, "step": 2386 }, { "epoch": 3.434532374100719, "grad_norm": 0.14666110922925735, "learning_rate": 2.1609810447911637e-05, "loss": 0.3561, "step": 2387 }, { "epoch": 3.435971223021583, "grad_norm": 0.14493166271009153, "learning_rate": 2.157413212926133e-05, "loss": 0.3618, "step": 2388 }, { "epoch": 3.437410071942446, "grad_norm": 0.1460329551895372, "learning_rate": 2.1538472408853206e-05, "loss": 0.3577, "step": 2389 }, { "epoch": 3.4388489208633093, "grad_norm": 0.1416819326100094, "learning_rate": 2.1502831322680598e-05, "loss": 0.3596, "step": 2390 }, { "epoch": 3.4402877697841725, "grad_norm": 0.13223788299368044, "learning_rate": 2.1467208906718008e-05, "loss": 0.3607, "step": 2391 }, { "epoch": 3.441726618705036, "grad_norm": 0.13332809542290672, "learning_rate": 2.1431605196921103e-05, "loss": 0.3563, "step": 2392 }, { "epoch": 3.4431654676258994, "grad_norm": 0.1447394560751939, "learning_rate": 2.1396020229226666e-05, "loss": 0.3587, "step": 2393 }, { "epoch": 3.4446043165467626, "grad_norm": 0.13384779722057508, "learning_rate": 2.1360454039552577e-05, "loss": 0.3587, "step": 2394 }, { "epoch": 3.446043165467626, "grad_norm": 0.14334642905409703, "learning_rate": 2.1324906663797718e-05, "loss": 0.3749, "step": 2395 }, { "epoch": 3.4474820143884894, "grad_norm": 0.1321639897240863, "learning_rate": 2.1289378137842008e-05, "loss": 0.3655, "step": 2396 }, { "epoch": 3.4489208633093527, "grad_norm": 0.14159494357555968, "learning_rate": 2.125386849754639e-05, "loss": 0.3661, "step": 2397 }, { "epoch": 3.450359712230216, "grad_norm": 0.14423307154734125, "learning_rate": 2.121837777875266e-05, "loss": 0.3546, "step": 2398 }, { "epoch": 3.451798561151079, "grad_norm": 0.1384862291131786, "learning_rate": 2.118290601728354e-05, "loss": 0.3591, "step": 2399 }, { "epoch": 3.4532374100719423, "grad_norm": 0.1378806022213264, "learning_rate": 2.1147453248942687e-05, "loss": 0.3638, "step": 2400 }, { "epoch": 3.454676258992806, "grad_norm": 0.1397248975920691, "learning_rate": 2.1112019509514478e-05, "loss": 0.3662, "step": 2401 }, { "epoch": 3.456115107913669, "grad_norm": 0.14987395038178755, "learning_rate": 2.1076604834764154e-05, "loss": 0.3622, "step": 2402 }, { "epoch": 3.4575539568345324, "grad_norm": 0.14653004538413536, "learning_rate": 2.1041209260437694e-05, "loss": 0.3614, "step": 2403 }, { "epoch": 3.4589928057553956, "grad_norm": 0.14247884835751787, "learning_rate": 2.10058328222618e-05, "loss": 0.3576, "step": 2404 }, { "epoch": 3.460431654676259, "grad_norm": 0.16784949026039206, "learning_rate": 2.097047555594385e-05, "loss": 0.3718, "step": 2405 }, { "epoch": 3.4618705035971225, "grad_norm": 0.13598534286178163, "learning_rate": 2.0935137497171904e-05, "loss": 0.3585, "step": 2406 }, { "epoch": 3.4633093525179857, "grad_norm": 0.15804531633554808, "learning_rate": 2.0899818681614557e-05, "loss": 0.3662, "step": 2407 }, { "epoch": 3.464748201438849, "grad_norm": 0.13091743193933542, "learning_rate": 2.086451914492108e-05, "loss": 0.3475, "step": 2408 }, { "epoch": 3.466187050359712, "grad_norm": 0.1468398185670848, "learning_rate": 2.082923892272124e-05, "loss": 0.3636, "step": 2409 }, { "epoch": 3.4676258992805753, "grad_norm": 0.12952582836962553, "learning_rate": 2.079397805062526e-05, "loss": 0.3576, "step": 2410 }, { "epoch": 3.469064748201439, "grad_norm": 0.1618036519607178, "learning_rate": 2.0758736564223937e-05, "loss": 0.36, "step": 2411 }, { "epoch": 3.470503597122302, "grad_norm": 0.13052504625988254, "learning_rate": 2.0723514499088388e-05, "loss": 0.357, "step": 2412 }, { "epoch": 3.4719424460431654, "grad_norm": 0.14981628312349463, "learning_rate": 2.068831189077021e-05, "loss": 0.3569, "step": 2413 }, { "epoch": 3.4733812949640286, "grad_norm": 0.14676353418467492, "learning_rate": 2.065312877480133e-05, "loss": 0.3642, "step": 2414 }, { "epoch": 3.4748201438848922, "grad_norm": 0.12580317862821208, "learning_rate": 2.0617965186694e-05, "loss": 0.3619, "step": 2415 }, { "epoch": 3.4762589928057555, "grad_norm": 0.16397422943318538, "learning_rate": 2.058282116194076e-05, "loss": 0.3579, "step": 2416 }, { "epoch": 3.4776978417266187, "grad_norm": 0.12879245245884377, "learning_rate": 2.0547696736014415e-05, "loss": 0.3616, "step": 2417 }, { "epoch": 3.479136690647482, "grad_norm": 0.16143321846998793, "learning_rate": 2.0512591944367976e-05, "loss": 0.3607, "step": 2418 }, { "epoch": 3.4805755395683455, "grad_norm": 0.14541572120664673, "learning_rate": 2.0477506822434644e-05, "loss": 0.3632, "step": 2419 }, { "epoch": 3.4820143884892087, "grad_norm": 0.140853997077851, "learning_rate": 2.0442441405627776e-05, "loss": 0.3644, "step": 2420 }, { "epoch": 3.483453237410072, "grad_norm": 0.16358711152968886, "learning_rate": 2.0407395729340792e-05, "loss": 0.3561, "step": 2421 }, { "epoch": 3.484892086330935, "grad_norm": 0.12691895951240845, "learning_rate": 2.037236982894723e-05, "loss": 0.3694, "step": 2422 }, { "epoch": 3.4863309352517984, "grad_norm": 0.12280266266571434, "learning_rate": 2.0337363739800695e-05, "loss": 0.36, "step": 2423 }, { "epoch": 3.487769784172662, "grad_norm": 0.15523018091402865, "learning_rate": 2.030237749723472e-05, "loss": 0.3594, "step": 2424 }, { "epoch": 3.4892086330935252, "grad_norm": 0.12599233392046144, "learning_rate": 2.026741113656284e-05, "loss": 0.3625, "step": 2425 }, { "epoch": 3.4906474820143885, "grad_norm": 0.15043753377349986, "learning_rate": 2.0232464693078578e-05, "loss": 0.3579, "step": 2426 }, { "epoch": 3.4920863309352517, "grad_norm": 0.12535904059882044, "learning_rate": 2.0197538202055246e-05, "loss": 0.3543, "step": 2427 }, { "epoch": 3.493525179856115, "grad_norm": 0.14685558418663916, "learning_rate": 2.01626316987461e-05, "loss": 0.357, "step": 2428 }, { "epoch": 3.4949640287769785, "grad_norm": 0.17985442805567875, "learning_rate": 2.0127745218384193e-05, "loss": 0.3668, "step": 2429 }, { "epoch": 3.4964028776978417, "grad_norm": 0.13826582257846703, "learning_rate": 2.009287879618236e-05, "loss": 0.3613, "step": 2430 }, { "epoch": 3.497841726618705, "grad_norm": 0.14923428690400006, "learning_rate": 2.0058032467333204e-05, "loss": 0.3682, "step": 2431 }, { "epoch": 3.499280575539568, "grad_norm": 0.13411374381163862, "learning_rate": 2.0023206267009056e-05, "loss": 0.3585, "step": 2432 }, { "epoch": 3.5007194244604314, "grad_norm": 0.18295002413954828, "learning_rate": 1.9988400230361872e-05, "loss": 0.3618, "step": 2433 }, { "epoch": 3.502158273381295, "grad_norm": 0.15045896811357695, "learning_rate": 1.9953614392523345e-05, "loss": 0.3583, "step": 2434 }, { "epoch": 3.5035971223021583, "grad_norm": 0.1807016161535834, "learning_rate": 1.9918848788604738e-05, "loss": 0.3594, "step": 2435 }, { "epoch": 3.5050359712230215, "grad_norm": 0.13053791305177803, "learning_rate": 1.9884103453696837e-05, "loss": 0.3605, "step": 2436 }, { "epoch": 3.506474820143885, "grad_norm": 0.15640401309251897, "learning_rate": 1.9849378422870082e-05, "loss": 0.3607, "step": 2437 }, { "epoch": 3.5079136690647483, "grad_norm": 0.13300910014485748, "learning_rate": 1.9814673731174315e-05, "loss": 0.3586, "step": 2438 }, { "epoch": 3.5093525179856115, "grad_norm": 0.14174317411297754, "learning_rate": 1.97799894136389e-05, "loss": 0.3605, "step": 2439 }, { "epoch": 3.5107913669064748, "grad_norm": 0.12256497862548782, "learning_rate": 1.9745325505272633e-05, "loss": 0.3627, "step": 2440 }, { "epoch": 3.512230215827338, "grad_norm": 0.14283792503707346, "learning_rate": 1.9710682041063705e-05, "loss": 0.3602, "step": 2441 }, { "epoch": 3.5136690647482016, "grad_norm": 0.1268181159687369, "learning_rate": 1.9676059055979663e-05, "loss": 0.3605, "step": 2442 }, { "epoch": 3.515107913669065, "grad_norm": 0.14306929986034367, "learning_rate": 1.9641456584967392e-05, "loss": 0.3775, "step": 2443 }, { "epoch": 3.516546762589928, "grad_norm": 0.14205093798260596, "learning_rate": 1.9606874662953076e-05, "loss": 0.3631, "step": 2444 }, { "epoch": 3.5179856115107913, "grad_norm": 0.13463326265482048, "learning_rate": 1.9572313324842148e-05, "loss": 0.3555, "step": 2445 }, { "epoch": 3.5194244604316545, "grad_norm": 0.11819326222058735, "learning_rate": 1.9537772605519285e-05, "loss": 0.3703, "step": 2446 }, { "epoch": 3.520863309352518, "grad_norm": 0.22688031401747902, "learning_rate": 1.950325253984828e-05, "loss": 0.37, "step": 2447 }, { "epoch": 3.5223021582733813, "grad_norm": 0.135085886606042, "learning_rate": 1.946875316267219e-05, "loss": 0.3554, "step": 2448 }, { "epoch": 3.5237410071942445, "grad_norm": 0.12712478217185483, "learning_rate": 1.9434274508813135e-05, "loss": 0.3596, "step": 2449 }, { "epoch": 3.5251798561151078, "grad_norm": 0.12909468816223446, "learning_rate": 1.9399816613072287e-05, "loss": 0.3515, "step": 2450 }, { "epoch": 3.526618705035971, "grad_norm": 0.14655535569434083, "learning_rate": 1.9365379510229888e-05, "loss": 0.3649, "step": 2451 }, { "epoch": 3.5280575539568346, "grad_norm": 0.12981482793178992, "learning_rate": 1.9330963235045253e-05, "loss": 0.371, "step": 2452 }, { "epoch": 3.529496402877698, "grad_norm": 0.16364626318779124, "learning_rate": 1.9296567822256577e-05, "loss": 0.3742, "step": 2453 }, { "epoch": 3.530935251798561, "grad_norm": 0.11319185092004506, "learning_rate": 1.9262193306581052e-05, "loss": 0.3622, "step": 2454 }, { "epoch": 3.5323741007194247, "grad_norm": 0.13702614039758187, "learning_rate": 1.922783972271477e-05, "loss": 0.3515, "step": 2455 }, { "epoch": 3.533812949640288, "grad_norm": 0.12785770585094197, "learning_rate": 1.9193507105332702e-05, "loss": 0.3592, "step": 2456 }, { "epoch": 3.535251798561151, "grad_norm": 0.14364796412529102, "learning_rate": 1.9159195489088636e-05, "loss": 0.3678, "step": 2457 }, { "epoch": 3.5366906474820143, "grad_norm": 0.1311224626798062, "learning_rate": 1.9124904908615178e-05, "loss": 0.3638, "step": 2458 }, { "epoch": 3.5381294964028775, "grad_norm": 0.1143751908426819, "learning_rate": 1.9090635398523698e-05, "loss": 0.3542, "step": 2459 }, { "epoch": 3.539568345323741, "grad_norm": 0.11960495133083206, "learning_rate": 1.9056386993404294e-05, "loss": 0.3513, "step": 2460 }, { "epoch": 3.5410071942446044, "grad_norm": 0.1242060469217888, "learning_rate": 1.902215972782579e-05, "loss": 0.3584, "step": 2461 }, { "epoch": 3.5424460431654676, "grad_norm": 0.11815072560041254, "learning_rate": 1.8987953636335595e-05, "loss": 0.3661, "step": 2462 }, { "epoch": 3.543884892086331, "grad_norm": 0.12024541003788661, "learning_rate": 1.8953768753459863e-05, "loss": 0.365, "step": 2463 }, { "epoch": 3.545323741007194, "grad_norm": 0.11095016672614343, "learning_rate": 1.8919605113703227e-05, "loss": 0.3611, "step": 2464 }, { "epoch": 3.5467625899280577, "grad_norm": 0.12991405641396, "learning_rate": 1.888546275154895e-05, "loss": 0.3682, "step": 2465 }, { "epoch": 3.548201438848921, "grad_norm": 0.1119292554235309, "learning_rate": 1.885134170145879e-05, "loss": 0.3538, "step": 2466 }, { "epoch": 3.549640287769784, "grad_norm": 0.12423616963521807, "learning_rate": 1.8817241997873007e-05, "loss": 0.3617, "step": 2467 }, { "epoch": 3.5510791366906473, "grad_norm": 0.11604277374417908, "learning_rate": 1.8783163675210307e-05, "loss": 0.3602, "step": 2468 }, { "epoch": 3.5525179856115106, "grad_norm": 0.1372858234476605, "learning_rate": 1.8749106767867808e-05, "loss": 0.3678, "step": 2469 }, { "epoch": 3.553956834532374, "grad_norm": 0.10744657188136675, "learning_rate": 1.871507131022103e-05, "loss": 0.364, "step": 2470 }, { "epoch": 3.5553956834532374, "grad_norm": 0.12982426605734915, "learning_rate": 1.8681057336623825e-05, "loss": 0.3592, "step": 2471 }, { "epoch": 3.5568345323741006, "grad_norm": 0.11708731330532512, "learning_rate": 1.864706488140839e-05, "loss": 0.3629, "step": 2472 }, { "epoch": 3.5582733812949643, "grad_norm": 0.12605064899795568, "learning_rate": 1.861309397888513e-05, "loss": 0.3608, "step": 2473 }, { "epoch": 3.5597122302158275, "grad_norm": 0.12874151067946632, "learning_rate": 1.857914466334279e-05, "loss": 0.3584, "step": 2474 }, { "epoch": 3.5611510791366907, "grad_norm": 0.13231097084358348, "learning_rate": 1.8545216969048288e-05, "loss": 0.369, "step": 2475 }, { "epoch": 3.562589928057554, "grad_norm": 0.12529594305664066, "learning_rate": 1.851131093024668e-05, "loss": 0.3591, "step": 2476 }, { "epoch": 3.564028776978417, "grad_norm": 0.11728122094530871, "learning_rate": 1.8477426581161192e-05, "loss": 0.3605, "step": 2477 }, { "epoch": 3.565467625899281, "grad_norm": 0.13442675436700735, "learning_rate": 1.844356395599322e-05, "loss": 0.3633, "step": 2478 }, { "epoch": 3.566906474820144, "grad_norm": 0.12863775394917398, "learning_rate": 1.840972308892211e-05, "loss": 0.3738, "step": 2479 }, { "epoch": 3.568345323741007, "grad_norm": 0.13669933094304723, "learning_rate": 1.837590401410532e-05, "loss": 0.3646, "step": 2480 }, { "epoch": 3.5697841726618704, "grad_norm": 0.13848020004952785, "learning_rate": 1.8342106765678358e-05, "loss": 0.3647, "step": 2481 }, { "epoch": 3.5712230215827336, "grad_norm": 0.11722366973269575, "learning_rate": 1.8308331377754584e-05, "loss": 0.3578, "step": 2482 }, { "epoch": 3.5726618705035973, "grad_norm": 0.11214821744928535, "learning_rate": 1.8274577884425383e-05, "loss": 0.3624, "step": 2483 }, { "epoch": 3.5741007194244605, "grad_norm": 0.1342441684020052, "learning_rate": 1.8240846319760012e-05, "loss": 0.3604, "step": 2484 }, { "epoch": 3.5755395683453237, "grad_norm": 0.12462232422906017, "learning_rate": 1.8207136717805585e-05, "loss": 0.3511, "step": 2485 }, { "epoch": 3.576978417266187, "grad_norm": 0.12892567568478816, "learning_rate": 1.8173449112587062e-05, "loss": 0.3539, "step": 2486 }, { "epoch": 3.57841726618705, "grad_norm": 0.13031483613689354, "learning_rate": 1.813978353810722e-05, "loss": 0.3647, "step": 2487 }, { "epoch": 3.579856115107914, "grad_norm": 0.14172223735283349, "learning_rate": 1.8106140028346526e-05, "loss": 0.3711, "step": 2488 }, { "epoch": 3.581294964028777, "grad_norm": 0.13642062629442642, "learning_rate": 1.8072518617263276e-05, "loss": 0.3596, "step": 2489 }, { "epoch": 3.58273381294964, "grad_norm": 0.14056365127860532, "learning_rate": 1.803891933879338e-05, "loss": 0.3683, "step": 2490 }, { "epoch": 3.584172661870504, "grad_norm": 0.13568708568140467, "learning_rate": 1.8005342226850423e-05, "loss": 0.3634, "step": 2491 }, { "epoch": 3.5856115107913666, "grad_norm": 0.11816877259119497, "learning_rate": 1.7971787315325684e-05, "loss": 0.3504, "step": 2492 }, { "epoch": 3.5870503597122303, "grad_norm": 0.12781359007090007, "learning_rate": 1.7938254638087946e-05, "loss": 0.3685, "step": 2493 }, { "epoch": 3.5884892086330935, "grad_norm": 0.11390141299480655, "learning_rate": 1.7904744228983585e-05, "loss": 0.3696, "step": 2494 }, { "epoch": 3.5899280575539567, "grad_norm": 0.11336779470702313, "learning_rate": 1.7871256121836507e-05, "loss": 0.3541, "step": 2495 }, { "epoch": 3.5913669064748204, "grad_norm": 0.1138319923084464, "learning_rate": 1.7837790350448098e-05, "loss": 0.3604, "step": 2496 }, { "epoch": 3.5928057553956836, "grad_norm": 0.11728097483001036, "learning_rate": 1.7804346948597206e-05, "loss": 0.3589, "step": 2497 }, { "epoch": 3.594244604316547, "grad_norm": 0.10378891310250349, "learning_rate": 1.7770925950040114e-05, "loss": 0.3463, "step": 2498 }, { "epoch": 3.59568345323741, "grad_norm": 0.11345324723403843, "learning_rate": 1.773752738851042e-05, "loss": 0.3596, "step": 2499 }, { "epoch": 3.597122302158273, "grad_norm": 0.11309436317467, "learning_rate": 1.770415129771918e-05, "loss": 0.3695, "step": 2500 }, { "epoch": 3.598561151079137, "grad_norm": 0.10853114682496083, "learning_rate": 1.7670797711354724e-05, "loss": 0.3643, "step": 2501 }, { "epoch": 3.6, "grad_norm": 0.13250831651816233, "learning_rate": 1.763746666308261e-05, "loss": 0.3549, "step": 2502 }, { "epoch": 3.6014388489208633, "grad_norm": 0.12205829710881429, "learning_rate": 1.760415818654574e-05, "loss": 0.3704, "step": 2503 }, { "epoch": 3.6028776978417265, "grad_norm": 0.12321635496401367, "learning_rate": 1.75708723153642e-05, "loss": 0.3627, "step": 2504 }, { "epoch": 3.6043165467625897, "grad_norm": 0.12800767751956713, "learning_rate": 1.7537609083135224e-05, "loss": 0.3597, "step": 2505 }, { "epoch": 3.6057553956834534, "grad_norm": 0.13900200317563696, "learning_rate": 1.7504368523433216e-05, "loss": 0.353, "step": 2506 }, { "epoch": 3.6071942446043166, "grad_norm": 0.12316202576985372, "learning_rate": 1.747115066980974e-05, "loss": 0.3586, "step": 2507 }, { "epoch": 3.60863309352518, "grad_norm": 0.13760818995581628, "learning_rate": 1.7437955555793372e-05, "loss": 0.3555, "step": 2508 }, { "epoch": 3.610071942446043, "grad_norm": 0.11355210115229894, "learning_rate": 1.740478321488978e-05, "loss": 0.3555, "step": 2509 }, { "epoch": 3.6115107913669062, "grad_norm": 0.15416602094829648, "learning_rate": 1.737163368058162e-05, "loss": 0.3603, "step": 2510 }, { "epoch": 3.61294964028777, "grad_norm": 0.11189006173598844, "learning_rate": 1.7338506986328552e-05, "loss": 0.359, "step": 2511 }, { "epoch": 3.614388489208633, "grad_norm": 0.1468371487818277, "learning_rate": 1.730540316556717e-05, "loss": 0.3626, "step": 2512 }, { "epoch": 3.6158273381294963, "grad_norm": 0.11321436813944175, "learning_rate": 1.727232225171098e-05, "loss": 0.3627, "step": 2513 }, { "epoch": 3.61726618705036, "grad_norm": 0.11380782275839862, "learning_rate": 1.7239264278150364e-05, "loss": 0.3588, "step": 2514 }, { "epoch": 3.618705035971223, "grad_norm": 0.12040023848313533, "learning_rate": 1.7206229278252577e-05, "loss": 0.3609, "step": 2515 }, { "epoch": 3.6201438848920864, "grad_norm": 0.11452473435933457, "learning_rate": 1.717321728536163e-05, "loss": 0.3637, "step": 2516 }, { "epoch": 3.6215827338129496, "grad_norm": 0.11317997303110827, "learning_rate": 1.7140228332798336e-05, "loss": 0.3622, "step": 2517 }, { "epoch": 3.623021582733813, "grad_norm": 0.12973695947641795, "learning_rate": 1.7107262453860308e-05, "loss": 0.3674, "step": 2518 }, { "epoch": 3.6244604316546765, "grad_norm": 0.11362680022756308, "learning_rate": 1.707431968182179e-05, "loss": 0.3581, "step": 2519 }, { "epoch": 3.6258992805755397, "grad_norm": 0.13445202472189502, "learning_rate": 1.7041400049933726e-05, "loss": 0.3585, "step": 2520 }, { "epoch": 3.627338129496403, "grad_norm": 0.12766773295452266, "learning_rate": 1.700850359142373e-05, "loss": 0.369, "step": 2521 }, { "epoch": 3.628776978417266, "grad_norm": 0.1281623527561033, "learning_rate": 1.6975630339496e-05, "loss": 0.3586, "step": 2522 }, { "epoch": 3.6302158273381293, "grad_norm": 0.12272089410013572, "learning_rate": 1.6942780327331317e-05, "loss": 0.3667, "step": 2523 }, { "epoch": 3.631654676258993, "grad_norm": 0.12798702385340094, "learning_rate": 1.6909953588087024e-05, "loss": 0.3597, "step": 2524 }, { "epoch": 3.633093525179856, "grad_norm": 0.11597463713977145, "learning_rate": 1.687715015489691e-05, "loss": 0.3597, "step": 2525 }, { "epoch": 3.6345323741007194, "grad_norm": 0.11928967616784174, "learning_rate": 1.6844370060871324e-05, "loss": 0.3619, "step": 2526 }, { "epoch": 3.6359712230215826, "grad_norm": 0.12846214158720146, "learning_rate": 1.6811613339097022e-05, "loss": 0.3572, "step": 2527 }, { "epoch": 3.637410071942446, "grad_norm": 0.11488451532344186, "learning_rate": 1.6778880022637123e-05, "loss": 0.3549, "step": 2528 }, { "epoch": 3.6388489208633095, "grad_norm": 0.13407144008798366, "learning_rate": 1.674617014453121e-05, "loss": 0.365, "step": 2529 }, { "epoch": 3.6402877697841727, "grad_norm": 0.1355485639637215, "learning_rate": 1.6713483737795155e-05, "loss": 0.3632, "step": 2530 }, { "epoch": 3.641726618705036, "grad_norm": 0.10844701831092295, "learning_rate": 1.6680820835421124e-05, "loss": 0.3573, "step": 2531 }, { "epoch": 3.6431654676258995, "grad_norm": 0.16354547575824532, "learning_rate": 1.664818147037758e-05, "loss": 0.36, "step": 2532 }, { "epoch": 3.6446043165467623, "grad_norm": 0.11764460107006486, "learning_rate": 1.6615565675609272e-05, "loss": 0.3566, "step": 2533 }, { "epoch": 3.646043165467626, "grad_norm": 0.14988060309989518, "learning_rate": 1.6582973484037076e-05, "loss": 0.3577, "step": 2534 }, { "epoch": 3.647482014388489, "grad_norm": 0.11848140017555707, "learning_rate": 1.6550404928558094e-05, "loss": 0.3649, "step": 2535 }, { "epoch": 3.6489208633093524, "grad_norm": 0.15544540214821592, "learning_rate": 1.6517860042045564e-05, "loss": 0.3635, "step": 2536 }, { "epoch": 3.650359712230216, "grad_norm": 0.11932637817742124, "learning_rate": 1.6485338857348826e-05, "loss": 0.3666, "step": 2537 }, { "epoch": 3.6517985611510793, "grad_norm": 0.10945585272046333, "learning_rate": 1.6452841407293307e-05, "loss": 0.352, "step": 2538 }, { "epoch": 3.6532374100719425, "grad_norm": 0.1602187491110357, "learning_rate": 1.642036772468047e-05, "loss": 0.3668, "step": 2539 }, { "epoch": 3.6546762589928057, "grad_norm": 0.11601223141768494, "learning_rate": 1.6387917842287783e-05, "loss": 0.3571, "step": 2540 }, { "epoch": 3.656115107913669, "grad_norm": 0.14922969910297568, "learning_rate": 1.635549179286871e-05, "loss": 0.3669, "step": 2541 }, { "epoch": 3.6575539568345325, "grad_norm": 0.11373684076883384, "learning_rate": 1.6323089609152648e-05, "loss": 0.3696, "step": 2542 }, { "epoch": 3.6589928057553958, "grad_norm": 0.14063225178210415, "learning_rate": 1.6290711323844866e-05, "loss": 0.3708, "step": 2543 }, { "epoch": 3.660431654676259, "grad_norm": 0.11673634471305658, "learning_rate": 1.6258356969626614e-05, "loss": 0.3691, "step": 2544 }, { "epoch": 3.661870503597122, "grad_norm": 0.14829695978497334, "learning_rate": 1.622602657915487e-05, "loss": 0.3722, "step": 2545 }, { "epoch": 3.6633093525179854, "grad_norm": 0.1394931984719001, "learning_rate": 1.6193720185062484e-05, "loss": 0.3681, "step": 2546 }, { "epoch": 3.664748201438849, "grad_norm": 0.13122276548358858, "learning_rate": 1.6161437819958087e-05, "loss": 0.3655, "step": 2547 }, { "epoch": 3.6661870503597123, "grad_norm": 0.14332210065033338, "learning_rate": 1.6129179516426048e-05, "loss": 0.3634, "step": 2548 }, { "epoch": 3.6676258992805755, "grad_norm": 0.1492095853609529, "learning_rate": 1.609694530702644e-05, "loss": 0.3653, "step": 2549 }, { "epoch": 3.6690647482014387, "grad_norm": 0.1253180629490774, "learning_rate": 1.6064735224295027e-05, "loss": 0.3606, "step": 2550 }, { "epoch": 3.670503597122302, "grad_norm": 0.14174121524182898, "learning_rate": 1.603254930074322e-05, "loss": 0.3599, "step": 2551 }, { "epoch": 3.6719424460431656, "grad_norm": 0.13329892305991034, "learning_rate": 1.6000387568858042e-05, "loss": 0.3552, "step": 2552 }, { "epoch": 3.6733812949640288, "grad_norm": 0.14095030488587826, "learning_rate": 1.5968250061102105e-05, "loss": 0.3622, "step": 2553 }, { "epoch": 3.674820143884892, "grad_norm": 0.130119087262895, "learning_rate": 1.593613680991353e-05, "loss": 0.3616, "step": 2554 }, { "epoch": 3.6762589928057556, "grad_norm": 0.10573893978496751, "learning_rate": 1.590404784770603e-05, "loss": 0.3562, "step": 2555 }, { "epoch": 3.677697841726619, "grad_norm": 0.1425703024959088, "learning_rate": 1.5871983206868756e-05, "loss": 0.3598, "step": 2556 }, { "epoch": 3.679136690647482, "grad_norm": 0.14003753245179645, "learning_rate": 1.583994291976629e-05, "loss": 0.3733, "step": 2557 }, { "epoch": 3.6805755395683453, "grad_norm": 0.10566309600002534, "learning_rate": 1.580792701873865e-05, "loss": 0.3595, "step": 2558 }, { "epoch": 3.6820143884892085, "grad_norm": 0.1337458720644581, "learning_rate": 1.5775935536101296e-05, "loss": 0.359, "step": 2559 }, { "epoch": 3.683453237410072, "grad_norm": 0.13452100949385928, "learning_rate": 1.5743968504144946e-05, "loss": 0.3602, "step": 2560 }, { "epoch": 3.6848920863309353, "grad_norm": 0.1141164300059276, "learning_rate": 1.57120259551357e-05, "loss": 0.3588, "step": 2561 }, { "epoch": 3.6863309352517986, "grad_norm": 0.12200071347526566, "learning_rate": 1.5680107921314926e-05, "loss": 0.3641, "step": 2562 }, { "epoch": 3.6877697841726618, "grad_norm": 0.12161005020780033, "learning_rate": 1.5648214434899257e-05, "loss": 0.3629, "step": 2563 }, { "epoch": 3.689208633093525, "grad_norm": 0.10535673795518845, "learning_rate": 1.5616345528080537e-05, "loss": 0.3532, "step": 2564 }, { "epoch": 3.6906474820143886, "grad_norm": 0.13843402653199047, "learning_rate": 1.5584501233025813e-05, "loss": 0.3618, "step": 2565 }, { "epoch": 3.692086330935252, "grad_norm": 0.10102346363401019, "learning_rate": 1.555268158187728e-05, "loss": 0.3546, "step": 2566 }, { "epoch": 3.693525179856115, "grad_norm": 0.13534669321488643, "learning_rate": 1.552088660675227e-05, "loss": 0.3673, "step": 2567 }, { "epoch": 3.6949640287769783, "grad_norm": 0.1037905989181797, "learning_rate": 1.54891163397432e-05, "loss": 0.3578, "step": 2568 }, { "epoch": 3.6964028776978415, "grad_norm": 0.12973135784699819, "learning_rate": 1.5457370812917526e-05, "loss": 0.3578, "step": 2569 }, { "epoch": 3.697841726618705, "grad_norm": 0.10545431149666043, "learning_rate": 1.5425650058317795e-05, "loss": 0.3618, "step": 2570 }, { "epoch": 3.6992805755395683, "grad_norm": 0.11620996412674596, "learning_rate": 1.5393954107961467e-05, "loss": 0.3628, "step": 2571 }, { "epoch": 3.7007194244604316, "grad_norm": 0.12248263561229705, "learning_rate": 1.536228299384102e-05, "loss": 0.3535, "step": 2572 }, { "epoch": 3.702158273381295, "grad_norm": 0.12492464754252347, "learning_rate": 1.533063674792389e-05, "loss": 0.3652, "step": 2573 }, { "epoch": 3.7035971223021584, "grad_norm": 0.1374768174174054, "learning_rate": 1.529901540215233e-05, "loss": 0.353, "step": 2574 }, { "epoch": 3.7050359712230216, "grad_norm": 0.11331164182765775, "learning_rate": 1.5267418988443517e-05, "loss": 0.3717, "step": 2575 }, { "epoch": 3.706474820143885, "grad_norm": 0.12210755308569196, "learning_rate": 1.5235847538689452e-05, "loss": 0.3628, "step": 2576 }, { "epoch": 3.707913669064748, "grad_norm": 0.13691028227148266, "learning_rate": 1.5204301084756936e-05, "loss": 0.3664, "step": 2577 }, { "epoch": 3.7093525179856117, "grad_norm": 0.12456712506114138, "learning_rate": 1.5172779658487539e-05, "loss": 0.3611, "step": 2578 }, { "epoch": 3.710791366906475, "grad_norm": 0.11450658605211136, "learning_rate": 1.5141283291697587e-05, "loss": 0.3589, "step": 2579 }, { "epoch": 3.712230215827338, "grad_norm": 0.12082326327275167, "learning_rate": 1.5109812016178053e-05, "loss": 0.355, "step": 2580 }, { "epoch": 3.7136690647482014, "grad_norm": 0.12871886244640798, "learning_rate": 1.5078365863694667e-05, "loss": 0.3593, "step": 2581 }, { "epoch": 3.7151079136690646, "grad_norm": 0.12454464902255565, "learning_rate": 1.5046944865987763e-05, "loss": 0.36, "step": 2582 }, { "epoch": 3.716546762589928, "grad_norm": 0.11084581787132927, "learning_rate": 1.501554905477224e-05, "loss": 0.3556, "step": 2583 }, { "epoch": 3.7179856115107914, "grad_norm": 0.1085932755865407, "learning_rate": 1.4984178461737663e-05, "loss": 0.3735, "step": 2584 }, { "epoch": 3.7194244604316546, "grad_norm": 0.11993697365057852, "learning_rate": 1.4952833118548094e-05, "loss": 0.3567, "step": 2585 }, { "epoch": 3.720863309352518, "grad_norm": 0.13108234909015168, "learning_rate": 1.492151305684208e-05, "loss": 0.3612, "step": 2586 }, { "epoch": 3.722302158273381, "grad_norm": 0.12257848253539193, "learning_rate": 1.4890218308232704e-05, "loss": 0.3657, "step": 2587 }, { "epoch": 3.7237410071942447, "grad_norm": 0.10565647987771688, "learning_rate": 1.4858948904307476e-05, "loss": 0.3644, "step": 2588 }, { "epoch": 3.725179856115108, "grad_norm": 0.15214811899217365, "learning_rate": 1.4827704876628319e-05, "loss": 0.3663, "step": 2589 }, { "epoch": 3.726618705035971, "grad_norm": 0.11821078712737504, "learning_rate": 1.4796486256731561e-05, "loss": 0.3595, "step": 2590 }, { "epoch": 3.728057553956835, "grad_norm": 0.12834242190669048, "learning_rate": 1.4765293076127862e-05, "loss": 0.3578, "step": 2591 }, { "epoch": 3.7294964028776976, "grad_norm": 0.15115163524953665, "learning_rate": 1.4734125366302224e-05, "loss": 0.3616, "step": 2592 }, { "epoch": 3.7309352517985612, "grad_norm": 0.13582216648067696, "learning_rate": 1.470298315871392e-05, "loss": 0.3618, "step": 2593 }, { "epoch": 3.7323741007194244, "grad_norm": 0.14893276799316646, "learning_rate": 1.4671866484796505e-05, "loss": 0.3603, "step": 2594 }, { "epoch": 3.7338129496402876, "grad_norm": 0.11681439057036415, "learning_rate": 1.4640775375957742e-05, "loss": 0.3623, "step": 2595 }, { "epoch": 3.7352517985611513, "grad_norm": 0.14681579919735874, "learning_rate": 1.4609709863579622e-05, "loss": 0.3579, "step": 2596 }, { "epoch": 3.7366906474820145, "grad_norm": 0.10968043209668547, "learning_rate": 1.4578669979018231e-05, "loss": 0.3595, "step": 2597 }, { "epoch": 3.7381294964028777, "grad_norm": 0.12582209444001602, "learning_rate": 1.454765575360385e-05, "loss": 0.361, "step": 2598 }, { "epoch": 3.739568345323741, "grad_norm": 0.13210425620212404, "learning_rate": 1.4516667218640877e-05, "loss": 0.3605, "step": 2599 }, { "epoch": 3.741007194244604, "grad_norm": 0.11481028562948814, "learning_rate": 1.4485704405407699e-05, "loss": 0.36, "step": 2600 }, { "epoch": 3.742446043165468, "grad_norm": 0.13825468448413022, "learning_rate": 1.4454767345156806e-05, "loss": 0.3611, "step": 2601 }, { "epoch": 3.743884892086331, "grad_norm": 0.09926469784063446, "learning_rate": 1.4423856069114677e-05, "loss": 0.3552, "step": 2602 }, { "epoch": 3.7453237410071942, "grad_norm": 0.15498737636073653, "learning_rate": 1.4392970608481758e-05, "loss": 0.3638, "step": 2603 }, { "epoch": 3.7467625899280574, "grad_norm": 0.11990393870595022, "learning_rate": 1.4362110994432445e-05, "loss": 0.3591, "step": 2604 }, { "epoch": 3.7482014388489207, "grad_norm": 0.12322270345908738, "learning_rate": 1.433127725811505e-05, "loss": 0.3567, "step": 2605 }, { "epoch": 3.7496402877697843, "grad_norm": 0.12819082916741287, "learning_rate": 1.4300469430651754e-05, "loss": 0.3596, "step": 2606 }, { "epoch": 3.7510791366906475, "grad_norm": 0.1212969449865843, "learning_rate": 1.4269687543138594e-05, "loss": 0.3578, "step": 2607 }, { "epoch": 3.7525179856115107, "grad_norm": 0.15757106796906958, "learning_rate": 1.4238931626645434e-05, "loss": 0.3615, "step": 2608 }, { "epoch": 3.753956834532374, "grad_norm": 0.14413627556955358, "learning_rate": 1.4208201712215871e-05, "loss": 0.3692, "step": 2609 }, { "epoch": 3.755395683453237, "grad_norm": 0.1501816207712942, "learning_rate": 1.4177497830867348e-05, "loss": 0.366, "step": 2610 }, { "epoch": 3.756834532374101, "grad_norm": 0.1311310678361702, "learning_rate": 1.4146820013590973e-05, "loss": 0.3541, "step": 2611 }, { "epoch": 3.758273381294964, "grad_norm": 0.14674841924402438, "learning_rate": 1.411616829135153e-05, "loss": 0.36, "step": 2612 }, { "epoch": 3.7597122302158272, "grad_norm": 0.13296654783018239, "learning_rate": 1.4085542695087502e-05, "loss": 0.358, "step": 2613 }, { "epoch": 3.761151079136691, "grad_norm": 0.1358015049583282, "learning_rate": 1.4054943255710987e-05, "loss": 0.3608, "step": 2614 }, { "epoch": 3.762589928057554, "grad_norm": 0.12831505931112983, "learning_rate": 1.4024370004107683e-05, "loss": 0.3697, "step": 2615 }, { "epoch": 3.7640287769784173, "grad_norm": 0.1323113967521606, "learning_rate": 1.3993822971136859e-05, "loss": 0.3609, "step": 2616 }, { "epoch": 3.7654676258992805, "grad_norm": 0.12769123081942618, "learning_rate": 1.3963302187631316e-05, "loss": 0.357, "step": 2617 }, { "epoch": 3.7669064748201437, "grad_norm": 0.1407961080621416, "learning_rate": 1.3932807684397348e-05, "loss": 0.3538, "step": 2618 }, { "epoch": 3.7683453237410074, "grad_norm": 0.12449058240087635, "learning_rate": 1.3902339492214751e-05, "loss": 0.3536, "step": 2619 }, { "epoch": 3.7697841726618706, "grad_norm": 0.1326906224890556, "learning_rate": 1.387189764183674e-05, "loss": 0.3567, "step": 2620 }, { "epoch": 3.771223021582734, "grad_norm": 0.11401026260483377, "learning_rate": 1.384148216398995e-05, "loss": 0.3654, "step": 2621 }, { "epoch": 3.772661870503597, "grad_norm": 0.12168061074136714, "learning_rate": 1.381109308937441e-05, "loss": 0.3625, "step": 2622 }, { "epoch": 3.7741007194244602, "grad_norm": 0.13387443422805737, "learning_rate": 1.3780730448663456e-05, "loss": 0.3636, "step": 2623 }, { "epoch": 3.775539568345324, "grad_norm": 0.11349669555243698, "learning_rate": 1.3750394272503775e-05, "loss": 0.3608, "step": 2624 }, { "epoch": 3.776978417266187, "grad_norm": 0.1181487230287458, "learning_rate": 1.3720084591515374e-05, "loss": 0.3607, "step": 2625 }, { "epoch": 3.7784172661870503, "grad_norm": 0.13030452340281176, "learning_rate": 1.3689801436291448e-05, "loss": 0.3616, "step": 2626 }, { "epoch": 3.7798561151079135, "grad_norm": 0.11183480055976384, "learning_rate": 1.365954483739846e-05, "loss": 0.3728, "step": 2627 }, { "epoch": 3.7812949640287767, "grad_norm": 0.140740320543905, "learning_rate": 1.3629314825376061e-05, "loss": 0.3572, "step": 2628 }, { "epoch": 3.7827338129496404, "grad_norm": 0.10512997696773077, "learning_rate": 1.359911143073707e-05, "loss": 0.3569, "step": 2629 }, { "epoch": 3.7841726618705036, "grad_norm": 0.11841566025187138, "learning_rate": 1.3568934683967427e-05, "loss": 0.3542, "step": 2630 }, { "epoch": 3.785611510791367, "grad_norm": 0.13508715727642095, "learning_rate": 1.3538784615526188e-05, "loss": 0.3584, "step": 2631 }, { "epoch": 3.7870503597122305, "grad_norm": 0.12569299187321786, "learning_rate": 1.3508661255845477e-05, "loss": 0.3709, "step": 2632 }, { "epoch": 3.7884892086330937, "grad_norm": 0.10732638862230746, "learning_rate": 1.3478564635330455e-05, "loss": 0.3503, "step": 2633 }, { "epoch": 3.789928057553957, "grad_norm": 0.1251059736980278, "learning_rate": 1.344849478435931e-05, "loss": 0.3595, "step": 2634 }, { "epoch": 3.79136690647482, "grad_norm": 0.11947824093913274, "learning_rate": 1.3418451733283156e-05, "loss": 0.3598, "step": 2635 }, { "epoch": 3.7928057553956833, "grad_norm": 0.11577844202764537, "learning_rate": 1.3388435512426142e-05, "loss": 0.3652, "step": 2636 }, { "epoch": 3.794244604316547, "grad_norm": 0.1337616812891817, "learning_rate": 1.3358446152085289e-05, "loss": 0.3497, "step": 2637 }, { "epoch": 3.79568345323741, "grad_norm": 0.10072223135771624, "learning_rate": 1.332848368253048e-05, "loss": 0.3523, "step": 2638 }, { "epoch": 3.7971223021582734, "grad_norm": 0.1236485929567498, "learning_rate": 1.3298548134004498e-05, "loss": 0.3573, "step": 2639 }, { "epoch": 3.7985611510791366, "grad_norm": 0.10987681896024151, "learning_rate": 1.326863953672294e-05, "loss": 0.3684, "step": 2640 }, { "epoch": 3.8, "grad_norm": 0.11930130321558292, "learning_rate": 1.3238757920874203e-05, "loss": 0.3551, "step": 2641 }, { "epoch": 3.8014388489208635, "grad_norm": 0.10748542105822156, "learning_rate": 1.3208903316619436e-05, "loss": 0.361, "step": 2642 }, { "epoch": 3.8028776978417267, "grad_norm": 0.09818385057623447, "learning_rate": 1.317907575409254e-05, "loss": 0.3631, "step": 2643 }, { "epoch": 3.80431654676259, "grad_norm": 0.09617813037382426, "learning_rate": 1.3149275263400116e-05, "loss": 0.3627, "step": 2644 }, { "epoch": 3.805755395683453, "grad_norm": 0.10794047897019735, "learning_rate": 1.3119501874621437e-05, "loss": 0.3626, "step": 2645 }, { "epoch": 3.8071942446043163, "grad_norm": 0.1028120508008396, "learning_rate": 1.3089755617808417e-05, "loss": 0.3498, "step": 2646 }, { "epoch": 3.80863309352518, "grad_norm": 0.11536134670582235, "learning_rate": 1.3060036522985598e-05, "loss": 0.3583, "step": 2647 }, { "epoch": 3.810071942446043, "grad_norm": 0.1014443276620562, "learning_rate": 1.3030344620150105e-05, "loss": 0.358, "step": 2648 }, { "epoch": 3.8115107913669064, "grad_norm": 0.10555213738641042, "learning_rate": 1.3000679939271588e-05, "loss": 0.3623, "step": 2649 }, { "epoch": 3.81294964028777, "grad_norm": 0.10616309893328842, "learning_rate": 1.2971042510292238e-05, "loss": 0.3629, "step": 2650 }, { "epoch": 3.814388489208633, "grad_norm": 0.11041867870782239, "learning_rate": 1.2941432363126784e-05, "loss": 0.3633, "step": 2651 }, { "epoch": 3.8158273381294965, "grad_norm": 0.1190498763283357, "learning_rate": 1.2911849527662335e-05, "loss": 0.3671, "step": 2652 }, { "epoch": 3.8172661870503597, "grad_norm": 0.12022477616666787, "learning_rate": 1.2882294033758473e-05, "loss": 0.3663, "step": 2653 }, { "epoch": 3.818705035971223, "grad_norm": 0.10685794435909542, "learning_rate": 1.2852765911247227e-05, "loss": 0.3571, "step": 2654 }, { "epoch": 3.8201438848920866, "grad_norm": 0.10794695689107299, "learning_rate": 1.2823265189932914e-05, "loss": 0.3617, "step": 2655 }, { "epoch": 3.8215827338129498, "grad_norm": 0.11693638820113349, "learning_rate": 1.2793791899592254e-05, "loss": 0.3691, "step": 2656 }, { "epoch": 3.823021582733813, "grad_norm": 0.09592170560544878, "learning_rate": 1.2764346069974249e-05, "loss": 0.3548, "step": 2657 }, { "epoch": 3.824460431654676, "grad_norm": 0.09943996269770372, "learning_rate": 1.2734927730800206e-05, "loss": 0.3541, "step": 2658 }, { "epoch": 3.8258992805755394, "grad_norm": 0.13401545776494028, "learning_rate": 1.2705536911763665e-05, "loss": 0.3666, "step": 2659 }, { "epoch": 3.827338129496403, "grad_norm": 0.10808324164765093, "learning_rate": 1.2676173642530417e-05, "loss": 0.3661, "step": 2660 }, { "epoch": 3.8287769784172663, "grad_norm": 0.0985569274615787, "learning_rate": 1.2646837952738382e-05, "loss": 0.3521, "step": 2661 }, { "epoch": 3.8302158273381295, "grad_norm": 0.10177724691359441, "learning_rate": 1.2617529871997727e-05, "loss": 0.3687, "step": 2662 }, { "epoch": 3.8316546762589927, "grad_norm": 0.11808499539692265, "learning_rate": 1.2588249429890706e-05, "loss": 0.3532, "step": 2663 }, { "epoch": 3.833093525179856, "grad_norm": 0.1111604668473334, "learning_rate": 1.2558996655971644e-05, "loss": 0.3558, "step": 2664 }, { "epoch": 3.8345323741007196, "grad_norm": 0.1279783316430316, "learning_rate": 1.2529771579767024e-05, "loss": 0.359, "step": 2665 }, { "epoch": 3.8359712230215828, "grad_norm": 0.09848527975004501, "learning_rate": 1.2500574230775294e-05, "loss": 0.3663, "step": 2666 }, { "epoch": 3.837410071942446, "grad_norm": 0.12029030117458071, "learning_rate": 1.2471404638466949e-05, "loss": 0.362, "step": 2667 }, { "epoch": 3.838848920863309, "grad_norm": 0.11000558152107098, "learning_rate": 1.2442262832284464e-05, "loss": 0.3515, "step": 2668 }, { "epoch": 3.8402877697841724, "grad_norm": 0.11723113093826766, "learning_rate": 1.2413148841642268e-05, "loss": 0.3615, "step": 2669 }, { "epoch": 3.841726618705036, "grad_norm": 0.11302251660022884, "learning_rate": 1.2384062695926713e-05, "loss": 0.3592, "step": 2670 }, { "epoch": 3.8431654676258993, "grad_norm": 0.12412774957255451, "learning_rate": 1.235500442449605e-05, "loss": 0.3618, "step": 2671 }, { "epoch": 3.8446043165467625, "grad_norm": 0.09488577240409062, "learning_rate": 1.232597405668039e-05, "loss": 0.3611, "step": 2672 }, { "epoch": 3.846043165467626, "grad_norm": 0.11443172578323646, "learning_rate": 1.2296971621781677e-05, "loss": 0.3581, "step": 2673 }, { "epoch": 3.8474820143884894, "grad_norm": 0.10661096580146355, "learning_rate": 1.2267997149073679e-05, "loss": 0.366, "step": 2674 }, { "epoch": 3.8489208633093526, "grad_norm": 0.10099669801061048, "learning_rate": 1.2239050667801885e-05, "loss": 0.3566, "step": 2675 }, { "epoch": 3.850359712230216, "grad_norm": 0.11442540074877293, "learning_rate": 1.2210132207183611e-05, "loss": 0.3663, "step": 2676 }, { "epoch": 3.851798561151079, "grad_norm": 0.10326919646973422, "learning_rate": 1.2181241796407855e-05, "loss": 0.3655, "step": 2677 }, { "epoch": 3.8532374100719426, "grad_norm": 0.11407317713325474, "learning_rate": 1.2152379464635264e-05, "loss": 0.36, "step": 2678 }, { "epoch": 3.854676258992806, "grad_norm": 0.10025218616264969, "learning_rate": 1.2123545240998182e-05, "loss": 0.355, "step": 2679 }, { "epoch": 3.856115107913669, "grad_norm": 0.09564519109975264, "learning_rate": 1.2094739154600616e-05, "loss": 0.3531, "step": 2680 }, { "epoch": 3.8575539568345323, "grad_norm": 0.1259784386469854, "learning_rate": 1.2065961234518096e-05, "loss": 0.3579, "step": 2681 }, { "epoch": 3.8589928057553955, "grad_norm": 0.10632725682300842, "learning_rate": 1.2037211509797771e-05, "loss": 0.3613, "step": 2682 }, { "epoch": 3.860431654676259, "grad_norm": 0.10146620175603975, "learning_rate": 1.2008490009458322e-05, "loss": 0.3625, "step": 2683 }, { "epoch": 3.8618705035971224, "grad_norm": 0.09500020982820409, "learning_rate": 1.1979796762489934e-05, "loss": 0.3581, "step": 2684 }, { "epoch": 3.8633093525179856, "grad_norm": 0.10521972710858278, "learning_rate": 1.195113179785429e-05, "loss": 0.3454, "step": 2685 }, { "epoch": 3.864748201438849, "grad_norm": 0.09582255281437967, "learning_rate": 1.1922495144484504e-05, "loss": 0.3614, "step": 2686 }, { "epoch": 3.866187050359712, "grad_norm": 0.10135225534705446, "learning_rate": 1.1893886831285136e-05, "loss": 0.3635, "step": 2687 }, { "epoch": 3.8676258992805757, "grad_norm": 0.09511353365433746, "learning_rate": 1.1865306887132122e-05, "loss": 0.3566, "step": 2688 }, { "epoch": 3.869064748201439, "grad_norm": 0.10025779281801901, "learning_rate": 1.183675534087279e-05, "loss": 0.3619, "step": 2689 }, { "epoch": 3.870503597122302, "grad_norm": 0.08858656791121877, "learning_rate": 1.1808232221325749e-05, "loss": 0.3628, "step": 2690 }, { "epoch": 3.8719424460431657, "grad_norm": 0.09260597242456182, "learning_rate": 1.1779737557280985e-05, "loss": 0.3608, "step": 2691 }, { "epoch": 3.873381294964029, "grad_norm": 0.10298075180631532, "learning_rate": 1.1751271377499736e-05, "loss": 0.3587, "step": 2692 }, { "epoch": 3.874820143884892, "grad_norm": 0.09065564279621754, "learning_rate": 1.1722833710714454e-05, "loss": 0.3672, "step": 2693 }, { "epoch": 3.8762589928057554, "grad_norm": 0.1028082028401011, "learning_rate": 1.1694424585628861e-05, "loss": 0.3635, "step": 2694 }, { "epoch": 3.8776978417266186, "grad_norm": 0.0930008537084772, "learning_rate": 1.166604403091784e-05, "loss": 0.362, "step": 2695 }, { "epoch": 3.8791366906474822, "grad_norm": 0.10968089516367259, "learning_rate": 1.1637692075227451e-05, "loss": 0.3643, "step": 2696 }, { "epoch": 3.8805755395683454, "grad_norm": 0.09167057693074489, "learning_rate": 1.1609368747174883e-05, "loss": 0.3631, "step": 2697 }, { "epoch": 3.8820143884892087, "grad_norm": 0.09608290565096088, "learning_rate": 1.1581074075348431e-05, "loss": 0.3558, "step": 2698 }, { "epoch": 3.883453237410072, "grad_norm": 0.11203789648383493, "learning_rate": 1.155280808830746e-05, "loss": 0.3637, "step": 2699 }, { "epoch": 3.884892086330935, "grad_norm": 0.09942035822295522, "learning_rate": 1.15245708145824e-05, "loss": 0.3676, "step": 2700 }, { "epoch": 3.8863309352517987, "grad_norm": 0.10350023938493974, "learning_rate": 1.1496362282674647e-05, "loss": 0.3614, "step": 2701 }, { "epoch": 3.887769784172662, "grad_norm": 0.09981408944424823, "learning_rate": 1.1468182521056663e-05, "loss": 0.359, "step": 2702 }, { "epoch": 3.889208633093525, "grad_norm": 0.11511236345501889, "learning_rate": 1.1440031558171834e-05, "loss": 0.3608, "step": 2703 }, { "epoch": 3.8906474820143884, "grad_norm": 0.10090422597951822, "learning_rate": 1.1411909422434441e-05, "loss": 0.3637, "step": 2704 }, { "epoch": 3.8920863309352516, "grad_norm": 0.10735672779609931, "learning_rate": 1.1383816142229715e-05, "loss": 0.3629, "step": 2705 }, { "epoch": 3.8935251798561152, "grad_norm": 0.11333440737627762, "learning_rate": 1.1355751745913781e-05, "loss": 0.3596, "step": 2706 }, { "epoch": 3.8949640287769784, "grad_norm": 0.0950438089714313, "learning_rate": 1.1327716261813539e-05, "loss": 0.3573, "step": 2707 }, { "epoch": 3.8964028776978417, "grad_norm": 0.10940745545202456, "learning_rate": 1.1299709718226745e-05, "loss": 0.3588, "step": 2708 }, { "epoch": 3.897841726618705, "grad_norm": 0.12759866682775797, "learning_rate": 1.1271732143421992e-05, "loss": 0.3525, "step": 2709 }, { "epoch": 3.899280575539568, "grad_norm": 0.11431046414492715, "learning_rate": 1.1243783565638533e-05, "loss": 0.3651, "step": 2710 }, { "epoch": 3.9007194244604317, "grad_norm": 0.9564132684060368, "learning_rate": 1.121586401308643e-05, "loss": 0.3685, "step": 2711 }, { "epoch": 3.902158273381295, "grad_norm": 0.11077573358899574, "learning_rate": 1.1187973513946417e-05, "loss": 0.3634, "step": 2712 }, { "epoch": 3.903597122302158, "grad_norm": 0.10778792389149551, "learning_rate": 1.1160112096369913e-05, "loss": 0.3593, "step": 2713 }, { "epoch": 3.905035971223022, "grad_norm": 0.11189896505320317, "learning_rate": 1.1132279788478977e-05, "loss": 0.3585, "step": 2714 }, { "epoch": 3.906474820143885, "grad_norm": 0.1052821688406415, "learning_rate": 1.1104476618366298e-05, "loss": 0.3558, "step": 2715 }, { "epoch": 3.9079136690647482, "grad_norm": 0.11120022223037514, "learning_rate": 1.1076702614095116e-05, "loss": 0.3627, "step": 2716 }, { "epoch": 3.9093525179856115, "grad_norm": 0.11008204754639704, "learning_rate": 1.1048957803699292e-05, "loss": 0.3629, "step": 2717 }, { "epoch": 3.9107913669064747, "grad_norm": 0.12393216658264912, "learning_rate": 1.1021242215183193e-05, "loss": 0.3694, "step": 2718 }, { "epoch": 3.9122302158273383, "grad_norm": 0.4970887948932858, "learning_rate": 1.0993555876521658e-05, "loss": 0.3655, "step": 2719 }, { "epoch": 3.9136690647482015, "grad_norm": 0.14163532032564988, "learning_rate": 1.096589881566005e-05, "loss": 0.3663, "step": 2720 }, { "epoch": 3.9151079136690647, "grad_norm": 0.09692858784116108, "learning_rate": 1.0938271060514162e-05, "loss": 0.3583, "step": 2721 }, { "epoch": 3.916546762589928, "grad_norm": 0.12425912973919774, "learning_rate": 1.0910672638970206e-05, "loss": 0.357, "step": 2722 }, { "epoch": 3.917985611510791, "grad_norm": 0.11842405300151483, "learning_rate": 1.0883103578884784e-05, "loss": 0.3635, "step": 2723 }, { "epoch": 3.919424460431655, "grad_norm": 0.09541247158438779, "learning_rate": 1.085556390808487e-05, "loss": 0.3595, "step": 2724 }, { "epoch": 3.920863309352518, "grad_norm": 0.11200656304991745, "learning_rate": 1.082805365436776e-05, "loss": 0.3679, "step": 2725 }, { "epoch": 3.9223021582733812, "grad_norm": 0.11181453719163843, "learning_rate": 1.0800572845501095e-05, "loss": 0.3567, "step": 2726 }, { "epoch": 3.9237410071942445, "grad_norm": 0.09861364204229336, "learning_rate": 1.0773121509222712e-05, "loss": 0.3672, "step": 2727 }, { "epoch": 3.9251798561151077, "grad_norm": 0.1057757176136241, "learning_rate": 1.0745699673240808e-05, "loss": 0.36, "step": 2728 }, { "epoch": 3.9266187050359713, "grad_norm": 0.1019318459420541, "learning_rate": 1.0718307365233737e-05, "loss": 0.3543, "step": 2729 }, { "epoch": 3.9280575539568345, "grad_norm": 0.10306055253627819, "learning_rate": 1.0690944612850052e-05, "loss": 0.3577, "step": 2730 }, { "epoch": 3.9294964028776977, "grad_norm": 0.11664140763494153, "learning_rate": 1.0663611443708471e-05, "loss": 0.3594, "step": 2731 }, { "epoch": 3.9309352517985614, "grad_norm": 0.11073226114226936, "learning_rate": 1.0636307885397911e-05, "loss": 0.3689, "step": 2732 }, { "epoch": 3.9323741007194246, "grad_norm": 0.10681885965436957, "learning_rate": 1.0609033965477318e-05, "loss": 0.3624, "step": 2733 }, { "epoch": 3.933812949640288, "grad_norm": 0.12500736336387938, "learning_rate": 1.0581789711475752e-05, "loss": 0.3665, "step": 2734 }, { "epoch": 3.935251798561151, "grad_norm": 0.09930500365361575, "learning_rate": 1.0554575150892386e-05, "loss": 0.3548, "step": 2735 }, { "epoch": 3.9366906474820142, "grad_norm": 0.1021624596019113, "learning_rate": 1.0527390311196326e-05, "loss": 0.3545, "step": 2736 }, { "epoch": 3.938129496402878, "grad_norm": 0.10628437462092273, "learning_rate": 1.0500235219826748e-05, "loss": 0.362, "step": 2737 }, { "epoch": 3.939568345323741, "grad_norm": 0.11071067883850721, "learning_rate": 1.0473109904192773e-05, "loss": 0.3587, "step": 2738 }, { "epoch": 3.9410071942446043, "grad_norm": 0.1068522520290516, "learning_rate": 1.0446014391673476e-05, "loss": 0.3657, "step": 2739 }, { "epoch": 3.9424460431654675, "grad_norm": 0.09185630866760745, "learning_rate": 1.0418948709617846e-05, "loss": 0.3657, "step": 2740 }, { "epoch": 3.9438848920863308, "grad_norm": 0.12260735019479156, "learning_rate": 1.0391912885344784e-05, "loss": 0.3527, "step": 2741 }, { "epoch": 3.9453237410071944, "grad_norm": 0.10653215957239656, "learning_rate": 1.0364906946142996e-05, "loss": 0.3583, "step": 2742 }, { "epoch": 3.9467625899280576, "grad_norm": 0.1042669525207971, "learning_rate": 1.0337930919271094e-05, "loss": 0.365, "step": 2743 }, { "epoch": 3.948201438848921, "grad_norm": 0.1121419412537695, "learning_rate": 1.0310984831957471e-05, "loss": 0.3572, "step": 2744 }, { "epoch": 3.949640287769784, "grad_norm": 0.10209132317781502, "learning_rate": 1.0284068711400254e-05, "loss": 0.3573, "step": 2745 }, { "epoch": 3.9510791366906473, "grad_norm": 0.0996760617468679, "learning_rate": 1.0257182584767423e-05, "loss": 0.3623, "step": 2746 }, { "epoch": 3.952517985611511, "grad_norm": 0.11716774537978202, "learning_rate": 1.0230326479196573e-05, "loss": 0.3556, "step": 2747 }, { "epoch": 3.953956834532374, "grad_norm": 0.09557787024252977, "learning_rate": 1.0203500421795075e-05, "loss": 0.362, "step": 2748 }, { "epoch": 3.9553956834532373, "grad_norm": 0.1148898935436786, "learning_rate": 1.017670443963994e-05, "loss": 0.3663, "step": 2749 }, { "epoch": 3.956834532374101, "grad_norm": 0.10351097078129262, "learning_rate": 1.0149938559777825e-05, "loss": 0.363, "step": 2750 }, { "epoch": 3.9582733812949638, "grad_norm": 0.0980401694645122, "learning_rate": 1.0123202809225009e-05, "loss": 0.3575, "step": 2751 }, { "epoch": 3.9597122302158274, "grad_norm": 0.09750837058182665, "learning_rate": 1.0096497214967349e-05, "loss": 0.3614, "step": 2752 }, { "epoch": 3.9611510791366906, "grad_norm": 0.11031556302217374, "learning_rate": 1.0069821803960277e-05, "loss": 0.3541, "step": 2753 }, { "epoch": 3.962589928057554, "grad_norm": 0.09977694465125021, "learning_rate": 1.0043176603128755e-05, "loss": 0.3623, "step": 2754 }, { "epoch": 3.9640287769784175, "grad_norm": 0.11347225388698515, "learning_rate": 1.0016561639367253e-05, "loss": 0.3611, "step": 2755 }, { "epoch": 3.9654676258992807, "grad_norm": 0.11182600990952397, "learning_rate": 9.989976939539687e-06, "loss": 0.3589, "step": 2756 }, { "epoch": 3.966906474820144, "grad_norm": 0.09250923238264663, "learning_rate": 9.963422530479496e-06, "loss": 0.3565, "step": 2757 }, { "epoch": 3.968345323741007, "grad_norm": 0.09920114232935916, "learning_rate": 9.936898438989507e-06, "loss": 0.351, "step": 2758 }, { "epoch": 3.9697841726618703, "grad_norm": 0.10377812097578386, "learning_rate": 9.910404691841915e-06, "loss": 0.3604, "step": 2759 }, { "epoch": 3.971223021582734, "grad_norm": 0.09473159919254981, "learning_rate": 9.883941315778319e-06, "loss": 0.3551, "step": 2760 }, { "epoch": 3.972661870503597, "grad_norm": 0.12287944307641946, "learning_rate": 9.857508337509692e-06, "loss": 0.3622, "step": 2761 }, { "epoch": 3.9741007194244604, "grad_norm": 0.09875799435187659, "learning_rate": 9.831105783716266e-06, "loss": 0.3619, "step": 2762 }, { "epoch": 3.9755395683453236, "grad_norm": 0.10299870999836316, "learning_rate": 9.8047336810476e-06, "loss": 0.3603, "step": 2763 }, { "epoch": 3.976978417266187, "grad_norm": 0.10514063205962414, "learning_rate": 9.778392056122503e-06, "loss": 0.3573, "step": 2764 }, { "epoch": 3.9784172661870505, "grad_norm": 0.08862739370425651, "learning_rate": 9.752080935529037e-06, "loss": 0.3566, "step": 2765 }, { "epoch": 3.9798561151079137, "grad_norm": 0.10420965063882555, "learning_rate": 9.725800345824453e-06, "loss": 0.3601, "step": 2766 }, { "epoch": 3.981294964028777, "grad_norm": 0.11741476939494742, "learning_rate": 9.699550313535196e-06, "loss": 0.3626, "step": 2767 }, { "epoch": 3.98273381294964, "grad_norm": 0.1051511996966301, "learning_rate": 9.673330865156875e-06, "loss": 0.3697, "step": 2768 }, { "epoch": 3.9841726618705033, "grad_norm": 0.10984037723207088, "learning_rate": 9.647142027154222e-06, "loss": 0.362, "step": 2769 }, { "epoch": 3.985611510791367, "grad_norm": 0.10093858750504218, "learning_rate": 9.620983825961078e-06, "loss": 0.3552, "step": 2770 }, { "epoch": 3.98705035971223, "grad_norm": 0.09746498465302932, "learning_rate": 9.594856287980323e-06, "loss": 0.3524, "step": 2771 }, { "epoch": 3.9884892086330934, "grad_norm": 0.10533187198362187, "learning_rate": 9.56875943958396e-06, "loss": 0.3623, "step": 2772 }, { "epoch": 3.989928057553957, "grad_norm": 0.09715565692470797, "learning_rate": 9.542693307112949e-06, "loss": 0.3556, "step": 2773 }, { "epoch": 3.9913669064748203, "grad_norm": 0.09676232181036293, "learning_rate": 9.516657916877272e-06, "loss": 0.3578, "step": 2774 }, { "epoch": 3.9928057553956835, "grad_norm": 0.12015577947176528, "learning_rate": 9.490653295155891e-06, "loss": 0.3594, "step": 2775 }, { "epoch": 3.9942446043165467, "grad_norm": 0.09481055578760383, "learning_rate": 9.464679468196696e-06, "loss": 0.3558, "step": 2776 }, { "epoch": 3.99568345323741, "grad_norm": 0.09389360547241736, "learning_rate": 9.438736462216496e-06, "loss": 0.3605, "step": 2777 }, { "epoch": 3.9971223021582736, "grad_norm": 0.10665313888423696, "learning_rate": 9.412824303401003e-06, "loss": 0.362, "step": 2778 }, { "epoch": 3.998561151079137, "grad_norm": 0.09496625781759228, "learning_rate": 9.38694301790478e-06, "loss": 0.374, "step": 2779 }, { "epoch": 4.0, "grad_norm": 0.10124169992214632, "learning_rate": 9.361092631851228e-06, "loss": 0.3679, "step": 2780 }, { "epoch": 4.001438848920864, "grad_norm": 0.13559259159060802, "learning_rate": 9.335273171332581e-06, "loss": 0.3454, "step": 2781 }, { "epoch": 4.002877697841726, "grad_norm": 0.10035380451874652, "learning_rate": 9.30948466240981e-06, "loss": 0.3419, "step": 2782 }, { "epoch": 4.00431654676259, "grad_norm": 0.09733148277656034, "learning_rate": 9.2837271311127e-06, "loss": 0.3421, "step": 2783 }, { "epoch": 4.005755395683453, "grad_norm": 0.10382884714560332, "learning_rate": 9.25800060343975e-06, "loss": 0.3401, "step": 2784 }, { "epoch": 4.0071942446043165, "grad_norm": 0.11033527019094946, "learning_rate": 9.232305105358139e-06, "loss": 0.3463, "step": 2785 }, { "epoch": 4.00863309352518, "grad_norm": 0.12516376026251724, "learning_rate": 9.206640662803746e-06, "loss": 0.3374, "step": 2786 }, { "epoch": 4.010071942446043, "grad_norm": 0.1094679305415975, "learning_rate": 9.181007301681135e-06, "loss": 0.3488, "step": 2787 }, { "epoch": 4.011510791366907, "grad_norm": 0.11467050689120065, "learning_rate": 9.155405047863439e-06, "loss": 0.3389, "step": 2788 }, { "epoch": 4.012949640287769, "grad_norm": 0.11131158738810967, "learning_rate": 9.12983392719243e-06, "loss": 0.3412, "step": 2789 }, { "epoch": 4.014388489208633, "grad_norm": 0.11992712129415503, "learning_rate": 9.104293965478446e-06, "loss": 0.3379, "step": 2790 }, { "epoch": 4.015827338129497, "grad_norm": 0.10721680297702718, "learning_rate": 9.078785188500378e-06, "loss": 0.3454, "step": 2791 }, { "epoch": 4.017266187050359, "grad_norm": 0.11297504942643836, "learning_rate": 9.053307622005639e-06, "loss": 0.3383, "step": 2792 }, { "epoch": 4.018705035971223, "grad_norm": 0.10600194033425428, "learning_rate": 9.02786129171013e-06, "loss": 0.3315, "step": 2793 }, { "epoch": 4.020143884892087, "grad_norm": 0.10334289899927786, "learning_rate": 9.002446223298244e-06, "loss": 0.3448, "step": 2794 }, { "epoch": 4.0215827338129495, "grad_norm": 0.11321397054770597, "learning_rate": 8.977062442422796e-06, "loss": 0.3368, "step": 2795 }, { "epoch": 4.023021582733813, "grad_norm": 0.1035332486262864, "learning_rate": 8.951709974705057e-06, "loss": 0.3514, "step": 2796 }, { "epoch": 4.024460431654676, "grad_norm": 0.09504472096495813, "learning_rate": 8.926388845734624e-06, "loss": 0.3445, "step": 2797 }, { "epoch": 4.02589928057554, "grad_norm": 0.10885943183780947, "learning_rate": 8.901099081069553e-06, "loss": 0.337, "step": 2798 }, { "epoch": 4.027338129496403, "grad_norm": 0.09986907982849921, "learning_rate": 8.875840706236163e-06, "loss": 0.3352, "step": 2799 }, { "epoch": 4.028776978417266, "grad_norm": 0.10310959624014754, "learning_rate": 8.850613746729117e-06, "loss": 0.3403, "step": 2800 }, { "epoch": 4.03021582733813, "grad_norm": 0.10520753354202317, "learning_rate": 8.825418228011413e-06, "loss": 0.3388, "step": 2801 }, { "epoch": 4.031654676258992, "grad_norm": 0.09357355812436322, "learning_rate": 8.80025417551424e-06, "loss": 0.3393, "step": 2802 }, { "epoch": 4.033093525179856, "grad_norm": 0.09943262643204327, "learning_rate": 8.775121614637064e-06, "loss": 0.3445, "step": 2803 }, { "epoch": 4.03453237410072, "grad_norm": 0.0931000734423195, "learning_rate": 8.750020570747568e-06, "loss": 0.3434, "step": 2804 }, { "epoch": 4.0359712230215825, "grad_norm": 0.10407012795568063, "learning_rate": 8.724951069181617e-06, "loss": 0.3408, "step": 2805 }, { "epoch": 4.037410071942446, "grad_norm": 0.09234876336430461, "learning_rate": 8.699913135243237e-06, "loss": 0.346, "step": 2806 }, { "epoch": 4.038848920863309, "grad_norm": 0.0948259182251536, "learning_rate": 8.6749067942046e-06, "loss": 0.3477, "step": 2807 }, { "epoch": 4.040287769784173, "grad_norm": 0.09581875981707894, "learning_rate": 8.649932071305952e-06, "loss": 0.3444, "step": 2808 }, { "epoch": 4.041726618705036, "grad_norm": 0.09404360483482738, "learning_rate": 8.624988991755687e-06, "loss": 0.3484, "step": 2809 }, { "epoch": 4.043165467625899, "grad_norm": 0.09424538079465061, "learning_rate": 8.60007758073023e-06, "loss": 0.3422, "step": 2810 }, { "epoch": 4.044604316546763, "grad_norm": 0.09028279429198212, "learning_rate": 8.575197863374006e-06, "loss": 0.3389, "step": 2811 }, { "epoch": 4.046043165467626, "grad_norm": 0.09737949864105801, "learning_rate": 8.550349864799505e-06, "loss": 0.3464, "step": 2812 }, { "epoch": 4.047482014388489, "grad_norm": 0.09569267256554466, "learning_rate": 8.525533610087193e-06, "loss": 0.3433, "step": 2813 }, { "epoch": 4.048920863309353, "grad_norm": 0.816965116942467, "learning_rate": 8.500749124285455e-06, "loss": 0.3512, "step": 2814 }, { "epoch": 4.0503597122302155, "grad_norm": 0.08126219538723158, "learning_rate": 8.475996432410642e-06, "loss": 0.3271, "step": 2815 }, { "epoch": 4.051798561151079, "grad_norm": 0.08746993619788168, "learning_rate": 8.451275559447011e-06, "loss": 0.3399, "step": 2816 }, { "epoch": 4.053237410071943, "grad_norm": 0.0902846287819087, "learning_rate": 8.426586530346705e-06, "loss": 0.3496, "step": 2817 }, { "epoch": 4.054676258992806, "grad_norm": 0.10193643857742915, "learning_rate": 8.401929370029708e-06, "loss": 0.3486, "step": 2818 }, { "epoch": 4.056115107913669, "grad_norm": 0.09145226631320796, "learning_rate": 8.377304103383857e-06, "loss": 0.3482, "step": 2819 }, { "epoch": 4.057553956834532, "grad_norm": 0.09814500261065982, "learning_rate": 8.352710755264786e-06, "loss": 0.336, "step": 2820 }, { "epoch": 4.058992805755396, "grad_norm": 0.09982457529970826, "learning_rate": 8.328149350495916e-06, "loss": 0.3435, "step": 2821 }, { "epoch": 4.060431654676259, "grad_norm": 0.08801249772630416, "learning_rate": 8.303619913868427e-06, "loss": 0.3424, "step": 2822 }, { "epoch": 4.061870503597122, "grad_norm": 0.09319954228614029, "learning_rate": 8.279122470141208e-06, "loss": 0.3392, "step": 2823 }, { "epoch": 4.063309352517986, "grad_norm": 0.0960504249714296, "learning_rate": 8.254657044040914e-06, "loss": 0.3507, "step": 2824 }, { "epoch": 4.0647482014388485, "grad_norm": 0.08805181094143594, "learning_rate": 8.230223660261814e-06, "loss": 0.3441, "step": 2825 }, { "epoch": 4.066187050359712, "grad_norm": 0.08993959326068168, "learning_rate": 8.205822343465865e-06, "loss": 0.3438, "step": 2826 }, { "epoch": 4.067625899280576, "grad_norm": 0.09462892477326966, "learning_rate": 8.181453118282694e-06, "loss": 0.3461, "step": 2827 }, { "epoch": 4.069064748201439, "grad_norm": 0.09223113763588198, "learning_rate": 8.157116009309467e-06, "loss": 0.3357, "step": 2828 }, { "epoch": 4.070503597122302, "grad_norm": 0.0947373152478922, "learning_rate": 8.132811041110976e-06, "loss": 0.3385, "step": 2829 }, { "epoch": 4.071942446043165, "grad_norm": 0.0911691108813414, "learning_rate": 8.108538238219564e-06, "loss": 0.3482, "step": 2830 }, { "epoch": 4.073381294964029, "grad_norm": 0.09481325760507595, "learning_rate": 8.084297625135104e-06, "loss": 0.3484, "step": 2831 }, { "epoch": 4.074820143884892, "grad_norm": 0.09280800444964485, "learning_rate": 8.060089226324987e-06, "loss": 0.3396, "step": 2832 }, { "epoch": 4.076258992805755, "grad_norm": 0.09206834340203833, "learning_rate": 8.035913066224088e-06, "loss": 0.3402, "step": 2833 }, { "epoch": 4.077697841726619, "grad_norm": 0.09786690966344266, "learning_rate": 8.0117691692347e-06, "loss": 0.3448, "step": 2834 }, { "epoch": 4.079136690647482, "grad_norm": 0.09435932024235821, "learning_rate": 7.987657559726628e-06, "loss": 0.3394, "step": 2835 }, { "epoch": 4.080575539568345, "grad_norm": 0.10264188177189495, "learning_rate": 7.963578262037038e-06, "loss": 0.343, "step": 2836 }, { "epoch": 4.082014388489209, "grad_norm": 0.08142038934543594, "learning_rate": 7.939531300470458e-06, "loss": 0.3377, "step": 2837 }, { "epoch": 4.083453237410072, "grad_norm": 0.10294910680683012, "learning_rate": 7.915516699298847e-06, "loss": 0.3332, "step": 2838 }, { "epoch": 4.084892086330935, "grad_norm": 0.09467645237728697, "learning_rate": 7.891534482761463e-06, "loss": 0.3411, "step": 2839 }, { "epoch": 4.086330935251799, "grad_norm": 0.09106635791743539, "learning_rate": 7.867584675064846e-06, "loss": 0.342, "step": 2840 }, { "epoch": 4.087769784172662, "grad_norm": 0.10286179689172759, "learning_rate": 7.843667300382863e-06, "loss": 0.3383, "step": 2841 }, { "epoch": 4.089208633093525, "grad_norm": 0.09020293381361195, "learning_rate": 7.81978238285667e-06, "loss": 0.3416, "step": 2842 }, { "epoch": 4.090647482014388, "grad_norm": 0.09102358622946989, "learning_rate": 7.795929946594584e-06, "loss": 0.3543, "step": 2843 }, { "epoch": 4.092086330935252, "grad_norm": 0.09068947018842027, "learning_rate": 7.772110015672209e-06, "loss": 0.3471, "step": 2844 }, { "epoch": 4.093525179856115, "grad_norm": 0.1039169415430003, "learning_rate": 7.748322614132297e-06, "loss": 0.3429, "step": 2845 }, { "epoch": 4.094964028776978, "grad_norm": 0.09358126911792906, "learning_rate": 7.72456776598479e-06, "loss": 0.3385, "step": 2846 }, { "epoch": 4.096402877697842, "grad_norm": 0.0917379650216092, "learning_rate": 7.70084549520676e-06, "loss": 0.3536, "step": 2847 }, { "epoch": 4.097841726618705, "grad_norm": 0.09264993647622498, "learning_rate": 7.6771558257424e-06, "loss": 0.3459, "step": 2848 }, { "epoch": 4.099280575539568, "grad_norm": 0.1618684592841564, "learning_rate": 7.653498781502997e-06, "loss": 0.3381, "step": 2849 }, { "epoch": 4.100719424460432, "grad_norm": 0.09029043245153334, "learning_rate": 7.629874386366918e-06, "loss": 0.3446, "step": 2850 }, { "epoch": 4.102158273381295, "grad_norm": 0.09763948580556599, "learning_rate": 7.606282664179545e-06, "loss": 0.3391, "step": 2851 }, { "epoch": 4.103597122302158, "grad_norm": 0.08585510341128091, "learning_rate": 7.5827236387532976e-06, "loss": 0.3444, "step": 2852 }, { "epoch": 4.105035971223022, "grad_norm": 0.08803938625909877, "learning_rate": 7.559197333867629e-06, "loss": 0.3463, "step": 2853 }, { "epoch": 4.106474820143885, "grad_norm": 0.08232335896419818, "learning_rate": 7.53570377326891e-06, "loss": 0.3366, "step": 2854 }, { "epoch": 4.107913669064748, "grad_norm": 0.09020494999270168, "learning_rate": 7.512242980670481e-06, "loss": 0.3452, "step": 2855 }, { "epoch": 4.109352517985611, "grad_norm": 0.08940659670696266, "learning_rate": 7.488814979752615e-06, "loss": 0.3468, "step": 2856 }, { "epoch": 4.110791366906475, "grad_norm": 0.0849791874102727, "learning_rate": 7.465419794162487e-06, "loss": 0.3388, "step": 2857 }, { "epoch": 4.1122302158273385, "grad_norm": 0.09211763625939283, "learning_rate": 7.442057447514144e-06, "loss": 0.336, "step": 2858 }, { "epoch": 4.113669064748201, "grad_norm": 0.09220375553797318, "learning_rate": 7.418727963388481e-06, "loss": 0.3368, "step": 2859 }, { "epoch": 4.115107913669065, "grad_norm": 0.08853290729700264, "learning_rate": 7.395431365333241e-06, "loss": 0.3392, "step": 2860 }, { "epoch": 4.116546762589928, "grad_norm": 0.09169049439407151, "learning_rate": 7.372167676862952e-06, "loss": 0.3507, "step": 2861 }, { "epoch": 4.117985611510791, "grad_norm": 0.09536571699126575, "learning_rate": 7.348936921458949e-06, "loss": 0.3428, "step": 2862 }, { "epoch": 4.119424460431655, "grad_norm": 0.08894051229026097, "learning_rate": 7.325739122569282e-06, "loss": 0.3465, "step": 2863 }, { "epoch": 4.120863309352518, "grad_norm": 0.10288922380997478, "learning_rate": 7.302574303608794e-06, "loss": 0.3428, "step": 2864 }, { "epoch": 4.122302158273381, "grad_norm": 0.09496950439862678, "learning_rate": 7.279442487959012e-06, "loss": 0.3409, "step": 2865 }, { "epoch": 4.123741007194244, "grad_norm": 0.0900851360203878, "learning_rate": 7.256343698968131e-06, "loss": 0.339, "step": 2866 }, { "epoch": 4.125179856115108, "grad_norm": 0.10725679388307317, "learning_rate": 7.233277959951026e-06, "loss": 0.3503, "step": 2867 }, { "epoch": 4.1266187050359715, "grad_norm": 0.09041396138224692, "learning_rate": 7.210245294189251e-06, "loss": 0.3356, "step": 2868 }, { "epoch": 4.128057553956834, "grad_norm": 0.08832255033104126, "learning_rate": 7.187245724930911e-06, "loss": 0.3417, "step": 2869 }, { "epoch": 4.129496402877698, "grad_norm": 0.09162282354223023, "learning_rate": 7.164279275390749e-06, "loss": 0.3523, "step": 2870 }, { "epoch": 4.130935251798562, "grad_norm": 0.09564916976117638, "learning_rate": 7.14134596875006e-06, "loss": 0.3455, "step": 2871 }, { "epoch": 4.132374100719424, "grad_norm": 0.08867960402246636, "learning_rate": 7.118445828156697e-06, "loss": 0.3403, "step": 2872 }, { "epoch": 4.133812949640288, "grad_norm": 0.0935344041429054, "learning_rate": 7.0955788767250334e-06, "loss": 0.3447, "step": 2873 }, { "epoch": 4.135251798561151, "grad_norm": 0.10034740439432684, "learning_rate": 7.0727451375359345e-06, "loss": 0.3431, "step": 2874 }, { "epoch": 4.136690647482014, "grad_norm": 0.08893466563609016, "learning_rate": 7.049944633636756e-06, "loss": 0.3402, "step": 2875 }, { "epoch": 4.138129496402878, "grad_norm": 0.09170257237482983, "learning_rate": 7.027177388041311e-06, "loss": 0.344, "step": 2876 }, { "epoch": 4.139568345323741, "grad_norm": 0.09811187802598147, "learning_rate": 7.004443423729808e-06, "loss": 0.3527, "step": 2877 }, { "epoch": 4.1410071942446045, "grad_norm": 0.08494697223687862, "learning_rate": 6.981742763648891e-06, "loss": 0.3307, "step": 2878 }, { "epoch": 4.142446043165467, "grad_norm": 0.0892102471859085, "learning_rate": 6.959075430711614e-06, "loss": 0.3417, "step": 2879 }, { "epoch": 4.143884892086331, "grad_norm": 0.08902351209775089, "learning_rate": 6.936441447797335e-06, "loss": 0.3433, "step": 2880 }, { "epoch": 4.145323741007195, "grad_norm": 0.09282861304930493, "learning_rate": 6.913840837751778e-06, "loss": 0.3512, "step": 2881 }, { "epoch": 4.146762589928057, "grad_norm": 0.08886461511735246, "learning_rate": 6.8912736233870095e-06, "loss": 0.3457, "step": 2882 }, { "epoch": 4.148201438848921, "grad_norm": 0.08739195560940712, "learning_rate": 6.868739827481335e-06, "loss": 0.3505, "step": 2883 }, { "epoch": 4.149640287769784, "grad_norm": 0.08097011756110467, "learning_rate": 6.846239472779359e-06, "loss": 0.3376, "step": 2884 }, { "epoch": 4.151079136690647, "grad_norm": 0.08747604945027315, "learning_rate": 6.82377258199193e-06, "loss": 0.3412, "step": 2885 }, { "epoch": 4.152517985611511, "grad_norm": 0.08555350508939398, "learning_rate": 6.80133917779612e-06, "loss": 0.3374, "step": 2886 }, { "epoch": 4.153956834532374, "grad_norm": 0.0938565644787564, "learning_rate": 6.778939282835195e-06, "loss": 0.3465, "step": 2887 }, { "epoch": 4.1553956834532375, "grad_norm": 0.08617666533820252, "learning_rate": 6.756572919718611e-06, "loss": 0.3406, "step": 2888 }, { "epoch": 4.1568345323741, "grad_norm": 0.09348935554855656, "learning_rate": 6.734240111021937e-06, "loss": 0.3354, "step": 2889 }, { "epoch": 4.158273381294964, "grad_norm": 0.08692755170714953, "learning_rate": 6.711940879286944e-06, "loss": 0.3536, "step": 2890 }, { "epoch": 4.159712230215828, "grad_norm": 0.09515313442434982, "learning_rate": 6.689675247021461e-06, "loss": 0.3456, "step": 2891 }, { "epoch": 4.16115107913669, "grad_norm": 0.10434344890434036, "learning_rate": 6.667443236699398e-06, "loss": 0.3529, "step": 2892 }, { "epoch": 4.162589928057554, "grad_norm": 0.08869915921598391, "learning_rate": 6.64524487076077e-06, "loss": 0.3383, "step": 2893 }, { "epoch": 4.164028776978418, "grad_norm": 0.0920159140031608, "learning_rate": 6.623080171611605e-06, "loss": 0.3447, "step": 2894 }, { "epoch": 4.16546762589928, "grad_norm": 0.09685356082379706, "learning_rate": 6.600949161623939e-06, "loss": 0.3511, "step": 2895 }, { "epoch": 4.166906474820144, "grad_norm": 0.10215445669580586, "learning_rate": 6.578851863135831e-06, "loss": 0.3424, "step": 2896 }, { "epoch": 4.168345323741007, "grad_norm": 0.09084195527806635, "learning_rate": 6.556788298451291e-06, "loss": 0.3358, "step": 2897 }, { "epoch": 4.1697841726618705, "grad_norm": 0.09877614034532968, "learning_rate": 6.534758489840296e-06, "loss": 0.3506, "step": 2898 }, { "epoch": 4.171223021582734, "grad_norm": 0.09779949648074235, "learning_rate": 6.512762459538744e-06, "loss": 0.3439, "step": 2899 }, { "epoch": 4.172661870503597, "grad_norm": 0.08404080007595911, "learning_rate": 6.49080022974843e-06, "loss": 0.3417, "step": 2900 }, { "epoch": 4.174100719424461, "grad_norm": 0.0964919782724387, "learning_rate": 6.468871822637051e-06, "loss": 0.3412, "step": 2901 }, { "epoch": 4.175539568345323, "grad_norm": 0.12962737131413704, "learning_rate": 6.446977260338152e-06, "loss": 0.3445, "step": 2902 }, { "epoch": 4.176978417266187, "grad_norm": 0.08778057787721012, "learning_rate": 6.425116564951115e-06, "loss": 0.3392, "step": 2903 }, { "epoch": 4.178417266187051, "grad_norm": 0.09106657357644896, "learning_rate": 6.403289758541143e-06, "loss": 0.338, "step": 2904 }, { "epoch": 4.179856115107913, "grad_norm": 0.09364878729531954, "learning_rate": 6.381496863139247e-06, "loss": 0.3487, "step": 2905 }, { "epoch": 4.181294964028777, "grad_norm": 0.08472370937562462, "learning_rate": 6.3597379007421755e-06, "loss": 0.3533, "step": 2906 }, { "epoch": 4.18273381294964, "grad_norm": 0.0931818196092673, "learning_rate": 6.338012893312444e-06, "loss": 0.3544, "step": 2907 }, { "epoch": 4.1841726618705035, "grad_norm": 0.08969813506885384, "learning_rate": 6.31632186277833e-06, "loss": 0.3588, "step": 2908 }, { "epoch": 4.185611510791367, "grad_norm": 0.08392440444302791, "learning_rate": 6.294664831033746e-06, "loss": 0.3434, "step": 2909 }, { "epoch": 4.18705035971223, "grad_norm": 0.08789028884788187, "learning_rate": 6.273041819938343e-06, "loss": 0.353, "step": 2910 }, { "epoch": 4.188489208633094, "grad_norm": 0.089880166974654, "learning_rate": 6.251452851317421e-06, "loss": 0.3456, "step": 2911 }, { "epoch": 4.189928057553957, "grad_norm": 0.09249524397394611, "learning_rate": 6.229897946961903e-06, "loss": 0.3412, "step": 2912 }, { "epoch": 4.19136690647482, "grad_norm": 0.0858512697385683, "learning_rate": 6.20837712862834e-06, "loss": 0.3415, "step": 2913 }, { "epoch": 4.192805755395684, "grad_norm": 0.08636326475073688, "learning_rate": 6.186890418038887e-06, "loss": 0.3407, "step": 2914 }, { "epoch": 4.194244604316546, "grad_norm": 0.08556622288993193, "learning_rate": 6.165437836881256e-06, "loss": 0.3357, "step": 2915 }, { "epoch": 4.19568345323741, "grad_norm": 0.08470488394732564, "learning_rate": 6.144019406808724e-06, "loss": 0.3398, "step": 2916 }, { "epoch": 4.197122302158274, "grad_norm": 0.0880734728867388, "learning_rate": 6.122635149440093e-06, "loss": 0.3415, "step": 2917 }, { "epoch": 4.1985611510791365, "grad_norm": 0.11309470200267255, "learning_rate": 6.101285086359645e-06, "loss": 0.3442, "step": 2918 }, { "epoch": 4.2, "grad_norm": 0.08545897562989867, "learning_rate": 6.079969239117201e-06, "loss": 0.3596, "step": 2919 }, { "epoch": 4.201438848920863, "grad_norm": 0.08692145679669436, "learning_rate": 6.05868762922802e-06, "loss": 0.3499, "step": 2920 }, { "epoch": 4.202877697841727, "grad_norm": 0.08901956079471854, "learning_rate": 6.037440278172782e-06, "loss": 0.344, "step": 2921 }, { "epoch": 4.20431654676259, "grad_norm": 0.08386206223360473, "learning_rate": 6.016227207397616e-06, "loss": 0.3456, "step": 2922 }, { "epoch": 4.205755395683453, "grad_norm": 0.08630288027571373, "learning_rate": 5.995048438314044e-06, "loss": 0.3433, "step": 2923 }, { "epoch": 4.207194244604317, "grad_norm": 0.08400093611736381, "learning_rate": 5.973903992298962e-06, "loss": 0.3497, "step": 2924 }, { "epoch": 4.2086330935251794, "grad_norm": 0.08297635184757658, "learning_rate": 5.952793890694617e-06, "loss": 0.3466, "step": 2925 }, { "epoch": 4.210071942446043, "grad_norm": 0.08475397330983742, "learning_rate": 5.9317181548086055e-06, "loss": 0.344, "step": 2926 }, { "epoch": 4.211510791366907, "grad_norm": 0.08413977347488912, "learning_rate": 5.910676805913822e-06, "loss": 0.3411, "step": 2927 }, { "epoch": 4.2129496402877695, "grad_norm": 0.07947451939078741, "learning_rate": 5.889669865248455e-06, "loss": 0.3418, "step": 2928 }, { "epoch": 4.214388489208633, "grad_norm": 0.08340989022991739, "learning_rate": 5.8686973540159706e-06, "loss": 0.3362, "step": 2929 }, { "epoch": 4.215827338129497, "grad_norm": 0.0850053530095417, "learning_rate": 5.847759293385075e-06, "loss": 0.341, "step": 2930 }, { "epoch": 4.21726618705036, "grad_norm": 0.0840802055462811, "learning_rate": 5.8268557044897175e-06, "loss": 0.3498, "step": 2931 }, { "epoch": 4.218705035971223, "grad_norm": 0.08122030264526585, "learning_rate": 5.805986608429019e-06, "loss": 0.3404, "step": 2932 }, { "epoch": 4.220143884892086, "grad_norm": 0.0871425920990511, "learning_rate": 5.785152026267309e-06, "loss": 0.3367, "step": 2933 }, { "epoch": 4.22158273381295, "grad_norm": 0.08484976538740269, "learning_rate": 5.764351979034102e-06, "loss": 0.3473, "step": 2934 }, { "epoch": 4.223021582733813, "grad_norm": 0.08530919098709946, "learning_rate": 5.743586487724e-06, "loss": 0.3478, "step": 2935 }, { "epoch": 4.224460431654676, "grad_norm": 0.08269622733550601, "learning_rate": 5.722855573296775e-06, "loss": 0.3425, "step": 2936 }, { "epoch": 4.22589928057554, "grad_norm": 0.08307626370675829, "learning_rate": 5.702159256677266e-06, "loss": 0.3475, "step": 2937 }, { "epoch": 4.2273381294964025, "grad_norm": 0.08473481837112164, "learning_rate": 5.681497558755417e-06, "loss": 0.3501, "step": 2938 }, { "epoch": 4.228776978417266, "grad_norm": 0.08407206284132669, "learning_rate": 5.6608705003862085e-06, "loss": 0.3466, "step": 2939 }, { "epoch": 4.23021582733813, "grad_norm": 0.08111194604984087, "learning_rate": 5.6402781023896695e-06, "loss": 0.3487, "step": 2940 }, { "epoch": 4.231654676258993, "grad_norm": 0.08693074502656296, "learning_rate": 5.619720385550835e-06, "loss": 0.3408, "step": 2941 }, { "epoch": 4.233093525179856, "grad_norm": 0.0815234523575802, "learning_rate": 5.5991973706197445e-06, "loss": 0.337, "step": 2942 }, { "epoch": 4.234532374100719, "grad_norm": 0.08087147008035375, "learning_rate": 5.578709078311417e-06, "loss": 0.3371, "step": 2943 }, { "epoch": 4.235971223021583, "grad_norm": 0.08160489185806802, "learning_rate": 5.558255529305779e-06, "loss": 0.3375, "step": 2944 }, { "epoch": 4.237410071942446, "grad_norm": 0.08593421266525207, "learning_rate": 5.537836744247753e-06, "loss": 0.3487, "step": 2945 }, { "epoch": 4.238848920863309, "grad_norm": 0.08061252299367003, "learning_rate": 5.517452743747145e-06, "loss": 0.3447, "step": 2946 }, { "epoch": 4.240287769784173, "grad_norm": 0.07808709488841185, "learning_rate": 5.497103548378628e-06, "loss": 0.3324, "step": 2947 }, { "epoch": 4.2417266187050355, "grad_norm": 0.08486904781829617, "learning_rate": 5.476789178681769e-06, "loss": 0.3476, "step": 2948 }, { "epoch": 4.243165467625899, "grad_norm": 0.08844928626365854, "learning_rate": 5.456509655160989e-06, "loss": 0.341, "step": 2949 }, { "epoch": 4.244604316546763, "grad_norm": 0.07812810108986816, "learning_rate": 5.436264998285516e-06, "loss": 0.3549, "step": 2950 }, { "epoch": 4.246043165467626, "grad_norm": 0.08236018914944991, "learning_rate": 5.4160552284894075e-06, "loss": 0.3423, "step": 2951 }, { "epoch": 4.247482014388489, "grad_norm": 0.08402920013679564, "learning_rate": 5.3958803661714865e-06, "loss": 0.3536, "step": 2952 }, { "epoch": 4.248920863309353, "grad_norm": 0.0802633227777653, "learning_rate": 5.375740431695353e-06, "loss": 0.3425, "step": 2953 }, { "epoch": 4.250359712230216, "grad_norm": 0.0831045914244016, "learning_rate": 5.355635445389355e-06, "loss": 0.3546, "step": 2954 }, { "epoch": 4.251798561151079, "grad_norm": 0.08031457033414124, "learning_rate": 5.3355654275465584e-06, "loss": 0.3471, "step": 2955 }, { "epoch": 4.253237410071942, "grad_norm": 0.0892289236594425, "learning_rate": 5.315530398424735e-06, "loss": 0.3427, "step": 2956 }, { "epoch": 4.254676258992806, "grad_norm": 0.08371451371942847, "learning_rate": 5.295530378246354e-06, "loss": 0.3448, "step": 2957 }, { "epoch": 4.256115107913669, "grad_norm": 0.08107850094332589, "learning_rate": 5.27556538719852e-06, "loss": 0.3457, "step": 2958 }, { "epoch": 4.257553956834532, "grad_norm": 0.08153789081196773, "learning_rate": 5.2556354454329895e-06, "loss": 0.3417, "step": 2959 }, { "epoch": 4.258992805755396, "grad_norm": 0.07830203977660653, "learning_rate": 5.235740573066186e-06, "loss": 0.3346, "step": 2960 }, { "epoch": 4.260431654676259, "grad_norm": 0.08268550399709272, "learning_rate": 5.21588079017906e-06, "loss": 0.3457, "step": 2961 }, { "epoch": 4.261870503597122, "grad_norm": 0.08568080562957445, "learning_rate": 5.196056116817194e-06, "loss": 0.3517, "step": 2962 }, { "epoch": 4.263309352517986, "grad_norm": 0.07714225270430754, "learning_rate": 5.1762665729907424e-06, "loss": 0.3364, "step": 2963 }, { "epoch": 4.264748201438849, "grad_norm": 0.08884219846371594, "learning_rate": 5.156512178674358e-06, "loss": 0.3482, "step": 2964 }, { "epoch": 4.266187050359712, "grad_norm": 0.08076451805775184, "learning_rate": 5.136792953807242e-06, "loss": 0.3411, "step": 2965 }, { "epoch": 4.267625899280576, "grad_norm": 0.08119686193415565, "learning_rate": 5.117108918293095e-06, "loss": 0.3431, "step": 2966 }, { "epoch": 4.269064748201439, "grad_norm": 0.08696701935451825, "learning_rate": 5.097460092000095e-06, "loss": 0.3456, "step": 2967 }, { "epoch": 4.270503597122302, "grad_norm": 0.08407445072592444, "learning_rate": 5.07784649476089e-06, "loss": 0.342, "step": 2968 }, { "epoch": 4.271942446043165, "grad_norm": 0.07878122354523241, "learning_rate": 5.058268146372562e-06, "loss": 0.3453, "step": 2969 }, { "epoch": 4.273381294964029, "grad_norm": 0.08790159343290388, "learning_rate": 5.038725066596595e-06, "loss": 0.349, "step": 2970 }, { "epoch": 4.274820143884892, "grad_norm": 0.0857332703437221, "learning_rate": 5.019217275158923e-06, "loss": 0.3445, "step": 2971 }, { "epoch": 4.276258992805755, "grad_norm": 0.08472709552697828, "learning_rate": 4.9997447917498276e-06, "loss": 0.3394, "step": 2972 }, { "epoch": 4.277697841726619, "grad_norm": 0.08659514612357604, "learning_rate": 4.9803076360239335e-06, "loss": 0.3386, "step": 2973 }, { "epoch": 4.279136690647482, "grad_norm": 0.08682369026157778, "learning_rate": 4.960905827600266e-06, "loss": 0.3478, "step": 2974 }, { "epoch": 4.280575539568345, "grad_norm": 0.08332643444853381, "learning_rate": 4.941539386062113e-06, "loss": 0.3358, "step": 2975 }, { "epoch": 4.282014388489209, "grad_norm": 0.08330925158192157, "learning_rate": 4.922208330957094e-06, "loss": 0.3414, "step": 2976 }, { "epoch": 4.283453237410072, "grad_norm": 0.09450486650057416, "learning_rate": 4.902912681797114e-06, "loss": 0.3451, "step": 2977 }, { "epoch": 4.284892086330935, "grad_norm": 0.08335715797784177, "learning_rate": 4.88365245805833e-06, "loss": 0.3445, "step": 2978 }, { "epoch": 4.286330935251798, "grad_norm": 0.08214126141522422, "learning_rate": 4.864427679181143e-06, "loss": 0.3459, "step": 2979 }, { "epoch": 4.287769784172662, "grad_norm": 0.08242959504595915, "learning_rate": 4.8452383645701815e-06, "loss": 0.3442, "step": 2980 }, { "epoch": 4.2892086330935255, "grad_norm": 0.08389674770581983, "learning_rate": 4.826084533594277e-06, "loss": 0.346, "step": 2981 }, { "epoch": 4.290647482014388, "grad_norm": 0.09101394194088418, "learning_rate": 4.806966205586441e-06, "loss": 0.3565, "step": 2982 }, { "epoch": 4.292086330935252, "grad_norm": 0.0833235081630495, "learning_rate": 4.787883399843871e-06, "loss": 0.346, "step": 2983 }, { "epoch": 4.293525179856115, "grad_norm": 0.08064060682534645, "learning_rate": 4.768836135627859e-06, "loss": 0.3468, "step": 2984 }, { "epoch": 4.294964028776978, "grad_norm": 0.07966757656484869, "learning_rate": 4.749824432163888e-06, "loss": 0.3482, "step": 2985 }, { "epoch": 4.296402877697842, "grad_norm": 0.08182163847871617, "learning_rate": 4.730848308641509e-06, "loss": 0.337, "step": 2986 }, { "epoch": 4.297841726618705, "grad_norm": 0.08060734717765031, "learning_rate": 4.711907784214358e-06, "loss": 0.3477, "step": 2987 }, { "epoch": 4.299280575539568, "grad_norm": 0.07935103021292428, "learning_rate": 4.693002878000146e-06, "loss": 0.3371, "step": 2988 }, { "epoch": 4.300719424460432, "grad_norm": 0.0858043873763291, "learning_rate": 4.674133609080658e-06, "loss": 0.3458, "step": 2989 }, { "epoch": 4.302158273381295, "grad_norm": 0.08259003310366053, "learning_rate": 4.6552999965016634e-06, "loss": 0.3442, "step": 2990 }, { "epoch": 4.3035971223021585, "grad_norm": 0.08425042343414377, "learning_rate": 4.6365020592729694e-06, "loss": 0.354, "step": 2991 }, { "epoch": 4.305035971223021, "grad_norm": 0.0776034386383685, "learning_rate": 4.617739816368367e-06, "loss": 0.3486, "step": 2992 }, { "epoch": 4.306474820143885, "grad_norm": 0.08175188536756343, "learning_rate": 4.599013286725624e-06, "loss": 0.344, "step": 2993 }, { "epoch": 4.307913669064749, "grad_norm": 0.0859993200464421, "learning_rate": 4.580322489246456e-06, "loss": 0.351, "step": 2994 }, { "epoch": 4.309352517985611, "grad_norm": 0.08306294500700052, "learning_rate": 4.5616674427965135e-06, "loss": 0.3469, "step": 2995 }, { "epoch": 4.310791366906475, "grad_norm": 0.08738722292533195, "learning_rate": 4.543048166205357e-06, "loss": 0.3454, "step": 2996 }, { "epoch": 4.312230215827338, "grad_norm": 0.09862492416042193, "learning_rate": 4.524464678266452e-06, "loss": 0.339, "step": 2997 }, { "epoch": 4.313669064748201, "grad_norm": 0.07988019638011482, "learning_rate": 4.505916997737143e-06, "loss": 0.3451, "step": 2998 }, { "epoch": 4.315107913669065, "grad_norm": 0.39842864982525744, "learning_rate": 4.487405143338599e-06, "loss": 0.3492, "step": 2999 }, { "epoch": 4.316546762589928, "grad_norm": 0.09590906047464921, "learning_rate": 4.468929133755881e-06, "loss": 0.3412, "step": 3000 }, { "epoch": 4.3179856115107915, "grad_norm": 0.08344721370744118, "learning_rate": 4.450488987637824e-06, "loss": 0.3529, "step": 3001 }, { "epoch": 4.319424460431654, "grad_norm": 0.08400604144374915, "learning_rate": 4.43208472359709e-06, "loss": 0.3411, "step": 3002 }, { "epoch": 4.320863309352518, "grad_norm": 0.08687667408588712, "learning_rate": 4.4137163602101114e-06, "loss": 0.3423, "step": 3003 }, { "epoch": 4.322302158273382, "grad_norm": 0.0936330058653016, "learning_rate": 4.3953839160170906e-06, "loss": 0.3397, "step": 3004 }, { "epoch": 4.323741007194244, "grad_norm": 0.08504338924868987, "learning_rate": 4.377087409521972e-06, "loss": 0.3473, "step": 3005 }, { "epoch": 4.325179856115108, "grad_norm": 0.08688979011045339, "learning_rate": 4.358826859192422e-06, "loss": 0.3449, "step": 3006 }, { "epoch": 4.326618705035971, "grad_norm": 0.08143198876697011, "learning_rate": 4.340602283459827e-06, "loss": 0.3433, "step": 3007 }, { "epoch": 4.3280575539568344, "grad_norm": 0.08829899153966095, "learning_rate": 4.322413700719246e-06, "loss": 0.34, "step": 3008 }, { "epoch": 4.329496402877698, "grad_norm": 0.08095509235938958, "learning_rate": 4.3042611293294276e-06, "loss": 0.3403, "step": 3009 }, { "epoch": 4.330935251798561, "grad_norm": 0.08489746893790455, "learning_rate": 4.28614458761274e-06, "loss": 0.346, "step": 3010 }, { "epoch": 4.3323741007194245, "grad_norm": 0.08758068155942195, "learning_rate": 4.2680640938552245e-06, "loss": 0.3519, "step": 3011 }, { "epoch": 4.333812949640288, "grad_norm": 0.07758055235191116, "learning_rate": 4.250019666306515e-06, "loss": 0.3374, "step": 3012 }, { "epoch": 4.335251798561151, "grad_norm": 0.08417028239751985, "learning_rate": 4.232011323179839e-06, "loss": 0.3434, "step": 3013 }, { "epoch": 4.336690647482015, "grad_norm": 0.08546010877698863, "learning_rate": 4.214039082652002e-06, "loss": 0.3504, "step": 3014 }, { "epoch": 4.338129496402877, "grad_norm": 0.08671087781325834, "learning_rate": 4.1961029628634e-06, "loss": 0.3502, "step": 3015 }, { "epoch": 4.339568345323741, "grad_norm": 0.08271817851173717, "learning_rate": 4.17820298191792e-06, "loss": 0.3431, "step": 3016 }, { "epoch": 4.341007194244605, "grad_norm": 0.07408603517895467, "learning_rate": 4.160339157883e-06, "loss": 0.3379, "step": 3017 }, { "epoch": 4.3424460431654675, "grad_norm": 0.07970953911087428, "learning_rate": 4.142511508789606e-06, "loss": 0.3442, "step": 3018 }, { "epoch": 4.343884892086331, "grad_norm": 0.07987015124686353, "learning_rate": 4.1247200526321364e-06, "loss": 0.3473, "step": 3019 }, { "epoch": 4.345323741007194, "grad_norm": 0.08194500782890717, "learning_rate": 4.106964807368496e-06, "loss": 0.3364, "step": 3020 }, { "epoch": 4.3467625899280575, "grad_norm": 0.08249522124831325, "learning_rate": 4.089245790920031e-06, "loss": 0.354, "step": 3021 }, { "epoch": 4.348201438848921, "grad_norm": 0.08249382730822125, "learning_rate": 4.071563021171523e-06, "loss": 0.3328, "step": 3022 }, { "epoch": 4.349640287769784, "grad_norm": 0.07931275073284706, "learning_rate": 4.0539165159711615e-06, "loss": 0.3464, "step": 3023 }, { "epoch": 4.351079136690648, "grad_norm": 0.08131517415986211, "learning_rate": 4.036306293130543e-06, "loss": 0.345, "step": 3024 }, { "epoch": 4.35251798561151, "grad_norm": 0.07903812308544889, "learning_rate": 4.01873237042461e-06, "loss": 0.3397, "step": 3025 }, { "epoch": 4.353956834532374, "grad_norm": 0.07840845669837926, "learning_rate": 4.001194765591723e-06, "loss": 0.3396, "step": 3026 }, { "epoch": 4.355395683453238, "grad_norm": 0.0830296898055647, "learning_rate": 3.983693496333522e-06, "loss": 0.3519, "step": 3027 }, { "epoch": 4.3568345323741005, "grad_norm": 0.07885795958132294, "learning_rate": 3.966228580315017e-06, "loss": 0.3445, "step": 3028 }, { "epoch": 4.358273381294964, "grad_norm": 0.08003001337742997, "learning_rate": 3.9488000351645036e-06, "loss": 0.3417, "step": 3029 }, { "epoch": 4.359712230215827, "grad_norm": 0.08063767668536838, "learning_rate": 3.931407878473575e-06, "loss": 0.3324, "step": 3030 }, { "epoch": 4.3611510791366905, "grad_norm": 0.07926094066180016, "learning_rate": 3.914052127797088e-06, "loss": 0.3459, "step": 3031 }, { "epoch": 4.362589928057554, "grad_norm": 0.07902322917345408, "learning_rate": 3.8967328006531605e-06, "loss": 0.342, "step": 3032 }, { "epoch": 4.364028776978417, "grad_norm": 0.08061229625077315, "learning_rate": 3.879449914523137e-06, "loss": 0.339, "step": 3033 }, { "epoch": 4.365467625899281, "grad_norm": 0.08700057193394455, "learning_rate": 3.862203486851588e-06, "loss": 0.3408, "step": 3034 }, { "epoch": 4.366906474820144, "grad_norm": 0.07834873633345833, "learning_rate": 3.844993535046291e-06, "loss": 0.3486, "step": 3035 }, { "epoch": 4.368345323741007, "grad_norm": 0.07885514080870147, "learning_rate": 3.8278200764781725e-06, "loss": 0.3461, "step": 3036 }, { "epoch": 4.369784172661871, "grad_norm": 0.07559422550427315, "learning_rate": 3.8106831284813718e-06, "loss": 0.3372, "step": 3037 }, { "epoch": 4.3712230215827335, "grad_norm": 0.0870947644704936, "learning_rate": 3.7935827083531585e-06, "loss": 0.3332, "step": 3038 }, { "epoch": 4.372661870503597, "grad_norm": 0.08258021084009133, "learning_rate": 3.7765188333539037e-06, "loss": 0.3389, "step": 3039 }, { "epoch": 4.374100719424461, "grad_norm": 0.07932424220275947, "learning_rate": 3.759491520707119e-06, "loss": 0.3467, "step": 3040 }, { "epoch": 4.3755395683453235, "grad_norm": 0.0784928567073004, "learning_rate": 3.74250078759943e-06, "loss": 0.3442, "step": 3041 }, { "epoch": 4.376978417266187, "grad_norm": 0.07612824084672545, "learning_rate": 3.7255466511805007e-06, "loss": 0.3378, "step": 3042 }, { "epoch": 4.37841726618705, "grad_norm": 0.07607197089248201, "learning_rate": 3.7086291285630683e-06, "loss": 0.3394, "step": 3043 }, { "epoch": 4.379856115107914, "grad_norm": 0.08569927997685474, "learning_rate": 3.6917482368229406e-06, "loss": 0.3445, "step": 3044 }, { "epoch": 4.381294964028777, "grad_norm": 0.08379836100508248, "learning_rate": 3.674903992998915e-06, "loss": 0.3549, "step": 3045 }, { "epoch": 4.38273381294964, "grad_norm": 0.08145298713314272, "learning_rate": 3.6580964140928133e-06, "loss": 0.3391, "step": 3046 }, { "epoch": 4.384172661870504, "grad_norm": 0.07745765869883456, "learning_rate": 3.6413255170694515e-06, "loss": 0.3468, "step": 3047 }, { "epoch": 4.385611510791367, "grad_norm": 0.0753590285626336, "learning_rate": 3.6245913188566227e-06, "loss": 0.3466, "step": 3048 }, { "epoch": 4.38705035971223, "grad_norm": 0.08003272434455065, "learning_rate": 3.607893836345069e-06, "loss": 0.3397, "step": 3049 }, { "epoch": 4.388489208633094, "grad_norm": 0.08303712301744344, "learning_rate": 3.5912330863884904e-06, "loss": 0.3358, "step": 3050 }, { "epoch": 4.3899280575539565, "grad_norm": 0.07970129873064605, "learning_rate": 3.574609085803471e-06, "loss": 0.3373, "step": 3051 }, { "epoch": 4.39136690647482, "grad_norm": 0.08026216266485611, "learning_rate": 3.5580218513695573e-06, "loss": 0.3391, "step": 3052 }, { "epoch": 4.392805755395684, "grad_norm": 0.08038839081273219, "learning_rate": 3.5414713998291483e-06, "loss": 0.3467, "step": 3053 }, { "epoch": 4.394244604316547, "grad_norm": 0.0837473501114694, "learning_rate": 3.524957747887512e-06, "loss": 0.3497, "step": 3054 }, { "epoch": 4.39568345323741, "grad_norm": 0.08117837642280494, "learning_rate": 3.5084809122128125e-06, "loss": 0.344, "step": 3055 }, { "epoch": 4.397122302158273, "grad_norm": 0.08677566517858053, "learning_rate": 3.4920409094360054e-06, "loss": 0.347, "step": 3056 }, { "epoch": 4.398561151079137, "grad_norm": 0.07741552711244623, "learning_rate": 3.475637756150896e-06, "loss": 0.3412, "step": 3057 }, { "epoch": 4.4, "grad_norm": 0.07796545704124043, "learning_rate": 3.4592714689140895e-06, "loss": 0.3459, "step": 3058 }, { "epoch": 4.401438848920863, "grad_norm": 0.08349721210960026, "learning_rate": 3.442942064244981e-06, "loss": 0.3448, "step": 3059 }, { "epoch": 4.402877697841727, "grad_norm": 0.08567701444737574, "learning_rate": 3.426649558625732e-06, "loss": 0.3384, "step": 3060 }, { "epoch": 4.4043165467625895, "grad_norm": 0.08528488543691852, "learning_rate": 3.4103939685012823e-06, "loss": 0.3406, "step": 3061 }, { "epoch": 4.405755395683453, "grad_norm": 0.0812158856847987, "learning_rate": 3.3941753102792617e-06, "loss": 0.3482, "step": 3062 }, { "epoch": 4.407194244604317, "grad_norm": 0.0763936300963798, "learning_rate": 3.377993600330083e-06, "loss": 0.342, "step": 3063 }, { "epoch": 4.40863309352518, "grad_norm": 0.07925828017399017, "learning_rate": 3.361848854986831e-06, "loss": 0.3398, "step": 3064 }, { "epoch": 4.410071942446043, "grad_norm": 0.0808463931849389, "learning_rate": 3.3457410905452624e-06, "loss": 0.349, "step": 3065 }, { "epoch": 4.411510791366906, "grad_norm": 0.07916967431613016, "learning_rate": 3.3296703232638606e-06, "loss": 0.3402, "step": 3066 }, { "epoch": 4.41294964028777, "grad_norm": 0.08007145961041624, "learning_rate": 3.3136365693637294e-06, "loss": 0.3496, "step": 3067 }, { "epoch": 4.414388489208633, "grad_norm": 0.07548848994529554, "learning_rate": 3.297639845028604e-06, "loss": 0.3462, "step": 3068 }, { "epoch": 4.415827338129496, "grad_norm": 0.07812403206266526, "learning_rate": 3.281680166404857e-06, "loss": 0.3497, "step": 3069 }, { "epoch": 4.41726618705036, "grad_norm": 0.07731420504538403, "learning_rate": 3.265757549601496e-06, "loss": 0.3546, "step": 3070 }, { "epoch": 4.418705035971223, "grad_norm": 0.07795361207341998, "learning_rate": 3.249872010690074e-06, "loss": 0.3426, "step": 3071 }, { "epoch": 4.420143884892086, "grad_norm": 0.07931639958151648, "learning_rate": 3.234023565704738e-06, "loss": 0.347, "step": 3072 }, { "epoch": 4.42158273381295, "grad_norm": 0.07836346779831599, "learning_rate": 3.2182122306422035e-06, "loss": 0.353, "step": 3073 }, { "epoch": 4.423021582733813, "grad_norm": 0.08262669100761946, "learning_rate": 3.2024380214617136e-06, "loss": 0.342, "step": 3074 }, { "epoch": 4.424460431654676, "grad_norm": 0.07827073218996666, "learning_rate": 3.186700954085056e-06, "loss": 0.3428, "step": 3075 }, { "epoch": 4.42589928057554, "grad_norm": 0.07531361706208638, "learning_rate": 3.1710010443965065e-06, "loss": 0.3419, "step": 3076 }, { "epoch": 4.427338129496403, "grad_norm": 0.08151124013280812, "learning_rate": 3.1553383082428568e-06, "loss": 0.3401, "step": 3077 }, { "epoch": 4.428776978417266, "grad_norm": 0.07240958763049264, "learning_rate": 3.139712761433367e-06, "loss": 0.3452, "step": 3078 }, { "epoch": 4.430215827338129, "grad_norm": 1.719996588847683, "learning_rate": 3.1241244197397626e-06, "loss": 0.369, "step": 3079 }, { "epoch": 4.431654676258993, "grad_norm": 0.07852556430797913, "learning_rate": 3.1085732988962003e-06, "loss": 0.3488, "step": 3080 }, { "epoch": 4.433093525179856, "grad_norm": 0.077465495792956, "learning_rate": 3.0930594145993063e-06, "loss": 0.3451, "step": 3081 }, { "epoch": 4.434532374100719, "grad_norm": 0.07838966292674063, "learning_rate": 3.077582782508075e-06, "loss": 0.3499, "step": 3082 }, { "epoch": 4.435971223021583, "grad_norm": 0.07511031589145918, "learning_rate": 3.0621434182439345e-06, "loss": 0.3461, "step": 3083 }, { "epoch": 4.437410071942446, "grad_norm": 0.07451080337790358, "learning_rate": 3.0467413373906773e-06, "loss": 0.3388, "step": 3084 }, { "epoch": 4.438848920863309, "grad_norm": 0.07774781165516859, "learning_rate": 3.0313765554944806e-06, "loss": 0.3393, "step": 3085 }, { "epoch": 4.440287769784173, "grad_norm": 0.07923173316894326, "learning_rate": 3.0160490880638593e-06, "loss": 0.3399, "step": 3086 }, { "epoch": 4.441726618705036, "grad_norm": 0.07684985118117568, "learning_rate": 3.0007589505696645e-06, "loss": 0.3415, "step": 3087 }, { "epoch": 4.443165467625899, "grad_norm": 0.07590897552642471, "learning_rate": 2.9855061584450795e-06, "loss": 0.3457, "step": 3088 }, { "epoch": 4.444604316546762, "grad_norm": 0.07207050368652652, "learning_rate": 2.97029072708559e-06, "loss": 0.3371, "step": 3089 }, { "epoch": 4.446043165467626, "grad_norm": 0.08091106041280295, "learning_rate": 2.955112671848963e-06, "loss": 0.3382, "step": 3090 }, { "epoch": 4.4474820143884894, "grad_norm": 0.07479526898230791, "learning_rate": 2.9399720080552383e-06, "loss": 0.3379, "step": 3091 }, { "epoch": 4.448920863309352, "grad_norm": 0.07771800650294812, "learning_rate": 2.924868750986729e-06, "loss": 0.3473, "step": 3092 }, { "epoch": 4.450359712230216, "grad_norm": 0.07650163090027194, "learning_rate": 2.9098029158879914e-06, "loss": 0.341, "step": 3093 }, { "epoch": 4.4517985611510795, "grad_norm": 0.07242688225084594, "learning_rate": 2.8947745179657815e-06, "loss": 0.3392, "step": 3094 }, { "epoch": 4.453237410071942, "grad_norm": 0.07827079698152796, "learning_rate": 2.8797835723890944e-06, "loss": 0.3479, "step": 3095 }, { "epoch": 4.454676258992806, "grad_norm": 0.08222727514776726, "learning_rate": 2.864830094289137e-06, "loss": 0.3433, "step": 3096 }, { "epoch": 4.456115107913669, "grad_norm": 0.07515734856204817, "learning_rate": 2.84991409875925e-06, "loss": 0.3405, "step": 3097 }, { "epoch": 4.457553956834532, "grad_norm": 0.0804660226850685, "learning_rate": 2.8350356008549806e-06, "loss": 0.3345, "step": 3098 }, { "epoch": 4.458992805755396, "grad_norm": 0.07612451962872864, "learning_rate": 2.8201946155940142e-06, "loss": 0.3448, "step": 3099 }, { "epoch": 4.460431654676259, "grad_norm": 0.0769348032440111, "learning_rate": 2.8053911579561764e-06, "loss": 0.3494, "step": 3100 }, { "epoch": 4.4618705035971225, "grad_norm": 0.08222538513305827, "learning_rate": 2.7906252428834044e-06, "loss": 0.3485, "step": 3101 }, { "epoch": 4.463309352517985, "grad_norm": 0.08580367255632801, "learning_rate": 2.7758968852797542e-06, "loss": 0.3451, "step": 3102 }, { "epoch": 4.464748201438849, "grad_norm": 0.07748436073576172, "learning_rate": 2.761206100011369e-06, "loss": 0.3366, "step": 3103 }, { "epoch": 4.4661870503597125, "grad_norm": 0.07354963767301814, "learning_rate": 2.746552901906463e-06, "loss": 0.3456, "step": 3104 }, { "epoch": 4.467625899280575, "grad_norm": 0.0868966731070377, "learning_rate": 2.731937305755321e-06, "loss": 0.3377, "step": 3105 }, { "epoch": 4.469064748201439, "grad_norm": 0.07788029701225116, "learning_rate": 2.717359326310249e-06, "loss": 0.3415, "step": 3106 }, { "epoch": 4.470503597122303, "grad_norm": 0.08433311420417798, "learning_rate": 2.702818978285633e-06, "loss": 0.35, "step": 3107 }, { "epoch": 4.471942446043165, "grad_norm": 0.07443447534638524, "learning_rate": 2.688316276357825e-06, "loss": 0.3479, "step": 3108 }, { "epoch": 4.473381294964029, "grad_norm": 0.08059610410020289, "learning_rate": 2.6738512351652012e-06, "loss": 0.34, "step": 3109 }, { "epoch": 4.474820143884892, "grad_norm": 0.08193043694761587, "learning_rate": 2.65942386930814e-06, "loss": 0.3406, "step": 3110 }, { "epoch": 4.4762589928057555, "grad_norm": 0.08286360992951043, "learning_rate": 2.645034193348961e-06, "loss": 0.3405, "step": 3111 }, { "epoch": 4.477697841726619, "grad_norm": 0.07424441864030382, "learning_rate": 2.6306822218119533e-06, "loss": 0.3431, "step": 3112 }, { "epoch": 4.479136690647482, "grad_norm": 0.07416878253769449, "learning_rate": 2.61636796918336e-06, "loss": 0.3339, "step": 3113 }, { "epoch": 4.4805755395683455, "grad_norm": 0.07750644969509364, "learning_rate": 2.6020914499113438e-06, "loss": 0.3406, "step": 3114 }, { "epoch": 4.482014388489208, "grad_norm": 0.07582412223932053, "learning_rate": 2.587852678405973e-06, "loss": 0.3448, "step": 3115 }, { "epoch": 4.483453237410072, "grad_norm": 0.07739574975156975, "learning_rate": 2.5736516690392366e-06, "loss": 0.3453, "step": 3116 }, { "epoch": 4.484892086330936, "grad_norm": 0.07794678185547059, "learning_rate": 2.5594884361449746e-06, "loss": 0.3467, "step": 3117 }, { "epoch": 4.486330935251798, "grad_norm": 0.07718586965445183, "learning_rate": 2.5453629940189338e-06, "loss": 0.3433, "step": 3118 }, { "epoch": 4.487769784172662, "grad_norm": 0.07665613157929926, "learning_rate": 2.531275356918701e-06, "loss": 0.3421, "step": 3119 }, { "epoch": 4.489208633093525, "grad_norm": 0.07599974393918184, "learning_rate": 2.5172255390636878e-06, "loss": 0.3463, "step": 3120 }, { "epoch": 4.4906474820143885, "grad_norm": 0.07487563325987932, "learning_rate": 2.5032135546351644e-06, "loss": 0.3385, "step": 3121 }, { "epoch": 4.492086330935252, "grad_norm": 0.07483111999350718, "learning_rate": 2.4892394177761947e-06, "loss": 0.3364, "step": 3122 }, { "epoch": 4.493525179856115, "grad_norm": 0.07789042210389767, "learning_rate": 2.475303142591634e-06, "loss": 0.3547, "step": 3123 }, { "epoch": 4.4949640287769785, "grad_norm": 0.07604161844796199, "learning_rate": 2.461404743148141e-06, "loss": 0.341, "step": 3124 }, { "epoch": 4.496402877697841, "grad_norm": 0.07830740061499633, "learning_rate": 2.4475442334741306e-06, "loss": 0.3412, "step": 3125 }, { "epoch": 4.497841726618705, "grad_norm": 0.07520493007992193, "learning_rate": 2.43372162755978e-06, "loss": 0.347, "step": 3126 }, { "epoch": 4.499280575539569, "grad_norm": 0.07341117177491838, "learning_rate": 2.419936939357004e-06, "loss": 0.3445, "step": 3127 }, { "epoch": 4.500719424460431, "grad_norm": 0.07534906304450155, "learning_rate": 2.4061901827794466e-06, "loss": 0.3414, "step": 3128 }, { "epoch": 4.502158273381295, "grad_norm": 0.07719835961500789, "learning_rate": 2.3924813717024663e-06, "loss": 0.3431, "step": 3129 }, { "epoch": 4.503597122302159, "grad_norm": 0.07607486825470737, "learning_rate": 2.378810519963124e-06, "loss": 0.3566, "step": 3130 }, { "epoch": 4.5050359712230215, "grad_norm": 0.07706948930854154, "learning_rate": 2.3651776413601634e-06, "loss": 0.3472, "step": 3131 }, { "epoch": 4.506474820143885, "grad_norm": 0.07022086982451177, "learning_rate": 2.3515827496539823e-06, "loss": 0.3338, "step": 3132 }, { "epoch": 4.507913669064748, "grad_norm": 0.07459929303125831, "learning_rate": 2.3380258585666793e-06, "loss": 0.3506, "step": 3133 }, { "epoch": 4.5093525179856115, "grad_norm": 0.07260413687930173, "learning_rate": 2.324506981781949e-06, "loss": 0.3476, "step": 3134 }, { "epoch": 4.510791366906475, "grad_norm": 0.07566417402073569, "learning_rate": 2.311026132945138e-06, "loss": 0.3458, "step": 3135 }, { "epoch": 4.512230215827338, "grad_norm": 0.08908274283781233, "learning_rate": 2.297583325663233e-06, "loss": 0.3413, "step": 3136 }, { "epoch": 4.513669064748202, "grad_norm": 0.07587416190007623, "learning_rate": 2.2841785735047717e-06, "loss": 0.3432, "step": 3137 }, { "epoch": 4.515107913669064, "grad_norm": 0.07235969865275474, "learning_rate": 2.2708118899999175e-06, "loss": 0.3403, "step": 3138 }, { "epoch": 4.516546762589928, "grad_norm": 0.07571752220304737, "learning_rate": 2.2574832886403988e-06, "loss": 0.3382, "step": 3139 }, { "epoch": 4.517985611510792, "grad_norm": 0.2794684253861891, "learning_rate": 2.2441927828795106e-06, "loss": 0.3579, "step": 3140 }, { "epoch": 4.5194244604316545, "grad_norm": 0.08188168452046989, "learning_rate": 2.230940386132088e-06, "loss": 0.3467, "step": 3141 }, { "epoch": 4.520863309352518, "grad_norm": 0.07617472768450836, "learning_rate": 2.21772611177451e-06, "loss": 0.337, "step": 3142 }, { "epoch": 4.522302158273382, "grad_norm": 0.07322485314476528, "learning_rate": 2.204549973144654e-06, "loss": 0.3377, "step": 3143 }, { "epoch": 4.5237410071942445, "grad_norm": 0.07232542143510301, "learning_rate": 2.1914119835419358e-06, "loss": 0.3335, "step": 3144 }, { "epoch": 4.525179856115108, "grad_norm": 0.07279597533953248, "learning_rate": 2.178312156227258e-06, "loss": 0.3464, "step": 3145 }, { "epoch": 4.526618705035971, "grad_norm": 0.07658839683786779, "learning_rate": 2.1652505044229734e-06, "loss": 0.3374, "step": 3146 }, { "epoch": 4.528057553956835, "grad_norm": 0.07706757501700515, "learning_rate": 2.1522270413129444e-06, "loss": 0.3329, "step": 3147 }, { "epoch": 4.529496402877697, "grad_norm": 0.07503480245602073, "learning_rate": 2.1392417800424738e-06, "loss": 0.3372, "step": 3148 }, { "epoch": 4.530935251798561, "grad_norm": 0.07445954513335917, "learning_rate": 2.1262947337182815e-06, "loss": 0.3449, "step": 3149 }, { "epoch": 4.532374100719425, "grad_norm": 0.0762378365425642, "learning_rate": 2.113385915408546e-06, "loss": 0.3463, "step": 3150 }, { "epoch": 4.5338129496402875, "grad_norm": 0.07408879638626946, "learning_rate": 2.100515338142839e-06, "loss": 0.3396, "step": 3151 }, { "epoch": 4.535251798561151, "grad_norm": 0.07776095998479204, "learning_rate": 2.087683014912152e-06, "loss": 0.3448, "step": 3152 }, { "epoch": 4.536690647482015, "grad_norm": 0.07329928007429921, "learning_rate": 2.0748889586688526e-06, "loss": 0.3456, "step": 3153 }, { "epoch": 4.5381294964028775, "grad_norm": 0.0716971153654081, "learning_rate": 2.0621331823266777e-06, "loss": 0.3434, "step": 3154 }, { "epoch": 4.539568345323741, "grad_norm": 0.07432013950757492, "learning_rate": 2.049415698760746e-06, "loss": 0.3365, "step": 3155 }, { "epoch": 4.541007194244604, "grad_norm": 0.07400174544490337, "learning_rate": 2.036736520807505e-06, "loss": 0.3405, "step": 3156 }, { "epoch": 4.542446043165468, "grad_norm": 0.07618606779242666, "learning_rate": 2.0240956612647487e-06, "loss": 0.3431, "step": 3157 }, { "epoch": 4.543884892086331, "grad_norm": 0.08163048401173585, "learning_rate": 2.011493132891591e-06, "loss": 0.3449, "step": 3158 }, { "epoch": 4.545323741007194, "grad_norm": 0.07680418104811734, "learning_rate": 1.998928948408465e-06, "loss": 0.3411, "step": 3159 }, { "epoch": 4.546762589928058, "grad_norm": 0.07883169821475698, "learning_rate": 1.9864031204970847e-06, "loss": 0.3514, "step": 3160 }, { "epoch": 4.5482014388489205, "grad_norm": 0.07376418720795792, "learning_rate": 1.973915661800452e-06, "loss": 0.3449, "step": 3161 }, { "epoch": 4.549640287769784, "grad_norm": 0.07628001522921406, "learning_rate": 1.9614665849228666e-06, "loss": 0.3412, "step": 3162 }, { "epoch": 4.551079136690648, "grad_norm": 0.07268400084313503, "learning_rate": 1.949055902429846e-06, "loss": 0.3429, "step": 3163 }, { "epoch": 4.5525179856115106, "grad_norm": 0.07535215186043386, "learning_rate": 1.936683626848179e-06, "loss": 0.3418, "step": 3164 }, { "epoch": 4.553956834532374, "grad_norm": 0.08101213007294282, "learning_rate": 1.9243497706658944e-06, "loss": 0.3359, "step": 3165 }, { "epoch": 4.555395683453238, "grad_norm": 0.07516027494800719, "learning_rate": 1.9120543463322238e-06, "loss": 0.3362, "step": 3166 }, { "epoch": 4.556834532374101, "grad_norm": 0.07281658610061269, "learning_rate": 1.899797366257614e-06, "loss": 0.3506, "step": 3167 }, { "epoch": 4.558273381294964, "grad_norm": 0.0749458781232783, "learning_rate": 1.887578842813711e-06, "loss": 0.3408, "step": 3168 }, { "epoch": 4.559712230215827, "grad_norm": 0.07758718799261118, "learning_rate": 1.875398788333347e-06, "loss": 0.3445, "step": 3169 }, { "epoch": 4.561151079136691, "grad_norm": 0.07215043406316093, "learning_rate": 1.8632572151105189e-06, "loss": 0.3311, "step": 3170 }, { "epoch": 4.5625899280575535, "grad_norm": 0.07224962203751209, "learning_rate": 1.8511541354003882e-06, "loss": 0.3345, "step": 3171 }, { "epoch": 4.564028776978417, "grad_norm": 0.07432208385081143, "learning_rate": 1.8390895614192405e-06, "loss": 0.3437, "step": 3172 }, { "epoch": 4.565467625899281, "grad_norm": 0.07270811076997219, "learning_rate": 1.8270635053445352e-06, "loss": 0.345, "step": 3173 }, { "epoch": 4.566906474820144, "grad_norm": 0.07275907743997405, "learning_rate": 1.8150759793148332e-06, "loss": 0.3376, "step": 3174 }, { "epoch": 4.568345323741007, "grad_norm": 0.07308875094806778, "learning_rate": 1.803126995429789e-06, "loss": 0.3438, "step": 3175 }, { "epoch": 4.569784172661871, "grad_norm": 0.0737266373370186, "learning_rate": 1.7912165657501779e-06, "loss": 0.346, "step": 3176 }, { "epoch": 4.571223021582734, "grad_norm": 0.0741984037891141, "learning_rate": 1.779344702297845e-06, "loss": 0.3408, "step": 3177 }, { "epoch": 4.572661870503597, "grad_norm": 0.07262141217459457, "learning_rate": 1.767511417055725e-06, "loss": 0.3423, "step": 3178 }, { "epoch": 4.57410071942446, "grad_norm": 0.07357686840899615, "learning_rate": 1.7557167219678018e-06, "loss": 0.3402, "step": 3179 }, { "epoch": 4.575539568345324, "grad_norm": 0.07504841556503612, "learning_rate": 1.7439606289391032e-06, "loss": 0.3416, "step": 3180 }, { "epoch": 4.576978417266187, "grad_norm": 0.07086931022268723, "learning_rate": 1.7322431498357063e-06, "loss": 0.3462, "step": 3181 }, { "epoch": 4.57841726618705, "grad_norm": 0.07113355151124676, "learning_rate": 1.7205642964847103e-06, "loss": 0.3435, "step": 3182 }, { "epoch": 4.579856115107914, "grad_norm": 0.0739077808553021, "learning_rate": 1.7089240806742147e-06, "loss": 0.3555, "step": 3183 }, { "epoch": 4.581294964028777, "grad_norm": 0.07502195655337876, "learning_rate": 1.697322514153341e-06, "loss": 0.3361, "step": 3184 }, { "epoch": 4.58273381294964, "grad_norm": 0.07176624941980808, "learning_rate": 1.6857596086321848e-06, "loss": 0.3377, "step": 3185 }, { "epoch": 4.584172661870504, "grad_norm": 0.0747136618152716, "learning_rate": 1.6742353757818187e-06, "loss": 0.3387, "step": 3186 }, { "epoch": 4.585611510791367, "grad_norm": 0.07512691471483383, "learning_rate": 1.6627498272342802e-06, "loss": 0.3419, "step": 3187 }, { "epoch": 4.58705035971223, "grad_norm": 0.07190887116303142, "learning_rate": 1.6513029745825803e-06, "loss": 0.3382, "step": 3188 }, { "epoch": 4.588489208633094, "grad_norm": 0.0692314368608551, "learning_rate": 1.6398948293806504e-06, "loss": 0.3397, "step": 3189 }, { "epoch": 4.589928057553957, "grad_norm": 0.07877068717631153, "learning_rate": 1.6285254031433462e-06, "loss": 0.3426, "step": 3190 }, { "epoch": 4.59136690647482, "grad_norm": 0.07065417528656995, "learning_rate": 1.6171947073464834e-06, "loss": 0.3461, "step": 3191 }, { "epoch": 4.592805755395683, "grad_norm": 0.07350813952071268, "learning_rate": 1.6059027534267313e-06, "loss": 0.348, "step": 3192 }, { "epoch": 4.594244604316547, "grad_norm": 0.07717010230209637, "learning_rate": 1.594649552781693e-06, "loss": 0.3494, "step": 3193 }, { "epoch": 4.5956834532374105, "grad_norm": 0.07926012384009389, "learning_rate": 1.5834351167698336e-06, "loss": 0.3477, "step": 3194 }, { "epoch": 4.597122302158273, "grad_norm": 0.07283523499323487, "learning_rate": 1.572259456710512e-06, "loss": 0.3403, "step": 3195 }, { "epoch": 4.598561151079137, "grad_norm": 0.07370832275629358, "learning_rate": 1.5611225838839272e-06, "loss": 0.3447, "step": 3196 }, { "epoch": 4.6, "grad_norm": 0.07672318443808299, "learning_rate": 1.550024509531145e-06, "loss": 0.3416, "step": 3197 }, { "epoch": 4.601438848920863, "grad_norm": 0.0784881040441822, "learning_rate": 1.5389652448540537e-06, "loss": 0.3463, "step": 3198 }, { "epoch": 4.602877697841727, "grad_norm": 0.07560071813185676, "learning_rate": 1.527944801015382e-06, "loss": 0.3484, "step": 3199 }, { "epoch": 4.60431654676259, "grad_norm": 0.07470938049897778, "learning_rate": 1.5169631891386805e-06, "loss": 0.3421, "step": 3200 }, { "epoch": 4.605755395683453, "grad_norm": 0.07310946501465287, "learning_rate": 1.506020420308274e-06, "loss": 0.3449, "step": 3201 }, { "epoch": 4.607194244604317, "grad_norm": 0.07419465325977942, "learning_rate": 1.495116505569314e-06, "loss": 0.34, "step": 3202 }, { "epoch": 4.60863309352518, "grad_norm": 0.07401016996922452, "learning_rate": 1.4842514559277254e-06, "loss": 0.3319, "step": 3203 }, { "epoch": 4.6100719424460435, "grad_norm": 0.07823238049951006, "learning_rate": 1.4734252823501894e-06, "loss": 0.3431, "step": 3204 }, { "epoch": 4.611510791366906, "grad_norm": 0.07590369101605819, "learning_rate": 1.4626379957641646e-06, "loss": 0.3414, "step": 3205 }, { "epoch": 4.61294964028777, "grad_norm": 0.07282825632276907, "learning_rate": 1.451889607057848e-06, "loss": 0.341, "step": 3206 }, { "epoch": 4.614388489208633, "grad_norm": 0.0758640363462003, "learning_rate": 1.4411801270801885e-06, "loss": 0.3458, "step": 3207 }, { "epoch": 4.615827338129496, "grad_norm": 0.07657141865565384, "learning_rate": 1.4305095666408453e-06, "loss": 0.3351, "step": 3208 }, { "epoch": 4.61726618705036, "grad_norm": 0.07517839689819975, "learning_rate": 1.4198779365102077e-06, "loss": 0.3478, "step": 3209 }, { "epoch": 4.618705035971223, "grad_norm": 0.07344136344657473, "learning_rate": 1.409285247419363e-06, "loss": 0.3438, "step": 3210 }, { "epoch": 4.620143884892086, "grad_norm": 0.0732225156045941, "learning_rate": 1.3987315100600961e-06, "loss": 0.3462, "step": 3211 }, { "epoch": 4.62158273381295, "grad_norm": 0.07564182080034411, "learning_rate": 1.3882167350848686e-06, "loss": 0.3422, "step": 3212 }, { "epoch": 4.623021582733813, "grad_norm": 0.07356043598482831, "learning_rate": 1.3777409331068258e-06, "loss": 0.3463, "step": 3213 }, { "epoch": 4.6244604316546765, "grad_norm": 0.07247769492428899, "learning_rate": 1.3673041146997768e-06, "loss": 0.3425, "step": 3214 }, { "epoch": 4.625899280575539, "grad_norm": 0.0747090378206456, "learning_rate": 1.35690629039817e-06, "loss": 0.3555, "step": 3215 }, { "epoch": 4.627338129496403, "grad_norm": 0.07456576634752349, "learning_rate": 1.346547470697095e-06, "loss": 0.3447, "step": 3216 }, { "epoch": 4.6287769784172665, "grad_norm": 0.07166479986017334, "learning_rate": 1.3362276660522943e-06, "loss": 0.3423, "step": 3217 }, { "epoch": 4.630215827338129, "grad_norm": 0.07296834092957366, "learning_rate": 1.325946886880103e-06, "loss": 0.3335, "step": 3218 }, { "epoch": 4.631654676258993, "grad_norm": 0.07689047760758543, "learning_rate": 1.315705143557482e-06, "loss": 0.3356, "step": 3219 }, { "epoch": 4.633093525179856, "grad_norm": 0.07306585852255368, "learning_rate": 1.3055024464219846e-06, "loss": 0.3409, "step": 3220 }, { "epoch": 4.634532374100719, "grad_norm": 0.07244722903402687, "learning_rate": 1.295338805771751e-06, "loss": 0.3395, "step": 3221 }, { "epoch": 4.635971223021583, "grad_norm": 0.07044884383448612, "learning_rate": 1.285214231865508e-06, "loss": 0.3345, "step": 3222 }, { "epoch": 4.637410071942446, "grad_norm": 0.07340245842043562, "learning_rate": 1.2751287349225484e-06, "loss": 0.3507, "step": 3223 }, { "epoch": 4.6388489208633095, "grad_norm": 0.078882411830014, "learning_rate": 1.2650823251227062e-06, "loss": 0.3507, "step": 3224 }, { "epoch": 4.640287769784173, "grad_norm": 0.07770979722776154, "learning_rate": 1.255075012606386e-06, "loss": 0.3442, "step": 3225 }, { "epoch": 4.641726618705036, "grad_norm": 0.07589427534198123, "learning_rate": 1.2451068074745254e-06, "loss": 0.3472, "step": 3226 }, { "epoch": 4.6431654676258995, "grad_norm": 0.07696166151922781, "learning_rate": 1.2351777197885606e-06, "loss": 0.349, "step": 3227 }, { "epoch": 4.644604316546762, "grad_norm": 0.07219860964692072, "learning_rate": 1.2252877595704838e-06, "loss": 0.3421, "step": 3228 }, { "epoch": 4.646043165467626, "grad_norm": 0.07242363649293503, "learning_rate": 1.2154369368027763e-06, "loss": 0.3407, "step": 3229 }, { "epoch": 4.647482014388489, "grad_norm": 0.17023544957477652, "learning_rate": 1.2056252614284047e-06, "loss": 0.3479, "step": 3230 }, { "epoch": 4.648920863309352, "grad_norm": 0.07162113454038929, "learning_rate": 1.1958527433508381e-06, "loss": 0.3381, "step": 3231 }, { "epoch": 4.650359712230216, "grad_norm": 0.0728067529289856, "learning_rate": 1.1861193924340176e-06, "loss": 0.3432, "step": 3232 }, { "epoch": 4.651798561151079, "grad_norm": 0.07059212351903399, "learning_rate": 1.176425218502346e-06, "loss": 0.3426, "step": 3233 }, { "epoch": 4.6532374100719425, "grad_norm": 0.07235953337046205, "learning_rate": 1.1667702313406903e-06, "loss": 0.3432, "step": 3234 }, { "epoch": 4.654676258992806, "grad_norm": 0.0696798547491888, "learning_rate": 1.1571544406943614e-06, "loss": 0.3386, "step": 3235 }, { "epoch": 4.656115107913669, "grad_norm": 0.07107800574432338, "learning_rate": 1.147577856269102e-06, "loss": 0.3393, "step": 3236 }, { "epoch": 4.6575539568345325, "grad_norm": 0.07468832447428979, "learning_rate": 1.1380404877310957e-06, "loss": 0.3376, "step": 3237 }, { "epoch": 4.658992805755395, "grad_norm": 0.07634705223343236, "learning_rate": 1.1285423447069133e-06, "loss": 0.343, "step": 3238 }, { "epoch": 4.660431654676259, "grad_norm": 0.07352989842580739, "learning_rate": 1.1190834367835701e-06, "loss": 0.3483, "step": 3239 }, { "epoch": 4.661870503597123, "grad_norm": 0.07129723426655685, "learning_rate": 1.1096637735084602e-06, "loss": 0.345, "step": 3240 }, { "epoch": 4.663309352517985, "grad_norm": 0.07292570415856726, "learning_rate": 1.1002833643893606e-06, "loss": 0.3434, "step": 3241 }, { "epoch": 4.664748201438849, "grad_norm": 0.07098015505406921, "learning_rate": 1.0909422188944308e-06, "loss": 0.3455, "step": 3242 }, { "epoch": 4.666187050359712, "grad_norm": 0.07307054276226224, "learning_rate": 1.0816403464522262e-06, "loss": 0.3491, "step": 3243 }, { "epoch": 4.6676258992805755, "grad_norm": 0.07755083661084325, "learning_rate": 1.0723777564516148e-06, "loss": 0.3445, "step": 3244 }, { "epoch": 4.669064748201439, "grad_norm": 0.0718690723346127, "learning_rate": 1.0631544582418463e-06, "loss": 0.3432, "step": 3245 }, { "epoch": 4.670503597122302, "grad_norm": 0.07259792487389534, "learning_rate": 1.0539704611325008e-06, "loss": 0.3474, "step": 3246 }, { "epoch": 4.6719424460431656, "grad_norm": 0.070609052250591, "learning_rate": 1.0448257743934964e-06, "loss": 0.3369, "step": 3247 }, { "epoch": 4.673381294964029, "grad_norm": 0.07081140150355565, "learning_rate": 1.0357204072550676e-06, "loss": 0.3382, "step": 3248 }, { "epoch": 4.674820143884892, "grad_norm": 0.07256789962790246, "learning_rate": 1.0266543689077602e-06, "loss": 0.3412, "step": 3249 }, { "epoch": 4.676258992805756, "grad_norm": 0.07373465051135351, "learning_rate": 1.0176276685024233e-06, "loss": 0.3509, "step": 3250 }, { "epoch": 4.677697841726618, "grad_norm": 0.07107883300929509, "learning_rate": 1.0086403151502088e-06, "loss": 0.351, "step": 3251 }, { "epoch": 4.679136690647482, "grad_norm": 0.07329045248109889, "learning_rate": 9.996923179225448e-07, "loss": 0.3352, "step": 3252 }, { "epoch": 4.680575539568346, "grad_norm": 0.0722249679325728, "learning_rate": 9.90783685851131e-07, "loss": 0.3493, "step": 3253 }, { "epoch": 4.6820143884892085, "grad_norm": 0.07246260082915003, "learning_rate": 9.81914427927948e-07, "loss": 0.3423, "step": 3254 }, { "epoch": 4.683453237410072, "grad_norm": 0.0713126062283184, "learning_rate": 9.730845531052214e-07, "loss": 0.3433, "step": 3255 }, { "epoch": 4.684892086330935, "grad_norm": 0.07355877347085475, "learning_rate": 9.642940702954306e-07, "loss": 0.3408, "step": 3256 }, { "epoch": 4.686330935251799, "grad_norm": 0.07052278467158665, "learning_rate": 9.555429883712963e-07, "loss": 0.3356, "step": 3257 }, { "epoch": 4.687769784172662, "grad_norm": 0.0735207329605086, "learning_rate": 9.468313161657617e-07, "loss": 0.3402, "step": 3258 }, { "epoch": 4.689208633093525, "grad_norm": 0.06859602773813163, "learning_rate": 9.381590624719972e-07, "loss": 0.3428, "step": 3259 }, { "epoch": 4.690647482014389, "grad_norm": 0.0692500493142813, "learning_rate": 9.295262360433921e-07, "loss": 0.3373, "step": 3260 }, { "epoch": 4.692086330935252, "grad_norm": 0.07026107245791961, "learning_rate": 9.209328455935274e-07, "loss": 0.3446, "step": 3261 }, { "epoch": 4.693525179856115, "grad_norm": 0.07121245958557837, "learning_rate": 9.123788997961847e-07, "loss": 0.3498, "step": 3262 }, { "epoch": 4.694964028776979, "grad_norm": 0.07428187585450076, "learning_rate": 9.038644072853331e-07, "loss": 0.3481, "step": 3263 }, { "epoch": 4.6964028776978415, "grad_norm": 0.07209399796559547, "learning_rate": 8.953893766551203e-07, "loss": 0.3438, "step": 3264 }, { "epoch": 4.697841726618705, "grad_norm": 0.07119356925873069, "learning_rate": 8.86953816459859e-07, "loss": 0.3447, "step": 3265 }, { "epoch": 4.699280575539568, "grad_norm": 0.07518530283351295, "learning_rate": 8.785577352140317e-07, "loss": 0.3493, "step": 3266 }, { "epoch": 4.700719424460432, "grad_norm": 0.0746073054119646, "learning_rate": 8.702011413922506e-07, "loss": 0.3531, "step": 3267 }, { "epoch": 4.702158273381295, "grad_norm": 0.07090028442671885, "learning_rate": 8.61884043429293e-07, "loss": 0.3446, "step": 3268 }, { "epoch": 4.703597122302158, "grad_norm": 0.0712249508346909, "learning_rate": 8.536064497200702e-07, "loss": 0.3454, "step": 3269 }, { "epoch": 4.705035971223022, "grad_norm": 0.07349652446599979, "learning_rate": 8.453683686196012e-07, "loss": 0.3562, "step": 3270 }, { "epoch": 4.706474820143885, "grad_norm": 0.07385176044106828, "learning_rate": 8.371698084430346e-07, "loss": 0.3471, "step": 3271 }, { "epoch": 4.707913669064748, "grad_norm": 0.07317079825374315, "learning_rate": 8.290107774656441e-07, "loss": 0.3449, "step": 3272 }, { "epoch": 4.709352517985612, "grad_norm": 0.07568657649877363, "learning_rate": 8.208912839227712e-07, "loss": 0.3467, "step": 3273 }, { "epoch": 4.7107913669064745, "grad_norm": 0.06993763923796827, "learning_rate": 8.128113360098777e-07, "loss": 0.3431, "step": 3274 }, { "epoch": 4.712230215827338, "grad_norm": 0.06839755659991058, "learning_rate": 8.047709418824934e-07, "loss": 0.3421, "step": 3275 }, { "epoch": 4.713669064748202, "grad_norm": 0.07195142770477857, "learning_rate": 7.96770109656233e-07, "loss": 0.3354, "step": 3276 }, { "epoch": 4.715107913669065, "grad_norm": 0.0725067551374577, "learning_rate": 7.88808847406779e-07, "loss": 0.3469, "step": 3277 }, { "epoch": 4.716546762589928, "grad_norm": 0.07219832598972031, "learning_rate": 7.808871631698723e-07, "loss": 0.3534, "step": 3278 }, { "epoch": 4.717985611510791, "grad_norm": 0.0721936519889365, "learning_rate": 7.730050649412946e-07, "loss": 0.3439, "step": 3279 }, { "epoch": 4.719424460431655, "grad_norm": 0.07203963159496164, "learning_rate": 7.651625606768908e-07, "loss": 0.3487, "step": 3280 }, { "epoch": 4.720863309352518, "grad_norm": 0.06976106999488406, "learning_rate": 7.573596582925291e-07, "loss": 0.3392, "step": 3281 }, { "epoch": 4.722302158273381, "grad_norm": 0.0702046647598994, "learning_rate": 7.495963656641048e-07, "loss": 0.3503, "step": 3282 }, { "epoch": 4.723741007194245, "grad_norm": 0.06946908302821556, "learning_rate": 7.418726906275497e-07, "loss": 0.3442, "step": 3283 }, { "epoch": 4.725179856115108, "grad_norm": 0.06859861930076236, "learning_rate": 7.341886409787746e-07, "loss": 0.3383, "step": 3284 }, { "epoch": 4.726618705035971, "grad_norm": 0.07028895278734329, "learning_rate": 7.265442244737264e-07, "loss": 0.3466, "step": 3285 }, { "epoch": 4.728057553956835, "grad_norm": 0.07010202037342096, "learning_rate": 7.189394488283307e-07, "loss": 0.34, "step": 3286 }, { "epoch": 4.729496402877698, "grad_norm": 0.07159689438722366, "learning_rate": 7.113743217185099e-07, "loss": 0.3539, "step": 3287 }, { "epoch": 4.730935251798561, "grad_norm": 0.07020760800692664, "learning_rate": 7.0384885078016e-07, "loss": 0.3365, "step": 3288 }, { "epoch": 4.732374100719424, "grad_norm": 0.0703843916403367, "learning_rate": 6.963630436091518e-07, "loss": 0.3467, "step": 3289 }, { "epoch": 4.733812949640288, "grad_norm": 0.07092803734791567, "learning_rate": 6.889169077613212e-07, "loss": 0.3364, "step": 3290 }, { "epoch": 4.735251798561151, "grad_norm": 0.07035847540003927, "learning_rate": 6.815104507524695e-07, "loss": 0.3396, "step": 3291 }, { "epoch": 4.736690647482014, "grad_norm": 0.06895173835205612, "learning_rate": 6.741436800583367e-07, "loss": 0.3433, "step": 3292 }, { "epoch": 4.738129496402878, "grad_norm": 0.07197375726645833, "learning_rate": 6.668166031146062e-07, "loss": 0.3472, "step": 3293 }, { "epoch": 4.739568345323741, "grad_norm": 0.06847000902765593, "learning_rate": 6.595292273169041e-07, "loss": 0.3392, "step": 3294 }, { "epoch": 4.741007194244604, "grad_norm": 0.0704583019496255, "learning_rate": 6.522815600207866e-07, "loss": 0.3406, "step": 3295 }, { "epoch": 4.742446043165468, "grad_norm": 0.07086617229826032, "learning_rate": 6.450736085417086e-07, "loss": 0.3398, "step": 3296 }, { "epoch": 4.743884892086331, "grad_norm": 0.07247846141002307, "learning_rate": 6.379053801550594e-07, "loss": 0.3474, "step": 3297 }, { "epoch": 4.745323741007194, "grad_norm": 0.07146186302062525, "learning_rate": 6.307768820961269e-07, "loss": 0.3457, "step": 3298 }, { "epoch": 4.746762589928058, "grad_norm": 0.07267016536214223, "learning_rate": 6.236881215600976e-07, "loss": 0.344, "step": 3299 }, { "epoch": 4.748201438848921, "grad_norm": 0.07158713318428622, "learning_rate": 6.166391057020438e-07, "loss": 0.3386, "step": 3300 }, { "epoch": 4.749640287769784, "grad_norm": 0.07202452509042864, "learning_rate": 6.096298416369273e-07, "loss": 0.343, "step": 3301 }, { "epoch": 4.751079136690647, "grad_norm": 0.07242217820098035, "learning_rate": 6.026603364395867e-07, "loss": 0.347, "step": 3302 }, { "epoch": 4.752517985611511, "grad_norm": 0.07090701666068121, "learning_rate": 5.957305971447192e-07, "loss": 0.3409, "step": 3303 }, { "epoch": 4.753956834532374, "grad_norm": 0.07061662022993838, "learning_rate": 5.888406307468986e-07, "loss": 0.3496, "step": 3304 }, { "epoch": 4.755395683453237, "grad_norm": 0.07367594232626853, "learning_rate": 5.819904442005442e-07, "loss": 0.3467, "step": 3305 }, { "epoch": 4.756834532374101, "grad_norm": 0.07097064998410584, "learning_rate": 5.751800444199295e-07, "loss": 0.343, "step": 3306 }, { "epoch": 4.7582733812949645, "grad_norm": 0.07204400418422385, "learning_rate": 5.684094382791605e-07, "loss": 0.3424, "step": 3307 }, { "epoch": 4.759712230215827, "grad_norm": 0.07319040042214421, "learning_rate": 5.616786326121837e-07, "loss": 0.3463, "step": 3308 }, { "epoch": 4.761151079136691, "grad_norm": 0.06954107897437739, "learning_rate": 5.54987634212778e-07, "loss": 0.3447, "step": 3309 }, { "epoch": 4.762589928057554, "grad_norm": 0.0712861956177736, "learning_rate": 5.483364498345279e-07, "loss": 0.3414, "step": 3310 }, { "epoch": 4.764028776978417, "grad_norm": 0.06919069782996472, "learning_rate": 5.417250861908452e-07, "loss": 0.342, "step": 3311 }, { "epoch": 4.76546762589928, "grad_norm": 0.06769359574302063, "learning_rate": 5.351535499549387e-07, "loss": 0.3364, "step": 3312 }, { "epoch": 4.766906474820144, "grad_norm": 0.06966426302621624, "learning_rate": 5.286218477598226e-07, "loss": 0.3407, "step": 3313 }, { "epoch": 4.768345323741007, "grad_norm": 0.06988763571315693, "learning_rate": 5.221299861983075e-07, "loss": 0.3346, "step": 3314 }, { "epoch": 4.76978417266187, "grad_norm": 0.07286917998653716, "learning_rate": 5.156779718229787e-07, "loss": 0.3508, "step": 3315 }, { "epoch": 4.771223021582734, "grad_norm": 0.07050658311773654, "learning_rate": 5.092658111462179e-07, "loss": 0.341, "step": 3316 }, { "epoch": 4.7726618705035975, "grad_norm": 0.07028255172120178, "learning_rate": 5.028935106401678e-07, "loss": 0.3403, "step": 3317 }, { "epoch": 4.77410071942446, "grad_norm": 0.07289048655011893, "learning_rate": 4.965610767367413e-07, "loss": 0.3457, "step": 3318 }, { "epoch": 4.775539568345324, "grad_norm": 0.06787022097041073, "learning_rate": 4.902685158276078e-07, "loss": 0.3374, "step": 3319 }, { "epoch": 4.7769784172661875, "grad_norm": 0.07081096261177364, "learning_rate": 4.840158342642021e-07, "loss": 0.3392, "step": 3320 }, { "epoch": 4.77841726618705, "grad_norm": 0.06997345301677844, "learning_rate": 4.778030383577026e-07, "loss": 0.3399, "step": 3321 }, { "epoch": 4.779856115107914, "grad_norm": 0.06859614733030588, "learning_rate": 4.716301343790175e-07, "loss": 0.3428, "step": 3322 }, { "epoch": 4.781294964028777, "grad_norm": 0.07171319473830083, "learning_rate": 4.6549712855879837e-07, "loss": 0.3371, "step": 3323 }, { "epoch": 4.78273381294964, "grad_norm": 0.07370327698272955, "learning_rate": 4.5940402708744005e-07, "loss": 0.3527, "step": 3324 }, { "epoch": 4.784172661870503, "grad_norm": 0.07050434053534654, "learning_rate": 4.5335083611502293e-07, "loss": 0.3415, "step": 3325 }, { "epoch": 4.785611510791367, "grad_norm": 0.11270684077356648, "learning_rate": 4.473375617513842e-07, "loss": 0.3456, "step": 3326 }, { "epoch": 4.7870503597122305, "grad_norm": 0.0714616413391725, "learning_rate": 4.41364210066042e-07, "loss": 0.3505, "step": 3327 }, { "epoch": 4.788489208633093, "grad_norm": 0.07188375038968106, "learning_rate": 4.3543078708823126e-07, "loss": 0.3507, "step": 3328 }, { "epoch": 4.789928057553957, "grad_norm": 0.06887456261390511, "learning_rate": 4.295372988068813e-07, "loss": 0.3351, "step": 3329 }, { "epoch": 4.7913669064748206, "grad_norm": 0.07098726200078284, "learning_rate": 4.2368375117062043e-07, "loss": 0.3491, "step": 3330 }, { "epoch": 4.792805755395683, "grad_norm": 0.07395405969237014, "learning_rate": 4.178701500877491e-07, "loss": 0.3505, "step": 3331 }, { "epoch": 4.794244604316547, "grad_norm": 0.0705628304126942, "learning_rate": 4.120965014262579e-07, "loss": 0.3376, "step": 3332 }, { "epoch": 4.79568345323741, "grad_norm": 0.0687397453264519, "learning_rate": 4.063628110138096e-07, "loss": 0.34, "step": 3333 }, { "epoch": 4.797122302158273, "grad_norm": 0.07100196280713694, "learning_rate": 4.0066908463772593e-07, "loss": 0.3426, "step": 3334 }, { "epoch": 4.798561151079137, "grad_norm": 0.07064818367164927, "learning_rate": 3.9501532804500974e-07, "loss": 0.3355, "step": 3335 }, { "epoch": 4.8, "grad_norm": 0.07034955191773087, "learning_rate": 3.894015469423007e-07, "loss": 0.3495, "step": 3336 }, { "epoch": 4.8014388489208635, "grad_norm": 0.07098980654888197, "learning_rate": 3.838277469958973e-07, "loss": 0.3418, "step": 3337 }, { "epoch": 4.802877697841726, "grad_norm": 0.0693087014278541, "learning_rate": 3.7829393383174375e-07, "loss": 0.3465, "step": 3338 }, { "epoch": 4.80431654676259, "grad_norm": 0.06885712962710298, "learning_rate": 3.7280011303542084e-07, "loss": 0.3389, "step": 3339 }, { "epoch": 4.805755395683454, "grad_norm": 0.07252644943272837, "learning_rate": 3.673462901521463e-07, "loss": 0.3401, "step": 3340 }, { "epoch": 4.807194244604316, "grad_norm": 0.07000126262931973, "learning_rate": 3.619324706867655e-07, "loss": 0.3403, "step": 3341 }, { "epoch": 4.80863309352518, "grad_norm": 0.07092740269571697, "learning_rate": 3.5655866010373853e-07, "loss": 0.3434, "step": 3342 }, { "epoch": 4.810071942446044, "grad_norm": 0.07381551346368283, "learning_rate": 3.5122486382715314e-07, "loss": 0.3504, "step": 3343 }, { "epoch": 4.811510791366906, "grad_norm": 0.0667687378931654, "learning_rate": 3.459310872407029e-07, "loss": 0.3369, "step": 3344 }, { "epoch": 4.81294964028777, "grad_norm": 0.06986551611735359, "learning_rate": 3.4067733568768246e-07, "loss": 0.3455, "step": 3345 }, { "epoch": 4.814388489208633, "grad_norm": 0.0755061097725317, "learning_rate": 3.3546361447099664e-07, "loss": 0.3499, "step": 3346 }, { "epoch": 4.8158273381294965, "grad_norm": 0.07154098628724741, "learning_rate": 3.3028992885314247e-07, "loss": 0.3452, "step": 3347 }, { "epoch": 4.817266187050359, "grad_norm": 0.07282192416556188, "learning_rate": 3.2515628405620503e-07, "loss": 0.3455, "step": 3348 }, { "epoch": 4.818705035971223, "grad_norm": 0.06891706028658195, "learning_rate": 3.2006268526184824e-07, "loss": 0.3455, "step": 3349 }, { "epoch": 4.820143884892087, "grad_norm": 0.07276800257251627, "learning_rate": 3.150091376113329e-07, "loss": 0.3364, "step": 3350 }, { "epoch": 4.821582733812949, "grad_norm": 0.07407440886093652, "learning_rate": 3.0999564620547207e-07, "loss": 0.3434, "step": 3351 }, { "epoch": 4.823021582733813, "grad_norm": 0.06941563104967682, "learning_rate": 3.0502221610465786e-07, "loss": 0.3356, "step": 3352 }, { "epoch": 4.824460431654677, "grad_norm": 0.06863053348683697, "learning_rate": 3.0008885232886144e-07, "loss": 0.3415, "step": 3353 }, { "epoch": 4.825899280575539, "grad_norm": 0.07364843527397481, "learning_rate": 2.95195559857584e-07, "loss": 0.3369, "step": 3354 }, { "epoch": 4.827338129496403, "grad_norm": 0.06804047164493178, "learning_rate": 2.9034234362989687e-07, "loss": 0.3425, "step": 3355 }, { "epoch": 4.828776978417266, "grad_norm": 0.07216054806563041, "learning_rate": 2.855292085444239e-07, "loss": 0.3507, "step": 3356 }, { "epoch": 4.8302158273381295, "grad_norm": 0.06982074106557941, "learning_rate": 2.8075615945932333e-07, "loss": 0.3442, "step": 3357 }, { "epoch": 4.831654676258993, "grad_norm": 0.06827818079322905, "learning_rate": 2.7602320119229254e-07, "loss": 0.3385, "step": 3358 }, { "epoch": 4.833093525179856, "grad_norm": 0.06923145674977711, "learning_rate": 2.7133033852057675e-07, "loss": 0.3498, "step": 3359 }, { "epoch": 4.83453237410072, "grad_norm": 0.06746504159607383, "learning_rate": 2.666775761809337e-07, "loss": 0.3371, "step": 3360 }, { "epoch": 4.835971223021582, "grad_norm": 0.07075266970183086, "learning_rate": 2.620649188696511e-07, "loss": 0.3439, "step": 3361 }, { "epoch": 4.837410071942446, "grad_norm": 0.06885614287105779, "learning_rate": 2.574923712425426e-07, "loss": 0.3447, "step": 3362 }, { "epoch": 4.83884892086331, "grad_norm": 0.06863579601971304, "learning_rate": 2.52959937914925e-07, "loss": 0.3417, "step": 3363 }, { "epoch": 4.840287769784172, "grad_norm": 0.07207973539335823, "learning_rate": 2.484676234616412e-07, "loss": 0.3408, "step": 3364 }, { "epoch": 4.841726618705036, "grad_norm": 0.06925055589688599, "learning_rate": 2.440154324170285e-07, "loss": 0.3525, "step": 3365 }, { "epoch": 4.8431654676259, "grad_norm": 0.06858531426032595, "learning_rate": 2.3960336927492333e-07, "loss": 0.3462, "step": 3366 }, { "epoch": 4.8446043165467625, "grad_norm": 0.07021885567068153, "learning_rate": 2.3523143848867003e-07, "loss": 0.3438, "step": 3367 }, { "epoch": 4.846043165467626, "grad_norm": 0.06985375092417792, "learning_rate": 2.3089964447109425e-07, "loss": 0.342, "step": 3368 }, { "epoch": 4.847482014388489, "grad_norm": 0.070370726460504, "learning_rate": 2.2660799159451629e-07, "loss": 0.3477, "step": 3369 }, { "epoch": 4.848920863309353, "grad_norm": 0.06937979912952354, "learning_rate": 2.2235648419073773e-07, "loss": 0.3442, "step": 3370 }, { "epoch": 4.850359712230215, "grad_norm": 0.06921640426519425, "learning_rate": 2.1814512655103703e-07, "loss": 0.3427, "step": 3371 }, { "epoch": 4.851798561151079, "grad_norm": 0.07121538254960598, "learning_rate": 2.1397392292617392e-07, "loss": 0.3408, "step": 3372 }, { "epoch": 4.853237410071943, "grad_norm": 0.07054649424777172, "learning_rate": 2.0984287752636722e-07, "loss": 0.3567, "step": 3373 }, { "epoch": 4.854676258992805, "grad_norm": 0.06859174423295826, "learning_rate": 2.0575199452131268e-07, "loss": 0.3452, "step": 3374 }, { "epoch": 4.856115107913669, "grad_norm": 0.06924411163154238, "learning_rate": 2.017012780401606e-07, "loss": 0.3433, "step": 3375 }, { "epoch": 4.857553956834533, "grad_norm": 0.07349884644632264, "learning_rate": 1.9769073217152933e-07, "loss": 0.345, "step": 3376 }, { "epoch": 4.8589928057553955, "grad_norm": 0.07081026265598288, "learning_rate": 1.9372036096347414e-07, "loss": 0.3453, "step": 3377 }, { "epoch": 4.860431654676259, "grad_norm": 0.06952943020219723, "learning_rate": 1.8979016842350928e-07, "loss": 0.3492, "step": 3378 }, { "epoch": 4.861870503597122, "grad_norm": 0.07076935581770977, "learning_rate": 1.8590015851860376e-07, "loss": 0.3526, "step": 3379 }, { "epoch": 4.863309352517986, "grad_norm": 0.06881864526271664, "learning_rate": 1.8205033517515015e-07, "loss": 0.3423, "step": 3380 }, { "epoch": 4.864748201438849, "grad_norm": 0.06788157979408772, "learning_rate": 1.7824070227899115e-07, "loss": 0.3454, "step": 3381 }, { "epoch": 4.866187050359712, "grad_norm": 0.07196156739195902, "learning_rate": 1.7447126367539313e-07, "loss": 0.344, "step": 3382 }, { "epoch": 4.867625899280576, "grad_norm": 0.06738594219189462, "learning_rate": 1.7074202316906374e-07, "loss": 0.331, "step": 3383 }, { "epoch": 4.869064748201438, "grad_norm": 0.06930807246944674, "learning_rate": 1.6705298452412978e-07, "loss": 0.3403, "step": 3384 }, { "epoch": 4.870503597122302, "grad_norm": 0.06932608247336144, "learning_rate": 1.6340415146414157e-07, "loss": 0.3424, "step": 3385 }, { "epoch": 4.871942446043166, "grad_norm": 0.06815944754953394, "learning_rate": 1.597955276720642e-07, "loss": 0.3421, "step": 3386 }, { "epoch": 4.8733812949640285, "grad_norm": 0.07345784152517565, "learning_rate": 1.562271167902818e-07, "loss": 0.3379, "step": 3387 }, { "epoch": 4.874820143884892, "grad_norm": 0.06893531752585413, "learning_rate": 1.526989224205888e-07, "loss": 0.3348, "step": 3388 }, { "epoch": 4.876258992805756, "grad_norm": 0.06931265398781795, "learning_rate": 1.4921094812418103e-07, "loss": 0.3437, "step": 3389 }, { "epoch": 4.877697841726619, "grad_norm": 0.06913324074740719, "learning_rate": 1.457631974216689e-07, "loss": 0.3357, "step": 3390 }, { "epoch": 4.879136690647482, "grad_norm": 0.06999009405527448, "learning_rate": 1.4235567379305536e-07, "loss": 0.342, "step": 3391 }, { "epoch": 4.880575539568345, "grad_norm": 0.06704443213036577, "learning_rate": 1.389883806777359e-07, "loss": 0.3381, "step": 3392 }, { "epoch": 4.882014388489209, "grad_norm": 0.06954696835327542, "learning_rate": 1.356613214745117e-07, "loss": 0.3555, "step": 3393 }, { "epoch": 4.883453237410072, "grad_norm": 0.07257864704658673, "learning_rate": 1.3237449954156767e-07, "loss": 0.3497, "step": 3394 }, { "epoch": 4.884892086330935, "grad_norm": 0.06835497987647865, "learning_rate": 1.2912791819646774e-07, "loss": 0.3472, "step": 3395 }, { "epoch": 4.886330935251799, "grad_norm": 0.07115444901613881, "learning_rate": 1.2592158071616844e-07, "loss": 0.3474, "step": 3396 }, { "epoch": 4.8877697841726615, "grad_norm": 0.07128270297209283, "learning_rate": 1.2275549033700097e-07, "loss": 0.3553, "step": 3397 }, { "epoch": 4.889208633093525, "grad_norm": 0.06944010179810285, "learning_rate": 1.1962965025467564e-07, "loss": 0.3525, "step": 3398 }, { "epoch": 4.890647482014389, "grad_norm": 0.06802836608159324, "learning_rate": 1.1654406362427762e-07, "loss": 0.3418, "step": 3399 }, { "epoch": 4.892086330935252, "grad_norm": 0.07014184850529115, "learning_rate": 1.1349873356025332e-07, "loss": 0.3503, "step": 3400 }, { "epoch": 4.893525179856115, "grad_norm": 0.06877137343946575, "learning_rate": 1.1049366313642395e-07, "loss": 0.3432, "step": 3401 }, { "epoch": 4.894964028776979, "grad_norm": 0.07090780883347099, "learning_rate": 1.0752885538598102e-07, "loss": 0.3453, "step": 3402 }, { "epoch": 4.896402877697842, "grad_norm": 0.06937322732548545, "learning_rate": 1.0460431330145515e-07, "loss": 0.3429, "step": 3403 }, { "epoch": 4.897841726618705, "grad_norm": 0.07059332959477979, "learning_rate": 1.0172003983475176e-07, "loss": 0.3442, "step": 3404 }, { "epoch": 4.899280575539568, "grad_norm": 0.0691006213917355, "learning_rate": 9.887603789712875e-08, "loss": 0.3456, "step": 3405 }, { "epoch": 4.900719424460432, "grad_norm": 0.07090172393618893, "learning_rate": 9.607231035919651e-08, "loss": 0.3423, "step": 3406 }, { "epoch": 4.9021582733812945, "grad_norm": 0.07005456018551516, "learning_rate": 9.330886005090467e-08, "loss": 0.3493, "step": 3407 }, { "epoch": 4.903597122302158, "grad_norm": 0.0718711843822888, "learning_rate": 9.058568976155979e-08, "loss": 0.3389, "step": 3408 }, { "epoch": 4.905035971223022, "grad_norm": 0.06994140125733872, "learning_rate": 8.790280223980763e-08, "loss": 0.343, "step": 3409 }, { "epoch": 4.906474820143885, "grad_norm": 0.07023114151718875, "learning_rate": 8.526020019363313e-08, "loss": 0.349, "step": 3410 }, { "epoch": 4.907913669064748, "grad_norm": 0.07040002767426613, "learning_rate": 8.265788629036043e-08, "loss": 0.3284, "step": 3411 }, { "epoch": 4.909352517985612, "grad_norm": 0.0703481353829406, "learning_rate": 8.009586315664842e-08, "loss": 0.354, "step": 3412 }, { "epoch": 4.910791366906475, "grad_norm": 0.07351480314016241, "learning_rate": 7.757413337848629e-08, "loss": 0.3385, "step": 3413 }, { "epoch": 4.912230215827338, "grad_norm": 0.07161459246687522, "learning_rate": 7.509269950119358e-08, "loss": 0.3539, "step": 3414 }, { "epoch": 4.913669064748201, "grad_norm": 0.07032513023391523, "learning_rate": 7.265156402942452e-08, "loss": 0.3483, "step": 3415 }, { "epoch": 4.915107913669065, "grad_norm": 0.06914653953914185, "learning_rate": 7.025072942714595e-08, "loss": 0.3499, "step": 3416 }, { "epoch": 4.916546762589928, "grad_norm": 0.07079020521232746, "learning_rate": 6.789019811765052e-08, "loss": 0.3469, "step": 3417 }, { "epoch": 4.917985611510791, "grad_norm": 0.06854317738438706, "learning_rate": 6.556997248355679e-08, "loss": 0.3461, "step": 3418 }, { "epoch": 4.919424460431655, "grad_norm": 0.07083775956631229, "learning_rate": 6.329005486679584e-08, "loss": 0.3447, "step": 3419 }, { "epoch": 4.920863309352518, "grad_norm": 0.06983250569314274, "learning_rate": 6.105044756861134e-08, "loss": 0.3417, "step": 3420 }, { "epoch": 4.922302158273381, "grad_norm": 0.07146295606304731, "learning_rate": 5.8851152849563886e-08, "loss": 0.3529, "step": 3421 }, { "epoch": 4.923741007194245, "grad_norm": 0.06983646641591423, "learning_rate": 5.669217292952223e-08, "loss": 0.3381, "step": 3422 }, { "epoch": 4.925179856115108, "grad_norm": 0.06950858504583642, "learning_rate": 5.4573509987663196e-08, "loss": 0.3496, "step": 3423 }, { "epoch": 4.926618705035971, "grad_norm": 0.0673192526049635, "learning_rate": 5.2495166162471747e-08, "loss": 0.3429, "step": 3424 }, { "epoch": 4.928057553956835, "grad_norm": 0.06773038018463526, "learning_rate": 5.045714355173203e-08, "loss": 0.3414, "step": 3425 }, { "epoch": 4.929496402877698, "grad_norm": 0.06898430126079236, "learning_rate": 4.845944421253634e-08, "loss": 0.3416, "step": 3426 }, { "epoch": 4.930935251798561, "grad_norm": 0.06814436446211519, "learning_rate": 4.650207016126729e-08, "loss": 0.3429, "step": 3427 }, { "epoch": 4.932374100719424, "grad_norm": 0.06859985130717335, "learning_rate": 4.458502337361115e-08, "loss": 0.3524, "step": 3428 }, { "epoch": 4.933812949640288, "grad_norm": 0.06996981696239825, "learning_rate": 4.270830578455343e-08, "loss": 0.3434, "step": 3429 }, { "epoch": 4.935251798561151, "grad_norm": 0.10234678520498625, "learning_rate": 4.087191928836554e-08, "loss": 0.3342, "step": 3430 }, { "epoch": 4.936690647482014, "grad_norm": 0.0711256466785614, "learning_rate": 3.907586573860922e-08, "loss": 0.3442, "step": 3431 }, { "epoch": 4.938129496402878, "grad_norm": 0.06915931117632583, "learning_rate": 3.7320146948149894e-08, "loss": 0.3387, "step": 3432 }, { "epoch": 4.939568345323741, "grad_norm": 0.07217472866366731, "learning_rate": 3.560476468912111e-08, "loss": 0.3416, "step": 3433 }, { "epoch": 4.941007194244604, "grad_norm": 0.06985373626638046, "learning_rate": 3.392972069295564e-08, "loss": 0.3426, "step": 3434 }, { "epoch": 4.942446043165468, "grad_norm": 0.06908507990367821, "learning_rate": 3.229501665037216e-08, "loss": 0.3446, "step": 3435 }, { "epoch": 4.943884892086331, "grad_norm": 0.06921946977253772, "learning_rate": 3.0700654211361925e-08, "loss": 0.3395, "step": 3436 }, { "epoch": 4.945323741007194, "grad_norm": 0.06958416841465791, "learning_rate": 2.9146634985206535e-08, "loss": 0.3433, "step": 3437 }, { "epoch": 4.946762589928057, "grad_norm": 0.06839442541200638, "learning_rate": 2.7632960540460162e-08, "loss": 0.3373, "step": 3438 }, { "epoch": 4.948201438848921, "grad_norm": 0.06973767335219232, "learning_rate": 2.6159632404958447e-08, "loss": 0.3406, "step": 3439 }, { "epoch": 4.9496402877697845, "grad_norm": 0.07048415571233173, "learning_rate": 2.472665206581404e-08, "loss": 0.3494, "step": 3440 }, { "epoch": 4.951079136690647, "grad_norm": 0.071024230979393, "learning_rate": 2.3334020969407733e-08, "loss": 0.3482, "step": 3441 }, { "epoch": 4.952517985611511, "grad_norm": 0.06840496393214036, "learning_rate": 2.1981740521406226e-08, "loss": 0.3396, "step": 3442 }, { "epoch": 4.953956834532374, "grad_norm": 0.06957410661638072, "learning_rate": 2.0669812086735464e-08, "loss": 0.339, "step": 3443 }, { "epoch": 4.955395683453237, "grad_norm": 0.07314235710964967, "learning_rate": 1.9398236989598418e-08, "loss": 0.3453, "step": 3444 }, { "epoch": 4.956834532374101, "grad_norm": 0.07084760603623917, "learning_rate": 1.8167016513470636e-08, "loss": 0.3497, "step": 3445 }, { "epoch": 4.958273381294964, "grad_norm": 0.0676189923280852, "learning_rate": 1.697615190107804e-08, "loss": 0.3475, "step": 3446 }, { "epoch": 4.959712230215827, "grad_norm": 0.0708687898940551, "learning_rate": 1.582564435444134e-08, "loss": 0.341, "step": 3447 }, { "epoch": 4.961151079136691, "grad_norm": 0.0703179408620645, "learning_rate": 1.4715495034818284e-08, "loss": 0.334, "step": 3448 }, { "epoch": 4.962589928057554, "grad_norm": 0.07113636099874597, "learning_rate": 1.3645705062748094e-08, "loss": 0.3441, "step": 3449 }, { "epoch": 4.9640287769784175, "grad_norm": 0.07345379436488886, "learning_rate": 1.2616275518033683e-08, "loss": 0.3394, "step": 3450 }, { "epoch": 4.96546762589928, "grad_norm": 0.07026785929363381, "learning_rate": 1.1627207439728339e-08, "loss": 0.3398, "step": 3451 }, { "epoch": 4.966906474820144, "grad_norm": 0.06962592911490988, "learning_rate": 1.0678501826153486e-08, "loss": 0.3519, "step": 3452 }, { "epoch": 4.968345323741008, "grad_norm": 0.06548377636152011, "learning_rate": 9.770159634894249e-09, "loss": 0.3486, "step": 3453 }, { "epoch": 4.96978417266187, "grad_norm": 0.06879843007364607, "learning_rate": 8.902181782786124e-09, "loss": 0.3346, "step": 3454 }, { "epoch": 4.971223021582734, "grad_norm": 0.06919510418451531, "learning_rate": 8.074569145928301e-09, "loss": 0.3444, "step": 3455 }, { "epoch": 4.972661870503597, "grad_norm": 0.06857966949274884, "learning_rate": 7.287322559679233e-09, "loss": 0.3378, "step": 3456 }, { "epoch": 4.97410071942446, "grad_norm": 0.06709999989660448, "learning_rate": 6.5404428186433e-09, "loss": 0.3387, "step": 3457 }, { "epoch": 4.975539568345324, "grad_norm": 0.06821727148625614, "learning_rate": 5.833930676693023e-09, "loss": 0.3292, "step": 3458 }, { "epoch": 4.976978417266187, "grad_norm": 0.06616684632878932, "learning_rate": 5.167786846946854e-09, "loss": 0.3421, "step": 3459 }, { "epoch": 4.9784172661870505, "grad_norm": 0.07530907256547428, "learning_rate": 4.542012001778062e-09, "loss": 0.3454, "step": 3460 }, { "epoch": 4.979856115107914, "grad_norm": 0.06900234637341345, "learning_rate": 3.956606772823613e-09, "loss": 0.3465, "step": 3461 }, { "epoch": 4.981294964028777, "grad_norm": 0.07441430381886469, "learning_rate": 3.4115717509619616e-09, "loss": 0.3556, "step": 3462 }, { "epoch": 4.982733812949641, "grad_norm": 0.39321687521638127, "learning_rate": 2.9069074863219414e-09, "loss": 0.3508, "step": 3463 }, { "epoch": 4.984172661870503, "grad_norm": 0.0708930817455364, "learning_rate": 2.4426144882916392e-09, "loss": 0.3458, "step": 3464 }, { "epoch": 4.985611510791367, "grad_norm": 0.06812053265762419, "learning_rate": 2.018693225509516e-09, "loss": 0.3433, "step": 3465 }, { "epoch": 4.98705035971223, "grad_norm": 0.06911482196637857, "learning_rate": 1.6351441258644073e-09, "loss": 0.3396, "step": 3466 }, { "epoch": 4.988489208633093, "grad_norm": 0.07016639908358226, "learning_rate": 1.2919675764910823e-09, "loss": 0.3405, "step": 3467 }, { "epoch": 4.989928057553957, "grad_norm": 0.0688260652987113, "learning_rate": 9.89163923770242e-10, "loss": 0.3394, "step": 3468 }, { "epoch": 4.99136690647482, "grad_norm": 0.06872232929291658, "learning_rate": 7.26733473350727e-10, "loss": 0.3366, "step": 3469 }, { "epoch": 4.9928057553956835, "grad_norm": 0.07188593839486318, "learning_rate": 5.046764901095457e-10, "loss": 0.3463, "step": 3470 }, { "epoch": 4.994244604316547, "grad_norm": 0.06941468398555707, "learning_rate": 3.2299319817852283e-10, "loss": 0.3409, "step": 3471 }, { "epoch": 4.99568345323741, "grad_norm": 0.06822491198591221, "learning_rate": 1.816837809487382e-10, "loss": 0.3453, "step": 3472 }, { "epoch": 4.997122302158274, "grad_norm": 0.07047016856587555, "learning_rate": 8.074838104832338e-11, "loss": 0.3421, "step": 3473 }, { "epoch": 4.998561151079136, "grad_norm": 0.06851680587861847, "learning_rate": 2.0187100355784085e-11, "loss": 0.3459, "step": 3474 }, { "epoch": 5.0, "grad_norm": 0.06834187171522385, "learning_rate": 0.0, "loss": 0.3406, "step": 3475 }, { "epoch": 5.0, "step": 3475, "total_flos": 5.82970943471616e+16, "train_loss": 0.3069635923474813, "train_runtime": 45586.6789, "train_samples_per_second": 39.019, "train_steps_per_second": 0.076 } ], "logging_steps": 1, "max_steps": 3475, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.82970943471616e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }