{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9995035578355123, "eval_steps": 500, "global_step": 6042, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004964421644878371, "grad_norm": 5.537712097167969, "learning_rate": 1.652892561983471e-08, "loss": 0.8251, "step": 1 }, { "epoch": 0.0009928843289756743, "grad_norm": 5.653915882110596, "learning_rate": 3.305785123966942e-08, "loss": 0.8712, "step": 2 }, { "epoch": 0.0014893264934635116, "grad_norm": 5.490711212158203, "learning_rate": 4.958677685950414e-08, "loss": 0.8127, "step": 3 }, { "epoch": 0.0019857686579513485, "grad_norm": 5.549751281738281, "learning_rate": 6.611570247933884e-08, "loss": 0.8435, "step": 4 }, { "epoch": 0.0024822108224391857, "grad_norm": 5.879448890686035, "learning_rate": 8.264462809917357e-08, "loss": 0.8598, "step": 5 }, { "epoch": 0.002978652986927023, "grad_norm": 5.602003574371338, "learning_rate": 9.917355371900828e-08, "loss": 0.8498, "step": 6 }, { "epoch": 0.0034750951514148603, "grad_norm": 5.4635539054870605, "learning_rate": 1.1570247933884297e-07, "loss": 0.8395, "step": 7 }, { "epoch": 0.003971537315902697, "grad_norm": 5.812467575073242, "learning_rate": 1.3223140495867768e-07, "loss": 0.8498, "step": 8 }, { "epoch": 0.004467979480390534, "grad_norm": 5.34513521194458, "learning_rate": 1.487603305785124e-07, "loss": 0.8071, "step": 9 }, { "epoch": 0.004964421644878371, "grad_norm": 5.561849117279053, "learning_rate": 1.6528925619834713e-07, "loss": 0.8271, "step": 10 }, { "epoch": 0.005460863809366208, "grad_norm": 5.711532115936279, "learning_rate": 1.8181818181818183e-07, "loss": 0.8503, "step": 11 }, { "epoch": 0.005957305973854046, "grad_norm": 5.887825965881348, "learning_rate": 1.9834710743801655e-07, "loss": 0.8558, "step": 12 }, { "epoch": 0.0064537481383418836, "grad_norm": 5.878129482269287, "learning_rate": 2.1487603305785125e-07, "loss": 0.8657, "step": 13 }, { "epoch": 0.006950190302829721, "grad_norm": 5.460679531097412, "learning_rate": 2.3140495867768595e-07, "loss": 0.8544, "step": 14 }, { "epoch": 0.007446632467317558, "grad_norm": 5.463341236114502, "learning_rate": 2.4793388429752067e-07, "loss": 0.8512, "step": 15 }, { "epoch": 0.007943074631805394, "grad_norm": 5.639693737030029, "learning_rate": 2.6446280991735537e-07, "loss": 0.8301, "step": 16 }, { "epoch": 0.008439516796293232, "grad_norm": 5.407242298126221, "learning_rate": 2.809917355371901e-07, "loss": 0.812, "step": 17 }, { "epoch": 0.008935958960781068, "grad_norm": 5.06992769241333, "learning_rate": 2.975206611570248e-07, "loss": 0.806, "step": 18 }, { "epoch": 0.009432401125268906, "grad_norm": 5.385615348815918, "learning_rate": 3.1404958677685957e-07, "loss": 0.8511, "step": 19 }, { "epoch": 0.009928843289756743, "grad_norm": 5.125736236572266, "learning_rate": 3.3057851239669426e-07, "loss": 0.8112, "step": 20 }, { "epoch": 0.01042528545424458, "grad_norm": 5.141676425933838, "learning_rate": 3.4710743801652896e-07, "loss": 0.8105, "step": 21 }, { "epoch": 0.010921727618732417, "grad_norm": 4.966934680938721, "learning_rate": 3.6363636363636366e-07, "loss": 0.7885, "step": 22 }, { "epoch": 0.011418169783220255, "grad_norm": 5.24598503112793, "learning_rate": 3.8016528925619836e-07, "loss": 0.8576, "step": 23 }, { "epoch": 0.011914611947708093, "grad_norm": 4.434146404266357, "learning_rate": 3.966942148760331e-07, "loss": 0.7978, "step": 24 }, { "epoch": 0.012411054112195929, "grad_norm": 4.320842742919922, "learning_rate": 4.132231404958678e-07, "loss": 0.801, "step": 25 }, { "epoch": 0.012907496276683767, "grad_norm": 4.290024280548096, "learning_rate": 4.297520661157025e-07, "loss": 0.8274, "step": 26 }, { "epoch": 0.013403938441171603, "grad_norm": 4.211623191833496, "learning_rate": 4.462809917355372e-07, "loss": 0.798, "step": 27 }, { "epoch": 0.013900380605659441, "grad_norm": 4.045799732208252, "learning_rate": 4.628099173553719e-07, "loss": 0.7869, "step": 28 }, { "epoch": 0.014396822770147278, "grad_norm": 4.017869472503662, "learning_rate": 4.793388429752067e-07, "loss": 0.7986, "step": 29 }, { "epoch": 0.014893264934635116, "grad_norm": 3.9717729091644287, "learning_rate": 4.958677685950413e-07, "loss": 0.7853, "step": 30 }, { "epoch": 0.015389707099122952, "grad_norm": 3.8529884815216064, "learning_rate": 5.123966942148761e-07, "loss": 0.7474, "step": 31 }, { "epoch": 0.015886149263610788, "grad_norm": 3.2095980644226074, "learning_rate": 5.289256198347107e-07, "loss": 0.7911, "step": 32 }, { "epoch": 0.016382591428098628, "grad_norm": 2.215912342071533, "learning_rate": 5.454545454545455e-07, "loss": 0.7406, "step": 33 }, { "epoch": 0.016879033592586464, "grad_norm": 2.2716546058654785, "learning_rate": 5.619834710743802e-07, "loss": 0.7695, "step": 34 }, { "epoch": 0.0173754757570743, "grad_norm": 2.2271697521209717, "learning_rate": 5.78512396694215e-07, "loss": 0.745, "step": 35 }, { "epoch": 0.017871917921562137, "grad_norm": 2.231329917907715, "learning_rate": 5.950413223140496e-07, "loss": 0.7153, "step": 36 }, { "epoch": 0.018368360086049976, "grad_norm": 2.0750136375427246, "learning_rate": 6.115702479338844e-07, "loss": 0.7562, "step": 37 }, { "epoch": 0.018864802250537813, "grad_norm": 2.0137922763824463, "learning_rate": 6.280991735537191e-07, "loss": 0.742, "step": 38 }, { "epoch": 0.01936124441502565, "grad_norm": 1.925102710723877, "learning_rate": 6.446280991735538e-07, "loss": 0.7436, "step": 39 }, { "epoch": 0.019857686579513485, "grad_norm": 1.9578804969787598, "learning_rate": 6.611570247933885e-07, "loss": 0.7501, "step": 40 }, { "epoch": 0.020354128744001325, "grad_norm": 1.8270113468170166, "learning_rate": 6.776859504132232e-07, "loss": 0.7272, "step": 41 }, { "epoch": 0.02085057090848916, "grad_norm": 1.6372075080871582, "learning_rate": 6.942148760330579e-07, "loss": 0.7249, "step": 42 }, { "epoch": 0.021347013072976997, "grad_norm": 1.419540524482727, "learning_rate": 7.107438016528927e-07, "loss": 0.7136, "step": 43 }, { "epoch": 0.021843455237464834, "grad_norm": 1.3901582956314087, "learning_rate": 7.272727272727273e-07, "loss": 0.7284, "step": 44 }, { "epoch": 0.022339897401952673, "grad_norm": 1.4561810493469238, "learning_rate": 7.438016528925621e-07, "loss": 0.7039, "step": 45 }, { "epoch": 0.02283633956644051, "grad_norm": 1.6852504014968872, "learning_rate": 7.603305785123967e-07, "loss": 0.6976, "step": 46 }, { "epoch": 0.023332781730928346, "grad_norm": 1.7738722562789917, "learning_rate": 7.768595041322315e-07, "loss": 0.6721, "step": 47 }, { "epoch": 0.023829223895416186, "grad_norm": 1.7503758668899536, "learning_rate": 7.933884297520662e-07, "loss": 0.6905, "step": 48 }, { "epoch": 0.024325666059904022, "grad_norm": 1.8305506706237793, "learning_rate": 8.099173553719009e-07, "loss": 0.7314, "step": 49 }, { "epoch": 0.024822108224391858, "grad_norm": 1.533512830734253, "learning_rate": 8.264462809917356e-07, "loss": 0.6842, "step": 50 }, { "epoch": 0.025318550388879695, "grad_norm": 1.5481945276260376, "learning_rate": 8.429752066115703e-07, "loss": 0.6644, "step": 51 }, { "epoch": 0.025814992553367534, "grad_norm": 1.255188226699829, "learning_rate": 8.59504132231405e-07, "loss": 0.6496, "step": 52 }, { "epoch": 0.02631143471785537, "grad_norm": 1.1710678339004517, "learning_rate": 8.760330578512398e-07, "loss": 0.68, "step": 53 }, { "epoch": 0.026807876882343207, "grad_norm": 1.1016267538070679, "learning_rate": 8.925619834710744e-07, "loss": 0.6571, "step": 54 }, { "epoch": 0.027304319046831043, "grad_norm": 1.0589491128921509, "learning_rate": 9.090909090909091e-07, "loss": 0.6814, "step": 55 }, { "epoch": 0.027800761211318883, "grad_norm": 0.9275434017181396, "learning_rate": 9.256198347107438e-07, "loss": 0.6838, "step": 56 }, { "epoch": 0.02829720337580672, "grad_norm": 0.8655200004577637, "learning_rate": 9.421487603305785e-07, "loss": 0.6831, "step": 57 }, { "epoch": 0.028793645540294555, "grad_norm": 0.999332845211029, "learning_rate": 9.586776859504134e-07, "loss": 0.6652, "step": 58 }, { "epoch": 0.02929008770478239, "grad_norm": 0.9379848837852478, "learning_rate": 9.75206611570248e-07, "loss": 0.6534, "step": 59 }, { "epoch": 0.02978652986927023, "grad_norm": 1.004515528678894, "learning_rate": 9.917355371900827e-07, "loss": 0.6568, "step": 60 }, { "epoch": 0.030282972033758068, "grad_norm": 0.891022264957428, "learning_rate": 1.0082644628099174e-06, "loss": 0.6516, "step": 61 }, { "epoch": 0.030779414198245904, "grad_norm": 0.8138242959976196, "learning_rate": 1.0247933884297522e-06, "loss": 0.5937, "step": 62 }, { "epoch": 0.031275856362733744, "grad_norm": 0.7089791297912598, "learning_rate": 1.041322314049587e-06, "loss": 0.6385, "step": 63 }, { "epoch": 0.031772298527221576, "grad_norm": 0.7549603581428528, "learning_rate": 1.0578512396694215e-06, "loss": 0.6165, "step": 64 }, { "epoch": 0.032268740691709416, "grad_norm": 0.6580888628959656, "learning_rate": 1.0743801652892562e-06, "loss": 0.6448, "step": 65 }, { "epoch": 0.032765182856197256, "grad_norm": 0.6925730109214783, "learning_rate": 1.090909090909091e-06, "loss": 0.6402, "step": 66 }, { "epoch": 0.03326162502068509, "grad_norm": 0.673387885093689, "learning_rate": 1.1074380165289257e-06, "loss": 0.6458, "step": 67 }, { "epoch": 0.03375806718517293, "grad_norm": 0.6633987426757812, "learning_rate": 1.1239669421487605e-06, "loss": 0.6077, "step": 68 }, { "epoch": 0.03425450934966076, "grad_norm": 0.6271533966064453, "learning_rate": 1.140495867768595e-06, "loss": 0.6497, "step": 69 }, { "epoch": 0.0347509515141486, "grad_norm": 0.5847249031066895, "learning_rate": 1.15702479338843e-06, "loss": 0.6234, "step": 70 }, { "epoch": 0.03524739367863644, "grad_norm": 0.5981353521347046, "learning_rate": 1.1735537190082645e-06, "loss": 0.5965, "step": 71 }, { "epoch": 0.03574383584312427, "grad_norm": 0.5607947707176208, "learning_rate": 1.1900826446280993e-06, "loss": 0.5932, "step": 72 }, { "epoch": 0.03624027800761211, "grad_norm": 0.5636858344078064, "learning_rate": 1.206611570247934e-06, "loss": 0.5865, "step": 73 }, { "epoch": 0.03673672017209995, "grad_norm": 0.6038545966148376, "learning_rate": 1.2231404958677688e-06, "loss": 0.615, "step": 74 }, { "epoch": 0.037233162336587786, "grad_norm": 0.5776389241218567, "learning_rate": 1.2396694214876035e-06, "loss": 0.6066, "step": 75 }, { "epoch": 0.037729604501075625, "grad_norm": 0.611750602722168, "learning_rate": 1.2561983471074383e-06, "loss": 0.6306, "step": 76 }, { "epoch": 0.038226046665563465, "grad_norm": 0.5761667490005493, "learning_rate": 1.2727272727272728e-06, "loss": 0.6257, "step": 77 }, { "epoch": 0.0387224888300513, "grad_norm": 0.5354779362678528, "learning_rate": 1.2892561983471076e-06, "loss": 0.5943, "step": 78 }, { "epoch": 0.03921893099453914, "grad_norm": 0.49696028232574463, "learning_rate": 1.3057851239669423e-06, "loss": 0.6216, "step": 79 }, { "epoch": 0.03971537315902697, "grad_norm": 0.46249544620513916, "learning_rate": 1.322314049586777e-06, "loss": 0.6036, "step": 80 }, { "epoch": 0.04021181532351481, "grad_norm": 0.44712987542152405, "learning_rate": 1.3388429752066118e-06, "loss": 0.5592, "step": 81 }, { "epoch": 0.04070825748800265, "grad_norm": 0.47506338357925415, "learning_rate": 1.3553719008264463e-06, "loss": 0.6026, "step": 82 }, { "epoch": 0.04120469965249048, "grad_norm": 0.48472481966018677, "learning_rate": 1.371900826446281e-06, "loss": 0.6044, "step": 83 }, { "epoch": 0.04170114181697832, "grad_norm": 0.48860955238342285, "learning_rate": 1.3884297520661158e-06, "loss": 0.6051, "step": 84 }, { "epoch": 0.04219758398146616, "grad_norm": 0.43376126885414124, "learning_rate": 1.4049586776859506e-06, "loss": 0.5719, "step": 85 }, { "epoch": 0.042694026145953995, "grad_norm": 0.49441155791282654, "learning_rate": 1.4214876033057853e-06, "loss": 0.5948, "step": 86 }, { "epoch": 0.043190468310441835, "grad_norm": 0.4431954622268677, "learning_rate": 1.4380165289256199e-06, "loss": 0.596, "step": 87 }, { "epoch": 0.04368691047492967, "grad_norm": 0.4832960069179535, "learning_rate": 1.4545454545454546e-06, "loss": 0.6115, "step": 88 }, { "epoch": 0.04418335263941751, "grad_norm": 0.4512057304382324, "learning_rate": 1.4710743801652894e-06, "loss": 0.5811, "step": 89 }, { "epoch": 0.04467979480390535, "grad_norm": 0.46607622504234314, "learning_rate": 1.4876033057851241e-06, "loss": 0.6139, "step": 90 }, { "epoch": 0.04517623696839318, "grad_norm": 0.43149900436401367, "learning_rate": 1.5041322314049589e-06, "loss": 0.584, "step": 91 }, { "epoch": 0.04567267913288102, "grad_norm": 0.40727904438972473, "learning_rate": 1.5206611570247934e-06, "loss": 0.5502, "step": 92 }, { "epoch": 0.04616912129736886, "grad_norm": 0.4200940430164337, "learning_rate": 1.5371900826446282e-06, "loss": 0.5777, "step": 93 }, { "epoch": 0.04666556346185669, "grad_norm": 0.42519038915634155, "learning_rate": 1.553719008264463e-06, "loss": 0.5788, "step": 94 }, { "epoch": 0.04716200562634453, "grad_norm": 0.43745777010917664, "learning_rate": 1.5702479338842977e-06, "loss": 0.5757, "step": 95 }, { "epoch": 0.04765844779083237, "grad_norm": 0.429045170545578, "learning_rate": 1.5867768595041324e-06, "loss": 0.5785, "step": 96 }, { "epoch": 0.048154889955320204, "grad_norm": 0.5629072189331055, "learning_rate": 1.603305785123967e-06, "loss": 0.6079, "step": 97 }, { "epoch": 0.048651332119808044, "grad_norm": 0.39946743845939636, "learning_rate": 1.6198347107438017e-06, "loss": 0.581, "step": 98 }, { "epoch": 0.04914777428429588, "grad_norm": 0.46745991706848145, "learning_rate": 1.6363636363636365e-06, "loss": 0.5736, "step": 99 }, { "epoch": 0.049644216448783716, "grad_norm": 0.39259618520736694, "learning_rate": 1.6528925619834712e-06, "loss": 0.5364, "step": 100 }, { "epoch": 0.050140658613271556, "grad_norm": 0.42921721935272217, "learning_rate": 1.669421487603306e-06, "loss": 0.5865, "step": 101 }, { "epoch": 0.05063710077775939, "grad_norm": 0.4561671018600464, "learning_rate": 1.6859504132231405e-06, "loss": 0.5718, "step": 102 }, { "epoch": 0.05113354294224723, "grad_norm": 0.3832397758960724, "learning_rate": 1.7024793388429753e-06, "loss": 0.5507, "step": 103 }, { "epoch": 0.05162998510673507, "grad_norm": 0.468263179063797, "learning_rate": 1.71900826446281e-06, "loss": 0.588, "step": 104 }, { "epoch": 0.0521264272712229, "grad_norm": 0.42221948504447937, "learning_rate": 1.7355371900826448e-06, "loss": 0.5674, "step": 105 }, { "epoch": 0.05262286943571074, "grad_norm": 0.47966468334198, "learning_rate": 1.7520661157024795e-06, "loss": 0.5779, "step": 106 }, { "epoch": 0.053119311600198574, "grad_norm": 0.37807726860046387, "learning_rate": 1.768595041322314e-06, "loss": 0.5629, "step": 107 }, { "epoch": 0.053615753764686414, "grad_norm": 0.4216509461402893, "learning_rate": 1.7851239669421488e-06, "loss": 0.5719, "step": 108 }, { "epoch": 0.05411219592917425, "grad_norm": 0.43120044469833374, "learning_rate": 1.8016528925619835e-06, "loss": 0.5712, "step": 109 }, { "epoch": 0.054608638093662086, "grad_norm": 0.4126465320587158, "learning_rate": 1.8181818181818183e-06, "loss": 0.5496, "step": 110 }, { "epoch": 0.055105080258149926, "grad_norm": 0.43618887662887573, "learning_rate": 1.8347107438016533e-06, "loss": 0.5587, "step": 111 }, { "epoch": 0.055601522422637766, "grad_norm": 0.4335828721523285, "learning_rate": 1.8512396694214876e-06, "loss": 0.583, "step": 112 }, { "epoch": 0.0560979645871256, "grad_norm": 0.36888957023620605, "learning_rate": 1.8677685950413223e-06, "loss": 0.5195, "step": 113 }, { "epoch": 0.05659440675161344, "grad_norm": 0.3870840072631836, "learning_rate": 1.884297520661157e-06, "loss": 0.5602, "step": 114 }, { "epoch": 0.05709084891610127, "grad_norm": 0.4921371042728424, "learning_rate": 1.900826446280992e-06, "loss": 0.5727, "step": 115 }, { "epoch": 0.05758729108058911, "grad_norm": 0.4245266318321228, "learning_rate": 1.917355371900827e-06, "loss": 0.5511, "step": 116 }, { "epoch": 0.05808373324507695, "grad_norm": 0.4542331099510193, "learning_rate": 1.9338842975206613e-06, "loss": 0.5311, "step": 117 }, { "epoch": 0.05858017540956478, "grad_norm": 0.4490202069282532, "learning_rate": 1.950413223140496e-06, "loss": 0.5512, "step": 118 }, { "epoch": 0.05907661757405262, "grad_norm": 0.39388081431388855, "learning_rate": 1.966942148760331e-06, "loss": 0.5579, "step": 119 }, { "epoch": 0.05957305973854046, "grad_norm": 0.41405603289604187, "learning_rate": 1.9834710743801654e-06, "loss": 0.5619, "step": 120 }, { "epoch": 0.060069501903028295, "grad_norm": 0.42730504274368286, "learning_rate": 2.0000000000000003e-06, "loss": 0.5808, "step": 121 }, { "epoch": 0.060565944067516135, "grad_norm": 0.4391515552997589, "learning_rate": 2.016528925619835e-06, "loss": 0.5644, "step": 122 }, { "epoch": 0.061062386232003975, "grad_norm": 0.41354671120643616, "learning_rate": 2.0330578512396694e-06, "loss": 0.5414, "step": 123 }, { "epoch": 0.06155882839649181, "grad_norm": 0.43981659412384033, "learning_rate": 2.0495867768595044e-06, "loss": 0.5512, "step": 124 }, { "epoch": 0.06205527056097965, "grad_norm": 0.39013755321502686, "learning_rate": 2.066115702479339e-06, "loss": 0.5441, "step": 125 }, { "epoch": 0.06255171272546749, "grad_norm": 0.5506719946861267, "learning_rate": 2.082644628099174e-06, "loss": 0.5712, "step": 126 }, { "epoch": 0.06304815488995533, "grad_norm": 0.4118545949459076, "learning_rate": 2.0991735537190084e-06, "loss": 0.5275, "step": 127 }, { "epoch": 0.06354459705444315, "grad_norm": 0.4093695878982544, "learning_rate": 2.115702479338843e-06, "loss": 0.5266, "step": 128 }, { "epoch": 0.06404103921893099, "grad_norm": 0.436404824256897, "learning_rate": 2.132231404958678e-06, "loss": 0.5471, "step": 129 }, { "epoch": 0.06453748138341883, "grad_norm": 0.4181048572063446, "learning_rate": 2.1487603305785124e-06, "loss": 0.5414, "step": 130 }, { "epoch": 0.06503392354790667, "grad_norm": 0.4052174985408783, "learning_rate": 2.1652892561983474e-06, "loss": 0.5465, "step": 131 }, { "epoch": 0.06553036571239451, "grad_norm": 0.3919404447078705, "learning_rate": 2.181818181818182e-06, "loss": 0.5425, "step": 132 }, { "epoch": 0.06602680787688234, "grad_norm": 0.4083176255226135, "learning_rate": 2.1983471074380165e-06, "loss": 0.5434, "step": 133 }, { "epoch": 0.06652325004137018, "grad_norm": 0.49461936950683594, "learning_rate": 2.2148760330578515e-06, "loss": 0.5638, "step": 134 }, { "epoch": 0.06701969220585802, "grad_norm": 0.3807922601699829, "learning_rate": 2.231404958677686e-06, "loss": 0.5289, "step": 135 }, { "epoch": 0.06751613437034586, "grad_norm": 0.4261229932308197, "learning_rate": 2.247933884297521e-06, "loss": 0.5398, "step": 136 }, { "epoch": 0.0680125765348337, "grad_norm": 0.39319565892219543, "learning_rate": 2.2644628099173555e-06, "loss": 0.5637, "step": 137 }, { "epoch": 0.06850901869932152, "grad_norm": 0.39692559838294983, "learning_rate": 2.28099173553719e-06, "loss": 0.5154, "step": 138 }, { "epoch": 0.06900546086380936, "grad_norm": 0.4601784646511078, "learning_rate": 2.297520661157025e-06, "loss": 0.5389, "step": 139 }, { "epoch": 0.0695019030282972, "grad_norm": 0.3952353298664093, "learning_rate": 2.31404958677686e-06, "loss": 0.5321, "step": 140 }, { "epoch": 0.06999834519278504, "grad_norm": 0.4491402506828308, "learning_rate": 2.3305785123966945e-06, "loss": 0.5325, "step": 141 }, { "epoch": 0.07049478735727288, "grad_norm": 0.4495823383331299, "learning_rate": 2.347107438016529e-06, "loss": 0.5347, "step": 142 }, { "epoch": 0.07099122952176072, "grad_norm": 0.4525030553340912, "learning_rate": 2.363636363636364e-06, "loss": 0.5706, "step": 143 }, { "epoch": 0.07148767168624855, "grad_norm": 0.4462507963180542, "learning_rate": 2.3801652892561985e-06, "loss": 0.5527, "step": 144 }, { "epoch": 0.07198411385073639, "grad_norm": 0.4047246277332306, "learning_rate": 2.3966942148760335e-06, "loss": 0.5476, "step": 145 }, { "epoch": 0.07248055601522423, "grad_norm": 0.5119194984436035, "learning_rate": 2.413223140495868e-06, "loss": 0.5601, "step": 146 }, { "epoch": 0.07297699817971207, "grad_norm": 0.4200640022754669, "learning_rate": 2.4297520661157026e-06, "loss": 0.5466, "step": 147 }, { "epoch": 0.0734734403441999, "grad_norm": 0.4022303819656372, "learning_rate": 2.4462809917355375e-06, "loss": 0.5133, "step": 148 }, { "epoch": 0.07396988250868773, "grad_norm": 0.4304042160511017, "learning_rate": 2.462809917355372e-06, "loss": 0.5474, "step": 149 }, { "epoch": 0.07446632467317557, "grad_norm": 0.47765541076660156, "learning_rate": 2.479338842975207e-06, "loss": 0.5687, "step": 150 }, { "epoch": 0.07496276683766341, "grad_norm": 0.4207160770893097, "learning_rate": 2.4958677685950416e-06, "loss": 0.5644, "step": 151 }, { "epoch": 0.07545920900215125, "grad_norm": 0.4479532539844513, "learning_rate": 2.5123966942148765e-06, "loss": 0.5568, "step": 152 }, { "epoch": 0.07595565116663909, "grad_norm": 0.49835506081581116, "learning_rate": 2.528925619834711e-06, "loss": 0.5376, "step": 153 }, { "epoch": 0.07645209333112693, "grad_norm": 0.4120480716228485, "learning_rate": 2.5454545454545456e-06, "loss": 0.5293, "step": 154 }, { "epoch": 0.07694853549561476, "grad_norm": 0.4124598801136017, "learning_rate": 2.56198347107438e-06, "loss": 0.5495, "step": 155 }, { "epoch": 0.0774449776601026, "grad_norm": 0.38867881894111633, "learning_rate": 2.578512396694215e-06, "loss": 0.5336, "step": 156 }, { "epoch": 0.07794141982459044, "grad_norm": 0.4549954831600189, "learning_rate": 2.5950413223140496e-06, "loss": 0.5611, "step": 157 }, { "epoch": 0.07843786198907828, "grad_norm": 0.4409182071685791, "learning_rate": 2.6115702479338846e-06, "loss": 0.5681, "step": 158 }, { "epoch": 0.07893430415356611, "grad_norm": 0.4545327126979828, "learning_rate": 2.628099173553719e-06, "loss": 0.5473, "step": 159 }, { "epoch": 0.07943074631805394, "grad_norm": 0.47488734126091003, "learning_rate": 2.644628099173554e-06, "loss": 0.5619, "step": 160 }, { "epoch": 0.07992718848254178, "grad_norm": 0.4208771586418152, "learning_rate": 2.6611570247933886e-06, "loss": 0.5381, "step": 161 }, { "epoch": 0.08042363064702962, "grad_norm": 0.4197986125946045, "learning_rate": 2.6776859504132236e-06, "loss": 0.5333, "step": 162 }, { "epoch": 0.08092007281151746, "grad_norm": 0.445722371339798, "learning_rate": 2.694214876033058e-06, "loss": 0.5525, "step": 163 }, { "epoch": 0.0814165149760053, "grad_norm": 0.39569711685180664, "learning_rate": 2.7107438016528927e-06, "loss": 0.5454, "step": 164 }, { "epoch": 0.08191295714049314, "grad_norm": 0.40795499086380005, "learning_rate": 2.7272727272727272e-06, "loss": 0.5542, "step": 165 }, { "epoch": 0.08240939930498097, "grad_norm": 0.445604532957077, "learning_rate": 2.743801652892562e-06, "loss": 0.4988, "step": 166 }, { "epoch": 0.0829058414694688, "grad_norm": 0.39959976077079773, "learning_rate": 2.7603305785123967e-06, "loss": 0.5161, "step": 167 }, { "epoch": 0.08340228363395664, "grad_norm": 0.4388590455055237, "learning_rate": 2.7768595041322317e-06, "loss": 0.5285, "step": 168 }, { "epoch": 0.08389872579844448, "grad_norm": 0.4415607452392578, "learning_rate": 2.7933884297520662e-06, "loss": 0.516, "step": 169 }, { "epoch": 0.08439516796293232, "grad_norm": 0.43430596590042114, "learning_rate": 2.809917355371901e-06, "loss": 0.5494, "step": 170 }, { "epoch": 0.08489161012742015, "grad_norm": 0.45921802520751953, "learning_rate": 2.8264462809917357e-06, "loss": 0.53, "step": 171 }, { "epoch": 0.08538805229190799, "grad_norm": 0.45011910796165466, "learning_rate": 2.8429752066115707e-06, "loss": 0.5184, "step": 172 }, { "epoch": 0.08588449445639583, "grad_norm": 0.4204288423061371, "learning_rate": 2.8595041322314052e-06, "loss": 0.5451, "step": 173 }, { "epoch": 0.08638093662088367, "grad_norm": 0.47453558444976807, "learning_rate": 2.8760330578512398e-06, "loss": 0.5383, "step": 174 }, { "epoch": 0.08687737878537151, "grad_norm": 0.45714735984802246, "learning_rate": 2.8925619834710743e-06, "loss": 0.5256, "step": 175 }, { "epoch": 0.08737382094985933, "grad_norm": 0.466596394777298, "learning_rate": 2.9090909090909093e-06, "loss": 0.5192, "step": 176 }, { "epoch": 0.08787026311434717, "grad_norm": 0.4168410003185272, "learning_rate": 2.925619834710744e-06, "loss": 0.5347, "step": 177 }, { "epoch": 0.08836670527883501, "grad_norm": 0.4217172861099243, "learning_rate": 2.9421487603305788e-06, "loss": 0.5226, "step": 178 }, { "epoch": 0.08886314744332285, "grad_norm": 0.4084828794002533, "learning_rate": 2.9586776859504133e-06, "loss": 0.534, "step": 179 }, { "epoch": 0.0893595896078107, "grad_norm": 0.43430769443511963, "learning_rate": 2.9752066115702483e-06, "loss": 0.5285, "step": 180 }, { "epoch": 0.08985603177229853, "grad_norm": 0.4680427014827728, "learning_rate": 2.9917355371900832e-06, "loss": 0.5323, "step": 181 }, { "epoch": 0.09035247393678636, "grad_norm": 0.42526760697364807, "learning_rate": 3.0082644628099178e-06, "loss": 0.5296, "step": 182 }, { "epoch": 0.0908489161012742, "grad_norm": 0.42928963899612427, "learning_rate": 3.0247933884297527e-06, "loss": 0.5268, "step": 183 }, { "epoch": 0.09134535826576204, "grad_norm": 0.44855114817619324, "learning_rate": 3.041322314049587e-06, "loss": 0.548, "step": 184 }, { "epoch": 0.09184180043024988, "grad_norm": 0.4150737524032593, "learning_rate": 3.0578512396694214e-06, "loss": 0.4941, "step": 185 }, { "epoch": 0.09233824259473772, "grad_norm": 0.42198947072029114, "learning_rate": 3.0743801652892563e-06, "loss": 0.5122, "step": 186 }, { "epoch": 0.09283468475922554, "grad_norm": 0.4320473372936249, "learning_rate": 3.090909090909091e-06, "loss": 0.5165, "step": 187 }, { "epoch": 0.09333112692371338, "grad_norm": 0.4010680019855499, "learning_rate": 3.107438016528926e-06, "loss": 0.509, "step": 188 }, { "epoch": 0.09382756908820122, "grad_norm": 0.4039880335330963, "learning_rate": 3.123966942148761e-06, "loss": 0.49, "step": 189 }, { "epoch": 0.09432401125268906, "grad_norm": 0.42069172859191895, "learning_rate": 3.1404958677685953e-06, "loss": 0.5225, "step": 190 }, { "epoch": 0.0948204534171769, "grad_norm": 0.4326605796813965, "learning_rate": 3.1570247933884303e-06, "loss": 0.5183, "step": 191 }, { "epoch": 0.09531689558166474, "grad_norm": 0.491259902715683, "learning_rate": 3.173553719008265e-06, "loss": 0.5251, "step": 192 }, { "epoch": 0.09581333774615257, "grad_norm": 0.4212777316570282, "learning_rate": 3.1900826446281e-06, "loss": 0.5466, "step": 193 }, { "epoch": 0.09630977991064041, "grad_norm": 0.4416753947734833, "learning_rate": 3.206611570247934e-06, "loss": 0.516, "step": 194 }, { "epoch": 0.09680622207512825, "grad_norm": 0.3960636854171753, "learning_rate": 3.2231404958677685e-06, "loss": 0.4936, "step": 195 }, { "epoch": 0.09730266423961609, "grad_norm": 0.4701974093914032, "learning_rate": 3.2396694214876034e-06, "loss": 0.5243, "step": 196 }, { "epoch": 0.09779910640410393, "grad_norm": 0.45834606885910034, "learning_rate": 3.2561983471074384e-06, "loss": 0.5307, "step": 197 }, { "epoch": 0.09829554856859175, "grad_norm": 0.41191598773002625, "learning_rate": 3.272727272727273e-06, "loss": 0.508, "step": 198 }, { "epoch": 0.0987919907330796, "grad_norm": 0.44801607728004456, "learning_rate": 3.289256198347108e-06, "loss": 0.5183, "step": 199 }, { "epoch": 0.09928843289756743, "grad_norm": 0.4123517870903015, "learning_rate": 3.3057851239669424e-06, "loss": 0.5362, "step": 200 }, { "epoch": 0.09978487506205527, "grad_norm": 0.42854413390159607, "learning_rate": 3.3223140495867774e-06, "loss": 0.5142, "step": 201 }, { "epoch": 0.10028131722654311, "grad_norm": 0.4421244263648987, "learning_rate": 3.338842975206612e-06, "loss": 0.5082, "step": 202 }, { "epoch": 0.10077775939103094, "grad_norm": 0.48917990922927856, "learning_rate": 3.355371900826447e-06, "loss": 0.5401, "step": 203 }, { "epoch": 0.10127420155551878, "grad_norm": 0.3849780857563019, "learning_rate": 3.371900826446281e-06, "loss": 0.5038, "step": 204 }, { "epoch": 0.10177064372000662, "grad_norm": 0.4039342701435089, "learning_rate": 3.388429752066116e-06, "loss": 0.52, "step": 205 }, { "epoch": 0.10226708588449446, "grad_norm": 0.39868512749671936, "learning_rate": 3.4049586776859505e-06, "loss": 0.4937, "step": 206 }, { "epoch": 0.1027635280489823, "grad_norm": 0.41511204838752747, "learning_rate": 3.4214876033057855e-06, "loss": 0.5019, "step": 207 }, { "epoch": 0.10325997021347014, "grad_norm": 0.4304979741573334, "learning_rate": 3.43801652892562e-06, "loss": 0.4951, "step": 208 }, { "epoch": 0.10375641237795796, "grad_norm": 0.5028613805770874, "learning_rate": 3.454545454545455e-06, "loss": 0.5457, "step": 209 }, { "epoch": 0.1042528545424458, "grad_norm": 0.42927849292755127, "learning_rate": 3.4710743801652895e-06, "loss": 0.5103, "step": 210 }, { "epoch": 0.10474929670693364, "grad_norm": 0.516677975654602, "learning_rate": 3.4876033057851245e-06, "loss": 0.5253, "step": 211 }, { "epoch": 0.10524573887142148, "grad_norm": 0.4295234978199005, "learning_rate": 3.504132231404959e-06, "loss": 0.4954, "step": 212 }, { "epoch": 0.10574218103590932, "grad_norm": 0.5253714919090271, "learning_rate": 3.520661157024794e-06, "loss": 0.5297, "step": 213 }, { "epoch": 0.10623862320039715, "grad_norm": 0.47706469893455505, "learning_rate": 3.537190082644628e-06, "loss": 0.5283, "step": 214 }, { "epoch": 0.10673506536488499, "grad_norm": 0.41907650232315063, "learning_rate": 3.553719008264463e-06, "loss": 0.5004, "step": 215 }, { "epoch": 0.10723150752937283, "grad_norm": 0.5059155821800232, "learning_rate": 3.5702479338842976e-06, "loss": 0.5231, "step": 216 }, { "epoch": 0.10772794969386067, "grad_norm": 0.48403531312942505, "learning_rate": 3.5867768595041325e-06, "loss": 0.5329, "step": 217 }, { "epoch": 0.1082243918583485, "grad_norm": 0.40404579043388367, "learning_rate": 3.603305785123967e-06, "loss": 0.5242, "step": 218 }, { "epoch": 0.10872083402283635, "grad_norm": 0.5540491938591003, "learning_rate": 3.619834710743802e-06, "loss": 0.5071, "step": 219 }, { "epoch": 0.10921727618732417, "grad_norm": 0.4164169728755951, "learning_rate": 3.6363636363636366e-06, "loss": 0.4914, "step": 220 }, { "epoch": 0.10971371835181201, "grad_norm": 0.4793415069580078, "learning_rate": 3.6528925619834715e-06, "loss": 0.5655, "step": 221 }, { "epoch": 0.11021016051629985, "grad_norm": 0.6084384918212891, "learning_rate": 3.6694214876033065e-06, "loss": 0.5263, "step": 222 }, { "epoch": 0.11070660268078769, "grad_norm": 0.47772252559661865, "learning_rate": 3.685950413223141e-06, "loss": 0.4993, "step": 223 }, { "epoch": 0.11120304484527553, "grad_norm": 0.45164763927459717, "learning_rate": 3.702479338842975e-06, "loss": 0.5479, "step": 224 }, { "epoch": 0.11169948700976336, "grad_norm": 0.41422829031944275, "learning_rate": 3.71900826446281e-06, "loss": 0.4887, "step": 225 }, { "epoch": 0.1121959291742512, "grad_norm": 0.49492308497428894, "learning_rate": 3.7355371900826447e-06, "loss": 0.517, "step": 226 }, { "epoch": 0.11269237133873904, "grad_norm": 0.43511784076690674, "learning_rate": 3.7520661157024796e-06, "loss": 0.5153, "step": 227 }, { "epoch": 0.11318881350322688, "grad_norm": 0.433106929063797, "learning_rate": 3.768595041322314e-06, "loss": 0.5369, "step": 228 }, { "epoch": 0.11368525566771472, "grad_norm": 0.45887044072151184, "learning_rate": 3.785123966942149e-06, "loss": 0.5264, "step": 229 }, { "epoch": 0.11418169783220254, "grad_norm": 0.4132455885410309, "learning_rate": 3.801652892561984e-06, "loss": 0.4706, "step": 230 }, { "epoch": 0.11467813999669038, "grad_norm": 0.4735982418060303, "learning_rate": 3.818181818181819e-06, "loss": 0.5302, "step": 231 }, { "epoch": 0.11517458216117822, "grad_norm": 0.41978907585144043, "learning_rate": 3.834710743801654e-06, "loss": 0.4976, "step": 232 }, { "epoch": 0.11567102432566606, "grad_norm": 0.4424271583557129, "learning_rate": 3.851239669421488e-06, "loss": 0.5238, "step": 233 }, { "epoch": 0.1161674664901539, "grad_norm": 0.4278840720653534, "learning_rate": 3.867768595041323e-06, "loss": 0.5309, "step": 234 }, { "epoch": 0.11666390865464174, "grad_norm": 0.4424324333667755, "learning_rate": 3.884297520661157e-06, "loss": 0.5152, "step": 235 }, { "epoch": 0.11716035081912957, "grad_norm": 0.4276709258556366, "learning_rate": 3.900826446280992e-06, "loss": 0.5317, "step": 236 }, { "epoch": 0.1176567929836174, "grad_norm": 0.4686425030231476, "learning_rate": 3.917355371900827e-06, "loss": 0.5055, "step": 237 }, { "epoch": 0.11815323514810525, "grad_norm": 0.5172693729400635, "learning_rate": 3.933884297520662e-06, "loss": 0.5192, "step": 238 }, { "epoch": 0.11864967731259309, "grad_norm": 0.4274580478668213, "learning_rate": 3.950413223140496e-06, "loss": 0.4912, "step": 239 }, { "epoch": 0.11914611947708093, "grad_norm": 0.53415447473526, "learning_rate": 3.966942148760331e-06, "loss": 0.4859, "step": 240 }, { "epoch": 0.11964256164156875, "grad_norm": 0.4708065986633301, "learning_rate": 3.983471074380166e-06, "loss": 0.5128, "step": 241 }, { "epoch": 0.12013900380605659, "grad_norm": 0.40249568223953247, "learning_rate": 4.000000000000001e-06, "loss": 0.5175, "step": 242 }, { "epoch": 0.12063544597054443, "grad_norm": 0.4640263617038727, "learning_rate": 4.016528925619834e-06, "loss": 0.5126, "step": 243 }, { "epoch": 0.12113188813503227, "grad_norm": 0.43821629881858826, "learning_rate": 4.03305785123967e-06, "loss": 0.4934, "step": 244 }, { "epoch": 0.12162833029952011, "grad_norm": 0.4355541169643402, "learning_rate": 4.049586776859504e-06, "loss": 0.4755, "step": 245 }, { "epoch": 0.12212477246400795, "grad_norm": 0.42904800176620483, "learning_rate": 4.066115702479339e-06, "loss": 0.4976, "step": 246 }, { "epoch": 0.12262121462849578, "grad_norm": 0.45794782042503357, "learning_rate": 4.082644628099174e-06, "loss": 0.5105, "step": 247 }, { "epoch": 0.12311765679298362, "grad_norm": 0.4353063404560089, "learning_rate": 4.099173553719009e-06, "loss": 0.4915, "step": 248 }, { "epoch": 0.12361409895747145, "grad_norm": 0.5163282155990601, "learning_rate": 4.115702479338843e-06, "loss": 0.5209, "step": 249 }, { "epoch": 0.1241105411219593, "grad_norm": 0.540636420249939, "learning_rate": 4.132231404958678e-06, "loss": 0.5522, "step": 250 }, { "epoch": 0.12460698328644713, "grad_norm": 0.4323466420173645, "learning_rate": 4.148760330578513e-06, "loss": 0.4943, "step": 251 }, { "epoch": 0.12510342545093497, "grad_norm": 0.46424269676208496, "learning_rate": 4.165289256198348e-06, "loss": 0.5089, "step": 252 }, { "epoch": 0.1255998676154228, "grad_norm": 0.4883274435997009, "learning_rate": 4.181818181818182e-06, "loss": 0.478, "step": 253 }, { "epoch": 0.12609630977991065, "grad_norm": 0.4855574071407318, "learning_rate": 4.198347107438017e-06, "loss": 0.5122, "step": 254 }, { "epoch": 0.12659275194439848, "grad_norm": 0.43058347702026367, "learning_rate": 4.214876033057851e-06, "loss": 0.5315, "step": 255 }, { "epoch": 0.1270891941088863, "grad_norm": 0.5215668678283691, "learning_rate": 4.231404958677686e-06, "loss": 0.5132, "step": 256 }, { "epoch": 0.12758563627337416, "grad_norm": 0.4685426950454712, "learning_rate": 4.247933884297521e-06, "loss": 0.5322, "step": 257 }, { "epoch": 0.12808207843786198, "grad_norm": 0.445178359746933, "learning_rate": 4.264462809917356e-06, "loss": 0.5147, "step": 258 }, { "epoch": 0.12857852060234984, "grad_norm": 0.418763667345047, "learning_rate": 4.28099173553719e-06, "loss": 0.5006, "step": 259 }, { "epoch": 0.12907496276683766, "grad_norm": 0.42760413885116577, "learning_rate": 4.297520661157025e-06, "loss": 0.4548, "step": 260 }, { "epoch": 0.1295714049313255, "grad_norm": 0.47623905539512634, "learning_rate": 4.31404958677686e-06, "loss": 0.5059, "step": 261 }, { "epoch": 0.13006784709581334, "grad_norm": 0.49740535020828247, "learning_rate": 4.330578512396695e-06, "loss": 0.521, "step": 262 }, { "epoch": 0.13056428926030117, "grad_norm": 0.46235817670822144, "learning_rate": 4.347107438016529e-06, "loss": 0.4794, "step": 263 }, { "epoch": 0.13106073142478902, "grad_norm": 0.4386070966720581, "learning_rate": 4.363636363636364e-06, "loss": 0.4941, "step": 264 }, { "epoch": 0.13155717358927685, "grad_norm": 0.46288546919822693, "learning_rate": 4.3801652892561984e-06, "loss": 0.5021, "step": 265 }, { "epoch": 0.13205361575376467, "grad_norm": 0.5500044226646423, "learning_rate": 4.396694214876033e-06, "loss": 0.4981, "step": 266 }, { "epoch": 0.13255005791825253, "grad_norm": 0.4679776132106781, "learning_rate": 4.413223140495868e-06, "loss": 0.5188, "step": 267 }, { "epoch": 0.13304650008274035, "grad_norm": 0.44262880086898804, "learning_rate": 4.429752066115703e-06, "loss": 0.4941, "step": 268 }, { "epoch": 0.1335429422472282, "grad_norm": 0.5109505653381348, "learning_rate": 4.4462809917355374e-06, "loss": 0.5106, "step": 269 }, { "epoch": 0.13403938441171603, "grad_norm": 0.4763473570346832, "learning_rate": 4.462809917355372e-06, "loss": 0.5252, "step": 270 }, { "epoch": 0.13453582657620386, "grad_norm": 0.4389742612838745, "learning_rate": 4.479338842975207e-06, "loss": 0.4843, "step": 271 }, { "epoch": 0.1350322687406917, "grad_norm": 0.41112667322158813, "learning_rate": 4.495867768595042e-06, "loss": 0.4987, "step": 272 }, { "epoch": 0.13552871090517954, "grad_norm": 0.42999914288520813, "learning_rate": 4.5123966942148764e-06, "loss": 0.4804, "step": 273 }, { "epoch": 0.1360251530696674, "grad_norm": 0.42137667536735535, "learning_rate": 4.528925619834711e-06, "loss": 0.5123, "step": 274 }, { "epoch": 0.13652159523415522, "grad_norm": 0.45484447479248047, "learning_rate": 4.5454545454545455e-06, "loss": 0.5129, "step": 275 }, { "epoch": 0.13701803739864304, "grad_norm": 0.5030915141105652, "learning_rate": 4.56198347107438e-06, "loss": 0.4857, "step": 276 }, { "epoch": 0.1375144795631309, "grad_norm": 0.4345345199108124, "learning_rate": 4.5785123966942154e-06, "loss": 0.5302, "step": 277 }, { "epoch": 0.13801092172761872, "grad_norm": 0.504103422164917, "learning_rate": 4.59504132231405e-06, "loss": 0.51, "step": 278 }, { "epoch": 0.13850736389210658, "grad_norm": 0.45938870310783386, "learning_rate": 4.6115702479338845e-06, "loss": 0.5036, "step": 279 }, { "epoch": 0.1390038060565944, "grad_norm": 0.40972739458084106, "learning_rate": 4.62809917355372e-06, "loss": 0.4991, "step": 280 }, { "epoch": 0.13950024822108226, "grad_norm": 0.48454973101615906, "learning_rate": 4.6446280991735544e-06, "loss": 0.4946, "step": 281 }, { "epoch": 0.13999669038557008, "grad_norm": 0.44292736053466797, "learning_rate": 4.661157024793389e-06, "loss": 0.4967, "step": 282 }, { "epoch": 0.1404931325500579, "grad_norm": 0.4697224497795105, "learning_rate": 4.6776859504132235e-06, "loss": 0.5297, "step": 283 }, { "epoch": 0.14098957471454576, "grad_norm": 0.4456779956817627, "learning_rate": 4.694214876033058e-06, "loss": 0.5075, "step": 284 }, { "epoch": 0.1414860168790336, "grad_norm": 0.45146000385284424, "learning_rate": 4.710743801652893e-06, "loss": 0.5129, "step": 285 }, { "epoch": 0.14198245904352144, "grad_norm": 0.45760053396224976, "learning_rate": 4.727272727272728e-06, "loss": 0.4782, "step": 286 }, { "epoch": 0.14247890120800927, "grad_norm": 0.44641372561454773, "learning_rate": 4.7438016528925625e-06, "loss": 0.5073, "step": 287 }, { "epoch": 0.1429753433724971, "grad_norm": 0.42665234208106995, "learning_rate": 4.760330578512397e-06, "loss": 0.4969, "step": 288 }, { "epoch": 0.14347178553698495, "grad_norm": 0.458766371011734, "learning_rate": 4.776859504132232e-06, "loss": 0.4933, "step": 289 }, { "epoch": 0.14396822770147277, "grad_norm": 0.39108675718307495, "learning_rate": 4.793388429752067e-06, "loss": 0.5095, "step": 290 }, { "epoch": 0.14446466986596063, "grad_norm": 0.4508987069129944, "learning_rate": 4.8099173553719015e-06, "loss": 0.5058, "step": 291 }, { "epoch": 0.14496111203044845, "grad_norm": 0.42819514870643616, "learning_rate": 4.826446280991736e-06, "loss": 0.481, "step": 292 }, { "epoch": 0.14545755419493628, "grad_norm": 0.4528975784778595, "learning_rate": 4.842975206611571e-06, "loss": 0.4983, "step": 293 }, { "epoch": 0.14595399635942413, "grad_norm": 0.430054247379303, "learning_rate": 4.859504132231405e-06, "loss": 0.5008, "step": 294 }, { "epoch": 0.14645043852391196, "grad_norm": 0.43812480568885803, "learning_rate": 4.87603305785124e-06, "loss": 0.4725, "step": 295 }, { "epoch": 0.1469468806883998, "grad_norm": 0.39980730414390564, "learning_rate": 4.892561983471075e-06, "loss": 0.5071, "step": 296 }, { "epoch": 0.14744332285288764, "grad_norm": 0.512586236000061, "learning_rate": 4.90909090909091e-06, "loss": 0.497, "step": 297 }, { "epoch": 0.14793976501737546, "grad_norm": 0.4782851040363312, "learning_rate": 4.925619834710744e-06, "loss": 0.5006, "step": 298 }, { "epoch": 0.14843620718186332, "grad_norm": 0.4393535554409027, "learning_rate": 4.942148760330579e-06, "loss": 0.5173, "step": 299 }, { "epoch": 0.14893264934635114, "grad_norm": 0.45083752274513245, "learning_rate": 4.958677685950414e-06, "loss": 0.497, "step": 300 }, { "epoch": 0.149429091510839, "grad_norm": 0.45653530955314636, "learning_rate": 4.975206611570249e-06, "loss": 0.5175, "step": 301 }, { "epoch": 0.14992553367532682, "grad_norm": 0.43480002880096436, "learning_rate": 4.991735537190083e-06, "loss": 0.4989, "step": 302 }, { "epoch": 0.15042197583981465, "grad_norm": 0.41854220628738403, "learning_rate": 5.008264462809918e-06, "loss": 0.5055, "step": 303 }, { "epoch": 0.1509184180043025, "grad_norm": 0.4131919741630554, "learning_rate": 5.024793388429753e-06, "loss": 0.5218, "step": 304 }, { "epoch": 0.15141486016879033, "grad_norm": 0.4328848719596863, "learning_rate": 5.041322314049587e-06, "loss": 0.5097, "step": 305 }, { "epoch": 0.15191130233327818, "grad_norm": 0.4288451671600342, "learning_rate": 5.057851239669422e-06, "loss": 0.4787, "step": 306 }, { "epoch": 0.152407744497766, "grad_norm": 0.41574257612228394, "learning_rate": 5.074380165289257e-06, "loss": 0.5262, "step": 307 }, { "epoch": 0.15290418666225386, "grad_norm": 0.46898153424263, "learning_rate": 5.090909090909091e-06, "loss": 0.4925, "step": 308 }, { "epoch": 0.1534006288267417, "grad_norm": 0.4394037425518036, "learning_rate": 5.107438016528926e-06, "loss": 0.486, "step": 309 }, { "epoch": 0.1538970709912295, "grad_norm": 0.4655103385448456, "learning_rate": 5.12396694214876e-06, "loss": 0.4975, "step": 310 }, { "epoch": 0.15439351315571737, "grad_norm": 0.48275241255760193, "learning_rate": 5.140495867768596e-06, "loss": 0.5214, "step": 311 }, { "epoch": 0.1548899553202052, "grad_norm": 0.5412445068359375, "learning_rate": 5.15702479338843e-06, "loss": 0.5176, "step": 312 }, { "epoch": 0.15538639748469305, "grad_norm": 0.4623423218727112, "learning_rate": 5.173553719008266e-06, "loss": 0.5118, "step": 313 }, { "epoch": 0.15588283964918087, "grad_norm": 0.40195977687835693, "learning_rate": 5.190082644628099e-06, "loss": 0.4703, "step": 314 }, { "epoch": 0.1563792818136687, "grad_norm": 0.5132428407669067, "learning_rate": 5.206611570247935e-06, "loss": 0.4771, "step": 315 }, { "epoch": 0.15687572397815655, "grad_norm": 0.46620309352874756, "learning_rate": 5.223140495867769e-06, "loss": 0.4786, "step": 316 }, { "epoch": 0.15737216614264438, "grad_norm": 0.4137311577796936, "learning_rate": 5.239669421487605e-06, "loss": 0.4731, "step": 317 }, { "epoch": 0.15786860830713223, "grad_norm": 0.5077406167984009, "learning_rate": 5.256198347107438e-06, "loss": 0.5018, "step": 318 }, { "epoch": 0.15836505047162006, "grad_norm": 0.431933730840683, "learning_rate": 5.272727272727273e-06, "loss": 0.4613, "step": 319 }, { "epoch": 0.15886149263610788, "grad_norm": 0.4351625442504883, "learning_rate": 5.289256198347108e-06, "loss": 0.4888, "step": 320 }, { "epoch": 0.15935793480059574, "grad_norm": 0.4152326285839081, "learning_rate": 5.305785123966942e-06, "loss": 0.4903, "step": 321 }, { "epoch": 0.15985437696508356, "grad_norm": 0.42277032136917114, "learning_rate": 5.322314049586777e-06, "loss": 0.4971, "step": 322 }, { "epoch": 0.16035081912957141, "grad_norm": 0.4607337713241577, "learning_rate": 5.338842975206612e-06, "loss": 0.499, "step": 323 }, { "epoch": 0.16084726129405924, "grad_norm": 0.42749732732772827, "learning_rate": 5.355371900826447e-06, "loss": 0.4819, "step": 324 }, { "epoch": 0.16134370345854707, "grad_norm": 0.4331699013710022, "learning_rate": 5.371900826446281e-06, "loss": 0.4736, "step": 325 }, { "epoch": 0.16184014562303492, "grad_norm": 0.4189608097076416, "learning_rate": 5.388429752066116e-06, "loss": 0.5142, "step": 326 }, { "epoch": 0.16233658778752275, "grad_norm": 0.516372799873352, "learning_rate": 5.404958677685951e-06, "loss": 0.4939, "step": 327 }, { "epoch": 0.1628330299520106, "grad_norm": 0.47925591468811035, "learning_rate": 5.421487603305785e-06, "loss": 0.5093, "step": 328 }, { "epoch": 0.16332947211649843, "grad_norm": 0.4464106261730194, "learning_rate": 5.438016528925621e-06, "loss": 0.5059, "step": 329 }, { "epoch": 0.16382591428098628, "grad_norm": 0.5140085220336914, "learning_rate": 5.4545454545454545e-06, "loss": 0.4965, "step": 330 }, { "epoch": 0.1643223564454741, "grad_norm": 0.49028441309928894, "learning_rate": 5.47107438016529e-06, "loss": 0.4819, "step": 331 }, { "epoch": 0.16481879860996193, "grad_norm": 0.5272459983825684, "learning_rate": 5.487603305785124e-06, "loss": 0.4925, "step": 332 }, { "epoch": 0.16531524077444978, "grad_norm": 0.4355497658252716, "learning_rate": 5.50413223140496e-06, "loss": 0.4934, "step": 333 }, { "epoch": 0.1658116829389376, "grad_norm": 0.4769214391708374, "learning_rate": 5.5206611570247935e-06, "loss": 0.504, "step": 334 }, { "epoch": 0.16630812510342546, "grad_norm": 0.5962668657302856, "learning_rate": 5.537190082644629e-06, "loss": 0.5135, "step": 335 }, { "epoch": 0.1668045672679133, "grad_norm": 0.5365744829177856, "learning_rate": 5.553719008264463e-06, "loss": 0.4653, "step": 336 }, { "epoch": 0.16730100943240112, "grad_norm": 0.4301178455352783, "learning_rate": 5.570247933884299e-06, "loss": 0.4832, "step": 337 }, { "epoch": 0.16779745159688897, "grad_norm": 0.5364788174629211, "learning_rate": 5.5867768595041325e-06, "loss": 0.5107, "step": 338 }, { "epoch": 0.1682938937613768, "grad_norm": 0.5546157956123352, "learning_rate": 5.603305785123967e-06, "loss": 0.4772, "step": 339 }, { "epoch": 0.16879033592586465, "grad_norm": 0.4280966818332672, "learning_rate": 5.619834710743802e-06, "loss": 0.4913, "step": 340 }, { "epoch": 0.16928677809035247, "grad_norm": 0.48190242052078247, "learning_rate": 5.636363636363636e-06, "loss": 0.4701, "step": 341 }, { "epoch": 0.1697832202548403, "grad_norm": 0.6100616455078125, "learning_rate": 5.6528925619834715e-06, "loss": 0.5105, "step": 342 }, { "epoch": 0.17027966241932815, "grad_norm": 0.47204649448394775, "learning_rate": 5.669421487603306e-06, "loss": 0.4928, "step": 343 }, { "epoch": 0.17077610458381598, "grad_norm": 0.4829736649990082, "learning_rate": 5.685950413223141e-06, "loss": 0.4886, "step": 344 }, { "epoch": 0.17127254674830383, "grad_norm": 0.5550112128257751, "learning_rate": 5.702479338842976e-06, "loss": 0.5113, "step": 345 }, { "epoch": 0.17176898891279166, "grad_norm": 0.5019325613975525, "learning_rate": 5.7190082644628105e-06, "loss": 0.4718, "step": 346 }, { "epoch": 0.17226543107727949, "grad_norm": 0.4609982669353485, "learning_rate": 5.735537190082645e-06, "loss": 0.5123, "step": 347 }, { "epoch": 0.17276187324176734, "grad_norm": 0.42521995306015015, "learning_rate": 5.7520661157024795e-06, "loss": 0.4838, "step": 348 }, { "epoch": 0.17325831540625516, "grad_norm": 0.44526734948158264, "learning_rate": 5.768595041322315e-06, "loss": 0.4625, "step": 349 }, { "epoch": 0.17375475757074302, "grad_norm": 0.4344416558742523, "learning_rate": 5.785123966942149e-06, "loss": 0.4851, "step": 350 }, { "epoch": 0.17425119973523084, "grad_norm": 0.4648102819919586, "learning_rate": 5.801652892561984e-06, "loss": 0.4852, "step": 351 }, { "epoch": 0.17474764189971867, "grad_norm": 0.4053942561149597, "learning_rate": 5.8181818181818185e-06, "loss": 0.5001, "step": 352 }, { "epoch": 0.17524408406420652, "grad_norm": 0.4345317780971527, "learning_rate": 5.834710743801654e-06, "loss": 0.5036, "step": 353 }, { "epoch": 0.17574052622869435, "grad_norm": 0.47739464044570923, "learning_rate": 5.851239669421488e-06, "loss": 0.4797, "step": 354 }, { "epoch": 0.1762369683931822, "grad_norm": 0.4071372449398041, "learning_rate": 5.867768595041323e-06, "loss": 0.536, "step": 355 }, { "epoch": 0.17673341055767003, "grad_norm": 0.4067287743091583, "learning_rate": 5.8842975206611575e-06, "loss": 0.5233, "step": 356 }, { "epoch": 0.17722985272215788, "grad_norm": 0.39644941687583923, "learning_rate": 5.900826446280993e-06, "loss": 0.4887, "step": 357 }, { "epoch": 0.1777262948866457, "grad_norm": 0.408863365650177, "learning_rate": 5.917355371900827e-06, "loss": 0.4821, "step": 358 }, { "epoch": 0.17822273705113353, "grad_norm": 0.41061681509017944, "learning_rate": 5.933884297520661e-06, "loss": 0.5068, "step": 359 }, { "epoch": 0.1787191792156214, "grad_norm": 0.3956266939640045, "learning_rate": 5.9504132231404965e-06, "loss": 0.5153, "step": 360 }, { "epoch": 0.1792156213801092, "grad_norm": 0.4016287624835968, "learning_rate": 5.966942148760331e-06, "loss": 0.476, "step": 361 }, { "epoch": 0.17971206354459707, "grad_norm": 0.4079864025115967, "learning_rate": 5.9834710743801665e-06, "loss": 0.4929, "step": 362 }, { "epoch": 0.1802085057090849, "grad_norm": 0.39284032583236694, "learning_rate": 6e-06, "loss": 0.4809, "step": 363 }, { "epoch": 0.18070494787357272, "grad_norm": 0.4458436071872711, "learning_rate": 6.0165289256198355e-06, "loss": 0.4804, "step": 364 }, { "epoch": 0.18120139003806057, "grad_norm": 0.44298434257507324, "learning_rate": 6.03305785123967e-06, "loss": 0.5015, "step": 365 }, { "epoch": 0.1816978322025484, "grad_norm": 0.4829196631908417, "learning_rate": 6.0495867768595055e-06, "loss": 0.4885, "step": 366 }, { "epoch": 0.18219427436703625, "grad_norm": 0.44937562942504883, "learning_rate": 6.066115702479339e-06, "loss": 0.4811, "step": 367 }, { "epoch": 0.18269071653152408, "grad_norm": 0.4311468005180359, "learning_rate": 6.082644628099174e-06, "loss": 0.4846, "step": 368 }, { "epoch": 0.1831871586960119, "grad_norm": 0.49652591347694397, "learning_rate": 6.099173553719009e-06, "loss": 0.4907, "step": 369 }, { "epoch": 0.18368360086049976, "grad_norm": 0.4506331980228424, "learning_rate": 6.115702479338843e-06, "loss": 0.5033, "step": 370 }, { "epoch": 0.18418004302498758, "grad_norm": 0.4681533873081207, "learning_rate": 6.132231404958678e-06, "loss": 0.4747, "step": 371 }, { "epoch": 0.18467648518947544, "grad_norm": 0.4863337576389313, "learning_rate": 6.148760330578513e-06, "loss": 0.5102, "step": 372 }, { "epoch": 0.18517292735396326, "grad_norm": 0.48086097836494446, "learning_rate": 6.165289256198348e-06, "loss": 0.4656, "step": 373 }, { "epoch": 0.1856693695184511, "grad_norm": 0.4399857819080353, "learning_rate": 6.181818181818182e-06, "loss": 0.4737, "step": 374 }, { "epoch": 0.18616581168293894, "grad_norm": 0.46418774127960205, "learning_rate": 6.198347107438017e-06, "loss": 0.4831, "step": 375 }, { "epoch": 0.18666225384742677, "grad_norm": 0.46316665410995483, "learning_rate": 6.214876033057852e-06, "loss": 0.5068, "step": 376 }, { "epoch": 0.18715869601191462, "grad_norm": 0.4405292272567749, "learning_rate": 6.231404958677686e-06, "loss": 0.4942, "step": 377 }, { "epoch": 0.18765513817640245, "grad_norm": 0.4127674996852875, "learning_rate": 6.247933884297522e-06, "loss": 0.4757, "step": 378 }, { "epoch": 0.18815158034089027, "grad_norm": 0.5264269113540649, "learning_rate": 6.264462809917355e-06, "loss": 0.5104, "step": 379 }, { "epoch": 0.18864802250537813, "grad_norm": 0.4527742564678192, "learning_rate": 6.280991735537191e-06, "loss": 0.4829, "step": 380 }, { "epoch": 0.18914446466986595, "grad_norm": 0.5366259813308716, "learning_rate": 6.297520661157025e-06, "loss": 0.4572, "step": 381 }, { "epoch": 0.1896409068343538, "grad_norm": 0.529551088809967, "learning_rate": 6.314049586776861e-06, "loss": 0.5132, "step": 382 }, { "epoch": 0.19013734899884163, "grad_norm": 0.4312463104724884, "learning_rate": 6.330578512396694e-06, "loss": 0.4706, "step": 383 }, { "epoch": 0.19063379116332949, "grad_norm": 0.4677755534648895, "learning_rate": 6.34710743801653e-06, "loss": 0.466, "step": 384 }, { "epoch": 0.1911302333278173, "grad_norm": 0.5087398886680603, "learning_rate": 6.363636363636364e-06, "loss": 0.4779, "step": 385 }, { "epoch": 0.19162667549230514, "grad_norm": 0.44555503129959106, "learning_rate": 6.3801652892562e-06, "loss": 0.4855, "step": 386 }, { "epoch": 0.192123117656793, "grad_norm": 0.46300268173217773, "learning_rate": 6.396694214876033e-06, "loss": 0.4958, "step": 387 }, { "epoch": 0.19261955982128082, "grad_norm": 0.39818844199180603, "learning_rate": 6.413223140495868e-06, "loss": 0.4937, "step": 388 }, { "epoch": 0.19311600198576867, "grad_norm": 0.44149044156074524, "learning_rate": 6.429752066115703e-06, "loss": 0.4722, "step": 389 }, { "epoch": 0.1936124441502565, "grad_norm": 0.4207805395126343, "learning_rate": 6.446280991735537e-06, "loss": 0.469, "step": 390 }, { "epoch": 0.19410888631474432, "grad_norm": 0.39153826236724854, "learning_rate": 6.462809917355372e-06, "loss": 0.4794, "step": 391 }, { "epoch": 0.19460532847923218, "grad_norm": 0.4542931020259857, "learning_rate": 6.479338842975207e-06, "loss": 0.504, "step": 392 }, { "epoch": 0.19510177064372, "grad_norm": 0.4424765706062317, "learning_rate": 6.495867768595042e-06, "loss": 0.4917, "step": 393 }, { "epoch": 0.19559821280820786, "grad_norm": 0.43294790387153625, "learning_rate": 6.512396694214877e-06, "loss": 0.4988, "step": 394 }, { "epoch": 0.19609465497269568, "grad_norm": 0.44429928064346313, "learning_rate": 6.528925619834712e-06, "loss": 0.4825, "step": 395 }, { "epoch": 0.1965910971371835, "grad_norm": 0.4204387664794922, "learning_rate": 6.545454545454546e-06, "loss": 0.5072, "step": 396 }, { "epoch": 0.19708753930167136, "grad_norm": 0.38960006833076477, "learning_rate": 6.56198347107438e-06, "loss": 0.494, "step": 397 }, { "epoch": 0.1975839814661592, "grad_norm": 0.4361496567726135, "learning_rate": 6.578512396694216e-06, "loss": 0.4924, "step": 398 }, { "epoch": 0.19808042363064704, "grad_norm": 0.44507384300231934, "learning_rate": 6.5950413223140495e-06, "loss": 0.4839, "step": 399 }, { "epoch": 0.19857686579513487, "grad_norm": 0.4427904486656189, "learning_rate": 6.611570247933885e-06, "loss": 0.488, "step": 400 }, { "epoch": 0.1990733079596227, "grad_norm": 0.45119184255599976, "learning_rate": 6.628099173553719e-06, "loss": 0.471, "step": 401 }, { "epoch": 0.19956975012411055, "grad_norm": 0.4546463191509247, "learning_rate": 6.644628099173555e-06, "loss": 0.469, "step": 402 }, { "epoch": 0.20006619228859837, "grad_norm": 0.38960105180740356, "learning_rate": 6.6611570247933885e-06, "loss": 0.4879, "step": 403 }, { "epoch": 0.20056263445308622, "grad_norm": 0.42599162459373474, "learning_rate": 6.677685950413224e-06, "loss": 0.496, "step": 404 }, { "epoch": 0.20105907661757405, "grad_norm": 0.4420008361339569, "learning_rate": 6.694214876033058e-06, "loss": 0.4822, "step": 405 }, { "epoch": 0.20155551878206188, "grad_norm": 0.3967284560203552, "learning_rate": 6.710743801652894e-06, "loss": 0.4746, "step": 406 }, { "epoch": 0.20205196094654973, "grad_norm": 0.43280869722366333, "learning_rate": 6.7272727272727275e-06, "loss": 0.4877, "step": 407 }, { "epoch": 0.20254840311103756, "grad_norm": 0.43010562658309937, "learning_rate": 6.743801652892562e-06, "loss": 0.5003, "step": 408 }, { "epoch": 0.2030448452755254, "grad_norm": 0.4346367418766022, "learning_rate": 6.760330578512397e-06, "loss": 0.5053, "step": 409 }, { "epoch": 0.20354128744001324, "grad_norm": 0.4739544987678528, "learning_rate": 6.776859504132232e-06, "loss": 0.4846, "step": 410 }, { "epoch": 0.2040377296045011, "grad_norm": 0.49574145674705505, "learning_rate": 6.793388429752067e-06, "loss": 0.5229, "step": 411 }, { "epoch": 0.20453417176898891, "grad_norm": 0.44652432203292847, "learning_rate": 6.809917355371901e-06, "loss": 0.4943, "step": 412 }, { "epoch": 0.20503061393347674, "grad_norm": 0.45023566484451294, "learning_rate": 6.826446280991736e-06, "loss": 0.451, "step": 413 }, { "epoch": 0.2055270560979646, "grad_norm": 0.5838633179664612, "learning_rate": 6.842975206611571e-06, "loss": 0.502, "step": 414 }, { "epoch": 0.20602349826245242, "grad_norm": 0.4622734487056732, "learning_rate": 6.859504132231406e-06, "loss": 0.4843, "step": 415 }, { "epoch": 0.20651994042694027, "grad_norm": 0.4720219671726227, "learning_rate": 6.87603305785124e-06, "loss": 0.4479, "step": 416 }, { "epoch": 0.2070163825914281, "grad_norm": 0.5015128254890442, "learning_rate": 6.8925619834710745e-06, "loss": 0.4917, "step": 417 }, { "epoch": 0.20751282475591593, "grad_norm": 0.486129492521286, "learning_rate": 6.90909090909091e-06, "loss": 0.4606, "step": 418 }, { "epoch": 0.20800926692040378, "grad_norm": 0.44019004702568054, "learning_rate": 6.925619834710744e-06, "loss": 0.4618, "step": 419 }, { "epoch": 0.2085057090848916, "grad_norm": 0.5136095285415649, "learning_rate": 6.942148760330579e-06, "loss": 0.4736, "step": 420 }, { "epoch": 0.20900215124937946, "grad_norm": 0.4492869973182678, "learning_rate": 6.9586776859504135e-06, "loss": 0.4861, "step": 421 }, { "epoch": 0.20949859341386728, "grad_norm": 0.4770965874195099, "learning_rate": 6.975206611570249e-06, "loss": 0.4938, "step": 422 }, { "epoch": 0.2099950355783551, "grad_norm": 0.5074143409729004, "learning_rate": 6.991735537190083e-06, "loss": 0.4713, "step": 423 }, { "epoch": 0.21049147774284296, "grad_norm": 0.4692121744155884, "learning_rate": 7.008264462809918e-06, "loss": 0.5038, "step": 424 }, { "epoch": 0.2109879199073308, "grad_norm": 0.5075427889823914, "learning_rate": 7.0247933884297525e-06, "loss": 0.4788, "step": 425 }, { "epoch": 0.21148436207181864, "grad_norm": 0.5473235249519348, "learning_rate": 7.041322314049588e-06, "loss": 0.4779, "step": 426 }, { "epoch": 0.21198080423630647, "grad_norm": 0.42127126455307007, "learning_rate": 7.0578512396694225e-06, "loss": 0.4872, "step": 427 }, { "epoch": 0.2124772464007943, "grad_norm": 0.4250279366970062, "learning_rate": 7.074380165289256e-06, "loss": 0.4809, "step": 428 }, { "epoch": 0.21297368856528215, "grad_norm": 0.5111218690872192, "learning_rate": 7.0909090909090916e-06, "loss": 0.4924, "step": 429 }, { "epoch": 0.21347013072976997, "grad_norm": 0.45380088686943054, "learning_rate": 7.107438016528926e-06, "loss": 0.492, "step": 430 }, { "epoch": 0.21396657289425783, "grad_norm": 0.4494556188583374, "learning_rate": 7.1239669421487615e-06, "loss": 0.483, "step": 431 }, { "epoch": 0.21446301505874565, "grad_norm": 0.4579288959503174, "learning_rate": 7.140495867768595e-06, "loss": 0.4791, "step": 432 }, { "epoch": 0.21495945722323348, "grad_norm": 0.5026068091392517, "learning_rate": 7.1570247933884306e-06, "loss": 0.5006, "step": 433 }, { "epoch": 0.21545589938772133, "grad_norm": 0.42490726709365845, "learning_rate": 7.173553719008265e-06, "loss": 0.4832, "step": 434 }, { "epoch": 0.21595234155220916, "grad_norm": 0.5383440256118774, "learning_rate": 7.1900826446281005e-06, "loss": 0.4374, "step": 435 }, { "epoch": 0.216448783716697, "grad_norm": 0.45012277364730835, "learning_rate": 7.206611570247934e-06, "loss": 0.4995, "step": 436 }, { "epoch": 0.21694522588118484, "grad_norm": 0.4084751307964325, "learning_rate": 7.223140495867769e-06, "loss": 0.5075, "step": 437 }, { "epoch": 0.2174416680456727, "grad_norm": 0.4805709421634674, "learning_rate": 7.239669421487604e-06, "loss": 0.4936, "step": 438 }, { "epoch": 0.21793811021016052, "grad_norm": 0.4777836501598358, "learning_rate": 7.256198347107438e-06, "loss": 0.4922, "step": 439 }, { "epoch": 0.21843455237464834, "grad_norm": 0.4382144808769226, "learning_rate": 7.272727272727273e-06, "loss": 0.4821, "step": 440 }, { "epoch": 0.2189309945391362, "grad_norm": 0.4246453046798706, "learning_rate": 7.289256198347108e-06, "loss": 0.4634, "step": 441 }, { "epoch": 0.21942743670362402, "grad_norm": 0.44211432337760925, "learning_rate": 7.305785123966943e-06, "loss": 0.4758, "step": 442 }, { "epoch": 0.21992387886811188, "grad_norm": 0.441456139087677, "learning_rate": 7.322314049586778e-06, "loss": 0.4733, "step": 443 }, { "epoch": 0.2204203210325997, "grad_norm": 0.42692282795906067, "learning_rate": 7.338842975206613e-06, "loss": 0.4783, "step": 444 }, { "epoch": 0.22091676319708753, "grad_norm": 0.42438337206840515, "learning_rate": 7.355371900826447e-06, "loss": 0.4576, "step": 445 }, { "epoch": 0.22141320536157538, "grad_norm": 0.46246010065078735, "learning_rate": 7.371900826446282e-06, "loss": 0.4728, "step": 446 }, { "epoch": 0.2219096475260632, "grad_norm": 0.4487130343914032, "learning_rate": 7.388429752066117e-06, "loss": 0.4665, "step": 447 }, { "epoch": 0.22240608969055106, "grad_norm": 0.44916442036628723, "learning_rate": 7.40495867768595e-06, "loss": 0.4883, "step": 448 }, { "epoch": 0.2229025318550389, "grad_norm": 0.4653342068195343, "learning_rate": 7.421487603305786e-06, "loss": 0.4749, "step": 449 }, { "epoch": 0.2233989740195267, "grad_norm": 0.4609503746032715, "learning_rate": 7.43801652892562e-06, "loss": 0.5025, "step": 450 }, { "epoch": 0.22389541618401457, "grad_norm": 0.4563218057155609, "learning_rate": 7.454545454545456e-06, "loss": 0.4688, "step": 451 }, { "epoch": 0.2243918583485024, "grad_norm": 0.435838520526886, "learning_rate": 7.471074380165289e-06, "loss": 0.455, "step": 452 }, { "epoch": 0.22488830051299025, "grad_norm": 0.4936482310295105, "learning_rate": 7.487603305785125e-06, "loss": 0.4688, "step": 453 }, { "epoch": 0.22538474267747807, "grad_norm": 0.5021178126335144, "learning_rate": 7.504132231404959e-06, "loss": 0.4838, "step": 454 }, { "epoch": 0.2258811848419659, "grad_norm": 0.46295812726020813, "learning_rate": 7.520661157024795e-06, "loss": 0.5016, "step": 455 }, { "epoch": 0.22637762700645375, "grad_norm": 0.5141716599464417, "learning_rate": 7.537190082644628e-06, "loss": 0.4936, "step": 456 }, { "epoch": 0.22687406917094158, "grad_norm": 0.4894197881221771, "learning_rate": 7.553719008264463e-06, "loss": 0.4753, "step": 457 }, { "epoch": 0.22737051133542943, "grad_norm": 0.45301294326782227, "learning_rate": 7.570247933884298e-06, "loss": 0.4676, "step": 458 }, { "epoch": 0.22786695349991726, "grad_norm": 0.5068886280059814, "learning_rate": 7.586776859504133e-06, "loss": 0.4715, "step": 459 }, { "epoch": 0.22836339566440508, "grad_norm": 0.46936553716659546, "learning_rate": 7.603305785123968e-06, "loss": 0.4908, "step": 460 }, { "epoch": 0.22885983782889294, "grad_norm": 0.47695082426071167, "learning_rate": 7.619834710743802e-06, "loss": 0.4991, "step": 461 }, { "epoch": 0.22935627999338076, "grad_norm": 0.5140798687934875, "learning_rate": 7.636363636363638e-06, "loss": 0.499, "step": 462 }, { "epoch": 0.22985272215786862, "grad_norm": 0.4222148358821869, "learning_rate": 7.652892561983471e-06, "loss": 0.4813, "step": 463 }, { "epoch": 0.23034916432235644, "grad_norm": 0.4898736774921417, "learning_rate": 7.669421487603307e-06, "loss": 0.4853, "step": 464 }, { "epoch": 0.2308456064868443, "grad_norm": 0.4096435308456421, "learning_rate": 7.685950413223142e-06, "loss": 0.4682, "step": 465 }, { "epoch": 0.23134204865133212, "grad_norm": 0.4285362660884857, "learning_rate": 7.702479338842976e-06, "loss": 0.4703, "step": 466 }, { "epoch": 0.23183849081581995, "grad_norm": 0.4479823708534241, "learning_rate": 7.71900826446281e-06, "loss": 0.4811, "step": 467 }, { "epoch": 0.2323349329803078, "grad_norm": 0.4493817389011383, "learning_rate": 7.735537190082645e-06, "loss": 0.4833, "step": 468 }, { "epoch": 0.23283137514479563, "grad_norm": 0.43533170223236084, "learning_rate": 7.75206611570248e-06, "loss": 0.4574, "step": 469 }, { "epoch": 0.23332781730928348, "grad_norm": 0.44062215089797974, "learning_rate": 7.768595041322314e-06, "loss": 0.4507, "step": 470 }, { "epoch": 0.2338242594737713, "grad_norm": 0.5106300711631775, "learning_rate": 7.785123966942149e-06, "loss": 0.4874, "step": 471 }, { "epoch": 0.23432070163825913, "grad_norm": 0.4412551820278168, "learning_rate": 7.801652892561983e-06, "loss": 0.4792, "step": 472 }, { "epoch": 0.23481714380274699, "grad_norm": 0.518272340297699, "learning_rate": 7.81818181818182e-06, "loss": 0.4803, "step": 473 }, { "epoch": 0.2353135859672348, "grad_norm": 0.45427006483078003, "learning_rate": 7.834710743801654e-06, "loss": 0.4851, "step": 474 }, { "epoch": 0.23581002813172267, "grad_norm": 0.4602647125720978, "learning_rate": 7.851239669421489e-06, "loss": 0.4816, "step": 475 }, { "epoch": 0.2363064702962105, "grad_norm": 0.38854336738586426, "learning_rate": 7.867768595041323e-06, "loss": 0.4549, "step": 476 }, { "epoch": 0.23680291246069832, "grad_norm": 0.4697539806365967, "learning_rate": 7.884297520661158e-06, "loss": 0.4943, "step": 477 }, { "epoch": 0.23729935462518617, "grad_norm": 0.4514499008655548, "learning_rate": 7.900826446280992e-06, "loss": 0.4694, "step": 478 }, { "epoch": 0.237795796789674, "grad_norm": 0.43897998332977295, "learning_rate": 7.917355371900827e-06, "loss": 0.4527, "step": 479 }, { "epoch": 0.23829223895416185, "grad_norm": 0.47715821862220764, "learning_rate": 7.933884297520661e-06, "loss": 0.4942, "step": 480 }, { "epoch": 0.23878868111864968, "grad_norm": 0.4838358163833618, "learning_rate": 7.950413223140496e-06, "loss": 0.4546, "step": 481 }, { "epoch": 0.2392851232831375, "grad_norm": 0.4644853174686432, "learning_rate": 7.966942148760332e-06, "loss": 0.4668, "step": 482 }, { "epoch": 0.23978156544762536, "grad_norm": 0.4526386857032776, "learning_rate": 7.983471074380165e-06, "loss": 0.4952, "step": 483 }, { "epoch": 0.24027800761211318, "grad_norm": 0.46851634979248047, "learning_rate": 8.000000000000001e-06, "loss": 0.4868, "step": 484 }, { "epoch": 0.24077444977660103, "grad_norm": 0.41484642028808594, "learning_rate": 8.016528925619836e-06, "loss": 0.4527, "step": 485 }, { "epoch": 0.24127089194108886, "grad_norm": 0.4286535382270813, "learning_rate": 8.033057851239669e-06, "loss": 0.4738, "step": 486 }, { "epoch": 0.2417673341055767, "grad_norm": 0.44905754923820496, "learning_rate": 8.049586776859505e-06, "loss": 0.494, "step": 487 }, { "epoch": 0.24226377627006454, "grad_norm": 0.4167950451374054, "learning_rate": 8.06611570247934e-06, "loss": 0.4656, "step": 488 }, { "epoch": 0.24276021843455237, "grad_norm": 0.5053528547286987, "learning_rate": 8.082644628099174e-06, "loss": 0.4471, "step": 489 }, { "epoch": 0.24325666059904022, "grad_norm": 0.41606348752975464, "learning_rate": 8.099173553719009e-06, "loss": 0.4639, "step": 490 }, { "epoch": 0.24375310276352805, "grad_norm": 0.431012362241745, "learning_rate": 8.115702479338843e-06, "loss": 0.4632, "step": 491 }, { "epoch": 0.2442495449280159, "grad_norm": 0.5186803340911865, "learning_rate": 8.132231404958678e-06, "loss": 0.4704, "step": 492 }, { "epoch": 0.24474598709250373, "grad_norm": 0.4335934817790985, "learning_rate": 8.148760330578514e-06, "loss": 0.452, "step": 493 }, { "epoch": 0.24524242925699155, "grad_norm": 0.5087138414382935, "learning_rate": 8.165289256198348e-06, "loss": 0.4641, "step": 494 }, { "epoch": 0.2457388714214794, "grad_norm": 0.4543677568435669, "learning_rate": 8.181818181818183e-06, "loss": 0.4498, "step": 495 }, { "epoch": 0.24623531358596723, "grad_norm": 0.47935521602630615, "learning_rate": 8.198347107438017e-06, "loss": 0.4825, "step": 496 }, { "epoch": 0.24673175575045508, "grad_norm": 0.4916725158691406, "learning_rate": 8.214876033057852e-06, "loss": 0.4958, "step": 497 }, { "epoch": 0.2472281979149429, "grad_norm": 0.44447386264801025, "learning_rate": 8.231404958677687e-06, "loss": 0.4761, "step": 498 }, { "epoch": 0.24772464007943074, "grad_norm": 0.4816417992115021, "learning_rate": 8.247933884297521e-06, "loss": 0.4425, "step": 499 }, { "epoch": 0.2482210822439186, "grad_norm": 0.39101460576057434, "learning_rate": 8.264462809917356e-06, "loss": 0.4572, "step": 500 }, { "epoch": 0.24871752440840642, "grad_norm": 0.4931960701942444, "learning_rate": 8.28099173553719e-06, "loss": 0.4603, "step": 501 }, { "epoch": 0.24921396657289427, "grad_norm": 0.4605782926082611, "learning_rate": 8.297520661157026e-06, "loss": 0.4871, "step": 502 }, { "epoch": 0.2497104087373821, "grad_norm": 0.4365110993385315, "learning_rate": 8.31404958677686e-06, "loss": 0.4862, "step": 503 }, { "epoch": 0.25020685090186995, "grad_norm": 0.4782628118991852, "learning_rate": 8.330578512396695e-06, "loss": 0.4617, "step": 504 }, { "epoch": 0.25070329306635775, "grad_norm": 0.463056743144989, "learning_rate": 8.34710743801653e-06, "loss": 0.4851, "step": 505 }, { "epoch": 0.2511997352308456, "grad_norm": 0.4545626938343048, "learning_rate": 8.363636363636365e-06, "loss": 0.4723, "step": 506 }, { "epoch": 0.25169617739533345, "grad_norm": 0.4492689073085785, "learning_rate": 8.380165289256199e-06, "loss": 0.4779, "step": 507 }, { "epoch": 0.2521926195598213, "grad_norm": 0.3990391790866852, "learning_rate": 8.396694214876034e-06, "loss": 0.4851, "step": 508 }, { "epoch": 0.2526890617243091, "grad_norm": 0.4360460638999939, "learning_rate": 8.413223140495868e-06, "loss": 0.4869, "step": 509 }, { "epoch": 0.25318550388879696, "grad_norm": 0.4606689512729645, "learning_rate": 8.429752066115703e-06, "loss": 0.4744, "step": 510 }, { "epoch": 0.2536819460532848, "grad_norm": 0.44839054346084595, "learning_rate": 8.446280991735539e-06, "loss": 0.4699, "step": 511 }, { "epoch": 0.2541783882177726, "grad_norm": 0.4268187880516052, "learning_rate": 8.462809917355372e-06, "loss": 0.4699, "step": 512 }, { "epoch": 0.25467483038226046, "grad_norm": 0.4956466555595398, "learning_rate": 8.479338842975208e-06, "loss": 0.4438, "step": 513 }, { "epoch": 0.2551712725467483, "grad_norm": 0.5241757035255432, "learning_rate": 8.495867768595043e-06, "loss": 0.4793, "step": 514 }, { "epoch": 0.2556677147112361, "grad_norm": 0.5011031031608582, "learning_rate": 8.512396694214877e-06, "loss": 0.4685, "step": 515 }, { "epoch": 0.25616415687572397, "grad_norm": 0.5413616895675659, "learning_rate": 8.528925619834712e-06, "loss": 0.4993, "step": 516 }, { "epoch": 0.2566605990402118, "grad_norm": 0.4687687158584595, "learning_rate": 8.545454545454546e-06, "loss": 0.4427, "step": 517 }, { "epoch": 0.2571570412046997, "grad_norm": 0.49259305000305176, "learning_rate": 8.56198347107438e-06, "loss": 0.481, "step": 518 }, { "epoch": 0.2576534833691875, "grad_norm": 0.4790956377983093, "learning_rate": 8.578512396694215e-06, "loss": 0.4971, "step": 519 }, { "epoch": 0.25814992553367533, "grad_norm": 0.5737018585205078, "learning_rate": 8.59504132231405e-06, "loss": 0.4794, "step": 520 }, { "epoch": 0.2586463676981632, "grad_norm": 0.45854851603507996, "learning_rate": 8.611570247933884e-06, "loss": 0.47, "step": 521 }, { "epoch": 0.259142809862651, "grad_norm": 0.5231150984764099, "learning_rate": 8.62809917355372e-06, "loss": 0.4939, "step": 522 }, { "epoch": 0.25963925202713883, "grad_norm": 0.5035465955734253, "learning_rate": 8.644628099173555e-06, "loss": 0.4791, "step": 523 }, { "epoch": 0.2601356941916267, "grad_norm": 0.5404543876647949, "learning_rate": 8.66115702479339e-06, "loss": 0.4599, "step": 524 }, { "epoch": 0.2606321363561145, "grad_norm": 0.5602558851242065, "learning_rate": 8.677685950413224e-06, "loss": 0.4963, "step": 525 }, { "epoch": 0.26112857852060234, "grad_norm": 0.46093133091926575, "learning_rate": 8.694214876033059e-06, "loss": 0.4716, "step": 526 }, { "epoch": 0.2616250206850902, "grad_norm": 0.5049439072608948, "learning_rate": 8.710743801652893e-06, "loss": 0.5008, "step": 527 }, { "epoch": 0.26212146284957805, "grad_norm": 0.40452849864959717, "learning_rate": 8.727272727272728e-06, "loss": 0.433, "step": 528 }, { "epoch": 0.26261790501406584, "grad_norm": 0.540416955947876, "learning_rate": 8.743801652892562e-06, "loss": 0.4487, "step": 529 }, { "epoch": 0.2631143471785537, "grad_norm": 0.4491650462150574, "learning_rate": 8.760330578512397e-06, "loss": 0.4704, "step": 530 }, { "epoch": 0.26361078934304155, "grad_norm": 0.41992658376693726, "learning_rate": 8.776859504132233e-06, "loss": 0.4712, "step": 531 }, { "epoch": 0.26410723150752935, "grad_norm": 0.46915292739868164, "learning_rate": 8.793388429752066e-06, "loss": 0.4748, "step": 532 }, { "epoch": 0.2646036736720172, "grad_norm": 0.5058100819587708, "learning_rate": 8.809917355371902e-06, "loss": 0.4705, "step": 533 }, { "epoch": 0.26510011583650506, "grad_norm": 0.39895468950271606, "learning_rate": 8.826446280991737e-06, "loss": 0.4452, "step": 534 }, { "epoch": 0.2655965580009929, "grad_norm": 0.5076683759689331, "learning_rate": 8.842975206611571e-06, "loss": 0.4892, "step": 535 }, { "epoch": 0.2660930001654807, "grad_norm": 0.44137799739837646, "learning_rate": 8.859504132231406e-06, "loss": 0.5046, "step": 536 }, { "epoch": 0.26658944232996856, "grad_norm": 0.5025840997695923, "learning_rate": 8.87603305785124e-06, "loss": 0.4835, "step": 537 }, { "epoch": 0.2670858844944564, "grad_norm": 0.4313961863517761, "learning_rate": 8.892561983471075e-06, "loss": 0.4549, "step": 538 }, { "epoch": 0.2675823266589442, "grad_norm": 0.43231362104415894, "learning_rate": 8.90909090909091e-06, "loss": 0.4591, "step": 539 }, { "epoch": 0.26807876882343207, "grad_norm": 0.45324286818504333, "learning_rate": 8.925619834710744e-06, "loss": 0.4826, "step": 540 }, { "epoch": 0.2685752109879199, "grad_norm": 0.404909610748291, "learning_rate": 8.942148760330578e-06, "loss": 0.4612, "step": 541 }, { "epoch": 0.2690716531524077, "grad_norm": 0.39962807297706604, "learning_rate": 8.958677685950415e-06, "loss": 0.4709, "step": 542 }, { "epoch": 0.2695680953168956, "grad_norm": 0.4469228684902191, "learning_rate": 8.97520661157025e-06, "loss": 0.4799, "step": 543 }, { "epoch": 0.2700645374813834, "grad_norm": 0.43718475103378296, "learning_rate": 8.991735537190084e-06, "loss": 0.5196, "step": 544 }, { "epoch": 0.2705609796458713, "grad_norm": 0.41625452041625977, "learning_rate": 9.008264462809918e-06, "loss": 0.452, "step": 545 }, { "epoch": 0.2710574218103591, "grad_norm": 0.4890074133872986, "learning_rate": 9.024793388429753e-06, "loss": 0.4819, "step": 546 }, { "epoch": 0.27155386397484693, "grad_norm": 0.47165021300315857, "learning_rate": 9.041322314049587e-06, "loss": 0.5205, "step": 547 }, { "epoch": 0.2720503061393348, "grad_norm": 0.4145393371582031, "learning_rate": 9.057851239669422e-06, "loss": 0.4694, "step": 548 }, { "epoch": 0.2725467483038226, "grad_norm": 0.41746217012405396, "learning_rate": 9.074380165289256e-06, "loss": 0.4686, "step": 549 }, { "epoch": 0.27304319046831044, "grad_norm": 0.45429131388664246, "learning_rate": 9.090909090909091e-06, "loss": 0.4728, "step": 550 }, { "epoch": 0.2735396326327983, "grad_norm": 0.4117630124092102, "learning_rate": 9.107438016528927e-06, "loss": 0.5045, "step": 551 }, { "epoch": 0.2740360747972861, "grad_norm": 0.4909166693687439, "learning_rate": 9.12396694214876e-06, "loss": 0.4905, "step": 552 }, { "epoch": 0.27453251696177394, "grad_norm": 0.42463821172714233, "learning_rate": 9.140495867768596e-06, "loss": 0.4542, "step": 553 }, { "epoch": 0.2750289591262618, "grad_norm": 0.48478174209594727, "learning_rate": 9.157024793388431e-06, "loss": 0.4414, "step": 554 }, { "epoch": 0.27552540129074965, "grad_norm": 0.4859165549278259, "learning_rate": 9.173553719008265e-06, "loss": 0.4816, "step": 555 }, { "epoch": 0.27602184345523745, "grad_norm": 0.4848686754703522, "learning_rate": 9.1900826446281e-06, "loss": 0.4694, "step": 556 }, { "epoch": 0.2765182856197253, "grad_norm": 0.4271481931209564, "learning_rate": 9.206611570247935e-06, "loss": 0.4683, "step": 557 }, { "epoch": 0.27701472778421316, "grad_norm": 0.4258889853954315, "learning_rate": 9.223140495867769e-06, "loss": 0.457, "step": 558 }, { "epoch": 0.27751116994870095, "grad_norm": 0.4222073554992676, "learning_rate": 9.239669421487604e-06, "loss": 0.4464, "step": 559 }, { "epoch": 0.2780076121131888, "grad_norm": 0.4856571555137634, "learning_rate": 9.25619834710744e-06, "loss": 0.48, "step": 560 }, { "epoch": 0.27850405427767666, "grad_norm": 0.4507788419723511, "learning_rate": 9.272727272727273e-06, "loss": 0.4758, "step": 561 }, { "epoch": 0.2790004964421645, "grad_norm": 0.48318004608154297, "learning_rate": 9.289256198347109e-06, "loss": 0.4553, "step": 562 }, { "epoch": 0.2794969386066523, "grad_norm": 0.5148987770080566, "learning_rate": 9.305785123966943e-06, "loss": 0.4911, "step": 563 }, { "epoch": 0.27999338077114017, "grad_norm": 0.5731751322746277, "learning_rate": 9.322314049586778e-06, "loss": 0.4721, "step": 564 }, { "epoch": 0.280489822935628, "grad_norm": 0.44388818740844727, "learning_rate": 9.338842975206613e-06, "loss": 0.4673, "step": 565 }, { "epoch": 0.2809862651001158, "grad_norm": 0.483881413936615, "learning_rate": 9.355371900826447e-06, "loss": 0.4563, "step": 566 }, { "epoch": 0.28148270726460367, "grad_norm": 0.5167564153671265, "learning_rate": 9.371900826446282e-06, "loss": 0.4808, "step": 567 }, { "epoch": 0.2819791494290915, "grad_norm": 0.46658211946487427, "learning_rate": 9.388429752066116e-06, "loss": 0.5032, "step": 568 }, { "epoch": 0.2824755915935793, "grad_norm": 0.4640190303325653, "learning_rate": 9.40495867768595e-06, "loss": 0.4738, "step": 569 }, { "epoch": 0.2829720337580672, "grad_norm": 0.44989728927612305, "learning_rate": 9.421487603305785e-06, "loss": 0.4782, "step": 570 }, { "epoch": 0.28346847592255503, "grad_norm": 0.4337916076183319, "learning_rate": 9.438016528925621e-06, "loss": 0.4396, "step": 571 }, { "epoch": 0.2839649180870429, "grad_norm": 0.422832190990448, "learning_rate": 9.454545454545456e-06, "loss": 0.4446, "step": 572 }, { "epoch": 0.2844613602515307, "grad_norm": 0.42181551456451416, "learning_rate": 9.47107438016529e-06, "loss": 0.4811, "step": 573 }, { "epoch": 0.28495780241601854, "grad_norm": 0.4736669361591339, "learning_rate": 9.487603305785125e-06, "loss": 0.4763, "step": 574 }, { "epoch": 0.2854542445805064, "grad_norm": 0.44046393036842346, "learning_rate": 9.50413223140496e-06, "loss": 0.4385, "step": 575 }, { "epoch": 0.2859506867449942, "grad_norm": 0.5293316841125488, "learning_rate": 9.520661157024794e-06, "loss": 0.4553, "step": 576 }, { "epoch": 0.28644712890948204, "grad_norm": 0.47409000992774963, "learning_rate": 9.537190082644629e-06, "loss": 0.4532, "step": 577 }, { "epoch": 0.2869435710739699, "grad_norm": 0.4991501271724701, "learning_rate": 9.553719008264463e-06, "loss": 0.4679, "step": 578 }, { "epoch": 0.2874400132384577, "grad_norm": 0.44962936639785767, "learning_rate": 9.570247933884298e-06, "loss": 0.4306, "step": 579 }, { "epoch": 0.28793645540294555, "grad_norm": 0.4663037359714508, "learning_rate": 9.586776859504134e-06, "loss": 0.4462, "step": 580 }, { "epoch": 0.2884328975674334, "grad_norm": 0.4381522536277771, "learning_rate": 9.603305785123967e-06, "loss": 0.4651, "step": 581 }, { "epoch": 0.28892933973192125, "grad_norm": 0.4041360914707184, "learning_rate": 9.619834710743803e-06, "loss": 0.4711, "step": 582 }, { "epoch": 0.28942578189640905, "grad_norm": 0.4971594214439392, "learning_rate": 9.636363636363638e-06, "loss": 0.4734, "step": 583 }, { "epoch": 0.2899222240608969, "grad_norm": 0.4420693814754486, "learning_rate": 9.652892561983472e-06, "loss": 0.4665, "step": 584 }, { "epoch": 0.29041866622538476, "grad_norm": 0.40839841961860657, "learning_rate": 9.669421487603307e-06, "loss": 0.4542, "step": 585 }, { "epoch": 0.29091510838987256, "grad_norm": 0.45588889718055725, "learning_rate": 9.685950413223141e-06, "loss": 0.4791, "step": 586 }, { "epoch": 0.2914115505543604, "grad_norm": 0.4695976972579956, "learning_rate": 9.702479338842976e-06, "loss": 0.4908, "step": 587 }, { "epoch": 0.29190799271884826, "grad_norm": 0.4354483485221863, "learning_rate": 9.71900826446281e-06, "loss": 0.4642, "step": 588 }, { "epoch": 0.2924044348833361, "grad_norm": 0.48239463567733765, "learning_rate": 9.735537190082645e-06, "loss": 0.4627, "step": 589 }, { "epoch": 0.2929008770478239, "grad_norm": 0.4359372556209564, "learning_rate": 9.75206611570248e-06, "loss": 0.464, "step": 590 }, { "epoch": 0.29339731921231177, "grad_norm": 0.44007065892219543, "learning_rate": 9.768595041322316e-06, "loss": 0.444, "step": 591 }, { "epoch": 0.2938937613767996, "grad_norm": 0.47367116808891296, "learning_rate": 9.78512396694215e-06, "loss": 0.4394, "step": 592 }, { "epoch": 0.2943902035412874, "grad_norm": 0.4762154817581177, "learning_rate": 9.801652892561985e-06, "loss": 0.4481, "step": 593 }, { "epoch": 0.2948866457057753, "grad_norm": 0.4874429404735565, "learning_rate": 9.81818181818182e-06, "loss": 0.4522, "step": 594 }, { "epoch": 0.29538308787026313, "grad_norm": 0.48049837350845337, "learning_rate": 9.834710743801654e-06, "loss": 0.4641, "step": 595 }, { "epoch": 0.2958795300347509, "grad_norm": 0.4341600239276886, "learning_rate": 9.851239669421488e-06, "loss": 0.4428, "step": 596 }, { "epoch": 0.2963759721992388, "grad_norm": 0.5187180638313293, "learning_rate": 9.867768595041323e-06, "loss": 0.452, "step": 597 }, { "epoch": 0.29687241436372663, "grad_norm": 0.44286608695983887, "learning_rate": 9.884297520661157e-06, "loss": 0.4786, "step": 598 }, { "epoch": 0.2973688565282145, "grad_norm": 0.4424167275428772, "learning_rate": 9.900826446280992e-06, "loss": 0.4796, "step": 599 }, { "epoch": 0.2978652986927023, "grad_norm": 0.41045081615448, "learning_rate": 9.917355371900828e-06, "loss": 0.4581, "step": 600 }, { "epoch": 0.29836174085719014, "grad_norm": 0.43121814727783203, "learning_rate": 9.933884297520661e-06, "loss": 0.4698, "step": 601 }, { "epoch": 0.298858183021678, "grad_norm": 0.4507293105125427, "learning_rate": 9.950413223140497e-06, "loss": 0.5005, "step": 602 }, { "epoch": 0.2993546251861658, "grad_norm": 0.40033653378486633, "learning_rate": 9.966942148760332e-06, "loss": 0.4571, "step": 603 }, { "epoch": 0.29985106735065364, "grad_norm": 0.46912050247192383, "learning_rate": 9.983471074380166e-06, "loss": 0.4925, "step": 604 }, { "epoch": 0.3003475095151415, "grad_norm": 0.4521634578704834, "learning_rate": 1e-05, "loss": 0.4851, "step": 605 }, { "epoch": 0.3008439516796293, "grad_norm": 0.46425825357437134, "learning_rate": 9.999999165317946e-06, "loss": 0.4772, "step": 606 }, { "epoch": 0.30134039384411715, "grad_norm": 0.49937519431114197, "learning_rate": 9.999996661272064e-06, "loss": 0.4694, "step": 607 }, { "epoch": 0.301836836008605, "grad_norm": 0.49192437529563904, "learning_rate": 9.999992487863189e-06, "loss": 0.4619, "step": 608 }, { "epoch": 0.30233327817309286, "grad_norm": 0.46220317482948303, "learning_rate": 9.999986645092714e-06, "loss": 0.4725, "step": 609 }, { "epoch": 0.30282972033758065, "grad_norm": 0.4528823792934418, "learning_rate": 9.99997913296259e-06, "loss": 0.4703, "step": 610 }, { "epoch": 0.3033261625020685, "grad_norm": 0.4757075309753418, "learning_rate": 9.999969951475326e-06, "loss": 0.4649, "step": 611 }, { "epoch": 0.30382260466655636, "grad_norm": 0.4548949897289276, "learning_rate": 9.999959100633987e-06, "loss": 0.4515, "step": 612 }, { "epoch": 0.30431904683104416, "grad_norm": 0.4900111258029938, "learning_rate": 9.999946580442195e-06, "loss": 0.4806, "step": 613 }, { "epoch": 0.304815488995532, "grad_norm": 0.4817737638950348, "learning_rate": 9.999932390904133e-06, "loss": 0.4532, "step": 614 }, { "epoch": 0.30531193116001987, "grad_norm": 0.48041269183158875, "learning_rate": 9.999916532024533e-06, "loss": 0.4647, "step": 615 }, { "epoch": 0.3058083733245077, "grad_norm": 0.45443153381347656, "learning_rate": 9.999899003808695e-06, "loss": 0.4614, "step": 616 }, { "epoch": 0.3063048154889955, "grad_norm": 0.5431375503540039, "learning_rate": 9.99987980626247e-06, "loss": 0.494, "step": 617 }, { "epoch": 0.3068012576534834, "grad_norm": 0.4105706810951233, "learning_rate": 9.999858939392263e-06, "loss": 0.4759, "step": 618 }, { "epoch": 0.3072976998179712, "grad_norm": 0.48473069071769714, "learning_rate": 9.99983640320505e-06, "loss": 0.4533, "step": 619 }, { "epoch": 0.307794141982459, "grad_norm": 0.4805839955806732, "learning_rate": 9.999812197708347e-06, "loss": 0.4415, "step": 620 }, { "epoch": 0.3082905841469469, "grad_norm": 0.45407167077064514, "learning_rate": 9.999786322910239e-06, "loss": 0.4673, "step": 621 }, { "epoch": 0.30878702631143473, "grad_norm": 0.5333812832832336, "learning_rate": 9.999758778819363e-06, "loss": 0.46, "step": 622 }, { "epoch": 0.30928346847592253, "grad_norm": 0.459966242313385, "learning_rate": 9.99972956544492e-06, "loss": 0.4601, "step": 623 }, { "epoch": 0.3097799106404104, "grad_norm": 0.5285501480102539, "learning_rate": 9.999698682796658e-06, "loss": 0.451, "step": 624 }, { "epoch": 0.31027635280489824, "grad_norm": 0.44586649537086487, "learning_rate": 9.99966613088489e-06, "loss": 0.4901, "step": 625 }, { "epoch": 0.3107727949693861, "grad_norm": 0.47442322969436646, "learning_rate": 9.999631909720487e-06, "loss": 0.4723, "step": 626 }, { "epoch": 0.3112692371338739, "grad_norm": 0.5053892135620117, "learning_rate": 9.999596019314868e-06, "loss": 0.5024, "step": 627 }, { "epoch": 0.31176567929836174, "grad_norm": 0.4274972975254059, "learning_rate": 9.999558459680022e-06, "loss": 0.4736, "step": 628 }, { "epoch": 0.3122621214628496, "grad_norm": 0.49138620495796204, "learning_rate": 9.999519230828486e-06, "loss": 0.4698, "step": 629 }, { "epoch": 0.3127585636273374, "grad_norm": 0.5152320861816406, "learning_rate": 9.999478332773357e-06, "loss": 0.481, "step": 630 }, { "epoch": 0.31325500579182525, "grad_norm": 0.4688166081905365, "learning_rate": 9.999435765528293e-06, "loss": 0.4358, "step": 631 }, { "epoch": 0.3137514479563131, "grad_norm": 0.46488305926322937, "learning_rate": 9.999391529107504e-06, "loss": 0.4617, "step": 632 }, { "epoch": 0.3142478901208009, "grad_norm": 0.527978241443634, "learning_rate": 9.999345623525758e-06, "loss": 0.47, "step": 633 }, { "epoch": 0.31474433228528875, "grad_norm": 0.42899540066719055, "learning_rate": 9.999298048798385e-06, "loss": 0.4471, "step": 634 }, { "epoch": 0.3152407744497766, "grad_norm": 0.5108292102813721, "learning_rate": 9.999248804941265e-06, "loss": 0.4578, "step": 635 }, { "epoch": 0.31573721661426446, "grad_norm": 0.46431219577789307, "learning_rate": 9.999197891970843e-06, "loss": 0.4708, "step": 636 }, { "epoch": 0.31623365877875226, "grad_norm": 0.4491477310657501, "learning_rate": 9.999145309904112e-06, "loss": 0.4665, "step": 637 }, { "epoch": 0.3167301009432401, "grad_norm": 0.448596328496933, "learning_rate": 9.999091058758634e-06, "loss": 0.4589, "step": 638 }, { "epoch": 0.31722654310772797, "grad_norm": 0.42760810256004333, "learning_rate": 9.99903513855252e-06, "loss": 0.4694, "step": 639 }, { "epoch": 0.31772298527221576, "grad_norm": 0.427337110042572, "learning_rate": 9.998977549304436e-06, "loss": 0.4712, "step": 640 }, { "epoch": 0.3182194274367036, "grad_norm": 0.4343109726905823, "learning_rate": 9.998918291033617e-06, "loss": 0.4736, "step": 641 }, { "epoch": 0.31871586960119147, "grad_norm": 0.46056056022644043, "learning_rate": 9.998857363759842e-06, "loss": 0.4625, "step": 642 }, { "epoch": 0.3192123117656793, "grad_norm": 0.44514575600624084, "learning_rate": 9.998794767503455e-06, "loss": 0.4883, "step": 643 }, { "epoch": 0.3197087539301671, "grad_norm": 0.4214226305484772, "learning_rate": 9.998730502285354e-06, "loss": 0.4683, "step": 644 }, { "epoch": 0.320205196094655, "grad_norm": 0.4675305187702179, "learning_rate": 9.998664568126996e-06, "loss": 0.467, "step": 645 }, { "epoch": 0.32070163825914283, "grad_norm": 0.4716167151927948, "learning_rate": 9.998596965050395e-06, "loss": 0.4583, "step": 646 }, { "epoch": 0.3211980804236306, "grad_norm": 0.4303293526172638, "learning_rate": 9.998527693078122e-06, "loss": 0.4366, "step": 647 }, { "epoch": 0.3216945225881185, "grad_norm": 0.43465134501457214, "learning_rate": 9.998456752233305e-06, "loss": 0.4597, "step": 648 }, { "epoch": 0.32219096475260633, "grad_norm": 0.5869894623756409, "learning_rate": 9.99838414253963e-06, "loss": 0.483, "step": 649 }, { "epoch": 0.32268740691709413, "grad_norm": 0.46892422437667847, "learning_rate": 9.998309864021337e-06, "loss": 0.4697, "step": 650 }, { "epoch": 0.323183849081582, "grad_norm": 0.5754559636116028, "learning_rate": 9.998233916703225e-06, "loss": 0.4925, "step": 651 }, { "epoch": 0.32368029124606984, "grad_norm": 0.4987186789512634, "learning_rate": 9.998156300610658e-06, "loss": 0.4325, "step": 652 }, { "epoch": 0.3241767334105577, "grad_norm": 0.5070486068725586, "learning_rate": 9.99807701576954e-06, "loss": 0.4422, "step": 653 }, { "epoch": 0.3246731755750455, "grad_norm": 0.48529526591300964, "learning_rate": 9.997996062206348e-06, "loss": 0.4617, "step": 654 }, { "epoch": 0.32516961773953335, "grad_norm": 0.4924617111682892, "learning_rate": 9.99791343994811e-06, "loss": 0.5015, "step": 655 }, { "epoch": 0.3256660599040212, "grad_norm": 0.5063813924789429, "learning_rate": 9.997829149022408e-06, "loss": 0.4523, "step": 656 }, { "epoch": 0.326162502068509, "grad_norm": 0.4872376620769501, "learning_rate": 9.997743189457387e-06, "loss": 0.4747, "step": 657 }, { "epoch": 0.32665894423299685, "grad_norm": 0.44963982701301575, "learning_rate": 9.997655561281747e-06, "loss": 0.4764, "step": 658 }, { "epoch": 0.3271553863974847, "grad_norm": 0.468538373708725, "learning_rate": 9.997566264524745e-06, "loss": 0.4603, "step": 659 }, { "epoch": 0.32765182856197256, "grad_norm": 0.49369126558303833, "learning_rate": 9.997475299216191e-06, "loss": 0.4636, "step": 660 }, { "epoch": 0.32814827072646036, "grad_norm": 0.438283771276474, "learning_rate": 9.99738266538646e-06, "loss": 0.4539, "step": 661 }, { "epoch": 0.3286447128909482, "grad_norm": 0.4795251190662384, "learning_rate": 9.997288363066479e-06, "loss": 0.4436, "step": 662 }, { "epoch": 0.32914115505543606, "grad_norm": 0.4147178530693054, "learning_rate": 9.99719239228773e-06, "loss": 0.4443, "step": 663 }, { "epoch": 0.32963759721992386, "grad_norm": 0.3992593586444855, "learning_rate": 9.99709475308226e-06, "loss": 0.4758, "step": 664 }, { "epoch": 0.3301340393844117, "grad_norm": 0.4761015474796295, "learning_rate": 9.996995445482664e-06, "loss": 0.4496, "step": 665 }, { "epoch": 0.33063048154889957, "grad_norm": 0.3964051902294159, "learning_rate": 9.9968944695221e-06, "loss": 0.4578, "step": 666 }, { "epoch": 0.33112692371338737, "grad_norm": 0.4298408329486847, "learning_rate": 9.99679182523428e-06, "loss": 0.4423, "step": 667 }, { "epoch": 0.3316233658778752, "grad_norm": 0.41519659757614136, "learning_rate": 9.996687512653476e-06, "loss": 0.4694, "step": 668 }, { "epoch": 0.3321198080423631, "grad_norm": 0.45390287041664124, "learning_rate": 9.996581531814513e-06, "loss": 0.474, "step": 669 }, { "epoch": 0.3326162502068509, "grad_norm": 0.40822839736938477, "learning_rate": 9.996473882752777e-06, "loss": 0.4564, "step": 670 }, { "epoch": 0.3331126923713387, "grad_norm": 0.45899900794029236, "learning_rate": 9.996364565504208e-06, "loss": 0.4768, "step": 671 }, { "epoch": 0.3336091345358266, "grad_norm": 0.5035883784294128, "learning_rate": 9.996253580105302e-06, "loss": 0.4687, "step": 672 }, { "epoch": 0.33410557670031443, "grad_norm": 0.41712138056755066, "learning_rate": 9.996140926593119e-06, "loss": 0.4356, "step": 673 }, { "epoch": 0.33460201886480223, "grad_norm": 0.5176399946212769, "learning_rate": 9.996026605005266e-06, "loss": 0.4725, "step": 674 }, { "epoch": 0.3350984610292901, "grad_norm": 0.5114835500717163, "learning_rate": 9.995910615379917e-06, "loss": 0.4625, "step": 675 }, { "epoch": 0.33559490319377794, "grad_norm": 0.45367226004600525, "learning_rate": 9.995792957755793e-06, "loss": 0.4755, "step": 676 }, { "epoch": 0.33609134535826574, "grad_norm": 0.499668687582016, "learning_rate": 9.995673632172179e-06, "loss": 0.4554, "step": 677 }, { "epoch": 0.3365877875227536, "grad_norm": 0.5260589122772217, "learning_rate": 9.995552638668912e-06, "loss": 0.4669, "step": 678 }, { "epoch": 0.33708422968724144, "grad_norm": 0.490307480096817, "learning_rate": 9.995429977286394e-06, "loss": 0.4696, "step": 679 }, { "epoch": 0.3375806718517293, "grad_norm": 0.4749196767807007, "learning_rate": 9.995305648065573e-06, "loss": 0.4746, "step": 680 }, { "epoch": 0.3380771140162171, "grad_norm": 0.5080386996269226, "learning_rate": 9.995179651047961e-06, "loss": 0.4499, "step": 681 }, { "epoch": 0.33857355618070495, "grad_norm": 0.518105149269104, "learning_rate": 9.995051986275626e-06, "loss": 0.4708, "step": 682 }, { "epoch": 0.3390699983451928, "grad_norm": 0.4434483051300049, "learning_rate": 9.99492265379119e-06, "loss": 0.4783, "step": 683 }, { "epoch": 0.3395664405096806, "grad_norm": 0.5089012384414673, "learning_rate": 9.994791653637834e-06, "loss": 0.443, "step": 684 }, { "epoch": 0.34006288267416845, "grad_norm": 0.4536261558532715, "learning_rate": 9.994658985859295e-06, "loss": 0.446, "step": 685 }, { "epoch": 0.3405593248386563, "grad_norm": 0.539293646812439, "learning_rate": 9.99452465049987e-06, "loss": 0.4599, "step": 686 }, { "epoch": 0.34105576700314416, "grad_norm": 0.4853936433792114, "learning_rate": 9.994388647604408e-06, "loss": 0.4573, "step": 687 }, { "epoch": 0.34155220916763196, "grad_norm": 0.4860484004020691, "learning_rate": 9.994250977218313e-06, "loss": 0.4557, "step": 688 }, { "epoch": 0.3420486513321198, "grad_norm": 0.494132936000824, "learning_rate": 9.994111639387557e-06, "loss": 0.4553, "step": 689 }, { "epoch": 0.34254509349660767, "grad_norm": 0.5010596513748169, "learning_rate": 9.993970634158656e-06, "loss": 0.4716, "step": 690 }, { "epoch": 0.34304153566109546, "grad_norm": 0.46240052580833435, "learning_rate": 9.993827961578688e-06, "loss": 0.4449, "step": 691 }, { "epoch": 0.3435379778255833, "grad_norm": 0.5007613301277161, "learning_rate": 9.993683621695287e-06, "loss": 0.4777, "step": 692 }, { "epoch": 0.34403441999007117, "grad_norm": 0.5012892484664917, "learning_rate": 9.993537614556648e-06, "loss": 0.4659, "step": 693 }, { "epoch": 0.34453086215455897, "grad_norm": 0.5059606432914734, "learning_rate": 9.993389940211515e-06, "loss": 0.4589, "step": 694 }, { "epoch": 0.3450273043190468, "grad_norm": 0.43466609716415405, "learning_rate": 9.993240598709195e-06, "loss": 0.4343, "step": 695 }, { "epoch": 0.3455237464835347, "grad_norm": 0.4844149947166443, "learning_rate": 9.993089590099547e-06, "loss": 0.4432, "step": 696 }, { "epoch": 0.34602018864802253, "grad_norm": 0.49753373861312866, "learning_rate": 9.99293691443299e-06, "loss": 0.4661, "step": 697 }, { "epoch": 0.34651663081251033, "grad_norm": 0.4831410050392151, "learning_rate": 9.992782571760497e-06, "loss": 0.4529, "step": 698 }, { "epoch": 0.3470130729769982, "grad_norm": 0.4929013252258301, "learning_rate": 9.9926265621336e-06, "loss": 0.4417, "step": 699 }, { "epoch": 0.34750951514148604, "grad_norm": 0.46492505073547363, "learning_rate": 9.992468885604385e-06, "loss": 0.4591, "step": 700 }, { "epoch": 0.34800595730597383, "grad_norm": 0.48195117712020874, "learning_rate": 9.992309542225497e-06, "loss": 0.4369, "step": 701 }, { "epoch": 0.3485023994704617, "grad_norm": 0.44062772393226624, "learning_rate": 9.992148532050139e-06, "loss": 0.4388, "step": 702 }, { "epoch": 0.34899884163494954, "grad_norm": 0.48918500542640686, "learning_rate": 9.991985855132062e-06, "loss": 0.4565, "step": 703 }, { "epoch": 0.34949528379943734, "grad_norm": 0.4990057349205017, "learning_rate": 9.991821511525584e-06, "loss": 0.4801, "step": 704 }, { "epoch": 0.3499917259639252, "grad_norm": 0.4621891975402832, "learning_rate": 9.991655501285574e-06, "loss": 0.4728, "step": 705 }, { "epoch": 0.35048816812841305, "grad_norm": 0.5391308069229126, "learning_rate": 9.991487824467458e-06, "loss": 0.5012, "step": 706 }, { "epoch": 0.3509846102929009, "grad_norm": 0.4377981126308441, "learning_rate": 9.991318481127218e-06, "loss": 0.4538, "step": 707 }, { "epoch": 0.3514810524573887, "grad_norm": 0.5195844173431396, "learning_rate": 9.991147471321392e-06, "loss": 0.471, "step": 708 }, { "epoch": 0.35197749462187655, "grad_norm": 0.46802738308906555, "learning_rate": 9.990974795107078e-06, "loss": 0.4617, "step": 709 }, { "epoch": 0.3524739367863644, "grad_norm": 0.4354342818260193, "learning_rate": 9.990800452541929e-06, "loss": 0.5033, "step": 710 }, { "epoch": 0.3529703789508522, "grad_norm": 0.4489007592201233, "learning_rate": 9.99062444368415e-06, "loss": 0.4789, "step": 711 }, { "epoch": 0.35346682111534006, "grad_norm": 0.5134022235870361, "learning_rate": 9.990446768592507e-06, "loss": 0.4895, "step": 712 }, { "epoch": 0.3539632632798279, "grad_norm": 0.4457527995109558, "learning_rate": 9.99026742732632e-06, "loss": 0.4399, "step": 713 }, { "epoch": 0.35445970544431576, "grad_norm": 0.5064969658851624, "learning_rate": 9.990086419945469e-06, "loss": 0.4588, "step": 714 }, { "epoch": 0.35495614760880356, "grad_norm": 0.5112425088882446, "learning_rate": 9.989903746510383e-06, "loss": 0.4957, "step": 715 }, { "epoch": 0.3554525897732914, "grad_norm": 0.522133469581604, "learning_rate": 9.989719407082056e-06, "loss": 0.4502, "step": 716 }, { "epoch": 0.35594903193777927, "grad_norm": 0.42204388976097107, "learning_rate": 9.989533401722031e-06, "loss": 0.45, "step": 717 }, { "epoch": 0.35644547410226707, "grad_norm": 0.48389917612075806, "learning_rate": 9.98934573049241e-06, "loss": 0.4466, "step": 718 }, { "epoch": 0.3569419162667549, "grad_norm": 0.4974230229854584, "learning_rate": 9.989156393455856e-06, "loss": 0.4552, "step": 719 }, { "epoch": 0.3574383584312428, "grad_norm": 0.4142509698867798, "learning_rate": 9.988965390675578e-06, "loss": 0.4476, "step": 720 }, { "epoch": 0.3579348005957306, "grad_norm": 0.44663283228874207, "learning_rate": 9.988772722215348e-06, "loss": 0.4762, "step": 721 }, { "epoch": 0.3584312427602184, "grad_norm": 0.4466852843761444, "learning_rate": 9.988578388139493e-06, "loss": 0.4724, "step": 722 }, { "epoch": 0.3589276849247063, "grad_norm": 0.44083407521247864, "learning_rate": 9.988382388512898e-06, "loss": 0.4385, "step": 723 }, { "epoch": 0.35942412708919413, "grad_norm": 0.4966927468776703, "learning_rate": 9.988184723400999e-06, "loss": 0.4893, "step": 724 }, { "epoch": 0.35992056925368193, "grad_norm": 0.4738323986530304, "learning_rate": 9.987985392869792e-06, "loss": 0.4549, "step": 725 }, { "epoch": 0.3604170114181698, "grad_norm": 0.4962857663631439, "learning_rate": 9.987784396985829e-06, "loss": 0.4668, "step": 726 }, { "epoch": 0.36091345358265764, "grad_norm": 0.47727835178375244, "learning_rate": 9.987581735816216e-06, "loss": 0.4729, "step": 727 }, { "epoch": 0.36140989574714544, "grad_norm": 0.4034305214881897, "learning_rate": 9.987377409428617e-06, "loss": 0.4644, "step": 728 }, { "epoch": 0.3619063379116333, "grad_norm": 0.4307596683502197, "learning_rate": 9.98717141789125e-06, "loss": 0.4658, "step": 729 }, { "epoch": 0.36240278007612114, "grad_norm": 0.5103350877761841, "learning_rate": 9.98696376127289e-06, "loss": 0.4508, "step": 730 }, { "epoch": 0.36289922224060894, "grad_norm": 0.39697036147117615, "learning_rate": 9.98675443964287e-06, "loss": 0.4676, "step": 731 }, { "epoch": 0.3633956644050968, "grad_norm": 0.47787100076675415, "learning_rate": 9.986543453071074e-06, "loss": 0.4604, "step": 732 }, { "epoch": 0.36389210656958465, "grad_norm": 0.4539291560649872, "learning_rate": 9.986330801627944e-06, "loss": 0.4439, "step": 733 }, { "epoch": 0.3643885487340725, "grad_norm": 0.3859419524669647, "learning_rate": 9.986116485384481e-06, "loss": 0.4619, "step": 734 }, { "epoch": 0.3648849908985603, "grad_norm": 0.5138053297996521, "learning_rate": 9.98590050441224e-06, "loss": 0.4273, "step": 735 }, { "epoch": 0.36538143306304816, "grad_norm": 0.41417205333709717, "learning_rate": 9.98568285878333e-06, "loss": 0.4426, "step": 736 }, { "epoch": 0.365877875227536, "grad_norm": 0.5332003235816956, "learning_rate": 9.985463548570416e-06, "loss": 0.4629, "step": 737 }, { "epoch": 0.3663743173920238, "grad_norm": 0.4435381293296814, "learning_rate": 9.985242573846721e-06, "loss": 0.4766, "step": 738 }, { "epoch": 0.36687075955651166, "grad_norm": 0.47234880924224854, "learning_rate": 9.98501993468602e-06, "loss": 0.4883, "step": 739 }, { "epoch": 0.3673672017209995, "grad_norm": 0.4269942343235016, "learning_rate": 9.984795631162651e-06, "loss": 0.4486, "step": 740 }, { "epoch": 0.36786364388548737, "grad_norm": 0.4545361399650574, "learning_rate": 9.984569663351497e-06, "loss": 0.4675, "step": 741 }, { "epoch": 0.36836008604997517, "grad_norm": 0.4595291018486023, "learning_rate": 9.984342031328007e-06, "loss": 0.4752, "step": 742 }, { "epoch": 0.368856528214463, "grad_norm": 0.4073132872581482, "learning_rate": 9.984112735168182e-06, "loss": 0.4136, "step": 743 }, { "epoch": 0.3693529703789509, "grad_norm": 0.496504545211792, "learning_rate": 9.983881774948572e-06, "loss": 0.4613, "step": 744 }, { "epoch": 0.36984941254343867, "grad_norm": 0.4916771352291107, "learning_rate": 9.983649150746292e-06, "loss": 0.458, "step": 745 }, { "epoch": 0.3703458547079265, "grad_norm": 0.4463868737220764, "learning_rate": 9.983414862639011e-06, "loss": 0.4582, "step": 746 }, { "epoch": 0.3708422968724144, "grad_norm": 0.48840340971946716, "learning_rate": 9.983178910704947e-06, "loss": 0.4769, "step": 747 }, { "epoch": 0.3713387390369022, "grad_norm": 0.42919665575027466, "learning_rate": 9.982941295022881e-06, "loss": 0.4476, "step": 748 }, { "epoch": 0.37183518120139003, "grad_norm": 0.41322648525238037, "learning_rate": 9.982702015672145e-06, "loss": 0.4672, "step": 749 }, { "epoch": 0.3723316233658779, "grad_norm": 0.43473634123802185, "learning_rate": 9.982461072732628e-06, "loss": 0.4597, "step": 750 }, { "epoch": 0.37282806553036574, "grad_norm": 0.42603549361228943, "learning_rate": 9.982218466284775e-06, "loss": 0.4561, "step": 751 }, { "epoch": 0.37332450769485354, "grad_norm": 0.39597687125205994, "learning_rate": 9.981974196409586e-06, "loss": 0.4519, "step": 752 }, { "epoch": 0.3738209498593414, "grad_norm": 0.442074179649353, "learning_rate": 9.981728263188615e-06, "loss": 0.4839, "step": 753 }, { "epoch": 0.37431739202382924, "grad_norm": 0.4657537043094635, "learning_rate": 9.98148066670397e-06, "loss": 0.462, "step": 754 }, { "epoch": 0.37481383418831704, "grad_norm": 0.4299393594264984, "learning_rate": 9.981231407038324e-06, "loss": 0.4592, "step": 755 }, { "epoch": 0.3753102763528049, "grad_norm": 0.4728623032569885, "learning_rate": 9.98098048427489e-06, "loss": 0.4405, "step": 756 }, { "epoch": 0.37580671851729275, "grad_norm": 0.4984114170074463, "learning_rate": 9.98072789849745e-06, "loss": 0.4865, "step": 757 }, { "epoch": 0.37630316068178055, "grad_norm": 0.4861014187335968, "learning_rate": 9.980473649790333e-06, "loss": 0.4751, "step": 758 }, { "epoch": 0.3767996028462684, "grad_norm": 0.494928240776062, "learning_rate": 9.980217738238427e-06, "loss": 0.4675, "step": 759 }, { "epoch": 0.37729604501075625, "grad_norm": 0.45532166957855225, "learning_rate": 9.979960163927172e-06, "loss": 0.4572, "step": 760 }, { "epoch": 0.3777924871752441, "grad_norm": 0.3587944507598877, "learning_rate": 9.979700926942564e-06, "loss": 0.4423, "step": 761 }, { "epoch": 0.3782889293397319, "grad_norm": 0.4763025939464569, "learning_rate": 9.97944002737116e-06, "loss": 0.5015, "step": 762 }, { "epoch": 0.37878537150421976, "grad_norm": 0.41155770421028137, "learning_rate": 9.979177465300063e-06, "loss": 0.4453, "step": 763 }, { "epoch": 0.3792818136687076, "grad_norm": 0.4644562900066376, "learning_rate": 9.978913240816938e-06, "loss": 0.4802, "step": 764 }, { "epoch": 0.3797782558331954, "grad_norm": 0.44735202193260193, "learning_rate": 9.978647354010002e-06, "loss": 0.4663, "step": 765 }, { "epoch": 0.38027469799768326, "grad_norm": 0.44925662875175476, "learning_rate": 9.978379804968026e-06, "loss": 0.4448, "step": 766 }, { "epoch": 0.3807711401621711, "grad_norm": 0.47172847390174866, "learning_rate": 9.978110593780338e-06, "loss": 0.47, "step": 767 }, { "epoch": 0.38126758232665897, "grad_norm": 0.4666939079761505, "learning_rate": 9.977839720536818e-06, "loss": 0.4538, "step": 768 }, { "epoch": 0.38176402449114677, "grad_norm": 0.5225818157196045, "learning_rate": 9.977567185327907e-06, "loss": 0.4747, "step": 769 }, { "epoch": 0.3822604666556346, "grad_norm": 0.41211405396461487, "learning_rate": 9.977292988244597e-06, "loss": 0.4462, "step": 770 }, { "epoch": 0.3827569088201225, "grad_norm": 0.4758220613002777, "learning_rate": 9.977017129378432e-06, "loss": 0.4407, "step": 771 }, { "epoch": 0.3832533509846103, "grad_norm": 0.48878994584083557, "learning_rate": 9.976739608821515e-06, "loss": 0.485, "step": 772 }, { "epoch": 0.38374979314909813, "grad_norm": 0.4285222589969635, "learning_rate": 9.976460426666505e-06, "loss": 0.476, "step": 773 }, { "epoch": 0.384246235313586, "grad_norm": 0.44847458600997925, "learning_rate": 9.976179583006608e-06, "loss": 0.4571, "step": 774 }, { "epoch": 0.3847426774780738, "grad_norm": 0.4585331380367279, "learning_rate": 9.975897077935597e-06, "loss": 0.4566, "step": 775 }, { "epoch": 0.38523911964256163, "grad_norm": 0.43495091795921326, "learning_rate": 9.975612911547787e-06, "loss": 0.4583, "step": 776 }, { "epoch": 0.3857355618070495, "grad_norm": 0.43480491638183594, "learning_rate": 9.975327083938056e-06, "loss": 0.4572, "step": 777 }, { "epoch": 0.38623200397153734, "grad_norm": 0.42583706974983215, "learning_rate": 9.975039595201833e-06, "loss": 0.4472, "step": 778 }, { "epoch": 0.38672844613602514, "grad_norm": 0.4337292015552521, "learning_rate": 9.974750445435104e-06, "loss": 0.4791, "step": 779 }, { "epoch": 0.387224888300513, "grad_norm": 0.44408854842185974, "learning_rate": 9.974459634734407e-06, "loss": 0.4654, "step": 780 }, { "epoch": 0.38772133046500085, "grad_norm": 0.4273568093776703, "learning_rate": 9.974167163196837e-06, "loss": 0.4502, "step": 781 }, { "epoch": 0.38821777262948864, "grad_norm": 0.45517706871032715, "learning_rate": 9.97387303092004e-06, "loss": 0.4679, "step": 782 }, { "epoch": 0.3887142147939765, "grad_norm": 0.46944743394851685, "learning_rate": 9.97357723800222e-06, "loss": 0.4833, "step": 783 }, { "epoch": 0.38921065695846435, "grad_norm": 0.4541037380695343, "learning_rate": 9.973279784542137e-06, "loss": 0.4629, "step": 784 }, { "epoch": 0.38970709912295215, "grad_norm": 0.4593735933303833, "learning_rate": 9.972980670639098e-06, "loss": 0.4366, "step": 785 }, { "epoch": 0.39020354128744, "grad_norm": 0.4297684133052826, "learning_rate": 9.972679896392973e-06, "loss": 0.4565, "step": 786 }, { "epoch": 0.39069998345192786, "grad_norm": 0.44318804144859314, "learning_rate": 9.97237746190418e-06, "loss": 0.4577, "step": 787 }, { "epoch": 0.3911964256164157, "grad_norm": 0.4611779749393463, "learning_rate": 9.972073367273694e-06, "loss": 0.4199, "step": 788 }, { "epoch": 0.3916928677809035, "grad_norm": 0.44335052371025085, "learning_rate": 9.971767612603045e-06, "loss": 0.4319, "step": 789 }, { "epoch": 0.39218930994539136, "grad_norm": 0.4529220461845398, "learning_rate": 9.971460197994314e-06, "loss": 0.4479, "step": 790 }, { "epoch": 0.3926857521098792, "grad_norm": 0.4577249586582184, "learning_rate": 9.97115112355014e-06, "loss": 0.4447, "step": 791 }, { "epoch": 0.393182194274367, "grad_norm": 0.470720499753952, "learning_rate": 9.970840389373715e-06, "loss": 0.433, "step": 792 }, { "epoch": 0.39367863643885487, "grad_norm": 0.4469447731971741, "learning_rate": 9.970527995568783e-06, "loss": 0.4661, "step": 793 }, { "epoch": 0.3941750786033427, "grad_norm": 0.46378248929977417, "learning_rate": 9.970213942239644e-06, "loss": 0.4595, "step": 794 }, { "epoch": 0.3946715207678306, "grad_norm": 0.45704886317253113, "learning_rate": 9.969898229491155e-06, "loss": 0.4596, "step": 795 }, { "epoch": 0.3951679629323184, "grad_norm": 0.4324401617050171, "learning_rate": 9.96958085742872e-06, "loss": 0.4551, "step": 796 }, { "epoch": 0.3956644050968062, "grad_norm": 0.49685418605804443, "learning_rate": 9.969261826158303e-06, "loss": 0.4451, "step": 797 }, { "epoch": 0.3961608472612941, "grad_norm": 0.44975095987319946, "learning_rate": 9.968941135786418e-06, "loss": 0.44, "step": 798 }, { "epoch": 0.3966572894257819, "grad_norm": 0.4547087252140045, "learning_rate": 9.968618786420136e-06, "loss": 0.4659, "step": 799 }, { "epoch": 0.39715373159026973, "grad_norm": 0.4910312294960022, "learning_rate": 9.968294778167083e-06, "loss": 0.4712, "step": 800 }, { "epoch": 0.3976501737547576, "grad_norm": 0.44719740748405457, "learning_rate": 9.967969111135434e-06, "loss": 0.4775, "step": 801 }, { "epoch": 0.3981466159192454, "grad_norm": 0.3687772750854492, "learning_rate": 9.96764178543392e-06, "loss": 0.442, "step": 802 }, { "epoch": 0.39864305808373324, "grad_norm": 0.5067430138587952, "learning_rate": 9.967312801171825e-06, "loss": 0.454, "step": 803 }, { "epoch": 0.3991395002482211, "grad_norm": 0.45588546991348267, "learning_rate": 9.966982158458992e-06, "loss": 0.4597, "step": 804 }, { "epoch": 0.39963594241270894, "grad_norm": 0.43663445115089417, "learning_rate": 9.96664985740581e-06, "loss": 0.4222, "step": 805 }, { "epoch": 0.40013238457719674, "grad_norm": 0.5759568214416504, "learning_rate": 9.96631589812323e-06, "loss": 0.4911, "step": 806 }, { "epoch": 0.4006288267416846, "grad_norm": 0.44393160939216614, "learning_rate": 9.965980280722744e-06, "loss": 0.4591, "step": 807 }, { "epoch": 0.40112526890617245, "grad_norm": 0.44337198138237, "learning_rate": 9.965643005316413e-06, "loss": 0.4859, "step": 808 }, { "epoch": 0.40162171107066025, "grad_norm": 0.5350134372711182, "learning_rate": 9.965304072016842e-06, "loss": 0.4406, "step": 809 }, { "epoch": 0.4021181532351481, "grad_norm": 0.44242754578590393, "learning_rate": 9.964963480937189e-06, "loss": 0.4476, "step": 810 }, { "epoch": 0.40261459539963595, "grad_norm": 0.43807685375213623, "learning_rate": 9.964621232191169e-06, "loss": 0.4503, "step": 811 }, { "epoch": 0.40311103756412375, "grad_norm": 0.4362418055534363, "learning_rate": 9.964277325893053e-06, "loss": 0.4733, "step": 812 }, { "epoch": 0.4036074797286116, "grad_norm": 0.3933125138282776, "learning_rate": 9.963931762157657e-06, "loss": 0.4356, "step": 813 }, { "epoch": 0.40410392189309946, "grad_norm": 0.4793570637702942, "learning_rate": 9.96358454110036e-06, "loss": 0.4593, "step": 814 }, { "epoch": 0.4046003640575873, "grad_norm": 0.43228474259376526, "learning_rate": 9.963235662837085e-06, "loss": 0.4669, "step": 815 }, { "epoch": 0.4050968062220751, "grad_norm": 0.44163134694099426, "learning_rate": 9.962885127484318e-06, "loss": 0.4326, "step": 816 }, { "epoch": 0.40559324838656297, "grad_norm": 0.45465216040611267, "learning_rate": 9.96253293515909e-06, "loss": 0.4271, "step": 817 }, { "epoch": 0.4060896905510508, "grad_norm": 0.3964511454105377, "learning_rate": 9.96217908597899e-06, "loss": 0.4684, "step": 818 }, { "epoch": 0.4065861327155386, "grad_norm": 0.47800129652023315, "learning_rate": 9.961823580062155e-06, "loss": 0.492, "step": 819 }, { "epoch": 0.40708257488002647, "grad_norm": 0.39003321528434753, "learning_rate": 9.961466417527283e-06, "loss": 0.4598, "step": 820 }, { "epoch": 0.4075790170445143, "grad_norm": 0.44100221991539, "learning_rate": 9.96110759849362e-06, "loss": 0.4364, "step": 821 }, { "epoch": 0.4080754592090022, "grad_norm": 0.4909787178039551, "learning_rate": 9.960747123080965e-06, "loss": 0.4636, "step": 822 }, { "epoch": 0.40857190137349, "grad_norm": 0.45945319533348083, "learning_rate": 9.96038499140967e-06, "loss": 0.443, "step": 823 }, { "epoch": 0.40906834353797783, "grad_norm": 0.4821777939796448, "learning_rate": 9.960021203600642e-06, "loss": 0.442, "step": 824 }, { "epoch": 0.4095647857024657, "grad_norm": 0.40093451738357544, "learning_rate": 9.959655759775342e-06, "loss": 0.4262, "step": 825 }, { "epoch": 0.4100612278669535, "grad_norm": 0.41921931505203247, "learning_rate": 9.95928866005578e-06, "loss": 0.4402, "step": 826 }, { "epoch": 0.41055767003144134, "grad_norm": 0.4160793423652649, "learning_rate": 9.958919904564519e-06, "loss": 0.4662, "step": 827 }, { "epoch": 0.4110541121959292, "grad_norm": 0.46481066942214966, "learning_rate": 9.958549493424678e-06, "loss": 0.4626, "step": 828 }, { "epoch": 0.411550554360417, "grad_norm": 0.38921990990638733, "learning_rate": 9.958177426759928e-06, "loss": 0.4493, "step": 829 }, { "epoch": 0.41204699652490484, "grad_norm": 0.4213390052318573, "learning_rate": 9.957803704694488e-06, "loss": 0.4331, "step": 830 }, { "epoch": 0.4125434386893927, "grad_norm": 0.48865050077438354, "learning_rate": 9.95742832735314e-06, "loss": 0.4782, "step": 831 }, { "epoch": 0.41303988085388055, "grad_norm": 0.44271498918533325, "learning_rate": 9.957051294861208e-06, "loss": 0.4761, "step": 832 }, { "epoch": 0.41353632301836835, "grad_norm": 0.4330582916736603, "learning_rate": 9.956672607344572e-06, "loss": 0.4323, "step": 833 }, { "epoch": 0.4140327651828562, "grad_norm": 0.45815330743789673, "learning_rate": 9.95629226492967e-06, "loss": 0.4258, "step": 834 }, { "epoch": 0.41452920734734405, "grad_norm": 0.494812548160553, "learning_rate": 9.955910267743486e-06, "loss": 0.4412, "step": 835 }, { "epoch": 0.41502564951183185, "grad_norm": 0.5098872780799866, "learning_rate": 9.955526615913554e-06, "loss": 0.4606, "step": 836 }, { "epoch": 0.4155220916763197, "grad_norm": 0.4558444917201996, "learning_rate": 9.95514130956797e-06, "loss": 0.457, "step": 837 }, { "epoch": 0.41601853384080756, "grad_norm": 0.5201241970062256, "learning_rate": 9.954754348835379e-06, "loss": 0.4673, "step": 838 }, { "epoch": 0.41651497600529536, "grad_norm": 0.5476892590522766, "learning_rate": 9.954365733844971e-06, "loss": 0.476, "step": 839 }, { "epoch": 0.4170114181697832, "grad_norm": 0.4569700062274933, "learning_rate": 9.953975464726495e-06, "loss": 0.4558, "step": 840 }, { "epoch": 0.41750786033427106, "grad_norm": 0.4302135109901428, "learning_rate": 9.953583541610257e-06, "loss": 0.4387, "step": 841 }, { "epoch": 0.4180043024987589, "grad_norm": 0.4827544391155243, "learning_rate": 9.953189964627102e-06, "loss": 0.4358, "step": 842 }, { "epoch": 0.4185007446632467, "grad_norm": 0.4920971393585205, "learning_rate": 9.95279473390844e-06, "loss": 0.4535, "step": 843 }, { "epoch": 0.41899718682773457, "grad_norm": 0.48830318450927734, "learning_rate": 9.952397849586225e-06, "loss": 0.4254, "step": 844 }, { "epoch": 0.4194936289922224, "grad_norm": 0.4739255905151367, "learning_rate": 9.951999311792966e-06, "loss": 0.4714, "step": 845 }, { "epoch": 0.4199900711567102, "grad_norm": 0.5125837326049805, "learning_rate": 9.951599120661726e-06, "loss": 0.4518, "step": 846 }, { "epoch": 0.4204865133211981, "grad_norm": 0.45180732011795044, "learning_rate": 9.951197276326117e-06, "loss": 0.4313, "step": 847 }, { "epoch": 0.42098295548568593, "grad_norm": 0.4402235150337219, "learning_rate": 9.950793778920303e-06, "loss": 0.4635, "step": 848 }, { "epoch": 0.4214793976501738, "grad_norm": 0.45105183124542236, "learning_rate": 9.950388628579e-06, "loss": 0.4806, "step": 849 }, { "epoch": 0.4219758398146616, "grad_norm": 0.403543621301651, "learning_rate": 9.94998182543748e-06, "loss": 0.4299, "step": 850 }, { "epoch": 0.42247228197914943, "grad_norm": 0.477633535861969, "learning_rate": 9.94957336963156e-06, "loss": 0.4835, "step": 851 }, { "epoch": 0.4229687241436373, "grad_norm": 0.4411155581474304, "learning_rate": 9.949163261297616e-06, "loss": 0.4593, "step": 852 }, { "epoch": 0.4234651663081251, "grad_norm": 0.47250181436538696, "learning_rate": 9.948751500572568e-06, "loss": 0.4254, "step": 853 }, { "epoch": 0.42396160847261294, "grad_norm": 0.4608052670955658, "learning_rate": 9.948338087593894e-06, "loss": 0.4158, "step": 854 }, { "epoch": 0.4244580506371008, "grad_norm": 0.43747368454933167, "learning_rate": 9.94792302249962e-06, "loss": 0.4378, "step": 855 }, { "epoch": 0.4249544928015886, "grad_norm": 0.46896636486053467, "learning_rate": 9.947506305428328e-06, "loss": 0.4513, "step": 856 }, { "epoch": 0.42545093496607644, "grad_norm": 0.44210490584373474, "learning_rate": 9.947087936519143e-06, "loss": 0.4489, "step": 857 }, { "epoch": 0.4259473771305643, "grad_norm": 0.420298308134079, "learning_rate": 9.946667915911754e-06, "loss": 0.4522, "step": 858 }, { "epoch": 0.42644381929505215, "grad_norm": 0.4735203981399536, "learning_rate": 9.94624624374639e-06, "loss": 0.4593, "step": 859 }, { "epoch": 0.42694026145953995, "grad_norm": 0.42980650067329407, "learning_rate": 9.945822920163835e-06, "loss": 0.4471, "step": 860 }, { "epoch": 0.4274367036240278, "grad_norm": 0.4183298349380493, "learning_rate": 9.945397945305428e-06, "loss": 0.4218, "step": 861 }, { "epoch": 0.42793314578851566, "grad_norm": 0.4172977805137634, "learning_rate": 9.944971319313055e-06, "loss": 0.4294, "step": 862 }, { "epoch": 0.42842958795300345, "grad_norm": 0.39395618438720703, "learning_rate": 9.944543042329157e-06, "loss": 0.4463, "step": 863 }, { "epoch": 0.4289260301174913, "grad_norm": 0.44454315304756165, "learning_rate": 9.94411311449672e-06, "loss": 0.475, "step": 864 }, { "epoch": 0.42942247228197916, "grad_norm": 0.49097609519958496, "learning_rate": 9.94368153595929e-06, "loss": 0.4353, "step": 865 }, { "epoch": 0.42991891444646696, "grad_norm": 0.4718441367149353, "learning_rate": 9.943248306860956e-06, "loss": 0.4655, "step": 866 }, { "epoch": 0.4304153566109548, "grad_norm": 0.4770773947238922, "learning_rate": 9.942813427346363e-06, "loss": 0.4281, "step": 867 }, { "epoch": 0.43091179877544267, "grad_norm": 0.4899311363697052, "learning_rate": 9.942376897560703e-06, "loss": 0.4718, "step": 868 }, { "epoch": 0.4314082409399305, "grad_norm": 0.4272003173828125, "learning_rate": 9.941938717649724e-06, "loss": 0.4107, "step": 869 }, { "epoch": 0.4319046831044183, "grad_norm": 0.4432256817817688, "learning_rate": 9.941498887759724e-06, "loss": 0.4345, "step": 870 }, { "epoch": 0.4324011252689062, "grad_norm": 0.4196687936782837, "learning_rate": 9.941057408037546e-06, "loss": 0.4549, "step": 871 }, { "epoch": 0.432897567433394, "grad_norm": 0.4436062276363373, "learning_rate": 9.94061427863059e-06, "loss": 0.4721, "step": 872 }, { "epoch": 0.4333940095978818, "grad_norm": 0.4296574294567108, "learning_rate": 9.940169499686803e-06, "loss": 0.4506, "step": 873 }, { "epoch": 0.4338904517623697, "grad_norm": 0.4823082983493805, "learning_rate": 9.93972307135469e-06, "loss": 0.4837, "step": 874 }, { "epoch": 0.43438689392685753, "grad_norm": 0.46853548288345337, "learning_rate": 9.939274993783295e-06, "loss": 0.4447, "step": 875 }, { "epoch": 0.4348833360913454, "grad_norm": 0.43947356939315796, "learning_rate": 9.938825267122223e-06, "loss": 0.4362, "step": 876 }, { "epoch": 0.4353797782558332, "grad_norm": 0.5535815954208374, "learning_rate": 9.938373891521622e-06, "loss": 0.4606, "step": 877 }, { "epoch": 0.43587622042032104, "grad_norm": 0.4019046425819397, "learning_rate": 9.937920867132199e-06, "loss": 0.4599, "step": 878 }, { "epoch": 0.4363726625848089, "grad_norm": 0.4957868158817291, "learning_rate": 9.937466194105202e-06, "loss": 0.4534, "step": 879 }, { "epoch": 0.4368691047492967, "grad_norm": 0.4275073707103729, "learning_rate": 9.937009872592435e-06, "loss": 0.4667, "step": 880 }, { "epoch": 0.43736554691378454, "grad_norm": 0.42577898502349854, "learning_rate": 9.936551902746255e-06, "loss": 0.4739, "step": 881 }, { "epoch": 0.4378619890782724, "grad_norm": 0.49710920453071594, "learning_rate": 9.93609228471956e-06, "loss": 0.4573, "step": 882 }, { "epoch": 0.4383584312427602, "grad_norm": 0.4017353355884552, "learning_rate": 9.935631018665808e-06, "loss": 0.4463, "step": 883 }, { "epoch": 0.43885487340724805, "grad_norm": 0.5104085206985474, "learning_rate": 9.935168104739002e-06, "loss": 0.4536, "step": 884 }, { "epoch": 0.4393513155717359, "grad_norm": 0.4130265712738037, "learning_rate": 9.934703543093695e-06, "loss": 0.4228, "step": 885 }, { "epoch": 0.43984775773622375, "grad_norm": 0.45690828561782837, "learning_rate": 9.934237333884994e-06, "loss": 0.4465, "step": 886 }, { "epoch": 0.44034419990071155, "grad_norm": 0.4949433207511902, "learning_rate": 9.933769477268552e-06, "loss": 0.4432, "step": 887 }, { "epoch": 0.4408406420651994, "grad_norm": 0.44475415349006653, "learning_rate": 9.933299973400574e-06, "loss": 0.4571, "step": 888 }, { "epoch": 0.44133708422968726, "grad_norm": 0.4890322685241699, "learning_rate": 9.932828822437815e-06, "loss": 0.445, "step": 889 }, { "epoch": 0.44183352639417506, "grad_norm": 0.4410271942615509, "learning_rate": 9.932356024537577e-06, "loss": 0.421, "step": 890 }, { "epoch": 0.4423299685586629, "grad_norm": 0.4681800305843353, "learning_rate": 9.931881579857719e-06, "loss": 0.4307, "step": 891 }, { "epoch": 0.44282641072315077, "grad_norm": 0.4934321939945221, "learning_rate": 9.931405488556642e-06, "loss": 0.4612, "step": 892 }, { "epoch": 0.44332285288763856, "grad_norm": 0.405764639377594, "learning_rate": 9.930927750793298e-06, "loss": 0.4454, "step": 893 }, { "epoch": 0.4438192950521264, "grad_norm": 0.4520834684371948, "learning_rate": 9.930448366727197e-06, "loss": 0.4472, "step": 894 }, { "epoch": 0.44431573721661427, "grad_norm": 0.40851593017578125, "learning_rate": 9.929967336518387e-06, "loss": 0.4574, "step": 895 }, { "epoch": 0.4448121793811021, "grad_norm": 0.44736841320991516, "learning_rate": 9.929484660327472e-06, "loss": 0.4543, "step": 896 }, { "epoch": 0.4453086215455899, "grad_norm": 0.4378875195980072, "learning_rate": 9.929000338315604e-06, "loss": 0.4512, "step": 897 }, { "epoch": 0.4458050637100778, "grad_norm": 0.4314629137516022, "learning_rate": 9.928514370644487e-06, "loss": 0.4224, "step": 898 }, { "epoch": 0.44630150587456563, "grad_norm": 0.46567705273628235, "learning_rate": 9.92802675747637e-06, "loss": 0.4514, "step": 899 }, { "epoch": 0.4467979480390534, "grad_norm": 0.490694135427475, "learning_rate": 9.927537498974059e-06, "loss": 0.4681, "step": 900 }, { "epoch": 0.4472943902035413, "grad_norm": 0.48219695687294006, "learning_rate": 9.927046595300895e-06, "loss": 0.4248, "step": 901 }, { "epoch": 0.44779083236802913, "grad_norm": 0.42192763090133667, "learning_rate": 9.926554046620785e-06, "loss": 0.4642, "step": 902 }, { "epoch": 0.448287274532517, "grad_norm": 0.4466627538204193, "learning_rate": 9.926059853098175e-06, "loss": 0.4518, "step": 903 }, { "epoch": 0.4487837166970048, "grad_norm": 0.41284987330436707, "learning_rate": 9.925564014898063e-06, "loss": 0.4335, "step": 904 }, { "epoch": 0.44928015886149264, "grad_norm": 0.4035791754722595, "learning_rate": 9.925066532185996e-06, "loss": 0.4469, "step": 905 }, { "epoch": 0.4497766010259805, "grad_norm": 0.43450620770454407, "learning_rate": 9.924567405128069e-06, "loss": 0.4654, "step": 906 }, { "epoch": 0.4502730431904683, "grad_norm": 0.4006285071372986, "learning_rate": 9.924066633890929e-06, "loss": 0.4568, "step": 907 }, { "epoch": 0.45076948535495615, "grad_norm": 0.43048474192619324, "learning_rate": 9.923564218641768e-06, "loss": 0.4195, "step": 908 }, { "epoch": 0.451265927519444, "grad_norm": 0.5081758499145508, "learning_rate": 9.92306015954833e-06, "loss": 0.4638, "step": 909 }, { "epoch": 0.4517623696839318, "grad_norm": 0.5051038265228271, "learning_rate": 9.922554456778905e-06, "loss": 0.4738, "step": 910 }, { "epoch": 0.45225881184841965, "grad_norm": 0.5413469076156616, "learning_rate": 9.922047110502335e-06, "loss": 0.4326, "step": 911 }, { "epoch": 0.4527552540129075, "grad_norm": 0.36954253911972046, "learning_rate": 9.921538120888007e-06, "loss": 0.43, "step": 912 }, { "epoch": 0.45325169617739536, "grad_norm": 0.4378874599933624, "learning_rate": 9.921027488105864e-06, "loss": 0.4296, "step": 913 }, { "epoch": 0.45374813834188316, "grad_norm": 0.4973445534706116, "learning_rate": 9.920515212326386e-06, "loss": 0.4434, "step": 914 }, { "epoch": 0.454244580506371, "grad_norm": 0.40948498249053955, "learning_rate": 9.920001293720612e-06, "loss": 0.4474, "step": 915 }, { "epoch": 0.45474102267085886, "grad_norm": 0.49535462260246277, "learning_rate": 9.919485732460123e-06, "loss": 0.4638, "step": 916 }, { "epoch": 0.45523746483534666, "grad_norm": 0.4605354964733124, "learning_rate": 9.918968528717053e-06, "loss": 0.4496, "step": 917 }, { "epoch": 0.4557339069998345, "grad_norm": 0.4518708884716034, "learning_rate": 9.918449682664082e-06, "loss": 0.4672, "step": 918 }, { "epoch": 0.45623034916432237, "grad_norm": 0.47889459133148193, "learning_rate": 9.917929194474438e-06, "loss": 0.4538, "step": 919 }, { "epoch": 0.45672679132881017, "grad_norm": 0.4152355492115021, "learning_rate": 9.917407064321897e-06, "loss": 0.4475, "step": 920 }, { "epoch": 0.457223233493298, "grad_norm": 0.3902584910392761, "learning_rate": 9.916883292380786e-06, "loss": 0.4755, "step": 921 }, { "epoch": 0.4577196756577859, "grad_norm": 0.4627668559551239, "learning_rate": 9.916357878825974e-06, "loss": 0.4487, "step": 922 }, { "epoch": 0.4582161178222737, "grad_norm": 0.42386484146118164, "learning_rate": 9.91583082383289e-06, "loss": 0.4369, "step": 923 }, { "epoch": 0.4587125599867615, "grad_norm": 0.40957245230674744, "learning_rate": 9.915302127577496e-06, "loss": 0.4436, "step": 924 }, { "epoch": 0.4592090021512494, "grad_norm": 0.5021793246269226, "learning_rate": 9.914771790236313e-06, "loss": 0.4694, "step": 925 }, { "epoch": 0.45970544431573723, "grad_norm": 0.4069765508174896, "learning_rate": 9.914239811986406e-06, "loss": 0.4478, "step": 926 }, { "epoch": 0.46020188648022503, "grad_norm": 0.45261216163635254, "learning_rate": 9.913706193005386e-06, "loss": 0.4601, "step": 927 }, { "epoch": 0.4606983286447129, "grad_norm": 0.41394373774528503, "learning_rate": 9.913170933471416e-06, "loss": 0.4383, "step": 928 }, { "epoch": 0.46119477080920074, "grad_norm": 0.4105486273765564, "learning_rate": 9.912634033563205e-06, "loss": 0.4375, "step": 929 }, { "epoch": 0.4616912129736886, "grad_norm": 0.39754047989845276, "learning_rate": 9.912095493460005e-06, "loss": 0.4401, "step": 930 }, { "epoch": 0.4621876551381764, "grad_norm": 0.38555729389190674, "learning_rate": 9.911555313341625e-06, "loss": 0.4083, "step": 931 }, { "epoch": 0.46268409730266424, "grad_norm": 0.36639195680618286, "learning_rate": 9.911013493388416e-06, "loss": 0.4249, "step": 932 }, { "epoch": 0.4631805394671521, "grad_norm": 0.4437360465526581, "learning_rate": 9.910470033781274e-06, "loss": 0.4527, "step": 933 }, { "epoch": 0.4636769816316399, "grad_norm": 0.4479067325592041, "learning_rate": 9.909924934701647e-06, "loss": 0.4294, "step": 934 }, { "epoch": 0.46417342379612775, "grad_norm": 0.3711012899875641, "learning_rate": 9.909378196331527e-06, "loss": 0.4383, "step": 935 }, { "epoch": 0.4646698659606156, "grad_norm": 0.39012885093688965, "learning_rate": 9.908829818853459e-06, "loss": 0.4234, "step": 936 }, { "epoch": 0.4651663081251034, "grad_norm": 0.5056318640708923, "learning_rate": 9.908279802450529e-06, "loss": 0.451, "step": 937 }, { "epoch": 0.46566275028959125, "grad_norm": 0.4175751805305481, "learning_rate": 9.907728147306373e-06, "loss": 0.4331, "step": 938 }, { "epoch": 0.4661591924540791, "grad_norm": 0.44483646750450134, "learning_rate": 9.90717485360517e-06, "loss": 0.4471, "step": 939 }, { "epoch": 0.46665563461856696, "grad_norm": 0.4274883270263672, "learning_rate": 9.906619921531658e-06, "loss": 0.4642, "step": 940 }, { "epoch": 0.46715207678305476, "grad_norm": 0.4446362555027008, "learning_rate": 9.906063351271104e-06, "loss": 0.4538, "step": 941 }, { "epoch": 0.4676485189475426, "grad_norm": 0.41237977147102356, "learning_rate": 9.90550514300934e-06, "loss": 0.4236, "step": 942 }, { "epoch": 0.46814496111203047, "grad_norm": 0.40541309118270874, "learning_rate": 9.904945296932731e-06, "loss": 0.4769, "step": 943 }, { "epoch": 0.46864140327651826, "grad_norm": 0.39442679286003113, "learning_rate": 9.904383813228197e-06, "loss": 0.4244, "step": 944 }, { "epoch": 0.4691378454410061, "grad_norm": 0.3929031789302826, "learning_rate": 9.9038206920832e-06, "loss": 0.4455, "step": 945 }, { "epoch": 0.46963428760549397, "grad_norm": 0.39185142517089844, "learning_rate": 9.903255933685755e-06, "loss": 0.4256, "step": 946 }, { "epoch": 0.47013072976998177, "grad_norm": 0.4270944595336914, "learning_rate": 9.902689538224415e-06, "loss": 0.4783, "step": 947 }, { "epoch": 0.4706271719344696, "grad_norm": 0.4495737552642822, "learning_rate": 9.902121505888287e-06, "loss": 0.4664, "step": 948 }, { "epoch": 0.4711236140989575, "grad_norm": 0.4212367534637451, "learning_rate": 9.90155183686702e-06, "loss": 0.4837, "step": 949 }, { "epoch": 0.47162005626344533, "grad_norm": 0.44150209426879883, "learning_rate": 9.900980531350813e-06, "loss": 0.4271, "step": 950 }, { "epoch": 0.47211649842793313, "grad_norm": 0.4236411452293396, "learning_rate": 9.900407589530405e-06, "loss": 0.4306, "step": 951 }, { "epoch": 0.472612940592421, "grad_norm": 0.4233941435813904, "learning_rate": 9.89983301159709e-06, "loss": 0.4611, "step": 952 }, { "epoch": 0.47310938275690884, "grad_norm": 0.389588326215744, "learning_rate": 9.899256797742702e-06, "loss": 0.4184, "step": 953 }, { "epoch": 0.47360582492139663, "grad_norm": 0.40490585565567017, "learning_rate": 9.898678948159625e-06, "loss": 0.4331, "step": 954 }, { "epoch": 0.4741022670858845, "grad_norm": 0.39457905292510986, "learning_rate": 9.898099463040784e-06, "loss": 0.4241, "step": 955 }, { "epoch": 0.47459870925037234, "grad_norm": 0.36943021416664124, "learning_rate": 9.897518342579657e-06, "loss": 0.4444, "step": 956 }, { "epoch": 0.4750951514148602, "grad_norm": 0.4135393500328064, "learning_rate": 9.896935586970262e-06, "loss": 0.4567, "step": 957 }, { "epoch": 0.475591593579348, "grad_norm": 0.4185926616191864, "learning_rate": 9.896351196407166e-06, "loss": 0.4316, "step": 958 }, { "epoch": 0.47608803574383585, "grad_norm": 0.3894549608230591, "learning_rate": 9.89576517108548e-06, "loss": 0.4669, "step": 959 }, { "epoch": 0.4765844779083237, "grad_norm": 0.3933331370353699, "learning_rate": 9.895177511200864e-06, "loss": 0.434, "step": 960 }, { "epoch": 0.4770809200728115, "grad_norm": 0.40891775488853455, "learning_rate": 9.89458821694952e-06, "loss": 0.4343, "step": 961 }, { "epoch": 0.47757736223729935, "grad_norm": 0.370406836271286, "learning_rate": 9.893997288528198e-06, "loss": 0.4444, "step": 962 }, { "epoch": 0.4780738044017872, "grad_norm": 0.3936581611633301, "learning_rate": 9.893404726134193e-06, "loss": 0.4326, "step": 963 }, { "epoch": 0.478570246566275, "grad_norm": 0.4038683772087097, "learning_rate": 9.892810529965344e-06, "loss": 0.4362, "step": 964 }, { "epoch": 0.47906668873076286, "grad_norm": 0.42634937167167664, "learning_rate": 9.89221470022004e-06, "loss": 0.4556, "step": 965 }, { "epoch": 0.4795631308952507, "grad_norm": 0.3812307119369507, "learning_rate": 9.891617237097209e-06, "loss": 0.4502, "step": 966 }, { "epoch": 0.48005957305973856, "grad_norm": 0.4029131531715393, "learning_rate": 9.891018140796332e-06, "loss": 0.4593, "step": 967 }, { "epoch": 0.48055601522422636, "grad_norm": 0.4463265538215637, "learning_rate": 9.890417411517426e-06, "loss": 0.4919, "step": 968 }, { "epoch": 0.4810524573887142, "grad_norm": 0.40043652057647705, "learning_rate": 9.889815049461062e-06, "loss": 0.4375, "step": 969 }, { "epoch": 0.48154889955320207, "grad_norm": 0.4241425096988678, "learning_rate": 9.88921105482835e-06, "loss": 0.469, "step": 970 }, { "epoch": 0.48204534171768987, "grad_norm": 0.4270949959754944, "learning_rate": 9.888605427820947e-06, "loss": 0.4477, "step": 971 }, { "epoch": 0.4825417838821777, "grad_norm": 0.436489999294281, "learning_rate": 9.887998168641057e-06, "loss": 0.4379, "step": 972 }, { "epoch": 0.4830382260466656, "grad_norm": 0.41422900557518005, "learning_rate": 9.887389277491429e-06, "loss": 0.4819, "step": 973 }, { "epoch": 0.4835346682111534, "grad_norm": 0.4311436712741852, "learning_rate": 9.88677875457535e-06, "loss": 0.4529, "step": 974 }, { "epoch": 0.4840311103756412, "grad_norm": 0.40417003631591797, "learning_rate": 9.886166600096662e-06, "loss": 0.437, "step": 975 }, { "epoch": 0.4845275525401291, "grad_norm": 0.41669604182243347, "learning_rate": 9.885552814259746e-06, "loss": 0.4232, "step": 976 }, { "epoch": 0.48502399470461693, "grad_norm": 0.43506285548210144, "learning_rate": 9.884937397269525e-06, "loss": 0.4276, "step": 977 }, { "epoch": 0.48552043686910473, "grad_norm": 0.39480060338974, "learning_rate": 9.884320349331474e-06, "loss": 0.4417, "step": 978 }, { "epoch": 0.4860168790335926, "grad_norm": 1.0900508165359497, "learning_rate": 9.883701670651607e-06, "loss": 0.4254, "step": 979 }, { "epoch": 0.48651332119808044, "grad_norm": 0.3909410536289215, "learning_rate": 9.883081361436482e-06, "loss": 0.4609, "step": 980 }, { "epoch": 0.48700976336256824, "grad_norm": 0.40695449709892273, "learning_rate": 9.882459421893206e-06, "loss": 0.4502, "step": 981 }, { "epoch": 0.4875062055270561, "grad_norm": 0.4045730531215668, "learning_rate": 9.881835852229427e-06, "loss": 0.4413, "step": 982 }, { "epoch": 0.48800264769154394, "grad_norm": 0.4155671000480652, "learning_rate": 9.881210652653338e-06, "loss": 0.4604, "step": 983 }, { "epoch": 0.4884990898560318, "grad_norm": 0.4051297903060913, "learning_rate": 9.880583823373676e-06, "loss": 0.4673, "step": 984 }, { "epoch": 0.4889955320205196, "grad_norm": 0.4878663420677185, "learning_rate": 9.879955364599722e-06, "loss": 0.4506, "step": 985 }, { "epoch": 0.48949197418500745, "grad_norm": 0.42784014344215393, "learning_rate": 9.879325276541303e-06, "loss": 0.435, "step": 986 }, { "epoch": 0.4899884163494953, "grad_norm": 0.42198318243026733, "learning_rate": 9.878693559408785e-06, "loss": 0.4543, "step": 987 }, { "epoch": 0.4904848585139831, "grad_norm": 0.4310964345932007, "learning_rate": 9.878060213413083e-06, "loss": 0.456, "step": 988 }, { "epoch": 0.49098130067847096, "grad_norm": 0.43341559171676636, "learning_rate": 9.877425238765657e-06, "loss": 0.4399, "step": 989 }, { "epoch": 0.4914777428429588, "grad_norm": 0.4136214256286621, "learning_rate": 9.876788635678502e-06, "loss": 0.4388, "step": 990 }, { "epoch": 0.4919741850074466, "grad_norm": 0.547549843788147, "learning_rate": 9.876150404364166e-06, "loss": 0.4476, "step": 991 }, { "epoch": 0.49247062717193446, "grad_norm": 0.38803765177726746, "learning_rate": 9.875510545035736e-06, "loss": 0.4279, "step": 992 }, { "epoch": 0.4929670693364223, "grad_norm": 0.537960946559906, "learning_rate": 9.874869057906844e-06, "loss": 0.4569, "step": 993 }, { "epoch": 0.49346351150091017, "grad_norm": 0.4341985881328583, "learning_rate": 9.874225943191666e-06, "loss": 0.4028, "step": 994 }, { "epoch": 0.49395995366539797, "grad_norm": 0.48057854175567627, "learning_rate": 9.87358120110492e-06, "loss": 0.4444, "step": 995 }, { "epoch": 0.4944563958298858, "grad_norm": 0.42116060853004456, "learning_rate": 9.872934831861867e-06, "loss": 0.4607, "step": 996 }, { "epoch": 0.4949528379943737, "grad_norm": 0.4386623799800873, "learning_rate": 9.872286835678313e-06, "loss": 0.4183, "step": 997 }, { "epoch": 0.49544928015886147, "grad_norm": 0.4305936396121979, "learning_rate": 9.871637212770606e-06, "loss": 0.4707, "step": 998 }, { "epoch": 0.4959457223233493, "grad_norm": 0.4585387408733368, "learning_rate": 9.870985963355636e-06, "loss": 0.4369, "step": 999 }, { "epoch": 0.4964421644878372, "grad_norm": 0.41592171788215637, "learning_rate": 9.87033308765084e-06, "loss": 0.4399, "step": 1000 }, { "epoch": 0.496938606652325, "grad_norm": 0.44728997349739075, "learning_rate": 9.869678585874193e-06, "loss": 0.447, "step": 1001 }, { "epoch": 0.49743504881681283, "grad_norm": 0.4444276988506317, "learning_rate": 9.86902245824422e-06, "loss": 0.4123, "step": 1002 }, { "epoch": 0.4979314909813007, "grad_norm": 0.4298102557659149, "learning_rate": 9.868364704979977e-06, "loss": 0.4487, "step": 1003 }, { "epoch": 0.49842793314578854, "grad_norm": 0.40809527039527893, "learning_rate": 9.867705326301077e-06, "loss": 0.4595, "step": 1004 }, { "epoch": 0.49892437531027634, "grad_norm": 0.3991101086139679, "learning_rate": 9.867044322427663e-06, "loss": 0.4326, "step": 1005 }, { "epoch": 0.4994208174747642, "grad_norm": 0.3953841030597687, "learning_rate": 9.86638169358043e-06, "loss": 0.4529, "step": 1006 }, { "epoch": 0.49991725963925204, "grad_norm": 0.39102113246917725, "learning_rate": 9.865717439980611e-06, "loss": 0.4391, "step": 1007 }, { "epoch": 0.5004137018037399, "grad_norm": 0.42161303758621216, "learning_rate": 9.86505156184998e-06, "loss": 0.4311, "step": 1008 }, { "epoch": 0.5009101439682278, "grad_norm": 0.4036874771118164, "learning_rate": 9.864384059410858e-06, "loss": 0.4326, "step": 1009 }, { "epoch": 0.5014065861327155, "grad_norm": 0.40938684344291687, "learning_rate": 9.863714932886106e-06, "loss": 0.4422, "step": 1010 }, { "epoch": 0.5019030282972033, "grad_norm": 0.39215776324272156, "learning_rate": 9.863044182499126e-06, "loss": 0.4454, "step": 1011 }, { "epoch": 0.5023994704616912, "grad_norm": 0.3710356652736664, "learning_rate": 9.862371808473862e-06, "loss": 0.4118, "step": 1012 }, { "epoch": 0.502895912626179, "grad_norm": 0.38487741351127625, "learning_rate": 9.861697811034805e-06, "loss": 0.4287, "step": 1013 }, { "epoch": 0.5033923547906669, "grad_norm": 0.3613832890987396, "learning_rate": 9.861022190406982e-06, "loss": 0.4301, "step": 1014 }, { "epoch": 0.5038887969551548, "grad_norm": 0.39821162819862366, "learning_rate": 9.860344946815966e-06, "loss": 0.4534, "step": 1015 }, { "epoch": 0.5043852391196426, "grad_norm": 0.40819066762924194, "learning_rate": 9.859666080487868e-06, "loss": 0.4444, "step": 1016 }, { "epoch": 0.5048816812841304, "grad_norm": 0.3523252308368683, "learning_rate": 9.858985591649343e-06, "loss": 0.4124, "step": 1017 }, { "epoch": 0.5053781234486182, "grad_norm": 0.45549070835113525, "learning_rate": 9.85830348052759e-06, "loss": 0.4396, "step": 1018 }, { "epoch": 0.5058745656131061, "grad_norm": 0.4199966490268707, "learning_rate": 9.857619747350346e-06, "loss": 0.4464, "step": 1019 }, { "epoch": 0.5063710077775939, "grad_norm": 0.41206297278404236, "learning_rate": 9.856934392345892e-06, "loss": 0.4396, "step": 1020 }, { "epoch": 0.5068674499420818, "grad_norm": 0.37858814001083374, "learning_rate": 9.856247415743048e-06, "loss": 0.4127, "step": 1021 }, { "epoch": 0.5073638921065696, "grad_norm": 0.3800447881221771, "learning_rate": 9.855558817771177e-06, "loss": 0.4359, "step": 1022 }, { "epoch": 0.5078603342710574, "grad_norm": 0.43333229422569275, "learning_rate": 9.854868598660184e-06, "loss": 0.4409, "step": 1023 }, { "epoch": 0.5083567764355452, "grad_norm": 0.43846574425697327, "learning_rate": 9.854176758640513e-06, "loss": 0.4307, "step": 1024 }, { "epoch": 0.5088532186000331, "grad_norm": 0.47530990839004517, "learning_rate": 9.853483297943153e-06, "loss": 0.4529, "step": 1025 }, { "epoch": 0.5093496607645209, "grad_norm": 0.35539692640304565, "learning_rate": 9.85278821679963e-06, "loss": 0.4379, "step": 1026 }, { "epoch": 0.5098461029290088, "grad_norm": 0.5066800117492676, "learning_rate": 9.852091515442012e-06, "loss": 0.4437, "step": 1027 }, { "epoch": 0.5103425450934966, "grad_norm": 0.4272255599498749, "learning_rate": 9.85139319410291e-06, "loss": 0.4421, "step": 1028 }, { "epoch": 0.5108389872579845, "grad_norm": 0.4576813280582428, "learning_rate": 9.850693253015473e-06, "loss": 0.4374, "step": 1029 }, { "epoch": 0.5113354294224722, "grad_norm": 0.43643760681152344, "learning_rate": 9.849991692413394e-06, "loss": 0.4327, "step": 1030 }, { "epoch": 0.5118318715869601, "grad_norm": 0.44543421268463135, "learning_rate": 9.849288512530906e-06, "loss": 0.4785, "step": 1031 }, { "epoch": 0.5123283137514479, "grad_norm": 0.4664360284805298, "learning_rate": 9.848583713602777e-06, "loss": 0.4434, "step": 1032 }, { "epoch": 0.5128247559159358, "grad_norm": 0.3997759222984314, "learning_rate": 9.847877295864326e-06, "loss": 0.4304, "step": 1033 }, { "epoch": 0.5133211980804236, "grad_norm": 0.41222214698791504, "learning_rate": 9.847169259551403e-06, "loss": 0.4357, "step": 1034 }, { "epoch": 0.5138176402449115, "grad_norm": 0.46146491169929504, "learning_rate": 9.846459604900403e-06, "loss": 0.4554, "step": 1035 }, { "epoch": 0.5143140824093994, "grad_norm": 0.46253007650375366, "learning_rate": 9.845748332148259e-06, "loss": 0.4776, "step": 1036 }, { "epoch": 0.5148105245738871, "grad_norm": 0.42322590947151184, "learning_rate": 9.845035441532448e-06, "loss": 0.4547, "step": 1037 }, { "epoch": 0.515306966738375, "grad_norm": 0.4566199481487274, "learning_rate": 9.844320933290986e-06, "loss": 0.4387, "step": 1038 }, { "epoch": 0.5158034089028628, "grad_norm": 0.39536648988723755, "learning_rate": 9.843604807662422e-06, "loss": 0.4471, "step": 1039 }, { "epoch": 0.5162998510673507, "grad_norm": 0.40988093614578247, "learning_rate": 9.842887064885856e-06, "loss": 0.4463, "step": 1040 }, { "epoch": 0.5167962932318385, "grad_norm": 0.5053913593292236, "learning_rate": 9.842167705200923e-06, "loss": 0.4497, "step": 1041 }, { "epoch": 0.5172927353963264, "grad_norm": 0.3870870769023895, "learning_rate": 9.841446728847795e-06, "loss": 0.4316, "step": 1042 }, { "epoch": 0.5177891775608142, "grad_norm": 0.4454042911529541, "learning_rate": 9.840724136067186e-06, "loss": 0.4465, "step": 1043 }, { "epoch": 0.518285619725302, "grad_norm": 0.5183713436126709, "learning_rate": 9.839999927100354e-06, "loss": 0.4377, "step": 1044 }, { "epoch": 0.5187820618897898, "grad_norm": 0.40680569410324097, "learning_rate": 9.839274102189089e-06, "loss": 0.4556, "step": 1045 }, { "epoch": 0.5192785040542777, "grad_norm": 0.5411706566810608, "learning_rate": 9.838546661575725e-06, "loss": 0.4553, "step": 1046 }, { "epoch": 0.5197749462187655, "grad_norm": 0.48632004857063293, "learning_rate": 9.837817605503134e-06, "loss": 0.444, "step": 1047 }, { "epoch": 0.5202713883832534, "grad_norm": 0.48219895362854004, "learning_rate": 9.837086934214733e-06, "loss": 0.4616, "step": 1048 }, { "epoch": 0.5207678305477412, "grad_norm": 0.5294618010520935, "learning_rate": 9.836354647954467e-06, "loss": 0.4649, "step": 1049 }, { "epoch": 0.521264272712229, "grad_norm": 0.402498722076416, "learning_rate": 9.835620746966829e-06, "loss": 0.464, "step": 1050 }, { "epoch": 0.5217607148767168, "grad_norm": 0.3926226794719696, "learning_rate": 9.834885231496847e-06, "loss": 0.4319, "step": 1051 }, { "epoch": 0.5222571570412047, "grad_norm": 0.4574315547943115, "learning_rate": 9.834148101790093e-06, "loss": 0.4277, "step": 1052 }, { "epoch": 0.5227535992056925, "grad_norm": 0.41065502166748047, "learning_rate": 9.833409358092673e-06, "loss": 0.4371, "step": 1053 }, { "epoch": 0.5232500413701804, "grad_norm": 0.41410186886787415, "learning_rate": 9.832669000651231e-06, "loss": 0.4339, "step": 1054 }, { "epoch": 0.5237464835346682, "grad_norm": 0.4922197163105011, "learning_rate": 9.831927029712957e-06, "loss": 0.465, "step": 1055 }, { "epoch": 0.5242429256991561, "grad_norm": 0.41521695256233215, "learning_rate": 9.831183445525571e-06, "loss": 0.4423, "step": 1056 }, { "epoch": 0.5247393678636438, "grad_norm": 0.4148644208908081, "learning_rate": 9.830438248337337e-06, "loss": 0.4647, "step": 1057 }, { "epoch": 0.5252358100281317, "grad_norm": 0.526225745677948, "learning_rate": 9.829691438397056e-06, "loss": 0.436, "step": 1058 }, { "epoch": 0.5257322521926195, "grad_norm": 0.4354744851589203, "learning_rate": 9.828943015954066e-06, "loss": 0.4335, "step": 1059 }, { "epoch": 0.5262286943571074, "grad_norm": 0.37658125162124634, "learning_rate": 9.828192981258249e-06, "loss": 0.4261, "step": 1060 }, { "epoch": 0.5267251365215952, "grad_norm": 0.5266714096069336, "learning_rate": 9.827441334560017e-06, "loss": 0.4703, "step": 1061 }, { "epoch": 0.5272215786860831, "grad_norm": 0.4350641071796417, "learning_rate": 9.826688076110328e-06, "loss": 0.4574, "step": 1062 }, { "epoch": 0.527718020850571, "grad_norm": 0.4225671887397766, "learning_rate": 9.82593320616067e-06, "loss": 0.4344, "step": 1063 }, { "epoch": 0.5282144630150587, "grad_norm": 0.5400461554527283, "learning_rate": 9.825176724963075e-06, "loss": 0.4526, "step": 1064 }, { "epoch": 0.5287109051795466, "grad_norm": 0.4571073353290558, "learning_rate": 9.824418632770115e-06, "loss": 0.4415, "step": 1065 }, { "epoch": 0.5292073473440344, "grad_norm": 0.449850857257843, "learning_rate": 9.823658929834892e-06, "loss": 0.4477, "step": 1066 }, { "epoch": 0.5297037895085223, "grad_norm": 0.4859752058982849, "learning_rate": 9.822897616411055e-06, "loss": 0.4779, "step": 1067 }, { "epoch": 0.5302002316730101, "grad_norm": 0.4832402169704437, "learning_rate": 9.82213469275278e-06, "loss": 0.4751, "step": 1068 }, { "epoch": 0.530696673837498, "grad_norm": 0.3928971290588379, "learning_rate": 9.821370159114792e-06, "loss": 0.4452, "step": 1069 }, { "epoch": 0.5311931160019858, "grad_norm": 0.4554174244403839, "learning_rate": 9.820604015752344e-06, "loss": 0.453, "step": 1070 }, { "epoch": 0.5316895581664736, "grad_norm": 0.42962804436683655, "learning_rate": 9.819836262921231e-06, "loss": 0.4679, "step": 1071 }, { "epoch": 0.5321860003309614, "grad_norm": 0.4061743915081024, "learning_rate": 9.819066900877787e-06, "loss": 0.458, "step": 1072 }, { "epoch": 0.5326824424954493, "grad_norm": 0.44172975420951843, "learning_rate": 9.81829592987888e-06, "loss": 0.4253, "step": 1073 }, { "epoch": 0.5331788846599371, "grad_norm": 0.43648770451545715, "learning_rate": 9.817523350181916e-06, "loss": 0.4467, "step": 1074 }, { "epoch": 0.533675326824425, "grad_norm": 0.40728119015693665, "learning_rate": 9.81674916204484e-06, "loss": 0.4489, "step": 1075 }, { "epoch": 0.5341717689889128, "grad_norm": 0.46754953265190125, "learning_rate": 9.815973365726126e-06, "loss": 0.4419, "step": 1076 }, { "epoch": 0.5346682111534006, "grad_norm": 0.4223959445953369, "learning_rate": 9.8151959614848e-06, "loss": 0.4296, "step": 1077 }, { "epoch": 0.5351646533178884, "grad_norm": 0.48979684710502625, "learning_rate": 9.814416949580412e-06, "loss": 0.4599, "step": 1078 }, { "epoch": 0.5356610954823763, "grad_norm": 0.40310990810394287, "learning_rate": 9.813636330273051e-06, "loss": 0.4161, "step": 1079 }, { "epoch": 0.5361575376468641, "grad_norm": 0.41457417607307434, "learning_rate": 9.812854103823349e-06, "loss": 0.4575, "step": 1080 }, { "epoch": 0.536653979811352, "grad_norm": 0.5164874792098999, "learning_rate": 9.812070270492467e-06, "loss": 0.4258, "step": 1081 }, { "epoch": 0.5371504219758398, "grad_norm": 0.42441895604133606, "learning_rate": 9.811284830542105e-06, "loss": 0.458, "step": 1082 }, { "epoch": 0.5376468641403277, "grad_norm": 0.44595471024513245, "learning_rate": 9.810497784234503e-06, "loss": 0.4634, "step": 1083 }, { "epoch": 0.5381433063048154, "grad_norm": 0.4514204263687134, "learning_rate": 9.809709131832431e-06, "loss": 0.4629, "step": 1084 }, { "epoch": 0.5386397484693033, "grad_norm": 0.47432631254196167, "learning_rate": 9.808918873599205e-06, "loss": 0.4507, "step": 1085 }, { "epoch": 0.5391361906337911, "grad_norm": 0.4720109701156616, "learning_rate": 9.808127009798662e-06, "loss": 0.4537, "step": 1086 }, { "epoch": 0.539632632798279, "grad_norm": 0.45561596751213074, "learning_rate": 9.807333540695192e-06, "loss": 0.4452, "step": 1087 }, { "epoch": 0.5401290749627669, "grad_norm": 0.46909254789352417, "learning_rate": 9.806538466553705e-06, "loss": 0.414, "step": 1088 }, { "epoch": 0.5406255171272547, "grad_norm": 0.41632574796676636, "learning_rate": 9.80574178763966e-06, "loss": 0.4441, "step": 1089 }, { "epoch": 0.5411219592917426, "grad_norm": 0.4217323660850525, "learning_rate": 9.804943504219046e-06, "loss": 0.4365, "step": 1090 }, { "epoch": 0.5416184014562303, "grad_norm": 0.4664926826953888, "learning_rate": 9.804143616558387e-06, "loss": 0.4362, "step": 1091 }, { "epoch": 0.5421148436207182, "grad_norm": 0.44954970479011536, "learning_rate": 9.803342124924742e-06, "loss": 0.4309, "step": 1092 }, { "epoch": 0.542611285785206, "grad_norm": 0.38139691948890686, "learning_rate": 9.802539029585709e-06, "loss": 0.4424, "step": 1093 }, { "epoch": 0.5431077279496939, "grad_norm": 0.42648622393608093, "learning_rate": 9.80173433080942e-06, "loss": 0.4441, "step": 1094 }, { "epoch": 0.5436041701141817, "grad_norm": 0.4272594749927521, "learning_rate": 9.800928028864543e-06, "loss": 0.4544, "step": 1095 }, { "epoch": 0.5441006122786696, "grad_norm": 0.43768012523651123, "learning_rate": 9.80012012402028e-06, "loss": 0.4166, "step": 1096 }, { "epoch": 0.5445970544431574, "grad_norm": 0.4205953776836395, "learning_rate": 9.799310616546367e-06, "loss": 0.4446, "step": 1097 }, { "epoch": 0.5450934966076452, "grad_norm": 0.432310551404953, "learning_rate": 9.798499506713075e-06, "loss": 0.4129, "step": 1098 }, { "epoch": 0.545589938772133, "grad_norm": 0.41934728622436523, "learning_rate": 9.797686794791216e-06, "loss": 0.439, "step": 1099 }, { "epoch": 0.5460863809366209, "grad_norm": 0.4193020462989807, "learning_rate": 9.79687248105213e-06, "loss": 0.4419, "step": 1100 }, { "epoch": 0.5465828231011087, "grad_norm": 0.41924306750297546, "learning_rate": 9.796056565767694e-06, "loss": 0.4322, "step": 1101 }, { "epoch": 0.5470792652655966, "grad_norm": 0.44275856018066406, "learning_rate": 9.79523904921032e-06, "loss": 0.4427, "step": 1102 }, { "epoch": 0.5475757074300844, "grad_norm": 0.48175638914108276, "learning_rate": 9.794419931652954e-06, "loss": 0.4684, "step": 1103 }, { "epoch": 0.5480721495945722, "grad_norm": 0.378415584564209, "learning_rate": 9.793599213369078e-06, "loss": 0.4284, "step": 1104 }, { "epoch": 0.54856859175906, "grad_norm": 0.4436701536178589, "learning_rate": 9.792776894632709e-06, "loss": 0.4444, "step": 1105 }, { "epoch": 0.5490650339235479, "grad_norm": 0.4141138195991516, "learning_rate": 9.791952975718395e-06, "loss": 0.462, "step": 1106 }, { "epoch": 0.5495614760880357, "grad_norm": 0.3965829610824585, "learning_rate": 9.791127456901219e-06, "loss": 0.4282, "step": 1107 }, { "epoch": 0.5500579182525236, "grad_norm": 0.38075658679008484, "learning_rate": 9.790300338456802e-06, "loss": 0.4609, "step": 1108 }, { "epoch": 0.5505543604170114, "grad_norm": 0.42693158984184265, "learning_rate": 9.789471620661296e-06, "loss": 0.4858, "step": 1109 }, { "epoch": 0.5510508025814993, "grad_norm": 0.3761470317840576, "learning_rate": 9.788641303791384e-06, "loss": 0.4261, "step": 1110 }, { "epoch": 0.551547244745987, "grad_norm": 0.4042470455169678, "learning_rate": 9.78780938812429e-06, "loss": 0.4561, "step": 1111 }, { "epoch": 0.5520436869104749, "grad_norm": 0.4576355516910553, "learning_rate": 9.786975873937768e-06, "loss": 0.4495, "step": 1112 }, { "epoch": 0.5525401290749627, "grad_norm": 0.3827768564224243, "learning_rate": 9.786140761510103e-06, "loss": 0.4381, "step": 1113 }, { "epoch": 0.5530365712394506, "grad_norm": 0.4643932282924652, "learning_rate": 9.785304051120117e-06, "loss": 0.4758, "step": 1114 }, { "epoch": 0.5535330134039385, "grad_norm": 0.4149339199066162, "learning_rate": 9.784465743047168e-06, "loss": 0.4479, "step": 1115 }, { "epoch": 0.5540294555684263, "grad_norm": 0.4344440698623657, "learning_rate": 9.78362583757114e-06, "loss": 0.4363, "step": 1116 }, { "epoch": 0.5545258977329142, "grad_norm": 0.38696280121803284, "learning_rate": 9.782784334972459e-06, "loss": 0.455, "step": 1117 }, { "epoch": 0.5550223398974019, "grad_norm": 0.41420692205429077, "learning_rate": 9.781941235532076e-06, "loss": 0.4194, "step": 1118 }, { "epoch": 0.5555187820618898, "grad_norm": 0.5090983510017395, "learning_rate": 9.781096539531479e-06, "loss": 0.4424, "step": 1119 }, { "epoch": 0.5560152242263776, "grad_norm": 0.39849579334259033, "learning_rate": 9.780250247252692e-06, "loss": 0.4255, "step": 1120 }, { "epoch": 0.5565116663908655, "grad_norm": 0.4463026225566864, "learning_rate": 9.779402358978267e-06, "loss": 0.4582, "step": 1121 }, { "epoch": 0.5570081085553533, "grad_norm": 0.4401877522468567, "learning_rate": 9.778552874991291e-06, "loss": 0.4331, "step": 1122 }, { "epoch": 0.5575045507198412, "grad_norm": 0.40466001629829407, "learning_rate": 9.777701795575385e-06, "loss": 0.41, "step": 1123 }, { "epoch": 0.558000992884329, "grad_norm": 0.47397923469543457, "learning_rate": 9.7768491210147e-06, "loss": 0.4398, "step": 1124 }, { "epoch": 0.5584974350488168, "grad_norm": 0.411964476108551, "learning_rate": 9.775994851593921e-06, "loss": 0.4422, "step": 1125 }, { "epoch": 0.5589938772133046, "grad_norm": 0.3996008634567261, "learning_rate": 9.775138987598264e-06, "loss": 0.4305, "step": 1126 }, { "epoch": 0.5594903193777925, "grad_norm": 0.43117260932922363, "learning_rate": 9.774281529313483e-06, "loss": 0.425, "step": 1127 }, { "epoch": 0.5599867615422803, "grad_norm": 0.3982381820678711, "learning_rate": 9.773422477025854e-06, "loss": 0.4559, "step": 1128 }, { "epoch": 0.5604832037067682, "grad_norm": 0.45569583773612976, "learning_rate": 9.772561831022195e-06, "loss": 0.4356, "step": 1129 }, { "epoch": 0.560979645871256, "grad_norm": 0.4551222026348114, "learning_rate": 9.771699591589854e-06, "loss": 0.4276, "step": 1130 }, { "epoch": 0.5614760880357438, "grad_norm": 0.44524747133255005, "learning_rate": 9.770835759016704e-06, "loss": 0.4589, "step": 1131 }, { "epoch": 0.5619725302002316, "grad_norm": 0.4707545340061188, "learning_rate": 9.76997033359116e-06, "loss": 0.4366, "step": 1132 }, { "epoch": 0.5624689723647195, "grad_norm": 0.4589802920818329, "learning_rate": 9.769103315602161e-06, "loss": 0.463, "step": 1133 }, { "epoch": 0.5629654145292073, "grad_norm": 0.4371453523635864, "learning_rate": 9.768234705339184e-06, "loss": 0.4102, "step": 1134 }, { "epoch": 0.5634618566936952, "grad_norm": 0.49405983090400696, "learning_rate": 9.76736450309223e-06, "loss": 0.4132, "step": 1135 }, { "epoch": 0.563958298858183, "grad_norm": 0.4474470019340515, "learning_rate": 9.76649270915184e-06, "loss": 0.4368, "step": 1136 }, { "epoch": 0.5644547410226709, "grad_norm": 0.4862598180770874, "learning_rate": 9.765619323809078e-06, "loss": 0.4276, "step": 1137 }, { "epoch": 0.5649511831871586, "grad_norm": 0.49308261275291443, "learning_rate": 9.76474434735555e-06, "loss": 0.4516, "step": 1138 }, { "epoch": 0.5654476253516465, "grad_norm": 0.4433296322822571, "learning_rate": 9.76386778008338e-06, "loss": 0.4292, "step": 1139 }, { "epoch": 0.5659440675161344, "grad_norm": 0.5049310326576233, "learning_rate": 9.762989622285234e-06, "loss": 0.4392, "step": 1140 }, { "epoch": 0.5664405096806222, "grad_norm": 0.43997663259506226, "learning_rate": 9.762109874254305e-06, "loss": 0.4158, "step": 1141 }, { "epoch": 0.5669369518451101, "grad_norm": 0.40339770913124084, "learning_rate": 9.761228536284313e-06, "loss": 0.4101, "step": 1142 }, { "epoch": 0.5674333940095979, "grad_norm": 0.42658576369285583, "learning_rate": 9.76034560866952e-06, "loss": 0.4338, "step": 1143 }, { "epoch": 0.5679298361740858, "grad_norm": 0.41169577836990356, "learning_rate": 9.759461091704703e-06, "loss": 0.4419, "step": 1144 }, { "epoch": 0.5684262783385735, "grad_norm": 0.4149167835712433, "learning_rate": 9.758574985685186e-06, "loss": 0.4104, "step": 1145 }, { "epoch": 0.5689227205030614, "grad_norm": 0.40243813395500183, "learning_rate": 9.75768729090681e-06, "loss": 0.476, "step": 1146 }, { "epoch": 0.5694191626675492, "grad_norm": 0.39286455512046814, "learning_rate": 9.756798007665954e-06, "loss": 0.4209, "step": 1147 }, { "epoch": 0.5699156048320371, "grad_norm": 0.4198743999004364, "learning_rate": 9.755907136259525e-06, "loss": 0.4371, "step": 1148 }, { "epoch": 0.5704120469965249, "grad_norm": 0.39197665452957153, "learning_rate": 9.755014676984965e-06, "loss": 0.4516, "step": 1149 }, { "epoch": 0.5709084891610128, "grad_norm": 0.428911030292511, "learning_rate": 9.754120630140237e-06, "loss": 0.4411, "step": 1150 }, { "epoch": 0.5714049313255006, "grad_norm": 0.4537036120891571, "learning_rate": 9.75322499602384e-06, "loss": 0.4416, "step": 1151 }, { "epoch": 0.5719013734899884, "grad_norm": 0.49313363432884216, "learning_rate": 9.752327774934802e-06, "loss": 0.4994, "step": 1152 }, { "epoch": 0.5723978156544762, "grad_norm": 0.4337575435638428, "learning_rate": 9.751428967172683e-06, "loss": 0.4392, "step": 1153 }, { "epoch": 0.5728942578189641, "grad_norm": 0.3816051483154297, "learning_rate": 9.750528573037566e-06, "loss": 0.4309, "step": 1154 }, { "epoch": 0.5733906999834519, "grad_norm": 0.42320939898490906, "learning_rate": 9.749626592830073e-06, "loss": 0.4429, "step": 1155 }, { "epoch": 0.5738871421479398, "grad_norm": 0.4288228750228882, "learning_rate": 9.748723026851346e-06, "loss": 0.4365, "step": 1156 }, { "epoch": 0.5743835843124276, "grad_norm": 0.4043278992176056, "learning_rate": 9.747817875403066e-06, "loss": 0.4751, "step": 1157 }, { "epoch": 0.5748800264769154, "grad_norm": 0.39630746841430664, "learning_rate": 9.746911138787434e-06, "loss": 0.438, "step": 1158 }, { "epoch": 0.5753764686414032, "grad_norm": 0.4504854679107666, "learning_rate": 9.746002817307187e-06, "loss": 0.4803, "step": 1159 }, { "epoch": 0.5758729108058911, "grad_norm": 0.3536142110824585, "learning_rate": 9.745092911265587e-06, "loss": 0.411, "step": 1160 }, { "epoch": 0.576369352970379, "grad_norm": 0.4393535852432251, "learning_rate": 9.744181420966432e-06, "loss": 0.4322, "step": 1161 }, { "epoch": 0.5768657951348668, "grad_norm": 0.3949313461780548, "learning_rate": 9.743268346714037e-06, "loss": 0.4223, "step": 1162 }, { "epoch": 0.5773622372993547, "grad_norm": 0.39311686158180237, "learning_rate": 9.742353688813257e-06, "loss": 0.4241, "step": 1163 }, { "epoch": 0.5778586794638425, "grad_norm": 0.4230150282382965, "learning_rate": 9.741437447569473e-06, "loss": 0.4272, "step": 1164 }, { "epoch": 0.5783551216283302, "grad_norm": 0.42800208926200867, "learning_rate": 9.740519623288587e-06, "loss": 0.4334, "step": 1165 }, { "epoch": 0.5788515637928181, "grad_norm": 0.39702239632606506, "learning_rate": 9.73960021627704e-06, "loss": 0.4441, "step": 1166 }, { "epoch": 0.579348005957306, "grad_norm": 0.44652098417282104, "learning_rate": 9.738679226841796e-06, "loss": 0.4184, "step": 1167 }, { "epoch": 0.5798444481217938, "grad_norm": 0.39705511927604675, "learning_rate": 9.737756655290348e-06, "loss": 0.4309, "step": 1168 }, { "epoch": 0.5803408902862817, "grad_norm": 0.43707922101020813, "learning_rate": 9.736832501930717e-06, "loss": 0.453, "step": 1169 }, { "epoch": 0.5808373324507695, "grad_norm": 0.382352739572525, "learning_rate": 9.735906767071456e-06, "loss": 0.4125, "step": 1170 }, { "epoch": 0.5813337746152574, "grad_norm": 0.4060574769973755, "learning_rate": 9.73497945102164e-06, "loss": 0.4525, "step": 1171 }, { "epoch": 0.5818302167797451, "grad_norm": 0.3942406177520752, "learning_rate": 9.734050554090872e-06, "loss": 0.4277, "step": 1172 }, { "epoch": 0.582326658944233, "grad_norm": 0.411515474319458, "learning_rate": 9.733120076589291e-06, "loss": 0.4268, "step": 1173 }, { "epoch": 0.5828231011087208, "grad_norm": 0.39701709151268005, "learning_rate": 9.732188018827556e-06, "loss": 0.478, "step": 1174 }, { "epoch": 0.5833195432732087, "grad_norm": 0.4553658366203308, "learning_rate": 9.731254381116852e-06, "loss": 0.4466, "step": 1175 }, { "epoch": 0.5838159854376965, "grad_norm": 0.41805586218833923, "learning_rate": 9.730319163768902e-06, "loss": 0.4541, "step": 1176 }, { "epoch": 0.5843124276021844, "grad_norm": 0.3929709196090698, "learning_rate": 9.729382367095944e-06, "loss": 0.4309, "step": 1177 }, { "epoch": 0.5848088697666722, "grad_norm": 0.4429134428501129, "learning_rate": 9.728443991410752e-06, "loss": 0.4717, "step": 1178 }, { "epoch": 0.58530531193116, "grad_norm": 0.41660717129707336, "learning_rate": 9.727504037026623e-06, "loss": 0.4417, "step": 1179 }, { "epoch": 0.5858017540956478, "grad_norm": 0.39509081840515137, "learning_rate": 9.726562504257383e-06, "loss": 0.4252, "step": 1180 }, { "epoch": 0.5862981962601357, "grad_norm": 0.4189780056476593, "learning_rate": 9.725619393417382e-06, "loss": 0.4178, "step": 1181 }, { "epoch": 0.5867946384246235, "grad_norm": 0.4418380558490753, "learning_rate": 9.724674704821503e-06, "loss": 0.437, "step": 1182 }, { "epoch": 0.5872910805891114, "grad_norm": 0.40177544951438904, "learning_rate": 9.72372843878515e-06, "loss": 0.419, "step": 1183 }, { "epoch": 0.5877875227535992, "grad_norm": 0.5707026720046997, "learning_rate": 9.722780595624253e-06, "loss": 0.4381, "step": 1184 }, { "epoch": 0.588283964918087, "grad_norm": 0.43667757511138916, "learning_rate": 9.721831175655274e-06, "loss": 0.4629, "step": 1185 }, { "epoch": 0.5887804070825748, "grad_norm": 0.4497312903404236, "learning_rate": 9.720880179195196e-06, "loss": 0.436, "step": 1186 }, { "epoch": 0.5892768492470627, "grad_norm": 0.4429398775100708, "learning_rate": 9.719927606561534e-06, "loss": 0.4283, "step": 1187 }, { "epoch": 0.5897732914115505, "grad_norm": 0.43697604537010193, "learning_rate": 9.718973458072325e-06, "loss": 0.4604, "step": 1188 }, { "epoch": 0.5902697335760384, "grad_norm": 0.38515719771385193, "learning_rate": 9.718017734046134e-06, "loss": 0.4545, "step": 1189 }, { "epoch": 0.5907661757405263, "grad_norm": 0.3940352499485016, "learning_rate": 9.717060434802049e-06, "loss": 0.4397, "step": 1190 }, { "epoch": 0.5912626179050141, "grad_norm": 0.39752864837646484, "learning_rate": 9.716101560659688e-06, "loss": 0.4463, "step": 1191 }, { "epoch": 0.5917590600695019, "grad_norm": 0.38725847005844116, "learning_rate": 9.715141111939192e-06, "loss": 0.4239, "step": 1192 }, { "epoch": 0.5922555022339897, "grad_norm": 0.4677368402481079, "learning_rate": 9.714179088961228e-06, "loss": 0.4435, "step": 1193 }, { "epoch": 0.5927519443984776, "grad_norm": 0.4573880434036255, "learning_rate": 9.713215492046992e-06, "loss": 0.4078, "step": 1194 }, { "epoch": 0.5932483865629654, "grad_norm": 0.43993857502937317, "learning_rate": 9.712250321518201e-06, "loss": 0.4322, "step": 1195 }, { "epoch": 0.5937448287274533, "grad_norm": 0.4365018606185913, "learning_rate": 9.711283577697099e-06, "loss": 0.4307, "step": 1196 }, { "epoch": 0.5942412708919411, "grad_norm": 0.47080644965171814, "learning_rate": 9.710315260906456e-06, "loss": 0.4344, "step": 1197 }, { "epoch": 0.594737713056429, "grad_norm": 0.4204980432987213, "learning_rate": 9.709345371469567e-06, "loss": 0.4471, "step": 1198 }, { "epoch": 0.5952341552209167, "grad_norm": 0.4205401539802551, "learning_rate": 9.708373909710251e-06, "loss": 0.4462, "step": 1199 }, { "epoch": 0.5957305973854046, "grad_norm": 0.40738004446029663, "learning_rate": 9.707400875952856e-06, "loss": 0.4528, "step": 1200 }, { "epoch": 0.5962270395498924, "grad_norm": 0.41225865483283997, "learning_rate": 9.706426270522244e-06, "loss": 0.4461, "step": 1201 }, { "epoch": 0.5967234817143803, "grad_norm": 0.41413214802742004, "learning_rate": 9.705450093743815e-06, "loss": 0.4339, "step": 1202 }, { "epoch": 0.5972199238788681, "grad_norm": 0.3768315613269806, "learning_rate": 9.704472345943489e-06, "loss": 0.4407, "step": 1203 }, { "epoch": 0.597716366043356, "grad_norm": 0.4505842626094818, "learning_rate": 9.703493027447705e-06, "loss": 0.4264, "step": 1204 }, { "epoch": 0.5982128082078438, "grad_norm": 0.4653708338737488, "learning_rate": 9.702512138583435e-06, "loss": 0.4537, "step": 1205 }, { "epoch": 0.5987092503723316, "grad_norm": 0.43203824758529663, "learning_rate": 9.701529679678168e-06, "loss": 0.4308, "step": 1206 }, { "epoch": 0.5992056925368194, "grad_norm": 0.38732483983039856, "learning_rate": 9.700545651059921e-06, "loss": 0.4122, "step": 1207 }, { "epoch": 0.5997021347013073, "grad_norm": 0.43412891030311584, "learning_rate": 9.699560053057236e-06, "loss": 0.4338, "step": 1208 }, { "epoch": 0.6001985768657951, "grad_norm": 0.43384769558906555, "learning_rate": 9.698572885999174e-06, "loss": 0.4563, "step": 1209 }, { "epoch": 0.600695019030283, "grad_norm": 0.44437846541404724, "learning_rate": 9.697584150215326e-06, "loss": 0.4395, "step": 1210 }, { "epoch": 0.6011914611947708, "grad_norm": 0.42025142908096313, "learning_rate": 9.696593846035807e-06, "loss": 0.442, "step": 1211 }, { "epoch": 0.6016879033592586, "grad_norm": 0.4155829846858978, "learning_rate": 9.695601973791245e-06, "loss": 0.4477, "step": 1212 }, { "epoch": 0.6021843455237464, "grad_norm": 0.4569908380508423, "learning_rate": 9.694608533812807e-06, "loss": 0.4538, "step": 1213 }, { "epoch": 0.6026807876882343, "grad_norm": 0.37979602813720703, "learning_rate": 9.693613526432168e-06, "loss": 0.4217, "step": 1214 }, { "epoch": 0.6031772298527222, "grad_norm": 0.40700197219848633, "learning_rate": 9.692616951981539e-06, "loss": 0.4257, "step": 1215 }, { "epoch": 0.60367367201721, "grad_norm": 0.458310067653656, "learning_rate": 9.69161881079365e-06, "loss": 0.4591, "step": 1216 }, { "epoch": 0.6041701141816979, "grad_norm": 0.3989141583442688, "learning_rate": 9.690619103201751e-06, "loss": 0.4411, "step": 1217 }, { "epoch": 0.6046665563461857, "grad_norm": 0.39301976561546326, "learning_rate": 9.689617829539616e-06, "loss": 0.441, "step": 1218 }, { "epoch": 0.6051629985106735, "grad_norm": 0.42882612347602844, "learning_rate": 9.688614990141545e-06, "loss": 0.4384, "step": 1219 }, { "epoch": 0.6056594406751613, "grad_norm": 0.4048568904399872, "learning_rate": 9.687610585342358e-06, "loss": 0.4542, "step": 1220 }, { "epoch": 0.6061558828396492, "grad_norm": 0.403303325176239, "learning_rate": 9.686604615477398e-06, "loss": 0.433, "step": 1221 }, { "epoch": 0.606652325004137, "grad_norm": 0.3791772425174713, "learning_rate": 9.685597080882533e-06, "loss": 0.4277, "step": 1222 }, { "epoch": 0.6071487671686249, "grad_norm": 0.39373505115509033, "learning_rate": 9.684587981894148e-06, "loss": 0.4381, "step": 1223 }, { "epoch": 0.6076452093331127, "grad_norm": 0.37963634729385376, "learning_rate": 9.68357731884916e-06, "loss": 0.437, "step": 1224 }, { "epoch": 0.6081416514976006, "grad_norm": 0.4390088617801666, "learning_rate": 9.682565092084994e-06, "loss": 0.4411, "step": 1225 }, { "epoch": 0.6086380936620883, "grad_norm": 0.39951807260513306, "learning_rate": 9.681551301939612e-06, "loss": 0.4467, "step": 1226 }, { "epoch": 0.6091345358265762, "grad_norm": 0.3744092583656311, "learning_rate": 9.680535948751485e-06, "loss": 0.4501, "step": 1227 }, { "epoch": 0.609630977991064, "grad_norm": 0.3974815905094147, "learning_rate": 9.679519032859616e-06, "loss": 0.4442, "step": 1228 }, { "epoch": 0.6101274201555519, "grad_norm": 0.37198707461357117, "learning_rate": 9.678500554603524e-06, "loss": 0.4408, "step": 1229 }, { "epoch": 0.6106238623200397, "grad_norm": 0.40467944741249084, "learning_rate": 9.677480514323253e-06, "loss": 0.4415, "step": 1230 }, { "epoch": 0.6111203044845276, "grad_norm": 0.35272422432899475, "learning_rate": 9.676458912359362e-06, "loss": 0.4452, "step": 1231 }, { "epoch": 0.6116167466490154, "grad_norm": 0.3876346945762634, "learning_rate": 9.675435749052941e-06, "loss": 0.4227, "step": 1232 }, { "epoch": 0.6121131888135032, "grad_norm": 0.4058470129966736, "learning_rate": 9.674411024745593e-06, "loss": 0.4298, "step": 1233 }, { "epoch": 0.612609630977991, "grad_norm": 0.3949258625507355, "learning_rate": 9.67338473977945e-06, "loss": 0.4203, "step": 1234 }, { "epoch": 0.6131060731424789, "grad_norm": 0.4437556266784668, "learning_rate": 9.672356894497157e-06, "loss": 0.4147, "step": 1235 }, { "epoch": 0.6136025153069667, "grad_norm": 0.5022907257080078, "learning_rate": 9.671327489241884e-06, "loss": 0.4731, "step": 1236 }, { "epoch": 0.6140989574714546, "grad_norm": 0.4558546245098114, "learning_rate": 9.670296524357322e-06, "loss": 0.4362, "step": 1237 }, { "epoch": 0.6145953996359425, "grad_norm": 0.4707479178905487, "learning_rate": 9.669264000187681e-06, "loss": 0.4232, "step": 1238 }, { "epoch": 0.6150918418004302, "grad_norm": 0.47685506939888, "learning_rate": 9.668229917077696e-06, "loss": 0.4425, "step": 1239 }, { "epoch": 0.615588283964918, "grad_norm": 0.4460746943950653, "learning_rate": 9.667194275372618e-06, "loss": 0.4475, "step": 1240 }, { "epoch": 0.6160847261294059, "grad_norm": 0.4275963604450226, "learning_rate": 9.666157075418216e-06, "loss": 0.4507, "step": 1241 }, { "epoch": 0.6165811682938938, "grad_norm": 0.5124191641807556, "learning_rate": 9.665118317560786e-06, "loss": 0.4379, "step": 1242 }, { "epoch": 0.6170776104583816, "grad_norm": 0.3684218227863312, "learning_rate": 9.664078002147143e-06, "loss": 0.4249, "step": 1243 }, { "epoch": 0.6175740526228695, "grad_norm": 0.4333219826221466, "learning_rate": 9.663036129524616e-06, "loss": 0.4421, "step": 1244 }, { "epoch": 0.6180704947873573, "grad_norm": 0.4032171368598938, "learning_rate": 9.66199270004106e-06, "loss": 0.4049, "step": 1245 }, { "epoch": 0.6185669369518451, "grad_norm": 0.41696596145629883, "learning_rate": 9.660947714044846e-06, "loss": 0.4566, "step": 1246 }, { "epoch": 0.6190633791163329, "grad_norm": 0.4322136640548706, "learning_rate": 9.659901171884869e-06, "loss": 0.4295, "step": 1247 }, { "epoch": 0.6195598212808208, "grad_norm": 0.4076768755912781, "learning_rate": 9.658853073910541e-06, "loss": 0.4289, "step": 1248 }, { "epoch": 0.6200562634453086, "grad_norm": 0.42790043354034424, "learning_rate": 9.65780342047179e-06, "loss": 0.4446, "step": 1249 }, { "epoch": 0.6205527056097965, "grad_norm": 0.42553967237472534, "learning_rate": 9.65675221191907e-06, "loss": 0.4082, "step": 1250 }, { "epoch": 0.6210491477742843, "grad_norm": 0.392670601606369, "learning_rate": 9.65569944860335e-06, "loss": 0.4255, "step": 1251 }, { "epoch": 0.6215455899387722, "grad_norm": 0.44925782084465027, "learning_rate": 9.65464513087612e-06, "loss": 0.4085, "step": 1252 }, { "epoch": 0.6220420321032599, "grad_norm": 0.4176597595214844, "learning_rate": 9.653589259089386e-06, "loss": 0.4542, "step": 1253 }, { "epoch": 0.6225384742677478, "grad_norm": 0.3843705356121063, "learning_rate": 9.652531833595675e-06, "loss": 0.4348, "step": 1254 }, { "epoch": 0.6230349164322356, "grad_norm": 0.3946625292301178, "learning_rate": 9.651472854748036e-06, "loss": 0.4428, "step": 1255 }, { "epoch": 0.6235313585967235, "grad_norm": 0.46045252680778503, "learning_rate": 9.65041232290003e-06, "loss": 0.4569, "step": 1256 }, { "epoch": 0.6240278007612113, "grad_norm": 0.42577433586120605, "learning_rate": 9.649350238405739e-06, "loss": 0.4524, "step": 1257 }, { "epoch": 0.6245242429256992, "grad_norm": 0.3948012888431549, "learning_rate": 9.648286601619766e-06, "loss": 0.4422, "step": 1258 }, { "epoch": 0.625020685090187, "grad_norm": 0.3975149691104889, "learning_rate": 9.647221412897232e-06, "loss": 0.4331, "step": 1259 }, { "epoch": 0.6255171272546748, "grad_norm": 0.4156121015548706, "learning_rate": 9.646154672593771e-06, "loss": 0.4371, "step": 1260 }, { "epoch": 0.6260135694191626, "grad_norm": 0.48353710770606995, "learning_rate": 9.64508638106554e-06, "loss": 0.4375, "step": 1261 }, { "epoch": 0.6265100115836505, "grad_norm": 0.4747493267059326, "learning_rate": 9.644016538669214e-06, "loss": 0.4212, "step": 1262 }, { "epoch": 0.6270064537481383, "grad_norm": 0.43612679839134216, "learning_rate": 9.642945145761983e-06, "loss": 0.4633, "step": 1263 }, { "epoch": 0.6275028959126262, "grad_norm": 0.3732624053955078, "learning_rate": 9.641872202701557e-06, "loss": 0.4166, "step": 1264 }, { "epoch": 0.627999338077114, "grad_norm": 0.47336140275001526, "learning_rate": 9.640797709846159e-06, "loss": 0.4278, "step": 1265 }, { "epoch": 0.6284957802416018, "grad_norm": 0.43745025992393494, "learning_rate": 9.639721667554537e-06, "loss": 0.4385, "step": 1266 }, { "epoch": 0.6289922224060897, "grad_norm": 0.4116722345352173, "learning_rate": 9.638644076185953e-06, "loss": 0.422, "step": 1267 }, { "epoch": 0.6294886645705775, "grad_norm": 0.4541510343551636, "learning_rate": 9.63756493610018e-06, "loss": 0.447, "step": 1268 }, { "epoch": 0.6299851067350654, "grad_norm": 0.40390342473983765, "learning_rate": 9.636484247657519e-06, "loss": 0.427, "step": 1269 }, { "epoch": 0.6304815488995532, "grad_norm": 0.40352770686149597, "learning_rate": 9.635402011218778e-06, "loss": 0.4209, "step": 1270 }, { "epoch": 0.6309779910640411, "grad_norm": 0.39389240741729736, "learning_rate": 9.634318227145291e-06, "loss": 0.4435, "step": 1271 }, { "epoch": 0.6314744332285289, "grad_norm": 0.40138769149780273, "learning_rate": 9.633232895798901e-06, "loss": 0.444, "step": 1272 }, { "epoch": 0.6319708753930167, "grad_norm": 0.4243171215057373, "learning_rate": 9.63214601754197e-06, "loss": 0.4189, "step": 1273 }, { "epoch": 0.6324673175575045, "grad_norm": 0.39413243532180786, "learning_rate": 9.63105759273738e-06, "loss": 0.4325, "step": 1274 }, { "epoch": 0.6329637597219924, "grad_norm": 0.4507105052471161, "learning_rate": 9.629967621748527e-06, "loss": 0.4254, "step": 1275 }, { "epoch": 0.6334602018864802, "grad_norm": 0.4305225610733032, "learning_rate": 9.628876104939318e-06, "loss": 0.4535, "step": 1276 }, { "epoch": 0.6339566440509681, "grad_norm": 0.39704835414886475, "learning_rate": 9.627783042674182e-06, "loss": 0.4352, "step": 1277 }, { "epoch": 0.6344530862154559, "grad_norm": 0.41241106390953064, "learning_rate": 9.626688435318066e-06, "loss": 0.4399, "step": 1278 }, { "epoch": 0.6349495283799438, "grad_norm": 0.42637577652931213, "learning_rate": 9.62559228323643e-06, "loss": 0.4645, "step": 1279 }, { "epoch": 0.6354459705444315, "grad_norm": 0.42497876286506653, "learning_rate": 9.624494586795243e-06, "loss": 0.4269, "step": 1280 }, { "epoch": 0.6359424127089194, "grad_norm": 0.37073275446891785, "learning_rate": 9.623395346361004e-06, "loss": 0.4403, "step": 1281 }, { "epoch": 0.6364388548734072, "grad_norm": 0.4727330505847931, "learning_rate": 9.622294562300714e-06, "loss": 0.4385, "step": 1282 }, { "epoch": 0.6369352970378951, "grad_norm": 0.4253538250923157, "learning_rate": 9.621192234981897e-06, "loss": 0.4494, "step": 1283 }, { "epoch": 0.6374317392023829, "grad_norm": 0.38500604033470154, "learning_rate": 9.620088364772589e-06, "loss": 0.4496, "step": 1284 }, { "epoch": 0.6379281813668708, "grad_norm": 0.4271199703216553, "learning_rate": 9.618982952041344e-06, "loss": 0.4315, "step": 1285 }, { "epoch": 0.6384246235313586, "grad_norm": 0.3522931635379791, "learning_rate": 9.61787599715723e-06, "loss": 0.4055, "step": 1286 }, { "epoch": 0.6389210656958464, "grad_norm": 0.39129558205604553, "learning_rate": 9.616767500489822e-06, "loss": 0.4156, "step": 1287 }, { "epoch": 0.6394175078603342, "grad_norm": 0.4662975072860718, "learning_rate": 9.615657462409227e-06, "loss": 0.424, "step": 1288 }, { "epoch": 0.6399139500248221, "grad_norm": 0.41697201132774353, "learning_rate": 9.614545883286051e-06, "loss": 0.441, "step": 1289 }, { "epoch": 0.64041039218931, "grad_norm": 0.4283103942871094, "learning_rate": 9.613432763491422e-06, "loss": 0.4173, "step": 1290 }, { "epoch": 0.6409068343537978, "grad_norm": 0.42272254824638367, "learning_rate": 9.612318103396977e-06, "loss": 0.4251, "step": 1291 }, { "epoch": 0.6414032765182857, "grad_norm": 0.39685001969337463, "learning_rate": 9.611201903374873e-06, "loss": 0.4042, "step": 1292 }, { "epoch": 0.6418997186827734, "grad_norm": 0.41097205877304077, "learning_rate": 9.610084163797782e-06, "loss": 0.4527, "step": 1293 }, { "epoch": 0.6423961608472613, "grad_norm": 0.40955016016960144, "learning_rate": 9.608964885038882e-06, "loss": 0.4069, "step": 1294 }, { "epoch": 0.6428926030117491, "grad_norm": 0.4848541021347046, "learning_rate": 9.607844067471871e-06, "loss": 0.429, "step": 1295 }, { "epoch": 0.643389045176237, "grad_norm": 0.4577844738960266, "learning_rate": 9.606721711470962e-06, "loss": 0.4654, "step": 1296 }, { "epoch": 0.6438854873407248, "grad_norm": 0.4103187620639801, "learning_rate": 9.605597817410875e-06, "loss": 0.4213, "step": 1297 }, { "epoch": 0.6443819295052127, "grad_norm": 0.49751538038253784, "learning_rate": 9.604472385666851e-06, "loss": 0.4477, "step": 1298 }, { "epoch": 0.6448783716697005, "grad_norm": 0.4154302477836609, "learning_rate": 9.60334541661464e-06, "loss": 0.4522, "step": 1299 }, { "epoch": 0.6453748138341883, "grad_norm": 0.44403496384620667, "learning_rate": 9.602216910630507e-06, "loss": 0.4247, "step": 1300 }, { "epoch": 0.6458712559986761, "grad_norm": 0.41366544365882874, "learning_rate": 9.60108686809123e-06, "loss": 0.4305, "step": 1301 }, { "epoch": 0.646367698163164, "grad_norm": 0.4102383852005005, "learning_rate": 9.599955289374097e-06, "loss": 0.4222, "step": 1302 }, { "epoch": 0.6468641403276518, "grad_norm": 0.47967714071273804, "learning_rate": 9.598822174856912e-06, "loss": 0.4256, "step": 1303 }, { "epoch": 0.6473605824921397, "grad_norm": 0.42989903688430786, "learning_rate": 9.597687524917992e-06, "loss": 0.4392, "step": 1304 }, { "epoch": 0.6478570246566275, "grad_norm": 0.4602229595184326, "learning_rate": 9.596551339936167e-06, "loss": 0.4366, "step": 1305 }, { "epoch": 0.6483534668211154, "grad_norm": 0.43717435002326965, "learning_rate": 9.595413620290774e-06, "loss": 0.4175, "step": 1306 }, { "epoch": 0.6488499089856031, "grad_norm": 0.516043484210968, "learning_rate": 9.594274366361673e-06, "loss": 0.4265, "step": 1307 }, { "epoch": 0.649346351150091, "grad_norm": 0.4251888394355774, "learning_rate": 9.593133578529224e-06, "loss": 0.4196, "step": 1308 }, { "epoch": 0.6498427933145788, "grad_norm": 0.426708847284317, "learning_rate": 9.59199125717431e-06, "loss": 0.4082, "step": 1309 }, { "epoch": 0.6503392354790667, "grad_norm": 0.4923485815525055, "learning_rate": 9.590847402678316e-06, "loss": 0.4356, "step": 1310 }, { "epoch": 0.6508356776435545, "grad_norm": 0.4465941786766052, "learning_rate": 9.589702015423148e-06, "loss": 0.4517, "step": 1311 }, { "epoch": 0.6513321198080424, "grad_norm": 0.4979263246059418, "learning_rate": 9.588555095791219e-06, "loss": 0.4489, "step": 1312 }, { "epoch": 0.6518285619725303, "grad_norm": 0.43775296211242676, "learning_rate": 9.587406644165453e-06, "loss": 0.4419, "step": 1313 }, { "epoch": 0.652325004137018, "grad_norm": 0.4804302752017975, "learning_rate": 9.586256660929287e-06, "loss": 0.412, "step": 1314 }, { "epoch": 0.6528214463015058, "grad_norm": 0.4968571960926056, "learning_rate": 9.585105146466668e-06, "loss": 0.413, "step": 1315 }, { "epoch": 0.6533178884659937, "grad_norm": 0.5360041856765747, "learning_rate": 9.58395210116206e-06, "loss": 0.4323, "step": 1316 }, { "epoch": 0.6538143306304816, "grad_norm": 0.41034501791000366, "learning_rate": 9.582797525400428e-06, "loss": 0.4084, "step": 1317 }, { "epoch": 0.6543107727949694, "grad_norm": 0.457785964012146, "learning_rate": 9.581641419567256e-06, "loss": 0.4483, "step": 1318 }, { "epoch": 0.6548072149594573, "grad_norm": 0.49286141991615295, "learning_rate": 9.580483784048537e-06, "loss": 0.464, "step": 1319 }, { "epoch": 0.6553036571239451, "grad_norm": 0.3854372501373291, "learning_rate": 9.579324619230772e-06, "loss": 0.4404, "step": 1320 }, { "epoch": 0.6558000992884329, "grad_norm": 0.5243598222732544, "learning_rate": 9.578163925500978e-06, "loss": 0.4275, "step": 1321 }, { "epoch": 0.6562965414529207, "grad_norm": 0.37719297409057617, "learning_rate": 9.577001703246676e-06, "loss": 0.4352, "step": 1322 }, { "epoch": 0.6567929836174086, "grad_norm": 0.44731009006500244, "learning_rate": 9.5758379528559e-06, "loss": 0.4285, "step": 1323 }, { "epoch": 0.6572894257818964, "grad_norm": 0.4489118754863739, "learning_rate": 9.574672674717196e-06, "loss": 0.4401, "step": 1324 }, { "epoch": 0.6577858679463843, "grad_norm": 0.4079752564430237, "learning_rate": 9.57350586921962e-06, "loss": 0.431, "step": 1325 }, { "epoch": 0.6582823101108721, "grad_norm": 0.4711299538612366, "learning_rate": 9.572337536752733e-06, "loss": 0.4565, "step": 1326 }, { "epoch": 0.6587787522753599, "grad_norm": 0.385219544172287, "learning_rate": 9.571167677706615e-06, "loss": 0.4515, "step": 1327 }, { "epoch": 0.6592751944398477, "grad_norm": 0.4511515498161316, "learning_rate": 9.569996292471844e-06, "loss": 0.43, "step": 1328 }, { "epoch": 0.6597716366043356, "grad_norm": 0.4033202528953552, "learning_rate": 9.568823381439518e-06, "loss": 0.4497, "step": 1329 }, { "epoch": 0.6602680787688234, "grad_norm": 0.42720940709114075, "learning_rate": 9.567648945001238e-06, "loss": 0.4528, "step": 1330 }, { "epoch": 0.6607645209333113, "grad_norm": 0.4596070945262909, "learning_rate": 9.566472983549118e-06, "loss": 0.4408, "step": 1331 }, { "epoch": 0.6612609630977991, "grad_norm": 0.4035940170288086, "learning_rate": 9.565295497475777e-06, "loss": 0.4131, "step": 1332 }, { "epoch": 0.661757405262287, "grad_norm": 0.45256832242012024, "learning_rate": 9.564116487174348e-06, "loss": 0.4205, "step": 1333 }, { "epoch": 0.6622538474267747, "grad_norm": 0.4633987247943878, "learning_rate": 9.56293595303847e-06, "loss": 0.4237, "step": 1334 }, { "epoch": 0.6627502895912626, "grad_norm": 0.4571603536605835, "learning_rate": 9.561753895462292e-06, "loss": 0.4255, "step": 1335 }, { "epoch": 0.6632467317557504, "grad_norm": 0.5023860335350037, "learning_rate": 9.560570314840469e-06, "loss": 0.439, "step": 1336 }, { "epoch": 0.6637431739202383, "grad_norm": 0.4380987286567688, "learning_rate": 9.559385211568167e-06, "loss": 0.4494, "step": 1337 }, { "epoch": 0.6642396160847261, "grad_norm": 0.4193746745586395, "learning_rate": 9.558198586041062e-06, "loss": 0.432, "step": 1338 }, { "epoch": 0.664736058249214, "grad_norm": 0.40151044726371765, "learning_rate": 9.557010438655332e-06, "loss": 0.4502, "step": 1339 }, { "epoch": 0.6652325004137019, "grad_norm": 0.422317773103714, "learning_rate": 9.555820769807668e-06, "loss": 0.4464, "step": 1340 }, { "epoch": 0.6657289425781896, "grad_norm": 0.37627777457237244, "learning_rate": 9.554629579895272e-06, "loss": 0.428, "step": 1341 }, { "epoch": 0.6662253847426775, "grad_norm": 0.3790498375892639, "learning_rate": 9.553436869315846e-06, "loss": 0.4528, "step": 1342 }, { "epoch": 0.6667218269071653, "grad_norm": 0.4184736907482147, "learning_rate": 9.552242638467604e-06, "loss": 0.4265, "step": 1343 }, { "epoch": 0.6672182690716532, "grad_norm": 0.4231106638908386, "learning_rate": 9.55104688774927e-06, "loss": 0.421, "step": 1344 }, { "epoch": 0.667714711236141, "grad_norm": 0.4196281433105469, "learning_rate": 9.54984961756007e-06, "loss": 0.4179, "step": 1345 }, { "epoch": 0.6682111534006289, "grad_norm": 0.3949560523033142, "learning_rate": 9.548650828299742e-06, "loss": 0.4393, "step": 1346 }, { "epoch": 0.6687075955651167, "grad_norm": 0.41208523511886597, "learning_rate": 9.547450520368526e-06, "loss": 0.4474, "step": 1347 }, { "epoch": 0.6692040377296045, "grad_norm": 0.3966695964336395, "learning_rate": 9.546248694167175e-06, "loss": 0.4503, "step": 1348 }, { "epoch": 0.6697004798940923, "grad_norm": 0.38605812191963196, "learning_rate": 9.545045350096944e-06, "loss": 0.4353, "step": 1349 }, { "epoch": 0.6701969220585802, "grad_norm": 0.4345308244228363, "learning_rate": 9.5438404885596e-06, "loss": 0.402, "step": 1350 }, { "epoch": 0.670693364223068, "grad_norm": 0.42831385135650635, "learning_rate": 9.54263410995741e-06, "loss": 0.4318, "step": 1351 }, { "epoch": 0.6711898063875559, "grad_norm": 0.3978542387485504, "learning_rate": 9.541426214693153e-06, "loss": 0.4135, "step": 1352 }, { "epoch": 0.6716862485520437, "grad_norm": 0.41623950004577637, "learning_rate": 9.540216803170113e-06, "loss": 0.4545, "step": 1353 }, { "epoch": 0.6721826907165315, "grad_norm": 0.4107891917228699, "learning_rate": 9.539005875792077e-06, "loss": 0.4412, "step": 1354 }, { "epoch": 0.6726791328810193, "grad_norm": 0.4004199802875519, "learning_rate": 9.537793432963345e-06, "loss": 0.4498, "step": 1355 }, { "epoch": 0.6731755750455072, "grad_norm": 0.47353246808052063, "learning_rate": 9.536579475088714e-06, "loss": 0.4416, "step": 1356 }, { "epoch": 0.673672017209995, "grad_norm": 0.38804739713668823, "learning_rate": 9.535364002573495e-06, "loss": 0.4271, "step": 1357 }, { "epoch": 0.6741684593744829, "grad_norm": 0.37334445118904114, "learning_rate": 9.534147015823498e-06, "loss": 0.4097, "step": 1358 }, { "epoch": 0.6746649015389707, "grad_norm": 0.46157944202423096, "learning_rate": 9.532928515245046e-06, "loss": 0.449, "step": 1359 }, { "epoch": 0.6751613437034586, "grad_norm": 0.41280093789100647, "learning_rate": 9.531708501244958e-06, "loss": 0.4355, "step": 1360 }, { "epoch": 0.6756577858679463, "grad_norm": 0.47634565830230713, "learning_rate": 9.530486974230568e-06, "loss": 0.4439, "step": 1361 }, { "epoch": 0.6761542280324342, "grad_norm": 0.44875568151474, "learning_rate": 9.52926393460971e-06, "loss": 0.4462, "step": 1362 }, { "epoch": 0.676650670196922, "grad_norm": 0.42062708735466003, "learning_rate": 9.528039382790722e-06, "loss": 0.4357, "step": 1363 }, { "epoch": 0.6771471123614099, "grad_norm": 0.4927869141101837, "learning_rate": 9.526813319182449e-06, "loss": 0.4384, "step": 1364 }, { "epoch": 0.6776435545258978, "grad_norm": 0.413518488407135, "learning_rate": 9.525585744194243e-06, "loss": 0.4201, "step": 1365 }, { "epoch": 0.6781399966903856, "grad_norm": 0.41098010540008545, "learning_rate": 9.524356658235954e-06, "loss": 0.4477, "step": 1366 }, { "epoch": 0.6786364388548735, "grad_norm": 0.43680691719055176, "learning_rate": 9.52312606171794e-06, "loss": 0.4173, "step": 1367 }, { "epoch": 0.6791328810193612, "grad_norm": 0.3868040144443512, "learning_rate": 9.52189395505107e-06, "loss": 0.4324, "step": 1368 }, { "epoch": 0.679629323183849, "grad_norm": 0.3933830261230469, "learning_rate": 9.520660338646702e-06, "loss": 0.4291, "step": 1369 }, { "epoch": 0.6801257653483369, "grad_norm": 0.44241610169410706, "learning_rate": 9.519425212916714e-06, "loss": 0.4625, "step": 1370 }, { "epoch": 0.6806222075128248, "grad_norm": 0.3596033453941345, "learning_rate": 9.51818857827348e-06, "loss": 0.453, "step": 1371 }, { "epoch": 0.6811186496773126, "grad_norm": 0.4040820896625519, "learning_rate": 9.516950435129875e-06, "loss": 0.4432, "step": 1372 }, { "epoch": 0.6816150918418005, "grad_norm": 0.4064486026763916, "learning_rate": 9.515710783899284e-06, "loss": 0.408, "step": 1373 }, { "epoch": 0.6821115340062883, "grad_norm": 0.40679699182510376, "learning_rate": 9.514469624995593e-06, "loss": 0.426, "step": 1374 }, { "epoch": 0.6826079761707761, "grad_norm": 0.36843225359916687, "learning_rate": 9.51322695883319e-06, "loss": 0.4043, "step": 1375 }, { "epoch": 0.6831044183352639, "grad_norm": 0.38306936621665955, "learning_rate": 9.51198278582697e-06, "loss": 0.4125, "step": 1376 }, { "epoch": 0.6836008604997518, "grad_norm": 0.41934823989868164, "learning_rate": 9.510737106392325e-06, "loss": 0.4242, "step": 1377 }, { "epoch": 0.6840973026642396, "grad_norm": 0.44217631220817566, "learning_rate": 9.509489920945155e-06, "loss": 0.4246, "step": 1378 }, { "epoch": 0.6845937448287275, "grad_norm": 0.4035203754901886, "learning_rate": 9.508241229901862e-06, "loss": 0.4251, "step": 1379 }, { "epoch": 0.6850901869932153, "grad_norm": 0.48175230622291565, "learning_rate": 9.50699103367935e-06, "loss": 0.4383, "step": 1380 }, { "epoch": 0.6855866291577031, "grad_norm": 0.4191337823867798, "learning_rate": 9.505739332695026e-06, "loss": 0.4519, "step": 1381 }, { "epoch": 0.6860830713221909, "grad_norm": 0.4702138900756836, "learning_rate": 9.504486127366796e-06, "loss": 0.4281, "step": 1382 }, { "epoch": 0.6865795134866788, "grad_norm": 0.3940064609050751, "learning_rate": 9.503231418113073e-06, "loss": 0.4346, "step": 1383 }, { "epoch": 0.6870759556511666, "grad_norm": 0.5342278480529785, "learning_rate": 9.501975205352772e-06, "loss": 0.4268, "step": 1384 }, { "epoch": 0.6875723978156545, "grad_norm": 0.3876984417438507, "learning_rate": 9.500717489505307e-06, "loss": 0.4309, "step": 1385 }, { "epoch": 0.6880688399801423, "grad_norm": 0.3960898816585541, "learning_rate": 9.499458270990593e-06, "loss": 0.4414, "step": 1386 }, { "epoch": 0.6885652821446302, "grad_norm": 0.47141197323799133, "learning_rate": 9.498197550229054e-06, "loss": 0.4287, "step": 1387 }, { "epoch": 0.6890617243091179, "grad_norm": 0.3970928192138672, "learning_rate": 9.496935327641605e-06, "loss": 0.4293, "step": 1388 }, { "epoch": 0.6895581664736058, "grad_norm": 0.4569436311721802, "learning_rate": 9.49567160364967e-06, "loss": 0.4393, "step": 1389 }, { "epoch": 0.6900546086380936, "grad_norm": 0.4364161491394043, "learning_rate": 9.494406378675173e-06, "loss": 0.4461, "step": 1390 }, { "epoch": 0.6905510508025815, "grad_norm": 0.406311959028244, "learning_rate": 9.493139653140537e-06, "loss": 0.4316, "step": 1391 }, { "epoch": 0.6910474929670694, "grad_norm": 0.49378830194473267, "learning_rate": 9.491871427468687e-06, "loss": 0.4495, "step": 1392 }, { "epoch": 0.6915439351315572, "grad_norm": 0.4028064012527466, "learning_rate": 9.490601702083051e-06, "loss": 0.4184, "step": 1393 }, { "epoch": 0.6920403772960451, "grad_norm": 0.418178528547287, "learning_rate": 9.489330477407554e-06, "loss": 0.4745, "step": 1394 }, { "epoch": 0.6925368194605328, "grad_norm": 0.4877155125141144, "learning_rate": 9.488057753866623e-06, "loss": 0.4489, "step": 1395 }, { "epoch": 0.6930332616250207, "grad_norm": 0.4235035181045532, "learning_rate": 9.486783531885187e-06, "loss": 0.4066, "step": 1396 }, { "epoch": 0.6935297037895085, "grad_norm": 0.42065244913101196, "learning_rate": 9.485507811888673e-06, "loss": 0.4138, "step": 1397 }, { "epoch": 0.6940261459539964, "grad_norm": 0.45952239632606506, "learning_rate": 9.48423059430301e-06, "loss": 0.4304, "step": 1398 }, { "epoch": 0.6945225881184842, "grad_norm": 0.44655489921569824, "learning_rate": 9.482951879554628e-06, "loss": 0.462, "step": 1399 }, { "epoch": 0.6950190302829721, "grad_norm": 0.4896790385246277, "learning_rate": 9.481671668070452e-06, "loss": 0.4492, "step": 1400 }, { "epoch": 0.6955154724474599, "grad_norm": 0.41520121693611145, "learning_rate": 9.480389960277911e-06, "loss": 0.4338, "step": 1401 }, { "epoch": 0.6960119146119477, "grad_norm": 0.41289788484573364, "learning_rate": 9.479106756604935e-06, "loss": 0.4387, "step": 1402 }, { "epoch": 0.6965083567764355, "grad_norm": 0.47464853525161743, "learning_rate": 9.477822057479945e-06, "loss": 0.4631, "step": 1403 }, { "epoch": 0.6970047989409234, "grad_norm": 0.42315223813056946, "learning_rate": 9.476535863331873e-06, "loss": 0.4417, "step": 1404 }, { "epoch": 0.6975012411054112, "grad_norm": 0.4638376533985138, "learning_rate": 9.47524817459014e-06, "loss": 0.4282, "step": 1405 }, { "epoch": 0.6979976832698991, "grad_norm": 0.39112573862075806, "learning_rate": 9.473958991684671e-06, "loss": 0.3902, "step": 1406 }, { "epoch": 0.6984941254343869, "grad_norm": 0.4355223476886749, "learning_rate": 9.472668315045893e-06, "loss": 0.4056, "step": 1407 }, { "epoch": 0.6989905675988747, "grad_norm": 0.4324764311313629, "learning_rate": 9.471376145104723e-06, "loss": 0.4473, "step": 1408 }, { "epoch": 0.6994870097633625, "grad_norm": 0.4459781050682068, "learning_rate": 9.470082482292585e-06, "loss": 0.4353, "step": 1409 }, { "epoch": 0.6999834519278504, "grad_norm": 0.41651982069015503, "learning_rate": 9.468787327041394e-06, "loss": 0.4136, "step": 1410 }, { "epoch": 0.7004798940923382, "grad_norm": 0.40642544627189636, "learning_rate": 9.467490679783571e-06, "loss": 0.408, "step": 1411 }, { "epoch": 0.7009763362568261, "grad_norm": 0.38587844371795654, "learning_rate": 9.46619254095203e-06, "loss": 0.429, "step": 1412 }, { "epoch": 0.701472778421314, "grad_norm": 0.40110430121421814, "learning_rate": 9.464892910980184e-06, "loss": 0.4397, "step": 1413 }, { "epoch": 0.7019692205858018, "grad_norm": 0.43806320428848267, "learning_rate": 9.463591790301942e-06, "loss": 0.4264, "step": 1414 }, { "epoch": 0.7024656627502895, "grad_norm": 0.4070325493812561, "learning_rate": 9.462289179351716e-06, "loss": 0.4268, "step": 1415 }, { "epoch": 0.7029621049147774, "grad_norm": 0.3710170090198517, "learning_rate": 9.460985078564414e-06, "loss": 0.4119, "step": 1416 }, { "epoch": 0.7034585470792653, "grad_norm": 0.4250202476978302, "learning_rate": 9.459679488375432e-06, "loss": 0.4198, "step": 1417 }, { "epoch": 0.7039549892437531, "grad_norm": 0.45716139674186707, "learning_rate": 9.45837240922068e-06, "loss": 0.451, "step": 1418 }, { "epoch": 0.704451431408241, "grad_norm": 0.3803211450576782, "learning_rate": 9.45706384153655e-06, "loss": 0.4137, "step": 1419 }, { "epoch": 0.7049478735727288, "grad_norm": 0.44927355647087097, "learning_rate": 9.455753785759942e-06, "loss": 0.433, "step": 1420 }, { "epoch": 0.7054443157372167, "grad_norm": 0.3816840946674347, "learning_rate": 9.454442242328246e-06, "loss": 0.4307, "step": 1421 }, { "epoch": 0.7059407579017044, "grad_norm": 0.41462355852127075, "learning_rate": 9.453129211679348e-06, "loss": 0.4386, "step": 1422 }, { "epoch": 0.7064372000661923, "grad_norm": 0.4304116368293762, "learning_rate": 9.451814694251636e-06, "loss": 0.4344, "step": 1423 }, { "epoch": 0.7069336422306801, "grad_norm": 0.40894564986228943, "learning_rate": 9.450498690483993e-06, "loss": 0.4354, "step": 1424 }, { "epoch": 0.707430084395168, "grad_norm": 0.45169875025749207, "learning_rate": 9.449181200815793e-06, "loss": 0.4473, "step": 1425 }, { "epoch": 0.7079265265596558, "grad_norm": 0.42012956738471985, "learning_rate": 9.447862225686912e-06, "loss": 0.4392, "step": 1426 }, { "epoch": 0.7084229687241437, "grad_norm": 0.4041118621826172, "learning_rate": 9.446541765537723e-06, "loss": 0.4354, "step": 1427 }, { "epoch": 0.7089194108886315, "grad_norm": 0.41721275448799133, "learning_rate": 9.445219820809086e-06, "loss": 0.4258, "step": 1428 }, { "epoch": 0.7094158530531193, "grad_norm": 0.39190176129341125, "learning_rate": 9.443896391942365e-06, "loss": 0.4495, "step": 1429 }, { "epoch": 0.7099122952176071, "grad_norm": 0.4218344986438751, "learning_rate": 9.442571479379419e-06, "loss": 0.4165, "step": 1430 }, { "epoch": 0.710408737382095, "grad_norm": 0.37468311190605164, "learning_rate": 9.441245083562597e-06, "loss": 0.4169, "step": 1431 }, { "epoch": 0.7109051795465828, "grad_norm": 0.5103012919425964, "learning_rate": 9.439917204934748e-06, "loss": 0.4533, "step": 1432 }, { "epoch": 0.7114016217110707, "grad_norm": 0.4211650490760803, "learning_rate": 9.438587843939216e-06, "loss": 0.4471, "step": 1433 }, { "epoch": 0.7118980638755585, "grad_norm": 0.42823201417922974, "learning_rate": 9.437257001019835e-06, "loss": 0.4135, "step": 1434 }, { "epoch": 0.7123945060400463, "grad_norm": 0.46609586477279663, "learning_rate": 9.435924676620941e-06, "loss": 0.437, "step": 1435 }, { "epoch": 0.7128909482045341, "grad_norm": 0.38086259365081787, "learning_rate": 9.43459087118736e-06, "loss": 0.4056, "step": 1436 }, { "epoch": 0.713387390369022, "grad_norm": 0.45401692390441895, "learning_rate": 9.43325558516441e-06, "loss": 0.4485, "step": 1437 }, { "epoch": 0.7138838325335098, "grad_norm": 0.3603178858757019, "learning_rate": 9.43191881899791e-06, "loss": 0.4322, "step": 1438 }, { "epoch": 0.7143802746979977, "grad_norm": 0.4563452899456024, "learning_rate": 9.430580573134169e-06, "loss": 0.4311, "step": 1439 }, { "epoch": 0.7148767168624856, "grad_norm": 0.4092944264411926, "learning_rate": 9.429240848019992e-06, "loss": 0.4069, "step": 1440 }, { "epoch": 0.7153731590269734, "grad_norm": 0.4148683547973633, "learning_rate": 9.427899644102676e-06, "loss": 0.441, "step": 1441 }, { "epoch": 0.7158696011914611, "grad_norm": 0.4243881404399872, "learning_rate": 9.426556961830013e-06, "loss": 0.4368, "step": 1442 }, { "epoch": 0.716366043355949, "grad_norm": 0.43767157196998596, "learning_rate": 9.425212801650286e-06, "loss": 0.4165, "step": 1443 }, { "epoch": 0.7168624855204369, "grad_norm": 0.441358745098114, "learning_rate": 9.423867164012276e-06, "loss": 0.4173, "step": 1444 }, { "epoch": 0.7173589276849247, "grad_norm": 0.40249183773994446, "learning_rate": 9.422520049365254e-06, "loss": 0.4236, "step": 1445 }, { "epoch": 0.7178553698494126, "grad_norm": 0.42431801557540894, "learning_rate": 9.421171458158986e-06, "loss": 0.4009, "step": 1446 }, { "epoch": 0.7183518120139004, "grad_norm": 0.4143202602863312, "learning_rate": 9.419821390843728e-06, "loss": 0.4216, "step": 1447 }, { "epoch": 0.7188482541783883, "grad_norm": 0.4698046147823334, "learning_rate": 9.41846984787023e-06, "loss": 0.3938, "step": 1448 }, { "epoch": 0.719344696342876, "grad_norm": 0.4480956792831421, "learning_rate": 9.41711682968974e-06, "loss": 0.4358, "step": 1449 }, { "epoch": 0.7198411385073639, "grad_norm": 0.4999115765094757, "learning_rate": 9.41576233675399e-06, "loss": 0.432, "step": 1450 }, { "epoch": 0.7203375806718517, "grad_norm": 1.0066134929656982, "learning_rate": 9.414406369515208e-06, "loss": 0.4091, "step": 1451 }, { "epoch": 0.7208340228363396, "grad_norm": 0.3855555057525635, "learning_rate": 9.413048928426118e-06, "loss": 0.4266, "step": 1452 }, { "epoch": 0.7213304650008274, "grad_norm": 0.5427743792533875, "learning_rate": 9.411690013939932e-06, "loss": 0.421, "step": 1453 }, { "epoch": 0.7218269071653153, "grad_norm": 0.43093135952949524, "learning_rate": 9.41032962651035e-06, "loss": 0.4662, "step": 1454 }, { "epoch": 0.7223233493298031, "grad_norm": 0.4404541552066803, "learning_rate": 9.408967766591574e-06, "loss": 0.433, "step": 1455 }, { "epoch": 0.7228197914942909, "grad_norm": 0.44221729040145874, "learning_rate": 9.40760443463829e-06, "loss": 0.4461, "step": 1456 }, { "epoch": 0.7233162336587787, "grad_norm": 0.3904377222061157, "learning_rate": 9.406239631105675e-06, "loss": 0.4414, "step": 1457 }, { "epoch": 0.7238126758232666, "grad_norm": 0.3865761458873749, "learning_rate": 9.404873356449406e-06, "loss": 0.4253, "step": 1458 }, { "epoch": 0.7243091179877544, "grad_norm": 0.41681918501853943, "learning_rate": 9.403505611125638e-06, "loss": 0.4131, "step": 1459 }, { "epoch": 0.7248055601522423, "grad_norm": 0.41669148206710815, "learning_rate": 9.402136395591028e-06, "loss": 0.4297, "step": 1460 }, { "epoch": 0.7253020023167301, "grad_norm": 0.37449419498443604, "learning_rate": 9.40076571030272e-06, "loss": 0.3876, "step": 1461 }, { "epoch": 0.7257984444812179, "grad_norm": 0.4099705219268799, "learning_rate": 9.399393555718346e-06, "loss": 0.4667, "step": 1462 }, { "epoch": 0.7262948866457057, "grad_norm": 0.3941470682621002, "learning_rate": 9.398019932296033e-06, "loss": 0.4176, "step": 1463 }, { "epoch": 0.7267913288101936, "grad_norm": 0.37210696935653687, "learning_rate": 9.396644840494396e-06, "loss": 0.4214, "step": 1464 }, { "epoch": 0.7272877709746814, "grad_norm": 0.3757534325122833, "learning_rate": 9.395268280772542e-06, "loss": 0.418, "step": 1465 }, { "epoch": 0.7277842131391693, "grad_norm": 0.38585469126701355, "learning_rate": 9.393890253590064e-06, "loss": 0.3992, "step": 1466 }, { "epoch": 0.7282806553036572, "grad_norm": 0.34409859776496887, "learning_rate": 9.392510759407053e-06, "loss": 0.4189, "step": 1467 }, { "epoch": 0.728777097468145, "grad_norm": 0.41075262427330017, "learning_rate": 9.391129798684078e-06, "loss": 0.4516, "step": 1468 }, { "epoch": 0.7292735396326328, "grad_norm": 0.4065111577510834, "learning_rate": 9.389747371882207e-06, "loss": 0.4312, "step": 1469 }, { "epoch": 0.7297699817971206, "grad_norm": 0.3895357847213745, "learning_rate": 9.388363479462997e-06, "loss": 0.3924, "step": 1470 }, { "epoch": 0.7302664239616085, "grad_norm": 0.4128851592540741, "learning_rate": 9.38697812188849e-06, "loss": 0.4237, "step": 1471 }, { "epoch": 0.7307628661260963, "grad_norm": 0.4051092267036438, "learning_rate": 9.38559129962122e-06, "loss": 0.4256, "step": 1472 }, { "epoch": 0.7312593082905842, "grad_norm": 0.36582502722740173, "learning_rate": 9.384203013124209e-06, "loss": 0.4352, "step": 1473 }, { "epoch": 0.731755750455072, "grad_norm": 0.45871204137802124, "learning_rate": 9.382813262860968e-06, "loss": 0.4358, "step": 1474 }, { "epoch": 0.7322521926195599, "grad_norm": 0.4001859128475189, "learning_rate": 9.381422049295496e-06, "loss": 0.4099, "step": 1475 }, { "epoch": 0.7327486347840476, "grad_norm": 0.3763517141342163, "learning_rate": 9.380029372892282e-06, "loss": 0.4253, "step": 1476 }, { "epoch": 0.7332450769485355, "grad_norm": 0.339546263217926, "learning_rate": 9.378635234116303e-06, "loss": 0.4391, "step": 1477 }, { "epoch": 0.7337415191130233, "grad_norm": 0.4090888798236847, "learning_rate": 9.377239633433026e-06, "loss": 0.4342, "step": 1478 }, { "epoch": 0.7342379612775112, "grad_norm": 0.3667282462120056, "learning_rate": 9.3758425713084e-06, "loss": 0.4477, "step": 1479 }, { "epoch": 0.734734403441999, "grad_norm": 0.37853044271469116, "learning_rate": 9.374444048208868e-06, "loss": 0.3899, "step": 1480 }, { "epoch": 0.7352308456064869, "grad_norm": 0.3784453868865967, "learning_rate": 9.37304406460136e-06, "loss": 0.4318, "step": 1481 }, { "epoch": 0.7357272877709747, "grad_norm": 0.41752052307128906, "learning_rate": 9.371642620953293e-06, "loss": 0.4232, "step": 1482 }, { "epoch": 0.7362237299354625, "grad_norm": 0.423281192779541, "learning_rate": 9.370239717732567e-06, "loss": 0.4624, "step": 1483 }, { "epoch": 0.7367201720999503, "grad_norm": 0.41418176889419556, "learning_rate": 9.368835355407577e-06, "loss": 0.4373, "step": 1484 }, { "epoch": 0.7372166142644382, "grad_norm": 0.37135693430900574, "learning_rate": 9.367429534447199e-06, "loss": 0.411, "step": 1485 }, { "epoch": 0.737713056428926, "grad_norm": 0.42677608132362366, "learning_rate": 9.3660222553208e-06, "loss": 0.4646, "step": 1486 }, { "epoch": 0.7382094985934139, "grad_norm": 0.38734686374664307, "learning_rate": 9.364613518498233e-06, "loss": 0.413, "step": 1487 }, { "epoch": 0.7387059407579017, "grad_norm": 0.46174323558807373, "learning_rate": 9.363203324449837e-06, "loss": 0.4123, "step": 1488 }, { "epoch": 0.7392023829223895, "grad_norm": 0.4714295566082001, "learning_rate": 9.361791673646434e-06, "loss": 0.4467, "step": 1489 }, { "epoch": 0.7396988250868773, "grad_norm": 0.3929242491722107, "learning_rate": 9.360378566559338e-06, "loss": 0.4128, "step": 1490 }, { "epoch": 0.7401952672513652, "grad_norm": 0.43580830097198486, "learning_rate": 9.358964003660347e-06, "loss": 0.4472, "step": 1491 }, { "epoch": 0.740691709415853, "grad_norm": 0.4355698823928833, "learning_rate": 9.357547985421746e-06, "loss": 0.3987, "step": 1492 }, { "epoch": 0.7411881515803409, "grad_norm": 0.4166388511657715, "learning_rate": 9.356130512316306e-06, "loss": 0.4125, "step": 1493 }, { "epoch": 0.7416845937448288, "grad_norm": 0.40577924251556396, "learning_rate": 9.354711584817278e-06, "loss": 0.4202, "step": 1494 }, { "epoch": 0.7421810359093166, "grad_norm": 0.38044944405555725, "learning_rate": 9.353291203398409e-06, "loss": 0.398, "step": 1495 }, { "epoch": 0.7426774780738044, "grad_norm": 0.46603497862815857, "learning_rate": 9.351869368533921e-06, "loss": 0.4381, "step": 1496 }, { "epoch": 0.7431739202382922, "grad_norm": 0.40617862343788147, "learning_rate": 9.350446080698528e-06, "loss": 0.4301, "step": 1497 }, { "epoch": 0.7436703624027801, "grad_norm": 0.42731213569641113, "learning_rate": 9.349021340367429e-06, "loss": 0.422, "step": 1498 }, { "epoch": 0.7441668045672679, "grad_norm": 0.3669068515300751, "learning_rate": 9.347595148016304e-06, "loss": 0.4368, "step": 1499 }, { "epoch": 0.7446632467317558, "grad_norm": 0.424686998128891, "learning_rate": 9.34616750412132e-06, "loss": 0.4402, "step": 1500 }, { "epoch": 0.7451596888962436, "grad_norm": 0.48465946316719055, "learning_rate": 9.344738409159126e-06, "loss": 0.4364, "step": 1501 }, { "epoch": 0.7456561310607315, "grad_norm": 0.3990052342414856, "learning_rate": 9.343307863606865e-06, "loss": 0.4254, "step": 1502 }, { "epoch": 0.7461525732252192, "grad_norm": 0.3971939980983734, "learning_rate": 9.34187586794215e-06, "loss": 0.4491, "step": 1503 }, { "epoch": 0.7466490153897071, "grad_norm": 0.43258967995643616, "learning_rate": 9.340442422643087e-06, "loss": 0.4549, "step": 1504 }, { "epoch": 0.7471454575541949, "grad_norm": 0.4461072087287903, "learning_rate": 9.33900752818827e-06, "loss": 0.4442, "step": 1505 }, { "epoch": 0.7476418997186828, "grad_norm": 0.3778024911880493, "learning_rate": 9.337571185056764e-06, "loss": 0.4252, "step": 1506 }, { "epoch": 0.7481383418831706, "grad_norm": 0.3580498993396759, "learning_rate": 9.336133393728128e-06, "loss": 0.4101, "step": 1507 }, { "epoch": 0.7486347840476585, "grad_norm": 0.5160278081893921, "learning_rate": 9.334694154682403e-06, "loss": 0.4224, "step": 1508 }, { "epoch": 0.7491312262121463, "grad_norm": 0.40865445137023926, "learning_rate": 9.33325346840011e-06, "loss": 0.4369, "step": 1509 }, { "epoch": 0.7496276683766341, "grad_norm": 0.3660036325454712, "learning_rate": 9.331811335362256e-06, "loss": 0.4522, "step": 1510 }, { "epoch": 0.7501241105411219, "grad_norm": 0.4430111050605774, "learning_rate": 9.330367756050326e-06, "loss": 0.4316, "step": 1511 }, { "epoch": 0.7506205527056098, "grad_norm": 0.43433529138565063, "learning_rate": 9.328922730946297e-06, "loss": 0.4431, "step": 1512 }, { "epoch": 0.7511169948700976, "grad_norm": 0.4255540668964386, "learning_rate": 9.327476260532623e-06, "loss": 0.4244, "step": 1513 }, { "epoch": 0.7516134370345855, "grad_norm": 0.40298500657081604, "learning_rate": 9.326028345292237e-06, "loss": 0.4307, "step": 1514 }, { "epoch": 0.7521098791990734, "grad_norm": 0.43740716576576233, "learning_rate": 9.324578985708563e-06, "loss": 0.4151, "step": 1515 }, { "epoch": 0.7526063213635611, "grad_norm": 0.39715197682380676, "learning_rate": 9.323128182265502e-06, "loss": 0.4429, "step": 1516 }, { "epoch": 0.753102763528049, "grad_norm": 0.47228729724884033, "learning_rate": 9.321675935447436e-06, "loss": 0.4286, "step": 1517 }, { "epoch": 0.7535992056925368, "grad_norm": 0.4463198482990265, "learning_rate": 9.320222245739233e-06, "loss": 0.4663, "step": 1518 }, { "epoch": 0.7540956478570247, "grad_norm": 0.4002208113670349, "learning_rate": 9.318767113626237e-06, "loss": 0.4224, "step": 1519 }, { "epoch": 0.7545920900215125, "grad_norm": 0.39593884348869324, "learning_rate": 9.317310539594282e-06, "loss": 0.4363, "step": 1520 }, { "epoch": 0.7550885321860004, "grad_norm": 0.4159860610961914, "learning_rate": 9.315852524129673e-06, "loss": 0.4363, "step": 1521 }, { "epoch": 0.7555849743504882, "grad_norm": 0.4056793749332428, "learning_rate": 9.314393067719208e-06, "loss": 0.4428, "step": 1522 }, { "epoch": 0.756081416514976, "grad_norm": 0.4146648347377777, "learning_rate": 9.312932170850153e-06, "loss": 0.4381, "step": 1523 }, { "epoch": 0.7565778586794638, "grad_norm": 0.4170178771018982, "learning_rate": 9.311469834010267e-06, "loss": 0.4499, "step": 1524 }, { "epoch": 0.7570743008439517, "grad_norm": 0.4397692382335663, "learning_rate": 9.310006057687782e-06, "loss": 0.4179, "step": 1525 }, { "epoch": 0.7575707430084395, "grad_norm": 0.4015755355358124, "learning_rate": 9.308540842371415e-06, "loss": 0.418, "step": 1526 }, { "epoch": 0.7580671851729274, "grad_norm": 0.36518171429634094, "learning_rate": 9.30707418855036e-06, "loss": 0.4035, "step": 1527 }, { "epoch": 0.7585636273374152, "grad_norm": 0.4233193099498749, "learning_rate": 9.305606096714292e-06, "loss": 0.4162, "step": 1528 }, { "epoch": 0.7590600695019031, "grad_norm": 0.40879374742507935, "learning_rate": 9.304136567353371e-06, "loss": 0.4388, "step": 1529 }, { "epoch": 0.7595565116663908, "grad_norm": 0.3940625488758087, "learning_rate": 9.302665600958227e-06, "loss": 0.4531, "step": 1530 }, { "epoch": 0.7600529538308787, "grad_norm": 0.43436142802238464, "learning_rate": 9.30119319801998e-06, "loss": 0.4255, "step": 1531 }, { "epoch": 0.7605493959953665, "grad_norm": 0.35401299595832825, "learning_rate": 9.299719359030224e-06, "loss": 0.4479, "step": 1532 }, { "epoch": 0.7610458381598544, "grad_norm": 0.405460923910141, "learning_rate": 9.298244084481034e-06, "loss": 0.4384, "step": 1533 }, { "epoch": 0.7615422803243422, "grad_norm": 0.37142282724380493, "learning_rate": 9.296767374864963e-06, "loss": 0.4463, "step": 1534 }, { "epoch": 0.7620387224888301, "grad_norm": 0.3675410747528076, "learning_rate": 9.295289230675046e-06, "loss": 0.4214, "step": 1535 }, { "epoch": 0.7625351646533179, "grad_norm": 0.38596707582473755, "learning_rate": 9.293809652404795e-06, "loss": 0.4237, "step": 1536 }, { "epoch": 0.7630316068178057, "grad_norm": 0.36130332946777344, "learning_rate": 9.292328640548201e-06, "loss": 0.4078, "step": 1537 }, { "epoch": 0.7635280489822935, "grad_norm": 0.3778937757015228, "learning_rate": 9.290846195599732e-06, "loss": 0.4304, "step": 1538 }, { "epoch": 0.7640244911467814, "grad_norm": 0.43819281458854675, "learning_rate": 9.289362318054337e-06, "loss": 0.4602, "step": 1539 }, { "epoch": 0.7645209333112692, "grad_norm": 0.3938853442668915, "learning_rate": 9.28787700840744e-06, "loss": 0.4274, "step": 1540 }, { "epoch": 0.7650173754757571, "grad_norm": 0.36670711636543274, "learning_rate": 9.286390267154951e-06, "loss": 0.4214, "step": 1541 }, { "epoch": 0.765513817640245, "grad_norm": 0.408368319272995, "learning_rate": 9.284902094793248e-06, "loss": 0.4617, "step": 1542 }, { "epoch": 0.7660102598047327, "grad_norm": 0.3550085723400116, "learning_rate": 9.283412491819194e-06, "loss": 0.425, "step": 1543 }, { "epoch": 0.7665067019692205, "grad_norm": 0.400819331407547, "learning_rate": 9.281921458730126e-06, "loss": 0.429, "step": 1544 }, { "epoch": 0.7670031441337084, "grad_norm": 0.3586559295654297, "learning_rate": 9.280428996023857e-06, "loss": 0.4455, "step": 1545 }, { "epoch": 0.7674995862981963, "grad_norm": 0.32685765624046326, "learning_rate": 9.278935104198682e-06, "loss": 0.4198, "step": 1546 }, { "epoch": 0.7679960284626841, "grad_norm": 0.4124971628189087, "learning_rate": 9.277439783753373e-06, "loss": 0.4018, "step": 1547 }, { "epoch": 0.768492470627172, "grad_norm": 0.4312984049320221, "learning_rate": 9.275943035187173e-06, "loss": 0.4484, "step": 1548 }, { "epoch": 0.7689889127916598, "grad_norm": 0.380618691444397, "learning_rate": 9.274444858999808e-06, "loss": 0.4143, "step": 1549 }, { "epoch": 0.7694853549561476, "grad_norm": 0.43092700839042664, "learning_rate": 9.272945255691476e-06, "loss": 0.4435, "step": 1550 }, { "epoch": 0.7699817971206354, "grad_norm": 0.4354059100151062, "learning_rate": 9.271444225762857e-06, "loss": 0.4415, "step": 1551 }, { "epoch": 0.7704782392851233, "grad_norm": 0.4291730523109436, "learning_rate": 9.269941769715102e-06, "loss": 0.4556, "step": 1552 }, { "epoch": 0.7709746814496111, "grad_norm": 0.43749648332595825, "learning_rate": 9.268437888049839e-06, "loss": 0.4099, "step": 1553 }, { "epoch": 0.771471123614099, "grad_norm": 0.4126862585544586, "learning_rate": 9.266932581269177e-06, "loss": 0.38, "step": 1554 }, { "epoch": 0.7719675657785868, "grad_norm": 0.4034210443496704, "learning_rate": 9.265425849875696e-06, "loss": 0.4284, "step": 1555 }, { "epoch": 0.7724640079430747, "grad_norm": 0.40353643894195557, "learning_rate": 9.26391769437245e-06, "loss": 0.4271, "step": 1556 }, { "epoch": 0.7729604501075624, "grad_norm": 0.4125841557979584, "learning_rate": 9.262408115262971e-06, "loss": 0.4079, "step": 1557 }, { "epoch": 0.7734568922720503, "grad_norm": 0.36541005969047546, "learning_rate": 9.26089711305127e-06, "loss": 0.4064, "step": 1558 }, { "epoch": 0.7739533344365381, "grad_norm": 0.44078466296195984, "learning_rate": 9.259384688241828e-06, "loss": 0.4266, "step": 1559 }, { "epoch": 0.774449776601026, "grad_norm": 0.4268731474876404, "learning_rate": 9.257870841339601e-06, "loss": 0.4215, "step": 1560 }, { "epoch": 0.7749462187655138, "grad_norm": 0.36654284596443176, "learning_rate": 9.256355572850024e-06, "loss": 0.4107, "step": 1561 }, { "epoch": 0.7754426609300017, "grad_norm": 0.4628910422325134, "learning_rate": 9.254838883279002e-06, "loss": 0.4018, "step": 1562 }, { "epoch": 0.7759391030944895, "grad_norm": 0.4600302278995514, "learning_rate": 9.253320773132917e-06, "loss": 0.4159, "step": 1563 }, { "epoch": 0.7764355452589773, "grad_norm": 0.4735943377017975, "learning_rate": 9.251801242918623e-06, "loss": 0.4367, "step": 1564 }, { "epoch": 0.7769319874234651, "grad_norm": 0.472411185503006, "learning_rate": 9.250280293143455e-06, "loss": 0.4207, "step": 1565 }, { "epoch": 0.777428429587953, "grad_norm": 0.41192781925201416, "learning_rate": 9.248757924315211e-06, "loss": 0.4135, "step": 1566 }, { "epoch": 0.7779248717524408, "grad_norm": 0.41261810064315796, "learning_rate": 9.24723413694217e-06, "loss": 0.3878, "step": 1567 }, { "epoch": 0.7784213139169287, "grad_norm": 0.40218302607536316, "learning_rate": 9.245708931533087e-06, "loss": 0.4261, "step": 1568 }, { "epoch": 0.7789177560814166, "grad_norm": 0.42309895157814026, "learning_rate": 9.24418230859718e-06, "loss": 0.4312, "step": 1569 }, { "epoch": 0.7794141982459043, "grad_norm": 0.39758190512657166, "learning_rate": 9.242654268644153e-06, "loss": 0.428, "step": 1570 }, { "epoch": 0.7799106404103922, "grad_norm": 0.3847161531448364, "learning_rate": 9.241124812184176e-06, "loss": 0.433, "step": 1571 }, { "epoch": 0.78040708257488, "grad_norm": 0.36627575755119324, "learning_rate": 9.239593939727889e-06, "loss": 0.4241, "step": 1572 }, { "epoch": 0.7809035247393679, "grad_norm": 0.4111729562282562, "learning_rate": 9.238061651786414e-06, "loss": 0.426, "step": 1573 }, { "epoch": 0.7813999669038557, "grad_norm": 0.3828927278518677, "learning_rate": 9.236527948871335e-06, "loss": 0.4605, "step": 1574 }, { "epoch": 0.7818964090683436, "grad_norm": 0.39274147152900696, "learning_rate": 9.234992831494718e-06, "loss": 0.4259, "step": 1575 }, { "epoch": 0.7823928512328314, "grad_norm": 0.37524762749671936, "learning_rate": 9.233456300169093e-06, "loss": 0.4265, "step": 1576 }, { "epoch": 0.7828892933973192, "grad_norm": 0.37058714032173157, "learning_rate": 9.23191835540747e-06, "loss": 0.4217, "step": 1577 }, { "epoch": 0.783385735561807, "grad_norm": 0.4208095371723175, "learning_rate": 9.230378997723326e-06, "loss": 0.4365, "step": 1578 }, { "epoch": 0.7838821777262949, "grad_norm": 0.38172727823257446, "learning_rate": 9.228838227630609e-06, "loss": 0.4357, "step": 1579 }, { "epoch": 0.7843786198907827, "grad_norm": 0.3691891133785248, "learning_rate": 9.22729604564374e-06, "loss": 0.4235, "step": 1580 }, { "epoch": 0.7848750620552706, "grad_norm": 0.4037191569805145, "learning_rate": 9.225752452277617e-06, "loss": 0.4095, "step": 1581 }, { "epoch": 0.7853715042197584, "grad_norm": 0.47503721714019775, "learning_rate": 9.224207448047594e-06, "loss": 0.4238, "step": 1582 }, { "epoch": 0.7858679463842463, "grad_norm": 0.39686861634254456, "learning_rate": 9.222661033469517e-06, "loss": 0.4272, "step": 1583 }, { "epoch": 0.786364388548734, "grad_norm": 0.3798079788684845, "learning_rate": 9.221113209059684e-06, "loss": 0.4011, "step": 1584 }, { "epoch": 0.7868608307132219, "grad_norm": 0.411347895860672, "learning_rate": 9.219563975334875e-06, "loss": 0.4214, "step": 1585 }, { "epoch": 0.7873572728777097, "grad_norm": 0.429290235042572, "learning_rate": 9.218013332812334e-06, "loss": 0.4287, "step": 1586 }, { "epoch": 0.7878537150421976, "grad_norm": 0.36360999941825867, "learning_rate": 9.216461282009783e-06, "loss": 0.4219, "step": 1587 }, { "epoch": 0.7883501572066854, "grad_norm": 0.3892785608768463, "learning_rate": 9.214907823445405e-06, "loss": 0.4066, "step": 1588 }, { "epoch": 0.7888465993711733, "grad_norm": 0.42500850558280945, "learning_rate": 9.213352957637862e-06, "loss": 0.4304, "step": 1589 }, { "epoch": 0.7893430415356611, "grad_norm": 0.3729934096336365, "learning_rate": 9.211796685106275e-06, "loss": 0.4219, "step": 1590 }, { "epoch": 0.7898394837001489, "grad_norm": 0.35806456208229065, "learning_rate": 9.210239006370249e-06, "loss": 0.4203, "step": 1591 }, { "epoch": 0.7903359258646367, "grad_norm": 0.3778175115585327, "learning_rate": 9.208679921949845e-06, "loss": 0.4317, "step": 1592 }, { "epoch": 0.7908323680291246, "grad_norm": 0.4181416928768158, "learning_rate": 9.2071194323656e-06, "loss": 0.454, "step": 1593 }, { "epoch": 0.7913288101936125, "grad_norm": 0.3779582381248474, "learning_rate": 9.205557538138522e-06, "loss": 0.409, "step": 1594 }, { "epoch": 0.7918252523581003, "grad_norm": 0.3846624493598938, "learning_rate": 9.203994239790081e-06, "loss": 0.4286, "step": 1595 }, { "epoch": 0.7923216945225882, "grad_norm": 0.3811035752296448, "learning_rate": 9.202429537842221e-06, "loss": 0.42, "step": 1596 }, { "epoch": 0.7928181366870759, "grad_norm": 0.3895227015018463, "learning_rate": 9.200863432817355e-06, "loss": 0.4239, "step": 1597 }, { "epoch": 0.7933145788515638, "grad_norm": 0.36652982234954834, "learning_rate": 9.199295925238362e-06, "loss": 0.4351, "step": 1598 }, { "epoch": 0.7938110210160516, "grad_norm": 0.35944584012031555, "learning_rate": 9.19772701562859e-06, "loss": 0.4037, "step": 1599 }, { "epoch": 0.7943074631805395, "grad_norm": 0.3885008692741394, "learning_rate": 9.196156704511856e-06, "loss": 0.4036, "step": 1600 }, { "epoch": 0.7948039053450273, "grad_norm": 0.39122581481933594, "learning_rate": 9.194584992412442e-06, "loss": 0.4255, "step": 1601 }, { "epoch": 0.7953003475095152, "grad_norm": 0.4135722517967224, "learning_rate": 9.193011879855103e-06, "loss": 0.4359, "step": 1602 }, { "epoch": 0.795796789674003, "grad_norm": 0.4004303216934204, "learning_rate": 9.191437367365056e-06, "loss": 0.4245, "step": 1603 }, { "epoch": 0.7962932318384908, "grad_norm": 0.38865625858306885, "learning_rate": 9.18986145546799e-06, "loss": 0.4174, "step": 1604 }, { "epoch": 0.7967896740029786, "grad_norm": 0.402431458234787, "learning_rate": 9.188284144690057e-06, "loss": 0.4229, "step": 1605 }, { "epoch": 0.7972861161674665, "grad_norm": 0.46119824051856995, "learning_rate": 9.18670543555788e-06, "loss": 0.4109, "step": 1606 }, { "epoch": 0.7977825583319543, "grad_norm": 0.37266120314598083, "learning_rate": 9.185125328598547e-06, "loss": 0.435, "step": 1607 }, { "epoch": 0.7982790004964422, "grad_norm": 0.3712843060493469, "learning_rate": 9.183543824339612e-06, "loss": 0.4315, "step": 1608 }, { "epoch": 0.79877544266093, "grad_norm": 0.3886561095714569, "learning_rate": 9.181960923309094e-06, "loss": 0.4258, "step": 1609 }, { "epoch": 0.7992718848254179, "grad_norm": 0.39356082677841187, "learning_rate": 9.180376626035486e-06, "loss": 0.4344, "step": 1610 }, { "epoch": 0.7997683269899056, "grad_norm": 0.42910388112068176, "learning_rate": 9.178790933047739e-06, "loss": 0.4486, "step": 1611 }, { "epoch": 0.8002647691543935, "grad_norm": 0.3963761627674103, "learning_rate": 9.17720384487527e-06, "loss": 0.4331, "step": 1612 }, { "epoch": 0.8007612113188813, "grad_norm": 0.38831308484077454, "learning_rate": 9.175615362047969e-06, "loss": 0.4225, "step": 1613 }, { "epoch": 0.8012576534833692, "grad_norm": 0.4610259532928467, "learning_rate": 9.174025485096188e-06, "loss": 0.4189, "step": 1614 }, { "epoch": 0.801754095647857, "grad_norm": 0.38697150349617004, "learning_rate": 9.172434214550739e-06, "loss": 0.3953, "step": 1615 }, { "epoch": 0.8022505378123449, "grad_norm": 0.3885546028614044, "learning_rate": 9.170841550942905e-06, "loss": 0.4016, "step": 1616 }, { "epoch": 0.8027469799768328, "grad_norm": 0.3746548891067505, "learning_rate": 9.169247494804436e-06, "loss": 0.3937, "step": 1617 }, { "epoch": 0.8032434221413205, "grad_norm": 0.4274401366710663, "learning_rate": 9.167652046667542e-06, "loss": 0.4223, "step": 1618 }, { "epoch": 0.8037398643058083, "grad_norm": 0.39040738344192505, "learning_rate": 9.166055207064899e-06, "loss": 0.4435, "step": 1619 }, { "epoch": 0.8042363064702962, "grad_norm": 0.39380523562431335, "learning_rate": 9.16445697652965e-06, "loss": 0.4266, "step": 1620 }, { "epoch": 0.8047327486347841, "grad_norm": 0.3673664331436157, "learning_rate": 9.162857355595401e-06, "loss": 0.4114, "step": 1621 }, { "epoch": 0.8052291907992719, "grad_norm": 0.38951772451400757, "learning_rate": 9.161256344796221e-06, "loss": 0.4325, "step": 1622 }, { "epoch": 0.8057256329637598, "grad_norm": 0.37110674381256104, "learning_rate": 9.159653944666643e-06, "loss": 0.4156, "step": 1623 }, { "epoch": 0.8062220751282475, "grad_norm": 0.353771448135376, "learning_rate": 9.158050155741667e-06, "loss": 0.4279, "step": 1624 }, { "epoch": 0.8067185172927354, "grad_norm": 0.34209752082824707, "learning_rate": 9.156444978556753e-06, "loss": 0.414, "step": 1625 }, { "epoch": 0.8072149594572232, "grad_norm": 0.405704528093338, "learning_rate": 9.154838413647828e-06, "loss": 0.4126, "step": 1626 }, { "epoch": 0.8077114016217111, "grad_norm": 0.3899478614330292, "learning_rate": 9.153230461551276e-06, "loss": 0.4422, "step": 1627 }, { "epoch": 0.8082078437861989, "grad_norm": 0.3604156970977783, "learning_rate": 9.151621122803954e-06, "loss": 0.419, "step": 1628 }, { "epoch": 0.8087042859506868, "grad_norm": 0.3771144151687622, "learning_rate": 9.150010397943175e-06, "loss": 0.4075, "step": 1629 }, { "epoch": 0.8092007281151746, "grad_norm": 0.4005292057991028, "learning_rate": 9.148398287506713e-06, "loss": 0.3669, "step": 1630 }, { "epoch": 0.8096971702796624, "grad_norm": 0.3787948489189148, "learning_rate": 9.14678479203281e-06, "loss": 0.4376, "step": 1631 }, { "epoch": 0.8101936124441502, "grad_norm": 0.43001988530158997, "learning_rate": 9.145169912060168e-06, "loss": 0.4436, "step": 1632 }, { "epoch": 0.8106900546086381, "grad_norm": 0.3734782636165619, "learning_rate": 9.143553648127954e-06, "loss": 0.4277, "step": 1633 }, { "epoch": 0.8111864967731259, "grad_norm": 0.398652046918869, "learning_rate": 9.14193600077579e-06, "loss": 0.4434, "step": 1634 }, { "epoch": 0.8116829389376138, "grad_norm": 0.3720133602619171, "learning_rate": 9.140316970543768e-06, "loss": 0.4198, "step": 1635 }, { "epoch": 0.8121793811021016, "grad_norm": 0.3860694169998169, "learning_rate": 9.138696557972437e-06, "loss": 0.4252, "step": 1636 }, { "epoch": 0.8126758232665895, "grad_norm": 0.4012965261936188, "learning_rate": 9.137074763602809e-06, "loss": 0.4698, "step": 1637 }, { "epoch": 0.8131722654310772, "grad_norm": 0.39055952429771423, "learning_rate": 9.135451587976357e-06, "loss": 0.4391, "step": 1638 }, { "epoch": 0.8136687075955651, "grad_norm": 0.4181646704673767, "learning_rate": 9.133827031635015e-06, "loss": 0.4127, "step": 1639 }, { "epoch": 0.8141651497600529, "grad_norm": 0.4048299193382263, "learning_rate": 9.132201095121178e-06, "loss": 0.4176, "step": 1640 }, { "epoch": 0.8146615919245408, "grad_norm": 0.423301637172699, "learning_rate": 9.130573778977702e-06, "loss": 0.4378, "step": 1641 }, { "epoch": 0.8151580340890286, "grad_norm": 0.4163750410079956, "learning_rate": 9.128945083747906e-06, "loss": 0.4144, "step": 1642 }, { "epoch": 0.8156544762535165, "grad_norm": 0.3682147264480591, "learning_rate": 9.127315009975564e-06, "loss": 0.408, "step": 1643 }, { "epoch": 0.8161509184180044, "grad_norm": 0.4210602641105652, "learning_rate": 9.125683558204914e-06, "loss": 0.4368, "step": 1644 }, { "epoch": 0.8166473605824921, "grad_norm": 0.3999893367290497, "learning_rate": 9.124050728980652e-06, "loss": 0.4375, "step": 1645 }, { "epoch": 0.81714380274698, "grad_norm": 0.3988727629184723, "learning_rate": 9.122416522847939e-06, "loss": 0.4259, "step": 1646 }, { "epoch": 0.8176402449114678, "grad_norm": 0.3944082260131836, "learning_rate": 9.12078094035239e-06, "loss": 0.4128, "step": 1647 }, { "epoch": 0.8181366870759557, "grad_norm": 0.39909249544143677, "learning_rate": 9.119143982040082e-06, "loss": 0.4149, "step": 1648 }, { "epoch": 0.8186331292404435, "grad_norm": 0.3386387228965759, "learning_rate": 9.117505648457549e-06, "loss": 0.4402, "step": 1649 }, { "epoch": 0.8191295714049314, "grad_norm": 0.4078071415424347, "learning_rate": 9.115865940151788e-06, "loss": 0.4241, "step": 1650 }, { "epoch": 0.8196260135694191, "grad_norm": 0.3851243555545807, "learning_rate": 9.114224857670255e-06, "loss": 0.4036, "step": 1651 }, { "epoch": 0.820122455733907, "grad_norm": 0.3808426260948181, "learning_rate": 9.112582401560858e-06, "loss": 0.4162, "step": 1652 }, { "epoch": 0.8206188978983948, "grad_norm": 0.37273016571998596, "learning_rate": 9.110938572371972e-06, "loss": 0.4275, "step": 1653 }, { "epoch": 0.8211153400628827, "grad_norm": 0.3905301094055176, "learning_rate": 9.109293370652426e-06, "loss": 0.4237, "step": 1654 }, { "epoch": 0.8216117822273705, "grad_norm": 0.3988618850708008, "learning_rate": 9.107646796951507e-06, "loss": 0.4277, "step": 1655 }, { "epoch": 0.8221082243918584, "grad_norm": 0.35443422198295593, "learning_rate": 9.105998851818963e-06, "loss": 0.4283, "step": 1656 }, { "epoch": 0.8226046665563462, "grad_norm": 0.36172881722450256, "learning_rate": 9.104349535804996e-06, "loss": 0.4224, "step": 1657 }, { "epoch": 0.823101108720834, "grad_norm": 0.36192864179611206, "learning_rate": 9.102698849460269e-06, "loss": 0.4381, "step": 1658 }, { "epoch": 0.8235975508853218, "grad_norm": 0.39767709374427795, "learning_rate": 9.101046793335904e-06, "loss": 0.4472, "step": 1659 }, { "epoch": 0.8240939930498097, "grad_norm": 0.40034860372543335, "learning_rate": 9.099393367983473e-06, "loss": 0.4239, "step": 1660 }, { "epoch": 0.8245904352142975, "grad_norm": 0.3614179491996765, "learning_rate": 9.09773857395501e-06, "loss": 0.399, "step": 1661 }, { "epoch": 0.8250868773787854, "grad_norm": 0.39943981170654297, "learning_rate": 9.09608241180301e-06, "loss": 0.4358, "step": 1662 }, { "epoch": 0.8255833195432732, "grad_norm": 0.3811211585998535, "learning_rate": 9.094424882080419e-06, "loss": 0.4136, "step": 1663 }, { "epoch": 0.8260797617077611, "grad_norm": 0.4017103612422943, "learning_rate": 9.092765985340639e-06, "loss": 0.436, "step": 1664 }, { "epoch": 0.8265762038722488, "grad_norm": 0.3919966518878937, "learning_rate": 9.09110572213753e-06, "loss": 0.4283, "step": 1665 }, { "epoch": 0.8270726460367367, "grad_norm": 0.3973107635974884, "learning_rate": 9.089444093025412e-06, "loss": 0.409, "step": 1666 }, { "epoch": 0.8275690882012245, "grad_norm": 0.40432557463645935, "learning_rate": 9.087781098559056e-06, "loss": 0.4309, "step": 1667 }, { "epoch": 0.8280655303657124, "grad_norm": 0.4126158356666565, "learning_rate": 9.086116739293692e-06, "loss": 0.4252, "step": 1668 }, { "epoch": 0.8285619725302003, "grad_norm": 0.422652930021286, "learning_rate": 9.084451015785001e-06, "loss": 0.4238, "step": 1669 }, { "epoch": 0.8290584146946881, "grad_norm": 0.40817832946777344, "learning_rate": 9.082783928589127e-06, "loss": 0.3989, "step": 1670 }, { "epoch": 0.829554856859176, "grad_norm": 0.398415744304657, "learning_rate": 9.081115478262664e-06, "loss": 0.4243, "step": 1671 }, { "epoch": 0.8300512990236637, "grad_norm": 0.44816961884498596, "learning_rate": 9.079445665362659e-06, "loss": 0.4488, "step": 1672 }, { "epoch": 0.8305477411881516, "grad_norm": 0.3872481882572174, "learning_rate": 9.077774490446619e-06, "loss": 0.4161, "step": 1673 }, { "epoch": 0.8310441833526394, "grad_norm": 0.40380915999412537, "learning_rate": 9.076101954072506e-06, "loss": 0.4226, "step": 1674 }, { "epoch": 0.8315406255171273, "grad_norm": 0.4187042713165283, "learning_rate": 9.074428056798733e-06, "loss": 0.423, "step": 1675 }, { "epoch": 0.8320370676816151, "grad_norm": 0.3509662449359894, "learning_rate": 9.072752799184167e-06, "loss": 0.4219, "step": 1676 }, { "epoch": 0.832533509846103, "grad_norm": 0.39821338653564453, "learning_rate": 9.071076181788134e-06, "loss": 0.4264, "step": 1677 }, { "epoch": 0.8330299520105907, "grad_norm": 0.41705095767974854, "learning_rate": 9.06939820517041e-06, "loss": 0.3888, "step": 1678 }, { "epoch": 0.8335263941750786, "grad_norm": 0.37961140275001526, "learning_rate": 9.067718869891226e-06, "loss": 0.4249, "step": 1679 }, { "epoch": 0.8340228363395664, "grad_norm": 0.42516839504241943, "learning_rate": 9.066038176511265e-06, "loss": 0.4398, "step": 1680 }, { "epoch": 0.8345192785040543, "grad_norm": 0.36257031559944153, "learning_rate": 9.064356125591664e-06, "loss": 0.4099, "step": 1681 }, { "epoch": 0.8350157206685421, "grad_norm": 0.3798970580101013, "learning_rate": 9.062672717694019e-06, "loss": 0.44, "step": 1682 }, { "epoch": 0.83551216283303, "grad_norm": 0.40093180537223816, "learning_rate": 9.06098795338037e-06, "loss": 0.4156, "step": 1683 }, { "epoch": 0.8360086049975178, "grad_norm": 0.3718702793121338, "learning_rate": 9.059301833213213e-06, "loss": 0.4163, "step": 1684 }, { "epoch": 0.8365050471620056, "grad_norm": 0.387927770614624, "learning_rate": 9.0576143577555e-06, "loss": 0.4131, "step": 1685 }, { "epoch": 0.8370014893264934, "grad_norm": 0.372903436422348, "learning_rate": 9.055925527570633e-06, "loss": 0.4143, "step": 1686 }, { "epoch": 0.8374979314909813, "grad_norm": 0.4216030538082123, "learning_rate": 9.054235343222466e-06, "loss": 0.4299, "step": 1687 }, { "epoch": 0.8379943736554691, "grad_norm": 0.44799843430519104, "learning_rate": 9.052543805275307e-06, "loss": 0.4081, "step": 1688 }, { "epoch": 0.838490815819957, "grad_norm": 0.37658727169036865, "learning_rate": 9.050850914293914e-06, "loss": 0.4087, "step": 1689 }, { "epoch": 0.8389872579844448, "grad_norm": 0.42633056640625, "learning_rate": 9.049156670843495e-06, "loss": 0.4384, "step": 1690 }, { "epoch": 0.8394837001489327, "grad_norm": 0.39838269352912903, "learning_rate": 9.047461075489714e-06, "loss": 0.4229, "step": 1691 }, { "epoch": 0.8399801423134204, "grad_norm": 0.42642942070961, "learning_rate": 9.045764128798684e-06, "loss": 0.4137, "step": 1692 }, { "epoch": 0.8404765844779083, "grad_norm": 0.46397629380226135, "learning_rate": 9.04406583133697e-06, "loss": 0.4432, "step": 1693 }, { "epoch": 0.8409730266423961, "grad_norm": 0.42499232292175293, "learning_rate": 9.042366183671585e-06, "loss": 0.4012, "step": 1694 }, { "epoch": 0.841469468806884, "grad_norm": 0.4167265295982361, "learning_rate": 9.040665186369999e-06, "loss": 0.4106, "step": 1695 }, { "epoch": 0.8419659109713719, "grad_norm": 0.5245203971862793, "learning_rate": 9.038962840000125e-06, "loss": 0.4222, "step": 1696 }, { "epoch": 0.8424623531358597, "grad_norm": 0.4506007432937622, "learning_rate": 9.03725914513033e-06, "loss": 0.4284, "step": 1697 }, { "epoch": 0.8429587953003476, "grad_norm": 0.41615286469459534, "learning_rate": 9.035554102329435e-06, "loss": 0.4398, "step": 1698 }, { "epoch": 0.8434552374648353, "grad_norm": 0.36976879835128784, "learning_rate": 9.033847712166706e-06, "loss": 0.4144, "step": 1699 }, { "epoch": 0.8439516796293232, "grad_norm": 0.4682364761829376, "learning_rate": 9.03213997521186e-06, "loss": 0.4232, "step": 1700 }, { "epoch": 0.844448121793811, "grad_norm": 0.5257864594459534, "learning_rate": 9.030430892035062e-06, "loss": 0.4205, "step": 1701 }, { "epoch": 0.8449445639582989, "grad_norm": 0.3938426077365875, "learning_rate": 9.02872046320693e-06, "loss": 0.4179, "step": 1702 }, { "epoch": 0.8454410061227867, "grad_norm": 0.4919627010822296, "learning_rate": 9.027008689298531e-06, "loss": 0.4363, "step": 1703 }, { "epoch": 0.8459374482872746, "grad_norm": 0.4676920771598816, "learning_rate": 9.025295570881378e-06, "loss": 0.411, "step": 1704 }, { "epoch": 0.8464338904517623, "grad_norm": 0.40490132570266724, "learning_rate": 9.023581108527437e-06, "loss": 0.426, "step": 1705 }, { "epoch": 0.8469303326162502, "grad_norm": 0.4066742956638336, "learning_rate": 9.021865302809117e-06, "loss": 0.3855, "step": 1706 }, { "epoch": 0.847426774780738, "grad_norm": 0.42129650712013245, "learning_rate": 9.020148154299282e-06, "loss": 0.4252, "step": 1707 }, { "epoch": 0.8479232169452259, "grad_norm": 0.4382795989513397, "learning_rate": 9.01842966357124e-06, "loss": 0.4288, "step": 1708 }, { "epoch": 0.8484196591097137, "grad_norm": 0.4368549883365631, "learning_rate": 9.016709831198746e-06, "loss": 0.3926, "step": 1709 }, { "epoch": 0.8489161012742016, "grad_norm": 0.3964707851409912, "learning_rate": 9.01498865775601e-06, "loss": 0.4126, "step": 1710 }, { "epoch": 0.8494125434386894, "grad_norm": 0.45270782709121704, "learning_rate": 9.013266143817681e-06, "loss": 0.4326, "step": 1711 }, { "epoch": 0.8499089856031772, "grad_norm": 0.4165816903114319, "learning_rate": 9.011542289958861e-06, "loss": 0.4071, "step": 1712 }, { "epoch": 0.850405427767665, "grad_norm": 0.5350325107574463, "learning_rate": 9.009817096755098e-06, "loss": 0.4186, "step": 1713 }, { "epoch": 0.8509018699321529, "grad_norm": 0.41999441385269165, "learning_rate": 9.008090564782388e-06, "loss": 0.4204, "step": 1714 }, { "epoch": 0.8513983120966407, "grad_norm": 0.4641819894313812, "learning_rate": 9.006362694617173e-06, "loss": 0.4166, "step": 1715 }, { "epoch": 0.8518947542611286, "grad_norm": 0.4579728841781616, "learning_rate": 9.004633486836339e-06, "loss": 0.404, "step": 1716 }, { "epoch": 0.8523911964256164, "grad_norm": 0.3897906243801117, "learning_rate": 9.002902942017225e-06, "loss": 0.4007, "step": 1717 }, { "epoch": 0.8528876385901043, "grad_norm": 0.48129794001579285, "learning_rate": 9.00117106073761e-06, "loss": 0.4434, "step": 1718 }, { "epoch": 0.853384080754592, "grad_norm": 0.4814259111881256, "learning_rate": 8.999437843575727e-06, "loss": 0.4572, "step": 1719 }, { "epoch": 0.8538805229190799, "grad_norm": 0.42361974716186523, "learning_rate": 8.997703291110243e-06, "loss": 0.4396, "step": 1720 }, { "epoch": 0.8543769650835678, "grad_norm": 0.39638620615005493, "learning_rate": 8.995967403920283e-06, "loss": 0.379, "step": 1721 }, { "epoch": 0.8548734072480556, "grad_norm": 0.4012767970561981, "learning_rate": 8.994230182585412e-06, "loss": 0.4366, "step": 1722 }, { "epoch": 0.8553698494125435, "grad_norm": 0.4169899523258209, "learning_rate": 8.99249162768564e-06, "loss": 0.3937, "step": 1723 }, { "epoch": 0.8558662915770313, "grad_norm": 0.4072791635990143, "learning_rate": 8.990751739801424e-06, "loss": 0.4168, "step": 1724 }, { "epoch": 0.8563627337415192, "grad_norm": 0.4006689786911011, "learning_rate": 8.989010519513664e-06, "loss": 0.4391, "step": 1725 }, { "epoch": 0.8568591759060069, "grad_norm": 0.4648005962371826, "learning_rate": 8.987267967403706e-06, "loss": 0.4189, "step": 1726 }, { "epoch": 0.8573556180704948, "grad_norm": 0.3818444311618805, "learning_rate": 8.985524084053342e-06, "loss": 0.449, "step": 1727 }, { "epoch": 0.8578520602349826, "grad_norm": 0.42742857336997986, "learning_rate": 8.983778870044806e-06, "loss": 0.4135, "step": 1728 }, { "epoch": 0.8583485023994705, "grad_norm": 0.4213395416736603, "learning_rate": 8.982032325960781e-06, "loss": 0.4301, "step": 1729 }, { "epoch": 0.8588449445639583, "grad_norm": 0.3809884786605835, "learning_rate": 8.980284452384387e-06, "loss": 0.4135, "step": 1730 }, { "epoch": 0.8593413867284462, "grad_norm": 0.49091988801956177, "learning_rate": 8.978535249899191e-06, "loss": 0.4389, "step": 1731 }, { "epoch": 0.8598378288929339, "grad_norm": 0.3672778308391571, "learning_rate": 8.976784719089206e-06, "loss": 0.4404, "step": 1732 }, { "epoch": 0.8603342710574218, "grad_norm": 0.43338143825531006, "learning_rate": 8.975032860538888e-06, "loss": 0.4461, "step": 1733 }, { "epoch": 0.8608307132219096, "grad_norm": 0.403358519077301, "learning_rate": 8.973279674833133e-06, "loss": 0.4198, "step": 1734 }, { "epoch": 0.8613271553863975, "grad_norm": 0.3788500130176544, "learning_rate": 8.971525162557282e-06, "loss": 0.4458, "step": 1735 }, { "epoch": 0.8618235975508853, "grad_norm": 0.4151068925857544, "learning_rate": 8.969769324297118e-06, "loss": 0.4269, "step": 1736 }, { "epoch": 0.8623200397153732, "grad_norm": 0.37192845344543457, "learning_rate": 8.96801216063887e-06, "loss": 0.437, "step": 1737 }, { "epoch": 0.862816481879861, "grad_norm": 0.41725102066993713, "learning_rate": 8.966253672169206e-06, "loss": 0.4379, "step": 1738 }, { "epoch": 0.8633129240443488, "grad_norm": 0.37976205348968506, "learning_rate": 8.964493859475239e-06, "loss": 0.4144, "step": 1739 }, { "epoch": 0.8638093662088366, "grad_norm": 0.3996419608592987, "learning_rate": 8.962732723144518e-06, "loss": 0.4261, "step": 1740 }, { "epoch": 0.8643058083733245, "grad_norm": 0.40902063250541687, "learning_rate": 8.960970263765044e-06, "loss": 0.4571, "step": 1741 }, { "epoch": 0.8648022505378123, "grad_norm": 0.35137784481048584, "learning_rate": 8.959206481925252e-06, "loss": 0.3848, "step": 1742 }, { "epoch": 0.8652986927023002, "grad_norm": 0.4367865025997162, "learning_rate": 8.957441378214021e-06, "loss": 0.4275, "step": 1743 }, { "epoch": 0.865795134866788, "grad_norm": 0.3888480067253113, "learning_rate": 8.95567495322067e-06, "loss": 0.403, "step": 1744 }, { "epoch": 0.8662915770312759, "grad_norm": 0.4135153889656067, "learning_rate": 8.953907207534964e-06, "loss": 0.4422, "step": 1745 }, { "epoch": 0.8667880191957636, "grad_norm": 0.39550983905792236, "learning_rate": 8.9521381417471e-06, "loss": 0.4098, "step": 1746 }, { "epoch": 0.8672844613602515, "grad_norm": 0.3746364414691925, "learning_rate": 8.950367756447727e-06, "loss": 0.4072, "step": 1747 }, { "epoch": 0.8677809035247394, "grad_norm": 0.41821664571762085, "learning_rate": 8.948596052227921e-06, "loss": 0.4204, "step": 1748 }, { "epoch": 0.8682773456892272, "grad_norm": 0.3819909393787384, "learning_rate": 8.946823029679213e-06, "loss": 0.4323, "step": 1749 }, { "epoch": 0.8687737878537151, "grad_norm": 0.4145582616329193, "learning_rate": 8.945048689393563e-06, "loss": 0.4446, "step": 1750 }, { "epoch": 0.8692702300182029, "grad_norm": 0.4490146338939667, "learning_rate": 8.943273031963375e-06, "loss": 0.4724, "step": 1751 }, { "epoch": 0.8697666721826908, "grad_norm": 0.39614808559417725, "learning_rate": 8.941496057981495e-06, "loss": 0.4441, "step": 1752 }, { "epoch": 0.8702631143471785, "grad_norm": 0.40738382935523987, "learning_rate": 8.939717768041206e-06, "loss": 0.3804, "step": 1753 }, { "epoch": 0.8707595565116664, "grad_norm": 0.3674822449684143, "learning_rate": 8.937938162736229e-06, "loss": 0.4186, "step": 1754 }, { "epoch": 0.8712559986761542, "grad_norm": 0.39797094464302063, "learning_rate": 8.936157242660726e-06, "loss": 0.4346, "step": 1755 }, { "epoch": 0.8717524408406421, "grad_norm": 0.333802193403244, "learning_rate": 8.9343750084093e-06, "loss": 0.4278, "step": 1756 }, { "epoch": 0.8722488830051299, "grad_norm": 0.36950117349624634, "learning_rate": 8.932591460576988e-06, "loss": 0.4507, "step": 1757 }, { "epoch": 0.8727453251696178, "grad_norm": 0.3365027904510498, "learning_rate": 8.93080659975927e-06, "loss": 0.4016, "step": 1758 }, { "epoch": 0.8732417673341055, "grad_norm": 0.3683069050312042, "learning_rate": 8.92902042655206e-06, "loss": 0.4123, "step": 1759 }, { "epoch": 0.8737382094985934, "grad_norm": 0.4018745422363281, "learning_rate": 8.927232941551716e-06, "loss": 0.412, "step": 1760 }, { "epoch": 0.8742346516630812, "grad_norm": 0.38720032572746277, "learning_rate": 8.92544414535503e-06, "loss": 0.4336, "step": 1761 }, { "epoch": 0.8747310938275691, "grad_norm": 0.39226070046424866, "learning_rate": 8.92365403855923e-06, "loss": 0.4008, "step": 1762 }, { "epoch": 0.8752275359920569, "grad_norm": 0.377726674079895, "learning_rate": 8.921862621761985e-06, "loss": 0.4173, "step": 1763 }, { "epoch": 0.8757239781565448, "grad_norm": 0.3995196223258972, "learning_rate": 8.920069895561403e-06, "loss": 0.4287, "step": 1764 }, { "epoch": 0.8762204203210326, "grad_norm": 0.3688774108886719, "learning_rate": 8.918275860556022e-06, "loss": 0.4287, "step": 1765 }, { "epoch": 0.8767168624855204, "grad_norm": 0.41713494062423706, "learning_rate": 8.916480517344826e-06, "loss": 0.4065, "step": 1766 }, { "epoch": 0.8772133046500082, "grad_norm": 0.430107444524765, "learning_rate": 8.914683866527227e-06, "loss": 0.4173, "step": 1767 }, { "epoch": 0.8777097468144961, "grad_norm": 0.4095325767993927, "learning_rate": 8.912885908703083e-06, "loss": 0.4403, "step": 1768 }, { "epoch": 0.878206188978984, "grad_norm": 0.38425639271736145, "learning_rate": 8.911086644472679e-06, "loss": 0.4263, "step": 1769 }, { "epoch": 0.8787026311434718, "grad_norm": 0.4193624258041382, "learning_rate": 8.909286074436742e-06, "loss": 0.4365, "step": 1770 }, { "epoch": 0.8791990733079597, "grad_norm": 0.4316038191318512, "learning_rate": 8.907484199196432e-06, "loss": 0.4263, "step": 1771 }, { "epoch": 0.8796955154724475, "grad_norm": 0.4014749526977539, "learning_rate": 8.905681019353349e-06, "loss": 0.3866, "step": 1772 }, { "epoch": 0.8801919576369353, "grad_norm": 0.3465031087398529, "learning_rate": 8.903876535509524e-06, "loss": 0.4063, "step": 1773 }, { "epoch": 0.8806883998014231, "grad_norm": 0.37165379524230957, "learning_rate": 8.902070748267425e-06, "loss": 0.4166, "step": 1774 }, { "epoch": 0.881184841965911, "grad_norm": 0.3981352746486664, "learning_rate": 8.900263658229954e-06, "loss": 0.3959, "step": 1775 }, { "epoch": 0.8816812841303988, "grad_norm": 0.4078894555568695, "learning_rate": 8.898455266000455e-06, "loss": 0.4156, "step": 1776 }, { "epoch": 0.8821777262948867, "grad_norm": 0.41355255246162415, "learning_rate": 8.896645572182694e-06, "loss": 0.4447, "step": 1777 }, { "epoch": 0.8826741684593745, "grad_norm": 0.4172033369541168, "learning_rate": 8.894834577380882e-06, "loss": 0.4325, "step": 1778 }, { "epoch": 0.8831706106238624, "grad_norm": 0.4050782024860382, "learning_rate": 8.89302228219966e-06, "loss": 0.4162, "step": 1779 }, { "epoch": 0.8836670527883501, "grad_norm": 0.4115501046180725, "learning_rate": 8.891208687244104e-06, "loss": 0.4332, "step": 1780 }, { "epoch": 0.884163494952838, "grad_norm": 0.401984840631485, "learning_rate": 8.889393793119725e-06, "loss": 0.4159, "step": 1781 }, { "epoch": 0.8846599371173258, "grad_norm": 0.3848888874053955, "learning_rate": 8.887577600432466e-06, "loss": 0.4359, "step": 1782 }, { "epoch": 0.8851563792818137, "grad_norm": 0.40092942118644714, "learning_rate": 8.885760109788705e-06, "loss": 0.4218, "step": 1783 }, { "epoch": 0.8856528214463015, "grad_norm": 0.39097893238067627, "learning_rate": 8.883941321795254e-06, "loss": 0.4283, "step": 1784 }, { "epoch": 0.8861492636107894, "grad_norm": 0.39728599786758423, "learning_rate": 8.882121237059353e-06, "loss": 0.3892, "step": 1785 }, { "epoch": 0.8866457057752771, "grad_norm": 0.38196295499801636, "learning_rate": 8.880299856188681e-06, "loss": 0.4202, "step": 1786 }, { "epoch": 0.887142147939765, "grad_norm": 0.44436636567115784, "learning_rate": 8.878477179791349e-06, "loss": 0.4093, "step": 1787 }, { "epoch": 0.8876385901042528, "grad_norm": 0.44297075271606445, "learning_rate": 8.876653208475898e-06, "loss": 0.4307, "step": 1788 }, { "epoch": 0.8881350322687407, "grad_norm": 0.4518827199935913, "learning_rate": 8.874827942851302e-06, "loss": 0.4433, "step": 1789 }, { "epoch": 0.8886314744332285, "grad_norm": 0.48459285497665405, "learning_rate": 8.873001383526966e-06, "loss": 0.4384, "step": 1790 }, { "epoch": 0.8891279165977164, "grad_norm": 0.4078236222267151, "learning_rate": 8.871173531112733e-06, "loss": 0.3961, "step": 1791 }, { "epoch": 0.8896243587622042, "grad_norm": 0.4665946364402771, "learning_rate": 8.86934438621887e-06, "loss": 0.4308, "step": 1792 }, { "epoch": 0.890120800926692, "grad_norm": 0.42011144757270813, "learning_rate": 8.86751394945608e-06, "loss": 0.3973, "step": 1793 }, { "epoch": 0.8906172430911798, "grad_norm": 0.3973124921321869, "learning_rate": 8.865682221435495e-06, "loss": 0.4303, "step": 1794 }, { "epoch": 0.8911136852556677, "grad_norm": 0.4618796408176422, "learning_rate": 8.863849202768677e-06, "loss": 0.4294, "step": 1795 }, { "epoch": 0.8916101274201556, "grad_norm": 0.43169865012168884, "learning_rate": 8.862014894067627e-06, "loss": 0.426, "step": 1796 }, { "epoch": 0.8921065695846434, "grad_norm": 0.47018224000930786, "learning_rate": 8.860179295944766e-06, "loss": 0.4241, "step": 1797 }, { "epoch": 0.8926030117491313, "grad_norm": 0.3884674310684204, "learning_rate": 8.858342409012953e-06, "loss": 0.4246, "step": 1798 }, { "epoch": 0.8930994539136191, "grad_norm": 0.37989890575408936, "learning_rate": 8.856504233885473e-06, "loss": 0.3888, "step": 1799 }, { "epoch": 0.8935958960781069, "grad_norm": 0.42814064025878906, "learning_rate": 8.854664771176044e-06, "loss": 0.4353, "step": 1800 }, { "epoch": 0.8940923382425947, "grad_norm": 0.3959041237831116, "learning_rate": 8.852824021498811e-06, "loss": 0.4143, "step": 1801 }, { "epoch": 0.8945887804070826, "grad_norm": 0.4023890197277069, "learning_rate": 8.850981985468351e-06, "loss": 0.4448, "step": 1802 }, { "epoch": 0.8950852225715704, "grad_norm": 0.44920146465301514, "learning_rate": 8.849138663699671e-06, "loss": 0.4483, "step": 1803 }, { "epoch": 0.8955816647360583, "grad_norm": 0.34890586137771606, "learning_rate": 8.847294056808204e-06, "loss": 0.4312, "step": 1804 }, { "epoch": 0.8960781069005461, "grad_norm": 0.4163515269756317, "learning_rate": 8.845448165409815e-06, "loss": 0.4375, "step": 1805 }, { "epoch": 0.896574549065034, "grad_norm": 0.38630735874176025, "learning_rate": 8.8436009901208e-06, "loss": 0.4478, "step": 1806 }, { "epoch": 0.8970709912295217, "grad_norm": 0.37653666734695435, "learning_rate": 8.841752531557875e-06, "loss": 0.4167, "step": 1807 }, { "epoch": 0.8975674333940096, "grad_norm": 0.42014002799987793, "learning_rate": 8.839902790338193e-06, "loss": 0.4189, "step": 1808 }, { "epoch": 0.8980638755584974, "grad_norm": 0.40045204758644104, "learning_rate": 8.838051767079332e-06, "loss": 0.4365, "step": 1809 }, { "epoch": 0.8985603177229853, "grad_norm": 0.3810298442840576, "learning_rate": 8.836199462399298e-06, "loss": 0.442, "step": 1810 }, { "epoch": 0.8990567598874731, "grad_norm": 0.43862199783325195, "learning_rate": 8.834345876916526e-06, "loss": 0.4075, "step": 1811 }, { "epoch": 0.899553202051961, "grad_norm": 0.42315050959587097, "learning_rate": 8.832491011249878e-06, "loss": 0.4513, "step": 1812 }, { "epoch": 0.9000496442164487, "grad_norm": 0.3950519859790802, "learning_rate": 8.830634866018641e-06, "loss": 0.404, "step": 1813 }, { "epoch": 0.9005460863809366, "grad_norm": 0.3878178894519806, "learning_rate": 8.828777441842536e-06, "loss": 0.4299, "step": 1814 }, { "epoch": 0.9010425285454244, "grad_norm": 0.3676861524581909, "learning_rate": 8.826918739341701e-06, "loss": 0.4065, "step": 1815 }, { "epoch": 0.9015389707099123, "grad_norm": 0.37744519114494324, "learning_rate": 8.82505875913671e-06, "loss": 0.419, "step": 1816 }, { "epoch": 0.9020354128744001, "grad_norm": 0.34625378251075745, "learning_rate": 8.82319750184856e-06, "loss": 0.4293, "step": 1817 }, { "epoch": 0.902531855038888, "grad_norm": 0.3905632495880127, "learning_rate": 8.821334968098671e-06, "loss": 0.4244, "step": 1818 }, { "epoch": 0.9030282972033759, "grad_norm": 0.4061894118785858, "learning_rate": 8.819471158508894e-06, "loss": 0.4318, "step": 1819 }, { "epoch": 0.9035247393678636, "grad_norm": 0.36935582756996155, "learning_rate": 8.817606073701505e-06, "loss": 0.4569, "step": 1820 }, { "epoch": 0.9040211815323514, "grad_norm": 0.39965692162513733, "learning_rate": 8.815739714299206e-06, "loss": 0.4346, "step": 1821 }, { "epoch": 0.9045176236968393, "grad_norm": 0.3633206784725189, "learning_rate": 8.813872080925122e-06, "loss": 0.4205, "step": 1822 }, { "epoch": 0.9050140658613272, "grad_norm": 0.3410821259021759, "learning_rate": 8.812003174202803e-06, "loss": 0.4193, "step": 1823 }, { "epoch": 0.905510508025815, "grad_norm": 0.3862614035606384, "learning_rate": 8.810132994756232e-06, "loss": 0.4101, "step": 1824 }, { "epoch": 0.9060069501903029, "grad_norm": 0.3666878938674927, "learning_rate": 8.808261543209807e-06, "loss": 0.4114, "step": 1825 }, { "epoch": 0.9065033923547907, "grad_norm": 0.4127109944820404, "learning_rate": 8.806388820188354e-06, "loss": 0.419, "step": 1826 }, { "epoch": 0.9069998345192785, "grad_norm": 0.3812290132045746, "learning_rate": 8.804514826317125e-06, "loss": 0.4541, "step": 1827 }, { "epoch": 0.9074962766837663, "grad_norm": 0.36078575253486633, "learning_rate": 8.8026395622218e-06, "loss": 0.4322, "step": 1828 }, { "epoch": 0.9079927188482542, "grad_norm": 0.39358967542648315, "learning_rate": 8.800763028528472e-06, "loss": 0.4132, "step": 1829 }, { "epoch": 0.908489161012742, "grad_norm": 0.368725448846817, "learning_rate": 8.79888522586367e-06, "loss": 0.4081, "step": 1830 }, { "epoch": 0.9089856031772299, "grad_norm": 0.3812626600265503, "learning_rate": 8.797006154854338e-06, "loss": 0.4378, "step": 1831 }, { "epoch": 0.9094820453417177, "grad_norm": 0.4321485757827759, "learning_rate": 8.795125816127849e-06, "loss": 0.4248, "step": 1832 }, { "epoch": 0.9099784875062056, "grad_norm": 0.368996262550354, "learning_rate": 8.793244210311995e-06, "loss": 0.4027, "step": 1833 }, { "epoch": 0.9104749296706933, "grad_norm": 0.4171300232410431, "learning_rate": 8.791361338034993e-06, "loss": 0.4225, "step": 1834 }, { "epoch": 0.9109713718351812, "grad_norm": 0.35315248370170593, "learning_rate": 8.789477199925485e-06, "loss": 0.4493, "step": 1835 }, { "epoch": 0.911467813999669, "grad_norm": 0.38603708148002625, "learning_rate": 8.787591796612531e-06, "loss": 0.3986, "step": 1836 }, { "epoch": 0.9119642561641569, "grad_norm": 0.3596018850803375, "learning_rate": 8.785705128725618e-06, "loss": 0.4119, "step": 1837 }, { "epoch": 0.9124606983286447, "grad_norm": 0.38489142060279846, "learning_rate": 8.783817196894652e-06, "loss": 0.4177, "step": 1838 }, { "epoch": 0.9129571404931326, "grad_norm": 0.39042040705680847, "learning_rate": 8.781928001749961e-06, "loss": 0.4113, "step": 1839 }, { "epoch": 0.9134535826576203, "grad_norm": 0.36592456698417664, "learning_rate": 8.780037543922299e-06, "loss": 0.4262, "step": 1840 }, { "epoch": 0.9139500248221082, "grad_norm": 0.36912038922309875, "learning_rate": 8.778145824042838e-06, "loss": 0.4015, "step": 1841 }, { "epoch": 0.914446466986596, "grad_norm": 0.3923741579055786, "learning_rate": 8.776252842743169e-06, "loss": 0.4068, "step": 1842 }, { "epoch": 0.9149429091510839, "grad_norm": 0.41194450855255127, "learning_rate": 8.774358600655309e-06, "loss": 0.4168, "step": 1843 }, { "epoch": 0.9154393513155717, "grad_norm": 0.4201279282569885, "learning_rate": 8.772463098411694e-06, "loss": 0.4215, "step": 1844 }, { "epoch": 0.9159357934800596, "grad_norm": 0.38101714849472046, "learning_rate": 8.77056633664518e-06, "loss": 0.4465, "step": 1845 }, { "epoch": 0.9164322356445475, "grad_norm": 0.3559664785861969, "learning_rate": 8.768668315989045e-06, "loss": 0.3807, "step": 1846 }, { "epoch": 0.9169286778090352, "grad_norm": 0.3877433240413666, "learning_rate": 8.766769037076986e-06, "loss": 0.4161, "step": 1847 }, { "epoch": 0.917425119973523, "grad_norm": 0.3519996702671051, "learning_rate": 8.76486850054312e-06, "loss": 0.429, "step": 1848 }, { "epoch": 0.9179215621380109, "grad_norm": 0.3528066575527191, "learning_rate": 8.762966707021985e-06, "loss": 0.4119, "step": 1849 }, { "epoch": 0.9184180043024988, "grad_norm": 0.4044789969921112, "learning_rate": 8.761063657148537e-06, "loss": 0.4427, "step": 1850 }, { "epoch": 0.9189144464669866, "grad_norm": 0.39951545000076294, "learning_rate": 8.759159351558155e-06, "loss": 0.4118, "step": 1851 }, { "epoch": 0.9194108886314745, "grad_norm": 0.4199252426624298, "learning_rate": 8.757253790886635e-06, "loss": 0.4019, "step": 1852 }, { "epoch": 0.9199073307959623, "grad_norm": 0.37701231241226196, "learning_rate": 8.75534697577019e-06, "loss": 0.397, "step": 1853 }, { "epoch": 0.9204037729604501, "grad_norm": 0.39681166410446167, "learning_rate": 8.753438906845454e-06, "loss": 0.4147, "step": 1854 }, { "epoch": 0.9209002151249379, "grad_norm": 0.38123324513435364, "learning_rate": 8.751529584749482e-06, "loss": 0.4333, "step": 1855 }, { "epoch": 0.9213966572894258, "grad_norm": 0.3836019039154053, "learning_rate": 8.749619010119738e-06, "loss": 0.4197, "step": 1856 }, { "epoch": 0.9218930994539136, "grad_norm": 0.3565029799938202, "learning_rate": 8.74770718359412e-06, "loss": 0.4112, "step": 1857 }, { "epoch": 0.9223895416184015, "grad_norm": 0.3606451749801636, "learning_rate": 8.745794105810928e-06, "loss": 0.4089, "step": 1858 }, { "epoch": 0.9228859837828893, "grad_norm": 0.3590800166130066, "learning_rate": 8.74387977740889e-06, "loss": 0.4021, "step": 1859 }, { "epoch": 0.9233824259473772, "grad_norm": 0.36268478631973267, "learning_rate": 8.741964199027147e-06, "loss": 0.4195, "step": 1860 }, { "epoch": 0.9238788681118649, "grad_norm": 0.32692933082580566, "learning_rate": 8.740047371305259e-06, "loss": 0.4135, "step": 1861 }, { "epoch": 0.9243753102763528, "grad_norm": 0.36968088150024414, "learning_rate": 8.738129294883202e-06, "loss": 0.4166, "step": 1862 }, { "epoch": 0.9248717524408406, "grad_norm": 0.40573427081108093, "learning_rate": 8.73620997040137e-06, "loss": 0.4223, "step": 1863 }, { "epoch": 0.9253681946053285, "grad_norm": 0.36606812477111816, "learning_rate": 8.734289398500576e-06, "loss": 0.4125, "step": 1864 }, { "epoch": 0.9258646367698163, "grad_norm": 0.38212183117866516, "learning_rate": 8.732367579822043e-06, "loss": 0.418, "step": 1865 }, { "epoch": 0.9263610789343042, "grad_norm": 0.3642016351222992, "learning_rate": 8.730444515007413e-06, "loss": 0.4225, "step": 1866 }, { "epoch": 0.9268575210987919, "grad_norm": 0.4174303412437439, "learning_rate": 8.72852020469875e-06, "loss": 0.4224, "step": 1867 }, { "epoch": 0.9273539632632798, "grad_norm": 0.38388484716415405, "learning_rate": 8.726594649538524e-06, "loss": 0.4099, "step": 1868 }, { "epoch": 0.9278504054277676, "grad_norm": 0.3694220781326294, "learning_rate": 8.72466785016963e-06, "loss": 0.3908, "step": 1869 }, { "epoch": 0.9283468475922555, "grad_norm": 0.3775377571582794, "learning_rate": 8.72273980723537e-06, "loss": 0.4093, "step": 1870 }, { "epoch": 0.9288432897567434, "grad_norm": 0.39530089497566223, "learning_rate": 8.720810521379467e-06, "loss": 0.3991, "step": 1871 }, { "epoch": 0.9293397319212312, "grad_norm": 0.3527381718158722, "learning_rate": 8.718879993246058e-06, "loss": 0.3889, "step": 1872 }, { "epoch": 0.9298361740857191, "grad_norm": 0.36854881048202515, "learning_rate": 8.716948223479693e-06, "loss": 0.4055, "step": 1873 }, { "epoch": 0.9303326162502068, "grad_norm": 0.3794046938419342, "learning_rate": 8.715015212725336e-06, "loss": 0.4096, "step": 1874 }, { "epoch": 0.9308290584146947, "grad_norm": 0.39099451899528503, "learning_rate": 8.713080961628368e-06, "loss": 0.4225, "step": 1875 }, { "epoch": 0.9313255005791825, "grad_norm": 0.4028542935848236, "learning_rate": 8.711145470834584e-06, "loss": 0.4317, "step": 1876 }, { "epoch": 0.9318219427436704, "grad_norm": 0.377544641494751, "learning_rate": 8.709208740990189e-06, "loss": 0.4209, "step": 1877 }, { "epoch": 0.9323183849081582, "grad_norm": 0.3997279703617096, "learning_rate": 8.707270772741807e-06, "loss": 0.4145, "step": 1878 }, { "epoch": 0.9328148270726461, "grad_norm": 0.5162120461463928, "learning_rate": 8.705331566736473e-06, "loss": 0.4303, "step": 1879 }, { "epoch": 0.9333112692371339, "grad_norm": 0.3853559195995331, "learning_rate": 8.703391123621632e-06, "loss": 0.4121, "step": 1880 }, { "epoch": 0.9338077114016217, "grad_norm": 0.35878023505210876, "learning_rate": 8.701449444045149e-06, "loss": 0.4268, "step": 1881 }, { "epoch": 0.9343041535661095, "grad_norm": 0.41182810068130493, "learning_rate": 8.699506528655297e-06, "loss": 0.4276, "step": 1882 }, { "epoch": 0.9348005957305974, "grad_norm": 0.4211418032646179, "learning_rate": 8.697562378100761e-06, "loss": 0.44, "step": 1883 }, { "epoch": 0.9352970378950852, "grad_norm": 0.38644978404045105, "learning_rate": 8.695616993030642e-06, "loss": 0.4103, "step": 1884 }, { "epoch": 0.9357934800595731, "grad_norm": 0.4168637692928314, "learning_rate": 8.69367037409445e-06, "loss": 0.4224, "step": 1885 }, { "epoch": 0.9362899222240609, "grad_norm": 0.3828428089618683, "learning_rate": 8.691722521942107e-06, "loss": 0.4051, "step": 1886 }, { "epoch": 0.9367863643885488, "grad_norm": 0.4248144030570984, "learning_rate": 8.68977343722395e-06, "loss": 0.3968, "step": 1887 }, { "epoch": 0.9372828065530365, "grad_norm": 0.43700316548347473, "learning_rate": 8.687823120590727e-06, "loss": 0.4418, "step": 1888 }, { "epoch": 0.9377792487175244, "grad_norm": 0.4410456418991089, "learning_rate": 8.685871572693592e-06, "loss": 0.4176, "step": 1889 }, { "epoch": 0.9382756908820122, "grad_norm": 0.4237629473209381, "learning_rate": 8.683918794184115e-06, "loss": 0.4194, "step": 1890 }, { "epoch": 0.9387721330465001, "grad_norm": 0.3875899910926819, "learning_rate": 8.681964785714275e-06, "loss": 0.432, "step": 1891 }, { "epoch": 0.9392685752109879, "grad_norm": 0.398041695356369, "learning_rate": 8.680009547936465e-06, "loss": 0.401, "step": 1892 }, { "epoch": 0.9397650173754758, "grad_norm": 0.4478461444377899, "learning_rate": 8.678053081503484e-06, "loss": 0.4451, "step": 1893 }, { "epoch": 0.9402614595399635, "grad_norm": 0.3451896607875824, "learning_rate": 8.676095387068542e-06, "loss": 0.417, "step": 1894 }, { "epoch": 0.9407579017044514, "grad_norm": 0.35995563864707947, "learning_rate": 8.674136465285261e-06, "loss": 0.4074, "step": 1895 }, { "epoch": 0.9412543438689392, "grad_norm": 0.3871385455131531, "learning_rate": 8.672176316807672e-06, "loss": 0.431, "step": 1896 }, { "epoch": 0.9417507860334271, "grad_norm": 0.38195639848709106, "learning_rate": 8.670214942290215e-06, "loss": 0.4136, "step": 1897 }, { "epoch": 0.942247228197915, "grad_norm": 0.4146149158477783, "learning_rate": 8.66825234238774e-06, "loss": 0.4342, "step": 1898 }, { "epoch": 0.9427436703624028, "grad_norm": 0.3849220871925354, "learning_rate": 8.666288517755505e-06, "loss": 0.4066, "step": 1899 }, { "epoch": 0.9432401125268907, "grad_norm": 0.45213478803634644, "learning_rate": 8.66432346904918e-06, "loss": 0.4152, "step": 1900 }, { "epoch": 0.9437365546913784, "grad_norm": 0.4269937574863434, "learning_rate": 8.662357196924838e-06, "loss": 0.4604, "step": 1901 }, { "epoch": 0.9442329968558663, "grad_norm": 0.40015286207199097, "learning_rate": 8.660389702038965e-06, "loss": 0.3931, "step": 1902 }, { "epoch": 0.9447294390203541, "grad_norm": 0.41188859939575195, "learning_rate": 8.658420985048455e-06, "loss": 0.4177, "step": 1903 }, { "epoch": 0.945225881184842, "grad_norm": 0.3982900083065033, "learning_rate": 8.656451046610607e-06, "loss": 0.4141, "step": 1904 }, { "epoch": 0.9457223233493298, "grad_norm": 0.3552129566669464, "learning_rate": 8.654479887383134e-06, "loss": 0.4318, "step": 1905 }, { "epoch": 0.9462187655138177, "grad_norm": 0.44390416145324707, "learning_rate": 8.652507508024148e-06, "loss": 0.4176, "step": 1906 }, { "epoch": 0.9467152076783055, "grad_norm": 0.3776560425758362, "learning_rate": 8.650533909192174e-06, "loss": 0.4292, "step": 1907 }, { "epoch": 0.9472116498427933, "grad_norm": 0.4000738263130188, "learning_rate": 8.648559091546145e-06, "loss": 0.4356, "step": 1908 }, { "epoch": 0.9477080920072811, "grad_norm": 0.39985474944114685, "learning_rate": 8.646583055745398e-06, "loss": 0.4229, "step": 1909 }, { "epoch": 0.948204534171769, "grad_norm": 0.3709854483604431, "learning_rate": 8.644605802449677e-06, "loss": 0.4174, "step": 1910 }, { "epoch": 0.9487009763362568, "grad_norm": 0.39343491196632385, "learning_rate": 8.642627332319133e-06, "loss": 0.432, "step": 1911 }, { "epoch": 0.9491974185007447, "grad_norm": 0.3869503438472748, "learning_rate": 8.640647646014324e-06, "loss": 0.4237, "step": 1912 }, { "epoch": 0.9496938606652325, "grad_norm": 0.40200793743133545, "learning_rate": 8.638666744196213e-06, "loss": 0.4399, "step": 1913 }, { "epoch": 0.9501903028297204, "grad_norm": 0.4245280921459198, "learning_rate": 8.636684627526171e-06, "loss": 0.4282, "step": 1914 }, { "epoch": 0.9506867449942081, "grad_norm": 0.3257196247577667, "learning_rate": 8.63470129666597e-06, "loss": 0.4277, "step": 1915 }, { "epoch": 0.951183187158696, "grad_norm": 0.4272817075252533, "learning_rate": 8.632716752277792e-06, "loss": 0.3924, "step": 1916 }, { "epoch": 0.9516796293231838, "grad_norm": 0.3457334041595459, "learning_rate": 8.630730995024224e-06, "loss": 0.4026, "step": 1917 }, { "epoch": 0.9521760714876717, "grad_norm": 0.3898588716983795, "learning_rate": 8.628744025568252e-06, "loss": 0.4522, "step": 1918 }, { "epoch": 0.9526725136521595, "grad_norm": 0.36967167258262634, "learning_rate": 8.626755844573274e-06, "loss": 0.4345, "step": 1919 }, { "epoch": 0.9531689558166474, "grad_norm": 0.4193633794784546, "learning_rate": 8.62476645270309e-06, "loss": 0.4218, "step": 1920 }, { "epoch": 0.9536653979811351, "grad_norm": 0.37094900012016296, "learning_rate": 8.622775850621904e-06, "loss": 0.4341, "step": 1921 }, { "epoch": 0.954161840145623, "grad_norm": 0.3418593108654022, "learning_rate": 8.62078403899432e-06, "loss": 0.4314, "step": 1922 }, { "epoch": 0.9546582823101109, "grad_norm": 0.3609347641468048, "learning_rate": 8.618791018485357e-06, "loss": 0.43, "step": 1923 }, { "epoch": 0.9551547244745987, "grad_norm": 0.3490521013736725, "learning_rate": 8.616796789760424e-06, "loss": 0.4073, "step": 1924 }, { "epoch": 0.9556511666390866, "grad_norm": 0.3627062737941742, "learning_rate": 8.614801353485343e-06, "loss": 0.4002, "step": 1925 }, { "epoch": 0.9561476088035744, "grad_norm": 0.38209766149520874, "learning_rate": 8.612804710326332e-06, "loss": 0.4403, "step": 1926 }, { "epoch": 0.9566440509680623, "grad_norm": 0.3936656415462494, "learning_rate": 8.610806860950023e-06, "loss": 0.4155, "step": 1927 }, { "epoch": 0.95714049313255, "grad_norm": 0.379164457321167, "learning_rate": 8.608807806023436e-06, "loss": 0.4374, "step": 1928 }, { "epoch": 0.9576369352970379, "grad_norm": 0.4213271141052246, "learning_rate": 8.606807546214007e-06, "loss": 0.4076, "step": 1929 }, { "epoch": 0.9581333774615257, "grad_norm": 0.3824509382247925, "learning_rate": 8.604806082189564e-06, "loss": 0.4135, "step": 1930 }, { "epoch": 0.9586298196260136, "grad_norm": 0.36723557114601135, "learning_rate": 8.602803414618343e-06, "loss": 0.4073, "step": 1931 }, { "epoch": 0.9591262617905014, "grad_norm": 0.39366868138313293, "learning_rate": 8.600799544168983e-06, "loss": 0.444, "step": 1932 }, { "epoch": 0.9596227039549893, "grad_norm": 0.3544536530971527, "learning_rate": 8.598794471510519e-06, "loss": 0.4291, "step": 1933 }, { "epoch": 0.9601191461194771, "grad_norm": 0.41843485832214355, "learning_rate": 8.596788197312389e-06, "loss": 0.4236, "step": 1934 }, { "epoch": 0.9606155882839649, "grad_norm": 0.34987616539001465, "learning_rate": 8.594780722244436e-06, "loss": 0.415, "step": 1935 }, { "epoch": 0.9611120304484527, "grad_norm": 0.3739016652107239, "learning_rate": 8.592772046976901e-06, "loss": 0.4159, "step": 1936 }, { "epoch": 0.9616084726129406, "grad_norm": 0.35965824127197266, "learning_rate": 8.590762172180426e-06, "loss": 0.4061, "step": 1937 }, { "epoch": 0.9621049147774284, "grad_norm": 0.34984123706817627, "learning_rate": 8.588751098526053e-06, "loss": 0.4217, "step": 1938 }, { "epoch": 0.9626013569419163, "grad_norm": 0.37234780192375183, "learning_rate": 8.586738826685223e-06, "loss": 0.4158, "step": 1939 }, { "epoch": 0.9630977991064041, "grad_norm": 0.362560898065567, "learning_rate": 8.584725357329784e-06, "loss": 0.4305, "step": 1940 }, { "epoch": 0.963594241270892, "grad_norm": 0.36391422152519226, "learning_rate": 8.582710691131975e-06, "loss": 0.4204, "step": 1941 }, { "epoch": 0.9640906834353797, "grad_norm": 0.36079633235931396, "learning_rate": 8.580694828764438e-06, "loss": 0.3815, "step": 1942 }, { "epoch": 0.9645871255998676, "grad_norm": 0.3820597231388092, "learning_rate": 8.578677770900215e-06, "loss": 0.4199, "step": 1943 }, { "epoch": 0.9650835677643554, "grad_norm": 0.3910003900527954, "learning_rate": 8.57665951821275e-06, "loss": 0.4079, "step": 1944 }, { "epoch": 0.9655800099288433, "grad_norm": 0.38215702772140503, "learning_rate": 8.574640071375877e-06, "loss": 0.4322, "step": 1945 }, { "epoch": 0.9660764520933312, "grad_norm": 0.40718406438827515, "learning_rate": 8.572619431063839e-06, "loss": 0.4173, "step": 1946 }, { "epoch": 0.966572894257819, "grad_norm": 0.38165032863616943, "learning_rate": 8.570597597951272e-06, "loss": 0.4311, "step": 1947 }, { "epoch": 0.9670693364223067, "grad_norm": 0.36876222491264343, "learning_rate": 8.568574572713208e-06, "loss": 0.3904, "step": 1948 }, { "epoch": 0.9675657785867946, "grad_norm": 0.3796617388725281, "learning_rate": 8.566550356025083e-06, "loss": 0.4175, "step": 1949 }, { "epoch": 0.9680622207512825, "grad_norm": 0.3777402341365814, "learning_rate": 8.56452494856273e-06, "loss": 0.3953, "step": 1950 }, { "epoch": 0.9685586629157703, "grad_norm": 0.3571105897426605, "learning_rate": 8.562498351002375e-06, "loss": 0.4017, "step": 1951 }, { "epoch": 0.9690551050802582, "grad_norm": 0.3593992590904236, "learning_rate": 8.560470564020642e-06, "loss": 0.4058, "step": 1952 }, { "epoch": 0.969551547244746, "grad_norm": 0.4290255010128021, "learning_rate": 8.558441588294556e-06, "loss": 0.4271, "step": 1953 }, { "epoch": 0.9700479894092339, "grad_norm": 0.3927137851715088, "learning_rate": 8.556411424501539e-06, "loss": 0.42, "step": 1954 }, { "epoch": 0.9705444315737216, "grad_norm": 0.3735405206680298, "learning_rate": 8.554380073319403e-06, "loss": 0.4023, "step": 1955 }, { "epoch": 0.9710408737382095, "grad_norm": 0.4413205683231354, "learning_rate": 8.552347535426365e-06, "loss": 0.4081, "step": 1956 }, { "epoch": 0.9715373159026973, "grad_norm": 0.3993518352508545, "learning_rate": 8.55031381150103e-06, "loss": 0.4481, "step": 1957 }, { "epoch": 0.9720337580671852, "grad_norm": 0.3292062282562256, "learning_rate": 8.548278902222408e-06, "loss": 0.388, "step": 1958 }, { "epoch": 0.972530200231673, "grad_norm": 0.3814934492111206, "learning_rate": 8.546242808269895e-06, "loss": 0.4258, "step": 1959 }, { "epoch": 0.9730266423961609, "grad_norm": 0.4093024432659149, "learning_rate": 8.544205530323294e-06, "loss": 0.4177, "step": 1960 }, { "epoch": 0.9735230845606487, "grad_norm": 0.3822249472141266, "learning_rate": 8.542167069062788e-06, "loss": 0.441, "step": 1961 }, { "epoch": 0.9740195267251365, "grad_norm": 0.3693905472755432, "learning_rate": 8.54012742516897e-06, "loss": 0.4227, "step": 1962 }, { "epoch": 0.9745159688896243, "grad_norm": 0.3516564965248108, "learning_rate": 8.538086599322821e-06, "loss": 0.3997, "step": 1963 }, { "epoch": 0.9750124110541122, "grad_norm": 0.42061370611190796, "learning_rate": 8.536044592205716e-06, "loss": 0.4461, "step": 1964 }, { "epoch": 0.9755088532186, "grad_norm": 0.3635367751121521, "learning_rate": 8.534001404499426e-06, "loss": 0.423, "step": 1965 }, { "epoch": 0.9760052953830879, "grad_norm": 0.39285922050476074, "learning_rate": 8.531957036886114e-06, "loss": 0.4078, "step": 1966 }, { "epoch": 0.9765017375475757, "grad_norm": 0.4139218330383301, "learning_rate": 8.529911490048343e-06, "loss": 0.4485, "step": 1967 }, { "epoch": 0.9769981797120636, "grad_norm": 0.33792293071746826, "learning_rate": 8.527864764669063e-06, "loss": 0.4308, "step": 1968 }, { "epoch": 0.9774946218765513, "grad_norm": 0.3435049057006836, "learning_rate": 8.525816861431617e-06, "loss": 0.3965, "step": 1969 }, { "epoch": 0.9779910640410392, "grad_norm": 0.3321777284145355, "learning_rate": 8.523767781019752e-06, "loss": 0.407, "step": 1970 }, { "epoch": 0.978487506205527, "grad_norm": 0.3706909120082855, "learning_rate": 8.521717524117592e-06, "loss": 0.4048, "step": 1971 }, { "epoch": 0.9789839483700149, "grad_norm": 0.3942832052707672, "learning_rate": 8.519666091409669e-06, "loss": 0.4219, "step": 1972 }, { "epoch": 0.9794803905345028, "grad_norm": 0.35712024569511414, "learning_rate": 8.517613483580893e-06, "loss": 0.4031, "step": 1973 }, { "epoch": 0.9799768326989906, "grad_norm": 0.341674268245697, "learning_rate": 8.515559701316583e-06, "loss": 0.402, "step": 1974 }, { "epoch": 0.9804732748634784, "grad_norm": 0.37885424494743347, "learning_rate": 8.513504745302432e-06, "loss": 0.414, "step": 1975 }, { "epoch": 0.9809697170279662, "grad_norm": 0.38012731075286865, "learning_rate": 8.51144861622454e-06, "loss": 0.4185, "step": 1976 }, { "epoch": 0.9814661591924541, "grad_norm": 0.35857200622558594, "learning_rate": 8.509391314769394e-06, "loss": 0.3996, "step": 1977 }, { "epoch": 0.9819626013569419, "grad_norm": 0.4076017439365387, "learning_rate": 8.507332841623862e-06, "loss": 0.4364, "step": 1978 }, { "epoch": 0.9824590435214298, "grad_norm": 0.3748103678226471, "learning_rate": 8.505273197475224e-06, "loss": 0.4033, "step": 1979 }, { "epoch": 0.9829554856859176, "grad_norm": 0.37853068113327026, "learning_rate": 8.50321238301113e-06, "loss": 0.414, "step": 1980 }, { "epoch": 0.9834519278504055, "grad_norm": 0.44796639680862427, "learning_rate": 8.501150398919634e-06, "loss": 0.4297, "step": 1981 }, { "epoch": 0.9839483700148932, "grad_norm": 0.3638538122177124, "learning_rate": 8.499087245889176e-06, "loss": 0.4385, "step": 1982 }, { "epoch": 0.9844448121793811, "grad_norm": 0.38885489106178284, "learning_rate": 8.497022924608587e-06, "loss": 0.4526, "step": 1983 }, { "epoch": 0.9849412543438689, "grad_norm": 0.41402122378349304, "learning_rate": 8.494957435767086e-06, "loss": 0.3862, "step": 1984 }, { "epoch": 0.9854376965083568, "grad_norm": 0.37422969937324524, "learning_rate": 8.492890780054285e-06, "loss": 0.4258, "step": 1985 }, { "epoch": 0.9859341386728446, "grad_norm": 0.4265333116054535, "learning_rate": 8.490822958160186e-06, "loss": 0.4061, "step": 1986 }, { "epoch": 0.9864305808373325, "grad_norm": 0.4319775402545929, "learning_rate": 8.488753970775176e-06, "loss": 0.4208, "step": 1987 }, { "epoch": 0.9869270230018203, "grad_norm": 0.40758469700813293, "learning_rate": 8.486683818590033e-06, "loss": 0.4171, "step": 1988 }, { "epoch": 0.9874234651663081, "grad_norm": 0.3685147762298584, "learning_rate": 8.484612502295926e-06, "loss": 0.4161, "step": 1989 }, { "epoch": 0.9879199073307959, "grad_norm": 0.4149249196052551, "learning_rate": 8.48254002258441e-06, "loss": 0.4082, "step": 1990 }, { "epoch": 0.9884163494952838, "grad_norm": 0.3979734778404236, "learning_rate": 8.480466380147435e-06, "loss": 0.4348, "step": 1991 }, { "epoch": 0.9889127916597716, "grad_norm": 0.3562667667865753, "learning_rate": 8.478391575677325e-06, "loss": 0.3968, "step": 1992 }, { "epoch": 0.9894092338242595, "grad_norm": 0.37458497285842896, "learning_rate": 8.476315609866807e-06, "loss": 0.4251, "step": 1993 }, { "epoch": 0.9899056759887473, "grad_norm": 0.4396924376487732, "learning_rate": 8.474238483408987e-06, "loss": 0.4236, "step": 1994 }, { "epoch": 0.9904021181532352, "grad_norm": 0.435179203748703, "learning_rate": 8.472160196997364e-06, "loss": 0.4154, "step": 1995 }, { "epoch": 0.9908985603177229, "grad_norm": 0.35832908749580383, "learning_rate": 8.470080751325816e-06, "loss": 0.3854, "step": 1996 }, { "epoch": 0.9913950024822108, "grad_norm": 0.396345853805542, "learning_rate": 8.468000147088619e-06, "loss": 0.399, "step": 1997 }, { "epoch": 0.9918914446466987, "grad_norm": 0.46219322085380554, "learning_rate": 8.465918384980429e-06, "loss": 0.4104, "step": 1998 }, { "epoch": 0.9923878868111865, "grad_norm": 0.4048832356929779, "learning_rate": 8.463835465696286e-06, "loss": 0.3732, "step": 1999 }, { "epoch": 0.9928843289756744, "grad_norm": 0.3430626094341278, "learning_rate": 8.461751389931624e-06, "loss": 0.404, "step": 2000 }, { "epoch": 0.9933807711401622, "grad_norm": 0.4182393550872803, "learning_rate": 8.459666158382257e-06, "loss": 0.4126, "step": 2001 }, { "epoch": 0.99387721330465, "grad_norm": 0.38088467717170715, "learning_rate": 8.457579771744391e-06, "loss": 0.4282, "step": 2002 }, { "epoch": 0.9943736554691378, "grad_norm": 0.38678205013275146, "learning_rate": 8.455492230714611e-06, "loss": 0.4099, "step": 2003 }, { "epoch": 0.9948700976336257, "grad_norm": 0.3960254192352295, "learning_rate": 8.453403535989888e-06, "loss": 0.4111, "step": 2004 }, { "epoch": 0.9953665397981135, "grad_norm": 0.3672551214694977, "learning_rate": 8.451313688267582e-06, "loss": 0.4052, "step": 2005 }, { "epoch": 0.9958629819626014, "grad_norm": 0.3852755129337311, "learning_rate": 8.44922268824544e-06, "loss": 0.3963, "step": 2006 }, { "epoch": 0.9963594241270892, "grad_norm": 0.3594263792037964, "learning_rate": 8.447130536621584e-06, "loss": 0.421, "step": 2007 }, { "epoch": 0.9968558662915771, "grad_norm": 0.34516894817352295, "learning_rate": 8.44503723409453e-06, "loss": 0.3982, "step": 2008 }, { "epoch": 0.9973523084560648, "grad_norm": 0.3760300874710083, "learning_rate": 8.442942781363177e-06, "loss": 0.4255, "step": 2009 }, { "epoch": 0.9978487506205527, "grad_norm": 0.3789212107658386, "learning_rate": 8.440847179126802e-06, "loss": 0.4107, "step": 2010 }, { "epoch": 0.9983451927850405, "grad_norm": 0.40309029817581177, "learning_rate": 8.43875042808507e-06, "loss": 0.4304, "step": 2011 }, { "epoch": 0.9988416349495284, "grad_norm": 0.47544294595718384, "learning_rate": 8.43665252893803e-06, "loss": 0.4088, "step": 2012 }, { "epoch": 0.9993380771140162, "grad_norm": 0.38703134655952454, "learning_rate": 8.434553482386116e-06, "loss": 0.4191, "step": 2013 }, { "epoch": 0.9998345192785041, "grad_norm": 0.3764602243900299, "learning_rate": 8.432453289130139e-06, "loss": 0.4067, "step": 2014 }, { "epoch": 1.0003309614429918, "grad_norm": 0.773108959197998, "learning_rate": 8.430351949871298e-06, "loss": 0.6266, "step": 2015 }, { "epoch": 1.0008274036074798, "grad_norm": 0.39761829376220703, "learning_rate": 8.42824946531117e-06, "loss": 0.3681, "step": 2016 }, { "epoch": 1.0013238457719675, "grad_norm": 0.41880178451538086, "learning_rate": 8.426145836151723e-06, "loss": 0.3773, "step": 2017 }, { "epoch": 1.0018202879364555, "grad_norm": 0.4684564173221588, "learning_rate": 8.424041063095298e-06, "loss": 0.4247, "step": 2018 }, { "epoch": 1.0023167301009432, "grad_norm": 0.41831687092781067, "learning_rate": 8.421935146844622e-06, "loss": 0.3714, "step": 2019 }, { "epoch": 1.002813172265431, "grad_norm": 0.44369399547576904, "learning_rate": 8.419828088102804e-06, "loss": 0.4018, "step": 2020 }, { "epoch": 1.003309614429919, "grad_norm": 0.4473067820072174, "learning_rate": 8.417719887573334e-06, "loss": 0.3775, "step": 2021 }, { "epoch": 1.0038060565944067, "grad_norm": 0.40185222029685974, "learning_rate": 8.41561054596008e-06, "loss": 0.3844, "step": 2022 }, { "epoch": 1.0043024987588947, "grad_norm": 0.3920283615589142, "learning_rate": 8.413500063967296e-06, "loss": 0.3815, "step": 2023 }, { "epoch": 1.0047989409233824, "grad_norm": 0.44845959544181824, "learning_rate": 8.411388442299617e-06, "loss": 0.4094, "step": 2024 }, { "epoch": 1.0052953830878704, "grad_norm": 0.4070073664188385, "learning_rate": 8.40927568166205e-06, "loss": 0.3984, "step": 2025 }, { "epoch": 1.005791825252358, "grad_norm": 0.399277925491333, "learning_rate": 8.407161782759995e-06, "loss": 0.4162, "step": 2026 }, { "epoch": 1.0062882674168458, "grad_norm": 0.393991082906723, "learning_rate": 8.405046746299221e-06, "loss": 0.3754, "step": 2027 }, { "epoch": 1.0067847095813338, "grad_norm": 0.3946228623390198, "learning_rate": 8.402930572985884e-06, "loss": 0.4055, "step": 2028 }, { "epoch": 1.0072811517458216, "grad_norm": 0.3783324956893921, "learning_rate": 8.400813263526512e-06, "loss": 0.3674, "step": 2029 }, { "epoch": 1.0077775939103095, "grad_norm": 0.45279255509376526, "learning_rate": 8.398694818628023e-06, "loss": 0.3887, "step": 2030 }, { "epoch": 1.0082740360747973, "grad_norm": 0.37674441933631897, "learning_rate": 8.396575238997704e-06, "loss": 0.34, "step": 2031 }, { "epoch": 1.0087704782392852, "grad_norm": 0.4030028283596039, "learning_rate": 8.394454525343227e-06, "loss": 0.4123, "step": 2032 }, { "epoch": 1.009266920403773, "grad_norm": 0.35604146122932434, "learning_rate": 8.39233267837264e-06, "loss": 0.3861, "step": 2033 }, { "epoch": 1.0097633625682607, "grad_norm": 0.42971545457839966, "learning_rate": 8.390209698794371e-06, "loss": 0.4132, "step": 2034 }, { "epoch": 1.0102598047327487, "grad_norm": 0.3627679646015167, "learning_rate": 8.388085587317224e-06, "loss": 0.372, "step": 2035 }, { "epoch": 1.0107562468972364, "grad_norm": 0.35414889454841614, "learning_rate": 8.38596034465038e-06, "loss": 0.3555, "step": 2036 }, { "epoch": 1.0112526890617244, "grad_norm": 0.37428995966911316, "learning_rate": 8.383833971503405e-06, "loss": 0.3472, "step": 2037 }, { "epoch": 1.0117491312262121, "grad_norm": 0.37734055519104004, "learning_rate": 8.381706468586234e-06, "loss": 0.3975, "step": 2038 }, { "epoch": 1.0122455733907, "grad_norm": 0.3294685184955597, "learning_rate": 8.379577836609183e-06, "loss": 0.3909, "step": 2039 }, { "epoch": 1.0127420155551878, "grad_norm": 0.3826034963130951, "learning_rate": 8.377448076282942e-06, "loss": 0.3979, "step": 2040 }, { "epoch": 1.0132384577196756, "grad_norm": 0.35725831985473633, "learning_rate": 8.375317188318586e-06, "loss": 0.3804, "step": 2041 }, { "epoch": 1.0137348998841635, "grad_norm": 0.3600703775882721, "learning_rate": 8.373185173427553e-06, "loss": 0.3958, "step": 2042 }, { "epoch": 1.0142313420486513, "grad_norm": 0.38967254757881165, "learning_rate": 8.371052032321672e-06, "loss": 0.3784, "step": 2043 }, { "epoch": 1.0147277842131393, "grad_norm": 0.3693313002586365, "learning_rate": 8.368917765713136e-06, "loss": 0.3826, "step": 2044 }, { "epoch": 1.015224226377627, "grad_norm": 0.3522603213787079, "learning_rate": 8.36678237431452e-06, "loss": 0.4139, "step": 2045 }, { "epoch": 1.0157206685421147, "grad_norm": 0.3520358204841614, "learning_rate": 8.364645858838773e-06, "loss": 0.3552, "step": 2046 }, { "epoch": 1.0162171107066027, "grad_norm": 0.402457058429718, "learning_rate": 8.362508219999222e-06, "loss": 0.4489, "step": 2047 }, { "epoch": 1.0167135528710904, "grad_norm": 0.3466389775276184, "learning_rate": 8.36036945850956e-06, "loss": 0.3568, "step": 2048 }, { "epoch": 1.0172099950355784, "grad_norm": 0.3530232012271881, "learning_rate": 8.35822957508387e-06, "loss": 0.37, "step": 2049 }, { "epoch": 1.0177064372000661, "grad_norm": 0.46518945693969727, "learning_rate": 8.356088570436593e-06, "loss": 0.4592, "step": 2050 }, { "epoch": 1.0182028793645541, "grad_norm": 0.3850618898868561, "learning_rate": 8.353946445282558e-06, "loss": 0.3965, "step": 2051 }, { "epoch": 1.0186993215290419, "grad_norm": 0.3937636911869049, "learning_rate": 8.35180320033696e-06, "loss": 0.3733, "step": 2052 }, { "epoch": 1.0191957636935296, "grad_norm": 0.4036066234111786, "learning_rate": 8.349658836315369e-06, "loss": 0.3915, "step": 2053 }, { "epoch": 1.0196922058580176, "grad_norm": 0.40514594316482544, "learning_rate": 8.347513353933733e-06, "loss": 0.3628, "step": 2054 }, { "epoch": 1.0201886480225053, "grad_norm": 0.4089788794517517, "learning_rate": 8.345366753908366e-06, "loss": 0.4022, "step": 2055 }, { "epoch": 1.0206850901869933, "grad_norm": 0.37966278195381165, "learning_rate": 8.343219036955965e-06, "loss": 0.3666, "step": 2056 }, { "epoch": 1.021181532351481, "grad_norm": 0.4466352164745331, "learning_rate": 8.34107020379359e-06, "loss": 0.4179, "step": 2057 }, { "epoch": 1.021677974515969, "grad_norm": 0.36988502740859985, "learning_rate": 8.338920255138679e-06, "loss": 0.3673, "step": 2058 }, { "epoch": 1.0221744166804567, "grad_norm": 0.374826580286026, "learning_rate": 8.336769191709041e-06, "loss": 0.3977, "step": 2059 }, { "epoch": 1.0226708588449445, "grad_norm": 0.3295980393886566, "learning_rate": 8.334617014222858e-06, "loss": 0.3717, "step": 2060 }, { "epoch": 1.0231673010094324, "grad_norm": 0.42135104537010193, "learning_rate": 8.332463723398684e-06, "loss": 0.4207, "step": 2061 }, { "epoch": 1.0236637431739202, "grad_norm": 0.39542248845100403, "learning_rate": 8.330309319955446e-06, "loss": 0.3599, "step": 2062 }, { "epoch": 1.0241601853384081, "grad_norm": 0.47270718216896057, "learning_rate": 8.328153804612437e-06, "loss": 0.4002, "step": 2063 }, { "epoch": 1.0246566275028959, "grad_norm": 0.3374427258968353, "learning_rate": 8.325997178089329e-06, "loss": 0.3722, "step": 2064 }, { "epoch": 1.0251530696673838, "grad_norm": 0.4421992003917694, "learning_rate": 8.323839441106156e-06, "loss": 0.3915, "step": 2065 }, { "epoch": 1.0256495118318716, "grad_norm": 0.4027402698993683, "learning_rate": 8.321680594383332e-06, "loss": 0.3859, "step": 2066 }, { "epoch": 1.0261459539963593, "grad_norm": 0.3244870603084564, "learning_rate": 8.319520638641636e-06, "loss": 0.3864, "step": 2067 }, { "epoch": 1.0266423961608473, "grad_norm": 0.40958938002586365, "learning_rate": 8.317359574602217e-06, "loss": 0.4224, "step": 2068 }, { "epoch": 1.027138838325335, "grad_norm": 0.3950890898704529, "learning_rate": 8.315197402986599e-06, "loss": 0.3801, "step": 2069 }, { "epoch": 1.027635280489823, "grad_norm": 0.3430311977863312, "learning_rate": 8.313034124516668e-06, "loss": 0.3681, "step": 2070 }, { "epoch": 1.0281317226543107, "grad_norm": 0.3976898193359375, "learning_rate": 8.310869739914688e-06, "loss": 0.4161, "step": 2071 }, { "epoch": 1.0286281648187987, "grad_norm": 0.37889760732650757, "learning_rate": 8.308704249903286e-06, "loss": 0.3977, "step": 2072 }, { "epoch": 1.0291246069832864, "grad_norm": 0.4024152457714081, "learning_rate": 8.30653765520546e-06, "loss": 0.3792, "step": 2073 }, { "epoch": 1.0296210491477742, "grad_norm": 0.4046730399131775, "learning_rate": 8.304369956544576e-06, "loss": 0.3562, "step": 2074 }, { "epoch": 1.0301174913122622, "grad_norm": 0.3650938868522644, "learning_rate": 8.302201154644373e-06, "loss": 0.4059, "step": 2075 }, { "epoch": 1.03061393347675, "grad_norm": 0.3792347311973572, "learning_rate": 8.300031250228954e-06, "loss": 0.3604, "step": 2076 }, { "epoch": 1.0311103756412379, "grad_norm": 0.3619602918624878, "learning_rate": 8.29786024402279e-06, "loss": 0.385, "step": 2077 }, { "epoch": 1.0316068178057256, "grad_norm": 0.3934377133846283, "learning_rate": 8.295688136750721e-06, "loss": 0.3806, "step": 2078 }, { "epoch": 1.0321032599702136, "grad_norm": 0.31635820865631104, "learning_rate": 8.293514929137954e-06, "loss": 0.3278, "step": 2079 }, { "epoch": 1.0325997021347013, "grad_norm": 0.36716070771217346, "learning_rate": 8.291340621910066e-06, "loss": 0.3688, "step": 2080 }, { "epoch": 1.033096144299189, "grad_norm": 0.40970879793167114, "learning_rate": 8.289165215792998e-06, "loss": 0.4435, "step": 2081 }, { "epoch": 1.033592586463677, "grad_norm": 0.355609267950058, "learning_rate": 8.28698871151306e-06, "loss": 0.3951, "step": 2082 }, { "epoch": 1.0340890286281648, "grad_norm": 0.3582027554512024, "learning_rate": 8.284811109796926e-06, "loss": 0.3764, "step": 2083 }, { "epoch": 1.0345854707926527, "grad_norm": 0.3632797300815582, "learning_rate": 8.282632411371639e-06, "loss": 0.3834, "step": 2084 }, { "epoch": 1.0350819129571405, "grad_norm": 0.388430655002594, "learning_rate": 8.280452616964604e-06, "loss": 0.4231, "step": 2085 }, { "epoch": 1.0355783551216284, "grad_norm": 0.31115320324897766, "learning_rate": 8.278271727303602e-06, "loss": 0.3457, "step": 2086 }, { "epoch": 1.0360747972861162, "grad_norm": 0.4137135148048401, "learning_rate": 8.276089743116765e-06, "loss": 0.3832, "step": 2087 }, { "epoch": 1.036571239450604, "grad_norm": 0.3698047399520874, "learning_rate": 8.273906665132605e-06, "loss": 0.3991, "step": 2088 }, { "epoch": 1.0370676816150919, "grad_norm": 0.3954959809780121, "learning_rate": 8.271722494079987e-06, "loss": 0.3778, "step": 2089 }, { "epoch": 1.0375641237795796, "grad_norm": 0.38647761940956116, "learning_rate": 8.26953723068815e-06, "loss": 0.4231, "step": 2090 }, { "epoch": 1.0380605659440676, "grad_norm": 0.3644237816333771, "learning_rate": 8.267350875686693e-06, "loss": 0.4003, "step": 2091 }, { "epoch": 1.0385570081085553, "grad_norm": 0.3733663856983185, "learning_rate": 8.26516342980558e-06, "loss": 0.3934, "step": 2092 }, { "epoch": 1.0390534502730433, "grad_norm": 0.36825332045555115, "learning_rate": 8.26297489377514e-06, "loss": 0.3599, "step": 2093 }, { "epoch": 1.039549892437531, "grad_norm": 0.3756932020187378, "learning_rate": 8.260785268326066e-06, "loss": 0.4058, "step": 2094 }, { "epoch": 1.0400463346020188, "grad_norm": 0.3824720084667206, "learning_rate": 8.258594554189415e-06, "loss": 0.3771, "step": 2095 }, { "epoch": 1.0405427767665067, "grad_norm": 0.35530564188957214, "learning_rate": 8.256402752096603e-06, "loss": 0.3809, "step": 2096 }, { "epoch": 1.0410392189309945, "grad_norm": 0.4004719853401184, "learning_rate": 8.25420986277942e-06, "loss": 0.3593, "step": 2097 }, { "epoch": 1.0415356610954825, "grad_norm": 0.39908117055892944, "learning_rate": 8.252015886970005e-06, "loss": 0.3866, "step": 2098 }, { "epoch": 1.0420321032599702, "grad_norm": 0.415581613779068, "learning_rate": 8.249820825400871e-06, "loss": 0.4278, "step": 2099 }, { "epoch": 1.042528545424458, "grad_norm": 0.4321013391017914, "learning_rate": 8.24762467880489e-06, "loss": 0.3936, "step": 2100 }, { "epoch": 1.043024987588946, "grad_norm": 0.35715317726135254, "learning_rate": 8.245427447915293e-06, "loss": 0.3129, "step": 2101 }, { "epoch": 1.0435214297534336, "grad_norm": 0.39057600498199463, "learning_rate": 8.243229133465677e-06, "loss": 0.408, "step": 2102 }, { "epoch": 1.0440178719179216, "grad_norm": 0.3775971531867981, "learning_rate": 8.241029736190001e-06, "loss": 0.4019, "step": 2103 }, { "epoch": 1.0445143140824094, "grad_norm": 0.4297982156276703, "learning_rate": 8.23882925682258e-06, "loss": 0.3935, "step": 2104 }, { "epoch": 1.0450107562468973, "grad_norm": 0.37020421028137207, "learning_rate": 8.236627696098099e-06, "loss": 0.3211, "step": 2105 }, { "epoch": 1.045507198411385, "grad_norm": 0.3700840175151825, "learning_rate": 8.234425054751595e-06, "loss": 0.3914, "step": 2106 }, { "epoch": 1.0460036405758728, "grad_norm": 0.395827054977417, "learning_rate": 8.232221333518474e-06, "loss": 0.4223, "step": 2107 }, { "epoch": 1.0465000827403608, "grad_norm": 0.4135538935661316, "learning_rate": 8.230016533134495e-06, "loss": 0.3958, "step": 2108 }, { "epoch": 1.0469965249048485, "grad_norm": 0.3333798944950104, "learning_rate": 8.227810654335784e-06, "loss": 0.3764, "step": 2109 }, { "epoch": 1.0474929670693365, "grad_norm": 0.4000759720802307, "learning_rate": 8.225603697858822e-06, "loss": 0.3646, "step": 2110 }, { "epoch": 1.0479894092338242, "grad_norm": 0.34949368238449097, "learning_rate": 8.223395664440451e-06, "loss": 0.3873, "step": 2111 }, { "epoch": 1.0484858513983122, "grad_norm": 0.4058420956134796, "learning_rate": 8.221186554817877e-06, "loss": 0.3772, "step": 2112 }, { "epoch": 1.0489822935628, "grad_norm": 0.38279417157173157, "learning_rate": 8.218976369728658e-06, "loss": 0.3654, "step": 2113 }, { "epoch": 1.0494787357272877, "grad_norm": 0.32806119322776794, "learning_rate": 8.216765109910716e-06, "loss": 0.3516, "step": 2114 }, { "epoch": 1.0499751778917756, "grad_norm": 0.3521556854248047, "learning_rate": 8.21455277610233e-06, "loss": 0.3682, "step": 2115 }, { "epoch": 1.0504716200562634, "grad_norm": 0.3824714422225952, "learning_rate": 8.212339369042139e-06, "loss": 0.3716, "step": 2116 }, { "epoch": 1.0509680622207513, "grad_norm": 0.37684255838394165, "learning_rate": 8.21012488946914e-06, "loss": 0.3753, "step": 2117 }, { "epoch": 1.051464504385239, "grad_norm": 0.3369491398334503, "learning_rate": 8.207909338122687e-06, "loss": 0.3683, "step": 2118 }, { "epoch": 1.051960946549727, "grad_norm": 0.40964752435684204, "learning_rate": 8.205692715742491e-06, "loss": 0.3533, "step": 2119 }, { "epoch": 1.0524573887142148, "grad_norm": 0.41442134976387024, "learning_rate": 8.203475023068624e-06, "loss": 0.3873, "step": 2120 }, { "epoch": 1.0529538308787025, "grad_norm": 0.35688209533691406, "learning_rate": 8.201256260841513e-06, "loss": 0.3823, "step": 2121 }, { "epoch": 1.0534502730431905, "grad_norm": 0.41384658217430115, "learning_rate": 8.199036429801942e-06, "loss": 0.4063, "step": 2122 }, { "epoch": 1.0539467152076782, "grad_norm": 0.41385704278945923, "learning_rate": 8.19681553069105e-06, "loss": 0.3921, "step": 2123 }, { "epoch": 1.0544431573721662, "grad_norm": 0.3682198226451874, "learning_rate": 8.194593564250337e-06, "loss": 0.3782, "step": 2124 }, { "epoch": 1.054939599536654, "grad_norm": 0.4080114960670471, "learning_rate": 8.192370531221659e-06, "loss": 0.4014, "step": 2125 }, { "epoch": 1.055436041701142, "grad_norm": 0.36291515827178955, "learning_rate": 8.190146432347223e-06, "loss": 0.3421, "step": 2126 }, { "epoch": 1.0559324838656297, "grad_norm": 0.3890489339828491, "learning_rate": 8.187921268369598e-06, "loss": 0.373, "step": 2127 }, { "epoch": 1.0564289260301174, "grad_norm": 0.3694446384906769, "learning_rate": 8.185695040031702e-06, "loss": 0.3695, "step": 2128 }, { "epoch": 1.0569253681946054, "grad_norm": 0.3984377682209015, "learning_rate": 8.183467748076817e-06, "loss": 0.3922, "step": 2129 }, { "epoch": 1.057421810359093, "grad_norm": 0.37950676679611206, "learning_rate": 8.181239393248572e-06, "loss": 0.4077, "step": 2130 }, { "epoch": 1.057918252523581, "grad_norm": 0.4164874255657196, "learning_rate": 8.179009976290955e-06, "loss": 0.426, "step": 2131 }, { "epoch": 1.0584146946880688, "grad_norm": 0.37924954295158386, "learning_rate": 8.176779497948308e-06, "loss": 0.3881, "step": 2132 }, { "epoch": 1.0589111368525568, "grad_norm": 0.33743345737457275, "learning_rate": 8.174547958965325e-06, "loss": 0.3828, "step": 2133 }, { "epoch": 1.0594075790170445, "grad_norm": 0.3631286919116974, "learning_rate": 8.17231536008706e-06, "loss": 0.4036, "step": 2134 }, { "epoch": 1.0599040211815323, "grad_norm": 0.3658657670021057, "learning_rate": 8.170081702058914e-06, "loss": 0.4065, "step": 2135 }, { "epoch": 1.0604004633460202, "grad_norm": 0.3447650074958801, "learning_rate": 8.167846985626646e-06, "loss": 0.3648, "step": 2136 }, { "epoch": 1.060896905510508, "grad_norm": 0.3546299934387207, "learning_rate": 8.165611211536365e-06, "loss": 0.4093, "step": 2137 }, { "epoch": 1.061393347674996, "grad_norm": 0.3557119369506836, "learning_rate": 8.16337438053454e-06, "loss": 0.407, "step": 2138 }, { "epoch": 1.0618897898394837, "grad_norm": 0.3694717288017273, "learning_rate": 8.161136493367983e-06, "loss": 0.4006, "step": 2139 }, { "epoch": 1.0623862320039716, "grad_norm": 0.38199812173843384, "learning_rate": 8.158897550783868e-06, "loss": 0.4098, "step": 2140 }, { "epoch": 1.0628826741684594, "grad_norm": 0.35754144191741943, "learning_rate": 8.156657553529712e-06, "loss": 0.3644, "step": 2141 }, { "epoch": 1.0633791163329471, "grad_norm": 0.4022863805294037, "learning_rate": 8.154416502353394e-06, "loss": 0.4197, "step": 2142 }, { "epoch": 1.063875558497435, "grad_norm": 0.36401796340942383, "learning_rate": 8.152174398003138e-06, "loss": 0.3608, "step": 2143 }, { "epoch": 1.0643720006619228, "grad_norm": 0.3773067891597748, "learning_rate": 8.149931241227522e-06, "loss": 0.3841, "step": 2144 }, { "epoch": 1.0648684428264108, "grad_norm": 0.35591527819633484, "learning_rate": 8.147687032775473e-06, "loss": 0.3941, "step": 2145 }, { "epoch": 1.0653648849908985, "grad_norm": 0.40955284237861633, "learning_rate": 8.145441773396276e-06, "loss": 0.3911, "step": 2146 }, { "epoch": 1.0658613271553863, "grad_norm": 0.35035672783851624, "learning_rate": 8.143195463839557e-06, "loss": 0.3956, "step": 2147 }, { "epoch": 1.0663577693198742, "grad_norm": 0.36095160245895386, "learning_rate": 8.140948104855301e-06, "loss": 0.3971, "step": 2148 }, { "epoch": 1.066854211484362, "grad_norm": 0.3798268437385559, "learning_rate": 8.13869969719384e-06, "loss": 0.3631, "step": 2149 }, { "epoch": 1.06735065364885, "grad_norm": 0.42894065380096436, "learning_rate": 8.136450241605854e-06, "loss": 0.3996, "step": 2150 }, { "epoch": 1.0678470958133377, "grad_norm": 0.38194239139556885, "learning_rate": 8.134199738842376e-06, "loss": 0.4222, "step": 2151 }, { "epoch": 1.0683435379778257, "grad_norm": 0.3878590762615204, "learning_rate": 8.131948189654789e-06, "loss": 0.3739, "step": 2152 }, { "epoch": 1.0688399801423134, "grad_norm": 0.40790706872940063, "learning_rate": 8.129695594794822e-06, "loss": 0.3917, "step": 2153 }, { "epoch": 1.0693364223068014, "grad_norm": 0.39253684878349304, "learning_rate": 8.127441955014557e-06, "loss": 0.411, "step": 2154 }, { "epoch": 1.0698328644712891, "grad_norm": 0.3587706685066223, "learning_rate": 8.12518727106642e-06, "loss": 0.395, "step": 2155 }, { "epoch": 1.0703293066357769, "grad_norm": 0.4169301986694336, "learning_rate": 8.122931543703194e-06, "loss": 0.352, "step": 2156 }, { "epoch": 1.0708257488002648, "grad_norm": 0.35836315155029297, "learning_rate": 8.120674773678e-06, "loss": 0.3662, "step": 2157 }, { "epoch": 1.0713221909647526, "grad_norm": 0.4390971064567566, "learning_rate": 8.118416961744318e-06, "loss": 0.4394, "step": 2158 }, { "epoch": 1.0718186331292405, "grad_norm": 0.38597020506858826, "learning_rate": 8.116158108655964e-06, "loss": 0.3822, "step": 2159 }, { "epoch": 1.0723150752937283, "grad_norm": 0.36819201707839966, "learning_rate": 8.113898215167109e-06, "loss": 0.3953, "step": 2160 }, { "epoch": 1.072811517458216, "grad_norm": 0.3702963888645172, "learning_rate": 8.111637282032273e-06, "loss": 0.3562, "step": 2161 }, { "epoch": 1.073307959622704, "grad_norm": 0.40401536226272583, "learning_rate": 8.109375310006317e-06, "loss": 0.3979, "step": 2162 }, { "epoch": 1.0738044017871917, "grad_norm": 0.3972966969013214, "learning_rate": 8.107112299844453e-06, "loss": 0.4185, "step": 2163 }, { "epoch": 1.0743008439516797, "grad_norm": 0.34863534569740295, "learning_rate": 8.10484825230224e-06, "loss": 0.3429, "step": 2164 }, { "epoch": 1.0747972861161674, "grad_norm": 0.37496665120124817, "learning_rate": 8.102583168135579e-06, "loss": 0.3772, "step": 2165 }, { "epoch": 1.0752937282806554, "grad_norm": 0.3874848484992981, "learning_rate": 8.100317048100722e-06, "loss": 0.3694, "step": 2166 }, { "epoch": 1.0757901704451431, "grad_norm": 0.3553217947483063, "learning_rate": 8.098049892954264e-06, "loss": 0.3305, "step": 2167 }, { "epoch": 1.0762866126096309, "grad_norm": 0.45050421357154846, "learning_rate": 8.095781703453149e-06, "loss": 0.4132, "step": 2168 }, { "epoch": 1.0767830547741188, "grad_norm": 0.40059518814086914, "learning_rate": 8.093512480354662e-06, "loss": 0.4249, "step": 2169 }, { "epoch": 1.0772794969386066, "grad_norm": 0.4339570105075836, "learning_rate": 8.091242224416434e-06, "loss": 0.3543, "step": 2170 }, { "epoch": 1.0777759391030945, "grad_norm": 0.4424574375152588, "learning_rate": 8.08897093639644e-06, "loss": 0.3827, "step": 2171 }, { "epoch": 1.0782723812675823, "grad_norm": 0.3898351490497589, "learning_rate": 8.086698617053009e-06, "loss": 0.3758, "step": 2172 }, { "epoch": 1.0787688234320703, "grad_norm": 0.38825660943984985, "learning_rate": 8.084425267144798e-06, "loss": 0.385, "step": 2173 }, { "epoch": 1.079265265596558, "grad_norm": 0.3929385244846344, "learning_rate": 8.08215088743082e-06, "loss": 0.3549, "step": 2174 }, { "epoch": 1.0797617077610457, "grad_norm": 0.41983547806739807, "learning_rate": 8.079875478670431e-06, "loss": 0.4168, "step": 2175 }, { "epoch": 1.0802581499255337, "grad_norm": 0.36557140946388245, "learning_rate": 8.077599041623325e-06, "loss": 0.3613, "step": 2176 }, { "epoch": 1.0807545920900214, "grad_norm": 0.4427846074104309, "learning_rate": 8.075321577049545e-06, "loss": 0.3883, "step": 2177 }, { "epoch": 1.0812510342545094, "grad_norm": 0.3646920323371887, "learning_rate": 8.07304308570947e-06, "loss": 0.3626, "step": 2178 }, { "epoch": 1.0817474764189972, "grad_norm": 0.4184908866882324, "learning_rate": 8.07076356836383e-06, "loss": 0.4199, "step": 2179 }, { "epoch": 1.0822439185834851, "grad_norm": 0.4016585350036621, "learning_rate": 8.068483025773694e-06, "loss": 0.334, "step": 2180 }, { "epoch": 1.0827403607479729, "grad_norm": 0.42132261395454407, "learning_rate": 8.066201458700474e-06, "loss": 0.4065, "step": 2181 }, { "epoch": 1.0832368029124606, "grad_norm": 0.4435719847679138, "learning_rate": 8.06391886790592e-06, "loss": 0.3776, "step": 2182 }, { "epoch": 1.0837332450769486, "grad_norm": 0.4013338088989258, "learning_rate": 8.061635254152129e-06, "loss": 0.4387, "step": 2183 }, { "epoch": 1.0842296872414363, "grad_norm": 0.4154370129108429, "learning_rate": 8.059350618201538e-06, "loss": 0.4069, "step": 2184 }, { "epoch": 1.0847261294059243, "grad_norm": 0.42340394854545593, "learning_rate": 8.057064960816924e-06, "loss": 0.3777, "step": 2185 }, { "epoch": 1.085222571570412, "grad_norm": 0.3989575207233429, "learning_rate": 8.054778282761405e-06, "loss": 0.3876, "step": 2186 }, { "epoch": 1.0857190137349, "grad_norm": 0.3946067690849304, "learning_rate": 8.052490584798442e-06, "loss": 0.3701, "step": 2187 }, { "epoch": 1.0862154558993877, "grad_norm": 0.3886314332485199, "learning_rate": 8.050201867691836e-06, "loss": 0.4064, "step": 2188 }, { "epoch": 1.0867118980638755, "grad_norm": 0.38769304752349854, "learning_rate": 8.047912132205725e-06, "loss": 0.398, "step": 2189 }, { "epoch": 1.0872083402283634, "grad_norm": 0.39187854528427124, "learning_rate": 8.045621379104592e-06, "loss": 0.4103, "step": 2190 }, { "epoch": 1.0877047823928512, "grad_norm": 0.36251339316368103, "learning_rate": 8.043329609153254e-06, "loss": 0.3967, "step": 2191 }, { "epoch": 1.0882012245573391, "grad_norm": 0.3623741567134857, "learning_rate": 8.041036823116874e-06, "loss": 0.374, "step": 2192 }, { "epoch": 1.0886976667218269, "grad_norm": 0.4074133038520813, "learning_rate": 8.038743021760948e-06, "loss": 0.3865, "step": 2193 }, { "epoch": 1.0891941088863146, "grad_norm": 0.3911050856113434, "learning_rate": 8.036448205851316e-06, "loss": 0.3861, "step": 2194 }, { "epoch": 1.0896905510508026, "grad_norm": 0.36490586400032043, "learning_rate": 8.034152376154156e-06, "loss": 0.3913, "step": 2195 }, { "epoch": 1.0901869932152903, "grad_norm": 0.40038570761680603, "learning_rate": 8.031855533435979e-06, "loss": 0.4092, "step": 2196 }, { "epoch": 1.0906834353797783, "grad_norm": 0.4475693702697754, "learning_rate": 8.029557678463642e-06, "loss": 0.3725, "step": 2197 }, { "epoch": 1.091179877544266, "grad_norm": 0.3546839654445648, "learning_rate": 8.027258812004335e-06, "loss": 0.3748, "step": 2198 }, { "epoch": 1.091676319708754, "grad_norm": 0.3925829827785492, "learning_rate": 8.024958934825587e-06, "loss": 0.3656, "step": 2199 }, { "epoch": 1.0921727618732417, "grad_norm": 0.356192022562027, "learning_rate": 8.022658047695264e-06, "loss": 0.3954, "step": 2200 }, { "epoch": 1.0926692040377297, "grad_norm": 0.3425845503807068, "learning_rate": 8.020356151381569e-06, "loss": 0.3781, "step": 2201 }, { "epoch": 1.0931656462022175, "grad_norm": 0.375305712223053, "learning_rate": 8.018053246653047e-06, "loss": 0.389, "step": 2202 }, { "epoch": 1.0936620883667052, "grad_norm": 0.3410871922969818, "learning_rate": 8.015749334278569e-06, "loss": 0.3368, "step": 2203 }, { "epoch": 1.0941585305311932, "grad_norm": 0.42358753085136414, "learning_rate": 8.013444415027352e-06, "loss": 0.4223, "step": 2204 }, { "epoch": 1.094654972695681, "grad_norm": 0.36096644401550293, "learning_rate": 8.011138489668948e-06, "loss": 0.3847, "step": 2205 }, { "epoch": 1.0951514148601689, "grad_norm": 0.3530903160572052, "learning_rate": 8.008831558973237e-06, "loss": 0.3764, "step": 2206 }, { "epoch": 1.0956478570246566, "grad_norm": 0.3434128761291504, "learning_rate": 8.006523623710449e-06, "loss": 0.345, "step": 2207 }, { "epoch": 1.0961442991891444, "grad_norm": 0.4156833589076996, "learning_rate": 8.004214684651133e-06, "loss": 0.4114, "step": 2208 }, { "epoch": 1.0966407413536323, "grad_norm": 0.3173113763332367, "learning_rate": 8.001904742566183e-06, "loss": 0.3511, "step": 2209 }, { "epoch": 1.09713718351812, "grad_norm": 0.399172842502594, "learning_rate": 7.999593798226827e-06, "loss": 0.419, "step": 2210 }, { "epoch": 1.097633625682608, "grad_norm": 0.3439413607120514, "learning_rate": 7.997281852404629e-06, "loss": 0.319, "step": 2211 }, { "epoch": 1.0981300678470958, "grad_norm": 0.40024903416633606, "learning_rate": 7.994968905871479e-06, "loss": 0.4154, "step": 2212 }, { "epoch": 1.0986265100115837, "grad_norm": 0.33427318930625916, "learning_rate": 7.992654959399611e-06, "loss": 0.3561, "step": 2213 }, { "epoch": 1.0991229521760715, "grad_norm": 0.3788261413574219, "learning_rate": 7.990340013761587e-06, "loss": 0.4333, "step": 2214 }, { "epoch": 1.0996193943405594, "grad_norm": 0.3369234800338745, "learning_rate": 7.988024069730306e-06, "loss": 0.3606, "step": 2215 }, { "epoch": 1.1001158365050472, "grad_norm": 0.3771449327468872, "learning_rate": 7.985707128079e-06, "loss": 0.3992, "step": 2216 }, { "epoch": 1.100612278669535, "grad_norm": 0.35822930932044983, "learning_rate": 7.983389189581227e-06, "loss": 0.3918, "step": 2217 }, { "epoch": 1.101108720834023, "grad_norm": 0.33791545033454895, "learning_rate": 7.98107025501089e-06, "loss": 0.3565, "step": 2218 }, { "epoch": 1.1016051629985106, "grad_norm": 0.35206139087677, "learning_rate": 7.978750325142217e-06, "loss": 0.3612, "step": 2219 }, { "epoch": 1.1021016051629986, "grad_norm": 0.3750723600387573, "learning_rate": 7.976429400749766e-06, "loss": 0.3905, "step": 2220 }, { "epoch": 1.1025980473274863, "grad_norm": 0.3800555467605591, "learning_rate": 7.974107482608434e-06, "loss": 0.3953, "step": 2221 }, { "epoch": 1.103094489491974, "grad_norm": 0.3609413802623749, "learning_rate": 7.971784571493446e-06, "loss": 0.3727, "step": 2222 }, { "epoch": 1.103590931656462, "grad_norm": 0.3349706530570984, "learning_rate": 7.969460668180358e-06, "loss": 0.3559, "step": 2223 }, { "epoch": 1.1040873738209498, "grad_norm": 0.37222206592559814, "learning_rate": 7.967135773445059e-06, "loss": 0.4049, "step": 2224 }, { "epoch": 1.1045838159854378, "grad_norm": 0.35927021503448486, "learning_rate": 7.964809888063765e-06, "loss": 0.3959, "step": 2225 }, { "epoch": 1.1050802581499255, "grad_norm": 0.370364785194397, "learning_rate": 7.962483012813029e-06, "loss": 0.3916, "step": 2226 }, { "epoch": 1.1055767003144135, "grad_norm": 0.3336319625377655, "learning_rate": 7.960155148469733e-06, "loss": 0.3404, "step": 2227 }, { "epoch": 1.1060731424789012, "grad_norm": 0.4148544669151306, "learning_rate": 7.957826295811085e-06, "loss": 0.4222, "step": 2228 }, { "epoch": 1.106569584643389, "grad_norm": 0.35007229447364807, "learning_rate": 7.955496455614624e-06, "loss": 0.3767, "step": 2229 }, { "epoch": 1.107066026807877, "grad_norm": 0.38181257247924805, "learning_rate": 7.953165628658224e-06, "loss": 0.346, "step": 2230 }, { "epoch": 1.1075624689723647, "grad_norm": 0.38662418723106384, "learning_rate": 7.950833815720083e-06, "loss": 0.4003, "step": 2231 }, { "epoch": 1.1080589111368526, "grad_norm": 0.3962722718715668, "learning_rate": 7.948501017578728e-06, "loss": 0.3861, "step": 2232 }, { "epoch": 1.1085553533013404, "grad_norm": 0.419222354888916, "learning_rate": 7.946167235013023e-06, "loss": 0.3936, "step": 2233 }, { "epoch": 1.1090517954658283, "grad_norm": 0.3879736065864563, "learning_rate": 7.94383246880215e-06, "loss": 0.4141, "step": 2234 }, { "epoch": 1.109548237630316, "grad_norm": 0.37481868267059326, "learning_rate": 7.941496719725622e-06, "loss": 0.3726, "step": 2235 }, { "epoch": 1.1100446797948038, "grad_norm": 0.37485960125923157, "learning_rate": 7.939159988563286e-06, "loss": 0.4065, "step": 2236 }, { "epoch": 1.1105411219592918, "grad_norm": 0.3654593825340271, "learning_rate": 7.936822276095312e-06, "loss": 0.3743, "step": 2237 }, { "epoch": 1.1110375641237795, "grad_norm": 0.3628568947315216, "learning_rate": 7.934483583102197e-06, "loss": 0.3809, "step": 2238 }, { "epoch": 1.1115340062882675, "grad_norm": 0.40710678696632385, "learning_rate": 7.932143910364771e-06, "loss": 0.3738, "step": 2239 }, { "epoch": 1.1120304484527552, "grad_norm": 0.35545194149017334, "learning_rate": 7.929803258664182e-06, "loss": 0.3752, "step": 2240 }, { "epoch": 1.1125268906172432, "grad_norm": 0.3857872188091278, "learning_rate": 7.927461628781915e-06, "loss": 0.3957, "step": 2241 }, { "epoch": 1.113023332781731, "grad_norm": 0.3718709647655487, "learning_rate": 7.925119021499771e-06, "loss": 0.394, "step": 2242 }, { "epoch": 1.1135197749462187, "grad_norm": 0.36279430985450745, "learning_rate": 7.92277543759989e-06, "loss": 0.3883, "step": 2243 }, { "epoch": 1.1140162171107066, "grad_norm": 0.3545992970466614, "learning_rate": 7.920430877864725e-06, "loss": 0.4275, "step": 2244 }, { "epoch": 1.1145126592751944, "grad_norm": 0.38852325081825256, "learning_rate": 7.918085343077062e-06, "loss": 0.4083, "step": 2245 }, { "epoch": 1.1150091014396823, "grad_norm": 0.45527583360671997, "learning_rate": 7.915738834020014e-06, "loss": 0.3931, "step": 2246 }, { "epoch": 1.11550554360417, "grad_norm": 0.3864978551864624, "learning_rate": 7.913391351477013e-06, "loss": 0.3983, "step": 2247 }, { "epoch": 1.116001985768658, "grad_norm": 0.3790328800678253, "learning_rate": 7.911042896231822e-06, "loss": 0.3732, "step": 2248 }, { "epoch": 1.1164984279331458, "grad_norm": 0.4260328710079193, "learning_rate": 7.908693469068525e-06, "loss": 0.3368, "step": 2249 }, { "epoch": 1.1169948700976335, "grad_norm": 0.3786206841468811, "learning_rate": 7.906343070771534e-06, "loss": 0.3933, "step": 2250 }, { "epoch": 1.1174913122621215, "grad_norm": 0.3544807732105255, "learning_rate": 7.903991702125583e-06, "loss": 0.3637, "step": 2251 }, { "epoch": 1.1179877544266092, "grad_norm": 0.42958778142929077, "learning_rate": 7.901639363915724e-06, "loss": 0.437, "step": 2252 }, { "epoch": 1.1184841965910972, "grad_norm": 0.3642534613609314, "learning_rate": 7.899286056927347e-06, "loss": 0.3843, "step": 2253 }, { "epoch": 1.118980638755585, "grad_norm": 0.3488350510597229, "learning_rate": 7.896931781946153e-06, "loss": 0.3694, "step": 2254 }, { "epoch": 1.1194770809200727, "grad_norm": 0.39769092202186584, "learning_rate": 7.894576539758173e-06, "loss": 0.3911, "step": 2255 }, { "epoch": 1.1199735230845607, "grad_norm": 0.3952499032020569, "learning_rate": 7.892220331149753e-06, "loss": 0.3775, "step": 2256 }, { "epoch": 1.1204699652490484, "grad_norm": 0.377708375453949, "learning_rate": 7.889863156907574e-06, "loss": 0.384, "step": 2257 }, { "epoch": 1.1209664074135364, "grad_norm": 0.3656936287879944, "learning_rate": 7.887505017818626e-06, "loss": 0.3907, "step": 2258 }, { "epoch": 1.1214628495780241, "grad_norm": 0.43705278635025024, "learning_rate": 7.885145914670234e-06, "loss": 0.4717, "step": 2259 }, { "epoch": 1.121959291742512, "grad_norm": 0.3498784005641937, "learning_rate": 7.882785848250033e-06, "loss": 0.3466, "step": 2260 }, { "epoch": 1.1224557339069998, "grad_norm": 0.3462260663509369, "learning_rate": 7.880424819345987e-06, "loss": 0.4119, "step": 2261 }, { "epoch": 1.1229521760714878, "grad_norm": 0.3705548942089081, "learning_rate": 7.87806282874638e-06, "loss": 0.3724, "step": 2262 }, { "epoch": 1.1234486182359755, "grad_norm": 0.4021374583244324, "learning_rate": 7.875699877239815e-06, "loss": 0.3801, "step": 2263 }, { "epoch": 1.1239450604004633, "grad_norm": 0.37694790959358215, "learning_rate": 7.873335965615219e-06, "loss": 0.3611, "step": 2264 }, { "epoch": 1.1244415025649512, "grad_norm": 0.43388545513153076, "learning_rate": 7.870971094661836e-06, "loss": 0.4231, "step": 2265 }, { "epoch": 1.124937944729439, "grad_norm": 0.39605483412742615, "learning_rate": 7.868605265169236e-06, "loss": 0.3993, "step": 2266 }, { "epoch": 1.125434386893927, "grad_norm": 0.3873456120491028, "learning_rate": 7.8662384779273e-06, "loss": 0.3818, "step": 2267 }, { "epoch": 1.1259308290584147, "grad_norm": 0.33426907658576965, "learning_rate": 7.863870733726237e-06, "loss": 0.3879, "step": 2268 }, { "epoch": 1.1264272712229024, "grad_norm": 0.38412952423095703, "learning_rate": 7.861502033356572e-06, "loss": 0.426, "step": 2269 }, { "epoch": 1.1269237133873904, "grad_norm": 0.40373706817626953, "learning_rate": 7.859132377609146e-06, "loss": 0.3948, "step": 2270 }, { "epoch": 1.1274201555518781, "grad_norm": 0.3442263901233673, "learning_rate": 7.85676176727513e-06, "loss": 0.3532, "step": 2271 }, { "epoch": 1.127916597716366, "grad_norm": 0.3368537127971649, "learning_rate": 7.854390203146e-06, "loss": 0.3519, "step": 2272 }, { "epoch": 1.1284130398808538, "grad_norm": 0.3920102119445801, "learning_rate": 7.852017686013561e-06, "loss": 0.3871, "step": 2273 }, { "epoch": 1.1289094820453418, "grad_norm": 0.3676077425479889, "learning_rate": 7.849644216669929e-06, "loss": 0.3868, "step": 2274 }, { "epoch": 1.1294059242098295, "grad_norm": 0.33921343088150024, "learning_rate": 7.847269795907543e-06, "loss": 0.341, "step": 2275 }, { "epoch": 1.1299023663743175, "grad_norm": 0.3761577010154724, "learning_rate": 7.844894424519156e-06, "loss": 0.3942, "step": 2276 }, { "epoch": 1.1303988085388053, "grad_norm": 0.37298423051834106, "learning_rate": 7.842518103297842e-06, "loss": 0.4197, "step": 2277 }, { "epoch": 1.130895250703293, "grad_norm": 0.30817145109176636, "learning_rate": 7.840140833036987e-06, "loss": 0.3107, "step": 2278 }, { "epoch": 1.131391692867781, "grad_norm": 0.38474056124687195, "learning_rate": 7.8377626145303e-06, "loss": 0.3969, "step": 2279 }, { "epoch": 1.1318881350322687, "grad_norm": 0.3396674394607544, "learning_rate": 7.835383448571801e-06, "loss": 0.3679, "step": 2280 }, { "epoch": 1.1323845771967567, "grad_norm": 0.3297513723373413, "learning_rate": 7.83300333595583e-06, "loss": 0.3594, "step": 2281 }, { "epoch": 1.1328810193612444, "grad_norm": 0.35833653807640076, "learning_rate": 7.830622277477042e-06, "loss": 0.3995, "step": 2282 }, { "epoch": 1.1333774615257322, "grad_norm": 0.3782055377960205, "learning_rate": 7.828240273930408e-06, "loss": 0.362, "step": 2283 }, { "epoch": 1.1338739036902201, "grad_norm": 0.34893086552619934, "learning_rate": 7.825857326111213e-06, "loss": 0.4019, "step": 2284 }, { "epoch": 1.1343703458547079, "grad_norm": 0.3349522650241852, "learning_rate": 7.82347343481506e-06, "loss": 0.3504, "step": 2285 }, { "epoch": 1.1348667880191958, "grad_norm": 0.3864908814430237, "learning_rate": 7.821088600837865e-06, "loss": 0.3876, "step": 2286 }, { "epoch": 1.1353632301836836, "grad_norm": 0.34145623445510864, "learning_rate": 7.81870282497586e-06, "loss": 0.3335, "step": 2287 }, { "epoch": 1.1358596723481715, "grad_norm": 0.39876648783683777, "learning_rate": 7.816316108025588e-06, "loss": 0.4333, "step": 2288 }, { "epoch": 1.1363561145126593, "grad_norm": 0.34747788310050964, "learning_rate": 7.81392845078391e-06, "loss": 0.3258, "step": 2289 }, { "epoch": 1.136852556677147, "grad_norm": 0.3816116750240326, "learning_rate": 7.811539854048003e-06, "loss": 0.4014, "step": 2290 }, { "epoch": 1.137348998841635, "grad_norm": 0.36201080679893494, "learning_rate": 7.809150318615351e-06, "loss": 0.3529, "step": 2291 }, { "epoch": 1.1378454410061227, "grad_norm": 0.377488374710083, "learning_rate": 7.806759845283755e-06, "loss": 0.414, "step": 2292 }, { "epoch": 1.1383418831706107, "grad_norm": 0.3546985983848572, "learning_rate": 7.804368434851333e-06, "loss": 0.3635, "step": 2293 }, { "epoch": 1.1388383253350984, "grad_norm": 0.3781360387802124, "learning_rate": 7.801976088116507e-06, "loss": 0.4038, "step": 2294 }, { "epoch": 1.1393347674995864, "grad_norm": 0.3553997576236725, "learning_rate": 7.799582805878022e-06, "loss": 0.3604, "step": 2295 }, { "epoch": 1.1398312096640741, "grad_norm": 0.33245354890823364, "learning_rate": 7.797188588934921e-06, "loss": 0.3595, "step": 2296 }, { "epoch": 1.1403276518285619, "grad_norm": 0.3819256126880646, "learning_rate": 7.794793438086578e-06, "loss": 0.3717, "step": 2297 }, { "epoch": 1.1408240939930498, "grad_norm": 0.36895596981048584, "learning_rate": 7.792397354132661e-06, "loss": 0.3605, "step": 2298 }, { "epoch": 1.1413205361575376, "grad_norm": 0.36799997091293335, "learning_rate": 7.790000337873162e-06, "loss": 0.3815, "step": 2299 }, { "epoch": 1.1418169783220256, "grad_norm": 0.34998974204063416, "learning_rate": 7.78760239010838e-06, "loss": 0.4016, "step": 2300 }, { "epoch": 1.1423134204865133, "grad_norm": 0.34225279092788696, "learning_rate": 7.78520351163892e-06, "loss": 0.3799, "step": 2301 }, { "epoch": 1.142809862651001, "grad_norm": 0.39592352509498596, "learning_rate": 7.782803703265707e-06, "loss": 0.3881, "step": 2302 }, { "epoch": 1.143306304815489, "grad_norm": 0.3665475845336914, "learning_rate": 7.780402965789968e-06, "loss": 0.4142, "step": 2303 }, { "epoch": 1.1438027469799767, "grad_norm": 0.3591741621494293, "learning_rate": 7.778001300013248e-06, "loss": 0.4156, "step": 2304 }, { "epoch": 1.1442991891444647, "grad_norm": 0.3615165650844574, "learning_rate": 7.775598706737395e-06, "loss": 0.3711, "step": 2305 }, { "epoch": 1.1447956313089525, "grad_norm": 0.36720308661460876, "learning_rate": 7.77319518676457e-06, "loss": 0.4112, "step": 2306 }, { "epoch": 1.1452920734734404, "grad_norm": 0.39532017707824707, "learning_rate": 7.770790740897245e-06, "loss": 0.4056, "step": 2307 }, { "epoch": 1.1457885156379282, "grad_norm": 0.3662032186985016, "learning_rate": 7.768385369938196e-06, "loss": 0.3982, "step": 2308 }, { "epoch": 1.1462849578024161, "grad_norm": 0.3420380651950836, "learning_rate": 7.765979074690512e-06, "loss": 0.3457, "step": 2309 }, { "epoch": 1.1467813999669039, "grad_norm": 0.4002131223678589, "learning_rate": 7.763571855957592e-06, "loss": 0.3909, "step": 2310 }, { "epoch": 1.1472778421313916, "grad_norm": 0.37023571133613586, "learning_rate": 7.761163714543137e-06, "loss": 0.3935, "step": 2311 }, { "epoch": 1.1477742842958796, "grad_norm": 0.4030362367630005, "learning_rate": 7.758754651251163e-06, "loss": 0.4057, "step": 2312 }, { "epoch": 1.1482707264603673, "grad_norm": 0.4093031883239746, "learning_rate": 7.75634466688599e-06, "loss": 0.4244, "step": 2313 }, { "epoch": 1.1487671686248553, "grad_norm": 0.3856121897697449, "learning_rate": 7.753933762252246e-06, "loss": 0.3343, "step": 2314 }, { "epoch": 1.149263610789343, "grad_norm": 0.41769659519195557, "learning_rate": 7.751521938154867e-06, "loss": 0.389, "step": 2315 }, { "epoch": 1.1497600529538308, "grad_norm": 0.42994576692581177, "learning_rate": 7.749109195399093e-06, "loss": 0.4184, "step": 2316 }, { "epoch": 1.1502564951183187, "grad_norm": 0.35675159096717834, "learning_rate": 7.746695534790477e-06, "loss": 0.3502, "step": 2317 }, { "epoch": 1.1507529372828065, "grad_norm": 0.4236018657684326, "learning_rate": 7.744280957134872e-06, "loss": 0.4076, "step": 2318 }, { "epoch": 1.1512493794472944, "grad_norm": 0.36477944254875183, "learning_rate": 7.741865463238442e-06, "loss": 0.3474, "step": 2319 }, { "epoch": 1.1517458216117822, "grad_norm": 0.40266314148902893, "learning_rate": 7.739449053907653e-06, "loss": 0.4043, "step": 2320 }, { "epoch": 1.1522422637762701, "grad_norm": 0.3751056492328644, "learning_rate": 7.737031729949279e-06, "loss": 0.4134, "step": 2321 }, { "epoch": 1.152738705940758, "grad_norm": 0.36430323123931885, "learning_rate": 7.7346134921704e-06, "loss": 0.3692, "step": 2322 }, { "epoch": 1.1532351481052459, "grad_norm": 0.4183594882488251, "learning_rate": 7.732194341378397e-06, "loss": 0.4018, "step": 2323 }, { "epoch": 1.1537315902697336, "grad_norm": 0.33799296617507935, "learning_rate": 7.72977427838096e-06, "loss": 0.355, "step": 2324 }, { "epoch": 1.1542280324342213, "grad_norm": 0.35931336879730225, "learning_rate": 7.727353303986084e-06, "loss": 0.44, "step": 2325 }, { "epoch": 1.1547244745987093, "grad_norm": 0.37745845317840576, "learning_rate": 7.724931419002063e-06, "loss": 0.3589, "step": 2326 }, { "epoch": 1.155220916763197, "grad_norm": 0.3320532739162445, "learning_rate": 7.722508624237503e-06, "loss": 0.3448, "step": 2327 }, { "epoch": 1.155717358927685, "grad_norm": 0.40056419372558594, "learning_rate": 7.720084920501306e-06, "loss": 0.4065, "step": 2328 }, { "epoch": 1.1562138010921728, "grad_norm": 0.38926753401756287, "learning_rate": 7.717660308602681e-06, "loss": 0.3648, "step": 2329 }, { "epoch": 1.1567102432566605, "grad_norm": 0.4133128523826599, "learning_rate": 7.715234789351144e-06, "loss": 0.3785, "step": 2330 }, { "epoch": 1.1572066854211485, "grad_norm": 0.38080787658691406, "learning_rate": 7.712808363556504e-06, "loss": 0.4021, "step": 2331 }, { "epoch": 1.1577031275856362, "grad_norm": 0.3328632414340973, "learning_rate": 7.710381032028882e-06, "loss": 0.3476, "step": 2332 }, { "epoch": 1.1581995697501242, "grad_norm": 0.39091238379478455, "learning_rate": 7.707952795578698e-06, "loss": 0.3632, "step": 2333 }, { "epoch": 1.158696011914612, "grad_norm": 0.33785536885261536, "learning_rate": 7.705523655016674e-06, "loss": 0.3661, "step": 2334 }, { "epoch": 1.1591924540790999, "grad_norm": 0.32430481910705566, "learning_rate": 7.703093611153833e-06, "loss": 0.3606, "step": 2335 }, { "epoch": 1.1596888962435876, "grad_norm": 0.38205382227897644, "learning_rate": 7.700662664801501e-06, "loss": 0.3697, "step": 2336 }, { "epoch": 1.1601853384080756, "grad_norm": 0.4091084897518158, "learning_rate": 7.698230816771307e-06, "loss": 0.3714, "step": 2337 }, { "epoch": 1.1606817805725633, "grad_norm": 0.3793846666812897, "learning_rate": 7.695798067875174e-06, "loss": 0.3987, "step": 2338 }, { "epoch": 1.161178222737051, "grad_norm": 0.43213868141174316, "learning_rate": 7.693364418925335e-06, "loss": 0.3986, "step": 2339 }, { "epoch": 1.161674664901539, "grad_norm": 0.3411490023136139, "learning_rate": 7.690929870734319e-06, "loss": 0.3762, "step": 2340 }, { "epoch": 1.1621711070660268, "grad_norm": 0.3947332203388214, "learning_rate": 7.688494424114954e-06, "loss": 0.4247, "step": 2341 }, { "epoch": 1.1626675492305147, "grad_norm": 0.38037529587745667, "learning_rate": 7.686058079880371e-06, "loss": 0.3931, "step": 2342 }, { "epoch": 1.1631639913950025, "grad_norm": 0.3837208151817322, "learning_rate": 7.683620838843997e-06, "loss": 0.3789, "step": 2343 }, { "epoch": 1.1636604335594902, "grad_norm": 0.37019309401512146, "learning_rate": 7.681182701819563e-06, "loss": 0.4409, "step": 2344 }, { "epoch": 1.1641568757239782, "grad_norm": 0.3438277542591095, "learning_rate": 7.678743669621094e-06, "loss": 0.3751, "step": 2345 }, { "epoch": 1.164653317888466, "grad_norm": 0.380615770816803, "learning_rate": 7.676303743062917e-06, "loss": 0.405, "step": 2346 }, { "epoch": 1.165149760052954, "grad_norm": 0.3419630229473114, "learning_rate": 7.67386292295966e-06, "loss": 0.3344, "step": 2347 }, { "epoch": 1.1656462022174416, "grad_norm": 0.37234437465667725, "learning_rate": 7.671421210126245e-06, "loss": 0.3813, "step": 2348 }, { "epoch": 1.1661426443819294, "grad_norm": 0.3723068833351135, "learning_rate": 7.668978605377892e-06, "loss": 0.3829, "step": 2349 }, { "epoch": 1.1666390865464173, "grad_norm": 0.39751026034355164, "learning_rate": 7.666535109530121e-06, "loss": 0.3784, "step": 2350 }, { "epoch": 1.167135528710905, "grad_norm": 0.3648802936077118, "learning_rate": 7.66409072339875e-06, "loss": 0.4222, "step": 2351 }, { "epoch": 1.167631970875393, "grad_norm": 0.384642630815506, "learning_rate": 7.661645447799893e-06, "loss": 0.3559, "step": 2352 }, { "epoch": 1.1681284130398808, "grad_norm": 0.39594128727912903, "learning_rate": 7.65919928354996e-06, "loss": 0.3861, "step": 2353 }, { "epoch": 1.1686248552043688, "grad_norm": 0.360088586807251, "learning_rate": 7.656752231465659e-06, "loss": 0.4008, "step": 2354 }, { "epoch": 1.1691212973688565, "grad_norm": 0.35566481947898865, "learning_rate": 7.654304292363993e-06, "loss": 0.3947, "step": 2355 }, { "epoch": 1.1696177395333445, "grad_norm": 0.37017086148262024, "learning_rate": 7.651855467062265e-06, "loss": 0.3597, "step": 2356 }, { "epoch": 1.1701141816978322, "grad_norm": 0.3733358681201935, "learning_rate": 7.649405756378072e-06, "loss": 0.4263, "step": 2357 }, { "epoch": 1.17061062386232, "grad_norm": 0.34745538234710693, "learning_rate": 7.646955161129302e-06, "loss": 0.3775, "step": 2358 }, { "epoch": 1.171107066026808, "grad_norm": 0.3350565433502197, "learning_rate": 7.644503682134143e-06, "loss": 0.3671, "step": 2359 }, { "epoch": 1.1716035081912957, "grad_norm": 0.3362716734409332, "learning_rate": 7.642051320211082e-06, "loss": 0.395, "step": 2360 }, { "epoch": 1.1720999503557836, "grad_norm": 0.3330802619457245, "learning_rate": 7.639598076178887e-06, "loss": 0.3769, "step": 2361 }, { "epoch": 1.1725963925202714, "grad_norm": 0.3890863358974457, "learning_rate": 7.637143950856638e-06, "loss": 0.4084, "step": 2362 }, { "epoch": 1.1730928346847591, "grad_norm": 0.3432477116584778, "learning_rate": 7.634688945063696e-06, "loss": 0.3451, "step": 2363 }, { "epoch": 1.173589276849247, "grad_norm": 0.3857172429561615, "learning_rate": 7.632233059619723e-06, "loss": 0.383, "step": 2364 }, { "epoch": 1.1740857190137348, "grad_norm": 0.38317257165908813, "learning_rate": 7.629776295344672e-06, "loss": 0.4354, "step": 2365 }, { "epoch": 1.1745821611782228, "grad_norm": 0.36577296257019043, "learning_rate": 7.627318653058789e-06, "loss": 0.3851, "step": 2366 }, { "epoch": 1.1750786033427105, "grad_norm": 0.3812764585018158, "learning_rate": 7.624860133582612e-06, "loss": 0.4144, "step": 2367 }, { "epoch": 1.1755750455071985, "grad_norm": 0.3619619905948639, "learning_rate": 7.622400737736978e-06, "loss": 0.3892, "step": 2368 }, { "epoch": 1.1760714876716862, "grad_norm": 0.3692360520362854, "learning_rate": 7.61994046634301e-06, "loss": 0.3512, "step": 2369 }, { "epoch": 1.1765679298361742, "grad_norm": 0.38592424988746643, "learning_rate": 7.6174793202221275e-06, "loss": 0.4203, "step": 2370 }, { "epoch": 1.177064372000662, "grad_norm": 0.34029290080070496, "learning_rate": 7.615017300196038e-06, "loss": 0.3855, "step": 2371 }, { "epoch": 1.1775608141651497, "grad_norm": 0.38000577688217163, "learning_rate": 7.6125544070867456e-06, "loss": 0.3821, "step": 2372 }, { "epoch": 1.1780572563296376, "grad_norm": 0.3670065402984619, "learning_rate": 7.610090641716541e-06, "loss": 0.3847, "step": 2373 }, { "epoch": 1.1785536984941254, "grad_norm": 0.36830514669418335, "learning_rate": 7.607626004908009e-06, "loss": 0.4035, "step": 2374 }, { "epoch": 1.1790501406586134, "grad_norm": 0.365957647562027, "learning_rate": 7.605160497484027e-06, "loss": 0.381, "step": 2375 }, { "epoch": 1.179546582823101, "grad_norm": 0.3736713230609894, "learning_rate": 7.602694120267757e-06, "loss": 0.3504, "step": 2376 }, { "epoch": 1.1800430249875888, "grad_norm": 0.37036436796188354, "learning_rate": 7.600226874082659e-06, "loss": 0.3682, "step": 2377 }, { "epoch": 1.1805394671520768, "grad_norm": 0.3463248312473297, "learning_rate": 7.597758759752476e-06, "loss": 0.3692, "step": 2378 }, { "epoch": 1.1810359093165645, "grad_norm": 0.3683193624019623, "learning_rate": 7.595289778101249e-06, "loss": 0.4263, "step": 2379 }, { "epoch": 1.1815323514810525, "grad_norm": 0.37507516145706177, "learning_rate": 7.592819929953299e-06, "loss": 0.4233, "step": 2380 }, { "epoch": 1.1820287936455403, "grad_norm": 0.36549416184425354, "learning_rate": 7.590349216133245e-06, "loss": 0.3562, "step": 2381 }, { "epoch": 1.1825252358100282, "grad_norm": 0.367531955242157, "learning_rate": 7.587877637465989e-06, "loss": 0.3713, "step": 2382 }, { "epoch": 1.183021677974516, "grad_norm": 0.3687273859977722, "learning_rate": 7.5854051947767235e-06, "loss": 0.3662, "step": 2383 }, { "epoch": 1.183518120139004, "grad_norm": 0.3532942235469818, "learning_rate": 7.582931888890933e-06, "loss": 0.3436, "step": 2384 }, { "epoch": 1.1840145623034917, "grad_norm": 0.3764963448047638, "learning_rate": 7.580457720634383e-06, "loss": 0.4212, "step": 2385 }, { "epoch": 1.1845110044679794, "grad_norm": 0.34810343384742737, "learning_rate": 7.577982690833135e-06, "loss": 0.3884, "step": 2386 }, { "epoch": 1.1850074466324674, "grad_norm": 0.3603106737136841, "learning_rate": 7.575506800313529e-06, "loss": 0.3679, "step": 2387 }, { "epoch": 1.1855038887969551, "grad_norm": 0.37004730105400085, "learning_rate": 7.573030049902204e-06, "loss": 0.3798, "step": 2388 }, { "epoch": 1.186000330961443, "grad_norm": 0.3684477210044861, "learning_rate": 7.570552440426075e-06, "loss": 0.3666, "step": 2389 }, { "epoch": 1.1864967731259308, "grad_norm": 0.3422689139842987, "learning_rate": 7.56807397271235e-06, "loss": 0.3716, "step": 2390 }, { "epoch": 1.1869932152904186, "grad_norm": 0.3801972270011902, "learning_rate": 7.565594647588521e-06, "loss": 0.4116, "step": 2391 }, { "epoch": 1.1874896574549065, "grad_norm": 0.31935322284698486, "learning_rate": 7.563114465882369e-06, "loss": 0.3326, "step": 2392 }, { "epoch": 1.1879860996193943, "grad_norm": 0.3922822177410126, "learning_rate": 7.5606334284219586e-06, "loss": 0.3816, "step": 2393 }, { "epoch": 1.1884825417838822, "grad_norm": 0.3536842167377472, "learning_rate": 7.558151536035641e-06, "loss": 0.4041, "step": 2394 }, { "epoch": 1.18897898394837, "grad_norm": 0.3594890832901001, "learning_rate": 7.555668789552051e-06, "loss": 0.3423, "step": 2395 }, { "epoch": 1.189475426112858, "grad_norm": 0.41851457953453064, "learning_rate": 7.553185189800112e-06, "loss": 0.3924, "step": 2396 }, { "epoch": 1.1899718682773457, "grad_norm": 0.3541496992111206, "learning_rate": 7.550700737609031e-06, "loss": 0.362, "step": 2397 }, { "epoch": 1.1904683104418334, "grad_norm": 0.3823615610599518, "learning_rate": 7.548215433808297e-06, "loss": 0.3819, "step": 2398 }, { "epoch": 1.1909647526063214, "grad_norm": 0.42289066314697266, "learning_rate": 7.545729279227687e-06, "loss": 0.4111, "step": 2399 }, { "epoch": 1.1914611947708091, "grad_norm": 0.40491873025894165, "learning_rate": 7.543242274697258e-06, "loss": 0.3795, "step": 2400 }, { "epoch": 1.191957636935297, "grad_norm": 0.3981381058692932, "learning_rate": 7.540754421047356e-06, "loss": 0.4503, "step": 2401 }, { "epoch": 1.1924540790997848, "grad_norm": 0.3602580428123474, "learning_rate": 7.538265719108606e-06, "loss": 0.354, "step": 2402 }, { "epoch": 1.1929505212642728, "grad_norm": 0.3767736256122589, "learning_rate": 7.5357761697119195e-06, "loss": 0.4413, "step": 2403 }, { "epoch": 1.1934469634287606, "grad_norm": 0.34666746854782104, "learning_rate": 7.533285773688488e-06, "loss": 0.3629, "step": 2404 }, { "epoch": 1.1939434055932483, "grad_norm": 0.36138972640037537, "learning_rate": 7.53079453186979e-06, "loss": 0.3772, "step": 2405 }, { "epoch": 1.1944398477577363, "grad_norm": 0.37020760774612427, "learning_rate": 7.528302445087577e-06, "loss": 0.4177, "step": 2406 }, { "epoch": 1.194936289922224, "grad_norm": 0.34318315982818604, "learning_rate": 7.525809514173896e-06, "loss": 0.4052, "step": 2407 }, { "epoch": 1.195432732086712, "grad_norm": 0.36493951082229614, "learning_rate": 7.523315739961065e-06, "loss": 0.3692, "step": 2408 }, { "epoch": 1.1959291742511997, "grad_norm": 0.3544046878814697, "learning_rate": 7.5208211232816864e-06, "loss": 0.3717, "step": 2409 }, { "epoch": 1.1964256164156875, "grad_norm": 0.3704414367675781, "learning_rate": 7.518325664968649e-06, "loss": 0.3758, "step": 2410 }, { "epoch": 1.1969220585801754, "grad_norm": 0.3511749505996704, "learning_rate": 7.515829365855116e-06, "loss": 0.3609, "step": 2411 }, { "epoch": 1.1974185007446632, "grad_norm": 0.3337518572807312, "learning_rate": 7.513332226774535e-06, "loss": 0.3743, "step": 2412 }, { "epoch": 1.1979149429091511, "grad_norm": 0.3673747181892395, "learning_rate": 7.51083424856063e-06, "loss": 0.4024, "step": 2413 }, { "epoch": 1.1984113850736389, "grad_norm": 0.3500286638736725, "learning_rate": 7.508335432047412e-06, "loss": 0.3842, "step": 2414 }, { "epoch": 1.1989078272381268, "grad_norm": 0.3741984963417053, "learning_rate": 7.505835778069166e-06, "loss": 0.3703, "step": 2415 }, { "epoch": 1.1994042694026146, "grad_norm": 0.3812859058380127, "learning_rate": 7.503335287460456e-06, "loss": 0.3937, "step": 2416 }, { "epoch": 1.1999007115671025, "grad_norm": 0.33995604515075684, "learning_rate": 7.500833961056133e-06, "loss": 0.3638, "step": 2417 }, { "epoch": 1.2003971537315903, "grad_norm": 0.3791371285915375, "learning_rate": 7.498331799691318e-06, "loss": 0.4139, "step": 2418 }, { "epoch": 1.200893595896078, "grad_norm": 0.3483600318431854, "learning_rate": 7.495828804201417e-06, "loss": 0.4002, "step": 2419 }, { "epoch": 1.201390038060566, "grad_norm": 0.35837551951408386, "learning_rate": 7.493324975422112e-06, "loss": 0.3835, "step": 2420 }, { "epoch": 1.2018864802250537, "grad_norm": 0.3583897352218628, "learning_rate": 7.4908203141893594e-06, "loss": 0.3769, "step": 2421 }, { "epoch": 1.2023829223895417, "grad_norm": 0.3483582139015198, "learning_rate": 7.488314821339403e-06, "loss": 0.3837, "step": 2422 }, { "epoch": 1.2028793645540294, "grad_norm": 0.3642521798610687, "learning_rate": 7.485808497708757e-06, "loss": 0.4221, "step": 2423 }, { "epoch": 1.2033758067185172, "grad_norm": 0.39742496609687805, "learning_rate": 7.483301344134213e-06, "loss": 0.372, "step": 2424 }, { "epoch": 1.2038722488830051, "grad_norm": 0.37323349714279175, "learning_rate": 7.480793361452842e-06, "loss": 0.3918, "step": 2425 }, { "epoch": 1.204368691047493, "grad_norm": 0.37973666191101074, "learning_rate": 7.478284550501992e-06, "loss": 0.3677, "step": 2426 }, { "epoch": 1.2048651332119809, "grad_norm": 0.3864479959011078, "learning_rate": 7.475774912119287e-06, "loss": 0.3714, "step": 2427 }, { "epoch": 1.2053615753764686, "grad_norm": 0.41959574818611145, "learning_rate": 7.473264447142626e-06, "loss": 0.3803, "step": 2428 }, { "epoch": 1.2058580175409566, "grad_norm": 0.35454845428466797, "learning_rate": 7.470753156410188e-06, "loss": 0.3821, "step": 2429 }, { "epoch": 1.2063544597054443, "grad_norm": 0.3871806561946869, "learning_rate": 7.46824104076042e-06, "loss": 0.3686, "step": 2430 }, { "epoch": 1.2068509018699323, "grad_norm": 0.3692561686038971, "learning_rate": 7.465728101032052e-06, "loss": 0.3736, "step": 2431 }, { "epoch": 1.20734734403442, "grad_norm": 0.375034898519516, "learning_rate": 7.4632143380640875e-06, "loss": 0.3713, "step": 2432 }, { "epoch": 1.2078437861989078, "grad_norm": 0.36960557103157043, "learning_rate": 7.460699752695801e-06, "loss": 0.4096, "step": 2433 }, { "epoch": 1.2083402283633957, "grad_norm": 0.322822630405426, "learning_rate": 7.458184345766744e-06, "loss": 0.3511, "step": 2434 }, { "epoch": 1.2088366705278835, "grad_norm": 0.4019593596458435, "learning_rate": 7.455668118116746e-06, "loss": 0.3905, "step": 2435 }, { "epoch": 1.2093331126923714, "grad_norm": 0.35878774523735046, "learning_rate": 7.453151070585903e-06, "loss": 0.3857, "step": 2436 }, { "epoch": 1.2098295548568592, "grad_norm": 0.3860284984111786, "learning_rate": 7.45063320401459e-06, "loss": 0.3617, "step": 2437 }, { "epoch": 1.210325997021347, "grad_norm": 0.332145094871521, "learning_rate": 7.448114519243456e-06, "loss": 0.3613, "step": 2438 }, { "epoch": 1.2108224391858349, "grad_norm": 0.36751845479011536, "learning_rate": 7.445595017113418e-06, "loss": 0.4121, "step": 2439 }, { "epoch": 1.2113188813503226, "grad_norm": 0.3848131000995636, "learning_rate": 7.4430746984656736e-06, "loss": 0.3558, "step": 2440 }, { "epoch": 1.2118153235148106, "grad_norm": 0.38705557584762573, "learning_rate": 7.440553564141686e-06, "loss": 0.4134, "step": 2441 }, { "epoch": 1.2123117656792983, "grad_norm": 0.3607991933822632, "learning_rate": 7.438031614983195e-06, "loss": 0.3648, "step": 2442 }, { "epoch": 1.2128082078437863, "grad_norm": 0.4231494665145874, "learning_rate": 7.4355088518322076e-06, "loss": 0.4249, "step": 2443 }, { "epoch": 1.213304650008274, "grad_norm": 0.3775213360786438, "learning_rate": 7.432985275531009e-06, "loss": 0.4071, "step": 2444 }, { "epoch": 1.213801092172762, "grad_norm": 0.33799225091934204, "learning_rate": 7.430460886922152e-06, "loss": 0.3346, "step": 2445 }, { "epoch": 1.2142975343372497, "grad_norm": 0.42159897089004517, "learning_rate": 7.427935686848461e-06, "loss": 0.429, "step": 2446 }, { "epoch": 1.2147939765017375, "grad_norm": 0.3602170944213867, "learning_rate": 7.425409676153032e-06, "loss": 0.3622, "step": 2447 }, { "epoch": 1.2152904186662254, "grad_norm": 0.38913971185684204, "learning_rate": 7.42288285567923e-06, "loss": 0.3863, "step": 2448 }, { "epoch": 1.2157868608307132, "grad_norm": 0.3560231029987335, "learning_rate": 7.420355226270693e-06, "loss": 0.4308, "step": 2449 }, { "epoch": 1.2162833029952012, "grad_norm": 0.3491612672805786, "learning_rate": 7.417826788771327e-06, "loss": 0.3615, "step": 2450 }, { "epoch": 1.216779745159689, "grad_norm": 0.4124278724193573, "learning_rate": 7.415297544025311e-06, "loss": 0.3867, "step": 2451 }, { "epoch": 1.2172761873241766, "grad_norm": 0.3439443111419678, "learning_rate": 7.412767492877089e-06, "loss": 0.3582, "step": 2452 }, { "epoch": 1.2177726294886646, "grad_norm": 0.40619540214538574, "learning_rate": 7.410236636171376e-06, "loss": 0.39, "step": 2453 }, { "epoch": 1.2182690716531523, "grad_norm": 0.46365779638290405, "learning_rate": 7.407704974753157e-06, "loss": 0.4063, "step": 2454 }, { "epoch": 1.2187655138176403, "grad_norm": 0.3229363262653351, "learning_rate": 7.405172509467685e-06, "loss": 0.3442, "step": 2455 }, { "epoch": 1.219261955982128, "grad_norm": 0.42341992259025574, "learning_rate": 7.402639241160479e-06, "loss": 0.441, "step": 2456 }, { "epoch": 1.2197583981466158, "grad_norm": 0.36312371492385864, "learning_rate": 7.400105170677333e-06, "loss": 0.4043, "step": 2457 }, { "epoch": 1.2202548403111038, "grad_norm": 0.34582576155662537, "learning_rate": 7.3975702988643e-06, "loss": 0.3568, "step": 2458 }, { "epoch": 1.2207512824755915, "grad_norm": 0.40011659264564514, "learning_rate": 7.395034626567709e-06, "loss": 0.3923, "step": 2459 }, { "epoch": 1.2212477246400795, "grad_norm": 0.3476411998271942, "learning_rate": 7.392498154634147e-06, "loss": 0.423, "step": 2460 }, { "epoch": 1.2217441668045672, "grad_norm": 0.34699004888534546, "learning_rate": 7.3899608839104775e-06, "loss": 0.3941, "step": 2461 }, { "epoch": 1.2222406089690552, "grad_norm": 0.3436979353427887, "learning_rate": 7.3874228152438236e-06, "loss": 0.3846, "step": 2462 }, { "epoch": 1.222737051133543, "grad_norm": 0.3173581063747406, "learning_rate": 7.3848839494815775e-06, "loss": 0.3793, "step": 2463 }, { "epoch": 1.2232334932980309, "grad_norm": 0.34805142879486084, "learning_rate": 7.382344287471398e-06, "loss": 0.3658, "step": 2464 }, { "epoch": 1.2237299354625186, "grad_norm": 0.37244701385498047, "learning_rate": 7.379803830061211e-06, "loss": 0.3972, "step": 2465 }, { "epoch": 1.2242263776270064, "grad_norm": 0.41300928592681885, "learning_rate": 7.377262578099204e-06, "loss": 0.3883, "step": 2466 }, { "epoch": 1.2247228197914943, "grad_norm": 0.38253551721572876, "learning_rate": 7.374720532433832e-06, "loss": 0.3693, "step": 2467 }, { "epoch": 1.225219261955982, "grad_norm": 0.37581202387809753, "learning_rate": 7.372177693913817e-06, "loss": 0.3986, "step": 2468 }, { "epoch": 1.22571570412047, "grad_norm": 0.3841043710708618, "learning_rate": 7.36963406338814e-06, "loss": 0.3914, "step": 2469 }, { "epoch": 1.2262121462849578, "grad_norm": 0.4228055477142334, "learning_rate": 7.3670896417060555e-06, "loss": 0.4178, "step": 2470 }, { "epoch": 1.2267085884494455, "grad_norm": 0.3559260666370392, "learning_rate": 7.364544429717071e-06, "loss": 0.3923, "step": 2471 }, { "epoch": 1.2272050306139335, "grad_norm": 0.3516424894332886, "learning_rate": 7.3619984282709665e-06, "loss": 0.369, "step": 2472 }, { "epoch": 1.2277014727784212, "grad_norm": 0.3569701910018921, "learning_rate": 7.359451638217783e-06, "loss": 0.3652, "step": 2473 }, { "epoch": 1.2281979149429092, "grad_norm": 0.3442641794681549, "learning_rate": 7.356904060407823e-06, "loss": 0.3432, "step": 2474 }, { "epoch": 1.228694357107397, "grad_norm": 0.39451226592063904, "learning_rate": 7.354355695691655e-06, "loss": 0.4251, "step": 2475 }, { "epoch": 1.229190799271885, "grad_norm": 0.3468201458454132, "learning_rate": 7.3518065449201095e-06, "loss": 0.3447, "step": 2476 }, { "epoch": 1.2296872414363726, "grad_norm": 0.34587058424949646, "learning_rate": 7.349256608944275e-06, "loss": 0.4118, "step": 2477 }, { "epoch": 1.2301836836008606, "grad_norm": 0.354756623506546, "learning_rate": 7.346705888615509e-06, "loss": 0.3868, "step": 2478 }, { "epoch": 1.2306801257653484, "grad_norm": 0.353062242269516, "learning_rate": 7.344154384785426e-06, "loss": 0.3551, "step": 2479 }, { "epoch": 1.231176567929836, "grad_norm": 0.32417312264442444, "learning_rate": 7.341602098305904e-06, "loss": 0.3763, "step": 2480 }, { "epoch": 1.231673010094324, "grad_norm": 0.35204648971557617, "learning_rate": 7.339049030029084e-06, "loss": 0.3819, "step": 2481 }, { "epoch": 1.2321694522588118, "grad_norm": 0.36500757932662964, "learning_rate": 7.336495180807364e-06, "loss": 0.3836, "step": 2482 }, { "epoch": 1.2326658944232998, "grad_norm": 0.3524424433708191, "learning_rate": 7.333940551493406e-06, "loss": 0.3841, "step": 2483 }, { "epoch": 1.2331623365877875, "grad_norm": 0.3535081148147583, "learning_rate": 7.331385142940131e-06, "loss": 0.377, "step": 2484 }, { "epoch": 1.2336587787522753, "grad_norm": 0.34394773840904236, "learning_rate": 7.32882895600072e-06, "loss": 0.3773, "step": 2485 }, { "epoch": 1.2341552209167632, "grad_norm": 0.35802820324897766, "learning_rate": 7.326271991528614e-06, "loss": 0.3639, "step": 2486 }, { "epoch": 1.234651663081251, "grad_norm": 0.3362089991569519, "learning_rate": 7.323714250377515e-06, "loss": 0.3544, "step": 2487 }, { "epoch": 1.235148105245739, "grad_norm": 0.39043882489204407, "learning_rate": 7.321155733401382e-06, "loss": 0.3941, "step": 2488 }, { "epoch": 1.2356445474102267, "grad_norm": 0.36904457211494446, "learning_rate": 7.318596441454437e-06, "loss": 0.3723, "step": 2489 }, { "epoch": 1.2361409895747146, "grad_norm": 0.39808133244514465, "learning_rate": 7.316036375391156e-06, "loss": 0.4289, "step": 2490 }, { "epoch": 1.2366374317392024, "grad_norm": 0.3369465172290802, "learning_rate": 7.313475536066275e-06, "loss": 0.3681, "step": 2491 }, { "epoch": 1.2371338739036903, "grad_norm": 0.3577103912830353, "learning_rate": 7.31091392433479e-06, "loss": 0.3684, "step": 2492 }, { "epoch": 1.237630316068178, "grad_norm": 0.3493306338787079, "learning_rate": 7.3083515410519516e-06, "loss": 0.3893, "step": 2493 }, { "epoch": 1.2381267582326658, "grad_norm": 0.3884400725364685, "learning_rate": 7.305788387073272e-06, "loss": 0.4115, "step": 2494 }, { "epoch": 1.2386232003971538, "grad_norm": 0.31669536232948303, "learning_rate": 7.303224463254517e-06, "loss": 0.328, "step": 2495 }, { "epoch": 1.2391196425616415, "grad_norm": 0.36060312390327454, "learning_rate": 7.3006597704517115e-06, "loss": 0.3942, "step": 2496 }, { "epoch": 1.2396160847261295, "grad_norm": 0.36728230118751526, "learning_rate": 7.298094309521138e-06, "loss": 0.3647, "step": 2497 }, { "epoch": 1.2401125268906172, "grad_norm": 0.33900874853134155, "learning_rate": 7.295528081319334e-06, "loss": 0.3605, "step": 2498 }, { "epoch": 1.240608969055105, "grad_norm": 0.3813052177429199, "learning_rate": 7.292961086703091e-06, "loss": 0.4102, "step": 2499 }, { "epoch": 1.241105411219593, "grad_norm": 0.3421436548233032, "learning_rate": 7.290393326529463e-06, "loss": 0.365, "step": 2500 }, { "epoch": 1.2416018533840807, "grad_norm": 0.3716670274734497, "learning_rate": 7.28782480165575e-06, "loss": 0.383, "step": 2501 }, { "epoch": 1.2420982955485687, "grad_norm": 0.34450894594192505, "learning_rate": 7.285255512939516e-06, "loss": 0.3627, "step": 2502 }, { "epoch": 1.2425947377130564, "grad_norm": 0.37222063541412354, "learning_rate": 7.2826854612385756e-06, "loss": 0.4237, "step": 2503 }, { "epoch": 1.2430911798775444, "grad_norm": 0.38670068979263306, "learning_rate": 7.280114647411001e-06, "loss": 0.397, "step": 2504 }, { "epoch": 1.243587622042032, "grad_norm": 0.4209253489971161, "learning_rate": 7.2775430723151155e-06, "loss": 0.3769, "step": 2505 }, { "epoch": 1.2440840642065198, "grad_norm": 0.358425110578537, "learning_rate": 7.274970736809497e-06, "loss": 0.3922, "step": 2506 }, { "epoch": 1.2445805063710078, "grad_norm": 0.3776313066482544, "learning_rate": 7.272397641752982e-06, "loss": 0.3419, "step": 2507 }, { "epoch": 1.2450769485354956, "grad_norm": 0.42311736941337585, "learning_rate": 7.269823788004653e-06, "loss": 0.3835, "step": 2508 }, { "epoch": 1.2455733906999835, "grad_norm": 0.32418110966682434, "learning_rate": 7.267249176423852e-06, "loss": 0.3654, "step": 2509 }, { "epoch": 1.2460698328644713, "grad_norm": 0.3817286491394043, "learning_rate": 7.264673807870172e-06, "loss": 0.395, "step": 2510 }, { "epoch": 1.2465662750289592, "grad_norm": 0.4634905457496643, "learning_rate": 7.262097683203456e-06, "loss": 0.3957, "step": 2511 }, { "epoch": 1.247062717193447, "grad_norm": 0.3676642179489136, "learning_rate": 7.259520803283806e-06, "loss": 0.3528, "step": 2512 }, { "epoch": 1.2475591593579347, "grad_norm": 0.3380596339702606, "learning_rate": 7.2569431689715695e-06, "loss": 0.3851, "step": 2513 }, { "epoch": 1.2480556015224227, "grad_norm": 0.39991676807403564, "learning_rate": 7.25436478112735e-06, "loss": 0.4145, "step": 2514 }, { "epoch": 1.2485520436869104, "grad_norm": 0.3745156526565552, "learning_rate": 7.251785640611999e-06, "loss": 0.3701, "step": 2515 }, { "epoch": 1.2490484858513984, "grad_norm": 0.40856969356536865, "learning_rate": 7.249205748286623e-06, "loss": 0.4565, "step": 2516 }, { "epoch": 1.2495449280158861, "grad_norm": 0.350930392742157, "learning_rate": 7.246625105012579e-06, "loss": 0.3962, "step": 2517 }, { "epoch": 1.2500413701803739, "grad_norm": 0.42147237062454224, "learning_rate": 7.244043711651472e-06, "loss": 0.3922, "step": 2518 }, { "epoch": 1.2505378123448618, "grad_norm": 0.32925838232040405, "learning_rate": 7.241461569065158e-06, "loss": 0.3349, "step": 2519 }, { "epoch": 1.2510342545093498, "grad_norm": 0.33772751688957214, "learning_rate": 7.238878678115746e-06, "loss": 0.3284, "step": 2520 }, { "epoch": 1.2515306966738375, "grad_norm": 0.35214629769325256, "learning_rate": 7.2362950396655925e-06, "loss": 0.3649, "step": 2521 }, { "epoch": 1.2520271388383253, "grad_norm": 0.35546770691871643, "learning_rate": 7.233710654577306e-06, "loss": 0.3843, "step": 2522 }, { "epoch": 1.2525235810028132, "grad_norm": 0.34586113691329956, "learning_rate": 7.231125523713739e-06, "loss": 0.3737, "step": 2523 }, { "epoch": 1.253020023167301, "grad_norm": 0.36793726682662964, "learning_rate": 7.228539647938e-06, "loss": 0.4149, "step": 2524 }, { "epoch": 1.253516465331789, "grad_norm": 0.39770522713661194, "learning_rate": 7.225953028113439e-06, "loss": 0.3943, "step": 2525 }, { "epoch": 1.2540129074962767, "grad_norm": 0.38550809025764465, "learning_rate": 7.223365665103662e-06, "loss": 0.3685, "step": 2526 }, { "epoch": 1.2545093496607644, "grad_norm": 0.3764232397079468, "learning_rate": 7.220777559772515e-06, "loss": 0.3753, "step": 2527 }, { "epoch": 1.2550057918252524, "grad_norm": 0.3758707344532013, "learning_rate": 7.2181887129841e-06, "loss": 0.3572, "step": 2528 }, { "epoch": 1.2555022339897401, "grad_norm": 0.4190254509449005, "learning_rate": 7.215599125602759e-06, "loss": 0.3891, "step": 2529 }, { "epoch": 1.255998676154228, "grad_norm": 0.4134471118450165, "learning_rate": 7.2130087984930885e-06, "loss": 0.3958, "step": 2530 }, { "epoch": 1.2564951183187159, "grad_norm": 0.3635765016078949, "learning_rate": 7.210417732519926e-06, "loss": 0.3705, "step": 2531 }, { "epoch": 1.2569915604832036, "grad_norm": 0.3846115469932556, "learning_rate": 7.207825928548358e-06, "loss": 0.3671, "step": 2532 }, { "epoch": 1.2574880026476916, "grad_norm": 0.399282842874527, "learning_rate": 7.2052333874437175e-06, "loss": 0.3782, "step": 2533 }, { "epoch": 1.2579844448121793, "grad_norm": 0.36274245381355286, "learning_rate": 7.202640110071584e-06, "loss": 0.3663, "step": 2534 }, { "epoch": 1.2584808869766673, "grad_norm": 0.3528279960155487, "learning_rate": 7.200046097297782e-06, "loss": 0.4004, "step": 2535 }, { "epoch": 1.258977329141155, "grad_norm": 0.3765220642089844, "learning_rate": 7.197451349988382e-06, "loss": 0.3927, "step": 2536 }, { "epoch": 1.259473771305643, "grad_norm": 0.37848764657974243, "learning_rate": 7.194855869009701e-06, "loss": 0.411, "step": 2537 }, { "epoch": 1.2599702134701307, "grad_norm": 0.36914145946502686, "learning_rate": 7.192259655228298e-06, "loss": 0.3856, "step": 2538 }, { "epoch": 1.2604666556346187, "grad_norm": 0.37162649631500244, "learning_rate": 7.189662709510977e-06, "loss": 0.4013, "step": 2539 }, { "epoch": 1.2609630977991064, "grad_norm": 0.4177471399307251, "learning_rate": 7.1870650327247895e-06, "loss": 0.3774, "step": 2540 }, { "epoch": 1.2614595399635942, "grad_norm": 0.3825254440307617, "learning_rate": 7.1844666257370296e-06, "loss": 0.4163, "step": 2541 }, { "epoch": 1.2619559821280821, "grad_norm": 0.36959943175315857, "learning_rate": 7.181867489415233e-06, "loss": 0.3671, "step": 2542 }, { "epoch": 1.2624524242925699, "grad_norm": 0.33835193514823914, "learning_rate": 7.179267624627182e-06, "loss": 0.3458, "step": 2543 }, { "epoch": 1.2629488664570578, "grad_norm": 0.3776026964187622, "learning_rate": 7.1766670322409005e-06, "loss": 0.3895, "step": 2544 }, { "epoch": 1.2634453086215456, "grad_norm": 0.3712221682071686, "learning_rate": 7.1740657131246545e-06, "loss": 0.3758, "step": 2545 }, { "epoch": 1.2639417507860333, "grad_norm": 0.3813170790672302, "learning_rate": 7.171463668146957e-06, "loss": 0.3964, "step": 2546 }, { "epoch": 1.2644381929505213, "grad_norm": 0.31672054529190063, "learning_rate": 7.168860898176555e-06, "loss": 0.3555, "step": 2547 }, { "epoch": 1.264934635115009, "grad_norm": 0.3845021724700928, "learning_rate": 7.166257404082446e-06, "loss": 0.3824, "step": 2548 }, { "epoch": 1.265431077279497, "grad_norm": 0.3594551682472229, "learning_rate": 7.163653186733867e-06, "loss": 0.3965, "step": 2549 }, { "epoch": 1.2659275194439847, "grad_norm": 0.3650512099266052, "learning_rate": 7.161048247000292e-06, "loss": 0.3696, "step": 2550 }, { "epoch": 1.2664239616084725, "grad_norm": 0.36738309264183044, "learning_rate": 7.158442585751442e-06, "loss": 0.4157, "step": 2551 }, { "epoch": 1.2669204037729604, "grad_norm": 0.3798584043979645, "learning_rate": 7.155836203857276e-06, "loss": 0.3929, "step": 2552 }, { "epoch": 1.2674168459374484, "grad_norm": 0.31588494777679443, "learning_rate": 7.153229102187994e-06, "loss": 0.323, "step": 2553 }, { "epoch": 1.2679132881019362, "grad_norm": 0.39771413803100586, "learning_rate": 7.150621281614036e-06, "loss": 0.3943, "step": 2554 }, { "epoch": 1.268409730266424, "grad_norm": 0.37603795528411865, "learning_rate": 7.148012743006083e-06, "loss": 0.4335, "step": 2555 }, { "epoch": 1.2689061724309119, "grad_norm": 0.3256159722805023, "learning_rate": 7.145403487235057e-06, "loss": 0.3572, "step": 2556 }, { "epoch": 1.2694026145953996, "grad_norm": 0.32950761914253235, "learning_rate": 7.142793515172112e-06, "loss": 0.3316, "step": 2557 }, { "epoch": 1.2698990567598876, "grad_norm": 0.3933122456073761, "learning_rate": 7.140182827688651e-06, "loss": 0.3861, "step": 2558 }, { "epoch": 1.2703954989243753, "grad_norm": 0.35590946674346924, "learning_rate": 7.137571425656311e-06, "loss": 0.394, "step": 2559 }, { "epoch": 1.270891941088863, "grad_norm": 0.3591591417789459, "learning_rate": 7.1349593099469676e-06, "loss": 0.335, "step": 2560 }, { "epoch": 1.271388383253351, "grad_norm": 0.3687058091163635, "learning_rate": 7.132346481432737e-06, "loss": 0.3618, "step": 2561 }, { "epoch": 1.2718848254178388, "grad_norm": 0.3511347472667694, "learning_rate": 7.129732940985969e-06, "loss": 0.3846, "step": 2562 }, { "epoch": 1.2723812675823267, "grad_norm": 0.3606283962726593, "learning_rate": 7.127118689479256e-06, "loss": 0.3957, "step": 2563 }, { "epoch": 1.2728777097468145, "grad_norm": 0.3577076196670532, "learning_rate": 7.124503727785424e-06, "loss": 0.3808, "step": 2564 }, { "epoch": 1.2733741519113022, "grad_norm": 0.3880377411842346, "learning_rate": 7.121888056777538e-06, "loss": 0.4193, "step": 2565 }, { "epoch": 1.2738705940757902, "grad_norm": 0.31768131256103516, "learning_rate": 7.1192716773289e-06, "loss": 0.3305, "step": 2566 }, { "epoch": 1.2743670362402781, "grad_norm": 0.3959648013114929, "learning_rate": 7.116654590313045e-06, "loss": 0.3946, "step": 2567 }, { "epoch": 1.2748634784047659, "grad_norm": 0.344655841588974, "learning_rate": 7.114036796603752e-06, "loss": 0.3626, "step": 2568 }, { "epoch": 1.2753599205692536, "grad_norm": 0.40986204147338867, "learning_rate": 7.11141829707503e-06, "loss": 0.4161, "step": 2569 }, { "epoch": 1.2758563627337416, "grad_norm": 0.35796013474464417, "learning_rate": 7.108799092601122e-06, "loss": 0.3724, "step": 2570 }, { "epoch": 1.2763528048982293, "grad_norm": 0.3515617847442627, "learning_rate": 7.106179184056512e-06, "loss": 0.372, "step": 2571 }, { "epoch": 1.2768492470627173, "grad_norm": 0.38866126537323, "learning_rate": 7.103558572315914e-06, "loss": 0.4052, "step": 2572 }, { "epoch": 1.277345689227205, "grad_norm": 0.34905919432640076, "learning_rate": 7.100937258254281e-06, "loss": 0.3405, "step": 2573 }, { "epoch": 1.2778421313916928, "grad_norm": 0.3801780641078949, "learning_rate": 7.098315242746797e-06, "loss": 0.3981, "step": 2574 }, { "epoch": 1.2783385735561807, "grad_norm": 0.3644266426563263, "learning_rate": 7.095692526668882e-06, "loss": 0.353, "step": 2575 }, { "epoch": 1.2788350157206685, "grad_norm": 0.3716840147972107, "learning_rate": 7.093069110896194e-06, "loss": 0.3712, "step": 2576 }, { "epoch": 1.2793314578851565, "grad_norm": 0.38362205028533936, "learning_rate": 7.090444996304613e-06, "loss": 0.4022, "step": 2577 }, { "epoch": 1.2798279000496442, "grad_norm": 0.3662378191947937, "learning_rate": 7.087820183770264e-06, "loss": 0.3808, "step": 2578 }, { "epoch": 1.280324342214132, "grad_norm": 0.39557138085365295, "learning_rate": 7.0851946741694975e-06, "loss": 0.3597, "step": 2579 }, { "epoch": 1.28082078437862, "grad_norm": 0.35370150208473206, "learning_rate": 7.082568468378905e-06, "loss": 0.3671, "step": 2580 }, { "epoch": 1.2813172265431076, "grad_norm": 0.3880578875541687, "learning_rate": 7.079941567275299e-06, "loss": 0.4124, "step": 2581 }, { "epoch": 1.2818136687075956, "grad_norm": 0.3776909410953522, "learning_rate": 7.077313971735735e-06, "loss": 0.3753, "step": 2582 }, { "epoch": 1.2823101108720834, "grad_norm": 0.30351999402046204, "learning_rate": 7.074685682637493e-06, "loss": 0.3486, "step": 2583 }, { "epoch": 1.2828065530365713, "grad_norm": 0.36163565516471863, "learning_rate": 7.07205670085809e-06, "loss": 0.4096, "step": 2584 }, { "epoch": 1.283302995201059, "grad_norm": 0.3534005880355835, "learning_rate": 7.069427027275268e-06, "loss": 0.3266, "step": 2585 }, { "epoch": 1.283799437365547, "grad_norm": 0.38216879963874817, "learning_rate": 7.0667966627670085e-06, "loss": 0.4173, "step": 2586 }, { "epoch": 1.2842958795300348, "grad_norm": 0.3682778775691986, "learning_rate": 7.064165608211513e-06, "loss": 0.3759, "step": 2587 }, { "epoch": 1.2847923216945225, "grad_norm": 0.32738813757896423, "learning_rate": 7.061533864487222e-06, "loss": 0.3629, "step": 2588 }, { "epoch": 1.2852887638590105, "grad_norm": 0.38609933853149414, "learning_rate": 7.058901432472805e-06, "loss": 0.3935, "step": 2589 }, { "epoch": 1.2857852060234982, "grad_norm": 0.35241425037384033, "learning_rate": 7.056268313047155e-06, "loss": 0.3547, "step": 2590 }, { "epoch": 1.2862816481879862, "grad_norm": 0.3727618157863617, "learning_rate": 7.053634507089402e-06, "loss": 0.3879, "step": 2591 }, { "epoch": 1.286778090352474, "grad_norm": 0.328599214553833, "learning_rate": 7.051000015478903e-06, "loss": 0.3584, "step": 2592 }, { "epoch": 1.2872745325169617, "grad_norm": 0.35783520340919495, "learning_rate": 7.048364839095242e-06, "loss": 0.3679, "step": 2593 }, { "epoch": 1.2877709746814496, "grad_norm": 0.3577946722507477, "learning_rate": 7.045728978818231e-06, "loss": 0.4031, "step": 2594 }, { "epoch": 1.2882674168459374, "grad_norm": 0.37786853313446045, "learning_rate": 7.043092435527916e-06, "loss": 0.4108, "step": 2595 }, { "epoch": 1.2887638590104253, "grad_norm": 0.3325015604496002, "learning_rate": 7.040455210104564e-06, "loss": 0.3203, "step": 2596 }, { "epoch": 1.289260301174913, "grad_norm": 0.3287682831287384, "learning_rate": 7.037817303428674e-06, "loss": 0.3749, "step": 2597 }, { "epoch": 1.2897567433394008, "grad_norm": 0.3472592830657959, "learning_rate": 7.0351787163809695e-06, "loss": 0.3621, "step": 2598 }, { "epoch": 1.2902531855038888, "grad_norm": 0.37232330441474915, "learning_rate": 7.032539449842407e-06, "loss": 0.4071, "step": 2599 }, { "epoch": 1.2907496276683768, "grad_norm": 0.3381806015968323, "learning_rate": 7.029899504694162e-06, "loss": 0.3623, "step": 2600 }, { "epoch": 1.2912460698328645, "grad_norm": 0.338817298412323, "learning_rate": 7.0272588818176425e-06, "loss": 0.3706, "step": 2601 }, { "epoch": 1.2917425119973522, "grad_norm": 0.35173776745796204, "learning_rate": 7.0246175820944815e-06, "loss": 0.3216, "step": 2602 }, { "epoch": 1.2922389541618402, "grad_norm": 0.4021787643432617, "learning_rate": 7.021975606406534e-06, "loss": 0.4579, "step": 2603 }, { "epoch": 1.292735396326328, "grad_norm": 0.3367866575717926, "learning_rate": 7.019332955635887e-06, "loss": 0.3882, "step": 2604 }, { "epoch": 1.293231838490816, "grad_norm": 0.35346412658691406, "learning_rate": 7.016689630664848e-06, "loss": 0.387, "step": 2605 }, { "epoch": 1.2937282806553037, "grad_norm": 0.4161042869091034, "learning_rate": 7.014045632375952e-06, "loss": 0.4242, "step": 2606 }, { "epoch": 1.2942247228197914, "grad_norm": 0.3344419598579407, "learning_rate": 7.011400961651958e-06, "loss": 0.3422, "step": 2607 }, { "epoch": 1.2947211649842794, "grad_norm": 0.3771902322769165, "learning_rate": 7.00875561937585e-06, "loss": 0.3698, "step": 2608 }, { "epoch": 1.295217607148767, "grad_norm": 0.3538220524787903, "learning_rate": 7.006109606430836e-06, "loss": 0.3772, "step": 2609 }, { "epoch": 1.295714049313255, "grad_norm": 0.33824092149734497, "learning_rate": 7.003462923700346e-06, "loss": 0.3606, "step": 2610 }, { "epoch": 1.2962104914777428, "grad_norm": 0.40777942538261414, "learning_rate": 7.000815572068038e-06, "loss": 0.4005, "step": 2611 }, { "epoch": 1.2967069336422306, "grad_norm": 0.395151823759079, "learning_rate": 6.998167552417789e-06, "loss": 0.432, "step": 2612 }, { "epoch": 1.2972033758067185, "grad_norm": 0.3168078660964966, "learning_rate": 6.995518865633703e-06, "loss": 0.3285, "step": 2613 }, { "epoch": 1.2976998179712065, "grad_norm": 0.39282041788101196, "learning_rate": 6.992869512600101e-06, "loss": 0.4038, "step": 2614 }, { "epoch": 1.2981962601356942, "grad_norm": 0.367220401763916, "learning_rate": 6.990219494201532e-06, "loss": 0.3636, "step": 2615 }, { "epoch": 1.298692702300182, "grad_norm": 0.3957940340042114, "learning_rate": 6.9875688113227656e-06, "loss": 0.3702, "step": 2616 }, { "epoch": 1.29918914446467, "grad_norm": 0.36403849720954895, "learning_rate": 6.984917464848793e-06, "loss": 0.3721, "step": 2617 }, { "epoch": 1.2996855866291577, "grad_norm": 0.3540959656238556, "learning_rate": 6.982265455664825e-06, "loss": 0.3734, "step": 2618 }, { "epoch": 1.3001820287936456, "grad_norm": 0.33435070514678955, "learning_rate": 6.979612784656298e-06, "loss": 0.347, "step": 2619 }, { "epoch": 1.3006784709581334, "grad_norm": 0.38286688923835754, "learning_rate": 6.9769594527088625e-06, "loss": 0.4191, "step": 2620 }, { "epoch": 1.3011749131226211, "grad_norm": 0.3579058349132538, "learning_rate": 6.974305460708398e-06, "loss": 0.3869, "step": 2621 }, { "epoch": 1.301671355287109, "grad_norm": 0.36824852228164673, "learning_rate": 6.9716508095409985e-06, "loss": 0.3562, "step": 2622 }, { "epoch": 1.3021677974515968, "grad_norm": 0.34438690543174744, "learning_rate": 6.968995500092981e-06, "loss": 0.3629, "step": 2623 }, { "epoch": 1.3026642396160848, "grad_norm": 0.409614235162735, "learning_rate": 6.966339533250879e-06, "loss": 0.4109, "step": 2624 }, { "epoch": 1.3031606817805725, "grad_norm": 0.3956839144229889, "learning_rate": 6.96368290990145e-06, "loss": 0.4045, "step": 2625 }, { "epoch": 1.3036571239450603, "grad_norm": 0.3451760709285736, "learning_rate": 6.961025630931667e-06, "loss": 0.3453, "step": 2626 }, { "epoch": 1.3041535661095482, "grad_norm": 0.39962559938430786, "learning_rate": 6.958367697228725e-06, "loss": 0.3865, "step": 2627 }, { "epoch": 1.3046500082740362, "grad_norm": 0.3918929696083069, "learning_rate": 6.955709109680032e-06, "loss": 0.3954, "step": 2628 }, { "epoch": 1.305146450438524, "grad_norm": 0.37816891074180603, "learning_rate": 6.9530498691732205e-06, "loss": 0.4367, "step": 2629 }, { "epoch": 1.3056428926030117, "grad_norm": 0.3882623314857483, "learning_rate": 6.9503899765961406e-06, "loss": 0.4132, "step": 2630 }, { "epoch": 1.3061393347674997, "grad_norm": 0.3629670739173889, "learning_rate": 6.947729432836854e-06, "loss": 0.3481, "step": 2631 }, { "epoch": 1.3066357769319874, "grad_norm": 0.32718873023986816, "learning_rate": 6.945068238783648e-06, "loss": 0.3671, "step": 2632 }, { "epoch": 1.3071322190964754, "grad_norm": 0.40713077783584595, "learning_rate": 6.942406395325021e-06, "loss": 0.3995, "step": 2633 }, { "epoch": 1.307628661260963, "grad_norm": 0.3598417639732361, "learning_rate": 6.9397439033496894e-06, "loss": 0.3417, "step": 2634 }, { "epoch": 1.3081251034254509, "grad_norm": 0.3668251037597656, "learning_rate": 6.937080763746587e-06, "loss": 0.4315, "step": 2635 }, { "epoch": 1.3086215455899388, "grad_norm": 0.32611414790153503, "learning_rate": 6.9344169774048675e-06, "loss": 0.4071, "step": 2636 }, { "epoch": 1.3091179877544266, "grad_norm": 0.34158384799957275, "learning_rate": 6.9317525452138915e-06, "loss": 0.3423, "step": 2637 }, { "epoch": 1.3096144299189145, "grad_norm": 0.3579246997833252, "learning_rate": 6.929087468063242e-06, "loss": 0.371, "step": 2638 }, { "epoch": 1.3101108720834023, "grad_norm": 0.34000447392463684, "learning_rate": 6.9264217468427175e-06, "loss": 0.3848, "step": 2639 }, { "epoch": 1.31060731424789, "grad_norm": 0.3353886604309082, "learning_rate": 6.92375538244233e-06, "loss": 0.4147, "step": 2640 }, { "epoch": 1.311103756412378, "grad_norm": 0.33397191762924194, "learning_rate": 6.921088375752304e-06, "loss": 0.3689, "step": 2641 }, { "epoch": 1.3116001985768657, "grad_norm": 0.3433331847190857, "learning_rate": 6.918420727663084e-06, "loss": 0.3872, "step": 2642 }, { "epoch": 1.3120966407413537, "grad_norm": 0.36913228034973145, "learning_rate": 6.91575243906532e-06, "loss": 0.3522, "step": 2643 }, { "epoch": 1.3125930829058414, "grad_norm": 0.36693307757377625, "learning_rate": 6.913083510849884e-06, "loss": 0.3952, "step": 2644 }, { "epoch": 1.3130895250703294, "grad_norm": 0.3571702539920807, "learning_rate": 6.910413943907859e-06, "loss": 0.3926, "step": 2645 }, { "epoch": 1.3135859672348171, "grad_norm": 0.36246854066848755, "learning_rate": 6.907743739130539e-06, "loss": 0.3762, "step": 2646 }, { "epoch": 1.314082409399305, "grad_norm": 0.3315647542476654, "learning_rate": 6.905072897409436e-06, "loss": 0.3317, "step": 2647 }, { "epoch": 1.3145788515637928, "grad_norm": 0.398896723985672, "learning_rate": 6.902401419636269e-06, "loss": 0.4082, "step": 2648 }, { "epoch": 1.3150752937282806, "grad_norm": 0.32858777046203613, "learning_rate": 6.899729306702973e-06, "loss": 0.3795, "step": 2649 }, { "epoch": 1.3155717358927685, "grad_norm": 0.3553098738193512, "learning_rate": 6.897056559501693e-06, "loss": 0.4001, "step": 2650 }, { "epoch": 1.3160681780572563, "grad_norm": 0.34728869795799255, "learning_rate": 6.894383178924787e-06, "loss": 0.363, "step": 2651 }, { "epoch": 1.3165646202217443, "grad_norm": 0.37057867646217346, "learning_rate": 6.891709165864824e-06, "loss": 0.3682, "step": 2652 }, { "epoch": 1.317061062386232, "grad_norm": 0.3535225987434387, "learning_rate": 6.889034521214583e-06, "loss": 0.3984, "step": 2653 }, { "epoch": 1.3175575045507197, "grad_norm": 0.33252641558647156, "learning_rate": 6.886359245867057e-06, "loss": 0.3372, "step": 2654 }, { "epoch": 1.3180539467152077, "grad_norm": 0.38798579573631287, "learning_rate": 6.883683340715448e-06, "loss": 0.3736, "step": 2655 }, { "epoch": 1.3185503888796954, "grad_norm": 0.33736395835876465, "learning_rate": 6.881006806653167e-06, "loss": 0.3588, "step": 2656 }, { "epoch": 1.3190468310441834, "grad_norm": 0.3504621982574463, "learning_rate": 6.878329644573835e-06, "loss": 0.3919, "step": 2657 }, { "epoch": 1.3195432732086712, "grad_norm": 0.3138512969017029, "learning_rate": 6.875651855371287e-06, "loss": 0.3514, "step": 2658 }, { "epoch": 1.320039715373159, "grad_norm": 0.35673245787620544, "learning_rate": 6.872973439939561e-06, "loss": 0.3848, "step": 2659 }, { "epoch": 1.3205361575376469, "grad_norm": 0.3764340579509735, "learning_rate": 6.870294399172908e-06, "loss": 0.4085, "step": 2660 }, { "epoch": 1.3210325997021348, "grad_norm": 0.3781886100769043, "learning_rate": 6.867614733965786e-06, "loss": 0.365, "step": 2661 }, { "epoch": 1.3215290418666226, "grad_norm": 0.3310677111148834, "learning_rate": 6.864934445212864e-06, "loss": 0.3443, "step": 2662 }, { "epoch": 1.3220254840311103, "grad_norm": 0.38206759095191956, "learning_rate": 6.862253533809017e-06, "loss": 0.367, "step": 2663 }, { "epoch": 1.3225219261955983, "grad_norm": 0.4227296710014343, "learning_rate": 6.859572000649328e-06, "loss": 0.4253, "step": 2664 }, { "epoch": 1.323018368360086, "grad_norm": 0.3446311354637146, "learning_rate": 6.856889846629089e-06, "loss": 0.3739, "step": 2665 }, { "epoch": 1.323514810524574, "grad_norm": 0.3752003312110901, "learning_rate": 6.854207072643797e-06, "loss": 0.3692, "step": 2666 }, { "epoch": 1.3240112526890617, "grad_norm": 0.3944871127605438, "learning_rate": 6.851523679589158e-06, "loss": 0.3881, "step": 2667 }, { "epoch": 1.3245076948535495, "grad_norm": 0.34330853819847107, "learning_rate": 6.848839668361085e-06, "loss": 0.3863, "step": 2668 }, { "epoch": 1.3250041370180374, "grad_norm": 0.41187238693237305, "learning_rate": 6.846155039855693e-06, "loss": 0.3788, "step": 2669 }, { "epoch": 1.3255005791825252, "grad_norm": 0.4165762662887573, "learning_rate": 6.843469794969311e-06, "loss": 0.3835, "step": 2670 }, { "epoch": 1.3259970213470131, "grad_norm": 0.36667290329933167, "learning_rate": 6.840783934598467e-06, "loss": 0.3846, "step": 2671 }, { "epoch": 1.3264934635115009, "grad_norm": 0.3954448699951172, "learning_rate": 6.838097459639896e-06, "loss": 0.3727, "step": 2672 }, { "epoch": 1.3269899056759886, "grad_norm": 0.3868952989578247, "learning_rate": 6.8354103709905415e-06, "loss": 0.3811, "step": 2673 }, { "epoch": 1.3274863478404766, "grad_norm": 0.371320903301239, "learning_rate": 6.8327226695475464e-06, "loss": 0.4016, "step": 2674 }, { "epoch": 1.3279827900049646, "grad_norm": 0.38764792680740356, "learning_rate": 6.830034356208264e-06, "loss": 0.3604, "step": 2675 }, { "epoch": 1.3284792321694523, "grad_norm": 0.3633298873901367, "learning_rate": 6.827345431870247e-06, "loss": 0.365, "step": 2676 }, { "epoch": 1.32897567433394, "grad_norm": 0.36629122495651245, "learning_rate": 6.824655897431254e-06, "loss": 0.3795, "step": 2677 }, { "epoch": 1.329472116498428, "grad_norm": 0.36269068717956543, "learning_rate": 6.821965753789248e-06, "loss": 0.3794, "step": 2678 }, { "epoch": 1.3299685586629157, "grad_norm": 0.3609590232372284, "learning_rate": 6.819275001842397e-06, "loss": 0.3972, "step": 2679 }, { "epoch": 1.3304650008274037, "grad_norm": 0.3438923954963684, "learning_rate": 6.8165836424890665e-06, "loss": 0.3559, "step": 2680 }, { "epoch": 1.3309614429918915, "grad_norm": 0.3468295633792877, "learning_rate": 6.813891676627831e-06, "loss": 0.3821, "step": 2681 }, { "epoch": 1.3314578851563792, "grad_norm": 0.40950801968574524, "learning_rate": 6.811199105157462e-06, "loss": 0.3665, "step": 2682 }, { "epoch": 1.3319543273208672, "grad_norm": 0.3947905898094177, "learning_rate": 6.808505928976939e-06, "loss": 0.3755, "step": 2683 }, { "epoch": 1.332450769485355, "grad_norm": 0.3659156262874603, "learning_rate": 6.805812148985438e-06, "loss": 0.3773, "step": 2684 }, { "epoch": 1.3329472116498429, "grad_norm": 0.35225987434387207, "learning_rate": 6.803117766082339e-06, "loss": 0.3689, "step": 2685 }, { "epoch": 1.3334436538143306, "grad_norm": 0.38868871331214905, "learning_rate": 6.800422781167224e-06, "loss": 0.3972, "step": 2686 }, { "epoch": 1.3339400959788184, "grad_norm": 0.33765366673469543, "learning_rate": 6.797727195139876e-06, "loss": 0.3312, "step": 2687 }, { "epoch": 1.3344365381433063, "grad_norm": 0.3967869281768799, "learning_rate": 6.795031008900277e-06, "loss": 0.415, "step": 2688 }, { "epoch": 1.334932980307794, "grad_norm": 0.3755505681037903, "learning_rate": 6.792334223348609e-06, "loss": 0.3683, "step": 2689 }, { "epoch": 1.335429422472282, "grad_norm": 0.3864120543003082, "learning_rate": 6.78963683938526e-06, "loss": 0.3985, "step": 2690 }, { "epoch": 1.3359258646367698, "grad_norm": 0.33641380071640015, "learning_rate": 6.786938857910806e-06, "loss": 0.3519, "step": 2691 }, { "epoch": 1.3364223068012577, "grad_norm": 0.3440611958503723, "learning_rate": 6.784240279826035e-06, "loss": 0.3745, "step": 2692 }, { "epoch": 1.3369187489657455, "grad_norm": 0.40117448568344116, "learning_rate": 6.781541106031928e-06, "loss": 0.3542, "step": 2693 }, { "epoch": 1.3374151911302334, "grad_norm": 0.37566718459129333, "learning_rate": 6.7788413374296665e-06, "loss": 0.3641, "step": 2694 }, { "epoch": 1.3379116332947212, "grad_norm": 0.39044493436813354, "learning_rate": 6.776140974920627e-06, "loss": 0.3651, "step": 2695 }, { "epoch": 1.338408075459209, "grad_norm": 0.3833014965057373, "learning_rate": 6.77344001940639e-06, "loss": 0.362, "step": 2696 }, { "epoch": 1.3389045176236969, "grad_norm": 0.348194420337677, "learning_rate": 6.770738471788729e-06, "loss": 0.3416, "step": 2697 }, { "epoch": 1.3394009597881846, "grad_norm": 0.4135442078113556, "learning_rate": 6.7680363329696184e-06, "loss": 0.4475, "step": 2698 }, { "epoch": 1.3398974019526726, "grad_norm": 0.332617849111557, "learning_rate": 6.7653336038512294e-06, "loss": 0.371, "step": 2699 }, { "epoch": 1.3403938441171603, "grad_norm": 0.37999534606933594, "learning_rate": 6.762630285335929e-06, "loss": 0.3626, "step": 2700 }, { "epoch": 1.340890286281648, "grad_norm": 0.3485623896121979, "learning_rate": 6.759926378326281e-06, "loss": 0.4204, "step": 2701 }, { "epoch": 1.341386728446136, "grad_norm": 0.34554368257522583, "learning_rate": 6.757221883725048e-06, "loss": 0.3898, "step": 2702 }, { "epoch": 1.3418831706106238, "grad_norm": 0.37230342626571655, "learning_rate": 6.754516802435187e-06, "loss": 0.3441, "step": 2703 }, { "epoch": 1.3423796127751118, "grad_norm": 0.36234477162361145, "learning_rate": 6.751811135359851e-06, "loss": 0.3917, "step": 2704 }, { "epoch": 1.3428760549395995, "grad_norm": 0.379660427570343, "learning_rate": 6.7491048834023884e-06, "loss": 0.4004, "step": 2705 }, { "epoch": 1.3433724971040872, "grad_norm": 0.3439038395881653, "learning_rate": 6.746398047466343e-06, "loss": 0.3531, "step": 2706 }, { "epoch": 1.3438689392685752, "grad_norm": 0.34772223234176636, "learning_rate": 6.7436906284554545e-06, "loss": 0.389, "step": 2707 }, { "epoch": 1.3443653814330632, "grad_norm": 0.36108747124671936, "learning_rate": 6.740982627273655e-06, "loss": 0.4146, "step": 2708 }, { "epoch": 1.344861823597551, "grad_norm": 0.34029752016067505, "learning_rate": 6.738274044825074e-06, "loss": 0.3397, "step": 2709 }, { "epoch": 1.3453582657620387, "grad_norm": 0.3276771008968353, "learning_rate": 6.735564882014032e-06, "loss": 0.3758, "step": 2710 }, { "epoch": 1.3458547079265266, "grad_norm": 0.3765486478805542, "learning_rate": 6.732855139745047e-06, "loss": 0.4169, "step": 2711 }, { "epoch": 1.3463511500910144, "grad_norm": 0.3420504629611969, "learning_rate": 6.730144818922828e-06, "loss": 0.3571, "step": 2712 }, { "epoch": 1.3468475922555023, "grad_norm": 0.33970507979393005, "learning_rate": 6.727433920452275e-06, "loss": 0.3605, "step": 2713 }, { "epoch": 1.34734403441999, "grad_norm": 0.3765241205692291, "learning_rate": 6.724722445238487e-06, "loss": 0.3938, "step": 2714 }, { "epoch": 1.3478404765844778, "grad_norm": 0.3804856538772583, "learning_rate": 6.722010394186748e-06, "loss": 0.3995, "step": 2715 }, { "epoch": 1.3483369187489658, "grad_norm": 0.3671551048755646, "learning_rate": 6.719297768202541e-06, "loss": 0.4278, "step": 2716 }, { "epoch": 1.3488333609134535, "grad_norm": 0.36221325397491455, "learning_rate": 6.716584568191538e-06, "loss": 0.3514, "step": 2717 }, { "epoch": 1.3493298030779415, "grad_norm": 0.3816855549812317, "learning_rate": 6.713870795059601e-06, "loss": 0.3854, "step": 2718 }, { "epoch": 1.3498262452424292, "grad_norm": 0.36146360635757446, "learning_rate": 6.711156449712786e-06, "loss": 0.3608, "step": 2719 }, { "epoch": 1.350322687406917, "grad_norm": 0.40594038367271423, "learning_rate": 6.70844153305734e-06, "loss": 0.4223, "step": 2720 }, { "epoch": 1.350819129571405, "grad_norm": 0.3193580210208893, "learning_rate": 6.705726045999697e-06, "loss": 0.3356, "step": 2721 }, { "epoch": 1.351315571735893, "grad_norm": 0.38134047389030457, "learning_rate": 6.703009989446487e-06, "loss": 0.3963, "step": 2722 }, { "epoch": 1.3518120139003806, "grad_norm": 0.3230547308921814, "learning_rate": 6.700293364304528e-06, "loss": 0.3734, "step": 2723 }, { "epoch": 1.3523084560648684, "grad_norm": 0.40304285287857056, "learning_rate": 6.697576171480824e-06, "loss": 0.4417, "step": 2724 }, { "epoch": 1.3528048982293563, "grad_norm": 0.3770408034324646, "learning_rate": 6.6948584118825745e-06, "loss": 0.3635, "step": 2725 }, { "epoch": 1.353301340393844, "grad_norm": 0.3665579557418823, "learning_rate": 6.692140086417165e-06, "loss": 0.3752, "step": 2726 }, { "epoch": 1.353797782558332, "grad_norm": 0.35153457522392273, "learning_rate": 6.689421195992172e-06, "loss": 0.3655, "step": 2727 }, { "epoch": 1.3542942247228198, "grad_norm": 0.38815388083457947, "learning_rate": 6.686701741515355e-06, "loss": 0.398, "step": 2728 }, { "epoch": 1.3547906668873075, "grad_norm": 0.3460819721221924, "learning_rate": 6.683981723894672e-06, "loss": 0.3757, "step": 2729 }, { "epoch": 1.3552871090517955, "grad_norm": 0.34716761112213135, "learning_rate": 6.681261144038257e-06, "loss": 0.3985, "step": 2730 }, { "epoch": 1.3557835512162832, "grad_norm": 0.34103551506996155, "learning_rate": 6.678540002854441e-06, "loss": 0.3606, "step": 2731 }, { "epoch": 1.3562799933807712, "grad_norm": 0.35099634528160095, "learning_rate": 6.675818301251737e-06, "loss": 0.3719, "step": 2732 }, { "epoch": 1.356776435545259, "grad_norm": 0.3329656720161438, "learning_rate": 6.6730960401388504e-06, "loss": 0.3583, "step": 2733 }, { "epoch": 1.3572728777097467, "grad_norm": 0.3496479392051697, "learning_rate": 6.670373220424666e-06, "loss": 0.3521, "step": 2734 }, { "epoch": 1.3577693198742347, "grad_norm": 0.3620867431163788, "learning_rate": 6.6676498430182646e-06, "loss": 0.3583, "step": 2735 }, { "epoch": 1.3582657620387226, "grad_norm": 0.3442847728729248, "learning_rate": 6.664925908828902e-06, "loss": 0.3653, "step": 2736 }, { "epoch": 1.3587622042032104, "grad_norm": 0.35360413789749146, "learning_rate": 6.66220141876603e-06, "loss": 0.3994, "step": 2737 }, { "epoch": 1.359258646367698, "grad_norm": 0.38694441318511963, "learning_rate": 6.6594763737392794e-06, "loss": 0.3662, "step": 2738 }, { "epoch": 1.359755088532186, "grad_norm": 0.3218190371990204, "learning_rate": 6.656750774658471e-06, "loss": 0.3555, "step": 2739 }, { "epoch": 1.3602515306966738, "grad_norm": 0.37859249114990234, "learning_rate": 6.6540246224336045e-06, "loss": 0.4032, "step": 2740 }, { "epoch": 1.3607479728611618, "grad_norm": 0.37589016556739807, "learning_rate": 6.651297917974872e-06, "loss": 0.4168, "step": 2741 }, { "epoch": 1.3612444150256495, "grad_norm": 0.3660490810871124, "learning_rate": 6.648570662192646e-06, "loss": 0.4063, "step": 2742 }, { "epoch": 1.3617408571901373, "grad_norm": 0.3313777446746826, "learning_rate": 6.64584285599748e-06, "loss": 0.3521, "step": 2743 }, { "epoch": 1.3622372993546252, "grad_norm": 0.36284080147743225, "learning_rate": 6.643114500300116e-06, "loss": 0.3729, "step": 2744 }, { "epoch": 1.362733741519113, "grad_norm": 0.3638220727443695, "learning_rate": 6.640385596011478e-06, "loss": 0.4054, "step": 2745 }, { "epoch": 1.363230183683601, "grad_norm": 0.35846006870269775, "learning_rate": 6.637656144042672e-06, "loss": 0.39, "step": 2746 }, { "epoch": 1.3637266258480887, "grad_norm": 0.3586236238479614, "learning_rate": 6.6349261453049895e-06, "loss": 0.3565, "step": 2747 }, { "epoch": 1.3642230680125764, "grad_norm": 0.4483121931552887, "learning_rate": 6.632195600709901e-06, "loss": 0.4215, "step": 2748 }, { "epoch": 1.3647195101770644, "grad_norm": 0.3420538306236267, "learning_rate": 6.629464511169062e-06, "loss": 0.3598, "step": 2749 }, { "epoch": 1.3652159523415521, "grad_norm": 0.3410419821739197, "learning_rate": 6.626732877594311e-06, "loss": 0.3442, "step": 2750 }, { "epoch": 1.36571239450604, "grad_norm": 0.3792872428894043, "learning_rate": 6.624000700897662e-06, "loss": 0.3874, "step": 2751 }, { "epoch": 1.3662088366705278, "grad_norm": 0.35589030385017395, "learning_rate": 6.6212679819913185e-06, "loss": 0.3858, "step": 2752 }, { "epoch": 1.3667052788350158, "grad_norm": 0.3806188106536865, "learning_rate": 6.618534721787658e-06, "loss": 0.3961, "step": 2753 }, { "epoch": 1.3672017209995035, "grad_norm": 0.3450206220149994, "learning_rate": 6.615800921199245e-06, "loss": 0.3468, "step": 2754 }, { "epoch": 1.3676981631639915, "grad_norm": 0.38987720012664795, "learning_rate": 6.613066581138819e-06, "loss": 0.3816, "step": 2755 }, { "epoch": 1.3681946053284793, "grad_norm": 0.37727493047714233, "learning_rate": 6.610331702519299e-06, "loss": 0.3928, "step": 2756 }, { "epoch": 1.368691047492967, "grad_norm": 0.4195593297481537, "learning_rate": 6.6075962862537934e-06, "loss": 0.4213, "step": 2757 }, { "epoch": 1.369187489657455, "grad_norm": 0.3751891851425171, "learning_rate": 6.6048603332555796e-06, "loss": 0.4101, "step": 2758 }, { "epoch": 1.3696839318219427, "grad_norm": 0.35304152965545654, "learning_rate": 6.602123844438117e-06, "loss": 0.3928, "step": 2759 }, { "epoch": 1.3701803739864307, "grad_norm": 0.3559597432613373, "learning_rate": 6.5993868207150465e-06, "loss": 0.3693, "step": 2760 }, { "epoch": 1.3706768161509184, "grad_norm": 0.4101899564266205, "learning_rate": 6.596649263000187e-06, "loss": 0.3735, "step": 2761 }, { "epoch": 1.3711732583154062, "grad_norm": 0.36415213346481323, "learning_rate": 6.593911172207532e-06, "loss": 0.376, "step": 2762 }, { "epoch": 1.3716697004798941, "grad_norm": 0.3691195845603943, "learning_rate": 6.591172549251255e-06, "loss": 0.3973, "step": 2763 }, { "epoch": 1.3721661426443819, "grad_norm": 0.3430822491645813, "learning_rate": 6.588433395045711e-06, "loss": 0.377, "step": 2764 }, { "epoch": 1.3726625848088698, "grad_norm": 0.367773175239563, "learning_rate": 6.5856937105054285e-06, "loss": 0.3753, "step": 2765 }, { "epoch": 1.3731590269733576, "grad_norm": 0.3438875675201416, "learning_rate": 6.582953496545112e-06, "loss": 0.3587, "step": 2766 }, { "epoch": 1.3736554691378453, "grad_norm": 0.3658663332462311, "learning_rate": 6.580212754079644e-06, "loss": 0.374, "step": 2767 }, { "epoch": 1.3741519113023333, "grad_norm": 0.36045601963996887, "learning_rate": 6.5774714840240875e-06, "loss": 0.3764, "step": 2768 }, { "epoch": 1.3746483534668212, "grad_norm": 0.341107040643692, "learning_rate": 6.574729687293675e-06, "loss": 0.3463, "step": 2769 }, { "epoch": 1.375144795631309, "grad_norm": 0.44397467374801636, "learning_rate": 6.571987364803819e-06, "loss": 0.3985, "step": 2770 }, { "epoch": 1.3756412377957967, "grad_norm": 0.37467625737190247, "learning_rate": 6.569244517470105e-06, "loss": 0.3674, "step": 2771 }, { "epoch": 1.3761376799602847, "grad_norm": 0.3534158170223236, "learning_rate": 6.5665011462082975e-06, "loss": 0.368, "step": 2772 }, { "epoch": 1.3766341221247724, "grad_norm": 0.332573264837265, "learning_rate": 6.5637572519343305e-06, "loss": 0.3617, "step": 2773 }, { "epoch": 1.3771305642892604, "grad_norm": 0.3445269763469696, "learning_rate": 6.56101283556432e-06, "loss": 0.3928, "step": 2774 }, { "epoch": 1.3776270064537481, "grad_norm": 0.3765236437320709, "learning_rate": 6.5582678980145476e-06, "loss": 0.3911, "step": 2775 }, { "epoch": 1.3781234486182359, "grad_norm": 0.3867045044898987, "learning_rate": 6.555522440201477e-06, "loss": 0.3844, "step": 2776 }, { "epoch": 1.3786198907827238, "grad_norm": 0.3715612292289734, "learning_rate": 6.55277646304174e-06, "loss": 0.3803, "step": 2777 }, { "epoch": 1.3791163329472116, "grad_norm": 0.34959307312965393, "learning_rate": 6.550029967452145e-06, "loss": 0.3527, "step": 2778 }, { "epoch": 1.3796127751116996, "grad_norm": 0.3761398494243622, "learning_rate": 6.547282954349669e-06, "loss": 0.4217, "step": 2779 }, { "epoch": 1.3801092172761873, "grad_norm": 0.30981913208961487, "learning_rate": 6.544535424651468e-06, "loss": 0.3668, "step": 2780 }, { "epoch": 1.380605659440675, "grad_norm": 0.35634005069732666, "learning_rate": 6.541787379274869e-06, "loss": 0.3506, "step": 2781 }, { "epoch": 1.381102101605163, "grad_norm": 0.366862952709198, "learning_rate": 6.539038819137364e-06, "loss": 0.3937, "step": 2782 }, { "epoch": 1.381598543769651, "grad_norm": 0.3148941695690155, "learning_rate": 6.53628974515663e-06, "loss": 0.3441, "step": 2783 }, { "epoch": 1.3820949859341387, "grad_norm": 0.3470355272293091, "learning_rate": 6.533540158250502e-06, "loss": 0.38, "step": 2784 }, { "epoch": 1.3825914280986265, "grad_norm": 0.3507826626300812, "learning_rate": 6.530790059336995e-06, "loss": 0.3734, "step": 2785 }, { "epoch": 1.3830878702631144, "grad_norm": 0.38106024265289307, "learning_rate": 6.528039449334291e-06, "loss": 0.3897, "step": 2786 }, { "epoch": 1.3835843124276022, "grad_norm": 0.38260599970817566, "learning_rate": 6.525288329160745e-06, "loss": 0.4021, "step": 2787 }, { "epoch": 1.3840807545920901, "grad_norm": 0.3377076983451843, "learning_rate": 6.522536699734881e-06, "loss": 0.3552, "step": 2788 }, { "epoch": 1.3845771967565779, "grad_norm": 0.3992628753185272, "learning_rate": 6.519784561975393e-06, "loss": 0.3853, "step": 2789 }, { "epoch": 1.3850736389210656, "grad_norm": 0.399234414100647, "learning_rate": 6.5170319168011455e-06, "loss": 0.3568, "step": 2790 }, { "epoch": 1.3855700810855536, "grad_norm": 0.38354095816612244, "learning_rate": 6.514278765131172e-06, "loss": 0.3754, "step": 2791 }, { "epoch": 1.3860665232500413, "grad_norm": 0.3616054952144623, "learning_rate": 6.511525107884674e-06, "loss": 0.3608, "step": 2792 }, { "epoch": 1.3865629654145293, "grad_norm": 0.4014008641242981, "learning_rate": 6.5087709459810245e-06, "loss": 0.4205, "step": 2793 }, { "epoch": 1.387059407579017, "grad_norm": 0.317293643951416, "learning_rate": 6.506016280339762e-06, "loss": 0.3334, "step": 2794 }, { "epoch": 1.3875558497435048, "grad_norm": 0.3311839699745178, "learning_rate": 6.503261111880593e-06, "loss": 0.3458, "step": 2795 }, { "epoch": 1.3880522919079927, "grad_norm": 0.41100960969924927, "learning_rate": 6.500505441523396e-06, "loss": 0.3994, "step": 2796 }, { "epoch": 1.3885487340724805, "grad_norm": 0.40525081753730774, "learning_rate": 6.497749270188214e-06, "loss": 0.4043, "step": 2797 }, { "epoch": 1.3890451762369684, "grad_norm": 0.36711159348487854, "learning_rate": 6.494992598795258e-06, "loss": 0.372, "step": 2798 }, { "epoch": 1.3895416184014562, "grad_norm": 0.38381442427635193, "learning_rate": 6.492235428264903e-06, "loss": 0.3807, "step": 2799 }, { "epoch": 1.3900380605659441, "grad_norm": 0.3567735254764557, "learning_rate": 6.489477759517697e-06, "loss": 0.3676, "step": 2800 }, { "epoch": 1.3905345027304319, "grad_norm": 0.3618755638599396, "learning_rate": 6.486719593474347e-06, "loss": 0.401, "step": 2801 }, { "epoch": 1.3910309448949199, "grad_norm": 0.3665262460708618, "learning_rate": 6.483960931055735e-06, "loss": 0.4239, "step": 2802 }, { "epoch": 1.3915273870594076, "grad_norm": 0.3209072947502136, "learning_rate": 6.481201773182896e-06, "loss": 0.331, "step": 2803 }, { "epoch": 1.3920238292238953, "grad_norm": 0.3586495816707611, "learning_rate": 6.478442120777044e-06, "loss": 0.3637, "step": 2804 }, { "epoch": 1.3925202713883833, "grad_norm": 0.32943329215049744, "learning_rate": 6.4756819747595486e-06, "loss": 0.4097, "step": 2805 }, { "epoch": 1.393016713552871, "grad_norm": 0.32699304819107056, "learning_rate": 6.472921336051949e-06, "loss": 0.3368, "step": 2806 }, { "epoch": 1.393513155717359, "grad_norm": 0.3512673079967499, "learning_rate": 6.4701602055759475e-06, "loss": 0.3763, "step": 2807 }, { "epoch": 1.3940095978818468, "grad_norm": 0.3298647403717041, "learning_rate": 6.4673985842534094e-06, "loss": 0.3804, "step": 2808 }, { "epoch": 1.3945060400463345, "grad_norm": 0.35487011075019836, "learning_rate": 6.464636473006367e-06, "loss": 0.3831, "step": 2809 }, { "epoch": 1.3950024822108225, "grad_norm": 0.3540220856666565, "learning_rate": 6.461873872757012e-06, "loss": 0.4036, "step": 2810 }, { "epoch": 1.3954989243753102, "grad_norm": 0.3425331711769104, "learning_rate": 6.4591107844277015e-06, "loss": 0.3553, "step": 2811 }, { "epoch": 1.3959953665397982, "grad_norm": 0.34391000866889954, "learning_rate": 6.456347208940956e-06, "loss": 0.3593, "step": 2812 }, { "epoch": 1.396491808704286, "grad_norm": 0.37723639607429504, "learning_rate": 6.453583147219462e-06, "loss": 0.4069, "step": 2813 }, { "epoch": 1.3969882508687737, "grad_norm": 0.3376983106136322, "learning_rate": 6.45081860018606e-06, "loss": 0.3633, "step": 2814 }, { "epoch": 1.3974846930332616, "grad_norm": 0.35254210233688354, "learning_rate": 6.448053568763757e-06, "loss": 0.3779, "step": 2815 }, { "epoch": 1.3979811351977496, "grad_norm": 0.3588304817676544, "learning_rate": 6.445288053875724e-06, "loss": 0.3699, "step": 2816 }, { "epoch": 1.3984775773622373, "grad_norm": 0.36551016569137573, "learning_rate": 6.442522056445292e-06, "loss": 0.3745, "step": 2817 }, { "epoch": 1.398974019526725, "grad_norm": 0.32703259587287903, "learning_rate": 6.43975557739595e-06, "loss": 0.329, "step": 2818 }, { "epoch": 1.399470461691213, "grad_norm": 0.3762635886669159, "learning_rate": 6.43698861765135e-06, "loss": 0.4052, "step": 2819 }, { "epoch": 1.3999669038557008, "grad_norm": 0.37430885434150696, "learning_rate": 6.434221178135306e-06, "loss": 0.3939, "step": 2820 }, { "epoch": 1.4004633460201887, "grad_norm": 0.3451192080974579, "learning_rate": 6.431453259771792e-06, "loss": 0.3765, "step": 2821 }, { "epoch": 1.4009597881846765, "grad_norm": 0.34065496921539307, "learning_rate": 6.428684863484937e-06, "loss": 0.3929, "step": 2822 }, { "epoch": 1.4014562303491642, "grad_norm": 0.3678964674472809, "learning_rate": 6.425915990199038e-06, "loss": 0.3793, "step": 2823 }, { "epoch": 1.4019526725136522, "grad_norm": 0.40408575534820557, "learning_rate": 6.423146640838543e-06, "loss": 0.4049, "step": 2824 }, { "epoch": 1.40244911467814, "grad_norm": 0.3252682387828827, "learning_rate": 6.4203768163280645e-06, "loss": 0.3462, "step": 2825 }, { "epoch": 1.402945556842628, "grad_norm": 0.35227471590042114, "learning_rate": 6.417606517592371e-06, "loss": 0.3876, "step": 2826 }, { "epoch": 1.4034419990071156, "grad_norm": 0.3910061717033386, "learning_rate": 6.414835745556387e-06, "loss": 0.4429, "step": 2827 }, { "epoch": 1.4039384411716034, "grad_norm": 0.34484660625457764, "learning_rate": 6.412064501145203e-06, "loss": 0.3583, "step": 2828 }, { "epoch": 1.4044348833360913, "grad_norm": 0.3489684462547302, "learning_rate": 6.409292785284058e-06, "loss": 0.3508, "step": 2829 }, { "epoch": 1.4049313255005793, "grad_norm": 0.33962583541870117, "learning_rate": 6.406520598898357e-06, "loss": 0.3855, "step": 2830 }, { "epoch": 1.405427767665067, "grad_norm": 0.3604956865310669, "learning_rate": 6.403747942913654e-06, "loss": 0.3922, "step": 2831 }, { "epoch": 1.4059242098295548, "grad_norm": 0.3277181088924408, "learning_rate": 6.400974818255665e-06, "loss": 0.3303, "step": 2832 }, { "epoch": 1.4064206519940428, "grad_norm": 0.33854907751083374, "learning_rate": 6.398201225850259e-06, "loss": 0.3717, "step": 2833 }, { "epoch": 1.4069170941585305, "grad_norm": 0.3449426591396332, "learning_rate": 6.395427166623466e-06, "loss": 0.3981, "step": 2834 }, { "epoch": 1.4074135363230185, "grad_norm": 0.35407885909080505, "learning_rate": 6.392652641501467e-06, "loss": 0.42, "step": 2835 }, { "epoch": 1.4079099784875062, "grad_norm": 0.34692615270614624, "learning_rate": 6.389877651410601e-06, "loss": 0.3241, "step": 2836 }, { "epoch": 1.408406420651994, "grad_norm": 0.33933311700820923, "learning_rate": 6.387102197277364e-06, "loss": 0.3655, "step": 2837 }, { "epoch": 1.408902862816482, "grad_norm": 0.34944701194763184, "learning_rate": 6.3843262800284e-06, "loss": 0.3926, "step": 2838 }, { "epoch": 1.4093993049809697, "grad_norm": 0.3360249400138855, "learning_rate": 6.381549900590517e-06, "loss": 0.3732, "step": 2839 }, { "epoch": 1.4098957471454576, "grad_norm": 0.36038917303085327, "learning_rate": 6.378773059890669e-06, "loss": 0.4007, "step": 2840 }, { "epoch": 1.4103921893099454, "grad_norm": 0.32245591282844543, "learning_rate": 6.375995758855971e-06, "loss": 0.3405, "step": 2841 }, { "epoch": 1.410888631474433, "grad_norm": 0.3476381301879883, "learning_rate": 6.3732179984136855e-06, "loss": 0.4294, "step": 2842 }, { "epoch": 1.411385073638921, "grad_norm": 0.3247641623020172, "learning_rate": 6.370439779491233e-06, "loss": 0.4003, "step": 2843 }, { "epoch": 1.411881515803409, "grad_norm": 0.33601391315460205, "learning_rate": 6.367661103016183e-06, "loss": 0.3429, "step": 2844 }, { "epoch": 1.4123779579678968, "grad_norm": 0.3080992102622986, "learning_rate": 6.3648819699162634e-06, "loss": 0.3487, "step": 2845 }, { "epoch": 1.4128744001323845, "grad_norm": 0.35660025477409363, "learning_rate": 6.362102381119349e-06, "loss": 0.3879, "step": 2846 }, { "epoch": 1.4133708422968725, "grad_norm": 0.3126378655433655, "learning_rate": 6.359322337553471e-06, "loss": 0.3732, "step": 2847 }, { "epoch": 1.4138672844613602, "grad_norm": 0.35447046160697937, "learning_rate": 6.356541840146806e-06, "loss": 0.3863, "step": 2848 }, { "epoch": 1.4143637266258482, "grad_norm": 0.38979196548461914, "learning_rate": 6.35376088982769e-06, "loss": 0.3695, "step": 2849 }, { "epoch": 1.414860168790336, "grad_norm": 0.35440686345100403, "learning_rate": 6.350979487524607e-06, "loss": 0.3647, "step": 2850 }, { "epoch": 1.4153566109548237, "grad_norm": 0.3563547432422638, "learning_rate": 6.34819763416619e-06, "loss": 0.3918, "step": 2851 }, { "epoch": 1.4158530531193116, "grad_norm": 0.3260338306427002, "learning_rate": 6.345415330681226e-06, "loss": 0.3628, "step": 2852 }, { "epoch": 1.4163494952837994, "grad_norm": 0.3309593200683594, "learning_rate": 6.342632577998648e-06, "loss": 0.3524, "step": 2853 }, { "epoch": 1.4168459374482874, "grad_norm": 0.3439188599586487, "learning_rate": 6.3398493770475445e-06, "loss": 0.3687, "step": 2854 }, { "epoch": 1.417342379612775, "grad_norm": 0.37747159600257874, "learning_rate": 6.337065728757148e-06, "loss": 0.4044, "step": 2855 }, { "epoch": 1.4178388217772628, "grad_norm": 0.3698974847793579, "learning_rate": 6.334281634056845e-06, "loss": 0.3692, "step": 2856 }, { "epoch": 1.4183352639417508, "grad_norm": 0.33319008350372314, "learning_rate": 6.3314970938761664e-06, "loss": 0.3926, "step": 2857 }, { "epoch": 1.4188317061062385, "grad_norm": 0.365546315908432, "learning_rate": 6.328712109144798e-06, "loss": 0.3646, "step": 2858 }, { "epoch": 1.4193281482707265, "grad_norm": 0.3534389138221741, "learning_rate": 6.325926680792567e-06, "loss": 0.3976, "step": 2859 }, { "epoch": 1.4198245904352143, "grad_norm": 0.33420148491859436, "learning_rate": 6.323140809749456e-06, "loss": 0.3808, "step": 2860 }, { "epoch": 1.4203210325997022, "grad_norm": 0.34980642795562744, "learning_rate": 6.320354496945588e-06, "loss": 0.4021, "step": 2861 }, { "epoch": 1.42081747476419, "grad_norm": 0.313525527715683, "learning_rate": 6.31756774331124e-06, "loss": 0.3436, "step": 2862 }, { "epoch": 1.421313916928678, "grad_norm": 0.39931052923202515, "learning_rate": 6.3147805497768314e-06, "loss": 0.4041, "step": 2863 }, { "epoch": 1.4218103590931657, "grad_norm": 0.3738240897655487, "learning_rate": 6.311992917272931e-06, "loss": 0.3904, "step": 2864 }, { "epoch": 1.4223068012576534, "grad_norm": 0.3539249002933502, "learning_rate": 6.309204846730254e-06, "loss": 0.3482, "step": 2865 }, { "epoch": 1.4228032434221414, "grad_norm": 0.39582785964012146, "learning_rate": 6.30641633907966e-06, "loss": 0.4041, "step": 2866 }, { "epoch": 1.4232996855866291, "grad_norm": 0.37715598940849304, "learning_rate": 6.303627395252156e-06, "loss": 0.3626, "step": 2867 }, { "epoch": 1.423796127751117, "grad_norm": 0.3622618615627289, "learning_rate": 6.3008380161788965e-06, "loss": 0.3749, "step": 2868 }, { "epoch": 1.4242925699156048, "grad_norm": 0.3505946099758148, "learning_rate": 6.298048202791179e-06, "loss": 0.3493, "step": 2869 }, { "epoch": 1.4247890120800926, "grad_norm": 0.4069675803184509, "learning_rate": 6.295257956020444e-06, "loss": 0.3958, "step": 2870 }, { "epoch": 1.4252854542445805, "grad_norm": 0.32735905051231384, "learning_rate": 6.2924672767982834e-06, "loss": 0.3578, "step": 2871 }, { "epoch": 1.4257818964090683, "grad_norm": 0.41822341084480286, "learning_rate": 6.2896761660564245e-06, "loss": 0.4205, "step": 2872 }, { "epoch": 1.4262783385735562, "grad_norm": 0.3501371443271637, "learning_rate": 6.286884624726746e-06, "loss": 0.346, "step": 2873 }, { "epoch": 1.426774780738044, "grad_norm": 0.3279564082622528, "learning_rate": 6.284092653741264e-06, "loss": 0.3405, "step": 2874 }, { "epoch": 1.4272712229025317, "grad_norm": 0.3538837432861328, "learning_rate": 6.281300254032148e-06, "loss": 0.4151, "step": 2875 }, { "epoch": 1.4277676650670197, "grad_norm": 0.3285258710384369, "learning_rate": 6.278507426531698e-06, "loss": 0.3446, "step": 2876 }, { "epoch": 1.4282641072315077, "grad_norm": 0.3775372803211212, "learning_rate": 6.275714172172368e-06, "loss": 0.4003, "step": 2877 }, { "epoch": 1.4287605493959954, "grad_norm": 0.3651960492134094, "learning_rate": 6.272920491886748e-06, "loss": 0.4067, "step": 2878 }, { "epoch": 1.4292569915604831, "grad_norm": 0.3137930631637573, "learning_rate": 6.270126386607571e-06, "loss": 0.3525, "step": 2879 }, { "epoch": 1.429753433724971, "grad_norm": 0.42232179641723633, "learning_rate": 6.267331857267716e-06, "loss": 0.4252, "step": 2880 }, { "epoch": 1.4302498758894588, "grad_norm": 0.35042604804039, "learning_rate": 6.264536904800196e-06, "loss": 0.3652, "step": 2881 }, { "epoch": 1.4307463180539468, "grad_norm": 0.31708458065986633, "learning_rate": 6.261741530138172e-06, "loss": 0.3749, "step": 2882 }, { "epoch": 1.4312427602184346, "grad_norm": 0.3693234920501709, "learning_rate": 6.258945734214942e-06, "loss": 0.3773, "step": 2883 }, { "epoch": 1.4317392023829223, "grad_norm": 0.3367914855480194, "learning_rate": 6.25614951796395e-06, "loss": 0.3751, "step": 2884 }, { "epoch": 1.4322356445474103, "grad_norm": 0.3237525522708893, "learning_rate": 6.2533528823187725e-06, "loss": 0.3625, "step": 2885 }, { "epoch": 1.432732086711898, "grad_norm": 0.36445218324661255, "learning_rate": 6.250555828213133e-06, "loss": 0.397, "step": 2886 }, { "epoch": 1.433228528876386, "grad_norm": 0.3331945240497589, "learning_rate": 6.24775835658089e-06, "loss": 0.3921, "step": 2887 }, { "epoch": 1.4337249710408737, "grad_norm": 0.3350929915904999, "learning_rate": 6.244960468356044e-06, "loss": 0.356, "step": 2888 }, { "epoch": 1.4342214132053615, "grad_norm": 0.3392343819141388, "learning_rate": 6.242162164472734e-06, "loss": 0.3452, "step": 2889 }, { "epoch": 1.4347178553698494, "grad_norm": 0.3428051173686981, "learning_rate": 6.239363445865237e-06, "loss": 0.3871, "step": 2890 }, { "epoch": 1.4352142975343374, "grad_norm": 0.3862168788909912, "learning_rate": 6.236564313467969e-06, "loss": 0.4198, "step": 2891 }, { "epoch": 1.4357107396988251, "grad_norm": 0.33523282408714294, "learning_rate": 6.233764768215485e-06, "loss": 0.3534, "step": 2892 }, { "epoch": 1.4362071818633129, "grad_norm": 0.32597506046295166, "learning_rate": 6.230964811042477e-06, "loss": 0.3639, "step": 2893 }, { "epoch": 1.4367036240278008, "grad_norm": 0.42753252387046814, "learning_rate": 6.228164442883775e-06, "loss": 0.4009, "step": 2894 }, { "epoch": 1.4372000661922886, "grad_norm": 0.36421486735343933, "learning_rate": 6.225363664674345e-06, "loss": 0.4274, "step": 2895 }, { "epoch": 1.4376965083567765, "grad_norm": 0.33255842328071594, "learning_rate": 6.22256247734929e-06, "loss": 0.3929, "step": 2896 }, { "epoch": 1.4381929505212643, "grad_norm": 0.4093714654445648, "learning_rate": 6.2197608818438515e-06, "loss": 0.3798, "step": 2897 }, { "epoch": 1.438689392685752, "grad_norm": 0.3758787512779236, "learning_rate": 6.216958879093405e-06, "loss": 0.3782, "step": 2898 }, { "epoch": 1.43918583485024, "grad_norm": 0.3651547431945801, "learning_rate": 6.214156470033467e-06, "loss": 0.4032, "step": 2899 }, { "epoch": 1.4396822770147277, "grad_norm": 0.3982715606689453, "learning_rate": 6.211353655599679e-06, "loss": 0.3951, "step": 2900 }, { "epoch": 1.4401787191792157, "grad_norm": 0.3595404624938965, "learning_rate": 6.208550436727831e-06, "loss": 0.3428, "step": 2901 }, { "epoch": 1.4406751613437034, "grad_norm": 0.3750363886356354, "learning_rate": 6.2057468143538365e-06, "loss": 0.4041, "step": 2902 }, { "epoch": 1.4411716035081912, "grad_norm": 0.3059624135494232, "learning_rate": 6.202942789413753e-06, "loss": 0.3489, "step": 2903 }, { "epoch": 1.4416680456726791, "grad_norm": 0.34897902607917786, "learning_rate": 6.200138362843765e-06, "loss": 0.4332, "step": 2904 }, { "epoch": 1.442164487837167, "grad_norm": 0.3444168269634247, "learning_rate": 6.197333535580196e-06, "loss": 0.3626, "step": 2905 }, { "epoch": 1.4426609300016549, "grad_norm": 0.3786559998989105, "learning_rate": 6.194528308559501e-06, "loss": 0.4096, "step": 2906 }, { "epoch": 1.4431573721661426, "grad_norm": 0.37356144189834595, "learning_rate": 6.191722682718269e-06, "loss": 0.3871, "step": 2907 }, { "epoch": 1.4436538143306306, "grad_norm": 0.3113516569137573, "learning_rate": 6.188916658993223e-06, "loss": 0.3294, "step": 2908 }, { "epoch": 1.4441502564951183, "grad_norm": 0.3453516662120819, "learning_rate": 6.186110238321217e-06, "loss": 0.4192, "step": 2909 }, { "epoch": 1.4446466986596063, "grad_norm": 0.3574410080909729, "learning_rate": 6.18330342163924e-06, "loss": 0.4205, "step": 2910 }, { "epoch": 1.445143140824094, "grad_norm": 0.3147244453430176, "learning_rate": 6.1804962098844105e-06, "loss": 0.3423, "step": 2911 }, { "epoch": 1.4456395829885818, "grad_norm": 0.3694359362125397, "learning_rate": 6.177688603993981e-06, "loss": 0.3921, "step": 2912 }, { "epoch": 1.4461360251530697, "grad_norm": 0.31620487570762634, "learning_rate": 6.174880604905334e-06, "loss": 0.3579, "step": 2913 }, { "epoch": 1.4466324673175575, "grad_norm": 0.36003047227859497, "learning_rate": 6.1720722135559844e-06, "loss": 0.3485, "step": 2914 }, { "epoch": 1.4471289094820454, "grad_norm": 0.37003663182258606, "learning_rate": 6.1692634308835766e-06, "loss": 0.3759, "step": 2915 }, { "epoch": 1.4476253516465332, "grad_norm": 0.3431709408760071, "learning_rate": 6.16645425782589e-06, "loss": 0.3968, "step": 2916 }, { "epoch": 1.448121793811021, "grad_norm": 0.3761070966720581, "learning_rate": 6.163644695320829e-06, "loss": 0.3843, "step": 2917 }, { "epoch": 1.4486182359755089, "grad_norm": 0.3729352056980133, "learning_rate": 6.160834744306429e-06, "loss": 0.3732, "step": 2918 }, { "epoch": 1.4491146781399966, "grad_norm": 0.38135653734207153, "learning_rate": 6.158024405720859e-06, "loss": 0.4179, "step": 2919 }, { "epoch": 1.4496111203044846, "grad_norm": 0.3418825566768646, "learning_rate": 6.155213680502412e-06, "loss": 0.3805, "step": 2920 }, { "epoch": 1.4501075624689723, "grad_norm": 0.3621925413608551, "learning_rate": 6.1524025695895155e-06, "loss": 0.3792, "step": 2921 }, { "epoch": 1.45060400463346, "grad_norm": 0.35340791940689087, "learning_rate": 6.14959107392072e-06, "loss": 0.3666, "step": 2922 }, { "epoch": 1.451100446797948, "grad_norm": 0.39432594180107117, "learning_rate": 6.146779194434711e-06, "loss": 0.3736, "step": 2923 }, { "epoch": 1.451596888962436, "grad_norm": 0.31410521268844604, "learning_rate": 6.143966932070295e-06, "loss": 0.3666, "step": 2924 }, { "epoch": 1.4520933311269237, "grad_norm": 0.38581064343452454, "learning_rate": 6.141154287766413e-06, "loss": 0.3949, "step": 2925 }, { "epoch": 1.4525897732914115, "grad_norm": 0.3800438940525055, "learning_rate": 6.138341262462129e-06, "loss": 0.3792, "step": 2926 }, { "epoch": 1.4530862154558994, "grad_norm": 0.3682078421115875, "learning_rate": 6.135527857096635e-06, "loss": 0.386, "step": 2927 }, { "epoch": 1.4535826576203872, "grad_norm": 0.4016319513320923, "learning_rate": 6.132714072609251e-06, "loss": 0.4041, "step": 2928 }, { "epoch": 1.4540790997848752, "grad_norm": 0.38634297251701355, "learning_rate": 6.1298999099394256e-06, "loss": 0.4034, "step": 2929 }, { "epoch": 1.454575541949363, "grad_norm": 0.37067046761512756, "learning_rate": 6.1270853700267275e-06, "loss": 0.3555, "step": 2930 }, { "epoch": 1.4550719841138506, "grad_norm": 0.3314969539642334, "learning_rate": 6.124270453810858e-06, "loss": 0.3854, "step": 2931 }, { "epoch": 1.4555684262783386, "grad_norm": 0.3817991614341736, "learning_rate": 6.1214551622316385e-06, "loss": 0.3744, "step": 2932 }, { "epoch": 1.4560648684428263, "grad_norm": 0.34042009711265564, "learning_rate": 6.118639496229021e-06, "loss": 0.3788, "step": 2933 }, { "epoch": 1.4565613106073143, "grad_norm": 0.3433757722377777, "learning_rate": 6.115823456743079e-06, "loss": 0.3562, "step": 2934 }, { "epoch": 1.457057752771802, "grad_norm": 0.3502334654331207, "learning_rate": 6.11300704471401e-06, "loss": 0.3899, "step": 2935 }, { "epoch": 1.4575541949362898, "grad_norm": 0.3581174612045288, "learning_rate": 6.11019026108214e-06, "loss": 0.3872, "step": 2936 }, { "epoch": 1.4580506371007778, "grad_norm": 0.32599103450775146, "learning_rate": 6.107373106787914e-06, "loss": 0.3435, "step": 2937 }, { "epoch": 1.4585470792652657, "grad_norm": 0.3789547383785248, "learning_rate": 6.104555582771904e-06, "loss": 0.4058, "step": 2938 }, { "epoch": 1.4590435214297535, "grad_norm": 0.31392183899879456, "learning_rate": 6.101737689974805e-06, "loss": 0.3434, "step": 2939 }, { "epoch": 1.4595399635942412, "grad_norm": 0.32051190733909607, "learning_rate": 6.098919429337436e-06, "loss": 0.3683, "step": 2940 }, { "epoch": 1.4600364057587292, "grad_norm": 0.3608928620815277, "learning_rate": 6.0961008018007365e-06, "loss": 0.4063, "step": 2941 }, { "epoch": 1.460532847923217, "grad_norm": 0.337655246257782, "learning_rate": 6.09328180830577e-06, "loss": 0.3534, "step": 2942 }, { "epoch": 1.4610292900877049, "grad_norm": 0.33858177065849304, "learning_rate": 6.090462449793721e-06, "loss": 0.3653, "step": 2943 }, { "epoch": 1.4615257322521926, "grad_norm": 0.3695468306541443, "learning_rate": 6.0876427272058955e-06, "loss": 0.3884, "step": 2944 }, { "epoch": 1.4620221744166804, "grad_norm": 0.33137598633766174, "learning_rate": 6.084822641483725e-06, "loss": 0.3423, "step": 2945 }, { "epoch": 1.4625186165811683, "grad_norm": 0.42174768447875977, "learning_rate": 6.082002193568759e-06, "loss": 0.4231, "step": 2946 }, { "epoch": 1.463015058745656, "grad_norm": 0.32297462224960327, "learning_rate": 6.079181384402667e-06, "loss": 0.3477, "step": 2947 }, { "epoch": 1.463511500910144, "grad_norm": 0.3544239103794098, "learning_rate": 6.076360214927242e-06, "loss": 0.3998, "step": 2948 }, { "epoch": 1.4640079430746318, "grad_norm": 0.34660378098487854, "learning_rate": 6.0735386860843944e-06, "loss": 0.3723, "step": 2949 }, { "epoch": 1.4645043852391195, "grad_norm": 0.33134743571281433, "learning_rate": 6.070716798816157e-06, "loss": 0.3463, "step": 2950 }, { "epoch": 1.4650008274036075, "grad_norm": 0.36111870408058167, "learning_rate": 6.0678945540646815e-06, "loss": 0.3734, "step": 2951 }, { "epoch": 1.4654972695680955, "grad_norm": 0.35751381516456604, "learning_rate": 6.065071952772238e-06, "loss": 0.4241, "step": 2952 }, { "epoch": 1.4659937117325832, "grad_norm": 0.3243441879749298, "learning_rate": 6.062248995881216e-06, "loss": 0.3721, "step": 2953 }, { "epoch": 1.466490153897071, "grad_norm": 0.3662763833999634, "learning_rate": 6.0594256843341235e-06, "loss": 0.3993, "step": 2954 }, { "epoch": 1.466986596061559, "grad_norm": 0.38777607679367065, "learning_rate": 6.056602019073591e-06, "loss": 0.3747, "step": 2955 }, { "epoch": 1.4674830382260466, "grad_norm": 0.34105995297431946, "learning_rate": 6.05377800104236e-06, "loss": 0.3577, "step": 2956 }, { "epoch": 1.4679794803905346, "grad_norm": 0.3494083881378174, "learning_rate": 6.050953631183295e-06, "loss": 0.3733, "step": 2957 }, { "epoch": 1.4684759225550224, "grad_norm": 0.386098712682724, "learning_rate": 6.048128910439374e-06, "loss": 0.3724, "step": 2958 }, { "epoch": 1.46897236471951, "grad_norm": 0.35566169023513794, "learning_rate": 6.045303839753699e-06, "loss": 0.3545, "step": 2959 }, { "epoch": 1.469468806883998, "grad_norm": 0.3507941663265228, "learning_rate": 6.042478420069481e-06, "loss": 0.3961, "step": 2960 }, { "epoch": 1.4699652490484858, "grad_norm": 0.3627687692642212, "learning_rate": 6.03965265233005e-06, "loss": 0.3784, "step": 2961 }, { "epoch": 1.4704616912129738, "grad_norm": 0.3341659903526306, "learning_rate": 6.036826537478856e-06, "loss": 0.3724, "step": 2962 }, { "epoch": 1.4709581333774615, "grad_norm": 0.37028834223747253, "learning_rate": 6.0340000764594595e-06, "loss": 0.4125, "step": 2963 }, { "epoch": 1.4714545755419493, "grad_norm": 0.3688671290874481, "learning_rate": 6.031173270215541e-06, "loss": 0.3739, "step": 2964 }, { "epoch": 1.4719510177064372, "grad_norm": 0.3865700960159302, "learning_rate": 6.028346119690893e-06, "loss": 0.3977, "step": 2965 }, { "epoch": 1.472447459870925, "grad_norm": 0.35038426518440247, "learning_rate": 6.025518625829425e-06, "loss": 0.4049, "step": 2966 }, { "epoch": 1.472943902035413, "grad_norm": 0.3537037670612335, "learning_rate": 6.022690789575159e-06, "loss": 0.3321, "step": 2967 }, { "epoch": 1.4734403441999007, "grad_norm": 0.4057151973247528, "learning_rate": 6.019862611872234e-06, "loss": 0.3967, "step": 2968 }, { "epoch": 1.4739367863643886, "grad_norm": 0.4113604426383972, "learning_rate": 6.017034093664901e-06, "loss": 0.3569, "step": 2969 }, { "epoch": 1.4744332285288764, "grad_norm": 0.37935009598731995, "learning_rate": 6.014205235897526e-06, "loss": 0.3682, "step": 2970 }, { "epoch": 1.4749296706933643, "grad_norm": 0.38451364636421204, "learning_rate": 6.011376039514587e-06, "loss": 0.3997, "step": 2971 }, { "epoch": 1.475426112857852, "grad_norm": 0.3470827639102936, "learning_rate": 6.008546505460677e-06, "loss": 0.3641, "step": 2972 }, { "epoch": 1.4759225550223398, "grad_norm": 0.39409470558166504, "learning_rate": 6.005716634680499e-06, "loss": 0.3834, "step": 2973 }, { "epoch": 1.4764189971868278, "grad_norm": 0.3897729814052582, "learning_rate": 6.002886428118869e-06, "loss": 0.4196, "step": 2974 }, { "epoch": 1.4769154393513155, "grad_norm": 0.3407471776008606, "learning_rate": 6.000055886720719e-06, "loss": 0.3695, "step": 2975 }, { "epoch": 1.4774118815158035, "grad_norm": 0.31065472960472107, "learning_rate": 5.997225011431089e-06, "loss": 0.3378, "step": 2976 }, { "epoch": 1.4779083236802912, "grad_norm": 0.3393095135688782, "learning_rate": 5.994393803195129e-06, "loss": 0.3674, "step": 2977 }, { "epoch": 1.478404765844779, "grad_norm": 0.3740852177143097, "learning_rate": 5.991562262958105e-06, "loss": 0.3776, "step": 2978 }, { "epoch": 1.478901208009267, "grad_norm": 0.32276469469070435, "learning_rate": 5.9887303916653916e-06, "loss": 0.3381, "step": 2979 }, { "epoch": 1.4793976501737547, "grad_norm": 0.3858112096786499, "learning_rate": 5.985898190262471e-06, "loss": 0.3778, "step": 2980 }, { "epoch": 1.4798940923382427, "grad_norm": 0.3155743479728699, "learning_rate": 5.983065659694942e-06, "loss": 0.3389, "step": 2981 }, { "epoch": 1.4803905345027304, "grad_norm": 0.3499200940132141, "learning_rate": 5.980232800908507e-06, "loss": 0.3547, "step": 2982 }, { "epoch": 1.4808869766672181, "grad_norm": 0.39674508571624756, "learning_rate": 5.97739961484898e-06, "loss": 0.3919, "step": 2983 }, { "epoch": 1.481383418831706, "grad_norm": 0.3472881615161896, "learning_rate": 5.974566102462286e-06, "loss": 0.3732, "step": 2984 }, { "epoch": 1.481879860996194, "grad_norm": 0.3626594543457031, "learning_rate": 5.971732264694458e-06, "loss": 0.3944, "step": 2985 }, { "epoch": 1.4823763031606818, "grad_norm": 0.3251488506793976, "learning_rate": 5.9688981024916355e-06, "loss": 0.3139, "step": 2986 }, { "epoch": 1.4828727453251696, "grad_norm": 0.3670632839202881, "learning_rate": 5.966063616800072e-06, "loss": 0.4141, "step": 2987 }, { "epoch": 1.4833691874896575, "grad_norm": 0.3660675287246704, "learning_rate": 5.9632288085661215e-06, "loss": 0.4062, "step": 2988 }, { "epoch": 1.4838656296541453, "grad_norm": 0.35938867926597595, "learning_rate": 5.960393678736252e-06, "loss": 0.3424, "step": 2989 }, { "epoch": 1.4843620718186332, "grad_norm": 0.3526211977005005, "learning_rate": 5.9575582282570356e-06, "loss": 0.3938, "step": 2990 }, { "epoch": 1.484858513983121, "grad_norm": 0.3610614538192749, "learning_rate": 5.95472245807515e-06, "loss": 0.3919, "step": 2991 }, { "epoch": 1.4853549561476087, "grad_norm": 0.3832826316356659, "learning_rate": 5.951886369137384e-06, "loss": 0.414, "step": 2992 }, { "epoch": 1.4858513983120967, "grad_norm": 0.3894597887992859, "learning_rate": 5.94904996239063e-06, "loss": 0.3796, "step": 2993 }, { "epoch": 1.4863478404765844, "grad_norm": 0.3553735017776489, "learning_rate": 5.946213238781889e-06, "loss": 0.3877, "step": 2994 }, { "epoch": 1.4868442826410724, "grad_norm": 0.3466031551361084, "learning_rate": 5.943376199258264e-06, "loss": 0.3739, "step": 2995 }, { "epoch": 1.4873407248055601, "grad_norm": 0.3763887882232666, "learning_rate": 5.9405388447669655e-06, "loss": 0.4002, "step": 2996 }, { "epoch": 1.4878371669700479, "grad_norm": 0.3309984505176544, "learning_rate": 5.9377011762553075e-06, "loss": 0.4096, "step": 2997 }, { "epoch": 1.4883336091345358, "grad_norm": 0.34648868441581726, "learning_rate": 5.9348631946707135e-06, "loss": 0.3554, "step": 2998 }, { "epoch": 1.4888300512990238, "grad_norm": 0.347830206155777, "learning_rate": 5.932024900960707e-06, "loss": 0.3729, "step": 2999 }, { "epoch": 1.4893264934635115, "grad_norm": 0.36283817887306213, "learning_rate": 5.929186296072915e-06, "loss": 0.3813, "step": 3000 }, { "epoch": 1.4898229356279993, "grad_norm": 0.35699811577796936, "learning_rate": 5.926347380955074e-06, "loss": 0.4223, "step": 3001 }, { "epoch": 1.4903193777924872, "grad_norm": 0.332440048456192, "learning_rate": 5.9235081565550205e-06, "loss": 0.3711, "step": 3002 }, { "epoch": 1.490815819956975, "grad_norm": 0.3381398022174835, "learning_rate": 5.920668623820692e-06, "loss": 0.374, "step": 3003 }, { "epoch": 1.491312262121463, "grad_norm": 0.4021315574645996, "learning_rate": 5.917828783700132e-06, "loss": 0.4071, "step": 3004 }, { "epoch": 1.4918087042859507, "grad_norm": 0.3373064696788788, "learning_rate": 5.914988637141488e-06, "loss": 0.3276, "step": 3005 }, { "epoch": 1.4923051464504384, "grad_norm": 0.36498650908470154, "learning_rate": 5.912148185093004e-06, "loss": 0.4142, "step": 3006 }, { "epoch": 1.4928015886149264, "grad_norm": 0.3179672658443451, "learning_rate": 5.909307428503033e-06, "loss": 0.3358, "step": 3007 }, { "epoch": 1.4932980307794141, "grad_norm": 0.36188995838165283, "learning_rate": 5.906466368320025e-06, "loss": 0.3686, "step": 3008 }, { "epoch": 1.493794472943902, "grad_norm": 0.3565570116043091, "learning_rate": 5.903625005492532e-06, "loss": 0.3897, "step": 3009 }, { "epoch": 1.4942909151083899, "grad_norm": 0.3369789719581604, "learning_rate": 5.9007833409692094e-06, "loss": 0.3642, "step": 3010 }, { "epoch": 1.4947873572728776, "grad_norm": 0.36082717776298523, "learning_rate": 5.897941375698812e-06, "loss": 0.3636, "step": 3011 }, { "epoch": 1.4952837994373656, "grad_norm": 0.34907904267311096, "learning_rate": 5.895099110630193e-06, "loss": 0.3976, "step": 3012 }, { "epoch": 1.4957802416018535, "grad_norm": 0.34870150685310364, "learning_rate": 5.892256546712311e-06, "loss": 0.3491, "step": 3013 }, { "epoch": 1.4962766837663413, "grad_norm": 0.3184872567653656, "learning_rate": 5.889413684894215e-06, "loss": 0.3208, "step": 3014 }, { "epoch": 1.496773125930829, "grad_norm": 0.3335537314414978, "learning_rate": 5.886570526125064e-06, "loss": 0.3969, "step": 3015 }, { "epoch": 1.497269568095317, "grad_norm": 0.33117327094078064, "learning_rate": 5.883727071354109e-06, "loss": 0.3915, "step": 3016 }, { "epoch": 1.4977660102598047, "grad_norm": 0.3423610329627991, "learning_rate": 5.880883321530702e-06, "loss": 0.393, "step": 3017 }, { "epoch": 1.4982624524242927, "grad_norm": 0.3282686769962311, "learning_rate": 5.878039277604298e-06, "loss": 0.3399, "step": 3018 }, { "epoch": 1.4987588945887804, "grad_norm": 0.371090829372406, "learning_rate": 5.875194940524442e-06, "loss": 0.3848, "step": 3019 }, { "epoch": 1.4992553367532682, "grad_norm": 0.3489829897880554, "learning_rate": 5.872350311240782e-06, "loss": 0.3677, "step": 3020 }, { "epoch": 1.4997517789177561, "grad_norm": 0.33766189217567444, "learning_rate": 5.869505390703062e-06, "loss": 0.3692, "step": 3021 }, { "epoch": 1.5002482210822439, "grad_norm": 0.33224913477897644, "learning_rate": 5.866660179861125e-06, "loss": 0.3779, "step": 3022 }, { "epoch": 1.5007446632467318, "grad_norm": 0.3360907733440399, "learning_rate": 5.8638146796649065e-06, "loss": 0.3522, "step": 3023 }, { "epoch": 1.5012411054112196, "grad_norm": 0.35386812686920166, "learning_rate": 5.860968891064445e-06, "loss": 0.4241, "step": 3024 }, { "epoch": 1.5017375475757073, "grad_norm": 0.3544110953807831, "learning_rate": 5.858122815009869e-06, "loss": 0.3767, "step": 3025 }, { "epoch": 1.5022339897401953, "grad_norm": 0.33199235796928406, "learning_rate": 5.8552764524514095e-06, "loss": 0.3247, "step": 3026 }, { "epoch": 1.5027304319046833, "grad_norm": 0.36309394240379333, "learning_rate": 5.852429804339386e-06, "loss": 0.3825, "step": 3027 }, { "epoch": 1.503226874069171, "grad_norm": 0.34117695689201355, "learning_rate": 5.84958287162422e-06, "loss": 0.3512, "step": 3028 }, { "epoch": 1.5037233162336587, "grad_norm": 0.36347532272338867, "learning_rate": 5.846735655256423e-06, "loss": 0.3693, "step": 3029 }, { "epoch": 1.5042197583981465, "grad_norm": 0.35891178250312805, "learning_rate": 5.843888156186604e-06, "loss": 0.4257, "step": 3030 }, { "epoch": 1.5047162005626344, "grad_norm": 0.367597371339798, "learning_rate": 5.841040375365464e-06, "loss": 0.3869, "step": 3031 }, { "epoch": 1.5052126427271224, "grad_norm": 0.3227611780166626, "learning_rate": 5.838192313743802e-06, "loss": 0.3427, "step": 3032 }, { "epoch": 1.5057090848916101, "grad_norm": 0.3685976564884186, "learning_rate": 5.835343972272507e-06, "loss": 0.3714, "step": 3033 }, { "epoch": 1.506205527056098, "grad_norm": 0.33524245023727417, "learning_rate": 5.832495351902563e-06, "loss": 0.3725, "step": 3034 }, { "epoch": 1.5067019692205859, "grad_norm": 0.3445053994655609, "learning_rate": 5.829646453585047e-06, "loss": 0.3506, "step": 3035 }, { "epoch": 1.5071984113850736, "grad_norm": 0.38875406980514526, "learning_rate": 5.826797278271128e-06, "loss": 0.3645, "step": 3036 }, { "epoch": 1.5076948535495616, "grad_norm": 0.34083351492881775, "learning_rate": 5.8239478269120706e-06, "loss": 0.3538, "step": 3037 }, { "epoch": 1.5081912957140493, "grad_norm": 0.34649208188056946, "learning_rate": 5.821098100459226e-06, "loss": 0.3846, "step": 3038 }, { "epoch": 1.508687737878537, "grad_norm": 0.35656973719596863, "learning_rate": 5.818248099864042e-06, "loss": 0.3637, "step": 3039 }, { "epoch": 1.509184180043025, "grad_norm": 0.3615702688694, "learning_rate": 5.815397826078056e-06, "loss": 0.3535, "step": 3040 }, { "epoch": 1.509680622207513, "grad_norm": 0.3697679042816162, "learning_rate": 5.812547280052899e-06, "loss": 0.3823, "step": 3041 }, { "epoch": 1.5101770643720007, "grad_norm": 0.3864224851131439, "learning_rate": 5.809696462740287e-06, "loss": 0.4093, "step": 3042 }, { "epoch": 1.5106735065364885, "grad_norm": 0.36766645312309265, "learning_rate": 5.806845375092033e-06, "loss": 0.4011, "step": 3043 }, { "epoch": 1.5111699487009762, "grad_norm": 0.35174867510795593, "learning_rate": 5.803994018060038e-06, "loss": 0.3746, "step": 3044 }, { "epoch": 1.5116663908654642, "grad_norm": 0.36566808819770813, "learning_rate": 5.801142392596291e-06, "loss": 0.3784, "step": 3045 }, { "epoch": 1.5121628330299521, "grad_norm": 0.4024166762828827, "learning_rate": 5.798290499652873e-06, "loss": 0.3867, "step": 3046 }, { "epoch": 1.5126592751944399, "grad_norm": 0.36055895686149597, "learning_rate": 5.795438340181954e-06, "loss": 0.382, "step": 3047 }, { "epoch": 1.5131557173589276, "grad_norm": 0.3331918716430664, "learning_rate": 5.79258591513579e-06, "loss": 0.3521, "step": 3048 }, { "epoch": 1.5136521595234154, "grad_norm": 0.3627139627933502, "learning_rate": 5.789733225466732e-06, "loss": 0.3722, "step": 3049 }, { "epoch": 1.5141486016879033, "grad_norm": 0.37263768911361694, "learning_rate": 5.786880272127213e-06, "loss": 0.3595, "step": 3050 }, { "epoch": 1.5146450438523913, "grad_norm": 0.3884256184101105, "learning_rate": 5.784027056069757e-06, "loss": 0.3912, "step": 3051 }, { "epoch": 1.515141486016879, "grad_norm": 0.3686161935329437, "learning_rate": 5.781173578246978e-06, "loss": 0.377, "step": 3052 }, { "epoch": 1.5156379281813668, "grad_norm": 0.3690820634365082, "learning_rate": 5.77831983961157e-06, "loss": 0.3636, "step": 3053 }, { "epoch": 1.5161343703458547, "grad_norm": 0.35043543577194214, "learning_rate": 5.775465841116323e-06, "loss": 0.3515, "step": 3054 }, { "epoch": 1.5166308125103427, "grad_norm": 0.3677026033401489, "learning_rate": 5.772611583714106e-06, "loss": 0.3878, "step": 3055 }, { "epoch": 1.5171272546748304, "grad_norm": 0.3418862521648407, "learning_rate": 5.769757068357878e-06, "loss": 0.3423, "step": 3056 }, { "epoch": 1.5176236968393182, "grad_norm": 0.36242857575416565, "learning_rate": 5.766902296000689e-06, "loss": 0.3905, "step": 3057 }, { "epoch": 1.518120139003806, "grad_norm": 0.34097063541412354, "learning_rate": 5.7640472675956664e-06, "loss": 0.3679, "step": 3058 }, { "epoch": 1.518616581168294, "grad_norm": 0.3191186785697937, "learning_rate": 5.761191984096026e-06, "loss": 0.361, "step": 3059 }, { "epoch": 1.5191130233327819, "grad_norm": 0.3360063433647156, "learning_rate": 5.758336446455069e-06, "loss": 0.3792, "step": 3060 }, { "epoch": 1.5196094654972696, "grad_norm": 0.35232624411582947, "learning_rate": 5.755480655626185e-06, "loss": 0.3981, "step": 3061 }, { "epoch": 1.5201059076617573, "grad_norm": 0.35192856192588806, "learning_rate": 5.752624612562841e-06, "loss": 0.3793, "step": 3062 }, { "epoch": 1.520602349826245, "grad_norm": 0.32706886529922485, "learning_rate": 5.749768318218595e-06, "loss": 0.3506, "step": 3063 }, { "epoch": 1.521098791990733, "grad_norm": 0.3673951327800751, "learning_rate": 5.746911773547084e-06, "loss": 0.3784, "step": 3064 }, { "epoch": 1.521595234155221, "grad_norm": 0.3913011848926544, "learning_rate": 5.744054979502035e-06, "loss": 0.3737, "step": 3065 }, { "epoch": 1.5220916763197088, "grad_norm": 0.3444094955921173, "learning_rate": 5.741197937037248e-06, "loss": 0.3893, "step": 3066 }, { "epoch": 1.5225881184841965, "grad_norm": 0.34528735280036926, "learning_rate": 5.738340647106615e-06, "loss": 0.3852, "step": 3067 }, { "epoch": 1.5230845606486845, "grad_norm": 0.3499956727027893, "learning_rate": 5.735483110664107e-06, "loss": 0.3517, "step": 3068 }, { "epoch": 1.5235810028131722, "grad_norm": 0.3539198935031891, "learning_rate": 5.732625328663777e-06, "loss": 0.3685, "step": 3069 }, { "epoch": 1.5240774449776602, "grad_norm": 0.37997204065322876, "learning_rate": 5.729767302059763e-06, "loss": 0.3994, "step": 3070 }, { "epoch": 1.524573887142148, "grad_norm": 0.3710693120956421, "learning_rate": 5.726909031806279e-06, "loss": 0.3556, "step": 3071 }, { "epoch": 1.5250703293066357, "grad_norm": 0.39324697852134705, "learning_rate": 5.724050518857627e-06, "loss": 0.4201, "step": 3072 }, { "epoch": 1.5255667714711236, "grad_norm": 0.3587105870246887, "learning_rate": 5.721191764168183e-06, "loss": 0.3294, "step": 3073 }, { "epoch": 1.5260632136356116, "grad_norm": 0.35794225335121155, "learning_rate": 5.718332768692413e-06, "loss": 0.3863, "step": 3074 }, { "epoch": 1.5265596558000993, "grad_norm": 0.3397541344165802, "learning_rate": 5.715473533384853e-06, "loss": 0.3653, "step": 3075 }, { "epoch": 1.527056097964587, "grad_norm": 0.3643323481082916, "learning_rate": 5.712614059200126e-06, "loss": 0.3716, "step": 3076 }, { "epoch": 1.5275525401290748, "grad_norm": 0.39138728380203247, "learning_rate": 5.709754347092933e-06, "loss": 0.3792, "step": 3077 }, { "epoch": 1.5280489822935628, "grad_norm": 0.38919466733932495, "learning_rate": 5.706894398018053e-06, "loss": 0.3783, "step": 3078 }, { "epoch": 1.5285454244580507, "grad_norm": 0.37220892310142517, "learning_rate": 5.704034212930346e-06, "loss": 0.3765, "step": 3079 }, { "epoch": 1.5290418666225385, "grad_norm": 0.3466111123561859, "learning_rate": 5.7011737927847484e-06, "loss": 0.3394, "step": 3080 }, { "epoch": 1.5295383087870262, "grad_norm": 0.39417901635169983, "learning_rate": 5.69831313853628e-06, "loss": 0.4095, "step": 3081 }, { "epoch": 1.5300347509515142, "grad_norm": 0.3569321036338806, "learning_rate": 5.695452251140034e-06, "loss": 0.3393, "step": 3082 }, { "epoch": 1.530531193116002, "grad_norm": 0.33059704303741455, "learning_rate": 5.692591131551182e-06, "loss": 0.3387, "step": 3083 }, { "epoch": 1.53102763528049, "grad_norm": 0.4266974925994873, "learning_rate": 5.689729780724974e-06, "loss": 0.3444, "step": 3084 }, { "epoch": 1.5315240774449776, "grad_norm": 0.3652665317058563, "learning_rate": 5.68686819961674e-06, "loss": 0.373, "step": 3085 }, { "epoch": 1.5320205196094654, "grad_norm": 0.34187376499176025, "learning_rate": 5.6840063891818795e-06, "loss": 0.3736, "step": 3086 }, { "epoch": 1.5325169617739534, "grad_norm": 0.33691924810409546, "learning_rate": 5.681144350375877e-06, "loss": 0.3493, "step": 3087 }, { "epoch": 1.5330134039384413, "grad_norm": 0.38798490166664124, "learning_rate": 5.678282084154289e-06, "loss": 0.4084, "step": 3088 }, { "epoch": 1.533509846102929, "grad_norm": 0.36075475811958313, "learning_rate": 5.675419591472747e-06, "loss": 0.3653, "step": 3089 }, { "epoch": 1.5340062882674168, "grad_norm": 0.3888585865497589, "learning_rate": 5.672556873286961e-06, "loss": 0.4, "step": 3090 }, { "epoch": 1.5345027304319045, "grad_norm": 0.36284753680229187, "learning_rate": 5.669693930552714e-06, "loss": 0.3594, "step": 3091 }, { "epoch": 1.5349991725963925, "grad_norm": 0.35795196890830994, "learning_rate": 5.6668307642258655e-06, "loss": 0.3852, "step": 3092 }, { "epoch": 1.5354956147608805, "grad_norm": 0.3585965037345886, "learning_rate": 5.663967375262348e-06, "loss": 0.3902, "step": 3093 }, { "epoch": 1.5359920569253682, "grad_norm": 0.38853347301483154, "learning_rate": 5.6611037646181684e-06, "loss": 0.3607, "step": 3094 }, { "epoch": 1.536488499089856, "grad_norm": 0.33547329902648926, "learning_rate": 5.65823993324941e-06, "loss": 0.3473, "step": 3095 }, { "epoch": 1.5369849412543437, "grad_norm": 0.3799256980419159, "learning_rate": 5.655375882112228e-06, "loss": 0.382, "step": 3096 }, { "epoch": 1.5374813834188317, "grad_norm": 0.348420113325119, "learning_rate": 5.652511612162851e-06, "loss": 0.3665, "step": 3097 }, { "epoch": 1.5379778255833196, "grad_norm": 0.3571617603302002, "learning_rate": 5.649647124357582e-06, "loss": 0.4265, "step": 3098 }, { "epoch": 1.5384742677478074, "grad_norm": 0.322875052690506, "learning_rate": 5.646782419652793e-06, "loss": 0.3567, "step": 3099 }, { "epoch": 1.5389707099122951, "grad_norm": 0.4008832573890686, "learning_rate": 5.643917499004934e-06, "loss": 0.3735, "step": 3100 }, { "epoch": 1.539467152076783, "grad_norm": 0.36777162551879883, "learning_rate": 5.641052363370523e-06, "loss": 0.4142, "step": 3101 }, { "epoch": 1.539963594241271, "grad_norm": 0.3263140916824341, "learning_rate": 5.63818701370615e-06, "loss": 0.3653, "step": 3102 }, { "epoch": 1.5404600364057588, "grad_norm": 0.366075724363327, "learning_rate": 5.635321450968476e-06, "loss": 0.4175, "step": 3103 }, { "epoch": 1.5409564785702465, "grad_norm": 0.3554900586605072, "learning_rate": 5.63245567611424e-06, "loss": 0.3711, "step": 3104 }, { "epoch": 1.5414529207347343, "grad_norm": 0.3330858647823334, "learning_rate": 5.629589690100241e-06, "loss": 0.3364, "step": 3105 }, { "epoch": 1.5419493628992222, "grad_norm": 0.34458059072494507, "learning_rate": 5.626723493883357e-06, "loss": 0.4, "step": 3106 }, { "epoch": 1.5424458050637102, "grad_norm": 0.3157341182231903, "learning_rate": 5.623857088420531e-06, "loss": 0.3737, "step": 3107 }, { "epoch": 1.542942247228198, "grad_norm": 0.3749960958957672, "learning_rate": 5.620990474668779e-06, "loss": 0.3862, "step": 3108 }, { "epoch": 1.5434386893926857, "grad_norm": 0.33169299364089966, "learning_rate": 5.618123653585184e-06, "loss": 0.3848, "step": 3109 }, { "epoch": 1.5439351315571734, "grad_norm": 0.31676217913627625, "learning_rate": 5.615256626126903e-06, "loss": 0.3575, "step": 3110 }, { "epoch": 1.5444315737216614, "grad_norm": 0.3449697196483612, "learning_rate": 5.612389393251154e-06, "loss": 0.3619, "step": 3111 }, { "epoch": 1.5449280158861494, "grad_norm": 0.3630078136920929, "learning_rate": 5.609521955915231e-06, "loss": 0.3735, "step": 3112 }, { "epoch": 1.545424458050637, "grad_norm": 0.3631591498851776, "learning_rate": 5.606654315076494e-06, "loss": 0.3691, "step": 3113 }, { "epoch": 1.5459209002151248, "grad_norm": 0.3256611227989197, "learning_rate": 5.6037864716923675e-06, "loss": 0.355, "step": 3114 }, { "epoch": 1.5464173423796128, "grad_norm": 0.2973833382129669, "learning_rate": 5.60091842672035e-06, "loss": 0.3768, "step": 3115 }, { "epoch": 1.5469137845441008, "grad_norm": 0.3653898537158966, "learning_rate": 5.5980501811179996e-06, "loss": 0.4039, "step": 3116 }, { "epoch": 1.5474102267085885, "grad_norm": 0.3613310754299164, "learning_rate": 5.595181735842951e-06, "loss": 0.3498, "step": 3117 }, { "epoch": 1.5479066688730763, "grad_norm": 0.35352781414985657, "learning_rate": 5.592313091852894e-06, "loss": 0.3557, "step": 3118 }, { "epoch": 1.548403111037564, "grad_norm": 0.37554314732551575, "learning_rate": 5.589444250105595e-06, "loss": 0.3777, "step": 3119 }, { "epoch": 1.548899553202052, "grad_norm": 0.3652007281780243, "learning_rate": 5.58657521155888e-06, "loss": 0.3721, "step": 3120 }, { "epoch": 1.54939599536654, "grad_norm": 0.33891308307647705, "learning_rate": 5.583705977170646e-06, "loss": 0.3614, "step": 3121 }, { "epoch": 1.5498924375310277, "grad_norm": 0.3810909390449524, "learning_rate": 5.580836547898849e-06, "loss": 0.3787, "step": 3122 }, { "epoch": 1.5503888796955154, "grad_norm": 0.3268742263317108, "learning_rate": 5.577966924701516e-06, "loss": 0.3579, "step": 3123 }, { "epoch": 1.5508853218600032, "grad_norm": 0.3289795517921448, "learning_rate": 5.575097108536735e-06, "loss": 0.3623, "step": 3124 }, { "epoch": 1.5513817640244911, "grad_norm": 0.4088496267795563, "learning_rate": 5.572227100362658e-06, "loss": 0.3547, "step": 3125 }, { "epoch": 1.551878206188979, "grad_norm": 0.3457057774066925, "learning_rate": 5.569356901137506e-06, "loss": 0.374, "step": 3126 }, { "epoch": 1.5523746483534668, "grad_norm": 0.35174092650413513, "learning_rate": 5.566486511819558e-06, "loss": 0.353, "step": 3127 }, { "epoch": 1.5528710905179546, "grad_norm": 0.36245012283325195, "learning_rate": 5.563615933367161e-06, "loss": 0.3948, "step": 3128 }, { "epoch": 1.5533675326824425, "grad_norm": 0.3202965259552002, "learning_rate": 5.560745166738722e-06, "loss": 0.3636, "step": 3129 }, { "epoch": 1.5538639748469303, "grad_norm": 0.3789304494857788, "learning_rate": 5.557874212892711e-06, "loss": 0.3781, "step": 3130 }, { "epoch": 1.5543604170114182, "grad_norm": 0.35255005955696106, "learning_rate": 5.555003072787664e-06, "loss": 0.3787, "step": 3131 }, { "epoch": 1.554856859175906, "grad_norm": 0.3433764576911926, "learning_rate": 5.552131747382174e-06, "loss": 0.3832, "step": 3132 }, { "epoch": 1.5553533013403937, "grad_norm": 0.3490036725997925, "learning_rate": 5.5492602376349e-06, "loss": 0.3629, "step": 3133 }, { "epoch": 1.5558497435048817, "grad_norm": 0.3664191961288452, "learning_rate": 5.5463885445045605e-06, "loss": 0.4244, "step": 3134 }, { "epoch": 1.5563461856693697, "grad_norm": 0.36216917634010315, "learning_rate": 5.543516668949935e-06, "loss": 0.382, "step": 3135 }, { "epoch": 1.5568426278338574, "grad_norm": 0.3688444197177887, "learning_rate": 5.540644611929869e-06, "loss": 0.4017, "step": 3136 }, { "epoch": 1.5573390699983451, "grad_norm": 0.3256170153617859, "learning_rate": 5.5377723744032585e-06, "loss": 0.3691, "step": 3137 }, { "epoch": 1.557835512162833, "grad_norm": 0.318068265914917, "learning_rate": 5.534899957329067e-06, "loss": 0.3552, "step": 3138 }, { "epoch": 1.5583319543273209, "grad_norm": 0.39697158336639404, "learning_rate": 5.53202736166632e-06, "loss": 0.3857, "step": 3139 }, { "epoch": 1.5588283964918088, "grad_norm": 0.3719029426574707, "learning_rate": 5.529154588374096e-06, "loss": 0.398, "step": 3140 }, { "epoch": 1.5593248386562966, "grad_norm": 0.32672232389450073, "learning_rate": 5.526281638411537e-06, "loss": 0.3378, "step": 3141 }, { "epoch": 1.5598212808207843, "grad_norm": 0.38625675439834595, "learning_rate": 5.523408512737841e-06, "loss": 0.3789, "step": 3142 }, { "epoch": 1.5603177229852723, "grad_norm": 0.34263771772384644, "learning_rate": 5.520535212312268e-06, "loss": 0.3435, "step": 3143 }, { "epoch": 1.56081416514976, "grad_norm": 0.36130988597869873, "learning_rate": 5.5176617380941355e-06, "loss": 0.3924, "step": 3144 }, { "epoch": 1.561310607314248, "grad_norm": 0.3535846769809723, "learning_rate": 5.514788091042819e-06, "loss": 0.3949, "step": 3145 }, { "epoch": 1.5618070494787357, "grad_norm": 0.33294177055358887, "learning_rate": 5.511914272117748e-06, "loss": 0.3754, "step": 3146 }, { "epoch": 1.5623034916432235, "grad_norm": 0.3727819621562958, "learning_rate": 5.5090402822784175e-06, "loss": 0.4076, "step": 3147 }, { "epoch": 1.5627999338077114, "grad_norm": 0.3770501911640167, "learning_rate": 5.506166122484369e-06, "loss": 0.4007, "step": 3148 }, { "epoch": 1.5632963759721994, "grad_norm": 0.3363784849643707, "learning_rate": 5.503291793695211e-06, "loss": 0.3687, "step": 3149 }, { "epoch": 1.5637928181366871, "grad_norm": 0.3209868371486664, "learning_rate": 5.500417296870599e-06, "loss": 0.3515, "step": 3150 }, { "epoch": 1.5642892603011749, "grad_norm": 0.3727284073829651, "learning_rate": 5.497542632970255e-06, "loss": 0.3647, "step": 3151 }, { "epoch": 1.5647857024656626, "grad_norm": 0.35282641649246216, "learning_rate": 5.494667802953947e-06, "loss": 0.3705, "step": 3152 }, { "epoch": 1.5652821446301506, "grad_norm": 0.35238105058670044, "learning_rate": 5.4917928077815034e-06, "loss": 0.3881, "step": 3153 }, { "epoch": 1.5657785867946385, "grad_norm": 0.35376375913619995, "learning_rate": 5.488917648412809e-06, "loss": 0.3421, "step": 3154 }, { "epoch": 1.5662750289591263, "grad_norm": 0.3933814465999603, "learning_rate": 5.486042325807799e-06, "loss": 0.4042, "step": 3155 }, { "epoch": 1.566771471123614, "grad_norm": 0.324434369802475, "learning_rate": 5.483166840926467e-06, "loss": 0.3646, "step": 3156 }, { "epoch": 1.5672679132881018, "grad_norm": 0.3423956036567688, "learning_rate": 5.480291194728857e-06, "loss": 0.3628, "step": 3157 }, { "epoch": 1.5677643554525897, "grad_norm": 0.3716980516910553, "learning_rate": 5.477415388175071e-06, "loss": 0.3954, "step": 3158 }, { "epoch": 1.5682607976170777, "grad_norm": 0.3741329312324524, "learning_rate": 5.474539422225263e-06, "loss": 0.3698, "step": 3159 }, { "epoch": 1.5687572397815654, "grad_norm": 0.34008151292800903, "learning_rate": 5.47166329783964e-06, "loss": 0.408, "step": 3160 }, { "epoch": 1.5692536819460532, "grad_norm": 0.3825189173221588, "learning_rate": 5.4687870159784595e-06, "loss": 0.3565, "step": 3161 }, { "epoch": 1.5697501241105412, "grad_norm": 0.3247777223587036, "learning_rate": 5.465910577602037e-06, "loss": 0.3338, "step": 3162 }, { "epoch": 1.5702465662750291, "grad_norm": 0.34523534774780273, "learning_rate": 5.463033983670733e-06, "loss": 0.3617, "step": 3163 }, { "epoch": 1.5707430084395169, "grad_norm": 0.35328808426856995, "learning_rate": 5.4601572351449695e-06, "loss": 0.3841, "step": 3164 }, { "epoch": 1.5712394506040046, "grad_norm": 0.33838966488838196, "learning_rate": 5.457280332985209e-06, "loss": 0.3817, "step": 3165 }, { "epoch": 1.5717358927684923, "grad_norm": 0.3762112855911255, "learning_rate": 5.454403278151974e-06, "loss": 0.398, "step": 3166 }, { "epoch": 1.5722323349329803, "grad_norm": 0.3341659605503082, "learning_rate": 5.451526071605835e-06, "loss": 0.3875, "step": 3167 }, { "epoch": 1.5727287770974683, "grad_norm": 0.33661949634552, "learning_rate": 5.44864871430741e-06, "loss": 0.3601, "step": 3168 }, { "epoch": 1.573225219261956, "grad_norm": 0.3401913642883301, "learning_rate": 5.445771207217377e-06, "loss": 0.3623, "step": 3169 }, { "epoch": 1.5737216614264438, "grad_norm": 0.3585369288921356, "learning_rate": 5.4428935512964505e-06, "loss": 0.3653, "step": 3170 }, { "epoch": 1.5742181035909315, "grad_norm": 0.37264809012413025, "learning_rate": 5.440015747505406e-06, "loss": 0.3868, "step": 3171 }, { "epoch": 1.5747145457554195, "grad_norm": 0.3346083164215088, "learning_rate": 5.437137796805062e-06, "loss": 0.3621, "step": 3172 }, { "epoch": 1.5752109879199074, "grad_norm": 0.3486453890800476, "learning_rate": 5.434259700156288e-06, "loss": 0.3754, "step": 3173 }, { "epoch": 1.5757074300843952, "grad_norm": 0.33752235770225525, "learning_rate": 5.431381458520002e-06, "loss": 0.3847, "step": 3174 }, { "epoch": 1.576203872248883, "grad_norm": 0.34750914573669434, "learning_rate": 5.428503072857172e-06, "loss": 0.3355, "step": 3175 }, { "epoch": 1.5767003144133709, "grad_norm": 0.3550160527229309, "learning_rate": 5.425624544128813e-06, "loss": 0.3839, "step": 3176 }, { "epoch": 1.5771967565778586, "grad_norm": 0.33781370520591736, "learning_rate": 5.422745873295985e-06, "loss": 0.3706, "step": 3177 }, { "epoch": 1.5776931987423466, "grad_norm": 0.33562329411506653, "learning_rate": 5.4198670613198e-06, "loss": 0.393, "step": 3178 }, { "epoch": 1.5781896409068343, "grad_norm": 0.35896775126457214, "learning_rate": 5.416988109161414e-06, "loss": 0.3832, "step": 3179 }, { "epoch": 1.578686083071322, "grad_norm": 0.3671322762966156, "learning_rate": 5.414109017782033e-06, "loss": 0.4223, "step": 3180 }, { "epoch": 1.57918252523581, "grad_norm": 0.31579792499542236, "learning_rate": 5.411229788142905e-06, "loss": 0.3303, "step": 3181 }, { "epoch": 1.579678967400298, "grad_norm": 0.4144510328769684, "learning_rate": 5.408350421205326e-06, "loss": 0.4229, "step": 3182 }, { "epoch": 1.5801754095647857, "grad_norm": 0.35060134530067444, "learning_rate": 5.405470917930641e-06, "loss": 0.3635, "step": 3183 }, { "epoch": 1.5806718517292735, "grad_norm": 0.3237612545490265, "learning_rate": 5.4025912792802374e-06, "loss": 0.3761, "step": 3184 }, { "epoch": 1.5811682938937612, "grad_norm": 0.3505896031856537, "learning_rate": 5.3997115062155455e-06, "loss": 0.3706, "step": 3185 }, { "epoch": 1.5816647360582492, "grad_norm": 0.39312610030174255, "learning_rate": 5.396831599698048e-06, "loss": 0.3579, "step": 3186 }, { "epoch": 1.5821611782227372, "grad_norm": 0.36531656980514526, "learning_rate": 5.393951560689262e-06, "loss": 0.3833, "step": 3187 }, { "epoch": 1.582657620387225, "grad_norm": 0.33672335743904114, "learning_rate": 5.39107139015076e-06, "loss": 0.3519, "step": 3188 }, { "epoch": 1.5831540625517126, "grad_norm": 0.3199464678764343, "learning_rate": 5.388191089044146e-06, "loss": 0.3284, "step": 3189 }, { "epoch": 1.5836505047162006, "grad_norm": 0.39031845331192017, "learning_rate": 5.385310658331079e-06, "loss": 0.409, "step": 3190 }, { "epoch": 1.5841469468806884, "grad_norm": 0.3640403747558594, "learning_rate": 5.382430098973256e-06, "loss": 0.4059, "step": 3191 }, { "epoch": 1.5846433890451763, "grad_norm": 0.3176769018173218, "learning_rate": 5.379549411932417e-06, "loss": 0.3564, "step": 3192 }, { "epoch": 1.585139831209664, "grad_norm": 0.3610278069972992, "learning_rate": 5.376668598170344e-06, "loss": 0.3651, "step": 3193 }, { "epoch": 1.5856362733741518, "grad_norm": 0.3300880491733551, "learning_rate": 5.373787658648864e-06, "loss": 0.3267, "step": 3194 }, { "epoch": 1.5861327155386398, "grad_norm": 0.4169624149799347, "learning_rate": 5.370906594329844e-06, "loss": 0.4173, "step": 3195 }, { "epoch": 1.5866291577031277, "grad_norm": 0.34117940068244934, "learning_rate": 5.368025406175191e-06, "loss": 0.3557, "step": 3196 }, { "epoch": 1.5871255998676155, "grad_norm": 0.34751197695732117, "learning_rate": 5.365144095146858e-06, "loss": 0.3702, "step": 3197 }, { "epoch": 1.5876220420321032, "grad_norm": 0.3579760491847992, "learning_rate": 5.362262662206837e-06, "loss": 0.3829, "step": 3198 }, { "epoch": 1.588118484196591, "grad_norm": 0.3592597246170044, "learning_rate": 5.359381108317159e-06, "loss": 0.3752, "step": 3199 }, { "epoch": 1.588614926361079, "grad_norm": 0.3703460097312927, "learning_rate": 5.3564994344398944e-06, "loss": 0.3993, "step": 3200 }, { "epoch": 1.589111368525567, "grad_norm": 0.31839779019355774, "learning_rate": 5.35361764153716e-06, "loss": 0.3364, "step": 3201 }, { "epoch": 1.5896078106900546, "grad_norm": 0.39757728576660156, "learning_rate": 5.350735730571104e-06, "loss": 0.3981, "step": 3202 }, { "epoch": 1.5901042528545424, "grad_norm": 0.3455289304256439, "learning_rate": 5.347853702503921e-06, "loss": 0.3599, "step": 3203 }, { "epoch": 1.5906006950190301, "grad_norm": 0.32683083415031433, "learning_rate": 5.344971558297841e-06, "loss": 0.3593, "step": 3204 }, { "epoch": 1.591097137183518, "grad_norm": 0.3645009398460388, "learning_rate": 5.342089298915133e-06, "loss": 0.373, "step": 3205 }, { "epoch": 1.591593579348006, "grad_norm": 0.35308584570884705, "learning_rate": 5.339206925318106e-06, "loss": 0.3814, "step": 3206 }, { "epoch": 1.5920900215124938, "grad_norm": 0.35690411925315857, "learning_rate": 5.336324438469104e-06, "loss": 0.4091, "step": 3207 }, { "epoch": 1.5925864636769815, "grad_norm": 0.3432982265949249, "learning_rate": 5.333441839330515e-06, "loss": 0.3323, "step": 3208 }, { "epoch": 1.5930829058414695, "grad_norm": 0.36141109466552734, "learning_rate": 5.330559128864757e-06, "loss": 0.4009, "step": 3209 }, { "epoch": 1.5935793480059575, "grad_norm": 0.36805808544158936, "learning_rate": 5.327676308034292e-06, "loss": 0.3543, "step": 3210 }, { "epoch": 1.5940757901704452, "grad_norm": 0.3601503074169159, "learning_rate": 5.324793377801611e-06, "loss": 0.3892, "step": 3211 }, { "epoch": 1.594572232334933, "grad_norm": 0.3764282763004303, "learning_rate": 5.321910339129251e-06, "loss": 0.4006, "step": 3212 }, { "epoch": 1.5950686744994207, "grad_norm": 0.35506245493888855, "learning_rate": 5.3190271929797755e-06, "loss": 0.3678, "step": 3213 }, { "epoch": 1.5955651166639087, "grad_norm": 0.3671167492866516, "learning_rate": 5.316143940315792e-06, "loss": 0.3662, "step": 3214 }, { "epoch": 1.5960615588283966, "grad_norm": 0.3562886416912079, "learning_rate": 5.313260582099938e-06, "loss": 0.3604, "step": 3215 }, { "epoch": 1.5965580009928844, "grad_norm": 0.35370367765426636, "learning_rate": 5.310377119294892e-06, "loss": 0.3947, "step": 3216 }, { "epoch": 1.597054443157372, "grad_norm": 0.32446590065956116, "learning_rate": 5.307493552863359e-06, "loss": 0.3586, "step": 3217 }, { "epoch": 1.5975508853218598, "grad_norm": 0.38326525688171387, "learning_rate": 5.304609883768088e-06, "loss": 0.4197, "step": 3218 }, { "epoch": 1.5980473274863478, "grad_norm": 0.3793010711669922, "learning_rate": 5.3017261129718545e-06, "loss": 0.341, "step": 3219 }, { "epoch": 1.5985437696508358, "grad_norm": 0.3819548785686493, "learning_rate": 5.298842241437473e-06, "loss": 0.364, "step": 3220 }, { "epoch": 1.5990402118153235, "grad_norm": 0.3570018708705902, "learning_rate": 5.295958270127787e-06, "loss": 0.3651, "step": 3221 }, { "epoch": 1.5995366539798113, "grad_norm": 0.3665010333061218, "learning_rate": 5.293074200005679e-06, "loss": 0.3956, "step": 3222 }, { "epoch": 1.6000330961442992, "grad_norm": 0.36976221203804016, "learning_rate": 5.290190032034063e-06, "loss": 0.3551, "step": 3223 }, { "epoch": 1.6005295383087872, "grad_norm": 0.3653942048549652, "learning_rate": 5.287305767175881e-06, "loss": 0.3375, "step": 3224 }, { "epoch": 1.601025980473275, "grad_norm": 0.38039904832839966, "learning_rate": 5.284421406394112e-06, "loss": 0.4162, "step": 3225 }, { "epoch": 1.6015224226377627, "grad_norm": 0.3587985932826996, "learning_rate": 5.281536950651765e-06, "loss": 0.3552, "step": 3226 }, { "epoch": 1.6020188648022504, "grad_norm": 0.3852309584617615, "learning_rate": 5.2786524009118836e-06, "loss": 0.3712, "step": 3227 }, { "epoch": 1.6025153069667384, "grad_norm": 0.3894484043121338, "learning_rate": 5.2757677581375375e-06, "loss": 0.357, "step": 3228 }, { "epoch": 1.6030117491312263, "grad_norm": 0.36143144965171814, "learning_rate": 5.2728830232918315e-06, "loss": 0.3691, "step": 3229 }, { "epoch": 1.603508191295714, "grad_norm": 0.360694944858551, "learning_rate": 5.269998197337901e-06, "loss": 0.3668, "step": 3230 }, { "epoch": 1.6040046334602018, "grad_norm": 0.3750160038471222, "learning_rate": 5.267113281238912e-06, "loss": 0.3858, "step": 3231 }, { "epoch": 1.6045010756246896, "grad_norm": 0.3537858724594116, "learning_rate": 5.264228275958056e-06, "loss": 0.4003, "step": 3232 }, { "epoch": 1.6049975177891775, "grad_norm": 0.3203774690628052, "learning_rate": 5.261343182458562e-06, "loss": 0.3428, "step": 3233 }, { "epoch": 1.6054939599536655, "grad_norm": 0.36464717984199524, "learning_rate": 5.25845800170368e-06, "loss": 0.3466, "step": 3234 }, { "epoch": 1.6059904021181532, "grad_norm": 0.3537604510784149, "learning_rate": 5.255572734656697e-06, "loss": 0.3883, "step": 3235 }, { "epoch": 1.606486844282641, "grad_norm": 0.3819637596607208, "learning_rate": 5.252687382280924e-06, "loss": 0.3699, "step": 3236 }, { "epoch": 1.606983286447129, "grad_norm": 0.3934093117713928, "learning_rate": 5.249801945539701e-06, "loss": 0.3671, "step": 3237 }, { "epoch": 1.6074797286116167, "grad_norm": 0.34045475721359253, "learning_rate": 5.246916425396398e-06, "loss": 0.3529, "step": 3238 }, { "epoch": 1.6079761707761047, "grad_norm": 0.3964216113090515, "learning_rate": 5.244030822814411e-06, "loss": 0.4076, "step": 3239 }, { "epoch": 1.6084726129405924, "grad_norm": 0.3532741665840149, "learning_rate": 5.241145138757167e-06, "loss": 0.3644, "step": 3240 }, { "epoch": 1.6089690551050801, "grad_norm": 0.3872212767601013, "learning_rate": 5.238259374188113e-06, "loss": 0.3834, "step": 3241 }, { "epoch": 1.6094654972695681, "grad_norm": 0.36223113536834717, "learning_rate": 5.23537353007073e-06, "loss": 0.39, "step": 3242 }, { "epoch": 1.609961939434056, "grad_norm": 0.37415647506713867, "learning_rate": 5.232487607368522e-06, "loss": 0.3801, "step": 3243 }, { "epoch": 1.6104583815985438, "grad_norm": 0.340777724981308, "learning_rate": 5.229601607045021e-06, "loss": 0.3501, "step": 3244 }, { "epoch": 1.6109548237630316, "grad_norm": 0.3944135904312134, "learning_rate": 5.226715530063782e-06, "loss": 0.3674, "step": 3245 }, { "epoch": 1.6114512659275193, "grad_norm": 0.3445008397102356, "learning_rate": 5.223829377388392e-06, "loss": 0.3945, "step": 3246 }, { "epoch": 1.6119477080920073, "grad_norm": 0.3361579179763794, "learning_rate": 5.220943149982455e-06, "loss": 0.3661, "step": 3247 }, { "epoch": 1.6124441502564952, "grad_norm": 0.32979273796081543, "learning_rate": 5.218056848809604e-06, "loss": 0.3816, "step": 3248 }, { "epoch": 1.612940592420983, "grad_norm": 0.3370720148086548, "learning_rate": 5.2151704748335e-06, "loss": 0.3744, "step": 3249 }, { "epoch": 1.6134370345854707, "grad_norm": 0.3308779299259186, "learning_rate": 5.21228402901782e-06, "loss": 0.3414, "step": 3250 }, { "epoch": 1.6139334767499587, "grad_norm": 0.33596354722976685, "learning_rate": 5.2093975123262745e-06, "loss": 0.3389, "step": 3251 }, { "epoch": 1.6144299189144464, "grad_norm": 0.3650726079940796, "learning_rate": 5.20651092572259e-06, "loss": 0.4106, "step": 3252 }, { "epoch": 1.6149263610789344, "grad_norm": 0.3257867097854614, "learning_rate": 5.2036242701705185e-06, "loss": 0.3769, "step": 3253 }, { "epoch": 1.6154228032434221, "grad_norm": 0.38486120104789734, "learning_rate": 5.200737546633839e-06, "loss": 0.378, "step": 3254 }, { "epoch": 1.6159192454079099, "grad_norm": 0.38446488976478577, "learning_rate": 5.197850756076348e-06, "loss": 0.3449, "step": 3255 }, { "epoch": 1.6164156875723978, "grad_norm": 0.39681515097618103, "learning_rate": 5.1949638994618666e-06, "loss": 0.4054, "step": 3256 }, { "epoch": 1.6169121297368858, "grad_norm": 0.3533380329608917, "learning_rate": 5.192076977754239e-06, "loss": 0.3343, "step": 3257 }, { "epoch": 1.6174085719013735, "grad_norm": 0.38695040345191956, "learning_rate": 5.189189991917328e-06, "loss": 0.3861, "step": 3258 }, { "epoch": 1.6179050140658613, "grad_norm": 0.3472655415534973, "learning_rate": 5.186302942915021e-06, "loss": 0.3863, "step": 3259 }, { "epoch": 1.618401456230349, "grad_norm": 0.32558387517929077, "learning_rate": 5.1834158317112245e-06, "loss": 0.3382, "step": 3260 }, { "epoch": 1.618897898394837, "grad_norm": 0.3430306315422058, "learning_rate": 5.180528659269867e-06, "loss": 0.3991, "step": 3261 }, { "epoch": 1.619394340559325, "grad_norm": 0.3358527421951294, "learning_rate": 5.177641426554896e-06, "loss": 0.3469, "step": 3262 }, { "epoch": 1.6198907827238127, "grad_norm": 0.3329887390136719, "learning_rate": 5.174754134530281e-06, "loss": 0.3492, "step": 3263 }, { "epoch": 1.6203872248883004, "grad_norm": 0.37512484192848206, "learning_rate": 5.1718667841600115e-06, "loss": 0.4246, "step": 3264 }, { "epoch": 1.6208836670527882, "grad_norm": 0.34337636828422546, "learning_rate": 5.168979376408092e-06, "loss": 0.3706, "step": 3265 }, { "epoch": 1.6213801092172762, "grad_norm": 0.3612939417362213, "learning_rate": 5.166091912238552e-06, "loss": 0.3751, "step": 3266 }, { "epoch": 1.6218765513817641, "grad_norm": 0.3290347158908844, "learning_rate": 5.163204392615436e-06, "loss": 0.3526, "step": 3267 }, { "epoch": 1.6223729935462519, "grad_norm": 0.3361402451992035, "learning_rate": 5.16031681850281e-06, "loss": 0.3871, "step": 3268 }, { "epoch": 1.6228694357107396, "grad_norm": 0.34763115644454956, "learning_rate": 5.157429190864755e-06, "loss": 0.3913, "step": 3269 }, { "epoch": 1.6233658778752276, "grad_norm": 0.32698097825050354, "learning_rate": 5.154541510665372e-06, "loss": 0.3932, "step": 3270 }, { "epoch": 1.6238623200397155, "grad_norm": 0.3163682818412781, "learning_rate": 5.151653778868778e-06, "loss": 0.3744, "step": 3271 }, { "epoch": 1.6243587622042033, "grad_norm": 0.34879711270332336, "learning_rate": 5.14876599643911e-06, "loss": 0.3983, "step": 3272 }, { "epoch": 1.624855204368691, "grad_norm": 0.34125587344169617, "learning_rate": 5.145878164340518e-06, "loss": 0.3623, "step": 3273 }, { "epoch": 1.6253516465331788, "grad_norm": 0.3417586088180542, "learning_rate": 5.142990283537174e-06, "loss": 0.4109, "step": 3274 }, { "epoch": 1.6258480886976667, "grad_norm": 0.3235298991203308, "learning_rate": 5.140102354993258e-06, "loss": 0.3564, "step": 3275 }, { "epoch": 1.6263445308621547, "grad_norm": 0.4063067138195038, "learning_rate": 5.137214379672975e-06, "loss": 0.4026, "step": 3276 }, { "epoch": 1.6268409730266424, "grad_norm": 0.3544106185436249, "learning_rate": 5.134326358540538e-06, "loss": 0.3602, "step": 3277 }, { "epoch": 1.6273374151911302, "grad_norm": 0.34188076853752136, "learning_rate": 5.131438292560181e-06, "loss": 0.3657, "step": 3278 }, { "epoch": 1.627833857355618, "grad_norm": 0.3480246365070343, "learning_rate": 5.128550182696153e-06, "loss": 0.3892, "step": 3279 }, { "epoch": 1.6283302995201059, "grad_norm": 0.34001439809799194, "learning_rate": 5.12566202991271e-06, "loss": 0.3538, "step": 3280 }, { "epoch": 1.6288267416845938, "grad_norm": 0.3888556957244873, "learning_rate": 5.1227738351741326e-06, "loss": 0.3897, "step": 3281 }, { "epoch": 1.6293231838490816, "grad_norm": 0.3512042164802551, "learning_rate": 5.119885599444707e-06, "loss": 0.3656, "step": 3282 }, { "epoch": 1.6298196260135693, "grad_norm": 0.37370598316192627, "learning_rate": 5.1169973236887394e-06, "loss": 0.4193, "step": 3283 }, { "epoch": 1.6303160681780573, "grad_norm": 0.4145722985267639, "learning_rate": 5.1141090088705436e-06, "loss": 0.3801, "step": 3284 }, { "epoch": 1.630812510342545, "grad_norm": 0.34890708327293396, "learning_rate": 5.111220655954452e-06, "loss": 0.3568, "step": 3285 }, { "epoch": 1.631308952507033, "grad_norm": 0.3654918372631073, "learning_rate": 5.108332265904805e-06, "loss": 0.3937, "step": 3286 }, { "epoch": 1.6318053946715207, "grad_norm": 0.3874290883541107, "learning_rate": 5.105443839685961e-06, "loss": 0.4139, "step": 3287 }, { "epoch": 1.6323018368360085, "grad_norm": 0.344882607460022, "learning_rate": 5.102555378262283e-06, "loss": 0.3694, "step": 3288 }, { "epoch": 1.6327982790004965, "grad_norm": 0.32451745867729187, "learning_rate": 5.099666882598152e-06, "loss": 0.3533, "step": 3289 }, { "epoch": 1.6332947211649844, "grad_norm": 0.3523874878883362, "learning_rate": 5.096778353657957e-06, "loss": 0.3557, "step": 3290 }, { "epoch": 1.6337911633294722, "grad_norm": 0.3664550483226776, "learning_rate": 5.093889792406101e-06, "loss": 0.3621, "step": 3291 }, { "epoch": 1.63428760549396, "grad_norm": 0.3381751775741577, "learning_rate": 5.091001199806994e-06, "loss": 0.4171, "step": 3292 }, { "epoch": 1.6347840476584476, "grad_norm": 0.3026443123817444, "learning_rate": 5.08811257682506e-06, "loss": 0.3181, "step": 3293 }, { "epoch": 1.6352804898229356, "grad_norm": 0.3276064693927765, "learning_rate": 5.085223924424733e-06, "loss": 0.3714, "step": 3294 }, { "epoch": 1.6357769319874236, "grad_norm": 0.32386428117752075, "learning_rate": 5.082335243570452e-06, "loss": 0.3765, "step": 3295 }, { "epoch": 1.6362733741519113, "grad_norm": 0.36304986476898193, "learning_rate": 5.079446535226673e-06, "loss": 0.4009, "step": 3296 }, { "epoch": 1.636769816316399, "grad_norm": 0.35683971643447876, "learning_rate": 5.076557800357853e-06, "loss": 0.3699, "step": 3297 }, { "epoch": 1.637266258480887, "grad_norm": 0.32567059993743896, "learning_rate": 5.073669039928466e-06, "loss": 0.3822, "step": 3298 }, { "epoch": 1.6377627006453748, "grad_norm": 0.3162446618080139, "learning_rate": 5.0707802549029875e-06, "loss": 0.357, "step": 3299 }, { "epoch": 1.6382591428098627, "grad_norm": 0.3364218771457672, "learning_rate": 5.067891446245905e-06, "loss": 0.3174, "step": 3300 }, { "epoch": 1.6387555849743505, "grad_norm": 0.3948424160480499, "learning_rate": 5.0650026149217135e-06, "loss": 0.4268, "step": 3301 }, { "epoch": 1.6392520271388382, "grad_norm": 0.35162192583084106, "learning_rate": 5.062113761894918e-06, "loss": 0.3492, "step": 3302 }, { "epoch": 1.6397484693033262, "grad_norm": 0.38928622007369995, "learning_rate": 5.059224888130023e-06, "loss": 0.3897, "step": 3303 }, { "epoch": 1.6402449114678141, "grad_norm": 0.32670357823371887, "learning_rate": 5.056335994591549e-06, "loss": 0.3372, "step": 3304 }, { "epoch": 1.640741353632302, "grad_norm": 0.35591381788253784, "learning_rate": 5.0534470822440176e-06, "loss": 0.3929, "step": 3305 }, { "epoch": 1.6412377957967896, "grad_norm": 0.3490215539932251, "learning_rate": 5.050558152051957e-06, "loss": 0.3843, "step": 3306 }, { "epoch": 1.6417342379612774, "grad_norm": 0.36331266164779663, "learning_rate": 5.047669204979906e-06, "loss": 0.4278, "step": 3307 }, { "epoch": 1.6422306801257653, "grad_norm": 0.33368322253227234, "learning_rate": 5.0447802419924e-06, "loss": 0.3779, "step": 3308 }, { "epoch": 1.6427271222902533, "grad_norm": 0.33432644605636597, "learning_rate": 5.0418912640539895e-06, "loss": 0.3612, "step": 3309 }, { "epoch": 1.643223564454741, "grad_norm": 0.35099583864212036, "learning_rate": 5.039002272129224e-06, "loss": 0.3615, "step": 3310 }, { "epoch": 1.6437200066192288, "grad_norm": 0.31732651591300964, "learning_rate": 5.036113267182661e-06, "loss": 0.335, "step": 3311 }, { "epoch": 1.6442164487837165, "grad_norm": 0.3211580812931061, "learning_rate": 5.033224250178859e-06, "loss": 0.3642, "step": 3312 }, { "epoch": 1.6447128909482045, "grad_norm": 0.36307382583618164, "learning_rate": 5.030335222082383e-06, "loss": 0.3691, "step": 3313 }, { "epoch": 1.6452093331126925, "grad_norm": 0.313123881816864, "learning_rate": 5.0274461838578e-06, "loss": 0.335, "step": 3314 }, { "epoch": 1.6457057752771802, "grad_norm": 0.3448035418987274, "learning_rate": 5.024557136469682e-06, "loss": 0.3759, "step": 3315 }, { "epoch": 1.646202217441668, "grad_norm": 0.32130730152130127, "learning_rate": 5.021668080882605e-06, "loss": 0.3454, "step": 3316 }, { "epoch": 1.646698659606156, "grad_norm": 0.3291034400463104, "learning_rate": 5.018779018061143e-06, "loss": 0.3788, "step": 3317 }, { "epoch": 1.6471951017706439, "grad_norm": 0.35951533913612366, "learning_rate": 5.015889948969879e-06, "loss": 0.4344, "step": 3318 }, { "epoch": 1.6476915439351316, "grad_norm": 0.33484914898872375, "learning_rate": 5.013000874573392e-06, "loss": 0.3746, "step": 3319 }, { "epoch": 1.6481879860996194, "grad_norm": 0.3424959182739258, "learning_rate": 5.0101117958362665e-06, "loss": 0.3734, "step": 3320 }, { "epoch": 1.648684428264107, "grad_norm": 0.3630015552043915, "learning_rate": 5.007222713723086e-06, "loss": 0.3813, "step": 3321 }, { "epoch": 1.649180870428595, "grad_norm": 0.32640913128852844, "learning_rate": 5.00433362919844e-06, "loss": 0.343, "step": 3322 }, { "epoch": 1.649677312593083, "grad_norm": 0.3656044900417328, "learning_rate": 5.001444543226912e-06, "loss": 0.3911, "step": 3323 }, { "epoch": 1.6501737547575708, "grad_norm": 0.3882608115673065, "learning_rate": 4.99855545677309e-06, "loss": 0.3608, "step": 3324 }, { "epoch": 1.6506701969220585, "grad_norm": 0.34775176644325256, "learning_rate": 4.995666370801563e-06, "loss": 0.3857, "step": 3325 }, { "epoch": 1.6511666390865463, "grad_norm": 0.3285890817642212, "learning_rate": 4.9927772862769136e-06, "loss": 0.3584, "step": 3326 }, { "epoch": 1.6516630812510342, "grad_norm": 0.36268991231918335, "learning_rate": 4.989888204163735e-06, "loss": 0.3592, "step": 3327 }, { "epoch": 1.6521595234155222, "grad_norm": 0.3440394699573517, "learning_rate": 4.98699912542661e-06, "loss": 0.359, "step": 3328 }, { "epoch": 1.65265596558001, "grad_norm": 0.3920816481113434, "learning_rate": 4.9841100510301234e-06, "loss": 0.4018, "step": 3329 }, { "epoch": 1.6531524077444977, "grad_norm": 0.3333778381347656, "learning_rate": 4.981220981938858e-06, "loss": 0.344, "step": 3330 }, { "epoch": 1.6536488499089856, "grad_norm": 0.3423239290714264, "learning_rate": 4.978331919117398e-06, "loss": 0.3963, "step": 3331 }, { "epoch": 1.6541452920734736, "grad_norm": 0.3604969382286072, "learning_rate": 4.975442863530319e-06, "loss": 0.3988, "step": 3332 }, { "epoch": 1.6546417342379613, "grad_norm": 0.3227071762084961, "learning_rate": 4.9725538161422005e-06, "loss": 0.329, "step": 3333 }, { "epoch": 1.655138176402449, "grad_norm": 0.35431402921676636, "learning_rate": 4.969664777917619e-06, "loss": 0.3574, "step": 3334 }, { "epoch": 1.6556346185669368, "grad_norm": 0.3591594994068146, "learning_rate": 4.966775749821143e-06, "loss": 0.3896, "step": 3335 }, { "epoch": 1.6561310607314248, "grad_norm": 0.3388911187648773, "learning_rate": 4.963886732817342e-06, "loss": 0.3874, "step": 3336 }, { "epoch": 1.6566275028959128, "grad_norm": 0.3760874271392822, "learning_rate": 4.9609977278707765e-06, "loss": 0.3615, "step": 3337 }, { "epoch": 1.6571239450604005, "grad_norm": 0.3621922731399536, "learning_rate": 4.958108735946012e-06, "loss": 0.3826, "step": 3338 }, { "epoch": 1.6576203872248882, "grad_norm": 0.3058447539806366, "learning_rate": 4.955219758007601e-06, "loss": 0.2951, "step": 3339 }, { "epoch": 1.658116829389376, "grad_norm": 0.33646121621131897, "learning_rate": 4.9523307950200976e-06, "loss": 0.3769, "step": 3340 }, { "epoch": 1.658613271553864, "grad_norm": 0.32561296224594116, "learning_rate": 4.949441847948043e-06, "loss": 0.3423, "step": 3341 }, { "epoch": 1.659109713718352, "grad_norm": 0.35695725679397583, "learning_rate": 4.946552917755983e-06, "loss": 0.3788, "step": 3342 }, { "epoch": 1.6596061558828397, "grad_norm": 0.34453776478767395, "learning_rate": 4.943664005408453e-06, "loss": 0.3601, "step": 3343 }, { "epoch": 1.6601025980473274, "grad_norm": 0.3755805492401123, "learning_rate": 4.9407751118699784e-06, "loss": 0.3662, "step": 3344 }, { "epoch": 1.6605990402118154, "grad_norm": 0.3445984721183777, "learning_rate": 4.937886238105084e-06, "loss": 0.3655, "step": 3345 }, { "epoch": 1.6610954823763031, "grad_norm": 0.35416045784950256, "learning_rate": 4.934997385078287e-06, "loss": 0.3827, "step": 3346 }, { "epoch": 1.661591924540791, "grad_norm": 0.32908937335014343, "learning_rate": 4.932108553754097e-06, "loss": 0.3341, "step": 3347 }, { "epoch": 1.6620883667052788, "grad_norm": 0.3607989251613617, "learning_rate": 4.929219745097015e-06, "loss": 0.3952, "step": 3348 }, { "epoch": 1.6625848088697666, "grad_norm": 0.37952330708503723, "learning_rate": 4.9263309600715356e-06, "loss": 0.3739, "step": 3349 }, { "epoch": 1.6630812510342545, "grad_norm": 0.3295675814151764, "learning_rate": 4.923442199642148e-06, "loss": 0.3545, "step": 3350 }, { "epoch": 1.6635776931987425, "grad_norm": 0.33536556363105774, "learning_rate": 4.92055346477333e-06, "loss": 0.3748, "step": 3351 }, { "epoch": 1.6640741353632302, "grad_norm": 0.33550912141799927, "learning_rate": 4.917664756429548e-06, "loss": 0.3725, "step": 3352 }, { "epoch": 1.664570577527718, "grad_norm": 0.3408276438713074, "learning_rate": 4.914776075575268e-06, "loss": 0.3904, "step": 3353 }, { "epoch": 1.6650670196922057, "grad_norm": 0.3314831554889679, "learning_rate": 4.91188742317494e-06, "loss": 0.3743, "step": 3354 }, { "epoch": 1.6655634618566937, "grad_norm": 0.334970623254776, "learning_rate": 4.9089988001930064e-06, "loss": 0.3664, "step": 3355 }, { "epoch": 1.6660599040211816, "grad_norm": 0.36754220724105835, "learning_rate": 4.9061102075939e-06, "loss": 0.4379, "step": 3356 }, { "epoch": 1.6665563461856694, "grad_norm": 0.31385645270347595, "learning_rate": 4.903221646342044e-06, "loss": 0.3325, "step": 3357 }, { "epoch": 1.6670527883501571, "grad_norm": 0.321575790643692, "learning_rate": 4.9003331174018494e-06, "loss": 0.3824, "step": 3358 }, { "epoch": 1.667549230514645, "grad_norm": 0.327850878238678, "learning_rate": 4.897444621737717e-06, "loss": 0.3584, "step": 3359 }, { "epoch": 1.6680456726791328, "grad_norm": 0.37844786047935486, "learning_rate": 4.894556160314041e-06, "loss": 0.374, "step": 3360 }, { "epoch": 1.6685421148436208, "grad_norm": 0.3404460549354553, "learning_rate": 4.8916677340951965e-06, "loss": 0.3722, "step": 3361 }, { "epoch": 1.6690385570081085, "grad_norm": 0.3503749668598175, "learning_rate": 4.888779344045549e-06, "loss": 0.3574, "step": 3362 }, { "epoch": 1.6695349991725963, "grad_norm": 0.3640756905078888, "learning_rate": 4.885890991129458e-06, "loss": 0.3777, "step": 3363 }, { "epoch": 1.6700314413370843, "grad_norm": 0.33941686153411865, "learning_rate": 4.883002676311262e-06, "loss": 0.4096, "step": 3364 }, { "epoch": 1.6705278835015722, "grad_norm": 0.3357810080051422, "learning_rate": 4.880114400555294e-06, "loss": 0.3874, "step": 3365 }, { "epoch": 1.67102432566606, "grad_norm": 0.35964104533195496, "learning_rate": 4.87722616482587e-06, "loss": 0.3755, "step": 3366 }, { "epoch": 1.6715207678305477, "grad_norm": 0.39197611808776855, "learning_rate": 4.87433797008729e-06, "loss": 0.4014, "step": 3367 }, { "epoch": 1.6720172099950354, "grad_norm": 0.35034897923469543, "learning_rate": 4.871449817303849e-06, "loss": 0.3355, "step": 3368 }, { "epoch": 1.6725136521595234, "grad_norm": 0.3514167368412018, "learning_rate": 4.86856170743982e-06, "loss": 0.3646, "step": 3369 }, { "epoch": 1.6730100943240114, "grad_norm": 0.3707559108734131, "learning_rate": 4.865673641459463e-06, "loss": 0.4213, "step": 3370 }, { "epoch": 1.6735065364884991, "grad_norm": 0.3582924008369446, "learning_rate": 4.862785620327028e-06, "loss": 0.3596, "step": 3371 }, { "epoch": 1.6740029786529869, "grad_norm": 0.3647651970386505, "learning_rate": 4.859897645006743e-06, "loss": 0.3461, "step": 3372 }, { "epoch": 1.6744994208174746, "grad_norm": 0.36635851860046387, "learning_rate": 4.8570097164628285e-06, "loss": 0.3662, "step": 3373 }, { "epoch": 1.6749958629819626, "grad_norm": 0.3427255153656006, "learning_rate": 4.854121835659482e-06, "loss": 0.3372, "step": 3374 }, { "epoch": 1.6754923051464505, "grad_norm": 0.3204922676086426, "learning_rate": 4.851234003560891e-06, "loss": 0.3587, "step": 3375 }, { "epoch": 1.6759887473109383, "grad_norm": 0.3480212390422821, "learning_rate": 4.848346221131223e-06, "loss": 0.4142, "step": 3376 }, { "epoch": 1.676485189475426, "grad_norm": 0.3245660960674286, "learning_rate": 4.845458489334631e-06, "loss": 0.3464, "step": 3377 }, { "epoch": 1.676981631639914, "grad_norm": 0.3651392161846161, "learning_rate": 4.842570809135246e-06, "loss": 0.4096, "step": 3378 }, { "epoch": 1.677478073804402, "grad_norm": 0.35364460945129395, "learning_rate": 4.839683181497192e-06, "loss": 0.3903, "step": 3379 }, { "epoch": 1.6779745159688897, "grad_norm": 0.3385586738586426, "learning_rate": 4.8367956073845655e-06, "loss": 0.3572, "step": 3380 }, { "epoch": 1.6784709581333774, "grad_norm": 0.37910911440849304, "learning_rate": 4.83390808776145e-06, "loss": 0.3491, "step": 3381 }, { "epoch": 1.6789674002978652, "grad_norm": 0.34041330218315125, "learning_rate": 4.831020623591909e-06, "loss": 0.3397, "step": 3382 }, { "epoch": 1.6794638424623531, "grad_norm": 0.35636886954307556, "learning_rate": 4.828133215839991e-06, "loss": 0.4209, "step": 3383 }, { "epoch": 1.679960284626841, "grad_norm": 0.3177793622016907, "learning_rate": 4.82524586546972e-06, "loss": 0.3867, "step": 3384 }, { "epoch": 1.6804567267913288, "grad_norm": 0.35869330167770386, "learning_rate": 4.822358573445106e-06, "loss": 0.4072, "step": 3385 }, { "epoch": 1.6809531689558166, "grad_norm": 0.29703307151794434, "learning_rate": 4.819471340730135e-06, "loss": 0.3094, "step": 3386 }, { "epoch": 1.6814496111203043, "grad_norm": 0.3890552818775177, "learning_rate": 4.816584168288776e-06, "loss": 0.4154, "step": 3387 }, { "epoch": 1.6819460532847923, "grad_norm": 0.3556058406829834, "learning_rate": 4.81369705708498e-06, "loss": 0.3816, "step": 3388 }, { "epoch": 1.6824424954492803, "grad_norm": 0.31037381291389465, "learning_rate": 4.810810008082672e-06, "loss": 0.3456, "step": 3389 }, { "epoch": 1.682938937613768, "grad_norm": 0.3424561023712158, "learning_rate": 4.8079230222457616e-06, "loss": 0.3837, "step": 3390 }, { "epoch": 1.6834353797782557, "grad_norm": 0.3323521614074707, "learning_rate": 4.805036100538134e-06, "loss": 0.3688, "step": 3391 }, { "epoch": 1.6839318219427437, "grad_norm": 0.37695345282554626, "learning_rate": 4.802149243923655e-06, "loss": 0.4161, "step": 3392 }, { "epoch": 1.6844282641072315, "grad_norm": 0.3448575735092163, "learning_rate": 4.799262453366162e-06, "loss": 0.3208, "step": 3393 }, { "epoch": 1.6849247062717194, "grad_norm": 0.3575380742549896, "learning_rate": 4.796375729829483e-06, "loss": 0.3563, "step": 3394 }, { "epoch": 1.6854211484362072, "grad_norm": 0.3287997245788574, "learning_rate": 4.793489074277412e-06, "loss": 0.3646, "step": 3395 }, { "epoch": 1.685917590600695, "grad_norm": 0.3822130560874939, "learning_rate": 4.790602487673728e-06, "loss": 0.3861, "step": 3396 }, { "epoch": 1.6864140327651829, "grad_norm": 0.3555254340171814, "learning_rate": 4.7877159709821805e-06, "loss": 0.3613, "step": 3397 }, { "epoch": 1.6869104749296708, "grad_norm": 0.3525969088077545, "learning_rate": 4.784829525166502e-06, "loss": 0.3753, "step": 3398 }, { "epoch": 1.6874069170941586, "grad_norm": 0.34727174043655396, "learning_rate": 4.781943151190397e-06, "loss": 0.3623, "step": 3399 }, { "epoch": 1.6879033592586463, "grad_norm": 0.3586980104446411, "learning_rate": 4.779056850017546e-06, "loss": 0.357, "step": 3400 }, { "epoch": 1.688399801423134, "grad_norm": 0.3263377249240875, "learning_rate": 4.77617062261161e-06, "loss": 0.4082, "step": 3401 }, { "epoch": 1.688896243587622, "grad_norm": 0.3483002483844757, "learning_rate": 4.773284469936219e-06, "loss": 0.3578, "step": 3402 }, { "epoch": 1.68939268575211, "grad_norm": 0.35416245460510254, "learning_rate": 4.7703983929549816e-06, "loss": 0.3623, "step": 3403 }, { "epoch": 1.6898891279165977, "grad_norm": 0.3372195363044739, "learning_rate": 4.767512392631479e-06, "loss": 0.3312, "step": 3404 }, { "epoch": 1.6903855700810855, "grad_norm": 0.3426492214202881, "learning_rate": 4.764626469929272e-06, "loss": 0.4057, "step": 3405 }, { "epoch": 1.6908820122455734, "grad_norm": 0.3275741636753082, "learning_rate": 4.7617406258118895e-06, "loss": 0.3553, "step": 3406 }, { "epoch": 1.6913784544100612, "grad_norm": 0.3229416310787201, "learning_rate": 4.758854861242837e-06, "loss": 0.3715, "step": 3407 }, { "epoch": 1.6918748965745491, "grad_norm": 0.3657640218734741, "learning_rate": 4.755969177185589e-06, "loss": 0.3888, "step": 3408 }, { "epoch": 1.692371338739037, "grad_norm": 0.3604000210762024, "learning_rate": 4.753083574603603e-06, "loss": 0.3877, "step": 3409 }, { "epoch": 1.6928677809035246, "grad_norm": 0.3596315085887909, "learning_rate": 4.7501980544602995e-06, "loss": 0.3425, "step": 3410 }, { "epoch": 1.6933642230680126, "grad_norm": 0.3153960108757019, "learning_rate": 4.747312617719079e-06, "loss": 0.3444, "step": 3411 }, { "epoch": 1.6938606652325006, "grad_norm": 0.35318052768707275, "learning_rate": 4.744427265343304e-06, "loss": 0.401, "step": 3412 }, { "epoch": 1.6943571073969883, "grad_norm": 0.3380723297595978, "learning_rate": 4.741541998296321e-06, "loss": 0.3529, "step": 3413 }, { "epoch": 1.694853549561476, "grad_norm": 0.3454844355583191, "learning_rate": 4.738656817541441e-06, "loss": 0.3697, "step": 3414 }, { "epoch": 1.6953499917259638, "grad_norm": 0.39815017580986023, "learning_rate": 4.735771724041945e-06, "loss": 0.3604, "step": 3415 }, { "epoch": 1.6958464338904518, "grad_norm": 0.38873767852783203, "learning_rate": 4.732886718761091e-06, "loss": 0.3977, "step": 3416 }, { "epoch": 1.6963428760549397, "grad_norm": 0.316694438457489, "learning_rate": 4.730001802662101e-06, "loss": 0.3682, "step": 3417 }, { "epoch": 1.6968393182194275, "grad_norm": 0.33081239461898804, "learning_rate": 4.72711697670817e-06, "loss": 0.3733, "step": 3418 }, { "epoch": 1.6973357603839152, "grad_norm": 0.38902547955513, "learning_rate": 4.724232241862464e-06, "loss": 0.3963, "step": 3419 }, { "epoch": 1.697832202548403, "grad_norm": 0.3271400034427643, "learning_rate": 4.721347599088118e-06, "loss": 0.3419, "step": 3420 }, { "epoch": 1.698328644712891, "grad_norm": 0.3311671316623688, "learning_rate": 4.7184630493482355e-06, "loss": 0.3453, "step": 3421 }, { "epoch": 1.6988250868773789, "grad_norm": 0.3804277777671814, "learning_rate": 4.71557859360589e-06, "loss": 0.3855, "step": 3422 }, { "epoch": 1.6993215290418666, "grad_norm": 0.3315379023551941, "learning_rate": 4.71269423282412e-06, "loss": 0.3797, "step": 3423 }, { "epoch": 1.6998179712063544, "grad_norm": 0.34520235657691956, "learning_rate": 4.709809967965939e-06, "loss": 0.3608, "step": 3424 }, { "epoch": 1.7003144133708423, "grad_norm": 0.32613837718963623, "learning_rate": 4.706925799994322e-06, "loss": 0.3506, "step": 3425 }, { "epoch": 1.7008108555353303, "grad_norm": 0.34605443477630615, "learning_rate": 4.704041729872215e-06, "loss": 0.3563, "step": 3426 }, { "epoch": 1.701307297699818, "grad_norm": 0.3362909257411957, "learning_rate": 4.701157758562528e-06, "loss": 0.3576, "step": 3427 }, { "epoch": 1.7018037398643058, "grad_norm": 0.3404662013053894, "learning_rate": 4.698273887028147e-06, "loss": 0.3778, "step": 3428 }, { "epoch": 1.7023001820287935, "grad_norm": 0.36439085006713867, "learning_rate": 4.695390116231915e-06, "loss": 0.3806, "step": 3429 }, { "epoch": 1.7027966241932815, "grad_norm": 0.3578636646270752, "learning_rate": 4.692506447136641e-06, "loss": 0.3845, "step": 3430 }, { "epoch": 1.7032930663577694, "grad_norm": 0.328876256942749, "learning_rate": 4.68962288070511e-06, "loss": 0.3429, "step": 3431 }, { "epoch": 1.7037895085222572, "grad_norm": 0.3483218550682068, "learning_rate": 4.686739417900063e-06, "loss": 0.3661, "step": 3432 }, { "epoch": 1.704285950686745, "grad_norm": 0.39791029691696167, "learning_rate": 4.68385605968421e-06, "loss": 0.3756, "step": 3433 }, { "epoch": 1.7047823928512327, "grad_norm": 0.3531373143196106, "learning_rate": 4.680972807020226e-06, "loss": 0.3807, "step": 3434 }, { "epoch": 1.7052788350157206, "grad_norm": 0.33264026045799255, "learning_rate": 4.67808966087075e-06, "loss": 0.4007, "step": 3435 }, { "epoch": 1.7057752771802086, "grad_norm": 0.3386155962944031, "learning_rate": 4.67520662219839e-06, "loss": 0.3352, "step": 3436 }, { "epoch": 1.7062717193446963, "grad_norm": 0.3500332832336426, "learning_rate": 4.672323691965711e-06, "loss": 0.3692, "step": 3437 }, { "epoch": 1.706768161509184, "grad_norm": 0.32867181301116943, "learning_rate": 4.669440871135243e-06, "loss": 0.3613, "step": 3438 }, { "epoch": 1.707264603673672, "grad_norm": 0.3241957128047943, "learning_rate": 4.666558160669486e-06, "loss": 0.3395, "step": 3439 }, { "epoch": 1.70776104583816, "grad_norm": 0.3520301878452301, "learning_rate": 4.663675561530897e-06, "loss": 0.3975, "step": 3440 }, { "epoch": 1.7082574880026478, "grad_norm": 0.3585999608039856, "learning_rate": 4.660793074681895e-06, "loss": 0.4042, "step": 3441 }, { "epoch": 1.7087539301671355, "grad_norm": 0.3360142409801483, "learning_rate": 4.657910701084869e-06, "loss": 0.3338, "step": 3442 }, { "epoch": 1.7092503723316232, "grad_norm": 0.3667651116847992, "learning_rate": 4.655028441702161e-06, "loss": 0.4003, "step": 3443 }, { "epoch": 1.7097468144961112, "grad_norm": 0.37153318524360657, "learning_rate": 4.6521462974960805e-06, "loss": 0.3925, "step": 3444 }, { "epoch": 1.7102432566605992, "grad_norm": 0.34156572818756104, "learning_rate": 4.649264269428896e-06, "loss": 0.3344, "step": 3445 }, { "epoch": 1.710739698825087, "grad_norm": 0.39686283469200134, "learning_rate": 4.6463823584628415e-06, "loss": 0.3679, "step": 3446 }, { "epoch": 1.7112361409895747, "grad_norm": 0.34634798765182495, "learning_rate": 4.643500565560106e-06, "loss": 0.3672, "step": 3447 }, { "epoch": 1.7117325831540624, "grad_norm": 0.3649572432041168, "learning_rate": 4.640618891682844e-06, "loss": 0.3816, "step": 3448 }, { "epoch": 1.7122290253185504, "grad_norm": 0.39137428998947144, "learning_rate": 4.637737337793164e-06, "loss": 0.3785, "step": 3449 }, { "epoch": 1.7127254674830383, "grad_norm": 0.34134185314178467, "learning_rate": 4.634855904853143e-06, "loss": 0.3631, "step": 3450 }, { "epoch": 1.713221909647526, "grad_norm": 0.3654789328575134, "learning_rate": 4.63197459382481e-06, "loss": 0.3792, "step": 3451 }, { "epoch": 1.7137183518120138, "grad_norm": 0.35728389024734497, "learning_rate": 4.629093405670159e-06, "loss": 0.3572, "step": 3452 }, { "epoch": 1.7142147939765018, "grad_norm": 0.3577714264392853, "learning_rate": 4.626212341351137e-06, "loss": 0.3795, "step": 3453 }, { "epoch": 1.7147112361409895, "grad_norm": 0.3439427316188812, "learning_rate": 4.623331401829658e-06, "loss": 0.3508, "step": 3454 }, { "epoch": 1.7152076783054775, "grad_norm": 0.3291642665863037, "learning_rate": 4.6204505880675856e-06, "loss": 0.3813, "step": 3455 }, { "epoch": 1.7157041204699652, "grad_norm": 0.3378807008266449, "learning_rate": 4.617569901026745e-06, "loss": 0.4073, "step": 3456 }, { "epoch": 1.716200562634453, "grad_norm": 0.32457083463668823, "learning_rate": 4.614689341668922e-06, "loss": 0.3478, "step": 3457 }, { "epoch": 1.716697004798941, "grad_norm": 0.3398924767971039, "learning_rate": 4.611808910955855e-06, "loss": 0.3637, "step": 3458 }, { "epoch": 1.717193446963429, "grad_norm": 0.3609676659107208, "learning_rate": 4.608928609849244e-06, "loss": 0.4037, "step": 3459 }, { "epoch": 1.7176898891279166, "grad_norm": 0.32579919695854187, "learning_rate": 4.606048439310738e-06, "loss": 0.3409, "step": 3460 }, { "epoch": 1.7181863312924044, "grad_norm": 0.3657160997390747, "learning_rate": 4.603168400301954e-06, "loss": 0.4064, "step": 3461 }, { "epoch": 1.7186827734568921, "grad_norm": 0.305945485830307, "learning_rate": 4.600288493784455e-06, "loss": 0.331, "step": 3462 }, { "epoch": 1.71917921562138, "grad_norm": 0.32330188155174255, "learning_rate": 4.597408720719765e-06, "loss": 0.3851, "step": 3463 }, { "epoch": 1.719675657785868, "grad_norm": 0.3611743450164795, "learning_rate": 4.5945290820693585e-06, "loss": 0.421, "step": 3464 }, { "epoch": 1.7201720999503558, "grad_norm": 0.3073453903198242, "learning_rate": 4.591649578794675e-06, "loss": 0.3681, "step": 3465 }, { "epoch": 1.7206685421148435, "grad_norm": 0.30617091059684753, "learning_rate": 4.588770211857096e-06, "loss": 0.3544, "step": 3466 }, { "epoch": 1.7211649842793315, "grad_norm": 0.3152312636375427, "learning_rate": 4.58589098221797e-06, "loss": 0.3917, "step": 3467 }, { "epoch": 1.7216614264438193, "grad_norm": 0.31227368116378784, "learning_rate": 4.583011890838586e-06, "loss": 0.3572, "step": 3468 }, { "epoch": 1.7221578686083072, "grad_norm": 0.32579004764556885, "learning_rate": 4.580132938680202e-06, "loss": 0.3528, "step": 3469 }, { "epoch": 1.722654310772795, "grad_norm": 0.3257172405719757, "learning_rate": 4.577254126704017e-06, "loss": 0.3706, "step": 3470 }, { "epoch": 1.7231507529372827, "grad_norm": 0.3487100899219513, "learning_rate": 4.574375455871188e-06, "loss": 0.3915, "step": 3471 }, { "epoch": 1.7236471951017707, "grad_norm": 0.3612709641456604, "learning_rate": 4.571496927142829e-06, "loss": 0.3842, "step": 3472 }, { "epoch": 1.7241436372662586, "grad_norm": 0.33808526396751404, "learning_rate": 4.56861854148e-06, "loss": 0.3478, "step": 3473 }, { "epoch": 1.7246400794307464, "grad_norm": 0.3747834265232086, "learning_rate": 4.565740299843714e-06, "loss": 0.3778, "step": 3474 }, { "epoch": 1.7251365215952341, "grad_norm": 0.325595885515213, "learning_rate": 4.562862203194939e-06, "loss": 0.3159, "step": 3475 }, { "epoch": 1.7256329637597219, "grad_norm": 0.311140775680542, "learning_rate": 4.559984252494595e-06, "loss": 0.3923, "step": 3476 }, { "epoch": 1.7261294059242098, "grad_norm": 0.33890238404273987, "learning_rate": 4.55710644870355e-06, "loss": 0.3673, "step": 3477 }, { "epoch": 1.7266258480886978, "grad_norm": 0.32407480478286743, "learning_rate": 4.554228792782626e-06, "loss": 0.4168, "step": 3478 }, { "epoch": 1.7271222902531855, "grad_norm": 0.31473493576049805, "learning_rate": 4.551351285692589e-06, "loss": 0.3982, "step": 3479 }, { "epoch": 1.7276187324176733, "grad_norm": 0.31332385540008545, "learning_rate": 4.548473928394167e-06, "loss": 0.3522, "step": 3480 }, { "epoch": 1.728115174582161, "grad_norm": 0.3720772862434387, "learning_rate": 4.545596721848027e-06, "loss": 0.3879, "step": 3481 }, { "epoch": 1.728611616746649, "grad_norm": 0.34805160760879517, "learning_rate": 4.542719667014792e-06, "loss": 0.366, "step": 3482 }, { "epoch": 1.729108058911137, "grad_norm": 0.3319459855556488, "learning_rate": 4.539842764855032e-06, "loss": 0.3568, "step": 3483 }, { "epoch": 1.7296045010756247, "grad_norm": 0.3412543535232544, "learning_rate": 4.5369660163292674e-06, "loss": 0.3595, "step": 3484 }, { "epoch": 1.7301009432401124, "grad_norm": 0.35625651478767395, "learning_rate": 4.534089422397965e-06, "loss": 0.3844, "step": 3485 }, { "epoch": 1.7305973854046004, "grad_norm": 0.38022732734680176, "learning_rate": 4.5312129840215405e-06, "loss": 0.365, "step": 3486 }, { "epoch": 1.7310938275690884, "grad_norm": 0.34466901421546936, "learning_rate": 4.528336702160361e-06, "loss": 0.3713, "step": 3487 }, { "epoch": 1.731590269733576, "grad_norm": 0.35565412044525146, "learning_rate": 4.5254605777747376e-06, "loss": 0.3554, "step": 3488 }, { "epoch": 1.7320867118980638, "grad_norm": 0.42009347677230835, "learning_rate": 4.5225846118249295e-06, "loss": 0.3858, "step": 3489 }, { "epoch": 1.7325831540625516, "grad_norm": 0.36932143568992615, "learning_rate": 4.519708805271144e-06, "loss": 0.3654, "step": 3490 }, { "epoch": 1.7330795962270396, "grad_norm": 0.35912206768989563, "learning_rate": 4.5168331590735345e-06, "loss": 0.3545, "step": 3491 }, { "epoch": 1.7335760383915275, "grad_norm": 0.377895325422287, "learning_rate": 4.513957674192203e-06, "loss": 0.3882, "step": 3492 }, { "epoch": 1.7340724805560153, "grad_norm": 0.3568088710308075, "learning_rate": 4.511082351587194e-06, "loss": 0.317, "step": 3493 }, { "epoch": 1.734568922720503, "grad_norm": 0.3576708137989044, "learning_rate": 4.5082071922184965e-06, "loss": 0.3533, "step": 3494 }, { "epoch": 1.7350653648849907, "grad_norm": 0.3487604260444641, "learning_rate": 4.505332197046055e-06, "loss": 0.39, "step": 3495 }, { "epoch": 1.7355618070494787, "grad_norm": 0.33177661895751953, "learning_rate": 4.5024573670297475e-06, "loss": 0.347, "step": 3496 }, { "epoch": 1.7360582492139667, "grad_norm": 0.3756621479988098, "learning_rate": 4.499582703129402e-06, "loss": 0.3933, "step": 3497 }, { "epoch": 1.7365546913784544, "grad_norm": 0.36003684997558594, "learning_rate": 4.49670820630479e-06, "loss": 0.3788, "step": 3498 }, { "epoch": 1.7370511335429422, "grad_norm": 0.28320977091789246, "learning_rate": 4.493833877515632e-06, "loss": 0.2976, "step": 3499 }, { "epoch": 1.7375475757074301, "grad_norm": 0.3613300621509552, "learning_rate": 4.490959717721586e-06, "loss": 0.4065, "step": 3500 }, { "epoch": 1.7380440178719179, "grad_norm": 0.3325745761394501, "learning_rate": 4.4880857278822524e-06, "loss": 0.3797, "step": 3501 }, { "epoch": 1.7385404600364058, "grad_norm": 0.33606740832328796, "learning_rate": 4.485211908957183e-06, "loss": 0.3661, "step": 3502 }, { "epoch": 1.7390369022008936, "grad_norm": 0.35669565200805664, "learning_rate": 4.482338261905866e-06, "loss": 0.3746, "step": 3503 }, { "epoch": 1.7395333443653813, "grad_norm": 0.36168256402015686, "learning_rate": 4.4794647876877335e-06, "loss": 0.3508, "step": 3504 }, { "epoch": 1.7400297865298693, "grad_norm": 0.3373899459838867, "learning_rate": 4.476591487262161e-06, "loss": 0.3649, "step": 3505 }, { "epoch": 1.7405262286943572, "grad_norm": 0.35068994760513306, "learning_rate": 4.473718361588465e-06, "loss": 0.3576, "step": 3506 }, { "epoch": 1.741022670858845, "grad_norm": 0.3245414197444916, "learning_rate": 4.470845411625906e-06, "loss": 0.385, "step": 3507 }, { "epoch": 1.7415191130233327, "grad_norm": 0.35506394505500793, "learning_rate": 4.467972638333682e-06, "loss": 0.3375, "step": 3508 }, { "epoch": 1.7420155551878205, "grad_norm": 0.3729841709136963, "learning_rate": 4.465100042670933e-06, "loss": 0.4004, "step": 3509 }, { "epoch": 1.7425119973523084, "grad_norm": 0.32407093048095703, "learning_rate": 4.462227625596743e-06, "loss": 0.3134, "step": 3510 }, { "epoch": 1.7430084395167964, "grad_norm": 0.35502633452415466, "learning_rate": 4.459355388070134e-06, "loss": 0.3644, "step": 3511 }, { "epoch": 1.7435048816812841, "grad_norm": 0.3191346526145935, "learning_rate": 4.456483331050064e-06, "loss": 0.3596, "step": 3512 }, { "epoch": 1.744001323845772, "grad_norm": 0.334467351436615, "learning_rate": 4.453611455495441e-06, "loss": 0.3444, "step": 3513 }, { "epoch": 1.7444977660102599, "grad_norm": 0.3787761926651001, "learning_rate": 4.450739762365101e-06, "loss": 0.3872, "step": 3514 }, { "epoch": 1.7449942081747476, "grad_norm": 0.36802181601524353, "learning_rate": 4.447868252617828e-06, "loss": 0.4004, "step": 3515 }, { "epoch": 1.7454906503392356, "grad_norm": 0.2955898940563202, "learning_rate": 4.444996927212337e-06, "loss": 0.303, "step": 3516 }, { "epoch": 1.7459870925037233, "grad_norm": 0.3929346799850464, "learning_rate": 4.44212578710729e-06, "loss": 0.4173, "step": 3517 }, { "epoch": 1.746483534668211, "grad_norm": 0.3553166389465332, "learning_rate": 4.439254833261281e-06, "loss": 0.3706, "step": 3518 }, { "epoch": 1.746979976832699, "grad_norm": 0.3242308497428894, "learning_rate": 4.436384066632842e-06, "loss": 0.3544, "step": 3519 }, { "epoch": 1.747476418997187, "grad_norm": 0.3391484022140503, "learning_rate": 4.433513488180443e-06, "loss": 0.3644, "step": 3520 }, { "epoch": 1.7479728611616747, "grad_norm": 0.3375431001186371, "learning_rate": 4.4306430988624945e-06, "loss": 0.3574, "step": 3521 }, { "epoch": 1.7484693033261625, "grad_norm": 0.36313003301620483, "learning_rate": 4.427772899637343e-06, "loss": 0.3997, "step": 3522 }, { "epoch": 1.7489657454906502, "grad_norm": 0.3337276577949524, "learning_rate": 4.424902891463269e-06, "loss": 0.3598, "step": 3523 }, { "epoch": 1.7494621876551382, "grad_norm": 0.37348270416259766, "learning_rate": 4.422033075298485e-06, "loss": 0.3977, "step": 3524 }, { "epoch": 1.7499586298196261, "grad_norm": 0.36773866415023804, "learning_rate": 4.419163452101153e-06, "loss": 0.4027, "step": 3525 }, { "epoch": 1.7504550719841139, "grad_norm": 0.342812716960907, "learning_rate": 4.416294022829356e-06, "loss": 0.3625, "step": 3526 }, { "epoch": 1.7509515141486016, "grad_norm": 0.3278186619281769, "learning_rate": 4.41342478844112e-06, "loss": 0.3562, "step": 3527 }, { "epoch": 1.7514479563130894, "grad_norm": 0.3284974694252014, "learning_rate": 4.410555749894407e-06, "loss": 0.365, "step": 3528 }, { "epoch": 1.7519443984775773, "grad_norm": 0.33613720536231995, "learning_rate": 4.407686908147107e-06, "loss": 0.365, "step": 3529 }, { "epoch": 1.7524408406420653, "grad_norm": 0.34053757786750793, "learning_rate": 4.404818264157052e-06, "loss": 0.3561, "step": 3530 }, { "epoch": 1.752937282806553, "grad_norm": 0.37089207768440247, "learning_rate": 4.4019498188819996e-06, "loss": 0.3772, "step": 3531 }, { "epoch": 1.7534337249710408, "grad_norm": 0.3719173073768616, "learning_rate": 4.399081573279651e-06, "loss": 0.4041, "step": 3532 }, { "epoch": 1.7539301671355287, "grad_norm": 0.3056204915046692, "learning_rate": 4.396213528307633e-06, "loss": 0.344, "step": 3533 }, { "epoch": 1.7544266093000167, "grad_norm": 0.32960519194602966, "learning_rate": 4.393345684923508e-06, "loss": 0.3432, "step": 3534 }, { "epoch": 1.7549230514645044, "grad_norm": 0.3634607195854187, "learning_rate": 4.3904780440847695e-06, "loss": 0.3699, "step": 3535 }, { "epoch": 1.7554194936289922, "grad_norm": 0.3216673731803894, "learning_rate": 4.387610606748847e-06, "loss": 0.3532, "step": 3536 }, { "epoch": 1.75591593579348, "grad_norm": 0.32173436880111694, "learning_rate": 4.384743373873099e-06, "loss": 0.3745, "step": 3537 }, { "epoch": 1.756412377957968, "grad_norm": 0.39150097966194153, "learning_rate": 4.3818763464148165e-06, "loss": 0.3772, "step": 3538 }, { "epoch": 1.7569088201224559, "grad_norm": 0.33689138293266296, "learning_rate": 4.379009525331222e-06, "loss": 0.3453, "step": 3539 }, { "epoch": 1.7574052622869436, "grad_norm": 0.34673193097114563, "learning_rate": 4.37614291157947e-06, "loss": 0.3361, "step": 3540 }, { "epoch": 1.7579017044514313, "grad_norm": 0.3464982211589813, "learning_rate": 4.373276506116645e-06, "loss": 0.3793, "step": 3541 }, { "epoch": 1.758398146615919, "grad_norm": 0.3691372275352478, "learning_rate": 4.370410309899759e-06, "loss": 0.3871, "step": 3542 }, { "epoch": 1.758894588780407, "grad_norm": 0.3635186553001404, "learning_rate": 4.367544323885762e-06, "loss": 0.386, "step": 3543 }, { "epoch": 1.759391030944895, "grad_norm": 0.32264086604118347, "learning_rate": 4.364678549031525e-06, "loss": 0.3866, "step": 3544 }, { "epoch": 1.7598874731093828, "grad_norm": 0.33480212092399597, "learning_rate": 4.3618129862938525e-06, "loss": 0.3706, "step": 3545 }, { "epoch": 1.7603839152738705, "grad_norm": 0.323770135641098, "learning_rate": 4.358947636629478e-06, "loss": 0.3336, "step": 3546 }, { "epoch": 1.7608803574383585, "grad_norm": 0.3305698335170746, "learning_rate": 4.3560825009950665e-06, "loss": 0.3396, "step": 3547 }, { "epoch": 1.7613767996028464, "grad_norm": 0.37295305728912354, "learning_rate": 4.353217580347208e-06, "loss": 0.4082, "step": 3548 }, { "epoch": 1.7618732417673342, "grad_norm": 0.3330773413181305, "learning_rate": 4.3503528756424204e-06, "loss": 0.3913, "step": 3549 }, { "epoch": 1.762369683931822, "grad_norm": 0.35118335485458374, "learning_rate": 4.3474883878371496e-06, "loss": 0.3631, "step": 3550 }, { "epoch": 1.7628661260963097, "grad_norm": 0.3628711700439453, "learning_rate": 4.3446241178877735e-06, "loss": 0.4142, "step": 3551 }, { "epoch": 1.7633625682607976, "grad_norm": 0.3352130055427551, "learning_rate": 4.341760066750591e-06, "loss": 0.3409, "step": 3552 }, { "epoch": 1.7638590104252856, "grad_norm": 0.360625296831131, "learning_rate": 4.338896235381832e-06, "loss": 0.4148, "step": 3553 }, { "epoch": 1.7643554525897733, "grad_norm": 0.338151752948761, "learning_rate": 4.336032624737653e-06, "loss": 0.3272, "step": 3554 }, { "epoch": 1.764851894754261, "grad_norm": 0.3735258877277374, "learning_rate": 4.333169235774136e-06, "loss": 0.399, "step": 3555 }, { "epoch": 1.7653483369187488, "grad_norm": 0.3484480381011963, "learning_rate": 4.330306069447287e-06, "loss": 0.3373, "step": 3556 }, { "epoch": 1.7658447790832368, "grad_norm": 0.3144625127315521, "learning_rate": 4.327443126713039e-06, "loss": 0.3581, "step": 3557 }, { "epoch": 1.7663412212477247, "grad_norm": 0.33695945143699646, "learning_rate": 4.324580408527254e-06, "loss": 0.351, "step": 3558 }, { "epoch": 1.7668376634122125, "grad_norm": 0.3213583528995514, "learning_rate": 4.321717915845713e-06, "loss": 0.3636, "step": 3559 }, { "epoch": 1.7673341055767002, "grad_norm": 0.3090468943119049, "learning_rate": 4.318855649624124e-06, "loss": 0.3541, "step": 3560 }, { "epoch": 1.7678305477411882, "grad_norm": 0.3091413676738739, "learning_rate": 4.315993610818121e-06, "loss": 0.3708, "step": 3561 }, { "epoch": 1.768326989905676, "grad_norm": 0.3150924742221832, "learning_rate": 4.3131318003832625e-06, "loss": 0.3304, "step": 3562 }, { "epoch": 1.768823432070164, "grad_norm": 0.32218801975250244, "learning_rate": 4.310270219275028e-06, "loss": 0.3556, "step": 3563 }, { "epoch": 1.7693198742346516, "grad_norm": 0.3309474289417267, "learning_rate": 4.307408868448822e-06, "loss": 0.3913, "step": 3564 }, { "epoch": 1.7698163163991394, "grad_norm": 0.337258517742157, "learning_rate": 4.304547748859967e-06, "loss": 0.3727, "step": 3565 }, { "epoch": 1.7703127585636274, "grad_norm": 0.295643150806427, "learning_rate": 4.301686861463722e-06, "loss": 0.3569, "step": 3566 }, { "epoch": 1.7708092007281153, "grad_norm": 0.2973824143409729, "learning_rate": 4.298826207215254e-06, "loss": 0.3364, "step": 3567 }, { "epoch": 1.771305642892603, "grad_norm": 0.34778425097465515, "learning_rate": 4.2959657870696555e-06, "loss": 0.3596, "step": 3568 }, { "epoch": 1.7718020850570908, "grad_norm": 0.3523993194103241, "learning_rate": 4.293105601981948e-06, "loss": 0.3782, "step": 3569 }, { "epoch": 1.7722985272215785, "grad_norm": 0.2933844029903412, "learning_rate": 4.290245652907069e-06, "loss": 0.3509, "step": 3570 }, { "epoch": 1.7727949693860665, "grad_norm": 0.3259839415550232, "learning_rate": 4.287385940799876e-06, "loss": 0.3901, "step": 3571 }, { "epoch": 1.7732914115505545, "grad_norm": 0.3190265893936157, "learning_rate": 4.284526466615148e-06, "loss": 0.3495, "step": 3572 }, { "epoch": 1.7737878537150422, "grad_norm": 0.329305499792099, "learning_rate": 4.281667231307588e-06, "loss": 0.4273, "step": 3573 }, { "epoch": 1.77428429587953, "grad_norm": 0.3129325807094574, "learning_rate": 4.278808235831818e-06, "loss": 0.3596, "step": 3574 }, { "epoch": 1.774780738044018, "grad_norm": 0.3651891350746155, "learning_rate": 4.2759494811423755e-06, "loss": 0.381, "step": 3575 }, { "epoch": 1.7752771802085057, "grad_norm": 0.3555458188056946, "learning_rate": 4.2730909681937224e-06, "loss": 0.3936, "step": 3576 }, { "epoch": 1.7757736223729936, "grad_norm": 0.31823647022247314, "learning_rate": 4.2702326979402385e-06, "loss": 0.3413, "step": 3577 }, { "epoch": 1.7762700645374814, "grad_norm": 0.35866454243659973, "learning_rate": 4.267374671336224e-06, "loss": 0.3844, "step": 3578 }, { "epoch": 1.7767665067019691, "grad_norm": 0.3372751474380493, "learning_rate": 4.264516889335894e-06, "loss": 0.3764, "step": 3579 }, { "epoch": 1.777262948866457, "grad_norm": 0.3212250769138336, "learning_rate": 4.261659352893386e-06, "loss": 0.3507, "step": 3580 }, { "epoch": 1.777759391030945, "grad_norm": 0.3798510730266571, "learning_rate": 4.258802062962754e-06, "loss": 0.3909, "step": 3581 }, { "epoch": 1.7782558331954328, "grad_norm": 0.3163897693157196, "learning_rate": 4.255945020497968e-06, "loss": 0.3231, "step": 3582 }, { "epoch": 1.7787522753599205, "grad_norm": 0.3338291645050049, "learning_rate": 4.253088226452915e-06, "loss": 0.3704, "step": 3583 }, { "epoch": 1.7792487175244083, "grad_norm": 0.3631798028945923, "learning_rate": 4.250231681781406e-06, "loss": 0.3612, "step": 3584 }, { "epoch": 1.7797451596888962, "grad_norm": 0.3482663333415985, "learning_rate": 4.24737538743716e-06, "loss": 0.4031, "step": 3585 }, { "epoch": 1.7802416018533842, "grad_norm": 0.3178453743457794, "learning_rate": 4.244519344373817e-06, "loss": 0.356, "step": 3586 }, { "epoch": 1.780738044017872, "grad_norm": 0.33773940801620483, "learning_rate": 4.241663553544931e-06, "loss": 0.3902, "step": 3587 }, { "epoch": 1.7812344861823597, "grad_norm": 0.3080623149871826, "learning_rate": 4.2388080159039755e-06, "loss": 0.3537, "step": 3588 }, { "epoch": 1.7817309283468474, "grad_norm": 0.3758053779602051, "learning_rate": 4.235952732404336e-06, "loss": 0.4439, "step": 3589 }, { "epoch": 1.7822273705113354, "grad_norm": 0.3112005889415741, "learning_rate": 4.233097703999313e-06, "loss": 0.3296, "step": 3590 }, { "epoch": 1.7827238126758234, "grad_norm": 0.3124103844165802, "learning_rate": 4.230242931642121e-06, "loss": 0.372, "step": 3591 }, { "epoch": 1.783220254840311, "grad_norm": 0.3358811140060425, "learning_rate": 4.2273884162858955e-06, "loss": 0.3669, "step": 3592 }, { "epoch": 1.7837166970047988, "grad_norm": 0.30595171451568604, "learning_rate": 4.224534158883679e-06, "loss": 0.3724, "step": 3593 }, { "epoch": 1.7842131391692868, "grad_norm": 0.36496636271476746, "learning_rate": 4.22168016038843e-06, "loss": 0.3681, "step": 3594 }, { "epoch": 1.7847095813337748, "grad_norm": 0.34704646468162537, "learning_rate": 4.2188264217530235e-06, "loss": 0.3613, "step": 3595 }, { "epoch": 1.7852060234982625, "grad_norm": 0.4397140145301819, "learning_rate": 4.2159729439302435e-06, "loss": 0.3921, "step": 3596 }, { "epoch": 1.7857024656627503, "grad_norm": 0.33280202746391296, "learning_rate": 4.213119727872789e-06, "loss": 0.337, "step": 3597 }, { "epoch": 1.786198907827238, "grad_norm": 0.36956512928009033, "learning_rate": 4.210266774533269e-06, "loss": 0.3626, "step": 3598 }, { "epoch": 1.786695349991726, "grad_norm": 0.32308486104011536, "learning_rate": 4.207414084864211e-06, "loss": 0.3561, "step": 3599 }, { "epoch": 1.787191792156214, "grad_norm": 0.34481051564216614, "learning_rate": 4.204561659818049e-06, "loss": 0.3717, "step": 3600 }, { "epoch": 1.7876882343207017, "grad_norm": 0.33420801162719727, "learning_rate": 4.2017095003471294e-06, "loss": 0.3363, "step": 3601 }, { "epoch": 1.7881846764851894, "grad_norm": 0.35607555508613586, "learning_rate": 4.19885760740371e-06, "loss": 0.3871, "step": 3602 }, { "epoch": 1.7886811186496772, "grad_norm": 0.332539439201355, "learning_rate": 4.196005981939963e-06, "loss": 0.3613, "step": 3603 }, { "epoch": 1.7891775608141651, "grad_norm": 0.34895920753479004, "learning_rate": 4.193154624907968e-06, "loss": 0.3666, "step": 3604 }, { "epoch": 1.789674002978653, "grad_norm": 0.32060644030570984, "learning_rate": 4.1903035372597155e-06, "loss": 0.3561, "step": 3605 }, { "epoch": 1.7901704451431408, "grad_norm": 0.3425676226615906, "learning_rate": 4.1874527199471025e-06, "loss": 0.3734, "step": 3606 }, { "epoch": 1.7906668873076286, "grad_norm": 0.33978596329689026, "learning_rate": 4.184602173921945e-06, "loss": 0.379, "step": 3607 }, { "epoch": 1.7911633294721165, "grad_norm": 0.35001859068870544, "learning_rate": 4.181751900135959e-06, "loss": 0.4004, "step": 3608 }, { "epoch": 1.7916597716366043, "grad_norm": 0.3107544481754303, "learning_rate": 4.178901899540775e-06, "loss": 0.3433, "step": 3609 }, { "epoch": 1.7921562138010922, "grad_norm": 0.3670143187046051, "learning_rate": 4.17605217308793e-06, "loss": 0.3964, "step": 3610 }, { "epoch": 1.79265265596558, "grad_norm": 0.3006398379802704, "learning_rate": 4.173202721728873e-06, "loss": 0.3348, "step": 3611 }, { "epoch": 1.7931490981300677, "grad_norm": 0.3489590883255005, "learning_rate": 4.170353546414955e-06, "loss": 0.3945, "step": 3612 }, { "epoch": 1.7936455402945557, "grad_norm": 0.36203956604003906, "learning_rate": 4.167504648097438e-06, "loss": 0.3747, "step": 3613 }, { "epoch": 1.7941419824590437, "grad_norm": 0.316510409116745, "learning_rate": 4.164656027727495e-06, "loss": 0.3396, "step": 3614 }, { "epoch": 1.7946384246235314, "grad_norm": 0.36544686555862427, "learning_rate": 4.161807686256199e-06, "loss": 0.4063, "step": 3615 }, { "epoch": 1.7951348667880191, "grad_norm": 0.32986196875572205, "learning_rate": 4.158959624634537e-06, "loss": 0.3874, "step": 3616 }, { "epoch": 1.7956313089525069, "grad_norm": 0.33785051107406616, "learning_rate": 4.156111843813397e-06, "loss": 0.3819, "step": 3617 }, { "epoch": 1.7961277511169949, "grad_norm": 0.34470823407173157, "learning_rate": 4.153264344743578e-06, "loss": 0.3626, "step": 3618 }, { "epoch": 1.7966241932814828, "grad_norm": 0.35295894742012024, "learning_rate": 4.150417128375782e-06, "loss": 0.3879, "step": 3619 }, { "epoch": 1.7971206354459706, "grad_norm": 0.3285984694957733, "learning_rate": 4.147570195660614e-06, "loss": 0.3407, "step": 3620 }, { "epoch": 1.7976170776104583, "grad_norm": 0.35320550203323364, "learning_rate": 4.144723547548592e-06, "loss": 0.3629, "step": 3621 }, { "epoch": 1.7981135197749463, "grad_norm": 0.35180506110191345, "learning_rate": 4.141877184990133e-06, "loss": 0.3977, "step": 3622 }, { "epoch": 1.798609961939434, "grad_norm": 0.3499068021774292, "learning_rate": 4.1390311089355575e-06, "loss": 0.3723, "step": 3623 }, { "epoch": 1.799106404103922, "grad_norm": 0.3559710383415222, "learning_rate": 4.136185320335095e-06, "loss": 0.3952, "step": 3624 }, { "epoch": 1.7996028462684097, "grad_norm": 0.3277711272239685, "learning_rate": 4.133339820138876e-06, "loss": 0.3476, "step": 3625 }, { "epoch": 1.8000992884328975, "grad_norm": 0.3215638995170593, "learning_rate": 4.130494609296939e-06, "loss": 0.3446, "step": 3626 }, { "epoch": 1.8005957305973854, "grad_norm": 0.3382883071899414, "learning_rate": 4.12764968875922e-06, "loss": 0.3838, "step": 3627 }, { "epoch": 1.8010921727618734, "grad_norm": 0.36008119583129883, "learning_rate": 4.124805059475559e-06, "loss": 0.3799, "step": 3628 }, { "epoch": 1.8015886149263611, "grad_norm": 0.3596733510494232, "learning_rate": 4.1219607223957026e-06, "loss": 0.3553, "step": 3629 }, { "epoch": 1.8020850570908489, "grad_norm": 0.3740197420120239, "learning_rate": 4.119116678469298e-06, "loss": 0.3857, "step": 3630 }, { "epoch": 1.8025814992553366, "grad_norm": 0.3454590439796448, "learning_rate": 4.116272928645893e-06, "loss": 0.3453, "step": 3631 }, { "epoch": 1.8030779414198246, "grad_norm": 0.34508180618286133, "learning_rate": 4.113429473874938e-06, "loss": 0.3709, "step": 3632 }, { "epoch": 1.8035743835843125, "grad_norm": 0.37271371483802795, "learning_rate": 4.1105863151057865e-06, "loss": 0.3874, "step": 3633 }, { "epoch": 1.8040708257488003, "grad_norm": 0.37129831314086914, "learning_rate": 4.107743453287693e-06, "loss": 0.3652, "step": 3634 }, { "epoch": 1.804567267913288, "grad_norm": 0.3805919289588928, "learning_rate": 4.1049008893698066e-06, "loss": 0.3327, "step": 3635 }, { "epoch": 1.8050637100777758, "grad_norm": 0.36653101444244385, "learning_rate": 4.102058624301189e-06, "loss": 0.3723, "step": 3636 }, { "epoch": 1.8055601522422637, "grad_norm": 0.3156147599220276, "learning_rate": 4.099216659030792e-06, "loss": 0.3562, "step": 3637 }, { "epoch": 1.8060565944067517, "grad_norm": 0.3555183708667755, "learning_rate": 4.09637499450747e-06, "loss": 0.3811, "step": 3638 }, { "epoch": 1.8065530365712394, "grad_norm": 0.35448187589645386, "learning_rate": 4.0935336316799764e-06, "loss": 0.3969, "step": 3639 }, { "epoch": 1.8070494787357272, "grad_norm": 0.306679904460907, "learning_rate": 4.090692571496968e-06, "loss": 0.3294, "step": 3640 }, { "epoch": 1.8075459209002152, "grad_norm": 0.3177945613861084, "learning_rate": 4.087851814906997e-06, "loss": 0.3297, "step": 3641 }, { "epoch": 1.8080423630647031, "grad_norm": 0.32420986890792847, "learning_rate": 4.0850113628585155e-06, "loss": 0.3751, "step": 3642 }, { "epoch": 1.8085388052291909, "grad_norm": 0.3292515277862549, "learning_rate": 4.0821712162998686e-06, "loss": 0.3409, "step": 3643 }, { "epoch": 1.8090352473936786, "grad_norm": 0.34150591492652893, "learning_rate": 4.07933137617931e-06, "loss": 0.3616, "step": 3644 }, { "epoch": 1.8095316895581663, "grad_norm": 0.3436610698699951, "learning_rate": 4.076491843444982e-06, "loss": 0.3506, "step": 3645 }, { "epoch": 1.8100281317226543, "grad_norm": 0.3189089596271515, "learning_rate": 4.0736526190449264e-06, "loss": 0.3776, "step": 3646 }, { "epoch": 1.8105245738871423, "grad_norm": 0.30764153599739075, "learning_rate": 4.0708137039270855e-06, "loss": 0.3794, "step": 3647 }, { "epoch": 1.81102101605163, "grad_norm": 0.310226708650589, "learning_rate": 4.067975099039295e-06, "loss": 0.3733, "step": 3648 }, { "epoch": 1.8115174582161178, "grad_norm": 0.31864356994628906, "learning_rate": 4.065136805329289e-06, "loss": 0.3427, "step": 3649 }, { "epoch": 1.8120139003806055, "grad_norm": 0.34815356135368347, "learning_rate": 4.0622988237446924e-06, "loss": 0.3564, "step": 3650 }, { "epoch": 1.8125103425450935, "grad_norm": 0.3217158615589142, "learning_rate": 4.059461155233036e-06, "loss": 0.403, "step": 3651 }, { "epoch": 1.8130067847095814, "grad_norm": 0.3244237005710602, "learning_rate": 4.056623800741738e-06, "loss": 0.3635, "step": 3652 }, { "epoch": 1.8135032268740692, "grad_norm": 0.3133496642112732, "learning_rate": 4.053786761218113e-06, "loss": 0.3099, "step": 3653 }, { "epoch": 1.813999669038557, "grad_norm": 0.3626497983932495, "learning_rate": 4.05095003760937e-06, "loss": 0.3951, "step": 3654 }, { "epoch": 1.8144961112030449, "grad_norm": 0.3489393889904022, "learning_rate": 4.048113630862617e-06, "loss": 0.3681, "step": 3655 }, { "epoch": 1.8149925533675328, "grad_norm": 0.32637616991996765, "learning_rate": 4.045277541924851e-06, "loss": 0.3619, "step": 3656 }, { "epoch": 1.8154889955320206, "grad_norm": 0.36690080165863037, "learning_rate": 4.042441771742967e-06, "loss": 0.3588, "step": 3657 }, { "epoch": 1.8159854376965083, "grad_norm": 0.33373722434043884, "learning_rate": 4.039606321263748e-06, "loss": 0.3631, "step": 3658 }, { "epoch": 1.816481879860996, "grad_norm": 0.32555025815963745, "learning_rate": 4.036771191433879e-06, "loss": 0.3396, "step": 3659 }, { "epoch": 1.816978322025484, "grad_norm": 0.3188841640949249, "learning_rate": 4.03393638319993e-06, "loss": 0.369, "step": 3660 }, { "epoch": 1.817474764189972, "grad_norm": 0.34161943197250366, "learning_rate": 4.0311018975083644e-06, "loss": 0.3781, "step": 3661 }, { "epoch": 1.8179712063544597, "grad_norm": 0.31291887164115906, "learning_rate": 4.028267735305544e-06, "loss": 0.3443, "step": 3662 }, { "epoch": 1.8184676485189475, "grad_norm": 0.30888301134109497, "learning_rate": 4.025433897537715e-06, "loss": 0.3487, "step": 3663 }, { "epoch": 1.8189640906834352, "grad_norm": 0.34151017665863037, "learning_rate": 4.022600385151022e-06, "loss": 0.3716, "step": 3664 }, { "epoch": 1.8194605328479232, "grad_norm": 0.34958764910697937, "learning_rate": 4.019767199091494e-06, "loss": 0.4004, "step": 3665 }, { "epoch": 1.8199569750124112, "grad_norm": 0.35453763604164124, "learning_rate": 4.016934340305059e-06, "loss": 0.3767, "step": 3666 }, { "epoch": 1.820453417176899, "grad_norm": 0.3161080777645111, "learning_rate": 4.01410180973753e-06, "loss": 0.3588, "step": 3667 }, { "epoch": 1.8209498593413866, "grad_norm": 0.3461999297142029, "learning_rate": 4.01126960833461e-06, "loss": 0.3532, "step": 3668 }, { "epoch": 1.8214463015058746, "grad_norm": 0.3659539222717285, "learning_rate": 4.008437737041895e-06, "loss": 0.4016, "step": 3669 }, { "epoch": 1.8219427436703624, "grad_norm": 0.3250378966331482, "learning_rate": 4.005606196804872e-06, "loss": 0.3322, "step": 3670 }, { "epoch": 1.8224391858348503, "grad_norm": 0.3567419946193695, "learning_rate": 4.0027749885689126e-06, "loss": 0.3918, "step": 3671 }, { "epoch": 1.822935627999338, "grad_norm": 0.302636981010437, "learning_rate": 3.999944113279283e-06, "loss": 0.3392, "step": 3672 }, { "epoch": 1.8234320701638258, "grad_norm": 0.3799000084400177, "learning_rate": 3.9971135718811315e-06, "loss": 0.4077, "step": 3673 }, { "epoch": 1.8239285123283138, "grad_norm": 0.33208590745925903, "learning_rate": 3.994283365319503e-06, "loss": 0.3809, "step": 3674 }, { "epoch": 1.8244249544928017, "grad_norm": 0.33376696705818176, "learning_rate": 3.991453494539326e-06, "loss": 0.377, "step": 3675 }, { "epoch": 1.8249213966572895, "grad_norm": 0.3534259796142578, "learning_rate": 3.988623960485414e-06, "loss": 0.3582, "step": 3676 }, { "epoch": 1.8254178388217772, "grad_norm": 0.3429949879646301, "learning_rate": 3.985794764102475e-06, "loss": 0.3417, "step": 3677 }, { "epoch": 1.825914280986265, "grad_norm": 0.34000205993652344, "learning_rate": 3.9829659063351e-06, "loss": 0.3684, "step": 3678 }, { "epoch": 1.826410723150753, "grad_norm": 0.3505455255508423, "learning_rate": 3.980137388127768e-06, "loss": 0.3573, "step": 3679 }, { "epoch": 1.8269071653152409, "grad_norm": 0.33556246757507324, "learning_rate": 3.977309210424841e-06, "loss": 0.3693, "step": 3680 }, { "epoch": 1.8274036074797286, "grad_norm": 0.323586642742157, "learning_rate": 3.9744813741705766e-06, "loss": 0.3333, "step": 3681 }, { "epoch": 1.8279000496442164, "grad_norm": 0.34015408158302307, "learning_rate": 3.971653880309109e-06, "loss": 0.3619, "step": 3682 }, { "epoch": 1.8283964918087043, "grad_norm": 0.3404366970062256, "learning_rate": 3.968826729784462e-06, "loss": 0.3627, "step": 3683 }, { "epoch": 1.828892933973192, "grad_norm": 0.319111704826355, "learning_rate": 3.965999923540541e-06, "loss": 0.3367, "step": 3684 }, { "epoch": 1.82938937613768, "grad_norm": 0.36395180225372314, "learning_rate": 3.963173462521146e-06, "loss": 0.3966, "step": 3685 }, { "epoch": 1.8298858183021678, "grad_norm": 0.33997803926467896, "learning_rate": 3.960347347669951e-06, "loss": 0.3708, "step": 3686 }, { "epoch": 1.8303822604666555, "grad_norm": 0.3420216739177704, "learning_rate": 3.957521579930522e-06, "loss": 0.39, "step": 3687 }, { "epoch": 1.8308787026311435, "grad_norm": 0.3440239131450653, "learning_rate": 3.954696160246302e-06, "loss": 0.3574, "step": 3688 }, { "epoch": 1.8313751447956315, "grad_norm": 0.3557625114917755, "learning_rate": 3.951871089560626e-06, "loss": 0.3797, "step": 3689 }, { "epoch": 1.8318715869601192, "grad_norm": 0.33279120922088623, "learning_rate": 3.949046368816708e-06, "loss": 0.3854, "step": 3690 }, { "epoch": 1.832368029124607, "grad_norm": 0.32144731283187866, "learning_rate": 3.94622199895764e-06, "loss": 0.3768, "step": 3691 }, { "epoch": 1.8328644712890947, "grad_norm": 0.3611636459827423, "learning_rate": 3.94339798092641e-06, "loss": 0.3582, "step": 3692 }, { "epoch": 1.8333609134535827, "grad_norm": 0.3103770613670349, "learning_rate": 3.940574315665877e-06, "loss": 0.3506, "step": 3693 }, { "epoch": 1.8338573556180706, "grad_norm": 0.3089020550251007, "learning_rate": 3.937751004118786e-06, "loss": 0.3715, "step": 3694 }, { "epoch": 1.8343537977825584, "grad_norm": 0.371931254863739, "learning_rate": 3.934928047227764e-06, "loss": 0.3835, "step": 3695 }, { "epoch": 1.834850239947046, "grad_norm": 0.342430055141449, "learning_rate": 3.932105445935319e-06, "loss": 0.3356, "step": 3696 }, { "epoch": 1.8353466821115338, "grad_norm": 0.353863924741745, "learning_rate": 3.929283201183844e-06, "loss": 0.3919, "step": 3697 }, { "epoch": 1.8358431242760218, "grad_norm": 0.325687050819397, "learning_rate": 3.926461313915607e-06, "loss": 0.3793, "step": 3698 }, { "epoch": 1.8363395664405098, "grad_norm": 0.3294696807861328, "learning_rate": 3.923639785072759e-06, "loss": 0.3809, "step": 3699 }, { "epoch": 1.8368360086049975, "grad_norm": 0.33163613080978394, "learning_rate": 3.920818615597334e-06, "loss": 0.3559, "step": 3700 }, { "epoch": 1.8373324507694853, "grad_norm": 0.3334925174713135, "learning_rate": 3.9179978064312426e-06, "loss": 0.3552, "step": 3701 }, { "epoch": 1.8378288929339732, "grad_norm": 0.32766255736351013, "learning_rate": 3.915177358516276e-06, "loss": 0.3826, "step": 3702 }, { "epoch": 1.8383253350984612, "grad_norm": 0.3322311341762543, "learning_rate": 3.912357272794105e-06, "loss": 0.3597, "step": 3703 }, { "epoch": 1.838821777262949, "grad_norm": 0.33038952946662903, "learning_rate": 3.909537550206281e-06, "loss": 0.3381, "step": 3704 }, { "epoch": 1.8393182194274367, "grad_norm": 0.33699166774749756, "learning_rate": 3.906718191694232e-06, "loss": 0.3672, "step": 3705 }, { "epoch": 1.8398146615919244, "grad_norm": 0.3806226849555969, "learning_rate": 3.903899198199264e-06, "loss": 0.3778, "step": 3706 }, { "epoch": 1.8403111037564124, "grad_norm": 0.34056535363197327, "learning_rate": 3.901080570662565e-06, "loss": 0.382, "step": 3707 }, { "epoch": 1.8408075459209003, "grad_norm": 0.31783270835876465, "learning_rate": 3.898262310025196e-06, "loss": 0.3665, "step": 3708 }, { "epoch": 1.841303988085388, "grad_norm": 0.32973000407218933, "learning_rate": 3.895444417228097e-06, "loss": 0.3974, "step": 3709 }, { "epoch": 1.8418004302498758, "grad_norm": 0.3023588955402374, "learning_rate": 3.892626893212088e-06, "loss": 0.3051, "step": 3710 }, { "epoch": 1.8422968724143636, "grad_norm": 0.36771103739738464, "learning_rate": 3.889809738917862e-06, "loss": 0.4219, "step": 3711 }, { "epoch": 1.8427933145788515, "grad_norm": 0.3069499731063843, "learning_rate": 3.8869929552859915e-06, "loss": 0.3323, "step": 3712 }, { "epoch": 1.8432897567433395, "grad_norm": 0.31589680910110474, "learning_rate": 3.884176543256924e-06, "loss": 0.3574, "step": 3713 }, { "epoch": 1.8437861989078272, "grad_norm": 0.32025447487831116, "learning_rate": 3.88136050377098e-06, "loss": 0.391, "step": 3714 }, { "epoch": 1.844282641072315, "grad_norm": 0.33361780643463135, "learning_rate": 3.878544837768362e-06, "loss": 0.3512, "step": 3715 }, { "epoch": 1.844779083236803, "grad_norm": 0.3694958984851837, "learning_rate": 3.875729546189144e-06, "loss": 0.4057, "step": 3716 }, { "epoch": 1.8452755254012907, "grad_norm": 0.3216908872127533, "learning_rate": 3.872914629973273e-06, "loss": 0.3682, "step": 3717 }, { "epoch": 1.8457719675657787, "grad_norm": 0.3433736562728882, "learning_rate": 3.870100090060577e-06, "loss": 0.3777, "step": 3718 }, { "epoch": 1.8462684097302664, "grad_norm": 0.33654144406318665, "learning_rate": 3.8672859273907495e-06, "loss": 0.3761, "step": 3719 }, { "epoch": 1.8467648518947541, "grad_norm": 0.36412063241004944, "learning_rate": 3.864472142903367e-06, "loss": 0.3905, "step": 3720 }, { "epoch": 1.847261294059242, "grad_norm": 0.3770802617073059, "learning_rate": 3.861658737537872e-06, "loss": 0.4132, "step": 3721 }, { "epoch": 1.84775773622373, "grad_norm": 0.33560192584991455, "learning_rate": 3.858845712233588e-06, "loss": 0.3489, "step": 3722 }, { "epoch": 1.8482541783882178, "grad_norm": 0.3586503267288208, "learning_rate": 3.8560330679297065e-06, "loss": 0.361, "step": 3723 }, { "epoch": 1.8487506205527056, "grad_norm": 0.32046470046043396, "learning_rate": 3.853220805565292e-06, "loss": 0.3341, "step": 3724 }, { "epoch": 1.8492470627171933, "grad_norm": 0.3655908703804016, "learning_rate": 3.850408926079281e-06, "loss": 0.3964, "step": 3725 }, { "epoch": 1.8497435048816813, "grad_norm": 0.33560124039649963, "learning_rate": 3.847597430410486e-06, "loss": 0.3511, "step": 3726 }, { "epoch": 1.8502399470461692, "grad_norm": 0.35963594913482666, "learning_rate": 3.844786319497589e-06, "loss": 0.3699, "step": 3727 }, { "epoch": 1.850736389210657, "grad_norm": 0.34241801500320435, "learning_rate": 3.841975594279144e-06, "loss": 0.3969, "step": 3728 }, { "epoch": 1.8512328313751447, "grad_norm": 0.3354063630104065, "learning_rate": 3.839165255693571e-06, "loss": 0.3429, "step": 3729 }, { "epoch": 1.8517292735396327, "grad_norm": 0.34503376483917236, "learning_rate": 3.836355304679173e-06, "loss": 0.3729, "step": 3730 }, { "epoch": 1.8522257157041204, "grad_norm": 0.36571240425109863, "learning_rate": 3.833545742174113e-06, "loss": 0.3345, "step": 3731 }, { "epoch": 1.8527221578686084, "grad_norm": 0.3648792803287506, "learning_rate": 3.830736569116423e-06, "loss": 0.3516, "step": 3732 }, { "epoch": 1.8532186000330961, "grad_norm": 0.3707738518714905, "learning_rate": 3.827927786444018e-06, "loss": 0.3607, "step": 3733 }, { "epoch": 1.8537150421975839, "grad_norm": 0.3430705666542053, "learning_rate": 3.825119395094668e-06, "loss": 0.3695, "step": 3734 }, { "epoch": 1.8542114843620718, "grad_norm": 0.349571168422699, "learning_rate": 3.822311396006022e-06, "loss": 0.3864, "step": 3735 }, { "epoch": 1.8547079265265598, "grad_norm": 0.33406907320022583, "learning_rate": 3.81950379011559e-06, "loss": 0.3214, "step": 3736 }, { "epoch": 1.8552043686910475, "grad_norm": 0.38630571961402893, "learning_rate": 3.816696578360761e-06, "loss": 0.4027, "step": 3737 }, { "epoch": 1.8557008108555353, "grad_norm": 0.3739798963069916, "learning_rate": 3.8138897616787847e-06, "loss": 0.4111, "step": 3738 }, { "epoch": 1.856197253020023, "grad_norm": 0.3356795608997345, "learning_rate": 3.8110833410067795e-06, "loss": 0.3373, "step": 3739 }, { "epoch": 1.856693695184511, "grad_norm": 0.32553228735923767, "learning_rate": 3.808277317281732e-06, "loss": 0.4037, "step": 3740 }, { "epoch": 1.857190137348999, "grad_norm": 0.33414605259895325, "learning_rate": 3.805471691440501e-06, "loss": 0.3899, "step": 3741 }, { "epoch": 1.8576865795134867, "grad_norm": 0.3404712677001953, "learning_rate": 3.802666464419806e-06, "loss": 0.3463, "step": 3742 }, { "epoch": 1.8581830216779744, "grad_norm": 0.32204535603523254, "learning_rate": 3.7998616371562377e-06, "loss": 0.3271, "step": 3743 }, { "epoch": 1.8586794638424622, "grad_norm": 0.3415451645851135, "learning_rate": 3.797057210586248e-06, "loss": 0.3736, "step": 3744 }, { "epoch": 1.8591759060069502, "grad_norm": 0.34274202585220337, "learning_rate": 3.7942531856461643e-06, "loss": 0.3886, "step": 3745 }, { "epoch": 1.8596723481714381, "grad_norm": 0.37322527170181274, "learning_rate": 3.7914495632721713e-06, "loss": 0.3952, "step": 3746 }, { "epoch": 1.8601687903359259, "grad_norm": 0.32575905323028564, "learning_rate": 3.788646344400321e-06, "loss": 0.3194, "step": 3747 }, { "epoch": 1.8606652325004136, "grad_norm": 0.3274308741092682, "learning_rate": 3.7858435299665354e-06, "loss": 0.3395, "step": 3748 }, { "epoch": 1.8611616746649016, "grad_norm": 0.34329789876937866, "learning_rate": 3.783041120906596e-06, "loss": 0.3755, "step": 3749 }, { "epoch": 1.8616581168293895, "grad_norm": 0.3116765022277832, "learning_rate": 3.7802391181561497e-06, "loss": 0.3496, "step": 3750 }, { "epoch": 1.8621545589938773, "grad_norm": 0.37184619903564453, "learning_rate": 3.7774375226507106e-06, "loss": 0.3862, "step": 3751 }, { "epoch": 1.862651001158365, "grad_norm": 0.3392834961414337, "learning_rate": 3.7746363353256567e-06, "loss": 0.3635, "step": 3752 }, { "epoch": 1.8631474433228528, "grad_norm": 0.33720460534095764, "learning_rate": 3.7718355571162266e-06, "loss": 0.3818, "step": 3753 }, { "epoch": 1.8636438854873407, "grad_norm": 0.3252834677696228, "learning_rate": 3.769035188957525e-06, "loss": 0.3652, "step": 3754 }, { "epoch": 1.8641403276518287, "grad_norm": 0.30949854850769043, "learning_rate": 3.766235231784515e-06, "loss": 0.3857, "step": 3755 }, { "epoch": 1.8646367698163164, "grad_norm": 0.3112739026546478, "learning_rate": 3.7634356865320327e-06, "loss": 0.3359, "step": 3756 }, { "epoch": 1.8651332119808042, "grad_norm": 0.356926828622818, "learning_rate": 3.760636554134765e-06, "loss": 0.3997, "step": 3757 }, { "epoch": 1.865629654145292, "grad_norm": 0.3206149637699127, "learning_rate": 3.757837835527268e-06, "loss": 0.3462, "step": 3758 }, { "epoch": 1.8661260963097799, "grad_norm": 0.32894831895828247, "learning_rate": 3.7550395316439568e-06, "loss": 0.355, "step": 3759 }, { "epoch": 1.8666225384742678, "grad_norm": 0.33003053069114685, "learning_rate": 3.7522416434191117e-06, "loss": 0.3898, "step": 3760 }, { "epoch": 1.8671189806387556, "grad_norm": 0.31348395347595215, "learning_rate": 3.7494441717868698e-06, "loss": 0.3399, "step": 3761 }, { "epoch": 1.8676154228032433, "grad_norm": 0.3353370726108551, "learning_rate": 3.746647117681228e-06, "loss": 0.3751, "step": 3762 }, { "epoch": 1.8681118649677313, "grad_norm": 0.32981693744659424, "learning_rate": 3.7438504820360523e-06, "loss": 0.3776, "step": 3763 }, { "epoch": 1.8686083071322193, "grad_norm": 0.35747820138931274, "learning_rate": 3.741054265785059e-06, "loss": 0.3805, "step": 3764 }, { "epoch": 1.869104749296707, "grad_norm": 0.3330358862876892, "learning_rate": 3.738258469861831e-06, "loss": 0.352, "step": 3765 }, { "epoch": 1.8696011914611947, "grad_norm": 0.3465973734855652, "learning_rate": 3.7354630951998063e-06, "loss": 0.3933, "step": 3766 }, { "epoch": 1.8700976336256825, "grad_norm": 0.30421963334083557, "learning_rate": 3.732668142732286e-06, "loss": 0.3384, "step": 3767 }, { "epoch": 1.8705940757901705, "grad_norm": 0.3537590503692627, "learning_rate": 3.7298736133924295e-06, "loss": 0.3561, "step": 3768 }, { "epoch": 1.8710905179546584, "grad_norm": 0.3652830719947815, "learning_rate": 3.727079508113254e-06, "loss": 0.3892, "step": 3769 }, { "epoch": 1.8715869601191462, "grad_norm": 0.3459082841873169, "learning_rate": 3.724285827827633e-06, "loss": 0.354, "step": 3770 }, { "epoch": 1.872083402283634, "grad_norm": 0.3248676359653473, "learning_rate": 3.721492573468303e-06, "loss": 0.4158, "step": 3771 }, { "epoch": 1.8725798444481216, "grad_norm": 0.3259899616241455, "learning_rate": 3.7186997459678553e-06, "loss": 0.3507, "step": 3772 }, { "epoch": 1.8730762866126096, "grad_norm": 0.3483177721500397, "learning_rate": 3.715907346258737e-06, "loss": 0.3792, "step": 3773 }, { "epoch": 1.8735727287770976, "grad_norm": 0.3273208439350128, "learning_rate": 3.7131153752732563e-06, "loss": 0.387, "step": 3774 }, { "epoch": 1.8740691709415853, "grad_norm": 0.3490307331085205, "learning_rate": 3.7103238339435776e-06, "loss": 0.4081, "step": 3775 }, { "epoch": 1.874565613106073, "grad_norm": 0.336071640253067, "learning_rate": 3.7075327232017195e-06, "loss": 0.3617, "step": 3776 }, { "epoch": 1.875062055270561, "grad_norm": 0.3312743306159973, "learning_rate": 3.7047420439795555e-06, "loss": 0.3753, "step": 3777 }, { "epoch": 1.8755584974350488, "grad_norm": 0.31227707862854004, "learning_rate": 3.701951797208822e-06, "loss": 0.3251, "step": 3778 }, { "epoch": 1.8760549395995367, "grad_norm": 0.3400703966617584, "learning_rate": 3.6991619838211048e-06, "loss": 0.3935, "step": 3779 }, { "epoch": 1.8765513817640245, "grad_norm": 0.34836992621421814, "learning_rate": 3.696372604747845e-06, "loss": 0.3719, "step": 3780 }, { "epoch": 1.8770478239285122, "grad_norm": 0.32354557514190674, "learning_rate": 3.6935836609203412e-06, "loss": 0.3305, "step": 3781 }, { "epoch": 1.8775442660930002, "grad_norm": 0.3633301258087158, "learning_rate": 3.6907951532697474e-06, "loss": 0.4026, "step": 3782 }, { "epoch": 1.8780407082574881, "grad_norm": 0.34747037291526794, "learning_rate": 3.688007082727071e-06, "loss": 0.4001, "step": 3783 }, { "epoch": 1.8785371504219759, "grad_norm": 0.32824939489364624, "learning_rate": 3.6852194502231707e-06, "loss": 0.3476, "step": 3784 }, { "epoch": 1.8790335925864636, "grad_norm": 0.35886794328689575, "learning_rate": 3.682432256688761e-06, "loss": 0.3861, "step": 3785 }, { "epoch": 1.8795300347509514, "grad_norm": 0.3211080729961395, "learning_rate": 3.6796455030544133e-06, "loss": 0.3426, "step": 3786 }, { "epoch": 1.8800264769154393, "grad_norm": 0.3477056324481964, "learning_rate": 3.6768591902505467e-06, "loss": 0.3502, "step": 3787 }, { "epoch": 1.8805229190799273, "grad_norm": 0.3186051547527313, "learning_rate": 3.674073319207433e-06, "loss": 0.3622, "step": 3788 }, { "epoch": 1.881019361244415, "grad_norm": 0.32496124505996704, "learning_rate": 3.671287890855204e-06, "loss": 0.3742, "step": 3789 }, { "epoch": 1.8815158034089028, "grad_norm": 0.3474034070968628, "learning_rate": 3.6685029061238344e-06, "loss": 0.3923, "step": 3790 }, { "epoch": 1.8820122455733908, "grad_norm": 0.3202638030052185, "learning_rate": 3.665718365943158e-06, "loss": 0.344, "step": 3791 }, { "epoch": 1.8825086877378785, "grad_norm": 0.38473060727119446, "learning_rate": 3.662934271242853e-06, "loss": 0.4051, "step": 3792 }, { "epoch": 1.8830051299023665, "grad_norm": 0.3876386880874634, "learning_rate": 3.6601506229524576e-06, "loss": 0.3571, "step": 3793 }, { "epoch": 1.8835015720668542, "grad_norm": 0.35024967789649963, "learning_rate": 3.6573674220013532e-06, "loss": 0.3628, "step": 3794 }, { "epoch": 1.883998014231342, "grad_norm": 0.3238316476345062, "learning_rate": 3.654584669318777e-06, "loss": 0.3363, "step": 3795 }, { "epoch": 1.88449445639583, "grad_norm": 0.35982978343963623, "learning_rate": 3.6518023658338107e-06, "loss": 0.3717, "step": 3796 }, { "epoch": 1.8849908985603179, "grad_norm": 0.38772645592689514, "learning_rate": 3.6490205124753947e-06, "loss": 0.3965, "step": 3797 }, { "epoch": 1.8854873407248056, "grad_norm": 0.33302491903305054, "learning_rate": 3.646239110172311e-06, "loss": 0.362, "step": 3798 }, { "epoch": 1.8859837828892934, "grad_norm": 0.31191486120224, "learning_rate": 3.6434581598531937e-06, "loss": 0.3824, "step": 3799 }, { "epoch": 1.886480225053781, "grad_norm": 0.3333255648612976, "learning_rate": 3.640677662446531e-06, "loss": 0.3827, "step": 3800 }, { "epoch": 1.886976667218269, "grad_norm": 0.33496958017349243, "learning_rate": 3.6378976188806525e-06, "loss": 0.3731, "step": 3801 }, { "epoch": 1.887473109382757, "grad_norm": 0.33254340291023254, "learning_rate": 3.6351180300837386e-06, "loss": 0.3714, "step": 3802 }, { "epoch": 1.8879695515472448, "grad_norm": 0.3369951546192169, "learning_rate": 3.632338896983817e-06, "loss": 0.3706, "step": 3803 }, { "epoch": 1.8884659937117325, "grad_norm": 0.35696941614151, "learning_rate": 3.6295602205087687e-06, "loss": 0.3599, "step": 3804 }, { "epoch": 1.8889624358762203, "grad_norm": 0.33578750491142273, "learning_rate": 3.6267820015863153e-06, "loss": 0.3879, "step": 3805 }, { "epoch": 1.8894588780407082, "grad_norm": 0.3093549609184265, "learning_rate": 3.624004241144031e-06, "loss": 0.3633, "step": 3806 }, { "epoch": 1.8899553202051962, "grad_norm": 0.3339678645133972, "learning_rate": 3.621226940109331e-06, "loss": 0.378, "step": 3807 }, { "epoch": 1.890451762369684, "grad_norm": 0.3395077884197235, "learning_rate": 3.618450099409484e-06, "loss": 0.3592, "step": 3808 }, { "epoch": 1.8909482045341717, "grad_norm": 0.3452979028224945, "learning_rate": 3.6156737199716014e-06, "loss": 0.3761, "step": 3809 }, { "epoch": 1.8914446466986596, "grad_norm": 0.3243652582168579, "learning_rate": 3.612897802722639e-06, "loss": 0.3657, "step": 3810 }, { "epoch": 1.8919410888631476, "grad_norm": 0.31240904331207275, "learning_rate": 3.6101223485893995e-06, "loss": 0.3475, "step": 3811 }, { "epoch": 1.8924375310276353, "grad_norm": 0.32762691378593445, "learning_rate": 3.6073473584985346e-06, "loss": 0.4009, "step": 3812 }, { "epoch": 1.892933973192123, "grad_norm": 0.332918256521225, "learning_rate": 3.6045728333765356e-06, "loss": 0.3498, "step": 3813 }, { "epoch": 1.8934304153566108, "grad_norm": 0.3281799256801605, "learning_rate": 3.601798774149742e-06, "loss": 0.3845, "step": 3814 }, { "epoch": 1.8939268575210988, "grad_norm": 0.33677732944488525, "learning_rate": 3.5990251817443365e-06, "loss": 0.3449, "step": 3815 }, { "epoch": 1.8944232996855868, "grad_norm": 0.3376767635345459, "learning_rate": 3.596252057086348e-06, "loss": 0.3516, "step": 3816 }, { "epoch": 1.8949197418500745, "grad_norm": 0.33410343527793884, "learning_rate": 3.593479401101645e-06, "loss": 0.3703, "step": 3817 }, { "epoch": 1.8954161840145622, "grad_norm": 0.3075319230556488, "learning_rate": 3.590707214715942e-06, "loss": 0.3635, "step": 3818 }, { "epoch": 1.89591262617905, "grad_norm": 0.32732322812080383, "learning_rate": 3.5879354988547988e-06, "loss": 0.3691, "step": 3819 }, { "epoch": 1.896409068343538, "grad_norm": 0.28725630044937134, "learning_rate": 3.585164254443615e-06, "loss": 0.3388, "step": 3820 }, { "epoch": 1.896905510508026, "grad_norm": 0.32536420226097107, "learning_rate": 3.582393482407632e-06, "loss": 0.3785, "step": 3821 }, { "epoch": 1.8974019526725137, "grad_norm": 0.3242858052253723, "learning_rate": 3.5796231836719363e-06, "loss": 0.3699, "step": 3822 }, { "epoch": 1.8978983948370014, "grad_norm": 0.34999698400497437, "learning_rate": 3.5768533591614575e-06, "loss": 0.3897, "step": 3823 }, { "epoch": 1.8983948370014894, "grad_norm": 0.3389486074447632, "learning_rate": 3.5740840098009634e-06, "loss": 0.3763, "step": 3824 }, { "epoch": 1.898891279165977, "grad_norm": 0.3358362317085266, "learning_rate": 3.5713151365150645e-06, "loss": 0.3398, "step": 3825 }, { "epoch": 1.899387721330465, "grad_norm": 0.3397561013698578, "learning_rate": 3.5685467402282093e-06, "loss": 0.321, "step": 3826 }, { "epoch": 1.8998841634949528, "grad_norm": 0.3701314926147461, "learning_rate": 3.565778821864695e-06, "loss": 0.4205, "step": 3827 }, { "epoch": 1.9003806056594406, "grad_norm": 0.34511399269104004, "learning_rate": 3.563011382348651e-06, "loss": 0.3573, "step": 3828 }, { "epoch": 1.9008770478239285, "grad_norm": 0.34791305661201477, "learning_rate": 3.560244422604052e-06, "loss": 0.4165, "step": 3829 }, { "epoch": 1.9013734899884165, "grad_norm": 0.30670687556266785, "learning_rate": 3.557477943554709e-06, "loss": 0.35, "step": 3830 }, { "epoch": 1.9018699321529042, "grad_norm": 0.3210630714893341, "learning_rate": 3.5547119461242766e-06, "loss": 0.3359, "step": 3831 }, { "epoch": 1.902366374317392, "grad_norm": 0.3771286904811859, "learning_rate": 3.551946431236245e-06, "loss": 0.4121, "step": 3832 }, { "epoch": 1.9028628164818797, "grad_norm": 0.3272000849246979, "learning_rate": 3.5491813998139413e-06, "loss": 0.3735, "step": 3833 }, { "epoch": 1.9033592586463677, "grad_norm": 0.33726444840431213, "learning_rate": 3.5464168527805398e-06, "loss": 0.3687, "step": 3834 }, { "epoch": 1.9038557008108556, "grad_norm": 0.3200182020664215, "learning_rate": 3.5436527910590446e-06, "loss": 0.3436, "step": 3835 }, { "epoch": 1.9043521429753434, "grad_norm": 0.3444659411907196, "learning_rate": 3.5408892155723e-06, "loss": 0.3298, "step": 3836 }, { "epoch": 1.9048485851398311, "grad_norm": 0.37392792105674744, "learning_rate": 3.53812612724299e-06, "loss": 0.3534, "step": 3837 }, { "epoch": 1.905345027304319, "grad_norm": 0.32063260674476624, "learning_rate": 3.535363526993635e-06, "loss": 0.4166, "step": 3838 }, { "epoch": 1.9058414694688068, "grad_norm": 0.3341764509677887, "learning_rate": 3.5326014157465922e-06, "loss": 0.3725, "step": 3839 }, { "epoch": 1.9063379116332948, "grad_norm": 0.31554311513900757, "learning_rate": 3.5298397944240524e-06, "loss": 0.3815, "step": 3840 }, { "epoch": 1.9068343537977825, "grad_norm": 0.3730708956718445, "learning_rate": 3.5270786639480512e-06, "loss": 0.3611, "step": 3841 }, { "epoch": 1.9073307959622703, "grad_norm": 0.3260488212108612, "learning_rate": 3.524318025240453e-06, "loss": 0.3659, "step": 3842 }, { "epoch": 1.9078272381267583, "grad_norm": 0.3428926169872284, "learning_rate": 3.5215578792229586e-06, "loss": 0.4058, "step": 3843 }, { "epoch": 1.9083236802912462, "grad_norm": 0.31456688046455383, "learning_rate": 3.518798226817105e-06, "loss": 0.3417, "step": 3844 }, { "epoch": 1.908820122455734, "grad_norm": 0.33419618010520935, "learning_rate": 3.516039068944267e-06, "loss": 0.3922, "step": 3845 }, { "epoch": 1.9093165646202217, "grad_norm": 0.3370512127876282, "learning_rate": 3.513280406525653e-06, "loss": 0.3661, "step": 3846 }, { "epoch": 1.9098130067847094, "grad_norm": 0.31028103828430176, "learning_rate": 3.510522240482305e-06, "loss": 0.3424, "step": 3847 }, { "epoch": 1.9103094489491974, "grad_norm": 0.3382093608379364, "learning_rate": 3.507764571735097e-06, "loss": 0.3775, "step": 3848 }, { "epoch": 1.9108058911136854, "grad_norm": 0.34646105766296387, "learning_rate": 3.5050074012047443e-06, "loss": 0.3694, "step": 3849 }, { "epoch": 1.9113023332781731, "grad_norm": 0.36237967014312744, "learning_rate": 3.5022507298117873e-06, "loss": 0.3918, "step": 3850 }, { "epoch": 1.9117987754426609, "grad_norm": 0.32494789361953735, "learning_rate": 3.4994945584766048e-06, "loss": 0.3504, "step": 3851 }, { "epoch": 1.9122952176071486, "grad_norm": 0.326159805059433, "learning_rate": 3.4967388881194083e-06, "loss": 0.3604, "step": 3852 }, { "epoch": 1.9127916597716366, "grad_norm": 0.32857075333595276, "learning_rate": 3.49398371966024e-06, "loss": 0.3622, "step": 3853 }, { "epoch": 1.9132881019361245, "grad_norm": 0.32144418358802795, "learning_rate": 3.4912290540189776e-06, "loss": 0.3506, "step": 3854 }, { "epoch": 1.9137845441006123, "grad_norm": 0.3219118118286133, "learning_rate": 3.4884748921153253e-06, "loss": 0.3537, "step": 3855 }, { "epoch": 1.9142809862651, "grad_norm": 0.35433369874954224, "learning_rate": 3.4857212348688285e-06, "loss": 0.4029, "step": 3856 }, { "epoch": 1.914777428429588, "grad_norm": 0.3104363679885864, "learning_rate": 3.4829680831988557e-06, "loss": 0.3195, "step": 3857 }, { "epoch": 1.915273870594076, "grad_norm": 0.3316510319709778, "learning_rate": 3.480215438024609e-06, "loss": 0.3451, "step": 3858 }, { "epoch": 1.9157703127585637, "grad_norm": 0.36146679520606995, "learning_rate": 3.4774633002651196e-06, "loss": 0.3794, "step": 3859 }, { "epoch": 1.9162667549230514, "grad_norm": 0.3548322319984436, "learning_rate": 3.4747116708392565e-06, "loss": 0.3857, "step": 3860 }, { "epoch": 1.9167631970875392, "grad_norm": 0.31044018268585205, "learning_rate": 3.4719605506657105e-06, "loss": 0.2991, "step": 3861 }, { "epoch": 1.9172596392520271, "grad_norm": 0.3314984142780304, "learning_rate": 3.4692099406630076e-06, "loss": 0.3406, "step": 3862 }, { "epoch": 1.917756081416515, "grad_norm": 0.3513180911540985, "learning_rate": 3.466459841749499e-06, "loss": 0.3974, "step": 3863 }, { "epoch": 1.9182525235810028, "grad_norm": 0.34671157598495483, "learning_rate": 3.463710254843372e-06, "loss": 0.3468, "step": 3864 }, { "epoch": 1.9187489657454906, "grad_norm": 0.3332311511039734, "learning_rate": 3.4609611808626363e-06, "loss": 0.3622, "step": 3865 }, { "epoch": 1.9192454079099783, "grad_norm": 0.4091743230819702, "learning_rate": 3.458212620725134e-06, "loss": 0.3974, "step": 3866 }, { "epoch": 1.9197418500744663, "grad_norm": 0.3423232138156891, "learning_rate": 3.4554645753485326e-06, "loss": 0.3765, "step": 3867 }, { "epoch": 1.9202382922389543, "grad_norm": 0.3542107045650482, "learning_rate": 3.452717045650332e-06, "loss": 0.3695, "step": 3868 }, { "epoch": 1.920734734403442, "grad_norm": 0.34170565009117126, "learning_rate": 3.449970032547858e-06, "loss": 0.3809, "step": 3869 }, { "epoch": 1.9212311765679297, "grad_norm": 0.34844279289245605, "learning_rate": 3.4472235369582603e-06, "loss": 0.411, "step": 3870 }, { "epoch": 1.9217276187324177, "grad_norm": 0.31811878085136414, "learning_rate": 3.4444775597985236e-06, "loss": 0.3292, "step": 3871 }, { "epoch": 1.9222240608969057, "grad_norm": 0.3751319944858551, "learning_rate": 3.4417321019854533e-06, "loss": 0.3895, "step": 3872 }, { "epoch": 1.9227205030613934, "grad_norm": 0.3897225558757782, "learning_rate": 3.4389871644356825e-06, "loss": 0.3674, "step": 3873 }, { "epoch": 1.9232169452258812, "grad_norm": 0.34066036343574524, "learning_rate": 3.4362427480656703e-06, "loss": 0.3556, "step": 3874 }, { "epoch": 1.923713387390369, "grad_norm": 0.3981136977672577, "learning_rate": 3.4334988537917045e-06, "loss": 0.3881, "step": 3875 }, { "epoch": 1.9242098295548569, "grad_norm": 0.34900400042533875, "learning_rate": 3.430755482529896e-06, "loss": 0.3339, "step": 3876 }, { "epoch": 1.9247062717193448, "grad_norm": 0.3486894369125366, "learning_rate": 3.428012635196184e-06, "loss": 0.3656, "step": 3877 }, { "epoch": 1.9252027138838326, "grad_norm": 0.2833118736743927, "learning_rate": 3.425270312706326e-06, "loss": 0.2987, "step": 3878 }, { "epoch": 1.9256991560483203, "grad_norm": 0.3621808588504791, "learning_rate": 3.4225285159759137e-06, "loss": 0.4191, "step": 3879 }, { "epoch": 1.926195598212808, "grad_norm": 0.34801527857780457, "learning_rate": 3.419787245920357e-06, "loss": 0.3996, "step": 3880 }, { "epoch": 1.926692040377296, "grad_norm": 0.33278945088386536, "learning_rate": 3.4170465034548883e-06, "loss": 0.3979, "step": 3881 }, { "epoch": 1.927188482541784, "grad_norm": 0.31858885288238525, "learning_rate": 3.4143062894945727e-06, "loss": 0.3857, "step": 3882 }, { "epoch": 1.9276849247062717, "grad_norm": 0.3311390280723572, "learning_rate": 3.41156660495429e-06, "loss": 0.3452, "step": 3883 }, { "epoch": 1.9281813668707595, "grad_norm": 0.3232775330543518, "learning_rate": 3.4088274507487455e-06, "loss": 0.3537, "step": 3884 }, { "epoch": 1.9286778090352474, "grad_norm": 0.35950398445129395, "learning_rate": 3.4060888277924697e-06, "loss": 0.3589, "step": 3885 }, { "epoch": 1.9291742511997352, "grad_norm": 0.3189614415168762, "learning_rate": 3.4033507369998143e-06, "loss": 0.3778, "step": 3886 }, { "epoch": 1.9296706933642231, "grad_norm": 0.3157059848308563, "learning_rate": 3.400613179284954e-06, "loss": 0.3271, "step": 3887 }, { "epoch": 1.9301671355287109, "grad_norm": 0.32224324345588684, "learning_rate": 3.3978761555618845e-06, "loss": 0.3369, "step": 3888 }, { "epoch": 1.9306635776931986, "grad_norm": 0.3454539477825165, "learning_rate": 3.3951396667444213e-06, "loss": 0.3831, "step": 3889 }, { "epoch": 1.9311600198576866, "grad_norm": 0.3471345603466034, "learning_rate": 3.3924037137462074e-06, "loss": 0.3649, "step": 3890 }, { "epoch": 1.9316564620221746, "grad_norm": 0.32870134711265564, "learning_rate": 3.389668297480702e-06, "loss": 0.351, "step": 3891 }, { "epoch": 1.9321529041866623, "grad_norm": 0.33915576338768005, "learning_rate": 3.3869334188611848e-06, "loss": 0.3397, "step": 3892 }, { "epoch": 1.93264934635115, "grad_norm": 0.36105504631996155, "learning_rate": 3.384199078800756e-06, "loss": 0.352, "step": 3893 }, { "epoch": 1.9331457885156378, "grad_norm": 0.33851614594459534, "learning_rate": 3.381465278212343e-06, "loss": 0.4029, "step": 3894 }, { "epoch": 1.9336422306801258, "grad_norm": 0.33034971356391907, "learning_rate": 3.3787320180086836e-06, "loss": 0.3819, "step": 3895 }, { "epoch": 1.9341386728446137, "grad_norm": 0.34774407744407654, "learning_rate": 3.375999299102338e-06, "loss": 0.3691, "step": 3896 }, { "epoch": 1.9346351150091015, "grad_norm": 0.322875440120697, "learning_rate": 3.373267122405691e-06, "loss": 0.3404, "step": 3897 }, { "epoch": 1.9351315571735892, "grad_norm": 0.3383539319038391, "learning_rate": 3.3705354888309395e-06, "loss": 0.3594, "step": 3898 }, { "epoch": 1.9356279993380772, "grad_norm": 0.3225177824497223, "learning_rate": 3.3678043992901e-06, "loss": 0.3141, "step": 3899 }, { "epoch": 1.936124441502565, "grad_norm": 0.33751460909843445, "learning_rate": 3.3650738546950117e-06, "loss": 0.3971, "step": 3900 }, { "epoch": 1.9366208836670529, "grad_norm": 0.35920479893684387, "learning_rate": 3.3623438559573284e-06, "loss": 0.3796, "step": 3901 }, { "epoch": 1.9371173258315406, "grad_norm": 0.34035876393318176, "learning_rate": 3.3596144039885237e-06, "loss": 0.4122, "step": 3902 }, { "epoch": 1.9376137679960284, "grad_norm": 0.3470124304294586, "learning_rate": 3.3568854996998864e-06, "loss": 0.3836, "step": 3903 }, { "epoch": 1.9381102101605163, "grad_norm": 0.3599734604358673, "learning_rate": 3.354157144002521e-06, "loss": 0.3462, "step": 3904 }, { "epoch": 1.9386066523250043, "grad_norm": 0.35624030232429504, "learning_rate": 3.351429337807356e-06, "loss": 0.3781, "step": 3905 }, { "epoch": 1.939103094489492, "grad_norm": 0.3119904100894928, "learning_rate": 3.3487020820251293e-06, "loss": 0.3302, "step": 3906 }, { "epoch": 1.9395995366539798, "grad_norm": 0.3112555742263794, "learning_rate": 3.3459753775663963e-06, "loss": 0.3663, "step": 3907 }, { "epoch": 1.9400959788184675, "grad_norm": 0.31231820583343506, "learning_rate": 3.343249225341531e-06, "loss": 0.3681, "step": 3908 }, { "epoch": 1.9405924209829555, "grad_norm": 0.3361033797264099, "learning_rate": 3.3405236262607214e-06, "loss": 0.3569, "step": 3909 }, { "epoch": 1.9410888631474434, "grad_norm": 0.3141835033893585, "learning_rate": 3.337798581233972e-06, "loss": 0.3227, "step": 3910 }, { "epoch": 1.9415853053119312, "grad_norm": 0.3160500228404999, "learning_rate": 3.3350740911710987e-06, "loss": 0.3841, "step": 3911 }, { "epoch": 1.942081747476419, "grad_norm": 0.3128555417060852, "learning_rate": 3.3323501569817375e-06, "loss": 0.3597, "step": 3912 }, { "epoch": 1.9425781896409067, "grad_norm": 0.34930509328842163, "learning_rate": 3.3296267795753345e-06, "loss": 0.3549, "step": 3913 }, { "epoch": 1.9430746318053946, "grad_norm": 0.34263354539871216, "learning_rate": 3.3269039598611525e-06, "loss": 0.3639, "step": 3914 }, { "epoch": 1.9435710739698826, "grad_norm": 0.3462592363357544, "learning_rate": 3.324181698748263e-06, "loss": 0.4162, "step": 3915 }, { "epoch": 1.9440675161343703, "grad_norm": 0.3267165720462799, "learning_rate": 3.3214599971455596e-06, "loss": 0.3567, "step": 3916 }, { "epoch": 1.944563958298858, "grad_norm": 0.3185153603553772, "learning_rate": 3.3187388559617438e-06, "loss": 0.3492, "step": 3917 }, { "epoch": 1.945060400463346, "grad_norm": 0.31983861327171326, "learning_rate": 3.3160182761053306e-06, "loss": 0.3495, "step": 3918 }, { "epoch": 1.945556842627834, "grad_norm": 0.33566582202911377, "learning_rate": 3.3132982584846442e-06, "loss": 0.3972, "step": 3919 }, { "epoch": 1.9460532847923218, "grad_norm": 0.32192692160606384, "learning_rate": 3.310578804007829e-06, "loss": 0.3352, "step": 3920 }, { "epoch": 1.9465497269568095, "grad_norm": 0.3864642083644867, "learning_rate": 3.307859913582836e-06, "loss": 0.3943, "step": 3921 }, { "epoch": 1.9470461691212972, "grad_norm": 0.35425305366516113, "learning_rate": 3.3051415881174263e-06, "loss": 0.3432, "step": 3922 }, { "epoch": 1.9475426112857852, "grad_norm": 0.328681617975235, "learning_rate": 3.3024238285191774e-06, "loss": 0.3402, "step": 3923 }, { "epoch": 1.9480390534502732, "grad_norm": 0.31587228178977966, "learning_rate": 3.299706635695474e-06, "loss": 0.3457, "step": 3924 }, { "epoch": 1.948535495614761, "grad_norm": 0.3387843668460846, "learning_rate": 3.2969900105535148e-06, "loss": 0.3711, "step": 3925 }, { "epoch": 1.9490319377792487, "grad_norm": 0.3326440155506134, "learning_rate": 3.2942739540003034e-06, "loss": 0.3696, "step": 3926 }, { "epoch": 1.9495283799437364, "grad_norm": 0.316671222448349, "learning_rate": 3.2915584669426624e-06, "loss": 0.3068, "step": 3927 }, { "epoch": 1.9500248221082244, "grad_norm": 0.3289339542388916, "learning_rate": 3.288843550287216e-06, "loss": 0.3644, "step": 3928 }, { "epoch": 1.9505212642727123, "grad_norm": 0.32375457882881165, "learning_rate": 3.2861292049404016e-06, "loss": 0.3821, "step": 3929 }, { "epoch": 1.9510177064372, "grad_norm": 0.32627376914024353, "learning_rate": 3.2834154318084632e-06, "loss": 0.3526, "step": 3930 }, { "epoch": 1.9515141486016878, "grad_norm": 0.3292132616043091, "learning_rate": 3.2807022317974594e-06, "loss": 0.363, "step": 3931 }, { "epoch": 1.9520105907661758, "grad_norm": 0.34721633791923523, "learning_rate": 3.277989605813252e-06, "loss": 0.3756, "step": 3932 }, { "epoch": 1.9525070329306637, "grad_norm": 0.3136274516582489, "learning_rate": 3.2752775547615147e-06, "loss": 0.3616, "step": 3933 }, { "epoch": 1.9530034750951515, "grad_norm": 0.34008124470710754, "learning_rate": 3.2725660795477242e-06, "loss": 0.3644, "step": 3934 }, { "epoch": 1.9534999172596392, "grad_norm": 0.3252725601196289, "learning_rate": 3.269855181077173e-06, "loss": 0.3784, "step": 3935 }, { "epoch": 1.953996359424127, "grad_norm": 0.33123594522476196, "learning_rate": 3.2671448602549537e-06, "loss": 0.3546, "step": 3936 }, { "epoch": 1.954492801588615, "grad_norm": 0.3089674711227417, "learning_rate": 3.2644351179859678e-06, "loss": 0.3472, "step": 3937 }, { "epoch": 1.954989243753103, "grad_norm": 0.3483961224555969, "learning_rate": 3.2617259551749283e-06, "loss": 0.3869, "step": 3938 }, { "epoch": 1.9554856859175906, "grad_norm": 0.3357577323913574, "learning_rate": 3.2590173727263464e-06, "loss": 0.3484, "step": 3939 }, { "epoch": 1.9559821280820784, "grad_norm": 0.3370338976383209, "learning_rate": 3.256309371544548e-06, "loss": 0.3346, "step": 3940 }, { "epoch": 1.9564785702465661, "grad_norm": 0.37585678696632385, "learning_rate": 3.253601952533658e-06, "loss": 0.3808, "step": 3941 }, { "epoch": 1.956975012411054, "grad_norm": 0.36407947540283203, "learning_rate": 3.2508951165976132e-06, "loss": 0.3659, "step": 3942 }, { "epoch": 1.957471454575542, "grad_norm": 0.343794047832489, "learning_rate": 3.2481888646401506e-06, "loss": 0.3982, "step": 3943 }, { "epoch": 1.9579678967400298, "grad_norm": 0.35056161880493164, "learning_rate": 3.2454831975648147e-06, "loss": 0.3682, "step": 3944 }, { "epoch": 1.9584643389045175, "grad_norm": 0.448250949382782, "learning_rate": 3.2427781162749527e-06, "loss": 0.382, "step": 3945 }, { "epoch": 1.9589607810690055, "grad_norm": 0.36135149002075195, "learning_rate": 3.2400736216737207e-06, "loss": 0.3627, "step": 3946 }, { "epoch": 1.9594572232334933, "grad_norm": 0.3822857141494751, "learning_rate": 3.2373697146640727e-06, "loss": 0.3822, "step": 3947 }, { "epoch": 1.9599536653979812, "grad_norm": 0.3452215790748596, "learning_rate": 3.2346663961487722e-06, "loss": 0.3762, "step": 3948 }, { "epoch": 1.960450107562469, "grad_norm": 0.3036128580570221, "learning_rate": 3.2319636670303815e-06, "loss": 0.3406, "step": 3949 }, { "epoch": 1.9609465497269567, "grad_norm": 0.3424365818500519, "learning_rate": 3.2292615282112715e-06, "loss": 0.3323, "step": 3950 }, { "epoch": 1.9614429918914447, "grad_norm": 0.370588093996048, "learning_rate": 3.226559980593612e-06, "loss": 0.3582, "step": 3951 }, { "epoch": 1.9619394340559326, "grad_norm": 0.3676590919494629, "learning_rate": 3.2238590250793734e-06, "loss": 0.3631, "step": 3952 }, { "epoch": 1.9624358762204204, "grad_norm": 0.35858631134033203, "learning_rate": 3.2211586625703343e-06, "loss": 0.3762, "step": 3953 }, { "epoch": 1.9629323183849081, "grad_norm": 0.3190658688545227, "learning_rate": 3.2184588939680727e-06, "loss": 0.367, "step": 3954 }, { "epoch": 1.9634287605493959, "grad_norm": 0.3178391456604004, "learning_rate": 3.2157597201739655e-06, "loss": 0.3499, "step": 3955 }, { "epoch": 1.9639252027138838, "grad_norm": 0.36438143253326416, "learning_rate": 3.2130611420891943e-06, "loss": 0.3991, "step": 3956 }, { "epoch": 1.9644216448783718, "grad_norm": 0.3204272389411926, "learning_rate": 3.210363160614742e-06, "loss": 0.3292, "step": 3957 }, { "epoch": 1.9649180870428595, "grad_norm": 0.32035380601882935, "learning_rate": 3.207665776651392e-06, "loss": 0.4014, "step": 3958 }, { "epoch": 1.9654145292073473, "grad_norm": 0.3021283745765686, "learning_rate": 3.2049689910997255e-06, "loss": 0.3355, "step": 3959 }, { "epoch": 1.9659109713718352, "grad_norm": 0.34910133481025696, "learning_rate": 3.202272804860125e-06, "loss": 0.3945, "step": 3960 }, { "epoch": 1.966407413536323, "grad_norm": 0.36013180017471313, "learning_rate": 3.1995772188327778e-06, "loss": 0.4109, "step": 3961 }, { "epoch": 1.966903855700811, "grad_norm": 0.3224453330039978, "learning_rate": 3.196882233917663e-06, "loss": 0.3436, "step": 3962 }, { "epoch": 1.9674002978652987, "grad_norm": 0.3411381244659424, "learning_rate": 3.194187851014565e-06, "loss": 0.3901, "step": 3963 }, { "epoch": 1.9678967400297864, "grad_norm": 0.3001749515533447, "learning_rate": 3.1914940710230622e-06, "loss": 0.2949, "step": 3964 }, { "epoch": 1.9683931821942744, "grad_norm": 0.3390909731388092, "learning_rate": 3.18880089484254e-06, "loss": 0.357, "step": 3965 }, { "epoch": 1.9688896243587624, "grad_norm": 0.31791332364082336, "learning_rate": 3.186108323372172e-06, "loss": 0.3169, "step": 3966 }, { "epoch": 1.96938606652325, "grad_norm": 0.3588135838508606, "learning_rate": 3.1834163575109343e-06, "loss": 0.3951, "step": 3967 }, { "epoch": 1.9698825086877378, "grad_norm": 0.3381841480731964, "learning_rate": 3.180724998157605e-06, "loss": 0.3561, "step": 3968 }, { "epoch": 1.9703789508522256, "grad_norm": 0.33757293224334717, "learning_rate": 3.1780342462107535e-06, "loss": 0.3648, "step": 3969 }, { "epoch": 1.9708753930167136, "grad_norm": 0.3313037157058716, "learning_rate": 3.1753441025687483e-06, "loss": 0.3655, "step": 3970 }, { "epoch": 1.9713718351812015, "grad_norm": 0.3273437023162842, "learning_rate": 3.172654568129755e-06, "loss": 0.3962, "step": 3971 }, { "epoch": 1.9718682773456893, "grad_norm": 0.3291959762573242, "learning_rate": 3.169965643791737e-06, "loss": 0.3746, "step": 3972 }, { "epoch": 1.972364719510177, "grad_norm": 0.3356064558029175, "learning_rate": 3.1672773304524552e-06, "loss": 0.4067, "step": 3973 }, { "epoch": 1.9728611616746647, "grad_norm": 0.3322198688983917, "learning_rate": 3.1645896290094615e-06, "loss": 0.3394, "step": 3974 }, { "epoch": 1.9733576038391527, "grad_norm": 0.3320203125476837, "learning_rate": 3.1619025403601043e-06, "loss": 0.3445, "step": 3975 }, { "epoch": 1.9738540460036407, "grad_norm": 0.359519898891449, "learning_rate": 3.1592160654015346e-06, "loss": 0.3909, "step": 3976 }, { "epoch": 1.9743504881681284, "grad_norm": 0.3276197910308838, "learning_rate": 3.1565302050306914e-06, "loss": 0.3673, "step": 3977 }, { "epoch": 1.9748469303326162, "grad_norm": 0.3480052649974823, "learning_rate": 3.1538449601443067e-06, "loss": 0.3771, "step": 3978 }, { "epoch": 1.9753433724971041, "grad_norm": 0.3432600796222687, "learning_rate": 3.151160331638917e-06, "loss": 0.3836, "step": 3979 }, { "epoch": 1.975839814661592, "grad_norm": 0.33464157581329346, "learning_rate": 3.1484763204108433e-06, "loss": 0.3663, "step": 3980 }, { "epoch": 1.9763362568260798, "grad_norm": 0.3192864954471588, "learning_rate": 3.1457929273562048e-06, "loss": 0.3779, "step": 3981 }, { "epoch": 1.9768326989905676, "grad_norm": 0.29775625467300415, "learning_rate": 3.143110153370912e-06, "loss": 0.3456, "step": 3982 }, { "epoch": 1.9773291411550553, "grad_norm": 0.34228140115737915, "learning_rate": 3.1404279993506726e-06, "loss": 0.4057, "step": 3983 }, { "epoch": 1.9778255833195433, "grad_norm": 0.32349544763565063, "learning_rate": 3.137746466190985e-06, "loss": 0.3398, "step": 3984 }, { "epoch": 1.9783220254840312, "grad_norm": 0.3155564069747925, "learning_rate": 3.1350655547871384e-06, "loss": 0.3298, "step": 3985 }, { "epoch": 1.978818467648519, "grad_norm": 0.3413666784763336, "learning_rate": 3.1323852660342146e-06, "loss": 0.3791, "step": 3986 }, { "epoch": 1.9793149098130067, "grad_norm": 0.34158146381378174, "learning_rate": 3.1297056008270932e-06, "loss": 0.3867, "step": 3987 }, { "epoch": 1.9798113519774945, "grad_norm": 0.3017021715641022, "learning_rate": 3.127026560060441e-06, "loss": 0.3436, "step": 3988 }, { "epoch": 1.9803077941419824, "grad_norm": 0.34371352195739746, "learning_rate": 3.124348144628715e-06, "loss": 0.3596, "step": 3989 }, { "epoch": 1.9808042363064704, "grad_norm": 0.39885443449020386, "learning_rate": 3.121670355426165e-06, "loss": 0.363, "step": 3990 }, { "epoch": 1.9813006784709581, "grad_norm": 0.3345320224761963, "learning_rate": 3.1189931933468345e-06, "loss": 0.3681, "step": 3991 }, { "epoch": 1.9817971206354459, "grad_norm": 0.34552252292633057, "learning_rate": 3.116316659284554e-06, "loss": 0.3738, "step": 3992 }, { "epoch": 1.9822935627999339, "grad_norm": 0.33231040835380554, "learning_rate": 3.1136407541329435e-06, "loss": 0.3304, "step": 3993 }, { "epoch": 1.9827900049644216, "grad_norm": 0.3103536069393158, "learning_rate": 3.1109654787854184e-06, "loss": 0.3737, "step": 3994 }, { "epoch": 1.9832864471289096, "grad_norm": 0.3505650460720062, "learning_rate": 3.108290834135178e-06, "loss": 0.3842, "step": 3995 }, { "epoch": 1.9837828892933973, "grad_norm": 0.33246004581451416, "learning_rate": 3.105616821075216e-06, "loss": 0.3634, "step": 3996 }, { "epoch": 1.984279331457885, "grad_norm": 0.3263545334339142, "learning_rate": 3.102943440498308e-06, "loss": 0.3588, "step": 3997 }, { "epoch": 1.984775773622373, "grad_norm": 0.4028182625770569, "learning_rate": 3.1002706932970283e-06, "loss": 0.3687, "step": 3998 }, { "epoch": 1.985272215786861, "grad_norm": 0.36543548107147217, "learning_rate": 3.097598580363732e-06, "loss": 0.3986, "step": 3999 }, { "epoch": 1.9857686579513487, "grad_norm": 0.3107627332210541, "learning_rate": 3.094927102590566e-06, "loss": 0.3171, "step": 4000 }, { "epoch": 1.9862651001158365, "grad_norm": 0.34178441762924194, "learning_rate": 3.0922562608694604e-06, "loss": 0.3546, "step": 4001 }, { "epoch": 1.9867615422803242, "grad_norm": 0.3328694701194763, "learning_rate": 3.089586056092143e-06, "loss": 0.3677, "step": 4002 }, { "epoch": 1.9872579844448122, "grad_norm": 0.33020225167274475, "learning_rate": 3.086916489150118e-06, "loss": 0.3822, "step": 4003 }, { "epoch": 1.9877544266093001, "grad_norm": 0.3256017565727234, "learning_rate": 3.0842475609346833e-06, "loss": 0.3269, "step": 4004 }, { "epoch": 1.9882508687737879, "grad_norm": 0.32324838638305664, "learning_rate": 3.081579272336919e-06, "loss": 0.3512, "step": 4005 }, { "epoch": 1.9887473109382756, "grad_norm": 0.3421298563480377, "learning_rate": 3.0789116242476967e-06, "loss": 0.3644, "step": 4006 }, { "epoch": 1.9892437531027636, "grad_norm": 0.34571826457977295, "learning_rate": 3.076244617557672e-06, "loss": 0.3678, "step": 4007 }, { "epoch": 1.9897401952672513, "grad_norm": 0.34229612350463867, "learning_rate": 3.073578253157282e-06, "loss": 0.3153, "step": 4008 }, { "epoch": 1.9902366374317393, "grad_norm": 0.3533531427383423, "learning_rate": 3.070912531936759e-06, "loss": 0.3841, "step": 4009 }, { "epoch": 1.990733079596227, "grad_norm": 0.340865820646286, "learning_rate": 3.06824745478611e-06, "loss": 0.342, "step": 4010 }, { "epoch": 1.9912295217607148, "grad_norm": 0.34379997849464417, "learning_rate": 3.0655830225951355e-06, "loss": 0.3793, "step": 4011 }, { "epoch": 1.9917259639252027, "grad_norm": 0.32952681183815, "learning_rate": 3.062919236253412e-06, "loss": 0.36, "step": 4012 }, { "epoch": 1.9922224060896907, "grad_norm": 0.3337259292602539, "learning_rate": 3.0602560966503114e-06, "loss": 0.3657, "step": 4013 }, { "epoch": 1.9927188482541784, "grad_norm": 0.34715747833251953, "learning_rate": 3.057593604674981e-06, "loss": 0.386, "step": 4014 }, { "epoch": 1.9932152904186662, "grad_norm": 0.3171840310096741, "learning_rate": 3.0549317612163543e-06, "loss": 0.3655, "step": 4015 }, { "epoch": 1.993711732583154, "grad_norm": 0.32444092631340027, "learning_rate": 3.052270567163146e-06, "loss": 0.3831, "step": 4016 }, { "epoch": 1.994208174747642, "grad_norm": 0.31277164816856384, "learning_rate": 3.0496100234038615e-06, "loss": 0.3344, "step": 4017 }, { "epoch": 1.9947046169121299, "grad_norm": 0.34416118264198303, "learning_rate": 3.0469501308267803e-06, "loss": 0.3831, "step": 4018 }, { "epoch": 1.9952010590766176, "grad_norm": 0.33301958441734314, "learning_rate": 3.0442908903199692e-06, "loss": 0.3658, "step": 4019 }, { "epoch": 1.9956975012411053, "grad_norm": 0.33303549885749817, "learning_rate": 3.0416323027712767e-06, "loss": 0.3716, "step": 4020 }, { "epoch": 1.996193943405593, "grad_norm": 0.32855910062789917, "learning_rate": 3.0389743690683337e-06, "loss": 0.3372, "step": 4021 }, { "epoch": 1.996690385570081, "grad_norm": 0.3395429253578186, "learning_rate": 3.036317090098552e-06, "loss": 0.3799, "step": 4022 }, { "epoch": 1.997186827734569, "grad_norm": 0.32478106021881104, "learning_rate": 3.033660466749121e-06, "loss": 0.3796, "step": 4023 }, { "epoch": 1.9976832698990568, "grad_norm": 0.31575343012809753, "learning_rate": 3.0310044999070204e-06, "loss": 0.3645, "step": 4024 }, { "epoch": 1.9981797120635445, "grad_norm": 0.34757480025291443, "learning_rate": 3.0283491904590027e-06, "loss": 0.3564, "step": 4025 }, { "epoch": 1.9986761542280325, "grad_norm": 0.35740286111831665, "learning_rate": 3.0256945392916033e-06, "loss": 0.392, "step": 4026 }, { "epoch": 1.9991725963925204, "grad_norm": 0.3444148302078247, "learning_rate": 3.0230405472911374e-06, "loss": 0.3701, "step": 4027 }, { "epoch": 1.9996690385570082, "grad_norm": 0.3161092698574066, "learning_rate": 3.020387215343704e-06, "loss": 0.3794, "step": 4028 }, { "epoch": 2.000165480721496, "grad_norm": 0.7998409271240234, "learning_rate": 3.017734544335176e-06, "loss": 0.6077, "step": 4029 }, { "epoch": 2.0006619228859837, "grad_norm": 0.35551610589027405, "learning_rate": 3.0150825351512094e-06, "loss": 0.3349, "step": 4030 }, { "epoch": 2.0011583650504714, "grad_norm": 0.3389442563056946, "learning_rate": 3.0124311886772352e-06, "loss": 0.3704, "step": 4031 }, { "epoch": 2.0016548072149596, "grad_norm": 0.33968982100486755, "learning_rate": 3.009780505798469e-06, "loss": 0.3473, "step": 4032 }, { "epoch": 2.0021512493794473, "grad_norm": 0.33364811539649963, "learning_rate": 3.007130487399901e-06, "loss": 0.3208, "step": 4033 }, { "epoch": 2.002647691543935, "grad_norm": 0.3727429211139679, "learning_rate": 3.0044811343662996e-06, "loss": 0.3507, "step": 4034 }, { "epoch": 2.003144133708423, "grad_norm": 0.34139284491539, "learning_rate": 3.0018324475822113e-06, "loss": 0.3079, "step": 4035 }, { "epoch": 2.003640575872911, "grad_norm": 0.3681914508342743, "learning_rate": 2.9991844279319636e-06, "loss": 0.3832, "step": 4036 }, { "epoch": 2.0041370180373987, "grad_norm": 0.33871975541114807, "learning_rate": 2.996537076299656e-06, "loss": 0.3175, "step": 4037 }, { "epoch": 2.0046334602018865, "grad_norm": 0.35869354009628296, "learning_rate": 2.9938903935691655e-06, "loss": 0.3109, "step": 4038 }, { "epoch": 2.0051299023663742, "grad_norm": 0.3394424319267273, "learning_rate": 2.991244380624152e-06, "loss": 0.297, "step": 4039 }, { "epoch": 2.005626344530862, "grad_norm": 0.3349080979824066, "learning_rate": 2.9885990383480447e-06, "loss": 0.3264, "step": 4040 }, { "epoch": 2.00612278669535, "grad_norm": 0.3496662378311157, "learning_rate": 2.98595436762405e-06, "loss": 0.3423, "step": 4041 }, { "epoch": 2.006619228859838, "grad_norm": 0.3232991695404053, "learning_rate": 2.9833103693351533e-06, "loss": 0.3467, "step": 4042 }, { "epoch": 2.0071156710243256, "grad_norm": 0.31984591484069824, "learning_rate": 2.980667044364114e-06, "loss": 0.3063, "step": 4043 }, { "epoch": 2.0076121131888134, "grad_norm": 0.33085891604423523, "learning_rate": 2.9780243935934673e-06, "loss": 0.3131, "step": 4044 }, { "epoch": 2.008108555353301, "grad_norm": 0.32170936465263367, "learning_rate": 2.9753824179055214e-06, "loss": 0.3037, "step": 4045 }, { "epoch": 2.0086049975177893, "grad_norm": 0.3722064197063446, "learning_rate": 2.972741118182358e-06, "loss": 0.3632, "step": 4046 }, { "epoch": 2.009101439682277, "grad_norm": 0.3124769926071167, "learning_rate": 2.970100495305839e-06, "loss": 0.2783, "step": 4047 }, { "epoch": 2.009597881846765, "grad_norm": 0.3503338098526001, "learning_rate": 2.9674605501575954e-06, "loss": 0.3865, "step": 4048 }, { "epoch": 2.0100943240112525, "grad_norm": 0.35108089447021484, "learning_rate": 2.9648212836190305e-06, "loss": 0.3518, "step": 4049 }, { "epoch": 2.0105907661757407, "grad_norm": 0.3119352459907532, "learning_rate": 2.9621826965713285e-06, "loss": 0.328, "step": 4050 }, { "epoch": 2.0110872083402285, "grad_norm": 0.31104084849357605, "learning_rate": 2.959544789895438e-06, "loss": 0.3261, "step": 4051 }, { "epoch": 2.011583650504716, "grad_norm": 0.32747122645378113, "learning_rate": 2.956907564472086e-06, "loss": 0.3363, "step": 4052 }, { "epoch": 2.012080092669204, "grad_norm": 0.34039393067359924, "learning_rate": 2.9542710211817687e-06, "loss": 0.3216, "step": 4053 }, { "epoch": 2.0125765348336917, "grad_norm": 0.3722722828388214, "learning_rate": 2.95163516090476e-06, "loss": 0.3242, "step": 4054 }, { "epoch": 2.01307297699818, "grad_norm": 0.3343096971511841, "learning_rate": 2.948999984521099e-06, "loss": 0.3288, "step": 4055 }, { "epoch": 2.0135694191626676, "grad_norm": 0.3104841113090515, "learning_rate": 2.946365492910599e-06, "loss": 0.3587, "step": 4056 }, { "epoch": 2.0140658613271554, "grad_norm": 0.33842915296554565, "learning_rate": 2.9437316869528467e-06, "loss": 0.3767, "step": 4057 }, { "epoch": 2.014562303491643, "grad_norm": 0.337108314037323, "learning_rate": 2.9410985675271968e-06, "loss": 0.3358, "step": 4058 }, { "epoch": 2.015058745656131, "grad_norm": 0.3450813293457031, "learning_rate": 2.9384661355127798e-06, "loss": 0.3278, "step": 4059 }, { "epoch": 2.015555187820619, "grad_norm": 0.33087921142578125, "learning_rate": 2.935834391788488e-06, "loss": 0.3552, "step": 4060 }, { "epoch": 2.016051629985107, "grad_norm": 0.34593430161476135, "learning_rate": 2.9332033372329936e-06, "loss": 0.3279, "step": 4061 }, { "epoch": 2.0165480721495945, "grad_norm": 0.3339502215385437, "learning_rate": 2.930572972724733e-06, "loss": 0.3115, "step": 4062 }, { "epoch": 2.0170445143140823, "grad_norm": 0.309440940618515, "learning_rate": 2.927943299141912e-06, "loss": 0.2998, "step": 4063 }, { "epoch": 2.0175409564785705, "grad_norm": 0.3454245626926422, "learning_rate": 2.9253143173625076e-06, "loss": 0.3679, "step": 4064 }, { "epoch": 2.018037398643058, "grad_norm": 0.30715906620025635, "learning_rate": 2.9226860282642668e-06, "loss": 0.3222, "step": 4065 }, { "epoch": 2.018533840807546, "grad_norm": 0.32275646924972534, "learning_rate": 2.9200584327247017e-06, "loss": 0.3003, "step": 4066 }, { "epoch": 2.0190302829720337, "grad_norm": 0.3332006633281708, "learning_rate": 2.9174315316210987e-06, "loss": 0.3544, "step": 4067 }, { "epoch": 2.0195267251365214, "grad_norm": 0.35066646337509155, "learning_rate": 2.914805325830502e-06, "loss": 0.3423, "step": 4068 }, { "epoch": 2.0200231673010096, "grad_norm": 0.3646138310432434, "learning_rate": 2.912179816229739e-06, "loss": 0.373, "step": 4069 }, { "epoch": 2.0205196094654974, "grad_norm": 0.33475422859191895, "learning_rate": 2.909555003695389e-06, "loss": 0.3243, "step": 4070 }, { "epoch": 2.021016051629985, "grad_norm": 0.3611743748188019, "learning_rate": 2.9069308891038083e-06, "loss": 0.3928, "step": 4071 }, { "epoch": 2.021512493794473, "grad_norm": 0.31803974509239197, "learning_rate": 2.9043074733311172e-06, "loss": 0.3074, "step": 4072 }, { "epoch": 2.0220089359589606, "grad_norm": 0.3755503296852112, "learning_rate": 2.901684757253203e-06, "loss": 0.3683, "step": 4073 }, { "epoch": 2.0225053781234488, "grad_norm": 0.3183102607727051, "learning_rate": 2.8990627417457216e-06, "loss": 0.3244, "step": 4074 }, { "epoch": 2.0230018202879365, "grad_norm": 0.3263412415981293, "learning_rate": 2.8964414276840858e-06, "loss": 0.3303, "step": 4075 }, { "epoch": 2.0234982624524243, "grad_norm": 0.32701340317726135, "learning_rate": 2.8938208159434905e-06, "loss": 0.3032, "step": 4076 }, { "epoch": 2.023994704616912, "grad_norm": 0.37712275981903076, "learning_rate": 2.8912009073988796e-06, "loss": 0.3264, "step": 4077 }, { "epoch": 2.0244911467814, "grad_norm": 0.33080631494522095, "learning_rate": 2.888581702924972e-06, "loss": 0.3416, "step": 4078 }, { "epoch": 2.024987588945888, "grad_norm": 0.34869059920310974, "learning_rate": 2.885963203396248e-06, "loss": 0.3995, "step": 4079 }, { "epoch": 2.0254840311103757, "grad_norm": 0.3213314414024353, "learning_rate": 2.8833454096869546e-06, "loss": 0.2998, "step": 4080 }, { "epoch": 2.0259804732748634, "grad_norm": 0.3274889290332794, "learning_rate": 2.8807283226711036e-06, "loss": 0.3225, "step": 4081 }, { "epoch": 2.026476915439351, "grad_norm": 0.3184937536716461, "learning_rate": 2.8781119432224646e-06, "loss": 0.3071, "step": 4082 }, { "epoch": 2.0269733576038393, "grad_norm": 0.3143932819366455, "learning_rate": 2.875496272214578e-06, "loss": 0.3572, "step": 4083 }, { "epoch": 2.027469799768327, "grad_norm": 0.335145503282547, "learning_rate": 2.8728813105207455e-06, "loss": 0.3496, "step": 4084 }, { "epoch": 2.027966241932815, "grad_norm": 0.3325883150100708, "learning_rate": 2.8702670590140314e-06, "loss": 0.3578, "step": 4085 }, { "epoch": 2.0284626840973026, "grad_norm": 0.32617226243019104, "learning_rate": 2.867653518567265e-06, "loss": 0.335, "step": 4086 }, { "epoch": 2.0289591262617903, "grad_norm": 0.3356643617153168, "learning_rate": 2.8650406900530316e-06, "loss": 0.3385, "step": 4087 }, { "epoch": 2.0294555684262785, "grad_norm": 0.3309609293937683, "learning_rate": 2.8624285743436904e-06, "loss": 0.2779, "step": 4088 }, { "epoch": 2.0299520105907662, "grad_norm": 0.32623475790023804, "learning_rate": 2.85981717231135e-06, "loss": 0.3063, "step": 4089 }, { "epoch": 2.030448452755254, "grad_norm": 0.3529566526412964, "learning_rate": 2.857206484827889e-06, "loss": 0.354, "step": 4090 }, { "epoch": 2.0309448949197417, "grad_norm": 0.3442811667919159, "learning_rate": 2.8545965127649455e-06, "loss": 0.3445, "step": 4091 }, { "epoch": 2.0314413370842295, "grad_norm": 0.3325525224208832, "learning_rate": 2.851987256993919e-06, "loss": 0.2937, "step": 4092 }, { "epoch": 2.0319377792487177, "grad_norm": 0.3648795485496521, "learning_rate": 2.8493787183859657e-06, "loss": 0.3181, "step": 4093 }, { "epoch": 2.0324342214132054, "grad_norm": 0.3441438674926758, "learning_rate": 2.8467708978120075e-06, "loss": 0.2965, "step": 4094 }, { "epoch": 2.032930663577693, "grad_norm": 0.3527580797672272, "learning_rate": 2.844163796142725e-06, "loss": 0.3564, "step": 4095 }, { "epoch": 2.033427105742181, "grad_norm": 0.3504357933998108, "learning_rate": 2.8415574142485588e-06, "loss": 0.3301, "step": 4096 }, { "epoch": 2.033923547906669, "grad_norm": 0.32672417163848877, "learning_rate": 2.83895175299971e-06, "loss": 0.3305, "step": 4097 }, { "epoch": 2.034419990071157, "grad_norm": 0.3586893081665039, "learning_rate": 2.836346813266134e-06, "loss": 0.371, "step": 4098 }, { "epoch": 2.0349164322356446, "grad_norm": 0.32108670473098755, "learning_rate": 2.8337425959175558e-06, "loss": 0.31, "step": 4099 }, { "epoch": 2.0354128744001323, "grad_norm": 0.3236572742462158, "learning_rate": 2.831139101823447e-06, "loss": 0.3027, "step": 4100 }, { "epoch": 2.03590931656462, "grad_norm": 0.32809174060821533, "learning_rate": 2.8285363318530455e-06, "loss": 0.3385, "step": 4101 }, { "epoch": 2.0364057587291082, "grad_norm": 0.2994098365306854, "learning_rate": 2.825934286875346e-06, "loss": 0.2998, "step": 4102 }, { "epoch": 2.036902200893596, "grad_norm": 0.3301042318344116, "learning_rate": 2.8233329677591003e-06, "loss": 0.3042, "step": 4103 }, { "epoch": 2.0373986430580837, "grad_norm": 0.3402593433856964, "learning_rate": 2.8207323753728205e-06, "loss": 0.3316, "step": 4104 }, { "epoch": 2.0378950852225715, "grad_norm": 0.33612504601478577, "learning_rate": 2.8181325105847667e-06, "loss": 0.3728, "step": 4105 }, { "epoch": 2.038391527387059, "grad_norm": 0.32073941826820374, "learning_rate": 2.815533374262972e-06, "loss": 0.32, "step": 4106 }, { "epoch": 2.0388879695515474, "grad_norm": 0.32750821113586426, "learning_rate": 2.8129349672752117e-06, "loss": 0.3371, "step": 4107 }, { "epoch": 2.039384411716035, "grad_norm": 0.3205814063549042, "learning_rate": 2.8103372904890234e-06, "loss": 0.3088, "step": 4108 }, { "epoch": 2.039880853880523, "grad_norm": 0.33860844373703003, "learning_rate": 2.8077403447717034e-06, "loss": 0.3493, "step": 4109 }, { "epoch": 2.0403772960450106, "grad_norm": 0.3491120934486389, "learning_rate": 2.8051441309902995e-06, "loss": 0.3081, "step": 4110 }, { "epoch": 2.040873738209499, "grad_norm": 0.342195063829422, "learning_rate": 2.802548650011619e-06, "loss": 0.3932, "step": 4111 }, { "epoch": 2.0413701803739865, "grad_norm": 0.2908857464790344, "learning_rate": 2.7999539027022193e-06, "loss": 0.2977, "step": 4112 }, { "epoch": 2.0418666225384743, "grad_norm": 0.3272695541381836, "learning_rate": 2.7973598899284173e-06, "loss": 0.3271, "step": 4113 }, { "epoch": 2.042363064702962, "grad_norm": 0.37212592363357544, "learning_rate": 2.7947666125562833e-06, "loss": 0.3879, "step": 4114 }, { "epoch": 2.0428595068674498, "grad_norm": 0.3146877586841583, "learning_rate": 2.7921740714516454e-06, "loss": 0.3621, "step": 4115 }, { "epoch": 2.043355949031938, "grad_norm": 0.3154434859752655, "learning_rate": 2.789582267480075e-06, "loss": 0.2971, "step": 4116 }, { "epoch": 2.0438523911964257, "grad_norm": 0.34901344776153564, "learning_rate": 2.7869912015069136e-06, "loss": 0.3592, "step": 4117 }, { "epoch": 2.0443488333609134, "grad_norm": 0.3320936858654022, "learning_rate": 2.784400874397242e-06, "loss": 0.3099, "step": 4118 }, { "epoch": 2.044845275525401, "grad_norm": 0.31272372603416443, "learning_rate": 2.781811287015902e-06, "loss": 0.33, "step": 4119 }, { "epoch": 2.045341717689889, "grad_norm": 0.3499627709388733, "learning_rate": 2.779222440227486e-06, "loss": 0.3435, "step": 4120 }, { "epoch": 2.045838159854377, "grad_norm": 0.33836060762405396, "learning_rate": 2.7766343348963392e-06, "loss": 0.3284, "step": 4121 }, { "epoch": 2.046334602018865, "grad_norm": 0.3180113732814789, "learning_rate": 2.7740469718865626e-06, "loss": 0.3202, "step": 4122 }, { "epoch": 2.0468310441833526, "grad_norm": 0.32802122831344604, "learning_rate": 2.7714603520620026e-06, "loss": 0.3253, "step": 4123 }, { "epoch": 2.0473274863478403, "grad_norm": 0.3185507357120514, "learning_rate": 2.7688744762862624e-06, "loss": 0.3172, "step": 4124 }, { "epoch": 2.0478239285123285, "grad_norm": 0.34512490034103394, "learning_rate": 2.7662893454226956e-06, "loss": 0.3442, "step": 4125 }, { "epoch": 2.0483203706768163, "grad_norm": 0.34242895245552063, "learning_rate": 2.763704960334408e-06, "loss": 0.3898, "step": 4126 }, { "epoch": 2.048816812841304, "grad_norm": 0.32829779386520386, "learning_rate": 2.761121321884257e-06, "loss": 0.3416, "step": 4127 }, { "epoch": 2.0493132550057918, "grad_norm": 0.31076332926750183, "learning_rate": 2.758538430934843e-06, "loss": 0.3473, "step": 4128 }, { "epoch": 2.0498096971702795, "grad_norm": 0.3235422372817993, "learning_rate": 2.7559562883485314e-06, "loss": 0.3266, "step": 4129 }, { "epoch": 2.0503061393347677, "grad_norm": 0.3378644287586212, "learning_rate": 2.7533748949874227e-06, "loss": 0.3344, "step": 4130 }, { "epoch": 2.0508025814992554, "grad_norm": 0.3538166284561157, "learning_rate": 2.750794251713378e-06, "loss": 0.4016, "step": 4131 }, { "epoch": 2.051299023663743, "grad_norm": 0.3213611841201782, "learning_rate": 2.7482143593880015e-06, "loss": 0.3083, "step": 4132 }, { "epoch": 2.051795465828231, "grad_norm": 0.3382694721221924, "learning_rate": 2.745635218872651e-06, "loss": 0.337, "step": 4133 }, { "epoch": 2.0522919079927187, "grad_norm": 0.34180930256843567, "learning_rate": 2.743056831028432e-06, "loss": 0.3598, "step": 4134 }, { "epoch": 2.052788350157207, "grad_norm": 0.31593868136405945, "learning_rate": 2.7404791967161937e-06, "loss": 0.3288, "step": 4135 }, { "epoch": 2.0532847923216946, "grad_norm": 0.32307255268096924, "learning_rate": 2.7379023167965447e-06, "loss": 0.3561, "step": 4136 }, { "epoch": 2.0537812344861823, "grad_norm": 0.36577051877975464, "learning_rate": 2.7353261921298303e-06, "loss": 0.3622, "step": 4137 }, { "epoch": 2.05427767665067, "grad_norm": 0.31538304686546326, "learning_rate": 2.7327508235761513e-06, "loss": 0.3224, "step": 4138 }, { "epoch": 2.054774118815158, "grad_norm": 0.32516780495643616, "learning_rate": 2.730176211995348e-06, "loss": 0.326, "step": 4139 }, { "epoch": 2.055270560979646, "grad_norm": 0.3362060785293579, "learning_rate": 2.7276023582470213e-06, "loss": 0.3092, "step": 4140 }, { "epoch": 2.0557670031441337, "grad_norm": 0.3539848029613495, "learning_rate": 2.725029263190504e-06, "loss": 0.3579, "step": 4141 }, { "epoch": 2.0562634453086215, "grad_norm": 0.3119448721408844, "learning_rate": 2.7224569276848866e-06, "loss": 0.3287, "step": 4142 }, { "epoch": 2.0567598874731092, "grad_norm": 0.32016104459762573, "learning_rate": 2.7198853525890003e-06, "loss": 0.3182, "step": 4143 }, { "epoch": 2.0572563296375974, "grad_norm": 0.3305678069591522, "learning_rate": 2.717314538761425e-06, "loss": 0.3285, "step": 4144 }, { "epoch": 2.057752771802085, "grad_norm": 0.30706289410591125, "learning_rate": 2.7147444870604868e-06, "loss": 0.3178, "step": 4145 }, { "epoch": 2.058249213966573, "grad_norm": 0.3476121723651886, "learning_rate": 2.712175198344251e-06, "loss": 0.3514, "step": 4146 }, { "epoch": 2.0587456561310606, "grad_norm": 0.3446749150753021, "learning_rate": 2.7096066734705406e-06, "loss": 0.3245, "step": 4147 }, { "epoch": 2.0592420982955484, "grad_norm": 0.33194637298583984, "learning_rate": 2.70703891329691e-06, "loss": 0.314, "step": 4148 }, { "epoch": 2.0597385404600366, "grad_norm": 0.3380625545978546, "learning_rate": 2.7044719186806677e-06, "loss": 0.3918, "step": 4149 }, { "epoch": 2.0602349826245243, "grad_norm": 0.32450079917907715, "learning_rate": 2.7019056904788625e-06, "loss": 0.3365, "step": 4150 }, { "epoch": 2.060731424789012, "grad_norm": 0.35374101996421814, "learning_rate": 2.6993402295482885e-06, "loss": 0.3593, "step": 4151 }, { "epoch": 2.0612278669535, "grad_norm": 0.3349902331829071, "learning_rate": 2.6967755367454855e-06, "loss": 0.2941, "step": 4152 }, { "epoch": 2.0617243091179875, "grad_norm": 0.3383682072162628, "learning_rate": 2.694211612926731e-06, "loss": 0.3802, "step": 4153 }, { "epoch": 2.0622207512824757, "grad_norm": 0.36281928420066833, "learning_rate": 2.6916484589480505e-06, "loss": 0.3488, "step": 4154 }, { "epoch": 2.0627171934469635, "grad_norm": 0.32160326838493347, "learning_rate": 2.6890860756652125e-06, "loss": 0.32, "step": 4155 }, { "epoch": 2.063213635611451, "grad_norm": 0.3211008906364441, "learning_rate": 2.6865244639337263e-06, "loss": 0.3213, "step": 4156 }, { "epoch": 2.063710077775939, "grad_norm": 0.31872791051864624, "learning_rate": 2.6839636246088446e-06, "loss": 0.3213, "step": 4157 }, { "epoch": 2.064206519940427, "grad_norm": 0.331788033246994, "learning_rate": 2.6814035585455628e-06, "loss": 0.3526, "step": 4158 }, { "epoch": 2.064702962104915, "grad_norm": 0.30615705251693726, "learning_rate": 2.6788442665986184e-06, "loss": 0.3632, "step": 4159 }, { "epoch": 2.0651994042694026, "grad_norm": 0.3252916932106018, "learning_rate": 2.6762857496224858e-06, "loss": 0.3508, "step": 4160 }, { "epoch": 2.0656958464338904, "grad_norm": 0.3285047709941864, "learning_rate": 2.673728008471387e-06, "loss": 0.315, "step": 4161 }, { "epoch": 2.066192288598378, "grad_norm": 0.32260340452194214, "learning_rate": 2.6711710439992812e-06, "loss": 0.3416, "step": 4162 }, { "epoch": 2.0666887307628663, "grad_norm": 0.3333630859851837, "learning_rate": 2.668614857059872e-06, "loss": 0.3641, "step": 4163 }, { "epoch": 2.067185172927354, "grad_norm": 0.30349215865135193, "learning_rate": 2.666059448506596e-06, "loss": 0.3216, "step": 4164 }, { "epoch": 2.067681615091842, "grad_norm": 0.3401314616203308, "learning_rate": 2.6635048191926375e-06, "loss": 0.3645, "step": 4165 }, { "epoch": 2.0681780572563295, "grad_norm": 0.3296138644218445, "learning_rate": 2.6609509699709174e-06, "loss": 0.3523, "step": 4166 }, { "epoch": 2.0686744994208173, "grad_norm": 0.32242271304130554, "learning_rate": 2.6583979016940962e-06, "loss": 0.3408, "step": 4167 }, { "epoch": 2.0691709415853055, "grad_norm": 0.3517031967639923, "learning_rate": 2.655845615214577e-06, "loss": 0.3126, "step": 4168 }, { "epoch": 2.069667383749793, "grad_norm": 0.3274412155151367, "learning_rate": 2.6532941113844924e-06, "loss": 0.331, "step": 4169 }, { "epoch": 2.070163825914281, "grad_norm": 0.3344128727912903, "learning_rate": 2.650743391055728e-06, "loss": 0.3152, "step": 4170 }, { "epoch": 2.0706602680787687, "grad_norm": 0.2953287959098816, "learning_rate": 2.648193455079894e-06, "loss": 0.285, "step": 4171 }, { "epoch": 2.071156710243257, "grad_norm": 0.3492104113101959, "learning_rate": 2.6456443043083457e-06, "loss": 0.3647, "step": 4172 }, { "epoch": 2.0716531524077446, "grad_norm": 0.3333228528499603, "learning_rate": 2.643095939592177e-06, "loss": 0.3277, "step": 4173 }, { "epoch": 2.0721495945722324, "grad_norm": 0.32313281297683716, "learning_rate": 2.640548361782218e-06, "loss": 0.3124, "step": 4174 }, { "epoch": 2.07264603673672, "grad_norm": 0.3300900161266327, "learning_rate": 2.6380015717290356e-06, "loss": 0.3181, "step": 4175 }, { "epoch": 2.073142478901208, "grad_norm": 0.3208659589290619, "learning_rate": 2.6354555702829293e-06, "loss": 0.3101, "step": 4176 }, { "epoch": 2.073638921065696, "grad_norm": 0.34852299094200134, "learning_rate": 2.6329103582939474e-06, "loss": 0.3503, "step": 4177 }, { "epoch": 2.0741353632301838, "grad_norm": 0.34458646178245544, "learning_rate": 2.6303659366118605e-06, "loss": 0.3377, "step": 4178 }, { "epoch": 2.0746318053946715, "grad_norm": 0.32508882880210876, "learning_rate": 2.6278223060861846e-06, "loss": 0.2995, "step": 4179 }, { "epoch": 2.0751282475591593, "grad_norm": 0.3312140703201294, "learning_rate": 2.6252794675661685e-06, "loss": 0.3465, "step": 4180 }, { "epoch": 2.075624689723647, "grad_norm": 0.29558441042900085, "learning_rate": 2.6227374219007963e-06, "loss": 0.2997, "step": 4181 }, { "epoch": 2.076121131888135, "grad_norm": 0.31566962599754333, "learning_rate": 2.620196169938791e-06, "loss": 0.2992, "step": 4182 }, { "epoch": 2.076617574052623, "grad_norm": 0.3513790965080261, "learning_rate": 2.617655712528603e-06, "loss": 0.371, "step": 4183 }, { "epoch": 2.0771140162171107, "grad_norm": 0.31306713819503784, "learning_rate": 2.615116050518424e-06, "loss": 0.3617, "step": 4184 }, { "epoch": 2.0776104583815984, "grad_norm": 0.3181251287460327, "learning_rate": 2.6125771847561785e-06, "loss": 0.3652, "step": 4185 }, { "epoch": 2.0781069005460866, "grad_norm": 0.3102791905403137, "learning_rate": 2.610039116089526e-06, "loss": 0.2904, "step": 4186 }, { "epoch": 2.0786033427105743, "grad_norm": 0.3509639501571655, "learning_rate": 2.607501845365853e-06, "loss": 0.3516, "step": 4187 }, { "epoch": 2.079099784875062, "grad_norm": 0.3182564079761505, "learning_rate": 2.604965373432294e-06, "loss": 0.2717, "step": 4188 }, { "epoch": 2.07959622703955, "grad_norm": 0.34692344069480896, "learning_rate": 2.602429701135701e-06, "loss": 0.3653, "step": 4189 }, { "epoch": 2.0800926692040376, "grad_norm": 0.30102577805519104, "learning_rate": 2.5998948293226684e-06, "loss": 0.2971, "step": 4190 }, { "epoch": 2.0805891113685258, "grad_norm": 0.3200232982635498, "learning_rate": 2.597360758839521e-06, "loss": 0.3367, "step": 4191 }, { "epoch": 2.0810855535330135, "grad_norm": 0.34963253140449524, "learning_rate": 2.5948274905323163e-06, "loss": 0.3099, "step": 4192 }, { "epoch": 2.0815819956975012, "grad_norm": 0.31735721230506897, "learning_rate": 2.5922950252468455e-06, "loss": 0.3365, "step": 4193 }, { "epoch": 2.082078437861989, "grad_norm": 0.3460042178630829, "learning_rate": 2.5897633638286256e-06, "loss": 0.3057, "step": 4194 }, { "epoch": 2.0825748800264767, "grad_norm": 0.3375941514968872, "learning_rate": 2.587232507122912e-06, "loss": 0.3562, "step": 4195 }, { "epoch": 2.083071322190965, "grad_norm": 0.3238248825073242, "learning_rate": 2.584702455974689e-06, "loss": 0.3121, "step": 4196 }, { "epoch": 2.0835677643554527, "grad_norm": 0.34387707710266113, "learning_rate": 2.5821732112286726e-06, "loss": 0.3593, "step": 4197 }, { "epoch": 2.0840642065199404, "grad_norm": 0.3170264959335327, "learning_rate": 2.579644773729307e-06, "loss": 0.335, "step": 4198 }, { "epoch": 2.084560648684428, "grad_norm": 0.33634331822395325, "learning_rate": 2.5771171443207703e-06, "loss": 0.3609, "step": 4199 }, { "epoch": 2.085057090848916, "grad_norm": 0.3378826677799225, "learning_rate": 2.574590323846971e-06, "loss": 0.3274, "step": 4200 }, { "epoch": 2.085553533013404, "grad_norm": 0.3522129952907562, "learning_rate": 2.572064313151541e-06, "loss": 0.3639, "step": 4201 }, { "epoch": 2.086049975177892, "grad_norm": 0.33041074872016907, "learning_rate": 2.5695391130778504e-06, "loss": 0.3165, "step": 4202 }, { "epoch": 2.0865464173423796, "grad_norm": 0.328207403421402, "learning_rate": 2.5670147244689926e-06, "loss": 0.3036, "step": 4203 }, { "epoch": 2.0870428595068673, "grad_norm": 0.3548833727836609, "learning_rate": 2.5644911481677937e-06, "loss": 0.3599, "step": 4204 }, { "epoch": 2.0875393016713555, "grad_norm": 0.33536049723625183, "learning_rate": 2.5619683850168087e-06, "loss": 0.3176, "step": 4205 }, { "epoch": 2.0880357438358432, "grad_norm": 0.31917068362236023, "learning_rate": 2.5594464358583137e-06, "loss": 0.3826, "step": 4206 }, { "epoch": 2.088532186000331, "grad_norm": 0.3333820104598999, "learning_rate": 2.5569253015343277e-06, "loss": 0.3332, "step": 4207 }, { "epoch": 2.0890286281648187, "grad_norm": 0.2988761067390442, "learning_rate": 2.5544049828865823e-06, "loss": 0.3123, "step": 4208 }, { "epoch": 2.0895250703293065, "grad_norm": 0.34934985637664795, "learning_rate": 2.5518854807565473e-06, "loss": 0.3294, "step": 4209 }, { "epoch": 2.0900215124937946, "grad_norm": 0.3349267840385437, "learning_rate": 2.5493667959854106e-06, "loss": 0.2893, "step": 4210 }, { "epoch": 2.0905179546582824, "grad_norm": 0.33514276146888733, "learning_rate": 2.5468489294141003e-06, "loss": 0.3841, "step": 4211 }, { "epoch": 2.09101439682277, "grad_norm": 0.30857956409454346, "learning_rate": 2.5443318818832574e-06, "loss": 0.3228, "step": 4212 }, { "epoch": 2.091510838987258, "grad_norm": 0.3120240569114685, "learning_rate": 2.5418156542332557e-06, "loss": 0.3239, "step": 4213 }, { "epoch": 2.0920072811517456, "grad_norm": 0.3126356303691864, "learning_rate": 2.539300247304202e-06, "loss": 0.3228, "step": 4214 }, { "epoch": 2.092503723316234, "grad_norm": 0.3168451488018036, "learning_rate": 2.536785661935914e-06, "loss": 0.3356, "step": 4215 }, { "epoch": 2.0930001654807215, "grad_norm": 0.33429408073425293, "learning_rate": 2.53427189896795e-06, "loss": 0.3237, "step": 4216 }, { "epoch": 2.0934966076452093, "grad_norm": 0.35291051864624023, "learning_rate": 2.5317589592395802e-06, "loss": 0.3224, "step": 4217 }, { "epoch": 2.093993049809697, "grad_norm": 0.3460938334465027, "learning_rate": 2.5292468435898145e-06, "loss": 0.3295, "step": 4218 }, { "epoch": 2.094489491974185, "grad_norm": 0.33039286732673645, "learning_rate": 2.5267355528573745e-06, "loss": 0.3341, "step": 4219 }, { "epoch": 2.094985934138673, "grad_norm": 0.3232404589653015, "learning_rate": 2.524225087880714e-06, "loss": 0.3449, "step": 4220 }, { "epoch": 2.0954823763031607, "grad_norm": 0.29227015376091003, "learning_rate": 2.5217154494980087e-06, "loss": 0.2912, "step": 4221 }, { "epoch": 2.0959788184676484, "grad_norm": 0.3440796136856079, "learning_rate": 2.5192066385471592e-06, "loss": 0.3554, "step": 4222 }, { "epoch": 2.096475260632136, "grad_norm": 0.3215096592903137, "learning_rate": 2.5166986558657904e-06, "loss": 0.336, "step": 4223 }, { "epoch": 2.0969717027966244, "grad_norm": 0.34793710708618164, "learning_rate": 2.5141915022912454e-06, "loss": 0.3704, "step": 4224 }, { "epoch": 2.097468144961112, "grad_norm": 0.3094974160194397, "learning_rate": 2.5116851786605983e-06, "loss": 0.3185, "step": 4225 }, { "epoch": 2.0979645871256, "grad_norm": 0.32783839106559753, "learning_rate": 2.509179685810641e-06, "loss": 0.334, "step": 4226 }, { "epoch": 2.0984610292900876, "grad_norm": 0.35505247116088867, "learning_rate": 2.5066750245778905e-06, "loss": 0.3393, "step": 4227 }, { "epoch": 2.0989574714545753, "grad_norm": 0.3838469982147217, "learning_rate": 2.504171195798584e-06, "loss": 0.3656, "step": 4228 }, { "epoch": 2.0994539136190635, "grad_norm": 0.32875052094459534, "learning_rate": 2.5016682003086812e-06, "loss": 0.3352, "step": 4229 }, { "epoch": 2.0999503557835513, "grad_norm": 0.3213410973548889, "learning_rate": 2.4991660389438687e-06, "loss": 0.3013, "step": 4230 }, { "epoch": 2.100446797948039, "grad_norm": 0.3114587366580963, "learning_rate": 2.496664712539545e-06, "loss": 0.3144, "step": 4231 }, { "epoch": 2.1009432401125268, "grad_norm": 0.3564547598361969, "learning_rate": 2.494164221930836e-06, "loss": 0.3312, "step": 4232 }, { "epoch": 2.101439682277015, "grad_norm": 0.3453371226787567, "learning_rate": 2.491664567952589e-06, "loss": 0.3892, "step": 4233 }, { "epoch": 2.1019361244415027, "grad_norm": 0.3364989459514618, "learning_rate": 2.489165751439372e-06, "loss": 0.3355, "step": 4234 }, { "epoch": 2.1024325666059904, "grad_norm": 0.33580732345581055, "learning_rate": 2.486667773225468e-06, "loss": 0.3217, "step": 4235 }, { "epoch": 2.102929008770478, "grad_norm": 0.3544289171695709, "learning_rate": 2.484170634144884e-06, "loss": 0.3434, "step": 4236 }, { "epoch": 2.103425450934966, "grad_norm": 0.306465744972229, "learning_rate": 2.481674335031352e-06, "loss": 0.3097, "step": 4237 }, { "epoch": 2.103921893099454, "grad_norm": 0.3154820203781128, "learning_rate": 2.4791788767183144e-06, "loss": 0.3387, "step": 4238 }, { "epoch": 2.104418335263942, "grad_norm": 0.3505590558052063, "learning_rate": 2.476684260038937e-06, "loss": 0.381, "step": 4239 }, { "epoch": 2.1049147774284296, "grad_norm": 0.3261749744415283, "learning_rate": 2.474190485826106e-06, "loss": 0.3143, "step": 4240 }, { "epoch": 2.1054112195929173, "grad_norm": 0.33610445261001587, "learning_rate": 2.471697554912425e-06, "loss": 0.3916, "step": 4241 }, { "epoch": 2.105907661757405, "grad_norm": 0.3318333625793457, "learning_rate": 2.4692054681302135e-06, "loss": 0.3464, "step": 4242 }, { "epoch": 2.1064041039218933, "grad_norm": 0.33396369218826294, "learning_rate": 2.466714226311513e-06, "loss": 0.3367, "step": 4243 }, { "epoch": 2.106900546086381, "grad_norm": 0.33704233169555664, "learning_rate": 2.4642238302880817e-06, "loss": 0.3376, "step": 4244 }, { "epoch": 2.1073969882508687, "grad_norm": 0.33959683775901794, "learning_rate": 2.461734280891394e-06, "loss": 0.3302, "step": 4245 }, { "epoch": 2.1078934304153565, "grad_norm": 0.34782832860946655, "learning_rate": 2.4592455789526466e-06, "loss": 0.3577, "step": 4246 }, { "epoch": 2.1083898725798447, "grad_norm": 0.3145185708999634, "learning_rate": 2.4567577253027425e-06, "loss": 0.3387, "step": 4247 }, { "epoch": 2.1088863147443324, "grad_norm": 0.3387877643108368, "learning_rate": 2.4542707207723158e-06, "loss": 0.3308, "step": 4248 }, { "epoch": 2.10938275690882, "grad_norm": 0.3339676856994629, "learning_rate": 2.451784566191705e-06, "loss": 0.3215, "step": 4249 }, { "epoch": 2.109879199073308, "grad_norm": 0.35742804408073425, "learning_rate": 2.4492992623909706e-06, "loss": 0.3549, "step": 4250 }, { "epoch": 2.1103756412377956, "grad_norm": 0.3079461455345154, "learning_rate": 2.4468148101998877e-06, "loss": 0.3324, "step": 4251 }, { "epoch": 2.110872083402284, "grad_norm": 0.31888121366500854, "learning_rate": 2.4443312104479487e-06, "loss": 0.3344, "step": 4252 }, { "epoch": 2.1113685255667716, "grad_norm": 0.37400588393211365, "learning_rate": 2.441848463964361e-06, "loss": 0.3344, "step": 4253 }, { "epoch": 2.1118649677312593, "grad_norm": 0.33595502376556396, "learning_rate": 2.4393665715780405e-06, "loss": 0.328, "step": 4254 }, { "epoch": 2.112361409895747, "grad_norm": 0.33446356654167175, "learning_rate": 2.436885534117632e-06, "loss": 0.352, "step": 4255 }, { "epoch": 2.112857852060235, "grad_norm": 0.30777522921562195, "learning_rate": 2.4344053524114796e-06, "loss": 0.319, "step": 4256 }, { "epoch": 2.113354294224723, "grad_norm": 0.3476639986038208, "learning_rate": 2.4319260272876533e-06, "loss": 0.389, "step": 4257 }, { "epoch": 2.1138507363892107, "grad_norm": 0.34331679344177246, "learning_rate": 2.429447559573926e-06, "loss": 0.3301, "step": 4258 }, { "epoch": 2.1143471785536985, "grad_norm": 0.3418808877468109, "learning_rate": 2.4269699500977987e-06, "loss": 0.3525, "step": 4259 }, { "epoch": 2.114843620718186, "grad_norm": 0.3557969629764557, "learning_rate": 2.424493199686472e-06, "loss": 0.3671, "step": 4260 }, { "epoch": 2.115340062882674, "grad_norm": 0.3376113772392273, "learning_rate": 2.4220173091668675e-06, "loss": 0.3227, "step": 4261 }, { "epoch": 2.115836505047162, "grad_norm": 0.32230791449546814, "learning_rate": 2.419542279365618e-06, "loss": 0.3199, "step": 4262 }, { "epoch": 2.11633294721165, "grad_norm": 0.3500332534313202, "learning_rate": 2.4170681111090684e-06, "loss": 0.354, "step": 4263 }, { "epoch": 2.1168293893761376, "grad_norm": 0.32294514775276184, "learning_rate": 2.414594805223278e-06, "loss": 0.3295, "step": 4264 }, { "epoch": 2.1173258315406254, "grad_norm": 0.35558801889419556, "learning_rate": 2.4121223625340134e-06, "loss": 0.3584, "step": 4265 }, { "epoch": 2.1178222737051136, "grad_norm": 0.32766616344451904, "learning_rate": 2.4096507838667564e-06, "loss": 0.2972, "step": 4266 }, { "epoch": 2.1183187158696013, "grad_norm": 0.3143586814403534, "learning_rate": 2.407180070046702e-06, "loss": 0.312, "step": 4267 }, { "epoch": 2.118815158034089, "grad_norm": 0.30515992641448975, "learning_rate": 2.404710221898752e-06, "loss": 0.3074, "step": 4268 }, { "epoch": 2.119311600198577, "grad_norm": 0.3358466625213623, "learning_rate": 2.4022412402475235e-06, "loss": 0.3212, "step": 4269 }, { "epoch": 2.1198080423630645, "grad_norm": 0.3212876617908478, "learning_rate": 2.3997731259173423e-06, "loss": 0.3368, "step": 4270 }, { "epoch": 2.1203044845275527, "grad_norm": 0.3291272819042206, "learning_rate": 2.3973058797322453e-06, "loss": 0.3545, "step": 4271 }, { "epoch": 2.1208009266920405, "grad_norm": 0.3112680912017822, "learning_rate": 2.394839502515976e-06, "loss": 0.316, "step": 4272 }, { "epoch": 2.121297368856528, "grad_norm": 0.34451058506965637, "learning_rate": 2.3923739950919924e-06, "loss": 0.3031, "step": 4273 }, { "epoch": 2.121793811021016, "grad_norm": 0.3228455185890198, "learning_rate": 2.3899093582834605e-06, "loss": 0.363, "step": 4274 }, { "epoch": 2.1222902531855037, "grad_norm": 0.3239097595214844, "learning_rate": 2.3874455929132557e-06, "loss": 0.3318, "step": 4275 }, { "epoch": 2.122786695349992, "grad_norm": 0.3275667130947113, "learning_rate": 2.384982699803964e-06, "loss": 0.3181, "step": 4276 }, { "epoch": 2.1232831375144796, "grad_norm": 0.3278745412826538, "learning_rate": 2.382520679777873e-06, "loss": 0.2932, "step": 4277 }, { "epoch": 2.1237795796789674, "grad_norm": 0.3516084849834442, "learning_rate": 2.380059533656991e-06, "loss": 0.3481, "step": 4278 }, { "epoch": 2.124276021843455, "grad_norm": 0.33535149693489075, "learning_rate": 2.377599262263023e-06, "loss": 0.3533, "step": 4279 }, { "epoch": 2.1247724640079433, "grad_norm": 0.3196180760860443, "learning_rate": 2.3751398664173906e-06, "loss": 0.2947, "step": 4280 }, { "epoch": 2.125268906172431, "grad_norm": 0.3356037139892578, "learning_rate": 2.372681346941213e-06, "loss": 0.3707, "step": 4281 }, { "epoch": 2.1257653483369188, "grad_norm": 0.3352327346801758, "learning_rate": 2.370223704655331e-06, "loss": 0.3635, "step": 4282 }, { "epoch": 2.1262617905014065, "grad_norm": 0.32871776819229126, "learning_rate": 2.3677669403802788e-06, "loss": 0.3112, "step": 4283 }, { "epoch": 2.1267582326658943, "grad_norm": 0.34807151556015015, "learning_rate": 2.3653110549363036e-06, "loss": 0.3805, "step": 4284 }, { "epoch": 2.1272546748303824, "grad_norm": 0.31433263421058655, "learning_rate": 2.3628560491433637e-06, "loss": 0.3186, "step": 4285 }, { "epoch": 2.12775111699487, "grad_norm": 0.3272414207458496, "learning_rate": 2.3604019238211135e-06, "loss": 0.3131, "step": 4286 }, { "epoch": 2.128247559159358, "grad_norm": 0.3468098044395447, "learning_rate": 2.3579486797889222e-06, "loss": 0.3466, "step": 4287 }, { "epoch": 2.1287440013238457, "grad_norm": 0.3459050953388214, "learning_rate": 2.3554963178658564e-06, "loss": 0.3628, "step": 4288 }, { "epoch": 2.1292404434883334, "grad_norm": 0.3314029574394226, "learning_rate": 2.3530448388707e-06, "loss": 0.3698, "step": 4289 }, { "epoch": 2.1297368856528216, "grad_norm": 0.2991660535335541, "learning_rate": 2.3505942436219297e-06, "loss": 0.3102, "step": 4290 }, { "epoch": 2.1302333278173093, "grad_norm": 0.32927045226097107, "learning_rate": 2.348144532937735e-06, "loss": 0.3484, "step": 4291 }, { "epoch": 2.130729769981797, "grad_norm": 0.3271586000919342, "learning_rate": 2.345695707636007e-06, "loss": 0.3655, "step": 4292 }, { "epoch": 2.131226212146285, "grad_norm": 0.35453277826309204, "learning_rate": 2.3432477685343426e-06, "loss": 0.3478, "step": 4293 }, { "epoch": 2.1317226543107726, "grad_norm": 0.3171868622303009, "learning_rate": 2.3408007164500427e-06, "loss": 0.3399, "step": 4294 }, { "epoch": 2.1322190964752608, "grad_norm": 0.3153713047504425, "learning_rate": 2.338354552200108e-06, "loss": 0.2784, "step": 4295 }, { "epoch": 2.1327155386397485, "grad_norm": 0.342125803232193, "learning_rate": 2.3359092766012517e-06, "loss": 0.3633, "step": 4296 }, { "epoch": 2.1332119808042362, "grad_norm": 0.31337183713912964, "learning_rate": 2.33346489046988e-06, "loss": 0.3319, "step": 4297 }, { "epoch": 2.133708422968724, "grad_norm": 0.3200780749320984, "learning_rate": 2.3310213946221094e-06, "loss": 0.3429, "step": 4298 }, { "epoch": 2.134204865133212, "grad_norm": 0.31334254145622253, "learning_rate": 2.3285787898737565e-06, "loss": 0.2971, "step": 4299 }, { "epoch": 2.1347013072977, "grad_norm": 0.3372136354446411, "learning_rate": 2.32613707704034e-06, "loss": 0.3589, "step": 4300 }, { "epoch": 2.1351977494621877, "grad_norm": 0.30958038568496704, "learning_rate": 2.3236962569370843e-06, "loss": 0.3353, "step": 4301 }, { "epoch": 2.1356941916266754, "grad_norm": 0.3078596889972687, "learning_rate": 2.3212563303789082e-06, "loss": 0.3496, "step": 4302 }, { "epoch": 2.136190633791163, "grad_norm": 0.29255664348602295, "learning_rate": 2.318817298180439e-06, "loss": 0.3041, "step": 4303 }, { "epoch": 2.1366870759556513, "grad_norm": 0.3528154790401459, "learning_rate": 2.3163791611560036e-06, "loss": 0.3509, "step": 4304 }, { "epoch": 2.137183518120139, "grad_norm": 0.3502650260925293, "learning_rate": 2.3139419201196316e-06, "loss": 0.3211, "step": 4305 }, { "epoch": 2.137679960284627, "grad_norm": 0.35387226939201355, "learning_rate": 2.3115055758850476e-06, "loss": 0.3041, "step": 4306 }, { "epoch": 2.1381764024491146, "grad_norm": 0.3213590383529663, "learning_rate": 2.3090701292656808e-06, "loss": 0.3341, "step": 4307 }, { "epoch": 2.1386728446136027, "grad_norm": 0.3002477288246155, "learning_rate": 2.306635581074666e-06, "loss": 0.339, "step": 4308 }, { "epoch": 2.1391692867780905, "grad_norm": 0.32140523195266724, "learning_rate": 2.304201932124827e-06, "loss": 0.3376, "step": 4309 }, { "epoch": 2.1396657289425782, "grad_norm": 0.3294033408164978, "learning_rate": 2.3017691832286953e-06, "loss": 0.3205, "step": 4310 }, { "epoch": 2.140162171107066, "grad_norm": 0.31653621792793274, "learning_rate": 2.2993373351984994e-06, "loss": 0.3366, "step": 4311 }, { "epoch": 2.1406586132715537, "grad_norm": 0.3247642517089844, "learning_rate": 2.2969063888461697e-06, "loss": 0.3334, "step": 4312 }, { "epoch": 2.141155055436042, "grad_norm": 0.3595779836177826, "learning_rate": 2.294476344983328e-06, "loss": 0.3777, "step": 4313 }, { "epoch": 2.1416514976005296, "grad_norm": 0.2883423864841461, "learning_rate": 2.292047204421303e-06, "loss": 0.3015, "step": 4314 }, { "epoch": 2.1421479397650174, "grad_norm": 0.33771437406539917, "learning_rate": 2.2896189679711186e-06, "loss": 0.3369, "step": 4315 }, { "epoch": 2.142644381929505, "grad_norm": 0.3227347135543823, "learning_rate": 2.2871916364434963e-06, "loss": 0.3359, "step": 4316 }, { "epoch": 2.143140824093993, "grad_norm": 0.31611889600753784, "learning_rate": 2.284765210648859e-06, "loss": 0.3321, "step": 4317 }, { "epoch": 2.143637266258481, "grad_norm": 0.31132757663726807, "learning_rate": 2.282339691397318e-06, "loss": 0.3504, "step": 4318 }, { "epoch": 2.144133708422969, "grad_norm": 0.2997288703918457, "learning_rate": 2.279915079498696e-06, "loss": 0.319, "step": 4319 }, { "epoch": 2.1446301505874565, "grad_norm": 0.3518507182598114, "learning_rate": 2.277491375762499e-06, "loss": 0.3446, "step": 4320 }, { "epoch": 2.1451265927519443, "grad_norm": 0.32510092854499817, "learning_rate": 2.2750685809979378e-06, "loss": 0.2968, "step": 4321 }, { "epoch": 2.145623034916432, "grad_norm": 0.3213229179382324, "learning_rate": 2.2726466960139176e-06, "loss": 0.3221, "step": 4322 }, { "epoch": 2.14611947708092, "grad_norm": 0.33020639419555664, "learning_rate": 2.270225721619041e-06, "loss": 0.3517, "step": 4323 }, { "epoch": 2.146615919245408, "grad_norm": 0.34343501925468445, "learning_rate": 2.2678056586216062e-06, "loss": 0.4066, "step": 4324 }, { "epoch": 2.1471123614098957, "grad_norm": 0.3101903200149536, "learning_rate": 2.2653865078296017e-06, "loss": 0.3301, "step": 4325 }, { "epoch": 2.1476088035743834, "grad_norm": 0.3007713556289673, "learning_rate": 2.2629682700507225e-06, "loss": 0.3158, "step": 4326 }, { "epoch": 2.1481052457388716, "grad_norm": 0.3580935597419739, "learning_rate": 2.2605509460923488e-06, "loss": 0.3369, "step": 4327 }, { "epoch": 2.1486016879033594, "grad_norm": 0.3451901972293854, "learning_rate": 2.258134536761561e-06, "loss": 0.3348, "step": 4328 }, { "epoch": 2.149098130067847, "grad_norm": 0.3156387507915497, "learning_rate": 2.2557190428651282e-06, "loss": 0.3093, "step": 4329 }, { "epoch": 2.149594572232335, "grad_norm": 0.3338984549045563, "learning_rate": 2.253304465209524e-06, "loss": 0.2975, "step": 4330 }, { "epoch": 2.1500910143968226, "grad_norm": 0.3181239068508148, "learning_rate": 2.250890804600909e-06, "loss": 0.331, "step": 4331 }, { "epoch": 2.150587456561311, "grad_norm": 0.3361319601535797, "learning_rate": 2.2484780618451357e-06, "loss": 0.3264, "step": 4332 }, { "epoch": 2.1510838987257985, "grad_norm": 0.3132392466068268, "learning_rate": 2.2460662377477554e-06, "loss": 0.3002, "step": 4333 }, { "epoch": 2.1515803408902863, "grad_norm": 0.3080103397369385, "learning_rate": 2.243655333114011e-06, "loss": 0.3525, "step": 4334 }, { "epoch": 2.152076783054774, "grad_norm": 0.337830513715744, "learning_rate": 2.2412453487488394e-06, "loss": 0.3526, "step": 4335 }, { "epoch": 2.1525732252192618, "grad_norm": 0.3154048025608063, "learning_rate": 2.2388362854568628e-06, "loss": 0.3197, "step": 4336 }, { "epoch": 2.15306966738375, "grad_norm": 0.3398171663284302, "learning_rate": 2.236428144042411e-06, "loss": 0.3523, "step": 4337 }, { "epoch": 2.1535661095482377, "grad_norm": 0.2970592677593231, "learning_rate": 2.234020925309489e-06, "loss": 0.2962, "step": 4338 }, { "epoch": 2.1540625517127254, "grad_norm": 0.366179496049881, "learning_rate": 2.2316146300618057e-06, "loss": 0.345, "step": 4339 }, { "epoch": 2.154558993877213, "grad_norm": 0.31536951661109924, "learning_rate": 2.2292092591027565e-06, "loss": 0.3029, "step": 4340 }, { "epoch": 2.1550554360417014, "grad_norm": 0.3417406976222992, "learning_rate": 2.2268048132354303e-06, "loss": 0.3505, "step": 4341 }, { "epoch": 2.155551878206189, "grad_norm": 0.3150607645511627, "learning_rate": 2.224401293262607e-06, "loss": 0.2968, "step": 4342 }, { "epoch": 2.156048320370677, "grad_norm": 0.3248623013496399, "learning_rate": 2.2219986999867537e-06, "loss": 0.3378, "step": 4343 }, { "epoch": 2.1565447625351646, "grad_norm": 0.34506648778915405, "learning_rate": 2.2195970342100328e-06, "loss": 0.3836, "step": 4344 }, { "epoch": 2.1570412046996523, "grad_norm": 0.28582853078842163, "learning_rate": 2.217196296734294e-06, "loss": 0.2973, "step": 4345 }, { "epoch": 2.1575376468641405, "grad_norm": 0.3096078634262085, "learning_rate": 2.21479648836108e-06, "loss": 0.3411, "step": 4346 }, { "epoch": 2.1580340890286283, "grad_norm": 0.33599215745925903, "learning_rate": 2.212397609891623e-06, "loss": 0.3315, "step": 4347 }, { "epoch": 2.158530531193116, "grad_norm": 0.3181214928627014, "learning_rate": 2.209999662126837e-06, "loss": 0.2937, "step": 4348 }, { "epoch": 2.1590269733576037, "grad_norm": 0.3213338553905487, "learning_rate": 2.20760264586734e-06, "loss": 0.3534, "step": 4349 }, { "epoch": 2.1595234155220915, "grad_norm": 0.329299658536911, "learning_rate": 2.2052065619134243e-06, "loss": 0.3313, "step": 4350 }, { "epoch": 2.1600198576865797, "grad_norm": 0.3124282658100128, "learning_rate": 2.2028114110650796e-06, "loss": 0.3639, "step": 4351 }, { "epoch": 2.1605162998510674, "grad_norm": 0.35106468200683594, "learning_rate": 2.200417194121981e-06, "loss": 0.3571, "step": 4352 }, { "epoch": 2.161012742015555, "grad_norm": 0.3281121253967285, "learning_rate": 2.198023911883495e-06, "loss": 0.3279, "step": 4353 }, { "epoch": 2.161509184180043, "grad_norm": 0.3081805109977722, "learning_rate": 2.1956315651486694e-06, "loss": 0.33, "step": 4354 }, { "epoch": 2.1620056263445306, "grad_norm": 0.31199729442596436, "learning_rate": 2.1932401547162436e-06, "loss": 0.3157, "step": 4355 }, { "epoch": 2.162502068509019, "grad_norm": 0.3429150879383087, "learning_rate": 2.1908496813846503e-06, "loss": 0.3482, "step": 4356 }, { "epoch": 2.1629985106735066, "grad_norm": 0.33236101269721985, "learning_rate": 2.188460145951998e-06, "loss": 0.3547, "step": 4357 }, { "epoch": 2.1634949528379943, "grad_norm": 0.3246014416217804, "learning_rate": 2.1860715492160922e-06, "loss": 0.3662, "step": 4358 }, { "epoch": 2.163991395002482, "grad_norm": 0.2991873025894165, "learning_rate": 2.1836838919744136e-06, "loss": 0.2888, "step": 4359 }, { "epoch": 2.1644878371669702, "grad_norm": 0.32452821731567383, "learning_rate": 2.1812971750241436e-06, "loss": 0.3702, "step": 4360 }, { "epoch": 2.164984279331458, "grad_norm": 0.3370179533958435, "learning_rate": 2.178911399162137e-06, "loss": 0.3408, "step": 4361 }, { "epoch": 2.1654807214959457, "grad_norm": 0.31037458777427673, "learning_rate": 2.1765265651849415e-06, "loss": 0.3035, "step": 4362 }, { "epoch": 2.1659771636604335, "grad_norm": 0.33051279187202454, "learning_rate": 2.1741426738887885e-06, "loss": 0.3322, "step": 4363 }, { "epoch": 2.166473605824921, "grad_norm": 0.3431636691093445, "learning_rate": 2.1717597260695934e-06, "loss": 0.337, "step": 4364 }, { "epoch": 2.1669700479894094, "grad_norm": 0.3232525587081909, "learning_rate": 2.1693777225229605e-06, "loss": 0.3122, "step": 4365 }, { "epoch": 2.167466490153897, "grad_norm": 0.3531665503978729, "learning_rate": 2.16699666404417e-06, "loss": 0.3925, "step": 4366 }, { "epoch": 2.167962932318385, "grad_norm": 0.3042391836643219, "learning_rate": 2.1646165514282014e-06, "loss": 0.2785, "step": 4367 }, { "epoch": 2.1684593744828726, "grad_norm": 0.32260772585868835, "learning_rate": 2.162237385469702e-06, "loss": 0.307, "step": 4368 }, { "epoch": 2.168955816647361, "grad_norm": 0.3310593366622925, "learning_rate": 2.1598591669630135e-06, "loss": 0.3651, "step": 4369 }, { "epoch": 2.1694522588118486, "grad_norm": 0.3620672821998596, "learning_rate": 2.1574818967021595e-06, "loss": 0.3691, "step": 4370 }, { "epoch": 2.1699487009763363, "grad_norm": 0.32394781708717346, "learning_rate": 2.1551055754808436e-06, "loss": 0.3304, "step": 4371 }, { "epoch": 2.170445143140824, "grad_norm": 0.3346589207649231, "learning_rate": 2.1527302040924588e-06, "loss": 0.3107, "step": 4372 }, { "epoch": 2.170941585305312, "grad_norm": 0.31566759943962097, "learning_rate": 2.1503557833300714e-06, "loss": 0.3424, "step": 4373 }, { "epoch": 2.1714380274698, "grad_norm": 0.42482197284698486, "learning_rate": 2.14798231398644e-06, "loss": 0.312, "step": 4374 }, { "epoch": 2.1719344696342877, "grad_norm": 0.3189176917076111, "learning_rate": 2.1456097968539996e-06, "loss": 0.2974, "step": 4375 }, { "epoch": 2.1724309117987755, "grad_norm": 0.3509669899940491, "learning_rate": 2.1432382327248724e-06, "loss": 0.3669, "step": 4376 }, { "epoch": 2.172927353963263, "grad_norm": 0.33200567960739136, "learning_rate": 2.140867622390853e-06, "loss": 0.3142, "step": 4377 }, { "epoch": 2.173423796127751, "grad_norm": 0.34207016229629517, "learning_rate": 2.1384979666434295e-06, "loss": 0.3652, "step": 4378 }, { "epoch": 2.173920238292239, "grad_norm": 0.33781740069389343, "learning_rate": 2.1361292662737655e-06, "loss": 0.3549, "step": 4379 }, { "epoch": 2.174416680456727, "grad_norm": 0.3015495836734772, "learning_rate": 2.1337615220727015e-06, "loss": 0.341, "step": 4380 }, { "epoch": 2.1749131226212146, "grad_norm": 0.3198246955871582, "learning_rate": 2.1313947348307655e-06, "loss": 0.3227, "step": 4381 }, { "epoch": 2.1754095647857024, "grad_norm": 0.33245915174484253, "learning_rate": 2.1290289053381635e-06, "loss": 0.3377, "step": 4382 }, { "epoch": 2.17590600695019, "grad_norm": 0.3500523865222931, "learning_rate": 2.1266640343847826e-06, "loss": 0.3719, "step": 4383 }, { "epoch": 2.1764024491146783, "grad_norm": 0.3077021837234497, "learning_rate": 2.124300122760186e-06, "loss": 0.2665, "step": 4384 }, { "epoch": 2.176898891279166, "grad_norm": 0.3473612368106842, "learning_rate": 2.1219371712536214e-06, "loss": 0.346, "step": 4385 }, { "epoch": 2.1773953334436538, "grad_norm": 0.32069307565689087, "learning_rate": 2.119575180654014e-06, "loss": 0.3394, "step": 4386 }, { "epoch": 2.1778917756081415, "grad_norm": 0.30832117795944214, "learning_rate": 2.1172141517499676e-06, "loss": 0.306, "step": 4387 }, { "epoch": 2.1783882177726293, "grad_norm": 0.3152512311935425, "learning_rate": 2.114854085329769e-06, "loss": 0.312, "step": 4388 }, { "epoch": 2.1788846599371174, "grad_norm": 0.32638293504714966, "learning_rate": 2.112494982181373e-06, "loss": 0.3752, "step": 4389 }, { "epoch": 2.179381102101605, "grad_norm": 0.31603074073791504, "learning_rate": 2.110136843092428e-06, "loss": 0.334, "step": 4390 }, { "epoch": 2.179877544266093, "grad_norm": 0.30802610516548157, "learning_rate": 2.1077796688502478e-06, "loss": 0.2912, "step": 4391 }, { "epoch": 2.1803739864305807, "grad_norm": 0.32200688123703003, "learning_rate": 2.1054234602418294e-06, "loss": 0.3304, "step": 4392 }, { "epoch": 2.180870428595069, "grad_norm": 0.33420705795288086, "learning_rate": 2.1030682180538475e-06, "loss": 0.3706, "step": 4393 }, { "epoch": 2.1813668707595566, "grad_norm": 0.3049645721912384, "learning_rate": 2.100713943072653e-06, "loss": 0.3375, "step": 4394 }, { "epoch": 2.1818633129240443, "grad_norm": 0.3106015920639038, "learning_rate": 2.0983606360842773e-06, "loss": 0.3389, "step": 4395 }, { "epoch": 2.182359755088532, "grad_norm": 0.31577420234680176, "learning_rate": 2.096008297874419e-06, "loss": 0.3302, "step": 4396 }, { "epoch": 2.18285619725302, "grad_norm": 0.31807342171669006, "learning_rate": 2.0936569292284675e-06, "loss": 0.3227, "step": 4397 }, { "epoch": 2.183352639417508, "grad_norm": 0.3192598223686218, "learning_rate": 2.091306530931475e-06, "loss": 0.3181, "step": 4398 }, { "epoch": 2.1838490815819958, "grad_norm": 0.3093181848526001, "learning_rate": 2.0889571037681807e-06, "loss": 0.3207, "step": 4399 }, { "epoch": 2.1843455237464835, "grad_norm": 0.33326950669288635, "learning_rate": 2.0866086485229875e-06, "loss": 0.3323, "step": 4400 }, { "epoch": 2.1848419659109712, "grad_norm": 0.33552753925323486, "learning_rate": 2.0842611659799868e-06, "loss": 0.3248, "step": 4401 }, { "epoch": 2.1853384080754594, "grad_norm": 0.34005868434906006, "learning_rate": 2.081914656922939e-06, "loss": 0.3195, "step": 4402 }, { "epoch": 2.185834850239947, "grad_norm": 0.3170172870159149, "learning_rate": 2.0795691221352766e-06, "loss": 0.3361, "step": 4403 }, { "epoch": 2.186331292404435, "grad_norm": 0.3240414261817932, "learning_rate": 2.0772245624001114e-06, "loss": 0.3502, "step": 4404 }, { "epoch": 2.1868277345689227, "grad_norm": 0.30753692984580994, "learning_rate": 2.0748809785002285e-06, "loss": 0.3123, "step": 4405 }, { "epoch": 2.1873241767334104, "grad_norm": 0.33742570877075195, "learning_rate": 2.072538371218088e-06, "loss": 0.3389, "step": 4406 }, { "epoch": 2.1878206188978986, "grad_norm": 0.3144347071647644, "learning_rate": 2.0701967413358177e-06, "loss": 0.3369, "step": 4407 }, { "epoch": 2.1883170610623863, "grad_norm": 0.33557894825935364, "learning_rate": 2.067856089635231e-06, "loss": 0.3643, "step": 4408 }, { "epoch": 2.188813503226874, "grad_norm": 0.31581220030784607, "learning_rate": 2.065516416897804e-06, "loss": 0.3441, "step": 4409 }, { "epoch": 2.189309945391362, "grad_norm": 0.3408113121986389, "learning_rate": 2.06317772390469e-06, "loss": 0.3474, "step": 4410 }, { "epoch": 2.1898063875558496, "grad_norm": 0.33141466975212097, "learning_rate": 2.060840011436715e-06, "loss": 0.3181, "step": 4411 }, { "epoch": 2.1903028297203377, "grad_norm": 0.3296647369861603, "learning_rate": 2.058503280274379e-06, "loss": 0.3257, "step": 4412 }, { "epoch": 2.1907992718848255, "grad_norm": 0.3253755271434784, "learning_rate": 2.0561675311978533e-06, "loss": 0.3302, "step": 4413 }, { "epoch": 2.1912957140493132, "grad_norm": 0.31297603249549866, "learning_rate": 2.0538327649869793e-06, "loss": 0.3362, "step": 4414 }, { "epoch": 2.191792156213801, "grad_norm": 0.3459151089191437, "learning_rate": 2.0514989824212723e-06, "loss": 0.342, "step": 4415 }, { "epoch": 2.1922885983782887, "grad_norm": 0.31148412823677063, "learning_rate": 2.049166184279919e-06, "loss": 0.3345, "step": 4416 }, { "epoch": 2.192785040542777, "grad_norm": 0.3244611322879791, "learning_rate": 2.0468343713417773e-06, "loss": 0.3408, "step": 4417 }, { "epoch": 2.1932814827072646, "grad_norm": 0.3200485110282898, "learning_rate": 2.0445035443853765e-06, "loss": 0.3564, "step": 4418 }, { "epoch": 2.1937779248717524, "grad_norm": 0.3207896649837494, "learning_rate": 2.0421737041889167e-06, "loss": 0.3266, "step": 4419 }, { "epoch": 2.19427436703624, "grad_norm": 0.29134175181388855, "learning_rate": 2.0398448515302694e-06, "loss": 0.2619, "step": 4420 }, { "epoch": 2.1947708092007283, "grad_norm": 0.331379771232605, "learning_rate": 2.0375169871869722e-06, "loss": 0.3731, "step": 4421 }, { "epoch": 2.195267251365216, "grad_norm": 0.2969735860824585, "learning_rate": 2.0351901119362368e-06, "loss": 0.3348, "step": 4422 }, { "epoch": 2.195763693529704, "grad_norm": 0.3251933455467224, "learning_rate": 2.0328642265549435e-06, "loss": 0.3309, "step": 4423 }, { "epoch": 2.1962601356941915, "grad_norm": 0.3437923192977905, "learning_rate": 2.0305393318196432e-06, "loss": 0.3859, "step": 4424 }, { "epoch": 2.1967565778586793, "grad_norm": 0.312558650970459, "learning_rate": 2.0282154285065566e-06, "loss": 0.3165, "step": 4425 }, { "epoch": 2.1972530200231675, "grad_norm": 0.3154654800891876, "learning_rate": 2.0258925173915658e-06, "loss": 0.318, "step": 4426 }, { "epoch": 2.197749462187655, "grad_norm": 0.33701270818710327, "learning_rate": 2.0235705992502353e-06, "loss": 0.3733, "step": 4427 }, { "epoch": 2.198245904352143, "grad_norm": 0.31165245175361633, "learning_rate": 2.021249674857785e-06, "loss": 0.3149, "step": 4428 }, { "epoch": 2.1987423465166307, "grad_norm": 0.32909515500068665, "learning_rate": 2.0189297449891123e-06, "loss": 0.3309, "step": 4429 }, { "epoch": 2.199238788681119, "grad_norm": 0.3229091167449951, "learning_rate": 2.016610810418773e-06, "loss": 0.3535, "step": 4430 }, { "epoch": 2.1997352308456066, "grad_norm": 0.2987779974937439, "learning_rate": 2.0142928719210035e-06, "loss": 0.3161, "step": 4431 }, { "epoch": 2.2002316730100944, "grad_norm": 0.3156440556049347, "learning_rate": 2.011975930269696e-06, "loss": 0.309, "step": 4432 }, { "epoch": 2.200728115174582, "grad_norm": 0.34972062706947327, "learning_rate": 2.0096599862384147e-06, "loss": 0.3424, "step": 4433 }, { "epoch": 2.20122455733907, "grad_norm": 0.30167168378829956, "learning_rate": 2.0073450406003907e-06, "loss": 0.3371, "step": 4434 }, { "epoch": 2.201720999503558, "grad_norm": 0.3113873600959778, "learning_rate": 2.0050310941285226e-06, "loss": 0.3528, "step": 4435 }, { "epoch": 2.202217441668046, "grad_norm": 0.3178193271160126, "learning_rate": 2.002718147595375e-06, "loss": 0.3192, "step": 4436 }, { "epoch": 2.2027138838325335, "grad_norm": 0.3341817557811737, "learning_rate": 2.0004062017731724e-06, "loss": 0.3475, "step": 4437 }, { "epoch": 2.2032103259970213, "grad_norm": 0.3020227253437042, "learning_rate": 1.9980952574338185e-06, "loss": 0.3173, "step": 4438 }, { "epoch": 2.203706768161509, "grad_norm": 0.29995718598365784, "learning_rate": 1.9957853153488694e-06, "loss": 0.3285, "step": 4439 }, { "epoch": 2.204203210325997, "grad_norm": 0.3143261969089508, "learning_rate": 1.9934763762895526e-06, "loss": 0.3338, "step": 4440 }, { "epoch": 2.204699652490485, "grad_norm": 0.3235434591770172, "learning_rate": 1.991168441026762e-06, "loss": 0.2912, "step": 4441 }, { "epoch": 2.2051960946549727, "grad_norm": 0.31201425194740295, "learning_rate": 1.9888615103310527e-06, "loss": 0.355, "step": 4442 }, { "epoch": 2.2056925368194604, "grad_norm": 0.3182674050331116, "learning_rate": 1.9865555849726488e-06, "loss": 0.3585, "step": 4443 }, { "epoch": 2.206188978983948, "grad_norm": 0.3238373100757599, "learning_rate": 1.9842506657214327e-06, "loss": 0.3766, "step": 4444 }, { "epoch": 2.2066854211484364, "grad_norm": 0.30601564049720764, "learning_rate": 1.9819467533469554e-06, "loss": 0.2919, "step": 4445 }, { "epoch": 2.207181863312924, "grad_norm": 0.33354127407073975, "learning_rate": 1.979643848618431e-06, "loss": 0.3684, "step": 4446 }, { "epoch": 2.207678305477412, "grad_norm": 0.3276797831058502, "learning_rate": 1.977341952304739e-06, "loss": 0.3455, "step": 4447 }, { "epoch": 2.2081747476418996, "grad_norm": 0.33222460746765137, "learning_rate": 1.9750410651744138e-06, "loss": 0.3195, "step": 4448 }, { "epoch": 2.2086711898063873, "grad_norm": 0.3128325343132019, "learning_rate": 1.9727411879956654e-06, "loss": 0.3155, "step": 4449 }, { "epoch": 2.2091676319708755, "grad_norm": 0.3314554989337921, "learning_rate": 1.9704423215363594e-06, "loss": 0.3579, "step": 4450 }, { "epoch": 2.2096640741353633, "grad_norm": 0.30452513694763184, "learning_rate": 1.968144466564022e-06, "loss": 0.3117, "step": 4451 }, { "epoch": 2.210160516299851, "grad_norm": 0.3302224278450012, "learning_rate": 1.9658476238458458e-06, "loss": 0.3448, "step": 4452 }, { "epoch": 2.2106569584643387, "grad_norm": 0.32048389315605164, "learning_rate": 1.9635517941486843e-06, "loss": 0.3249, "step": 4453 }, { "epoch": 2.211153400628827, "grad_norm": 0.34949082136154175, "learning_rate": 1.961256978239054e-06, "loss": 0.3332, "step": 4454 }, { "epoch": 2.2116498427933147, "grad_norm": 0.36576932668685913, "learning_rate": 1.9589631768831293e-06, "loss": 0.3727, "step": 4455 }, { "epoch": 2.2121462849578024, "grad_norm": 0.28081250190734863, "learning_rate": 1.956670390846748e-06, "loss": 0.2922, "step": 4456 }, { "epoch": 2.21264272712229, "grad_norm": 0.3177272379398346, "learning_rate": 1.9543786208954106e-06, "loss": 0.3327, "step": 4457 }, { "epoch": 2.213139169286778, "grad_norm": 0.3349776864051819, "learning_rate": 1.952087867794277e-06, "loss": 0.3381, "step": 4458 }, { "epoch": 2.213635611451266, "grad_norm": 0.309898316860199, "learning_rate": 1.949798132308167e-06, "loss": 0.2954, "step": 4459 }, { "epoch": 2.214132053615754, "grad_norm": 0.31365182995796204, "learning_rate": 1.947509415201558e-06, "loss": 0.3346, "step": 4460 }, { "epoch": 2.2146284957802416, "grad_norm": 0.3166103661060333, "learning_rate": 1.945221717238597e-06, "loss": 0.3515, "step": 4461 }, { "epoch": 2.2151249379447293, "grad_norm": 0.3291226029396057, "learning_rate": 1.942935039183078e-06, "loss": 0.3557, "step": 4462 }, { "epoch": 2.2156213801092175, "grad_norm": 0.33277636766433716, "learning_rate": 1.9406493817984632e-06, "loss": 0.3371, "step": 4463 }, { "epoch": 2.2161178222737052, "grad_norm": 0.30517733097076416, "learning_rate": 1.9383647458478718e-06, "loss": 0.2886, "step": 4464 }, { "epoch": 2.216614264438193, "grad_norm": 0.33692896366119385, "learning_rate": 1.9360811320940805e-06, "loss": 0.3645, "step": 4465 }, { "epoch": 2.2171107066026807, "grad_norm": 0.34152930974960327, "learning_rate": 1.933798541299528e-06, "loss": 0.3706, "step": 4466 }, { "epoch": 2.2176071487671685, "grad_norm": 0.33263856172561646, "learning_rate": 1.9315169742263048e-06, "loss": 0.3124, "step": 4467 }, { "epoch": 2.2181035909316567, "grad_norm": 0.3426186740398407, "learning_rate": 1.9292364316361707e-06, "loss": 0.3013, "step": 4468 }, { "epoch": 2.2186000330961444, "grad_norm": 0.32433751225471497, "learning_rate": 1.9269569142905316e-06, "loss": 0.3652, "step": 4469 }, { "epoch": 2.219096475260632, "grad_norm": 0.32144537568092346, "learning_rate": 1.9246784229504593e-06, "loss": 0.3145, "step": 4470 }, { "epoch": 2.21959291742512, "grad_norm": 0.3154356777667999, "learning_rate": 1.9224009583766763e-06, "loss": 0.3379, "step": 4471 }, { "epoch": 2.2200893595896076, "grad_norm": 0.3307437598705292, "learning_rate": 1.92012452132957e-06, "loss": 0.3377, "step": 4472 }, { "epoch": 2.220585801754096, "grad_norm": 0.3323841691017151, "learning_rate": 1.917849112569181e-06, "loss": 0.3285, "step": 4473 }, { "epoch": 2.2210822439185836, "grad_norm": 0.3066924214363098, "learning_rate": 1.9155747328552027e-06, "loss": 0.3582, "step": 4474 }, { "epoch": 2.2215786860830713, "grad_norm": 0.3378407061100006, "learning_rate": 1.913301382946994e-06, "loss": 0.3472, "step": 4475 }, { "epoch": 2.222075128247559, "grad_norm": 0.3283631503582001, "learning_rate": 1.91102906360356e-06, "loss": 0.3172, "step": 4476 }, { "epoch": 2.222571570412047, "grad_norm": 0.32977166771888733, "learning_rate": 1.9087577755835694e-06, "loss": 0.339, "step": 4477 }, { "epoch": 2.223068012576535, "grad_norm": 0.33217310905456543, "learning_rate": 1.9064875196453392e-06, "loss": 0.3747, "step": 4478 }, { "epoch": 2.2235644547410227, "grad_norm": 0.35641342401504517, "learning_rate": 1.9042182965468525e-06, "loss": 0.3391, "step": 4479 }, { "epoch": 2.2240608969055105, "grad_norm": 0.3427965044975281, "learning_rate": 1.9019501070457363e-06, "loss": 0.3531, "step": 4480 }, { "epoch": 2.224557339069998, "grad_norm": 0.31374630331993103, "learning_rate": 1.8996829518992793e-06, "loss": 0.3334, "step": 4481 }, { "epoch": 2.2250537812344864, "grad_norm": 0.32918646931648254, "learning_rate": 1.8974168318644221e-06, "loss": 0.3059, "step": 4482 }, { "epoch": 2.225550223398974, "grad_norm": 0.3266584277153015, "learning_rate": 1.8951517476977615e-06, "loss": 0.3401, "step": 4483 }, { "epoch": 2.226046665563462, "grad_norm": 0.31245356798171997, "learning_rate": 1.892887700155549e-06, "loss": 0.3636, "step": 4484 }, { "epoch": 2.2265431077279496, "grad_norm": 0.3227308988571167, "learning_rate": 1.8906246899936853e-06, "loss": 0.3168, "step": 4485 }, { "epoch": 2.2270395498924374, "grad_norm": 0.3331391215324402, "learning_rate": 1.8883627179677287e-06, "loss": 0.3137, "step": 4486 }, { "epoch": 2.2275359920569255, "grad_norm": 0.33028334379196167, "learning_rate": 1.8861017848328917e-06, "loss": 0.3373, "step": 4487 }, { "epoch": 2.2280324342214133, "grad_norm": 0.29741334915161133, "learning_rate": 1.8838418913440376e-06, "loss": 0.3243, "step": 4488 }, { "epoch": 2.228528876385901, "grad_norm": 0.3535260856151581, "learning_rate": 1.8815830382556832e-06, "loss": 0.3922, "step": 4489 }, { "epoch": 2.2290253185503888, "grad_norm": 0.3085800111293793, "learning_rate": 1.8793252263219985e-06, "loss": 0.3142, "step": 4490 }, { "epoch": 2.2295217607148765, "grad_norm": 0.31913793087005615, "learning_rate": 1.8770684562968079e-06, "loss": 0.3356, "step": 4491 }, { "epoch": 2.2300182028793647, "grad_norm": 0.3177220821380615, "learning_rate": 1.8748127289335805e-06, "loss": 0.3148, "step": 4492 }, { "epoch": 2.2305146450438524, "grad_norm": 0.2904345989227295, "learning_rate": 1.8725580449854453e-06, "loss": 0.3121, "step": 4493 }, { "epoch": 2.23101108720834, "grad_norm": 0.31378066539764404, "learning_rate": 1.87030440520518e-06, "loss": 0.349, "step": 4494 }, { "epoch": 2.231507529372828, "grad_norm": 0.3205774426460266, "learning_rate": 1.8680518103452134e-06, "loss": 0.3332, "step": 4495 }, { "epoch": 2.232003971537316, "grad_norm": 0.32441022992134094, "learning_rate": 1.865800261157627e-06, "loss": 0.3453, "step": 4496 }, { "epoch": 2.232500413701804, "grad_norm": 0.3088085651397705, "learning_rate": 1.863549758394147e-06, "loss": 0.2865, "step": 4497 }, { "epoch": 2.2329968558662916, "grad_norm": 0.30940964818000793, "learning_rate": 1.8613003028061627e-06, "loss": 0.3058, "step": 4498 }, { "epoch": 2.2334932980307793, "grad_norm": 0.31816986203193665, "learning_rate": 1.8590518951447001e-06, "loss": 0.3646, "step": 4499 }, { "epoch": 2.233989740195267, "grad_norm": 0.3317260146141052, "learning_rate": 1.8568045361604453e-06, "loss": 0.3192, "step": 4500 }, { "epoch": 2.2344861823597553, "grad_norm": 0.30083125829696655, "learning_rate": 1.8545582266037254e-06, "loss": 0.3441, "step": 4501 }, { "epoch": 2.234982624524243, "grad_norm": 0.31072255969047546, "learning_rate": 1.8523129672245283e-06, "loss": 0.3338, "step": 4502 }, { "epoch": 2.2354790666887308, "grad_norm": 0.3322658836841583, "learning_rate": 1.8500687587724803e-06, "loss": 0.3633, "step": 4503 }, { "epoch": 2.2359755088532185, "grad_norm": 0.3446650505065918, "learning_rate": 1.8478256019968637e-06, "loss": 0.3003, "step": 4504 }, { "epoch": 2.2364719510177062, "grad_norm": 0.32848796248435974, "learning_rate": 1.8455834976466069e-06, "loss": 0.3321, "step": 4505 }, { "epoch": 2.2369683931821944, "grad_norm": 0.3020167052745819, "learning_rate": 1.8433424464702882e-06, "loss": 0.3164, "step": 4506 }, { "epoch": 2.237464835346682, "grad_norm": 0.3336108922958374, "learning_rate": 1.841102449216135e-06, "loss": 0.3434, "step": 4507 }, { "epoch": 2.23796127751117, "grad_norm": 0.3166925013065338, "learning_rate": 1.8388635066320164e-06, "loss": 0.3509, "step": 4508 }, { "epoch": 2.2384577196756577, "grad_norm": 0.2970641851425171, "learning_rate": 1.8366256194654613e-06, "loss": 0.3265, "step": 4509 }, { "epoch": 2.2389541618401454, "grad_norm": 0.3336975574493408, "learning_rate": 1.8343887884636353e-06, "loss": 0.3537, "step": 4510 }, { "epoch": 2.2394506040046336, "grad_norm": 0.3011425733566284, "learning_rate": 1.8321530143733552e-06, "loss": 0.3119, "step": 4511 }, { "epoch": 2.2399470461691213, "grad_norm": 0.3248828649520874, "learning_rate": 1.8299182979410867e-06, "loss": 0.3586, "step": 4512 }, { "epoch": 2.240443488333609, "grad_norm": 0.31825852394104004, "learning_rate": 1.8276846399129405e-06, "loss": 0.3236, "step": 4513 }, { "epoch": 2.240939930498097, "grad_norm": 0.3210591971874237, "learning_rate": 1.825452041034676e-06, "loss": 0.3141, "step": 4514 }, { "epoch": 2.241436372662585, "grad_norm": 0.31836313009262085, "learning_rate": 1.8232205020516925e-06, "loss": 0.3276, "step": 4515 }, { "epoch": 2.2419328148270727, "grad_norm": 0.33820202946662903, "learning_rate": 1.8209900237090461e-06, "loss": 0.3409, "step": 4516 }, { "epoch": 2.2424292569915605, "grad_norm": 0.3335728049278259, "learning_rate": 1.8187606067514284e-06, "loss": 0.3042, "step": 4517 }, { "epoch": 2.2429256991560482, "grad_norm": 0.3288349211215973, "learning_rate": 1.8165322519231832e-06, "loss": 0.3363, "step": 4518 }, { "epoch": 2.243422141320536, "grad_norm": 0.3241031765937805, "learning_rate": 1.8143049599682972e-06, "loss": 0.3603, "step": 4519 }, { "epoch": 2.243918583485024, "grad_norm": 0.30090489983558655, "learning_rate": 1.8120787316304028e-06, "loss": 0.315, "step": 4520 }, { "epoch": 2.244415025649512, "grad_norm": 0.3166385591030121, "learning_rate": 1.8098535676527785e-06, "loss": 0.3359, "step": 4521 }, { "epoch": 2.2449114678139996, "grad_norm": 0.3398277461528778, "learning_rate": 1.8076294687783424e-06, "loss": 0.3341, "step": 4522 }, { "epoch": 2.2454079099784874, "grad_norm": 0.32305267453193665, "learning_rate": 1.8054064357496636e-06, "loss": 0.3122, "step": 4523 }, { "epoch": 2.2459043521429756, "grad_norm": 0.3290334939956665, "learning_rate": 1.8031844693089513e-06, "loss": 0.3357, "step": 4524 }, { "epoch": 2.2464007943074633, "grad_norm": 0.3166326582431793, "learning_rate": 1.8009635701980615e-06, "loss": 0.316, "step": 4525 }, { "epoch": 2.246897236471951, "grad_norm": 0.32509684562683105, "learning_rate": 1.7987437391584894e-06, "loss": 0.3658, "step": 4526 }, { "epoch": 2.247393678636439, "grad_norm": 0.299060583114624, "learning_rate": 1.7965249769313776e-06, "loss": 0.3117, "step": 4527 }, { "epoch": 2.2478901208009265, "grad_norm": 0.3175930678844452, "learning_rate": 1.79430728425751e-06, "loss": 0.3488, "step": 4528 }, { "epoch": 2.2483865629654147, "grad_norm": 0.3175496757030487, "learning_rate": 1.7920906618773142e-06, "loss": 0.3462, "step": 4529 }, { "epoch": 2.2488830051299025, "grad_norm": 0.3160818815231323, "learning_rate": 1.7898751105308605e-06, "loss": 0.3411, "step": 4530 }, { "epoch": 2.24937944729439, "grad_norm": 0.318082332611084, "learning_rate": 1.7876606309578608e-06, "loss": 0.3355, "step": 4531 }, { "epoch": 2.249875889458878, "grad_norm": 0.3267844319343567, "learning_rate": 1.7854472238976717e-06, "loss": 0.3249, "step": 4532 }, { "epoch": 2.2503723316233657, "grad_norm": 0.30597206950187683, "learning_rate": 1.7832348900892864e-06, "loss": 0.3215, "step": 4533 }, { "epoch": 2.250868773787854, "grad_norm": 0.347590833902359, "learning_rate": 1.781023630271344e-06, "loss": 0.3427, "step": 4534 }, { "epoch": 2.2513652159523416, "grad_norm": 0.3154980540275574, "learning_rate": 1.7788134451821248e-06, "loss": 0.3068, "step": 4535 }, { "epoch": 2.2518616581168294, "grad_norm": 0.323558509349823, "learning_rate": 1.7766043355595498e-06, "loss": 0.34, "step": 4536 }, { "epoch": 2.252358100281317, "grad_norm": 0.3123858571052551, "learning_rate": 1.774396302141181e-06, "loss": 0.3334, "step": 4537 }, { "epoch": 2.252854542445805, "grad_norm": 0.33964478969573975, "learning_rate": 1.7721893456642165e-06, "loss": 0.3331, "step": 4538 }, { "epoch": 2.253350984610293, "grad_norm": 0.36003851890563965, "learning_rate": 1.7699834668655065e-06, "loss": 0.3369, "step": 4539 }, { "epoch": 2.253847426774781, "grad_norm": 0.33648812770843506, "learning_rate": 1.7677786664815278e-06, "loss": 0.3285, "step": 4540 }, { "epoch": 2.2543438689392685, "grad_norm": 0.3430643379688263, "learning_rate": 1.7655749452484067e-06, "loss": 0.3217, "step": 4541 }, { "epoch": 2.2548403111037563, "grad_norm": 0.3320642113685608, "learning_rate": 1.7633723039019018e-06, "loss": 0.3217, "step": 4542 }, { "epoch": 2.255336753268244, "grad_norm": 0.31508857011795044, "learning_rate": 1.7611707431774193e-06, "loss": 0.3434, "step": 4543 }, { "epoch": 2.255833195432732, "grad_norm": 0.3058767318725586, "learning_rate": 1.758970263810001e-06, "loss": 0.3371, "step": 4544 }, { "epoch": 2.25632963759722, "grad_norm": 0.3142128586769104, "learning_rate": 1.756770866534322e-06, "loss": 0.3158, "step": 4545 }, { "epoch": 2.2568260797617077, "grad_norm": 0.321697473526001, "learning_rate": 1.7545725520847078e-06, "loss": 0.3581, "step": 4546 }, { "epoch": 2.2573225219261954, "grad_norm": 0.3084547817707062, "learning_rate": 1.7523753211951112e-06, "loss": 0.3565, "step": 4547 }, { "epoch": 2.2578189640906836, "grad_norm": 0.3010653853416443, "learning_rate": 1.7501791745991308e-06, "loss": 0.3084, "step": 4548 }, { "epoch": 2.2583154062551714, "grad_norm": 0.3122425377368927, "learning_rate": 1.7479841130299957e-06, "loss": 0.3361, "step": 4549 }, { "epoch": 2.258811848419659, "grad_norm": 0.3446396589279175, "learning_rate": 1.7457901372205832e-06, "loss": 0.3443, "step": 4550 }, { "epoch": 2.259308290584147, "grad_norm": 0.33582213521003723, "learning_rate": 1.7435972479033981e-06, "loss": 0.2869, "step": 4551 }, { "epoch": 2.259804732748635, "grad_norm": 0.3433196544647217, "learning_rate": 1.7414054458105878e-06, "loss": 0.3687, "step": 4552 }, { "epoch": 2.2603011749131228, "grad_norm": 0.33336177468299866, "learning_rate": 1.7392147316739356e-06, "loss": 0.3225, "step": 4553 }, { "epoch": 2.2607976170776105, "grad_norm": 0.3073997497558594, "learning_rate": 1.7370251062248606e-06, "loss": 0.2978, "step": 4554 }, { "epoch": 2.2612940592420983, "grad_norm": 0.34691545367240906, "learning_rate": 1.734836570194422e-06, "loss": 0.3168, "step": 4555 }, { "epoch": 2.261790501406586, "grad_norm": 0.35814040899276733, "learning_rate": 1.732649124313307e-06, "loss": 0.3476, "step": 4556 }, { "epoch": 2.262286943571074, "grad_norm": 0.3213381767272949, "learning_rate": 1.7304627693118508e-06, "loss": 0.3375, "step": 4557 }, { "epoch": 2.262783385735562, "grad_norm": 0.325488805770874, "learning_rate": 1.7282775059200136e-06, "loss": 0.3092, "step": 4558 }, { "epoch": 2.2632798279000497, "grad_norm": 0.3246370851993561, "learning_rate": 1.7260933348673963e-06, "loss": 0.3183, "step": 4559 }, { "epoch": 2.2637762700645374, "grad_norm": 0.313072144985199, "learning_rate": 1.723910256883235e-06, "loss": 0.3334, "step": 4560 }, { "epoch": 2.264272712229025, "grad_norm": 0.3300376534461975, "learning_rate": 1.7217282726963996e-06, "loss": 0.3406, "step": 4561 }, { "epoch": 2.2647691543935133, "grad_norm": 0.3243522047996521, "learning_rate": 1.7195473830353971e-06, "loss": 0.3044, "step": 4562 }, { "epoch": 2.265265596558001, "grad_norm": 0.31319159269332886, "learning_rate": 1.7173675886283642e-06, "loss": 0.3178, "step": 4563 }, { "epoch": 2.265762038722489, "grad_norm": 0.3281477689743042, "learning_rate": 1.7151888902030762e-06, "loss": 0.3447, "step": 4564 }, { "epoch": 2.2662584808869766, "grad_norm": 0.3153233826160431, "learning_rate": 1.7130112884869415e-06, "loss": 0.3353, "step": 4565 }, { "epoch": 2.2667549230514643, "grad_norm": 0.3292144536972046, "learning_rate": 1.7108347842070023e-06, "loss": 0.3616, "step": 4566 }, { "epoch": 2.2672513652159525, "grad_norm": 0.3152390420436859, "learning_rate": 1.7086593780899353e-06, "loss": 0.3487, "step": 4567 }, { "epoch": 2.2677478073804402, "grad_norm": 0.3099406063556671, "learning_rate": 1.7064850708620457e-06, "loss": 0.3174, "step": 4568 }, { "epoch": 2.268244249544928, "grad_norm": 0.33168962597846985, "learning_rate": 1.704311863249281e-06, "loss": 0.3418, "step": 4569 }, { "epoch": 2.2687406917094157, "grad_norm": 0.376756489276886, "learning_rate": 1.7021397559772118e-06, "loss": 0.3748, "step": 4570 }, { "epoch": 2.2692371338739035, "grad_norm": 0.32767820358276367, "learning_rate": 1.6999687497710472e-06, "loss": 0.3107, "step": 4571 }, { "epoch": 2.2697335760383917, "grad_norm": 0.3297105133533478, "learning_rate": 1.697798845355627e-06, "loss": 0.3418, "step": 4572 }, { "epoch": 2.2702300182028794, "grad_norm": 0.3110722005367279, "learning_rate": 1.6956300434554256e-06, "loss": 0.3237, "step": 4573 }, { "epoch": 2.270726460367367, "grad_norm": 0.34384116530418396, "learning_rate": 1.6934623447945431e-06, "loss": 0.3556, "step": 4574 }, { "epoch": 2.271222902531855, "grad_norm": 0.3292185962200165, "learning_rate": 1.6912957500967164e-06, "loss": 0.3246, "step": 4575 }, { "epoch": 2.271719344696343, "grad_norm": 0.33134564757347107, "learning_rate": 1.6891302600853137e-06, "loss": 0.323, "step": 4576 }, { "epoch": 2.272215786860831, "grad_norm": 0.33794763684272766, "learning_rate": 1.6869658754833323e-06, "loss": 0.3438, "step": 4577 }, { "epoch": 2.2727122290253186, "grad_norm": 0.33051931858062744, "learning_rate": 1.684802597013404e-06, "loss": 0.3049, "step": 4578 }, { "epoch": 2.2732086711898063, "grad_norm": 0.33094996213912964, "learning_rate": 1.682640425397783e-06, "loss": 0.3167, "step": 4579 }, { "epoch": 2.273705113354294, "grad_norm": 0.30571579933166504, "learning_rate": 1.6804793613583663e-06, "loss": 0.2961, "step": 4580 }, { "epoch": 2.2742015555187822, "grad_norm": 0.32236775755882263, "learning_rate": 1.6783194056166697e-06, "loss": 0.3213, "step": 4581 }, { "epoch": 2.27469799768327, "grad_norm": 0.335134357213974, "learning_rate": 1.676160558893845e-06, "loss": 0.3361, "step": 4582 }, { "epoch": 2.2751944398477577, "grad_norm": 0.31865811347961426, "learning_rate": 1.674002821910673e-06, "loss": 0.3914, "step": 4583 }, { "epoch": 2.2756908820122455, "grad_norm": 0.33366134762763977, "learning_rate": 1.671846195387563e-06, "loss": 0.3421, "step": 4584 }, { "epoch": 2.2761873241767336, "grad_norm": 0.37917065620422363, "learning_rate": 1.6696906800445562e-06, "loss": 0.3568, "step": 4585 }, { "epoch": 2.2766837663412214, "grad_norm": 0.32041501998901367, "learning_rate": 1.6675362766013148e-06, "loss": 0.3223, "step": 4586 }, { "epoch": 2.277180208505709, "grad_norm": 0.3101111352443695, "learning_rate": 1.6653829857771432e-06, "loss": 0.313, "step": 4587 }, { "epoch": 2.277676650670197, "grad_norm": 0.32882821559906006, "learning_rate": 1.6632308082909604e-06, "loss": 0.3429, "step": 4588 }, { "epoch": 2.2781730928346846, "grad_norm": 0.3272518217563629, "learning_rate": 1.6610797448613225e-06, "loss": 0.3288, "step": 4589 }, { "epoch": 2.278669534999173, "grad_norm": 0.3311448395252228, "learning_rate": 1.6589297962064111e-06, "loss": 0.3232, "step": 4590 }, { "epoch": 2.2791659771636605, "grad_norm": 0.3284163773059845, "learning_rate": 1.6567809630440356e-06, "loss": 0.3147, "step": 4591 }, { "epoch": 2.2796624193281483, "grad_norm": 0.3004896640777588, "learning_rate": 1.6546332460916347e-06, "loss": 0.3165, "step": 4592 }, { "epoch": 2.280158861492636, "grad_norm": 0.33021438121795654, "learning_rate": 1.6524866460662686e-06, "loss": 0.3432, "step": 4593 }, { "epoch": 2.2806553036571238, "grad_norm": 0.3275982439517975, "learning_rate": 1.6503411636846318e-06, "loss": 0.3587, "step": 4594 }, { "epoch": 2.281151745821612, "grad_norm": 0.3095559775829315, "learning_rate": 1.648196799663041e-06, "loss": 0.2915, "step": 4595 }, { "epoch": 2.2816481879860997, "grad_norm": 0.3368721902370453, "learning_rate": 1.646053554717444e-06, "loss": 0.3814, "step": 4596 }, { "epoch": 2.2821446301505874, "grad_norm": 0.33760085701942444, "learning_rate": 1.6439114295634068e-06, "loss": 0.3271, "step": 4597 }, { "epoch": 2.282641072315075, "grad_norm": 0.3254109025001526, "learning_rate": 1.6417704249161326e-06, "loss": 0.3162, "step": 4598 }, { "epoch": 2.283137514479563, "grad_norm": 0.32436037063598633, "learning_rate": 1.63963054149044e-06, "loss": 0.3673, "step": 4599 }, { "epoch": 2.283633956644051, "grad_norm": 0.33620673418045044, "learning_rate": 1.6374917800007806e-06, "loss": 0.3359, "step": 4600 }, { "epoch": 2.284130398808539, "grad_norm": 0.30896520614624023, "learning_rate": 1.6353541411612272e-06, "loss": 0.3418, "step": 4601 }, { "epoch": 2.2846268409730266, "grad_norm": 0.34264862537384033, "learning_rate": 1.6332176256854809e-06, "loss": 0.3432, "step": 4602 }, { "epoch": 2.2851232831375143, "grad_norm": 0.3310759663581848, "learning_rate": 1.6310822342868664e-06, "loss": 0.3849, "step": 4603 }, { "epoch": 2.285619725302002, "grad_norm": 0.34250661730766296, "learning_rate": 1.6289479676783305e-06, "loss": 0.2789, "step": 4604 }, { "epoch": 2.2861161674664903, "grad_norm": 0.33619266748428345, "learning_rate": 1.6268148265724476e-06, "loss": 0.3599, "step": 4605 }, { "epoch": 2.286612609630978, "grad_norm": 0.32861045002937317, "learning_rate": 1.624682811681416e-06, "loss": 0.3222, "step": 4606 }, { "epoch": 2.2871090517954658, "grad_norm": 0.3244905173778534, "learning_rate": 1.6225519237170578e-06, "loss": 0.3165, "step": 4607 }, { "epoch": 2.2876054939599535, "grad_norm": 0.3184097707271576, "learning_rate": 1.6204221633908202e-06, "loss": 0.3351, "step": 4608 }, { "epoch": 2.2881019361244417, "grad_norm": 0.3312532603740692, "learning_rate": 1.6182935314137665e-06, "loss": 0.3483, "step": 4609 }, { "epoch": 2.2885983782889294, "grad_norm": 0.28261521458625793, "learning_rate": 1.6161660284965969e-06, "loss": 0.2686, "step": 4610 }, { "epoch": 2.289094820453417, "grad_norm": 0.3289376497268677, "learning_rate": 1.6140396553496208e-06, "loss": 0.3459, "step": 4611 }, { "epoch": 2.289591262617905, "grad_norm": 0.3009338676929474, "learning_rate": 1.6119144126827784e-06, "loss": 0.3291, "step": 4612 }, { "epoch": 2.290087704782393, "grad_norm": 0.3150816857814789, "learning_rate": 1.609790301205631e-06, "loss": 0.3401, "step": 4613 }, { "epoch": 2.290584146946881, "grad_norm": 0.3091810643672943, "learning_rate": 1.607667321627361e-06, "loss": 0.3533, "step": 4614 }, { "epoch": 2.2910805891113686, "grad_norm": 0.31515100598335266, "learning_rate": 1.605545474656775e-06, "loss": 0.3155, "step": 4615 }, { "epoch": 2.2915770312758563, "grad_norm": 0.32878416776657104, "learning_rate": 1.6034247610022962e-06, "loss": 0.3253, "step": 4616 }, { "epoch": 2.292073473440344, "grad_norm": 0.32475244998931885, "learning_rate": 1.6013051813719788e-06, "loss": 0.3112, "step": 4617 }, { "epoch": 2.2925699156048323, "grad_norm": 0.3232291340827942, "learning_rate": 1.5991867364734887e-06, "loss": 0.3179, "step": 4618 }, { "epoch": 2.29306635776932, "grad_norm": 0.3546046018600464, "learning_rate": 1.5970694270141197e-06, "loss": 0.3747, "step": 4619 }, { "epoch": 2.2935627999338077, "grad_norm": 0.3046615421772003, "learning_rate": 1.5949532537007795e-06, "loss": 0.2789, "step": 4620 }, { "epoch": 2.2940592420982955, "grad_norm": 0.3210830092430115, "learning_rate": 1.5928382172400064e-06, "loss": 0.311, "step": 4621 }, { "epoch": 2.2945556842627832, "grad_norm": 0.3282589018344879, "learning_rate": 1.59072431833795e-06, "loss": 0.3258, "step": 4622 }, { "epoch": 2.2950521264272714, "grad_norm": 0.3391930162906647, "learning_rate": 1.5886115577003847e-06, "loss": 0.3348, "step": 4623 }, { "epoch": 2.295548568591759, "grad_norm": 0.3088192641735077, "learning_rate": 1.5864999360327039e-06, "loss": 0.3235, "step": 4624 }, { "epoch": 2.296045010756247, "grad_norm": 0.333566278219223, "learning_rate": 1.5843894540399201e-06, "loss": 0.3336, "step": 4625 }, { "epoch": 2.2965414529207346, "grad_norm": 0.3258605897426605, "learning_rate": 1.582280112426669e-06, "loss": 0.3316, "step": 4626 }, { "epoch": 2.2970378950852224, "grad_norm": 0.3082706928253174, "learning_rate": 1.580171911897196e-06, "loss": 0.2684, "step": 4627 }, { "epoch": 2.2975343372497106, "grad_norm": 0.34007665514945984, "learning_rate": 1.5780648531553794e-06, "loss": 0.3757, "step": 4628 }, { "epoch": 2.2980307794141983, "grad_norm": 0.32133734226226807, "learning_rate": 1.5759589369047035e-06, "loss": 0.3304, "step": 4629 }, { "epoch": 2.298527221578686, "grad_norm": 0.32424068450927734, "learning_rate": 1.573854163848278e-06, "loss": 0.3057, "step": 4630 }, { "epoch": 2.299023663743174, "grad_norm": 0.3111746907234192, "learning_rate": 1.5717505346888301e-06, "loss": 0.3046, "step": 4631 }, { "epoch": 2.2995201059076615, "grad_norm": 0.32022687792778015, "learning_rate": 1.5696480501287037e-06, "loss": 0.3505, "step": 4632 }, { "epoch": 2.3000165480721497, "grad_norm": 0.3231147527694702, "learning_rate": 1.567546710869864e-06, "loss": 0.3165, "step": 4633 }, { "epoch": 2.3005129902366375, "grad_norm": 0.3333238363265991, "learning_rate": 1.565446517613886e-06, "loss": 0.3652, "step": 4634 }, { "epoch": 2.301009432401125, "grad_norm": 0.30005303025245667, "learning_rate": 1.56334747106197e-06, "loss": 0.291, "step": 4635 }, { "epoch": 2.301505874565613, "grad_norm": 0.3490307033061981, "learning_rate": 1.5612495719149306e-06, "loss": 0.3535, "step": 4636 }, { "epoch": 2.3020023167301007, "grad_norm": 0.33394455909729004, "learning_rate": 1.5591528208731993e-06, "loss": 0.312, "step": 4637 }, { "epoch": 2.302498758894589, "grad_norm": 0.3438171446323395, "learning_rate": 1.5570572186368255e-06, "loss": 0.3515, "step": 4638 }, { "epoch": 2.3029952010590766, "grad_norm": 0.3098241090774536, "learning_rate": 1.554962765905469e-06, "loss": 0.2977, "step": 4639 }, { "epoch": 2.3034916432235644, "grad_norm": 0.34919095039367676, "learning_rate": 1.5528694633784175e-06, "loss": 0.3531, "step": 4640 }, { "epoch": 2.303988085388052, "grad_norm": 0.33794164657592773, "learning_rate": 1.5507773117545628e-06, "loss": 0.3271, "step": 4641 }, { "epoch": 2.3044845275525403, "grad_norm": 0.3480783700942993, "learning_rate": 1.5486863117324185e-06, "loss": 0.3527, "step": 4642 }, { "epoch": 2.304980969717028, "grad_norm": 0.30468425154685974, "learning_rate": 1.5465964640101134e-06, "loss": 0.3068, "step": 4643 }, { "epoch": 2.305477411881516, "grad_norm": 0.3655818700790405, "learning_rate": 1.5445077692853926e-06, "loss": 0.3454, "step": 4644 }, { "epoch": 2.3059738540460035, "grad_norm": 0.3479599952697754, "learning_rate": 1.5424202282556106e-06, "loss": 0.3773, "step": 4645 }, { "epoch": 2.3064702962104917, "grad_norm": 0.32364290952682495, "learning_rate": 1.5403338416177428e-06, "loss": 0.3175, "step": 4646 }, { "epoch": 2.3069667383749795, "grad_norm": 0.3304896950721741, "learning_rate": 1.5382486100683768e-06, "loss": 0.3351, "step": 4647 }, { "epoch": 2.307463180539467, "grad_norm": 0.33942297101020813, "learning_rate": 1.5361645343037146e-06, "loss": 0.3187, "step": 4648 }, { "epoch": 2.307959622703955, "grad_norm": 0.3177892863750458, "learning_rate": 1.5340816150195743e-06, "loss": 0.3185, "step": 4649 }, { "epoch": 2.3084560648684427, "grad_norm": 0.30221593379974365, "learning_rate": 1.5319998529113812e-06, "loss": 0.3284, "step": 4650 }, { "epoch": 2.308952507032931, "grad_norm": 0.33889374136924744, "learning_rate": 1.5299192486741848e-06, "loss": 0.324, "step": 4651 }, { "epoch": 2.3094489491974186, "grad_norm": 0.31133273243904114, "learning_rate": 1.5278398030026386e-06, "loss": 0.3549, "step": 4652 }, { "epoch": 2.3099453913619064, "grad_norm": 0.33562207221984863, "learning_rate": 1.5257615165910139e-06, "loss": 0.3946, "step": 4653 }, { "epoch": 2.310441833526394, "grad_norm": 0.3040272891521454, "learning_rate": 1.5236843901331943e-06, "loss": 0.3594, "step": 4654 }, { "epoch": 2.310938275690882, "grad_norm": 0.3320101797580719, "learning_rate": 1.521608424322676e-06, "loss": 0.3323, "step": 4655 }, { "epoch": 2.31143471785537, "grad_norm": 0.32817238569259644, "learning_rate": 1.519533619852569e-06, "loss": 0.364, "step": 4656 }, { "epoch": 2.3119311600198578, "grad_norm": 0.2885287404060364, "learning_rate": 1.517459977415589e-06, "loss": 0.2775, "step": 4657 }, { "epoch": 2.3124276021843455, "grad_norm": 0.34093764424324036, "learning_rate": 1.5153874977040756e-06, "loss": 0.3601, "step": 4658 }, { "epoch": 2.3129240443488333, "grad_norm": 0.3202614188194275, "learning_rate": 1.5133161814099683e-06, "loss": 0.3484, "step": 4659 }, { "epoch": 2.313420486513321, "grad_norm": 0.3135617971420288, "learning_rate": 1.511246029224826e-06, "loss": 0.3489, "step": 4660 }, { "epoch": 2.313916928677809, "grad_norm": 0.30008643865585327, "learning_rate": 1.5091770418398149e-06, "loss": 0.3452, "step": 4661 }, { "epoch": 2.314413370842297, "grad_norm": 0.31608712673187256, "learning_rate": 1.5071092199457144e-06, "loss": 0.3562, "step": 4662 }, { "epoch": 2.3149098130067847, "grad_norm": 0.33253809809684753, "learning_rate": 1.5050425642329152e-06, "loss": 0.3631, "step": 4663 }, { "epoch": 2.3154062551712724, "grad_norm": 0.3190091550350189, "learning_rate": 1.5029770753914148e-06, "loss": 0.3244, "step": 4664 }, { "epoch": 2.31590269733576, "grad_norm": 0.34498676657676697, "learning_rate": 1.5009127541108247e-06, "loss": 0.364, "step": 4665 }, { "epoch": 2.3163991395002483, "grad_norm": 0.28748518228530884, "learning_rate": 1.4988496010803667e-06, "loss": 0.2667, "step": 4666 }, { "epoch": 2.316895581664736, "grad_norm": 0.342769593000412, "learning_rate": 1.4967876169888724e-06, "loss": 0.3787, "step": 4667 }, { "epoch": 2.317392023829224, "grad_norm": 0.33429154753685, "learning_rate": 1.4947268025247774e-06, "loss": 0.3145, "step": 4668 }, { "epoch": 2.3178884659937116, "grad_norm": 0.3125181794166565, "learning_rate": 1.4926671583761381e-06, "loss": 0.3506, "step": 4669 }, { "epoch": 2.3183849081581998, "grad_norm": 0.3321564495563507, "learning_rate": 1.490608685230609e-06, "loss": 0.403, "step": 4670 }, { "epoch": 2.3188813503226875, "grad_norm": 0.314718097448349, "learning_rate": 1.48855138377546e-06, "loss": 0.2993, "step": 4671 }, { "epoch": 2.3193777924871752, "grad_norm": 0.3244549632072449, "learning_rate": 1.486495254697568e-06, "loss": 0.2879, "step": 4672 }, { "epoch": 2.319874234651663, "grad_norm": 0.3280952274799347, "learning_rate": 1.4844402986834188e-06, "loss": 0.3254, "step": 4673 }, { "epoch": 2.320370676816151, "grad_norm": 0.33056676387786865, "learning_rate": 1.4823865164191077e-06, "loss": 0.378, "step": 4674 }, { "epoch": 2.320867118980639, "grad_norm": 0.30930668115615845, "learning_rate": 1.480333908590334e-06, "loss": 0.3585, "step": 4675 }, { "epoch": 2.3213635611451267, "grad_norm": 0.30333560705184937, "learning_rate": 1.4782824758824088e-06, "loss": 0.3351, "step": 4676 }, { "epoch": 2.3218600033096144, "grad_norm": 0.318502277135849, "learning_rate": 1.4762322189802502e-06, "loss": 0.2969, "step": 4677 }, { "epoch": 2.322356445474102, "grad_norm": 0.35850757360458374, "learning_rate": 1.4741831385683824e-06, "loss": 0.3868, "step": 4678 }, { "epoch": 2.3228528876385903, "grad_norm": 0.3077169954776764, "learning_rate": 1.4721352353309403e-06, "loss": 0.3059, "step": 4679 }, { "epoch": 2.323349329803078, "grad_norm": 0.2985019087791443, "learning_rate": 1.4700885099516577e-06, "loss": 0.3261, "step": 4680 }, { "epoch": 2.323845771967566, "grad_norm": 0.31966039538383484, "learning_rate": 1.468042963113887e-06, "loss": 0.3493, "step": 4681 }, { "epoch": 2.3243422141320536, "grad_norm": 0.31327521800994873, "learning_rate": 1.4659985955005767e-06, "loss": 0.3564, "step": 4682 }, { "epoch": 2.3248386562965413, "grad_norm": 0.2962530255317688, "learning_rate": 1.4639554077942859e-06, "loss": 0.3147, "step": 4683 }, { "epoch": 2.3253350984610295, "grad_norm": 0.31145602464675903, "learning_rate": 1.4619134006771802e-06, "loss": 0.3039, "step": 4684 }, { "epoch": 2.3258315406255172, "grad_norm": 0.3266931176185608, "learning_rate": 1.4598725748310304e-06, "loss": 0.3567, "step": 4685 }, { "epoch": 2.326327982790005, "grad_norm": 0.31423017382621765, "learning_rate": 1.4578329309372136e-06, "loss": 0.3324, "step": 4686 }, { "epoch": 2.3268244249544927, "grad_norm": 0.3518998622894287, "learning_rate": 1.4557944696767078e-06, "loss": 0.3414, "step": 4687 }, { "epoch": 2.3273208671189805, "grad_norm": 0.34168192744255066, "learning_rate": 1.4537571917301051e-06, "loss": 0.3391, "step": 4688 }, { "epoch": 2.3278173092834686, "grad_norm": 0.3100300133228302, "learning_rate": 1.4517210977775936e-06, "loss": 0.3032, "step": 4689 }, { "epoch": 2.3283137514479564, "grad_norm": 0.30516356229782104, "learning_rate": 1.4496861884989716e-06, "loss": 0.3161, "step": 4690 }, { "epoch": 2.328810193612444, "grad_norm": 0.34038451313972473, "learning_rate": 1.4476524645736362e-06, "loss": 0.3349, "step": 4691 }, { "epoch": 2.329306635776932, "grad_norm": 0.31677931547164917, "learning_rate": 1.4456199266805986e-06, "loss": 0.3074, "step": 4692 }, { "epoch": 2.3298030779414196, "grad_norm": 0.33055707812309265, "learning_rate": 1.443588575498463e-06, "loss": 0.3713, "step": 4693 }, { "epoch": 2.330299520105908, "grad_norm": 0.3309830129146576, "learning_rate": 1.4415584117054443e-06, "loss": 0.3584, "step": 4694 }, { "epoch": 2.3307959622703955, "grad_norm": 0.31794849038124084, "learning_rate": 1.4395294359793589e-06, "loss": 0.3017, "step": 4695 }, { "epoch": 2.3312924044348833, "grad_norm": 0.3348728120326996, "learning_rate": 1.4375016489976268e-06, "loss": 0.3305, "step": 4696 }, { "epoch": 2.331788846599371, "grad_norm": 0.3375498950481415, "learning_rate": 1.4354750514372717e-06, "loss": 0.327, "step": 4697 }, { "epoch": 2.3322852887638588, "grad_norm": 0.3131754696369171, "learning_rate": 1.4334496439749157e-06, "loss": 0.3283, "step": 4698 }, { "epoch": 2.332781730928347, "grad_norm": 0.29236334562301636, "learning_rate": 1.4314254272867933e-06, "loss": 0.3148, "step": 4699 }, { "epoch": 2.3332781730928347, "grad_norm": 0.31617456674575806, "learning_rate": 1.4294024020487307e-06, "loss": 0.3497, "step": 4700 }, { "epoch": 2.3337746152573224, "grad_norm": 0.3115748465061188, "learning_rate": 1.4273805689361625e-06, "loss": 0.3352, "step": 4701 }, { "epoch": 2.33427105742181, "grad_norm": 0.3106376826763153, "learning_rate": 1.4253599286241242e-06, "loss": 0.2923, "step": 4702 }, { "epoch": 2.3347674995862984, "grad_norm": 0.32074829936027527, "learning_rate": 1.423340481787252e-06, "loss": 0.2992, "step": 4703 }, { "epoch": 2.335263941750786, "grad_norm": 0.3581116497516632, "learning_rate": 1.4213222290997863e-06, "loss": 0.3338, "step": 4704 }, { "epoch": 2.335760383915274, "grad_norm": 0.3329753577709198, "learning_rate": 1.4193051712355638e-06, "loss": 0.3036, "step": 4705 }, { "epoch": 2.3362568260797616, "grad_norm": 0.3287467658519745, "learning_rate": 1.4172893088680268e-06, "loss": 0.321, "step": 4706 }, { "epoch": 2.33675326824425, "grad_norm": 0.3203154504299164, "learning_rate": 1.4152746426702169e-06, "loss": 0.3427, "step": 4707 }, { "epoch": 2.3372497104087375, "grad_norm": 0.3360244333744049, "learning_rate": 1.4132611733147767e-06, "loss": 0.3129, "step": 4708 }, { "epoch": 2.3377461525732253, "grad_norm": 0.34754499793052673, "learning_rate": 1.4112489014739477e-06, "loss": 0.3081, "step": 4709 }, { "epoch": 2.338242594737713, "grad_norm": 0.31784719228744507, "learning_rate": 1.4092378278195746e-06, "loss": 0.3371, "step": 4710 }, { "epoch": 2.3387390369022008, "grad_norm": 0.3207119405269623, "learning_rate": 1.4072279530231004e-06, "loss": 0.3669, "step": 4711 }, { "epoch": 2.339235479066689, "grad_norm": 0.3248942792415619, "learning_rate": 1.4052192777555645e-06, "loss": 0.2907, "step": 4712 }, { "epoch": 2.3397319212311767, "grad_norm": 0.320049911737442, "learning_rate": 1.4032118026876118e-06, "loss": 0.3551, "step": 4713 }, { "epoch": 2.3402283633956644, "grad_norm": 0.3222912549972534, "learning_rate": 1.4012055284894827e-06, "loss": 0.3402, "step": 4714 }, { "epoch": 2.340724805560152, "grad_norm": 0.3136901557445526, "learning_rate": 1.399200455831019e-06, "loss": 0.3557, "step": 4715 }, { "epoch": 2.34122124772464, "grad_norm": 0.3045031428337097, "learning_rate": 1.3971965853816577e-06, "loss": 0.3147, "step": 4716 }, { "epoch": 2.341717689889128, "grad_norm": 0.3183434009552002, "learning_rate": 1.3951939178104374e-06, "loss": 0.3228, "step": 4717 }, { "epoch": 2.342214132053616, "grad_norm": 0.3346197307109833, "learning_rate": 1.3931924537859948e-06, "loss": 0.3487, "step": 4718 }, { "epoch": 2.3427105742181036, "grad_norm": 0.3164183795452118, "learning_rate": 1.3911921939765643e-06, "loss": 0.2918, "step": 4719 }, { "epoch": 2.3432070163825913, "grad_norm": 0.3249368369579315, "learning_rate": 1.3891931390499802e-06, "loss": 0.3684, "step": 4720 }, { "epoch": 2.343703458547079, "grad_norm": 0.2924025356769562, "learning_rate": 1.3871952896736673e-06, "loss": 0.2979, "step": 4721 }, { "epoch": 2.3441999007115673, "grad_norm": 0.3249209225177765, "learning_rate": 1.38519864651466e-06, "loss": 0.3381, "step": 4722 }, { "epoch": 2.344696342876055, "grad_norm": 0.31237468123435974, "learning_rate": 1.3832032102395775e-06, "loss": 0.3028, "step": 4723 }, { "epoch": 2.3451927850405427, "grad_norm": 0.3528405725955963, "learning_rate": 1.3812089815146446e-06, "loss": 0.3512, "step": 4724 }, { "epoch": 2.3456892272050305, "grad_norm": 0.3267931044101715, "learning_rate": 1.3792159610056794e-06, "loss": 0.3062, "step": 4725 }, { "epoch": 2.3461856693695182, "grad_norm": 0.32838907837867737, "learning_rate": 1.3772241493780975e-06, "loss": 0.3322, "step": 4726 }, { "epoch": 2.3466821115340064, "grad_norm": 0.312778502702713, "learning_rate": 1.3752335472969113e-06, "loss": 0.3296, "step": 4727 }, { "epoch": 2.347178553698494, "grad_norm": 0.333787202835083, "learning_rate": 1.3732441554267257e-06, "loss": 0.3567, "step": 4728 }, { "epoch": 2.347674995862982, "grad_norm": 0.31847020983695984, "learning_rate": 1.37125597443175e-06, "loss": 0.3468, "step": 4729 }, { "epoch": 2.3481714380274696, "grad_norm": 0.3216151297092438, "learning_rate": 1.3692690049757783e-06, "loss": 0.3341, "step": 4730 }, { "epoch": 2.348667880191958, "grad_norm": 0.33531272411346436, "learning_rate": 1.3672832477222086e-06, "loss": 0.3391, "step": 4731 }, { "epoch": 2.3491643223564456, "grad_norm": 0.361990362405777, "learning_rate": 1.365298703334031e-06, "loss": 0.347, "step": 4732 }, { "epoch": 2.3496607645209333, "grad_norm": 0.326322466135025, "learning_rate": 1.3633153724738302e-06, "loss": 0.3541, "step": 4733 }, { "epoch": 2.350157206685421, "grad_norm": 0.35463836789131165, "learning_rate": 1.3613332558037883e-06, "loss": 0.3019, "step": 4734 }, { "epoch": 2.350653648849909, "grad_norm": 0.3169175982475281, "learning_rate": 1.3593523539856763e-06, "loss": 0.3148, "step": 4735 }, { "epoch": 2.351150091014397, "grad_norm": 0.34690672159194946, "learning_rate": 1.3573726676808686e-06, "loss": 0.3618, "step": 4736 }, { "epoch": 2.3516465331788847, "grad_norm": 0.3087981045246124, "learning_rate": 1.3553941975503243e-06, "loss": 0.3102, "step": 4737 }, { "epoch": 2.3521429753433725, "grad_norm": 0.3268030881881714, "learning_rate": 1.3534169442546046e-06, "loss": 0.3486, "step": 4738 }, { "epoch": 2.35263941750786, "grad_norm": 0.27782922983169556, "learning_rate": 1.3514409084538555e-06, "loss": 0.2893, "step": 4739 }, { "epoch": 2.3531358596723484, "grad_norm": 0.3243143558502197, "learning_rate": 1.3494660908078272e-06, "loss": 0.3582, "step": 4740 }, { "epoch": 2.353632301836836, "grad_norm": 0.3277122974395752, "learning_rate": 1.3474924919758542e-06, "loss": 0.3496, "step": 4741 }, { "epoch": 2.354128744001324, "grad_norm": 0.32554396986961365, "learning_rate": 1.3455201126168682e-06, "loss": 0.3067, "step": 4742 }, { "epoch": 2.3546251861658116, "grad_norm": 0.3369700014591217, "learning_rate": 1.3435489533893937e-06, "loss": 0.3509, "step": 4743 }, { "epoch": 2.3551216283302994, "grad_norm": 0.31986409425735474, "learning_rate": 1.3415790149515461e-06, "loss": 0.3504, "step": 4744 }, { "epoch": 2.3556180704947876, "grad_norm": 0.3247639536857605, "learning_rate": 1.3396102979610377e-06, "loss": 0.3282, "step": 4745 }, { "epoch": 2.3561145126592753, "grad_norm": 0.32720810174942017, "learning_rate": 1.3376428030751643e-06, "loss": 0.3362, "step": 4746 }, { "epoch": 2.356610954823763, "grad_norm": 0.3081485629081726, "learning_rate": 1.3356765309508224e-06, "loss": 0.3188, "step": 4747 }, { "epoch": 2.357107396988251, "grad_norm": 0.3305875062942505, "learning_rate": 1.3337114822444958e-06, "loss": 0.3743, "step": 4748 }, { "epoch": 2.3576038391527385, "grad_norm": 0.3428115248680115, "learning_rate": 1.3317476576122607e-06, "loss": 0.3175, "step": 4749 }, { "epoch": 2.3581002813172267, "grad_norm": 0.3283305764198303, "learning_rate": 1.3297850577097853e-06, "loss": 0.3468, "step": 4750 }, { "epoch": 2.3585967234817145, "grad_norm": 0.31073814630508423, "learning_rate": 1.3278236831923286e-06, "loss": 0.305, "step": 4751 }, { "epoch": 2.359093165646202, "grad_norm": 0.32599806785583496, "learning_rate": 1.3258635347147407e-06, "loss": 0.3241, "step": 4752 }, { "epoch": 2.35958960781069, "grad_norm": 0.30657222867012024, "learning_rate": 1.3239046129314603e-06, "loss": 0.3055, "step": 4753 }, { "epoch": 2.3600860499751777, "grad_norm": 0.31697165966033936, "learning_rate": 1.3219469184965184e-06, "loss": 0.3343, "step": 4754 }, { "epoch": 2.360582492139666, "grad_norm": 0.33515238761901855, "learning_rate": 1.3199904520635365e-06, "loss": 0.3782, "step": 4755 }, { "epoch": 2.3610789343041536, "grad_norm": 0.30027955770492554, "learning_rate": 1.3180352142857256e-06, "loss": 0.384, "step": 4756 }, { "epoch": 2.3615753764686414, "grad_norm": 0.29143619537353516, "learning_rate": 1.3160812058158883e-06, "loss": 0.3102, "step": 4757 }, { "epoch": 2.362071818633129, "grad_norm": 0.3252527713775635, "learning_rate": 1.3141284273064099e-06, "loss": 0.3367, "step": 4758 }, { "epoch": 2.362568260797617, "grad_norm": 0.31326377391815186, "learning_rate": 1.3121768794092753e-06, "loss": 0.2925, "step": 4759 }, { "epoch": 2.363064702962105, "grad_norm": 0.3209030032157898, "learning_rate": 1.3102265627760507e-06, "loss": 0.3548, "step": 4760 }, { "epoch": 2.3635611451265928, "grad_norm": 0.3177974224090576, "learning_rate": 1.3082774780578954e-06, "loss": 0.3742, "step": 4761 }, { "epoch": 2.3640575872910805, "grad_norm": 0.30619877576828003, "learning_rate": 1.306329625905552e-06, "loss": 0.3243, "step": 4762 }, { "epoch": 2.3645540294555683, "grad_norm": 0.31517595052719116, "learning_rate": 1.3043830069693607e-06, "loss": 0.3792, "step": 4763 }, { "epoch": 2.3650504716200564, "grad_norm": 0.299470990896225, "learning_rate": 1.3024376218992407e-06, "loss": 0.3273, "step": 4764 }, { "epoch": 2.365546913784544, "grad_norm": 0.32455262541770935, "learning_rate": 1.3004934713447047e-06, "loss": 0.3199, "step": 4765 }, { "epoch": 2.366043355949032, "grad_norm": 0.3229582607746124, "learning_rate": 1.2985505559548516e-06, "loss": 0.3149, "step": 4766 }, { "epoch": 2.3665397981135197, "grad_norm": 0.32530027627944946, "learning_rate": 1.296608876378368e-06, "loss": 0.3423, "step": 4767 }, { "epoch": 2.367036240278008, "grad_norm": 0.2997121810913086, "learning_rate": 1.2946684332635295e-06, "loss": 0.2806, "step": 4768 }, { "epoch": 2.3675326824424956, "grad_norm": 0.34457215666770935, "learning_rate": 1.2927292272581925e-06, "loss": 0.3115, "step": 4769 }, { "epoch": 2.3680291246069833, "grad_norm": 0.31758204102516174, "learning_rate": 1.290791259009812e-06, "loss": 0.3455, "step": 4770 }, { "epoch": 2.368525566771471, "grad_norm": 0.30909299850463867, "learning_rate": 1.2888545291654175e-06, "loss": 0.3145, "step": 4771 }, { "epoch": 2.369022008935959, "grad_norm": 0.322599321603775, "learning_rate": 1.2869190383716323e-06, "loss": 0.3429, "step": 4772 }, { "epoch": 2.369518451100447, "grad_norm": 0.32784128189086914, "learning_rate": 1.2849847872746646e-06, "loss": 0.3472, "step": 4773 }, { "epoch": 2.3700148932649348, "grad_norm": 0.30270636081695557, "learning_rate": 1.2830517765203082e-06, "loss": 0.324, "step": 4774 }, { "epoch": 2.3705113354294225, "grad_norm": 0.31967616081237793, "learning_rate": 1.281120006753943e-06, "loss": 0.3433, "step": 4775 }, { "epoch": 2.3710077775939102, "grad_norm": 0.3125044107437134, "learning_rate": 1.2791894786205322e-06, "loss": 0.3404, "step": 4776 }, { "epoch": 2.371504219758398, "grad_norm": 0.30933111906051636, "learning_rate": 1.2772601927646305e-06, "loss": 0.3341, "step": 4777 }, { "epoch": 2.372000661922886, "grad_norm": 0.32937878370285034, "learning_rate": 1.2753321498303711e-06, "loss": 0.3182, "step": 4778 }, { "epoch": 2.372497104087374, "grad_norm": 0.33191388845443726, "learning_rate": 1.2734053504614757e-06, "loss": 0.3491, "step": 4779 }, { "epoch": 2.3729935462518617, "grad_norm": 0.34053394198417664, "learning_rate": 1.271479795301251e-06, "loss": 0.343, "step": 4780 }, { "epoch": 2.3734899884163494, "grad_norm": 0.34091416001319885, "learning_rate": 1.2695554849925862e-06, "loss": 0.3078, "step": 4781 }, { "epoch": 2.373986430580837, "grad_norm": 0.34031787514686584, "learning_rate": 1.2676324201779593e-06, "loss": 0.3596, "step": 4782 }, { "epoch": 2.3744828727453253, "grad_norm": 0.30544036626815796, "learning_rate": 1.265710601499426e-06, "loss": 0.3122, "step": 4783 }, { "epoch": 2.374979314909813, "grad_norm": 0.325436532497406, "learning_rate": 1.2637900295986293e-06, "loss": 0.3703, "step": 4784 }, { "epoch": 2.375475757074301, "grad_norm": 0.3185451626777649, "learning_rate": 1.2618707051167983e-06, "loss": 0.309, "step": 4785 }, { "epoch": 2.3759721992387886, "grad_norm": 0.324036180973053, "learning_rate": 1.2599526286947427e-06, "loss": 0.3325, "step": 4786 }, { "epoch": 2.3764686414032763, "grad_norm": 0.3200422525405884, "learning_rate": 1.258035800972855e-06, "loss": 0.3108, "step": 4787 }, { "epoch": 2.3769650835677645, "grad_norm": 0.3463287353515625, "learning_rate": 1.2561202225911117e-06, "loss": 0.3174, "step": 4788 }, { "epoch": 2.3774615257322522, "grad_norm": 0.3342285454273224, "learning_rate": 1.2542058941890734e-06, "loss": 0.3284, "step": 4789 }, { "epoch": 2.37795796789674, "grad_norm": 0.3052111268043518, "learning_rate": 1.2522928164058817e-06, "loss": 0.331, "step": 4790 }, { "epoch": 2.3784544100612277, "grad_norm": 0.33773568272590637, "learning_rate": 1.2503809898802615e-06, "loss": 0.3621, "step": 4791 }, { "epoch": 2.378950852225716, "grad_norm": 0.2975723147392273, "learning_rate": 1.2484704152505205e-06, "loss": 0.3372, "step": 4792 }, { "epoch": 2.3794472943902036, "grad_norm": 0.3422873616218567, "learning_rate": 1.246561093154548e-06, "loss": 0.3168, "step": 4793 }, { "epoch": 2.3799437365546914, "grad_norm": 0.32512715458869934, "learning_rate": 1.2446530242298117e-06, "loss": 0.3057, "step": 4794 }, { "epoch": 2.380440178719179, "grad_norm": 0.3196203112602234, "learning_rate": 1.2427462091133662e-06, "loss": 0.3336, "step": 4795 }, { "epoch": 2.380936620883667, "grad_norm": 0.32864150404930115, "learning_rate": 1.2408406484418455e-06, "loss": 0.3264, "step": 4796 }, { "epoch": 2.381433063048155, "grad_norm": 0.3382572829723358, "learning_rate": 1.2389363428514634e-06, "loss": 0.3499, "step": 4797 }, { "epoch": 2.381929505212643, "grad_norm": 0.3126155436038971, "learning_rate": 1.2370332929780182e-06, "loss": 0.3067, "step": 4798 }, { "epoch": 2.3824259473771305, "grad_norm": 0.31482961773872375, "learning_rate": 1.235131499456882e-06, "loss": 0.3491, "step": 4799 }, { "epoch": 2.3829223895416183, "grad_norm": 0.314344197511673, "learning_rate": 1.233230962923017e-06, "loss": 0.3191, "step": 4800 }, { "epoch": 2.3834188317061065, "grad_norm": 0.34116601943969727, "learning_rate": 1.2313316840109573e-06, "loss": 0.3551, "step": 4801 }, { "epoch": 2.383915273870594, "grad_norm": 0.32531997561454773, "learning_rate": 1.2294336633548215e-06, "loss": 0.3448, "step": 4802 }, { "epoch": 2.384411716035082, "grad_norm": 0.3075607717037201, "learning_rate": 1.227536901588307e-06, "loss": 0.2801, "step": 4803 }, { "epoch": 2.3849081581995697, "grad_norm": 0.3279825448989868, "learning_rate": 1.2256413993446915e-06, "loss": 0.3561, "step": 4804 }, { "epoch": 2.3854046003640574, "grad_norm": 0.3164108991622925, "learning_rate": 1.2237471572568328e-06, "loss": 0.3342, "step": 4805 }, { "epoch": 2.3859010425285456, "grad_norm": 0.3223809599876404, "learning_rate": 1.2218541759571623e-06, "loss": 0.3765, "step": 4806 }, { "epoch": 2.3863974846930334, "grad_norm": 0.3081531226634979, "learning_rate": 1.2199624560777006e-06, "loss": 0.301, "step": 4807 }, { "epoch": 2.386893926857521, "grad_norm": 0.3226205110549927, "learning_rate": 1.2180719982500383e-06, "loss": 0.3368, "step": 4808 }, { "epoch": 2.387390369022009, "grad_norm": 0.31979066133499146, "learning_rate": 1.2161828031053502e-06, "loss": 0.3497, "step": 4809 }, { "epoch": 2.3878868111864966, "grad_norm": 0.32773932814598083, "learning_rate": 1.2142948712743824e-06, "loss": 0.3226, "step": 4810 }, { "epoch": 2.388383253350985, "grad_norm": 0.342678040266037, "learning_rate": 1.2124082033874706e-06, "loss": 0.3447, "step": 4811 }, { "epoch": 2.3888796955154725, "grad_norm": 0.31865450739860535, "learning_rate": 1.2105228000745173e-06, "loss": 0.3099, "step": 4812 }, { "epoch": 2.3893761376799603, "grad_norm": 0.31018081307411194, "learning_rate": 1.208638661965008e-06, "loss": 0.3686, "step": 4813 }, { "epoch": 2.389872579844448, "grad_norm": 0.2867971658706665, "learning_rate": 1.2067557896880066e-06, "loss": 0.325, "step": 4814 }, { "epoch": 2.3903690220089358, "grad_norm": 0.34648358821868896, "learning_rate": 1.2048741838721523e-06, "loss": 0.3785, "step": 4815 }, { "epoch": 2.390865464173424, "grad_norm": 0.32181087136268616, "learning_rate": 1.2029938451456636e-06, "loss": 0.2892, "step": 4816 }, { "epoch": 2.3913619063379117, "grad_norm": 0.3450469970703125, "learning_rate": 1.20111477413633e-06, "loss": 0.365, "step": 4817 }, { "epoch": 2.3918583485023994, "grad_norm": 0.2980102598667145, "learning_rate": 1.1992369714715285e-06, "loss": 0.3176, "step": 4818 }, { "epoch": 2.392354790666887, "grad_norm": 0.295429527759552, "learning_rate": 1.1973604377782017e-06, "loss": 0.3028, "step": 4819 }, { "epoch": 2.392851232831375, "grad_norm": 0.34278449416160583, "learning_rate": 1.195485173682875e-06, "loss": 0.3603, "step": 4820 }, { "epoch": 2.393347674995863, "grad_norm": 0.3045828938484192, "learning_rate": 1.1936111798116474e-06, "loss": 0.3409, "step": 4821 }, { "epoch": 2.393844117160351, "grad_norm": 0.3105868399143219, "learning_rate": 1.1917384567901946e-06, "loss": 0.333, "step": 4822 }, { "epoch": 2.3943405593248386, "grad_norm": 0.3112289309501648, "learning_rate": 1.1898670052437705e-06, "loss": 0.3021, "step": 4823 }, { "epoch": 2.3948370014893263, "grad_norm": 0.30040442943573, "learning_rate": 1.1879968257971979e-06, "loss": 0.3169, "step": 4824 }, { "epoch": 2.3953334436538145, "grad_norm": 0.3349066972732544, "learning_rate": 1.1861279190748804e-06, "loss": 0.3537, "step": 4825 }, { "epoch": 2.3958298858183023, "grad_norm": 0.3350982964038849, "learning_rate": 1.1842602857007957e-06, "loss": 0.3101, "step": 4826 }, { "epoch": 2.39632632798279, "grad_norm": 0.3089805543422699, "learning_rate": 1.1823939262984958e-06, "loss": 0.3304, "step": 4827 }, { "epoch": 2.3968227701472777, "grad_norm": 0.3230282962322235, "learning_rate": 1.180528841491108e-06, "loss": 0.3479, "step": 4828 }, { "epoch": 2.397319212311766, "grad_norm": 0.3196326792240143, "learning_rate": 1.1786650319013298e-06, "loss": 0.316, "step": 4829 }, { "epoch": 2.3978156544762537, "grad_norm": 0.3110717535018921, "learning_rate": 1.1768024981514426e-06, "loss": 0.3398, "step": 4830 }, { "epoch": 2.3983120966407414, "grad_norm": 0.36497244238853455, "learning_rate": 1.174941240863291e-06, "loss": 0.3383, "step": 4831 }, { "epoch": 2.398808538805229, "grad_norm": 0.30876874923706055, "learning_rate": 1.1730812606582996e-06, "loss": 0.3194, "step": 4832 }, { "epoch": 2.399304980969717, "grad_norm": 0.31386592984199524, "learning_rate": 1.1712225581574655e-06, "loss": 0.299, "step": 4833 }, { "epoch": 2.399801423134205, "grad_norm": 0.3392130434513092, "learning_rate": 1.16936513398136e-06, "loss": 0.348, "step": 4834 }, { "epoch": 2.400297865298693, "grad_norm": 0.30024203658103943, "learning_rate": 1.167508988750124e-06, "loss": 0.2926, "step": 4835 }, { "epoch": 2.4007943074631806, "grad_norm": 0.36618030071258545, "learning_rate": 1.1656541230834756e-06, "loss": 0.3327, "step": 4836 }, { "epoch": 2.4012907496276683, "grad_norm": 0.36459460854530334, "learning_rate": 1.1638005376007034e-06, "loss": 0.3728, "step": 4837 }, { "epoch": 2.401787191792156, "grad_norm": 0.2973106801509857, "learning_rate": 1.1619482329206694e-06, "loss": 0.3142, "step": 4838 }, { "epoch": 2.4022836339566442, "grad_norm": 0.32004860043525696, "learning_rate": 1.1600972096618102e-06, "loss": 0.304, "step": 4839 }, { "epoch": 2.402780076121132, "grad_norm": 0.30898517370224, "learning_rate": 1.1582474684421262e-06, "loss": 0.3634, "step": 4840 }, { "epoch": 2.4032765182856197, "grad_norm": 0.33649516105651855, "learning_rate": 1.1563990098792028e-06, "loss": 0.3786, "step": 4841 }, { "epoch": 2.4037729604501075, "grad_norm": 0.3016398549079895, "learning_rate": 1.1545518345901851e-06, "loss": 0.3219, "step": 4842 }, { "epoch": 2.404269402614595, "grad_norm": 0.291343629360199, "learning_rate": 1.1527059431917965e-06, "loss": 0.2998, "step": 4843 }, { "epoch": 2.4047658447790834, "grad_norm": 0.35051363706588745, "learning_rate": 1.1508613363003295e-06, "loss": 0.3278, "step": 4844 }, { "epoch": 2.405262286943571, "grad_norm": 0.297861784696579, "learning_rate": 1.1490180145316487e-06, "loss": 0.3505, "step": 4845 }, { "epoch": 2.405758729108059, "grad_norm": 0.3073481321334839, "learning_rate": 1.1471759785011903e-06, "loss": 0.3111, "step": 4846 }, { "epoch": 2.4062551712725466, "grad_norm": 0.3461441993713379, "learning_rate": 1.1453352288239561e-06, "loss": 0.3026, "step": 4847 }, { "epoch": 2.4067516134370344, "grad_norm": 0.3353833854198456, "learning_rate": 1.143495766114528e-06, "loss": 0.3408, "step": 4848 }, { "epoch": 2.4072480556015226, "grad_norm": 0.3265356421470642, "learning_rate": 1.141657590987048e-06, "loss": 0.2959, "step": 4849 }, { "epoch": 2.4077444977660103, "grad_norm": 0.3514552414417267, "learning_rate": 1.1398207040552344e-06, "loss": 0.3448, "step": 4850 }, { "epoch": 2.408240939930498, "grad_norm": 0.30589067935943604, "learning_rate": 1.1379851059323739e-06, "loss": 0.2876, "step": 4851 }, { "epoch": 2.408737382094986, "grad_norm": 0.32607290148735046, "learning_rate": 1.1361507972313223e-06, "loss": 0.3255, "step": 4852 }, { "epoch": 2.4092338242594735, "grad_norm": 0.29918816685676575, "learning_rate": 1.1343177785645083e-06, "loss": 0.3396, "step": 4853 }, { "epoch": 2.4097302664239617, "grad_norm": 0.29276391863822937, "learning_rate": 1.1324860505439222e-06, "loss": 0.3038, "step": 4854 }, { "epoch": 2.4102267085884495, "grad_norm": 0.35386744141578674, "learning_rate": 1.1306556137811309e-06, "loss": 0.338, "step": 4855 }, { "epoch": 2.410723150752937, "grad_norm": 0.3151682913303375, "learning_rate": 1.1288264688872674e-06, "loss": 0.3596, "step": 4856 }, { "epoch": 2.411219592917425, "grad_norm": 0.30146223306655884, "learning_rate": 1.1269986164730351e-06, "loss": 0.3161, "step": 4857 }, { "epoch": 2.411716035081913, "grad_norm": 0.3169059157371521, "learning_rate": 1.1251720571487002e-06, "loss": 0.3229, "step": 4858 }, { "epoch": 2.412212477246401, "grad_norm": 0.31114906072616577, "learning_rate": 1.1233467915241037e-06, "loss": 0.2978, "step": 4859 }, { "epoch": 2.4127089194108886, "grad_norm": 0.3297962248325348, "learning_rate": 1.121522820208652e-06, "loss": 0.3598, "step": 4860 }, { "epoch": 2.4132053615753764, "grad_norm": 0.3214893341064453, "learning_rate": 1.1197001438113198e-06, "loss": 0.3295, "step": 4861 }, { "epoch": 2.4137018037398645, "grad_norm": 0.3077270984649658, "learning_rate": 1.1178787629406485e-06, "loss": 0.3419, "step": 4862 }, { "epoch": 2.4141982459043523, "grad_norm": 0.3212195932865143, "learning_rate": 1.1160586782047478e-06, "loss": 0.355, "step": 4863 }, { "epoch": 2.41469468806884, "grad_norm": 0.3218463063240051, "learning_rate": 1.1142398902112967e-06, "loss": 0.3145, "step": 4864 }, { "epoch": 2.4151911302333278, "grad_norm": 0.3006143271923065, "learning_rate": 1.1124223995675353e-06, "loss": 0.3031, "step": 4865 }, { "epoch": 2.4156875723978155, "grad_norm": 0.3028270900249481, "learning_rate": 1.1106062068802765e-06, "loss": 0.3411, "step": 4866 }, { "epoch": 2.4161840145623037, "grad_norm": 0.339186429977417, "learning_rate": 1.1087913127558974e-06, "loss": 0.3412, "step": 4867 }, { "epoch": 2.4166804567267914, "grad_norm": 0.3257400393486023, "learning_rate": 1.1069777178003416e-06, "loss": 0.3228, "step": 4868 }, { "epoch": 2.417176898891279, "grad_norm": 0.3304087817668915, "learning_rate": 1.1051654226191205e-06, "loss": 0.3405, "step": 4869 }, { "epoch": 2.417673341055767, "grad_norm": 0.2975143492221832, "learning_rate": 1.103354427817307e-06, "loss": 0.3018, "step": 4870 }, { "epoch": 2.4181697832202547, "grad_norm": 0.3426970839500427, "learning_rate": 1.1015447339995473e-06, "loss": 0.3195, "step": 4871 }, { "epoch": 2.418666225384743, "grad_norm": 0.31794077157974243, "learning_rate": 1.099736341770045e-06, "loss": 0.3017, "step": 4872 }, { "epoch": 2.4191626675492306, "grad_norm": 0.315288782119751, "learning_rate": 1.0979292517325757e-06, "loss": 0.3664, "step": 4873 }, { "epoch": 2.4196591097137183, "grad_norm": 0.3268272280693054, "learning_rate": 1.0961234644904767e-06, "loss": 0.3024, "step": 4874 }, { "epoch": 2.420155551878206, "grad_norm": 0.3322502672672272, "learning_rate": 1.0943189806466515e-06, "loss": 0.3483, "step": 4875 }, { "epoch": 2.420651994042694, "grad_norm": 0.2991706430912018, "learning_rate": 1.0925158008035692e-06, "loss": 0.3424, "step": 4876 }, { "epoch": 2.421148436207182, "grad_norm": 0.3201747536659241, "learning_rate": 1.0907139255632587e-06, "loss": 0.3401, "step": 4877 }, { "epoch": 2.4216448783716698, "grad_norm": 0.33363303542137146, "learning_rate": 1.0889133555273228e-06, "loss": 0.3105, "step": 4878 }, { "epoch": 2.4221413205361575, "grad_norm": 0.3338918089866638, "learning_rate": 1.0871140912969186e-06, "loss": 0.3499, "step": 4879 }, { "epoch": 2.4226377627006452, "grad_norm": 0.3333820104598999, "learning_rate": 1.0853161334727746e-06, "loss": 0.3579, "step": 4880 }, { "epoch": 2.423134204865133, "grad_norm": 0.3083851933479309, "learning_rate": 1.0835194826551754e-06, "loss": 0.3265, "step": 4881 }, { "epoch": 2.423630647029621, "grad_norm": 0.3052518367767334, "learning_rate": 1.08172413944398e-06, "loss": 0.2978, "step": 4882 }, { "epoch": 2.424127089194109, "grad_norm": 0.3229319751262665, "learning_rate": 1.0799301044385996e-06, "loss": 0.3414, "step": 4883 }, { "epoch": 2.4246235313585967, "grad_norm": 0.30840492248535156, "learning_rate": 1.0781373782380162e-06, "loss": 0.329, "step": 4884 }, { "epoch": 2.4251199735230844, "grad_norm": 0.30014768242836, "learning_rate": 1.0763459614407717e-06, "loss": 0.3158, "step": 4885 }, { "epoch": 2.4256164156875726, "grad_norm": 0.34222644567489624, "learning_rate": 1.074555854644972e-06, "loss": 0.3644, "step": 4886 }, { "epoch": 2.4261128578520603, "grad_norm": 0.3428688645362854, "learning_rate": 1.0727670584482857e-06, "loss": 0.3327, "step": 4887 }, { "epoch": 2.426609300016548, "grad_norm": 0.3133029341697693, "learning_rate": 1.0709795734479395e-06, "loss": 0.3384, "step": 4888 }, { "epoch": 2.427105742181036, "grad_norm": 0.31581762433052063, "learning_rate": 1.0691934002407323e-06, "loss": 0.3498, "step": 4889 }, { "epoch": 2.427602184345524, "grad_norm": 0.33637580275535583, "learning_rate": 1.0674085394230132e-06, "loss": 0.3674, "step": 4890 }, { "epoch": 2.4280986265100117, "grad_norm": 0.33491262793540955, "learning_rate": 1.0656249915907012e-06, "loss": 0.3043, "step": 4891 }, { "epoch": 2.4285950686744995, "grad_norm": 0.3209185302257538, "learning_rate": 1.0638427573392745e-06, "loss": 0.3081, "step": 4892 }, { "epoch": 2.4290915108389872, "grad_norm": 0.32216957211494446, "learning_rate": 1.062061837263772e-06, "loss": 0.3184, "step": 4893 }, { "epoch": 2.429587953003475, "grad_norm": 0.3195521831512451, "learning_rate": 1.0602822319587958e-06, "loss": 0.335, "step": 4894 }, { "epoch": 2.430084395167963, "grad_norm": 0.309387743473053, "learning_rate": 1.0585039420185056e-06, "loss": 0.3421, "step": 4895 }, { "epoch": 2.430580837332451, "grad_norm": 0.3213215470314026, "learning_rate": 1.0567269680366255e-06, "loss": 0.3369, "step": 4896 }, { "epoch": 2.4310772794969386, "grad_norm": 0.32291379570961, "learning_rate": 1.0549513106064386e-06, "loss": 0.3472, "step": 4897 }, { "epoch": 2.4315737216614264, "grad_norm": 0.3259825110435486, "learning_rate": 1.0531769703207883e-06, "loss": 0.3266, "step": 4898 }, { "epoch": 2.432070163825914, "grad_norm": 0.2976614832878113, "learning_rate": 1.0514039477720805e-06, "loss": 0.3001, "step": 4899 }, { "epoch": 2.4325666059904023, "grad_norm": 0.3015928864479065, "learning_rate": 1.0496322435522748e-06, "loss": 0.3149, "step": 4900 }, { "epoch": 2.43306304815489, "grad_norm": 0.3135557472705841, "learning_rate": 1.0478618582529004e-06, "loss": 0.3186, "step": 4901 }, { "epoch": 2.433559490319378, "grad_norm": 0.297443151473999, "learning_rate": 1.0460927924650371e-06, "loss": 0.364, "step": 4902 }, { "epoch": 2.4340559324838655, "grad_norm": 0.2825373411178589, "learning_rate": 1.0443250467793297e-06, "loss": 0.3105, "step": 4903 }, { "epoch": 2.4345523746483533, "grad_norm": 0.3154524564743042, "learning_rate": 1.0425586217859796e-06, "loss": 0.3599, "step": 4904 }, { "epoch": 2.4350488168128415, "grad_norm": 0.3098675012588501, "learning_rate": 1.0407935180747496e-06, "loss": 0.317, "step": 4905 }, { "epoch": 2.435545258977329, "grad_norm": 0.3254479765892029, "learning_rate": 1.0390297362349572e-06, "loss": 0.3053, "step": 4906 }, { "epoch": 2.436041701141817, "grad_norm": 0.37587764859199524, "learning_rate": 1.0372672768554813e-06, "loss": 0.3776, "step": 4907 }, { "epoch": 2.4365381433063047, "grad_norm": 0.31843075156211853, "learning_rate": 1.0355061405247635e-06, "loss": 0.326, "step": 4908 }, { "epoch": 2.4370345854707924, "grad_norm": 0.31247466802597046, "learning_rate": 1.0337463278307953e-06, "loss": 0.312, "step": 4909 }, { "epoch": 2.4375310276352806, "grad_norm": 0.320947527885437, "learning_rate": 1.0319878393611321e-06, "loss": 0.3668, "step": 4910 }, { "epoch": 2.4380274697997684, "grad_norm": 0.31110426783561707, "learning_rate": 1.0302306757028824e-06, "loss": 0.3032, "step": 4911 }, { "epoch": 2.438523911964256, "grad_norm": 0.32536327838897705, "learning_rate": 1.0284748374427207e-06, "loss": 0.3382, "step": 4912 }, { "epoch": 2.439020354128744, "grad_norm": 0.31842130422592163, "learning_rate": 1.0267203251668689e-06, "loss": 0.322, "step": 4913 }, { "epoch": 2.4395167962932316, "grad_norm": 0.31568601727485657, "learning_rate": 1.0249671394611134e-06, "loss": 0.3518, "step": 4914 }, { "epoch": 2.44001323845772, "grad_norm": 0.31094616651535034, "learning_rate": 1.0232152809107937e-06, "loss": 0.3405, "step": 4915 }, { "epoch": 2.4405096806222075, "grad_norm": 0.32998141646385193, "learning_rate": 1.0214647501008095e-06, "loss": 0.3417, "step": 4916 }, { "epoch": 2.4410061227866953, "grad_norm": 0.28635546565055847, "learning_rate": 1.0197155476156156e-06, "loss": 0.3019, "step": 4917 }, { "epoch": 2.441502564951183, "grad_norm": 0.3116688132286072, "learning_rate": 1.0179676740392196e-06, "loss": 0.3171, "step": 4918 }, { "epoch": 2.441999007115671, "grad_norm": 0.33747443556785583, "learning_rate": 1.0162211299551944e-06, "loss": 0.3539, "step": 4919 }, { "epoch": 2.442495449280159, "grad_norm": 0.31292828917503357, "learning_rate": 1.0144759159466594e-06, "loss": 0.2941, "step": 4920 }, { "epoch": 2.4429918914446467, "grad_norm": 0.34376415610313416, "learning_rate": 1.0127320325962953e-06, "loss": 0.3764, "step": 4921 }, { "epoch": 2.4434883336091344, "grad_norm": 0.3301224112510681, "learning_rate": 1.0109894804863378e-06, "loss": 0.3278, "step": 4922 }, { "epoch": 2.4439847757736226, "grad_norm": 0.3064170181751251, "learning_rate": 1.0092482601985775e-06, "loss": 0.2958, "step": 4923 }, { "epoch": 2.4444812179381104, "grad_norm": 0.33765843510627747, "learning_rate": 1.0075083723143614e-06, "loss": 0.3471, "step": 4924 }, { "epoch": 2.444977660102598, "grad_norm": 0.30039137601852417, "learning_rate": 1.005769817414589e-06, "loss": 0.3221, "step": 4925 }, { "epoch": 2.445474102267086, "grad_norm": 0.3008192777633667, "learning_rate": 1.0040325960797176e-06, "loss": 0.2806, "step": 4926 }, { "epoch": 2.4459705444315736, "grad_norm": 0.32692331075668335, "learning_rate": 1.0022967088897573e-06, "loss": 0.3462, "step": 4927 }, { "epoch": 2.4464669865960618, "grad_norm": 0.32113322615623474, "learning_rate": 1.0005621564242762e-06, "loss": 0.3089, "step": 4928 }, { "epoch": 2.4469634287605495, "grad_norm": 0.32217758893966675, "learning_rate": 9.988289392623895e-07, "loss": 0.3177, "step": 4929 }, { "epoch": 2.4474598709250373, "grad_norm": 0.31282317638397217, "learning_rate": 9.970970579827771e-07, "loss": 0.3381, "step": 4930 }, { "epoch": 2.447956313089525, "grad_norm": 0.3091821074485779, "learning_rate": 9.953665131636624e-07, "loss": 0.3605, "step": 4931 }, { "epoch": 2.4484527552540127, "grad_norm": 0.2997247278690338, "learning_rate": 9.936373053828297e-07, "loss": 0.3178, "step": 4932 }, { "epoch": 2.448949197418501, "grad_norm": 0.3199334740638733, "learning_rate": 9.919094352176134e-07, "loss": 0.3142, "step": 4933 }, { "epoch": 2.4494456395829887, "grad_norm": 0.3252067565917969, "learning_rate": 9.901829032449028e-07, "loss": 0.3234, "step": 4934 }, { "epoch": 2.4499420817474764, "grad_norm": 0.32677653431892395, "learning_rate": 9.884577100411413e-07, "loss": 0.3584, "step": 4935 }, { "epoch": 2.450438523911964, "grad_norm": 0.3199077844619751, "learning_rate": 9.867338561823215e-07, "loss": 0.3322, "step": 4936 }, { "epoch": 2.450934966076452, "grad_norm": 0.3121779263019562, "learning_rate": 9.850113422439927e-07, "loss": 0.2812, "step": 4937 }, { "epoch": 2.45143140824094, "grad_norm": 0.30825772881507874, "learning_rate": 9.832901688012554e-07, "loss": 0.2822, "step": 4938 }, { "epoch": 2.451927850405428, "grad_norm": 0.3411232531070709, "learning_rate": 9.815703364287622e-07, "loss": 0.3695, "step": 4939 }, { "epoch": 2.4524242925699156, "grad_norm": 0.3170334994792938, "learning_rate": 9.798518457007206e-07, "loss": 0.3112, "step": 4940 }, { "epoch": 2.4529207347344033, "grad_norm": 0.32362544536590576, "learning_rate": 9.781346971908833e-07, "loss": 0.3556, "step": 4941 }, { "epoch": 2.453417176898891, "grad_norm": 0.30353397130966187, "learning_rate": 9.764188914725647e-07, "loss": 0.3254, "step": 4942 }, { "epoch": 2.4539136190633792, "grad_norm": 0.34163734316825867, "learning_rate": 9.747044291186226e-07, "loss": 0.4012, "step": 4943 }, { "epoch": 2.454410061227867, "grad_norm": 0.3021128177642822, "learning_rate": 9.7299131070147e-07, "loss": 0.2981, "step": 4944 }, { "epoch": 2.4549065033923547, "grad_norm": 0.31757768988609314, "learning_rate": 9.712795367930706e-07, "loss": 0.3585, "step": 4945 }, { "epoch": 2.4554029455568425, "grad_norm": 0.2941545844078064, "learning_rate": 9.695691079649394e-07, "loss": 0.2895, "step": 4946 }, { "epoch": 2.4558993877213307, "grad_norm": 0.3153875470161438, "learning_rate": 9.678600247881431e-07, "loss": 0.3473, "step": 4947 }, { "epoch": 2.4563958298858184, "grad_norm": 0.3119075894355774, "learning_rate": 9.661522878332947e-07, "loss": 0.3455, "step": 4948 }, { "epoch": 2.456892272050306, "grad_norm": 0.3227948546409607, "learning_rate": 9.64445897670566e-07, "loss": 0.3306, "step": 4949 }, { "epoch": 2.457388714214794, "grad_norm": 0.30724748969078064, "learning_rate": 9.627408548696704e-07, "loss": 0.3131, "step": 4950 }, { "epoch": 2.4578851563792816, "grad_norm": 0.3317863345146179, "learning_rate": 9.61037159999878e-07, "loss": 0.3764, "step": 4951 }, { "epoch": 2.45838159854377, "grad_norm": 0.30029937624931335, "learning_rate": 9.593348136300028e-07, "loss": 0.3076, "step": 4952 }, { "epoch": 2.4588780407082576, "grad_norm": 0.330111026763916, "learning_rate": 9.57633816328416e-07, "loss": 0.3552, "step": 4953 }, { "epoch": 2.4593744828727453, "grad_norm": 0.3381207287311554, "learning_rate": 9.559341686630319e-07, "loss": 0.3886, "step": 4954 }, { "epoch": 2.459870925037233, "grad_norm": 0.31567132472991943, "learning_rate": 9.542358712013155e-07, "loss": 0.3179, "step": 4955 }, { "epoch": 2.4603673672017212, "grad_norm": 0.3161924481391907, "learning_rate": 9.525389245102867e-07, "loss": 0.3509, "step": 4956 }, { "epoch": 2.460863809366209, "grad_norm": 0.32710590958595276, "learning_rate": 9.508433291565061e-07, "loss": 0.3281, "step": 4957 }, { "epoch": 2.4613602515306967, "grad_norm": 0.30569934844970703, "learning_rate": 9.491490857060887e-07, "loss": 0.3199, "step": 4958 }, { "epoch": 2.4618566936951845, "grad_norm": 0.2966354191303253, "learning_rate": 9.474561947246935e-07, "loss": 0.3051, "step": 4959 }, { "epoch": 2.462353135859672, "grad_norm": 0.3026801645755768, "learning_rate": 9.457646567775347e-07, "loss": 0.3479, "step": 4960 }, { "epoch": 2.4628495780241604, "grad_norm": 0.30218231678009033, "learning_rate": 9.440744724293682e-07, "loss": 0.2978, "step": 4961 }, { "epoch": 2.463346020188648, "grad_norm": 0.33186155557632446, "learning_rate": 9.423856422445015e-07, "loss": 0.3614, "step": 4962 }, { "epoch": 2.463842462353136, "grad_norm": 0.30650731921195984, "learning_rate": 9.406981667867888e-07, "loss": 0.3375, "step": 4963 }, { "epoch": 2.4643389045176236, "grad_norm": 0.31424614787101746, "learning_rate": 9.390120466196323e-07, "loss": 0.2979, "step": 4964 }, { "epoch": 2.4648353466821113, "grad_norm": 0.3011576235294342, "learning_rate": 9.373272823059836e-07, "loss": 0.3127, "step": 4965 }, { "epoch": 2.4653317888465995, "grad_norm": 0.30405256152153015, "learning_rate": 9.356438744083368e-07, "loss": 0.3279, "step": 4966 }, { "epoch": 2.4658282310110873, "grad_norm": 0.30730295181274414, "learning_rate": 9.339618234887371e-07, "loss": 0.3844, "step": 4967 }, { "epoch": 2.466324673175575, "grad_norm": 0.3317989706993103, "learning_rate": 9.322811301087753e-07, "loss": 0.3479, "step": 4968 }, { "epoch": 2.4668211153400628, "grad_norm": 0.3071756958961487, "learning_rate": 9.306017948295903e-07, "loss": 0.3081, "step": 4969 }, { "epoch": 2.4673175575045505, "grad_norm": 0.3051615357398987, "learning_rate": 9.289238182118654e-07, "loss": 0.294, "step": 4970 }, { "epoch": 2.4678139996690387, "grad_norm": 0.35896262526512146, "learning_rate": 9.272472008158323e-07, "loss": 0.3726, "step": 4971 }, { "epoch": 2.4683104418335264, "grad_norm": 0.2930692732334137, "learning_rate": 9.255719432012683e-07, "loss": 0.3118, "step": 4972 }, { "epoch": 2.468806883998014, "grad_norm": 0.3274289667606354, "learning_rate": 9.238980459274949e-07, "loss": 0.3282, "step": 4973 }, { "epoch": 2.469303326162502, "grad_norm": 0.3255192041397095, "learning_rate": 9.222255095533816e-07, "loss": 0.3221, "step": 4974 }, { "epoch": 2.4697997683269897, "grad_norm": 0.3134053945541382, "learning_rate": 9.20554334637343e-07, "loss": 0.3351, "step": 4975 }, { "epoch": 2.470296210491478, "grad_norm": 0.2970511317253113, "learning_rate": 9.188845217373399e-07, "loss": 0.3122, "step": 4976 }, { "epoch": 2.4707926526559656, "grad_norm": 0.3129856288433075, "learning_rate": 9.172160714108752e-07, "loss": 0.3366, "step": 4977 }, { "epoch": 2.4712890948204533, "grad_norm": 0.2995874583721161, "learning_rate": 9.15548984214999e-07, "loss": 0.293, "step": 4978 }, { "epoch": 2.471785536984941, "grad_norm": 0.3293156623840332, "learning_rate": 9.138832607063103e-07, "loss": 0.3658, "step": 4979 }, { "epoch": 2.4722819791494293, "grad_norm": 0.31835758686065674, "learning_rate": 9.122189014409449e-07, "loss": 0.3459, "step": 4980 }, { "epoch": 2.472778421313917, "grad_norm": 0.32798922061920166, "learning_rate": 9.1055590697459e-07, "loss": 0.3511, "step": 4981 }, { "epoch": 2.4732748634784048, "grad_norm": 0.3030830919742584, "learning_rate": 9.088942778624704e-07, "loss": 0.3248, "step": 4982 }, { "epoch": 2.4737713056428925, "grad_norm": 0.3202032744884491, "learning_rate": 9.072340146593639e-07, "loss": 0.3396, "step": 4983 }, { "epoch": 2.4742677478073807, "grad_norm": 0.3133372962474823, "learning_rate": 9.055751179195832e-07, "loss": 0.3191, "step": 4984 }, { "epoch": 2.4747641899718684, "grad_norm": 0.3174385130405426, "learning_rate": 9.039175881969903e-07, "loss": 0.3463, "step": 4985 }, { "epoch": 2.475260632136356, "grad_norm": 0.29562705755233765, "learning_rate": 9.022614260449897e-07, "loss": 0.3038, "step": 4986 }, { "epoch": 2.475757074300844, "grad_norm": 0.33492058515548706, "learning_rate": 9.006066320165285e-07, "loss": 0.4034, "step": 4987 }, { "epoch": 2.4762535164653316, "grad_norm": 0.33062806725502014, "learning_rate": 8.989532066640988e-07, "loss": 0.3425, "step": 4988 }, { "epoch": 2.47674995862982, "grad_norm": 0.3045496642589569, "learning_rate": 8.973011505397306e-07, "loss": 0.3334, "step": 4989 }, { "epoch": 2.4772464007943076, "grad_norm": 0.3002225458621979, "learning_rate": 8.956504641950053e-07, "loss": 0.3377, "step": 4990 }, { "epoch": 2.4777428429587953, "grad_norm": 0.3166615068912506, "learning_rate": 8.940011481810384e-07, "loss": 0.3205, "step": 4991 }, { "epoch": 2.478239285123283, "grad_norm": 0.3309929072856903, "learning_rate": 8.923532030484938e-07, "loss": 0.3357, "step": 4992 }, { "epoch": 2.478735727287771, "grad_norm": 0.3175433278083801, "learning_rate": 8.907066293475752e-07, "loss": 0.3832, "step": 4993 }, { "epoch": 2.479232169452259, "grad_norm": 0.3198089003562927, "learning_rate": 8.890614276280285e-07, "loss": 0.3333, "step": 4994 }, { "epoch": 2.4797286116167467, "grad_norm": 0.2982562780380249, "learning_rate": 8.874175984391431e-07, "loss": 0.3241, "step": 4995 }, { "epoch": 2.4802250537812345, "grad_norm": 0.29905256628990173, "learning_rate": 8.857751423297456e-07, "loss": 0.3643, "step": 4996 }, { "epoch": 2.480721495945722, "grad_norm": 0.30087998509407043, "learning_rate": 8.841340598482117e-07, "loss": 0.3056, "step": 4997 }, { "epoch": 2.48121793811021, "grad_norm": 0.31656354665756226, "learning_rate": 8.824943515424511e-07, "loss": 0.347, "step": 4998 }, { "epoch": 2.481714380274698, "grad_norm": 0.31610995531082153, "learning_rate": 8.808560179599201e-07, "loss": 0.2992, "step": 4999 }, { "epoch": 2.482210822439186, "grad_norm": 0.3201879858970642, "learning_rate": 8.792190596476102e-07, "loss": 0.3281, "step": 5000 }, { "epoch": 2.4827072646036736, "grad_norm": 0.3480318784713745, "learning_rate": 8.775834771520608e-07, "loss": 0.3486, "step": 5001 }, { "epoch": 2.4832037067681614, "grad_norm": 0.2957208454608917, "learning_rate": 8.75949271019349e-07, "loss": 0.2956, "step": 5002 }, { "epoch": 2.483700148932649, "grad_norm": 0.30829259753227234, "learning_rate": 8.743164417950883e-07, "loss": 0.3233, "step": 5003 }, { "epoch": 2.4841965910971373, "grad_norm": 0.3323674201965332, "learning_rate": 8.726849900244383e-07, "loss": 0.3526, "step": 5004 }, { "epoch": 2.484693033261625, "grad_norm": 0.3080350458621979, "learning_rate": 8.710549162520954e-07, "loss": 0.3372, "step": 5005 }, { "epoch": 2.485189475426113, "grad_norm": 0.3216954469680786, "learning_rate": 8.694262210222992e-07, "loss": 0.358, "step": 5006 }, { "epoch": 2.4856859175906005, "grad_norm": 0.3221517503261566, "learning_rate": 8.677989048788238e-07, "loss": 0.32, "step": 5007 }, { "epoch": 2.4861823597550887, "grad_norm": 0.3035203516483307, "learning_rate": 8.661729683649867e-07, "loss": 0.3084, "step": 5008 }, { "epoch": 2.4866788019195765, "grad_norm": 0.3254999816417694, "learning_rate": 8.645484120236442e-07, "loss": 0.33, "step": 5009 }, { "epoch": 2.487175244084064, "grad_norm": 0.31976163387298584, "learning_rate": 8.629252363971918e-07, "loss": 0.3411, "step": 5010 }, { "epoch": 2.487671686248552, "grad_norm": 0.3066427707672119, "learning_rate": 8.613034420275634e-07, "loss": 0.2942, "step": 5011 }, { "epoch": 2.4881681284130397, "grad_norm": 0.345913827419281, "learning_rate": 8.596830294562325e-07, "loss": 0.3465, "step": 5012 }, { "epoch": 2.488664570577528, "grad_norm": 0.33694756031036377, "learning_rate": 8.580639992242113e-07, "loss": 0.3231, "step": 5013 }, { "epoch": 2.4891610127420156, "grad_norm": 0.32520562410354614, "learning_rate": 8.564463518720483e-07, "loss": 0.3468, "step": 5014 }, { "epoch": 2.4896574549065034, "grad_norm": 0.3084220290184021, "learning_rate": 8.548300879398324e-07, "loss": 0.3193, "step": 5015 }, { "epoch": 2.490153897070991, "grad_norm": 0.3616010546684265, "learning_rate": 8.532152079671913e-07, "loss": 0.3397, "step": 5016 }, { "epoch": 2.4906503392354793, "grad_norm": 0.3297555148601532, "learning_rate": 8.516017124932885e-07, "loss": 0.328, "step": 5017 }, { "epoch": 2.491146781399967, "grad_norm": 0.32147017121315, "learning_rate": 8.499896020568276e-07, "loss": 0.3662, "step": 5018 }, { "epoch": 2.491643223564455, "grad_norm": 0.31104496121406555, "learning_rate": 8.483788771960455e-07, "loss": 0.3171, "step": 5019 }, { "epoch": 2.4921396657289425, "grad_norm": 0.3162350356578827, "learning_rate": 8.46769538448724e-07, "loss": 0.3485, "step": 5020 }, { "epoch": 2.4926361078934303, "grad_norm": 0.30667513608932495, "learning_rate": 8.451615863521734e-07, "loss": 0.3084, "step": 5021 }, { "epoch": 2.4931325500579185, "grad_norm": 0.3073335289955139, "learning_rate": 8.435550214432486e-07, "loss": 0.3056, "step": 5022 }, { "epoch": 2.493628992222406, "grad_norm": 0.31277772784233093, "learning_rate": 8.419498442583335e-07, "loss": 0.327, "step": 5023 }, { "epoch": 2.494125434386894, "grad_norm": 0.34462177753448486, "learning_rate": 8.403460553333586e-07, "loss": 0.3486, "step": 5024 }, { "epoch": 2.4946218765513817, "grad_norm": 0.31839674711227417, "learning_rate": 8.387436552037814e-07, "loss": 0.34, "step": 5025 }, { "epoch": 2.4951183187158694, "grad_norm": 0.3388362228870392, "learning_rate": 8.371426444045994e-07, "loss": 0.3287, "step": 5026 }, { "epoch": 2.4956147608803576, "grad_norm": 0.34473717212677, "learning_rate": 8.35543023470351e-07, "loss": 0.3436, "step": 5027 }, { "epoch": 2.4961112030448454, "grad_norm": 0.31258001923561096, "learning_rate": 8.339447929351025e-07, "loss": 0.3177, "step": 5028 }, { "epoch": 2.496607645209333, "grad_norm": 0.34306609630584717, "learning_rate": 8.323479533324613e-07, "loss": 0.3205, "step": 5029 }, { "epoch": 2.497104087373821, "grad_norm": 0.3568379580974579, "learning_rate": 8.307525051955656e-07, "loss": 0.3364, "step": 5030 }, { "epoch": 2.4976005295383086, "grad_norm": 0.32624009251594543, "learning_rate": 8.29158449057097e-07, "loss": 0.3205, "step": 5031 }, { "epoch": 2.4980969717027968, "grad_norm": 0.32872384786605835, "learning_rate": 8.275657854492636e-07, "loss": 0.3505, "step": 5032 }, { "epoch": 2.4985934138672845, "grad_norm": 0.3095146417617798, "learning_rate": 8.259745149038145e-07, "loss": 0.3467, "step": 5033 }, { "epoch": 2.4990898560317722, "grad_norm": 0.2994921803474426, "learning_rate": 8.243846379520309e-07, "loss": 0.354, "step": 5034 }, { "epoch": 2.49958629819626, "grad_norm": 0.30702969431877136, "learning_rate": 8.227961551247298e-07, "loss": 0.3248, "step": 5035 }, { "epoch": 2.5000827403607477, "grad_norm": 0.3311944007873535, "learning_rate": 8.212090669522632e-07, "loss": 0.343, "step": 5036 }, { "epoch": 2.500579182525236, "grad_norm": 0.3224543035030365, "learning_rate": 8.196233739645154e-07, "loss": 0.2921, "step": 5037 }, { "epoch": 2.5010756246897237, "grad_norm": 0.3185226321220398, "learning_rate": 8.180390766909063e-07, "loss": 0.3393, "step": 5038 }, { "epoch": 2.5015720668542114, "grad_norm": 0.32325318455696106, "learning_rate": 8.164561756603901e-07, "loss": 0.3861, "step": 5039 }, { "epoch": 2.5020685090186996, "grad_norm": 0.30614566802978516, "learning_rate": 8.148746714014544e-07, "loss": 0.2902, "step": 5040 }, { "epoch": 2.502564951183187, "grad_norm": 0.32863184809684753, "learning_rate": 8.132945644421203e-07, "loss": 0.3305, "step": 5041 }, { "epoch": 2.503061393347675, "grad_norm": 0.3338319957256317, "learning_rate": 8.11715855309943e-07, "loss": 0.3579, "step": 5042 }, { "epoch": 2.503557835512163, "grad_norm": 0.3018699884414673, "learning_rate": 8.10138544532012e-07, "loss": 0.3457, "step": 5043 }, { "epoch": 2.5040542776766506, "grad_norm": 0.3017390966415405, "learning_rate": 8.08562632634945e-07, "loss": 0.3569, "step": 5044 }, { "epoch": 2.5045507198411388, "grad_norm": 0.31262052059173584, "learning_rate": 8.069881201448987e-07, "loss": 0.357, "step": 5045 }, { "epoch": 2.5050471620056265, "grad_norm": 0.30114707350730896, "learning_rate": 8.054150075875589e-07, "loss": 0.336, "step": 5046 }, { "epoch": 2.5055436041701142, "grad_norm": 0.3155539035797119, "learning_rate": 8.038432954881464e-07, "loss": 0.3496, "step": 5047 }, { "epoch": 2.506040046334602, "grad_norm": 0.3307652175426483, "learning_rate": 8.022729843714116e-07, "loss": 0.3117, "step": 5048 }, { "epoch": 2.5065364884990897, "grad_norm": 0.29085278511047363, "learning_rate": 8.007040747616379e-07, "loss": 0.3284, "step": 5049 }, { "epoch": 2.507032930663578, "grad_norm": 0.32182520627975464, "learning_rate": 7.991365671826462e-07, "loss": 0.3449, "step": 5050 }, { "epoch": 2.5075293728280657, "grad_norm": 0.3179790675640106, "learning_rate": 7.975704621577796e-07, "loss": 0.3141, "step": 5051 }, { "epoch": 2.5080258149925534, "grad_norm": 0.32652220129966736, "learning_rate": 7.960057602099203e-07, "loss": 0.336, "step": 5052 }, { "epoch": 2.508522257157041, "grad_norm": 0.33778175711631775, "learning_rate": 7.944424618614794e-07, "loss": 0.3268, "step": 5053 }, { "epoch": 2.509018699321529, "grad_norm": 0.3283417224884033, "learning_rate": 7.928805676344009e-07, "loss": 0.3664, "step": 5054 }, { "epoch": 2.509515141486017, "grad_norm": 0.28966808319091797, "learning_rate": 7.913200780501568e-07, "loss": 0.3122, "step": 5055 }, { "epoch": 2.510011583650505, "grad_norm": 0.3117029368877411, "learning_rate": 7.897609936297529e-07, "loss": 0.3136, "step": 5056 }, { "epoch": 2.5105080258149925, "grad_norm": 0.321534126996994, "learning_rate": 7.882033148937252e-07, "loss": 0.317, "step": 5057 }, { "epoch": 2.5110044679794803, "grad_norm": 0.3307974934577942, "learning_rate": 7.866470423621402e-07, "loss": 0.3607, "step": 5058 }, { "epoch": 2.511500910143968, "grad_norm": 0.31441542506217957, "learning_rate": 7.850921765545966e-07, "loss": 0.3185, "step": 5059 }, { "epoch": 2.511997352308456, "grad_norm": 0.2940565347671509, "learning_rate": 7.835387179902182e-07, "loss": 0.2916, "step": 5060 }, { "epoch": 2.512493794472944, "grad_norm": 0.34934374690055847, "learning_rate": 7.819866671876669e-07, "loss": 0.3178, "step": 5061 }, { "epoch": 2.5129902366374317, "grad_norm": 0.3121988773345947, "learning_rate": 7.804360246651271e-07, "loss": 0.3471, "step": 5062 }, { "epoch": 2.5134866788019194, "grad_norm": 0.30358803272247314, "learning_rate": 7.788867909403169e-07, "loss": 0.3965, "step": 5063 }, { "epoch": 2.513983120966407, "grad_norm": 0.2949267029762268, "learning_rate": 7.773389665304842e-07, "loss": 0.3099, "step": 5064 }, { "epoch": 2.5144795631308954, "grad_norm": 0.3269472122192383, "learning_rate": 7.757925519524045e-07, "loss": 0.3342, "step": 5065 }, { "epoch": 2.514976005295383, "grad_norm": 0.3103768825531006, "learning_rate": 7.742475477223859e-07, "loss": 0.3225, "step": 5066 }, { "epoch": 2.515472447459871, "grad_norm": 0.3260815441608429, "learning_rate": 7.727039543562586e-07, "loss": 0.3438, "step": 5067 }, { "epoch": 2.5159688896243586, "grad_norm": 0.30164745450019836, "learning_rate": 7.711617723693921e-07, "loss": 0.3165, "step": 5068 }, { "epoch": 2.5164653317888463, "grad_norm": 0.3436562418937683, "learning_rate": 7.696210022766753e-07, "loss": 0.3289, "step": 5069 }, { "epoch": 2.5169617739533345, "grad_norm": 0.3279421925544739, "learning_rate": 7.680816445925315e-07, "loss": 0.3285, "step": 5070 }, { "epoch": 2.5174582161178223, "grad_norm": 0.32270872592926025, "learning_rate": 7.665436998309067e-07, "loss": 0.3404, "step": 5071 }, { "epoch": 2.51795465828231, "grad_norm": 0.30293896794319153, "learning_rate": 7.650071685052835e-07, "loss": 0.3023, "step": 5072 }, { "epoch": 2.518451100446798, "grad_norm": 0.31045040488243103, "learning_rate": 7.634720511286664e-07, "loss": 0.3447, "step": 5073 }, { "epoch": 2.518947542611286, "grad_norm": 0.3153442144393921, "learning_rate": 7.619383482135884e-07, "loss": 0.3302, "step": 5074 }, { "epoch": 2.5194439847757737, "grad_norm": 0.3358253836631775, "learning_rate": 7.604060602721114e-07, "loss": 0.336, "step": 5075 }, { "epoch": 2.5199404269402614, "grad_norm": 0.3163382411003113, "learning_rate": 7.588751878158251e-07, "loss": 0.3446, "step": 5076 }, { "epoch": 2.520436869104749, "grad_norm": 0.30969715118408203, "learning_rate": 7.57345731355848e-07, "loss": 0.3503, "step": 5077 }, { "epoch": 2.5209333112692374, "grad_norm": 0.3218114674091339, "learning_rate": 7.558176914028203e-07, "loss": 0.3807, "step": 5078 }, { "epoch": 2.521429753433725, "grad_norm": 0.31766650080680847, "learning_rate": 7.542910684669153e-07, "loss": 0.2959, "step": 5079 }, { "epoch": 2.521926195598213, "grad_norm": 0.3264714777469635, "learning_rate": 7.527658630578305e-07, "loss": 0.3444, "step": 5080 }, { "epoch": 2.5224226377627006, "grad_norm": 0.29624098539352417, "learning_rate": 7.51242075684791e-07, "loss": 0.2982, "step": 5081 }, { "epoch": 2.5229190799271883, "grad_norm": 0.3143554925918579, "learning_rate": 7.49719706856547e-07, "loss": 0.3056, "step": 5082 }, { "epoch": 2.5234155220916765, "grad_norm": 0.31189942359924316, "learning_rate": 7.48198757081377e-07, "loss": 0.3505, "step": 5083 }, { "epoch": 2.5239119642561643, "grad_norm": 0.32733607292175293, "learning_rate": 7.466792268670853e-07, "loss": 0.3635, "step": 5084 }, { "epoch": 2.524408406420652, "grad_norm": 0.3337823152542114, "learning_rate": 7.451611167209999e-07, "loss": 0.2749, "step": 5085 }, { "epoch": 2.5249048485851397, "grad_norm": 0.3207217752933502, "learning_rate": 7.436444271499776e-07, "loss": 0.3431, "step": 5086 }, { "epoch": 2.5254012907496275, "grad_norm": 0.3016471564769745, "learning_rate": 7.421291586604001e-07, "loss": 0.3271, "step": 5087 }, { "epoch": 2.5258977329141157, "grad_norm": 0.34944599866867065, "learning_rate": 7.406153117581733e-07, "loss": 0.3353, "step": 5088 }, { "epoch": 2.5263941750786034, "grad_norm": 0.31816673278808594, "learning_rate": 7.391028869487316e-07, "loss": 0.3458, "step": 5089 }, { "epoch": 2.526890617243091, "grad_norm": 0.30024659633636475, "learning_rate": 7.375918847370294e-07, "loss": 0.3211, "step": 5090 }, { "epoch": 2.527387059407579, "grad_norm": 0.3012419044971466, "learning_rate": 7.360823056275528e-07, "loss": 0.3407, "step": 5091 }, { "epoch": 2.5278835015720666, "grad_norm": 0.3080849051475525, "learning_rate": 7.345741501243065e-07, "loss": 0.3293, "step": 5092 }, { "epoch": 2.528379943736555, "grad_norm": 0.3211326599121094, "learning_rate": 7.330674187308234e-07, "loss": 0.3513, "step": 5093 }, { "epoch": 2.5288763859010426, "grad_norm": 0.3043195307254791, "learning_rate": 7.315621119501609e-07, "loss": 0.3201, "step": 5094 }, { "epoch": 2.5293728280655303, "grad_norm": 0.2794642746448517, "learning_rate": 7.300582302848991e-07, "loss": 0.2956, "step": 5095 }, { "epoch": 2.529869270230018, "grad_norm": 0.33602601289749146, "learning_rate": 7.285557742371446e-07, "loss": 0.394, "step": 5096 }, { "epoch": 2.530365712394506, "grad_norm": 0.31070852279663086, "learning_rate": 7.270547443085241e-07, "loss": 0.3194, "step": 5097 }, { "epoch": 2.530862154558994, "grad_norm": 0.33470016717910767, "learning_rate": 7.255551410001938e-07, "loss": 0.3623, "step": 5098 }, { "epoch": 2.5313585967234817, "grad_norm": 0.3006722033023834, "learning_rate": 7.240569648128282e-07, "loss": 0.3248, "step": 5099 }, { "epoch": 2.5318550388879695, "grad_norm": 0.29329022765159607, "learning_rate": 7.225602162466294e-07, "loss": 0.313, "step": 5100 }, { "epoch": 2.5323514810524577, "grad_norm": 0.31448909640312195, "learning_rate": 7.210648958013177e-07, "loss": 0.3306, "step": 5101 }, { "epoch": 2.532847923216945, "grad_norm": 0.3233155310153961, "learning_rate": 7.195710039761444e-07, "loss": 0.3669, "step": 5102 }, { "epoch": 2.533344365381433, "grad_norm": 0.30967122316360474, "learning_rate": 7.180785412698765e-07, "loss": 0.3145, "step": 5103 }, { "epoch": 2.533840807545921, "grad_norm": 0.3304818272590637, "learning_rate": 7.165875081808072e-07, "loss": 0.3488, "step": 5104 }, { "epoch": 2.5343372497104086, "grad_norm": 0.28827208280563354, "learning_rate": 7.150979052067524e-07, "loss": 0.3215, "step": 5105 }, { "epoch": 2.534833691874897, "grad_norm": 0.28808164596557617, "learning_rate": 7.136097328450497e-07, "loss": 0.3401, "step": 5106 }, { "epoch": 2.5353301340393846, "grad_norm": 0.3092648983001709, "learning_rate": 7.12122991592561e-07, "loss": 0.3164, "step": 5107 }, { "epoch": 2.5358265762038723, "grad_norm": 0.3076358437538147, "learning_rate": 7.106376819456651e-07, "loss": 0.3141, "step": 5108 }, { "epoch": 2.53632301836836, "grad_norm": 0.29969072341918945, "learning_rate": 7.091538044002705e-07, "loss": 0.3375, "step": 5109 }, { "epoch": 2.536819460532848, "grad_norm": 0.29475054144859314, "learning_rate": 7.076713594518014e-07, "loss": 0.3261, "step": 5110 }, { "epoch": 2.537315902697336, "grad_norm": 0.3217315077781677, "learning_rate": 7.061903475952059e-07, "loss": 0.3263, "step": 5111 }, { "epoch": 2.5378123448618237, "grad_norm": 0.3613514006137848, "learning_rate": 7.047107693249544e-07, "loss": 0.301, "step": 5112 }, { "epoch": 2.5383087870263115, "grad_norm": 0.335196316242218, "learning_rate": 7.032326251350375e-07, "loss": 0.3325, "step": 5113 }, { "epoch": 2.538805229190799, "grad_norm": 0.3175831437110901, "learning_rate": 7.017559155189679e-07, "loss": 0.3436, "step": 5114 }, { "epoch": 2.539301671355287, "grad_norm": 0.32401493191719055, "learning_rate": 7.002806409697776e-07, "loss": 0.342, "step": 5115 }, { "epoch": 2.539798113519775, "grad_norm": 0.30582496523857117, "learning_rate": 6.988068019800214e-07, "loss": 0.3023, "step": 5116 }, { "epoch": 2.540294555684263, "grad_norm": 0.32121843099594116, "learning_rate": 6.973343990417746e-07, "loss": 0.3052, "step": 5117 }, { "epoch": 2.5407909978487506, "grad_norm": 0.3400506377220154, "learning_rate": 6.958634326466313e-07, "loss": 0.373, "step": 5118 }, { "epoch": 2.5412874400132384, "grad_norm": 0.2802794575691223, "learning_rate": 6.943939032857094e-07, "loss": 0.3029, "step": 5119 }, { "epoch": 2.541783882177726, "grad_norm": 0.30908775329589844, "learning_rate": 6.929258114496407e-07, "loss": 0.3438, "step": 5120 }, { "epoch": 2.5422803243422143, "grad_norm": 0.3162083923816681, "learning_rate": 6.914591576285862e-07, "loss": 0.3083, "step": 5121 }, { "epoch": 2.542776766506702, "grad_norm": 0.3295953869819641, "learning_rate": 6.899939423122181e-07, "loss": 0.3903, "step": 5122 }, { "epoch": 2.5432732086711898, "grad_norm": 0.2863773703575134, "learning_rate": 6.885301659897336e-07, "loss": 0.3055, "step": 5123 }, { "epoch": 2.5437696508356775, "grad_norm": 0.3183149993419647, "learning_rate": 6.870678291498467e-07, "loss": 0.3311, "step": 5124 }, { "epoch": 2.5442660930001653, "grad_norm": 0.3035643994808197, "learning_rate": 6.856069322807946e-07, "loss": 0.2972, "step": 5125 }, { "epoch": 2.5447625351646534, "grad_norm": 0.3422996699810028, "learning_rate": 6.841474758703276e-07, "loss": 0.3858, "step": 5126 }, { "epoch": 2.545258977329141, "grad_norm": 0.31344693899154663, "learning_rate": 6.826894604057199e-07, "loss": 0.36, "step": 5127 }, { "epoch": 2.545755419493629, "grad_norm": 0.3057853579521179, "learning_rate": 6.812328863737632e-07, "loss": 0.2804, "step": 5128 }, { "epoch": 2.5462518616581167, "grad_norm": 0.3345441222190857, "learning_rate": 6.797777542607686e-07, "loss": 0.348, "step": 5129 }, { "epoch": 2.5467483038226044, "grad_norm": 0.3127901554107666, "learning_rate": 6.783240645525657e-07, "loss": 0.3395, "step": 5130 }, { "epoch": 2.5472447459870926, "grad_norm": 0.30488210916519165, "learning_rate": 6.768718177344985e-07, "loss": 0.3349, "step": 5131 }, { "epoch": 2.5477411881515803, "grad_norm": 0.30771103501319885, "learning_rate": 6.75421014291438e-07, "loss": 0.3009, "step": 5132 }, { "epoch": 2.548237630316068, "grad_norm": 0.3143271207809448, "learning_rate": 6.739716547077635e-07, "loss": 0.3403, "step": 5133 }, { "epoch": 2.5487340724805563, "grad_norm": 0.29900866746902466, "learning_rate": 6.72523739467379e-07, "loss": 0.3364, "step": 5134 }, { "epoch": 2.5492305146450436, "grad_norm": 0.2929685413837433, "learning_rate": 6.710772690537037e-07, "loss": 0.365, "step": 5135 }, { "epoch": 2.5497269568095318, "grad_norm": 0.3274560272693634, "learning_rate": 6.696322439496744e-07, "loss": 0.3498, "step": 5136 }, { "epoch": 2.5502233989740195, "grad_norm": 0.31897902488708496, "learning_rate": 6.681886646377473e-07, "loss": 0.3335, "step": 5137 }, { "epoch": 2.5507198411385072, "grad_norm": 0.3138137757778168, "learning_rate": 6.667465315998906e-07, "loss": 0.3347, "step": 5138 }, { "epoch": 2.5512162833029954, "grad_norm": 0.28070688247680664, "learning_rate": 6.653058453175981e-07, "loss": 0.3262, "step": 5139 }, { "epoch": 2.551712725467483, "grad_norm": 0.32429012656211853, "learning_rate": 6.638666062718718e-07, "loss": 0.3299, "step": 5140 }, { "epoch": 2.552209167631971, "grad_norm": 0.3366736173629761, "learning_rate": 6.624288149432378e-07, "loss": 0.3099, "step": 5141 }, { "epoch": 2.5527056097964587, "grad_norm": 0.2979748249053955, "learning_rate": 6.609924718117311e-07, "loss": 0.3185, "step": 5142 }, { "epoch": 2.5532020519609464, "grad_norm": 0.32833969593048096, "learning_rate": 6.595575773569118e-07, "loss": 0.3368, "step": 5143 }, { "epoch": 2.5536984941254346, "grad_norm": 0.3098457455635071, "learning_rate": 6.581241320578519e-07, "loss": 0.3374, "step": 5144 }, { "epoch": 2.5541949362899223, "grad_norm": 0.2973994314670563, "learning_rate": 6.566921363931373e-07, "loss": 0.35, "step": 5145 }, { "epoch": 2.55469137845441, "grad_norm": 0.32967424392700195, "learning_rate": 6.552615908408739e-07, "loss": 0.3386, "step": 5146 }, { "epoch": 2.555187820618898, "grad_norm": 0.31924816966056824, "learning_rate": 6.538324958786818e-07, "loss": 0.3032, "step": 5147 }, { "epoch": 2.5556842627833856, "grad_norm": 0.29379674792289734, "learning_rate": 6.524048519836984e-07, "loss": 0.3062, "step": 5148 }, { "epoch": 2.5561807049478737, "grad_norm": 0.31677404046058655, "learning_rate": 6.509786596325718e-07, "loss": 0.3717, "step": 5149 }, { "epoch": 2.5566771471123615, "grad_norm": 0.3338976502418518, "learning_rate": 6.495539193014727e-07, "loss": 0.3378, "step": 5150 }, { "epoch": 2.5571735892768492, "grad_norm": 0.32763832807540894, "learning_rate": 6.481306314660801e-07, "loss": 0.325, "step": 5151 }, { "epoch": 2.557670031441337, "grad_norm": 0.2875607907772064, "learning_rate": 6.467087966015928e-07, "loss": 0.2959, "step": 5152 }, { "epoch": 2.5581664736058247, "grad_norm": 0.3196239769458771, "learning_rate": 6.452884151827222e-07, "loss": 0.3642, "step": 5153 }, { "epoch": 2.558662915770313, "grad_norm": 0.32423028349876404, "learning_rate": 6.438694876836954e-07, "loss": 0.3389, "step": 5154 }, { "epoch": 2.5591593579348006, "grad_norm": 0.31484121084213257, "learning_rate": 6.424520145782542e-07, "loss": 0.3621, "step": 5155 }, { "epoch": 2.5596558000992884, "grad_norm": 0.3093782067298889, "learning_rate": 6.410359963396534e-07, "loss": 0.3282, "step": 5156 }, { "epoch": 2.560152242263776, "grad_norm": 0.33503496646881104, "learning_rate": 6.396214334406631e-07, "loss": 0.3173, "step": 5157 }, { "epoch": 2.560648684428264, "grad_norm": 0.3282957971096039, "learning_rate": 6.382083263535677e-07, "loss": 0.3126, "step": 5158 }, { "epoch": 2.561145126592752, "grad_norm": 0.3661416172981262, "learning_rate": 6.367966755501647e-07, "loss": 0.3495, "step": 5159 }, { "epoch": 2.56164156875724, "grad_norm": 0.3226284086704254, "learning_rate": 6.35386481501768e-07, "loss": 0.3086, "step": 5160 }, { "epoch": 2.5621380109217275, "grad_norm": 0.3273298144340515, "learning_rate": 6.339777446791994e-07, "loss": 0.3568, "step": 5161 }, { "epoch": 2.5626344530862153, "grad_norm": 0.2950994074344635, "learning_rate": 6.32570465552802e-07, "loss": 0.2873, "step": 5162 }, { "epoch": 2.563130895250703, "grad_norm": 0.3133484423160553, "learning_rate": 6.311646445924246e-07, "loss": 0.3079, "step": 5163 }, { "epoch": 2.563627337415191, "grad_norm": 0.3107456862926483, "learning_rate": 6.297602822674343e-07, "loss": 0.379, "step": 5164 }, { "epoch": 2.564123779579679, "grad_norm": 0.28560253977775574, "learning_rate": 6.283573790467091e-07, "loss": 0.3132, "step": 5165 }, { "epoch": 2.5646202217441667, "grad_norm": 0.32742998003959656, "learning_rate": 6.269559353986404e-07, "loss": 0.335, "step": 5166 }, { "epoch": 2.565116663908655, "grad_norm": 0.31301817297935486, "learning_rate": 6.255559517911336e-07, "loss": 0.3137, "step": 5167 }, { "epoch": 2.5656131060731426, "grad_norm": 0.3015623986721039, "learning_rate": 6.241574286916007e-07, "loss": 0.3541, "step": 5168 }, { "epoch": 2.5661095482376304, "grad_norm": 0.29796913266181946, "learning_rate": 6.227603665669762e-07, "loss": 0.311, "step": 5169 }, { "epoch": 2.566605990402118, "grad_norm": 0.33391380310058594, "learning_rate": 6.21364765883698e-07, "loss": 0.3272, "step": 5170 }, { "epoch": 2.567102432566606, "grad_norm": 0.3293060064315796, "learning_rate": 6.199706271077199e-07, "loss": 0.3483, "step": 5171 }, { "epoch": 2.567598874731094, "grad_norm": 0.31433209776878357, "learning_rate": 6.185779507045053e-07, "loss": 0.3301, "step": 5172 }, { "epoch": 2.568095316895582, "grad_norm": 0.300067275762558, "learning_rate": 6.171867371390345e-07, "loss": 0.3206, "step": 5173 }, { "epoch": 2.5685917590600695, "grad_norm": 0.28140369057655334, "learning_rate": 6.157969868757923e-07, "loss": 0.3105, "step": 5174 }, { "epoch": 2.5690882012245573, "grad_norm": 0.3144219219684601, "learning_rate": 6.144087003787807e-07, "loss": 0.322, "step": 5175 }, { "epoch": 2.569584643389045, "grad_norm": 0.3215709626674652, "learning_rate": 6.130218781115105e-07, "loss": 0.2988, "step": 5176 }, { "epoch": 2.570081085553533, "grad_norm": 0.3128758370876312, "learning_rate": 6.116365205370034e-07, "loss": 0.3282, "step": 5177 }, { "epoch": 2.570577527718021, "grad_norm": 0.3216630220413208, "learning_rate": 6.102526281177939e-07, "loss": 0.3576, "step": 5178 }, { "epoch": 2.5710739698825087, "grad_norm": 0.31718289852142334, "learning_rate": 6.088702013159231e-07, "loss": 0.3108, "step": 5179 }, { "epoch": 2.5715704120469964, "grad_norm": 0.32607221603393555, "learning_rate": 6.0748924059295e-07, "loss": 0.3543, "step": 5180 }, { "epoch": 2.572066854211484, "grad_norm": 0.3297826945781708, "learning_rate": 6.061097464099363e-07, "loss": 0.3274, "step": 5181 }, { "epoch": 2.5725632963759724, "grad_norm": 0.36041322350502014, "learning_rate": 6.047317192274593e-07, "loss": 0.347, "step": 5182 }, { "epoch": 2.57305973854046, "grad_norm": 0.31144261360168457, "learning_rate": 6.033551595056048e-07, "loss": 0.3043, "step": 5183 }, { "epoch": 2.573556180704948, "grad_norm": 0.3241175711154938, "learning_rate": 6.019800677039677e-07, "loss": 0.3286, "step": 5184 }, { "epoch": 2.5740526228694356, "grad_norm": 0.3404695391654968, "learning_rate": 6.006064442816556e-07, "loss": 0.3531, "step": 5185 }, { "epoch": 2.5745490650339233, "grad_norm": 0.3120766282081604, "learning_rate": 5.99234289697282e-07, "loss": 0.3084, "step": 5186 }, { "epoch": 2.5750455071984115, "grad_norm": 0.3083415627479553, "learning_rate": 5.978636044089731e-07, "loss": 0.3633, "step": 5187 }, { "epoch": 2.5755419493628993, "grad_norm": 0.29919755458831787, "learning_rate": 5.96494388874363e-07, "loss": 0.3567, "step": 5188 }, { "epoch": 2.576038391527387, "grad_norm": 0.3009760081768036, "learning_rate": 5.951266435505959e-07, "loss": 0.3031, "step": 5189 }, { "epoch": 2.5765348336918747, "grad_norm": 0.3248187005519867, "learning_rate": 5.937603688943244e-07, "loss": 0.3263, "step": 5190 }, { "epoch": 2.5770312758563625, "grad_norm": 0.3276160657405853, "learning_rate": 5.923955653617109e-07, "loss": 0.3553, "step": 5191 }, { "epoch": 2.5775277180208507, "grad_norm": 0.3165351450443268, "learning_rate": 5.910322334084273e-07, "loss": 0.3286, "step": 5192 }, { "epoch": 2.5780241601853384, "grad_norm": 0.3136853277683258, "learning_rate": 5.896703734896508e-07, "loss": 0.3902, "step": 5193 }, { "epoch": 2.578520602349826, "grad_norm": 0.3324239253997803, "learning_rate": 5.883099860600699e-07, "loss": 0.3223, "step": 5194 }, { "epoch": 2.5790170445143143, "grad_norm": 0.30111414194107056, "learning_rate": 5.869510715738824e-07, "loss": 0.3291, "step": 5195 }, { "epoch": 2.5795134866788016, "grad_norm": 0.30762460827827454, "learning_rate": 5.855936304847926e-07, "loss": 0.3323, "step": 5196 }, { "epoch": 2.58000992884329, "grad_norm": 0.3432765603065491, "learning_rate": 5.842376632460117e-07, "loss": 0.3641, "step": 5197 }, { "epoch": 2.5805063710077776, "grad_norm": 0.29140445590019226, "learning_rate": 5.828831703102616e-07, "loss": 0.3065, "step": 5198 }, { "epoch": 2.5810028131722653, "grad_norm": 0.31527450680732727, "learning_rate": 5.815301521297701e-07, "loss": 0.3374, "step": 5199 }, { "epoch": 2.5814992553367535, "grad_norm": 0.2916865348815918, "learning_rate": 5.801786091562733e-07, "loss": 0.3225, "step": 5200 }, { "epoch": 2.5819956975012412, "grad_norm": 0.3328399658203125, "learning_rate": 5.788285418410161e-07, "loss": 0.3691, "step": 5201 }, { "epoch": 2.582492139665729, "grad_norm": 0.31618762016296387, "learning_rate": 5.774799506347461e-07, "loss": 0.3146, "step": 5202 }, { "epoch": 2.5829885818302167, "grad_norm": 0.3065619170665741, "learning_rate": 5.76132835987725e-07, "loss": 0.2913, "step": 5203 }, { "epoch": 2.5834850239947045, "grad_norm": 0.3094007074832916, "learning_rate": 5.747871983497144e-07, "loss": 0.3737, "step": 5204 }, { "epoch": 2.5839814661591927, "grad_norm": 0.28340938687324524, "learning_rate": 5.734430381699884e-07, "loss": 0.2894, "step": 5205 }, { "epoch": 2.5844779083236804, "grad_norm": 0.31676092743873596, "learning_rate": 5.721003558973243e-07, "loss": 0.3576, "step": 5206 }, { "epoch": 2.584974350488168, "grad_norm": 0.3340267241001129, "learning_rate": 5.707591519800082e-07, "loss": 0.3295, "step": 5207 }, { "epoch": 2.585470792652656, "grad_norm": 0.33918216824531555, "learning_rate": 5.694194268658315e-07, "loss": 0.3593, "step": 5208 }, { "epoch": 2.5859672348171436, "grad_norm": 0.3253670632839203, "learning_rate": 5.680811810020903e-07, "loss": 0.2819, "step": 5209 }, { "epoch": 2.586463676981632, "grad_norm": 0.338967502117157, "learning_rate": 5.667444148355916e-07, "loss": 0.3539, "step": 5210 }, { "epoch": 2.5869601191461196, "grad_norm": 0.29674801230430603, "learning_rate": 5.654091288126429e-07, "loss": 0.3081, "step": 5211 }, { "epoch": 2.5874565613106073, "grad_norm": 0.29947343468666077, "learning_rate": 5.640753233790602e-07, "loss": 0.333, "step": 5212 }, { "epoch": 2.587953003475095, "grad_norm": 0.29281556606292725, "learning_rate": 5.627429989801653e-07, "loss": 0.349, "step": 5213 }, { "epoch": 2.588449445639583, "grad_norm": 0.3130631744861603, "learning_rate": 5.614121560607849e-07, "loss": 0.3339, "step": 5214 }, { "epoch": 2.588945887804071, "grad_norm": 0.3132879436016083, "learning_rate": 5.600827950652532e-07, "loss": 0.3203, "step": 5215 }, { "epoch": 2.5894423299685587, "grad_norm": 0.30341869592666626, "learning_rate": 5.58754916437404e-07, "loss": 0.3247, "step": 5216 }, { "epoch": 2.5899387721330465, "grad_norm": 0.301573246717453, "learning_rate": 5.574285206205826e-07, "loss": 0.3245, "step": 5217 }, { "epoch": 2.590435214297534, "grad_norm": 0.32337233424186707, "learning_rate": 5.561036080576354e-07, "loss": 0.3268, "step": 5218 }, { "epoch": 2.590931656462022, "grad_norm": 0.29031792283058167, "learning_rate": 5.547801791909163e-07, "loss": 0.2584, "step": 5219 }, { "epoch": 2.59142809862651, "grad_norm": 0.3250426948070526, "learning_rate": 5.534582344622785e-07, "loss": 0.3516, "step": 5220 }, { "epoch": 2.591924540790998, "grad_norm": 0.32200801372528076, "learning_rate": 5.521377743130885e-07, "loss": 0.3407, "step": 5221 }, { "epoch": 2.5924209829554856, "grad_norm": 0.3100559711456299, "learning_rate": 5.508187991842085e-07, "loss": 0.3443, "step": 5222 }, { "epoch": 2.5929174251199734, "grad_norm": 0.3241822421550751, "learning_rate": 5.49501309516009e-07, "loss": 0.3423, "step": 5223 }, { "epoch": 2.593413867284461, "grad_norm": 0.31480497121810913, "learning_rate": 5.481853057483644e-07, "loss": 0.3352, "step": 5224 }, { "epoch": 2.5939103094489493, "grad_norm": 0.30745235085487366, "learning_rate": 5.468707883206525e-07, "loss": 0.3405, "step": 5225 }, { "epoch": 2.594406751613437, "grad_norm": 0.31789708137512207, "learning_rate": 5.455577576717563e-07, "loss": 0.3138, "step": 5226 }, { "epoch": 2.5949031937779248, "grad_norm": 0.3344665765762329, "learning_rate": 5.442462142400589e-07, "loss": 0.3344, "step": 5227 }, { "epoch": 2.595399635942413, "grad_norm": 0.32535988092422485, "learning_rate": 5.429361584634496e-07, "loss": 0.3534, "step": 5228 }, { "epoch": 2.5958960781069007, "grad_norm": 0.3142169117927551, "learning_rate": 5.416275907793212e-07, "loss": 0.296, "step": 5229 }, { "epoch": 2.5963925202713884, "grad_norm": 0.3428489565849304, "learning_rate": 5.40320511624568e-07, "loss": 0.3564, "step": 5230 }, { "epoch": 2.596888962435876, "grad_norm": 0.30342450737953186, "learning_rate": 5.390149214355884e-07, "loss": 0.3158, "step": 5231 }, { "epoch": 2.597385404600364, "grad_norm": 0.3440493047237396, "learning_rate": 5.37710820648284e-07, "loss": 0.3281, "step": 5232 }, { "epoch": 2.597881846764852, "grad_norm": 0.3311026096343994, "learning_rate": 5.364082096980589e-07, "loss": 0.3776, "step": 5233 }, { "epoch": 2.59837828892934, "grad_norm": 0.3003179132938385, "learning_rate": 5.351070890198184e-07, "loss": 0.3382, "step": 5234 }, { "epoch": 2.5988747310938276, "grad_norm": 0.2928702235221863, "learning_rate": 5.338074590479714e-07, "loss": 0.3393, "step": 5235 }, { "epoch": 2.5993711732583153, "grad_norm": 0.31340545415878296, "learning_rate": 5.3250932021643e-07, "loss": 0.3405, "step": 5236 }, { "epoch": 2.599867615422803, "grad_norm": 0.2903567850589752, "learning_rate": 5.312126729586065e-07, "loss": 0.3206, "step": 5237 }, { "epoch": 2.6003640575872913, "grad_norm": 0.3208666741847992, "learning_rate": 5.299175177074173e-07, "loss": 0.3199, "step": 5238 }, { "epoch": 2.600860499751779, "grad_norm": 0.3334077000617981, "learning_rate": 5.286238548952771e-07, "loss": 0.3617, "step": 5239 }, { "epoch": 2.6013569419162668, "grad_norm": 0.3116713762283325, "learning_rate": 5.273316849541088e-07, "loss": 0.2838, "step": 5240 }, { "epoch": 2.6018533840807545, "grad_norm": 0.3469405174255371, "learning_rate": 5.260410083153289e-07, "loss": 0.3682, "step": 5241 }, { "epoch": 2.6023498262452422, "grad_norm": 0.3120805323123932, "learning_rate": 5.247518254098627e-07, "loss": 0.2945, "step": 5242 }, { "epoch": 2.6028462684097304, "grad_norm": 0.3254571557044983, "learning_rate": 5.234641366681287e-07, "loss": 0.3162, "step": 5243 }, { "epoch": 2.603342710574218, "grad_norm": 0.3073902726173401, "learning_rate": 5.221779425200563e-07, "loss": 0.3555, "step": 5244 }, { "epoch": 2.603839152738706, "grad_norm": 0.32329195737838745, "learning_rate": 5.208932433950675e-07, "loss": 0.349, "step": 5245 }, { "epoch": 2.6043355949031937, "grad_norm": 0.3176155686378479, "learning_rate": 5.196100397220893e-07, "loss": 0.3313, "step": 5246 }, { "epoch": 2.6048320370676814, "grad_norm": 0.29285770654678345, "learning_rate": 5.183283319295485e-07, "loss": 0.3176, "step": 5247 }, { "epoch": 2.6053284792321696, "grad_norm": 0.28777024149894714, "learning_rate": 5.170481204453725e-07, "loss": 0.2835, "step": 5248 }, { "epoch": 2.6058249213966573, "grad_norm": 0.33983513712882996, "learning_rate": 5.157694056969903e-07, "loss": 0.4156, "step": 5249 }, { "epoch": 2.606321363561145, "grad_norm": 0.3031575083732605, "learning_rate": 5.144921881113269e-07, "loss": 0.3078, "step": 5250 }, { "epoch": 2.606817805725633, "grad_norm": 0.3331718444824219, "learning_rate": 5.132164681148144e-07, "loss": 0.3205, "step": 5251 }, { "epoch": 2.6073142478901206, "grad_norm": 0.3102482855319977, "learning_rate": 5.119422461333784e-07, "loss": 0.2936, "step": 5252 }, { "epoch": 2.6078106900546087, "grad_norm": 0.3466714918613434, "learning_rate": 5.10669522592448e-07, "loss": 0.3644, "step": 5253 }, { "epoch": 2.6083071322190965, "grad_norm": 0.3156680762767792, "learning_rate": 5.093982979169503e-07, "loss": 0.3535, "step": 5254 }, { "epoch": 2.6088035743835842, "grad_norm": 0.29712018370628357, "learning_rate": 5.081285725313134e-07, "loss": 0.3242, "step": 5255 }, { "epoch": 2.6093000165480724, "grad_norm": 0.32821211218833923, "learning_rate": 5.068603468594646e-07, "loss": 0.3419, "step": 5256 }, { "epoch": 2.6097964587125597, "grad_norm": 0.3076991140842438, "learning_rate": 5.055936213248286e-07, "loss": 0.3024, "step": 5257 }, { "epoch": 2.610292900877048, "grad_norm": 0.3142346739768982, "learning_rate": 5.043283963503309e-07, "loss": 0.3622, "step": 5258 }, { "epoch": 2.6107893430415356, "grad_norm": 0.3228954076766968, "learning_rate": 5.030646723583959e-07, "loss": 0.3398, "step": 5259 }, { "epoch": 2.6112857852060234, "grad_norm": 0.31439390778541565, "learning_rate": 5.018024497709473e-07, "loss": 0.31, "step": 5260 }, { "epoch": 2.6117822273705116, "grad_norm": 0.30324220657348633, "learning_rate": 5.005417290094061e-07, "loss": 0.3326, "step": 5261 }, { "epoch": 2.6122786695349993, "grad_norm": 0.31573644280433655, "learning_rate": 4.992825104946936e-07, "loss": 0.3498, "step": 5262 }, { "epoch": 2.612775111699487, "grad_norm": 0.29416918754577637, "learning_rate": 4.980247946472289e-07, "loss": 0.3264, "step": 5263 }, { "epoch": 2.613271553863975, "grad_norm": 0.2950877547264099, "learning_rate": 4.967685818869273e-07, "loss": 0.3326, "step": 5264 }, { "epoch": 2.6137679960284625, "grad_norm": 0.3268579840660095, "learning_rate": 4.955138726332054e-07, "loss": 0.3211, "step": 5265 }, { "epoch": 2.6142644381929507, "grad_norm": 0.3110427260398865, "learning_rate": 4.94260667304976e-07, "loss": 0.3271, "step": 5266 }, { "epoch": 2.6147608803574385, "grad_norm": 0.32809099555015564, "learning_rate": 4.930089663206516e-07, "loss": 0.3385, "step": 5267 }, { "epoch": 2.615257322521926, "grad_norm": 0.30347949266433716, "learning_rate": 4.917587700981391e-07, "loss": 0.3124, "step": 5268 }, { "epoch": 2.615753764686414, "grad_norm": 0.30891740322113037, "learning_rate": 4.905100790548462e-07, "loss": 0.3264, "step": 5269 }, { "epoch": 2.6162502068509017, "grad_norm": 0.2939970791339874, "learning_rate": 4.892628936076766e-07, "loss": 0.2759, "step": 5270 }, { "epoch": 2.61674664901539, "grad_norm": 0.316761314868927, "learning_rate": 4.880172141730316e-07, "loss": 0.3668, "step": 5271 }, { "epoch": 2.6172430911798776, "grad_norm": 0.2997087240219116, "learning_rate": 4.867730411668103e-07, "loss": 0.3232, "step": 5272 }, { "epoch": 2.6177395333443654, "grad_norm": 0.30701908469200134, "learning_rate": 4.855303750044077e-07, "loss": 0.2982, "step": 5273 }, { "epoch": 2.618235975508853, "grad_norm": 0.30238261818885803, "learning_rate": 4.842892161007173e-07, "loss": 0.2891, "step": 5274 }, { "epoch": 2.618732417673341, "grad_norm": 0.34054887294769287, "learning_rate": 4.830495648701266e-07, "loss": 0.3279, "step": 5275 }, { "epoch": 2.619228859837829, "grad_norm": 0.31992319226264954, "learning_rate": 4.818114217265219e-07, "loss": 0.3273, "step": 5276 }, { "epoch": 2.619725302002317, "grad_norm": 0.30309030413627625, "learning_rate": 4.805747870832867e-07, "loss": 0.2852, "step": 5277 }, { "epoch": 2.6202217441668045, "grad_norm": 0.3225860297679901, "learning_rate": 4.79339661353298e-07, "loss": 0.3082, "step": 5278 }, { "epoch": 2.6207181863312923, "grad_norm": 0.3279791474342346, "learning_rate": 4.781060449489333e-07, "loss": 0.2942, "step": 5279 }, { "epoch": 2.62121462849578, "grad_norm": 0.3098865747451782, "learning_rate": 4.768739382820597e-07, "loss": 0.328, "step": 5280 }, { "epoch": 2.621711070660268, "grad_norm": 0.28955772519111633, "learning_rate": 4.7564334176404827e-07, "loss": 0.3037, "step": 5281 }, { "epoch": 2.622207512824756, "grad_norm": 0.3094613254070282, "learning_rate": 4.7441425580575904e-07, "loss": 0.3283, "step": 5282 }, { "epoch": 2.6227039549892437, "grad_norm": 0.32700100541114807, "learning_rate": 4.7318668081755116e-07, "loss": 0.3353, "step": 5283 }, { "epoch": 2.6232003971537314, "grad_norm": 0.316816121339798, "learning_rate": 4.7196061720927835e-07, "loss": 0.3234, "step": 5284 }, { "epoch": 2.623696839318219, "grad_norm": 0.3234062194824219, "learning_rate": 4.707360653902904e-07, "loss": 0.3183, "step": 5285 }, { "epoch": 2.6241932814827074, "grad_norm": 0.32205313444137573, "learning_rate": 4.695130257694325e-07, "loss": 0.3491, "step": 5286 }, { "epoch": 2.624689723647195, "grad_norm": 0.3208634853363037, "learning_rate": 4.682914987550413e-07, "loss": 0.3181, "step": 5287 }, { "epoch": 2.625186165811683, "grad_norm": 0.3415628969669342, "learning_rate": 4.6707148475495623e-07, "loss": 0.3442, "step": 5288 }, { "epoch": 2.625682607976171, "grad_norm": 0.3054746091365814, "learning_rate": 4.6585298417650306e-07, "loss": 0.3511, "step": 5289 }, { "epoch": 2.6261790501406588, "grad_norm": 0.29120054841041565, "learning_rate": 4.6463599742650745e-07, "loss": 0.2859, "step": 5290 }, { "epoch": 2.6266754923051465, "grad_norm": 0.30436891317367554, "learning_rate": 4.6342052491128664e-07, "loss": 0.3324, "step": 5291 }, { "epoch": 2.6271719344696343, "grad_norm": 0.2974749803543091, "learning_rate": 4.622065670366571e-07, "loss": 0.3029, "step": 5292 }, { "epoch": 2.627668376634122, "grad_norm": 0.32376107573509216, "learning_rate": 4.6099412420792354e-07, "loss": 0.3333, "step": 5293 }, { "epoch": 2.62816481879861, "grad_norm": 0.3350920081138611, "learning_rate": 4.5978319682988826e-07, "loss": 0.3459, "step": 5294 }, { "epoch": 2.628661260963098, "grad_norm": 0.2858850657939911, "learning_rate": 4.5857378530684724e-07, "loss": 0.2886, "step": 5295 }, { "epoch": 2.6291577031275857, "grad_norm": 0.3351372182369232, "learning_rate": 4.573658900425909e-07, "loss": 0.3685, "step": 5296 }, { "epoch": 2.6296541452920734, "grad_norm": 0.3325794041156769, "learning_rate": 4.561595114404022e-07, "loss": 0.3672, "step": 5297 }, { "epoch": 2.630150587456561, "grad_norm": 0.3171500563621521, "learning_rate": 4.5495464990305715e-07, "loss": 0.301, "step": 5298 }, { "epoch": 2.6306470296210493, "grad_norm": 0.33820414543151855, "learning_rate": 4.537513058328269e-07, "loss": 0.3121, "step": 5299 }, { "epoch": 2.631143471785537, "grad_norm": 0.3038290739059448, "learning_rate": 4.5254947963147553e-07, "loss": 0.3003, "step": 5300 }, { "epoch": 2.631639913950025, "grad_norm": 0.3117413818836212, "learning_rate": 4.513491717002599e-07, "loss": 0.3538, "step": 5301 }, { "epoch": 2.6321363561145126, "grad_norm": 0.322177529335022, "learning_rate": 4.501503824399306e-07, "loss": 0.3031, "step": 5302 }, { "epoch": 2.6326327982790003, "grad_norm": 0.3193669319152832, "learning_rate": 4.4895311225073014e-07, "loss": 0.3188, "step": 5303 }, { "epoch": 2.6331292404434885, "grad_norm": 0.3275974690914154, "learning_rate": 4.4775736153239657e-07, "loss": 0.3218, "step": 5304 }, { "epoch": 2.6336256826079762, "grad_norm": 0.3248395025730133, "learning_rate": 4.465631306841556e-07, "loss": 0.3111, "step": 5305 }, { "epoch": 2.634122124772464, "grad_norm": 0.2977208197116852, "learning_rate": 4.453704201047293e-07, "loss": 0.3112, "step": 5306 }, { "epoch": 2.6346185669369517, "grad_norm": 0.30518198013305664, "learning_rate": 4.44179230192332e-07, "loss": 0.3364, "step": 5307 }, { "epoch": 2.6351150091014395, "grad_norm": 0.306215763092041, "learning_rate": 4.429895613446694e-07, "loss": 0.317, "step": 5308 }, { "epoch": 2.6356114512659277, "grad_norm": 0.3142072260379791, "learning_rate": 4.41801413958941e-07, "loss": 0.3116, "step": 5309 }, { "epoch": 2.6361078934304154, "grad_norm": 0.3124794363975525, "learning_rate": 4.4061478843183294e-07, "loss": 0.3252, "step": 5310 }, { "epoch": 2.636604335594903, "grad_norm": 0.3238534927368164, "learning_rate": 4.39429685159532e-07, "loss": 0.3164, "step": 5311 }, { "epoch": 2.637100777759391, "grad_norm": 0.3107975423336029, "learning_rate": 4.38246104537709e-07, "loss": 0.3426, "step": 5312 }, { "epoch": 2.6375972199238786, "grad_norm": 0.3363976776599884, "learning_rate": 4.3706404696153003e-07, "loss": 0.3242, "step": 5313 }, { "epoch": 2.638093662088367, "grad_norm": 0.345651775598526, "learning_rate": 4.3588351282565166e-07, "loss": 0.3608, "step": 5314 }, { "epoch": 2.6385901042528546, "grad_norm": 0.29773658514022827, "learning_rate": 4.3470450252422416e-07, "loss": 0.3507, "step": 5315 }, { "epoch": 2.6390865464173423, "grad_norm": 0.30998602509498596, "learning_rate": 4.335270164508837e-07, "loss": 0.3287, "step": 5316 }, { "epoch": 2.6395829885818305, "grad_norm": 0.32093456387519836, "learning_rate": 4.3235105499876306e-07, "loss": 0.3416, "step": 5317 }, { "epoch": 2.640079430746318, "grad_norm": 0.323691189289093, "learning_rate": 4.311766185604832e-07, "loss": 0.3076, "step": 5318 }, { "epoch": 2.640575872910806, "grad_norm": 0.31169047951698303, "learning_rate": 4.3000370752815655e-07, "loss": 0.3284, "step": 5319 }, { "epoch": 2.6410723150752937, "grad_norm": 0.32264581322669983, "learning_rate": 4.2883232229338766e-07, "loss": 0.334, "step": 5320 }, { "epoch": 2.6415687572397815, "grad_norm": 0.31804540753364563, "learning_rate": 4.27662463247267e-07, "loss": 0.3141, "step": 5321 }, { "epoch": 2.6420651994042696, "grad_norm": 0.32129180431365967, "learning_rate": 4.2649413078038215e-07, "loss": 0.3151, "step": 5322 }, { "epoch": 2.6425616415687574, "grad_norm": 0.31126779317855835, "learning_rate": 4.2532732528280497e-07, "loss": 0.3367, "step": 5323 }, { "epoch": 2.643058083733245, "grad_norm": 0.33466586470603943, "learning_rate": 4.241620471441016e-07, "loss": 0.3477, "step": 5324 }, { "epoch": 2.643554525897733, "grad_norm": 0.30245548486709595, "learning_rate": 4.2299829675332636e-07, "loss": 0.3178, "step": 5325 }, { "epoch": 2.6440509680622206, "grad_norm": 0.29840990900993347, "learning_rate": 4.2183607449902355e-07, "loss": 0.311, "step": 5326 }, { "epoch": 2.644547410226709, "grad_norm": 0.31806954741477966, "learning_rate": 4.2067538076922874e-07, "loss": 0.3347, "step": 5327 }, { "epoch": 2.6450438523911965, "grad_norm": 0.31173205375671387, "learning_rate": 4.195162159514632e-07, "loss": 0.3524, "step": 5328 }, { "epoch": 2.6455402945556843, "grad_norm": 0.2946256995201111, "learning_rate": 4.1835858043274445e-07, "loss": 0.3094, "step": 5329 }, { "epoch": 2.646036736720172, "grad_norm": 0.30247417092323303, "learning_rate": 4.172024745995729e-07, "loss": 0.3359, "step": 5330 }, { "epoch": 2.6465331788846598, "grad_norm": 0.32096388936042786, "learning_rate": 4.160478988379413e-07, "loss": 0.3328, "step": 5331 }, { "epoch": 2.647029621049148, "grad_norm": 0.3321000635623932, "learning_rate": 4.148948535333319e-07, "loss": 0.3432, "step": 5332 }, { "epoch": 2.6475260632136357, "grad_norm": 0.28672856092453003, "learning_rate": 4.1374333907071406e-07, "loss": 0.3227, "step": 5333 }, { "epoch": 2.6480225053781234, "grad_norm": 0.30879586935043335, "learning_rate": 4.1259335583454854e-07, "loss": 0.3438, "step": 5334 }, { "epoch": 2.648518947542611, "grad_norm": 0.28739458322525024, "learning_rate": 4.114449042087826e-07, "loss": 0.36, "step": 5335 }, { "epoch": 2.649015389707099, "grad_norm": 0.301638126373291, "learning_rate": 4.102979845768523e-07, "loss": 0.3369, "step": 5336 }, { "epoch": 2.649511831871587, "grad_norm": 0.30587247014045715, "learning_rate": 4.0915259732168425e-07, "loss": 0.3236, "step": 5337 }, { "epoch": 2.650008274036075, "grad_norm": 0.3258969485759735, "learning_rate": 4.080087428256924e-07, "loss": 0.2939, "step": 5338 }, { "epoch": 2.6505047162005626, "grad_norm": 0.3343387842178345, "learning_rate": 4.068664214707768e-07, "loss": 0.3843, "step": 5339 }, { "epoch": 2.6510011583650503, "grad_norm": 0.3199111521244049, "learning_rate": 4.0572563363832864e-07, "loss": 0.3114, "step": 5340 }, { "epoch": 2.651497600529538, "grad_norm": 0.3432503044605255, "learning_rate": 4.0458637970922645e-07, "loss": 0.3332, "step": 5341 }, { "epoch": 2.6519940426940263, "grad_norm": 0.3400680124759674, "learning_rate": 4.034486600638349e-07, "loss": 0.3246, "step": 5342 }, { "epoch": 2.652490484858514, "grad_norm": 0.31343820691108704, "learning_rate": 4.02312475082009e-07, "loss": 0.322, "step": 5343 }, { "epoch": 2.6529869270230018, "grad_norm": 0.30441945791244507, "learning_rate": 4.011778251430892e-07, "loss": 0.2935, "step": 5344 }, { "epoch": 2.6534833691874895, "grad_norm": 0.306130051612854, "learning_rate": 4.000447106259059e-07, "loss": 0.311, "step": 5345 }, { "epoch": 2.6539798113519772, "grad_norm": 0.302301287651062, "learning_rate": 3.9891313190877243e-07, "loss": 0.3181, "step": 5346 }, { "epoch": 2.6544762535164654, "grad_norm": 0.2984433174133301, "learning_rate": 3.977830893694934e-07, "loss": 0.3049, "step": 5347 }, { "epoch": 2.654972695680953, "grad_norm": 0.2941012382507324, "learning_rate": 3.9665458338536023e-07, "loss": 0.3146, "step": 5348 }, { "epoch": 2.655469137845441, "grad_norm": 0.3283845782279968, "learning_rate": 3.9552761433314936e-07, "loss": 0.32, "step": 5349 }, { "epoch": 2.655965580009929, "grad_norm": 0.31832918524742126, "learning_rate": 3.944021825891259e-07, "loss": 0.3664, "step": 5350 }, { "epoch": 2.6564620221744164, "grad_norm": 0.312012642621994, "learning_rate": 3.932782885290393e-07, "loss": 0.3131, "step": 5351 }, { "epoch": 2.6569584643389046, "grad_norm": 0.298179566860199, "learning_rate": 3.921559325281299e-07, "loss": 0.3103, "step": 5352 }, { "epoch": 2.6574549065033923, "grad_norm": 0.3289341926574707, "learning_rate": 3.9103511496111965e-07, "loss": 0.3838, "step": 5353 }, { "epoch": 2.65795134866788, "grad_norm": 0.33185872435569763, "learning_rate": 3.899158362022193e-07, "loss": 0.3322, "step": 5354 }, { "epoch": 2.6584477908323683, "grad_norm": 0.320936918258667, "learning_rate": 3.887980966251265e-07, "loss": 0.3391, "step": 5355 }, { "epoch": 2.658944232996856, "grad_norm": 0.30239981412887573, "learning_rate": 3.876818966030238e-07, "loss": 0.3479, "step": 5356 }, { "epoch": 2.6594406751613437, "grad_norm": 0.3398089110851288, "learning_rate": 3.865672365085804e-07, "loss": 0.3482, "step": 5357 }, { "epoch": 2.6599371173258315, "grad_norm": 0.29597821831703186, "learning_rate": 3.8545411671394914e-07, "loss": 0.3431, "step": 5358 }, { "epoch": 2.6604335594903192, "grad_norm": 0.3265974223613739, "learning_rate": 3.843425375907739e-07, "loss": 0.32, "step": 5359 }, { "epoch": 2.6609300016548074, "grad_norm": 0.34001171588897705, "learning_rate": 3.832324995101777e-07, "loss": 0.3593, "step": 5360 }, { "epoch": 2.661426443819295, "grad_norm": 0.278868168592453, "learning_rate": 3.8212400284277364e-07, "loss": 0.2818, "step": 5361 }, { "epoch": 2.661922885983783, "grad_norm": 0.301200270652771, "learning_rate": 3.810170479586567e-07, "loss": 0.3612, "step": 5362 }, { "epoch": 2.6624193281482706, "grad_norm": 0.3438519537448883, "learning_rate": 3.799116352274124e-07, "loss": 0.3683, "step": 5363 }, { "epoch": 2.6629157703127584, "grad_norm": 0.31301912665367126, "learning_rate": 3.788077650181049e-07, "loss": 0.2894, "step": 5364 }, { "epoch": 2.6634122124772466, "grad_norm": 0.29822614789009094, "learning_rate": 3.7770543769928724e-07, "loss": 0.3295, "step": 5365 }, { "epoch": 2.6639086546417343, "grad_norm": 0.3199133276939392, "learning_rate": 3.766046536389978e-07, "loss": 0.3451, "step": 5366 }, { "epoch": 2.664405096806222, "grad_norm": 0.28702211380004883, "learning_rate": 3.7550541320475697e-07, "loss": 0.285, "step": 5367 }, { "epoch": 2.66490153897071, "grad_norm": 0.327791690826416, "learning_rate": 3.744077167635729e-07, "loss": 0.3571, "step": 5368 }, { "epoch": 2.6653979811351975, "grad_norm": 0.31715527176856995, "learning_rate": 3.7331156468193353e-07, "loss": 0.3222, "step": 5369 }, { "epoch": 2.6658944232996857, "grad_norm": 0.32490280270576477, "learning_rate": 3.722169573258183e-07, "loss": 0.3488, "step": 5370 }, { "epoch": 2.6663908654641735, "grad_norm": 0.28680869936943054, "learning_rate": 3.7112389506068435e-07, "loss": 0.2737, "step": 5371 }, { "epoch": 2.666887307628661, "grad_norm": 0.32466939091682434, "learning_rate": 3.7003237825147533e-07, "loss": 0.3479, "step": 5372 }, { "epoch": 2.667383749793149, "grad_norm": 0.3145809769630432, "learning_rate": 3.689424072626202e-07, "loss": 0.3683, "step": 5373 }, { "epoch": 2.6678801919576367, "grad_norm": 0.309259295463562, "learning_rate": 3.678539824580296e-07, "loss": 0.3284, "step": 5374 }, { "epoch": 2.668376634122125, "grad_norm": 0.30322280526161194, "learning_rate": 3.6676710420110063e-07, "loss": 0.3254, "step": 5375 }, { "epoch": 2.6688730762866126, "grad_norm": 0.2933993935585022, "learning_rate": 3.656817728547107e-07, "loss": 0.2842, "step": 5376 }, { "epoch": 2.6693695184511004, "grad_norm": 0.3199950158596039, "learning_rate": 3.6459798878122233e-07, "loss": 0.3042, "step": 5377 }, { "epoch": 2.669865960615588, "grad_norm": 0.314136266708374, "learning_rate": 3.635157523424826e-07, "loss": 0.3298, "step": 5378 }, { "epoch": 2.670362402780076, "grad_norm": 0.32366541028022766, "learning_rate": 3.624350638998209e-07, "loss": 0.3211, "step": 5379 }, { "epoch": 2.670858844944564, "grad_norm": 0.3319578170776367, "learning_rate": 3.613559238140496e-07, "loss": 0.3028, "step": 5380 }, { "epoch": 2.671355287109052, "grad_norm": 0.3221125304698944, "learning_rate": 3.6027833244546286e-07, "loss": 0.3837, "step": 5381 }, { "epoch": 2.6718517292735395, "grad_norm": 0.30065053701400757, "learning_rate": 3.5920229015384165e-07, "loss": 0.3394, "step": 5382 }, { "epoch": 2.6723481714380277, "grad_norm": 0.31339240074157715, "learning_rate": 3.581277972984448e-07, "loss": 0.3419, "step": 5383 }, { "epoch": 2.6728446136025155, "grad_norm": 0.3158104419708252, "learning_rate": 3.5705485423801755e-07, "loss": 0.3142, "step": 5384 }, { "epoch": 2.673341055767003, "grad_norm": 0.3069358468055725, "learning_rate": 3.559834613307861e-07, "loss": 0.3119, "step": 5385 }, { "epoch": 2.673837497931491, "grad_norm": 0.3339449465274811, "learning_rate": 3.549136189344604e-07, "loss": 0.3846, "step": 5386 }, { "epoch": 2.6743339400959787, "grad_norm": 0.3136313557624817, "learning_rate": 3.5384532740623033e-07, "loss": 0.2998, "step": 5387 }, { "epoch": 2.674830382260467, "grad_norm": 0.32586008310317993, "learning_rate": 3.5277858710277e-07, "loss": 0.3416, "step": 5388 }, { "epoch": 2.6753268244249546, "grad_norm": 0.30617213249206543, "learning_rate": 3.5171339838023453e-07, "loss": 0.3323, "step": 5389 }, { "epoch": 2.6758232665894424, "grad_norm": 0.30507710576057434, "learning_rate": 3.5064976159426224e-07, "loss": 0.3035, "step": 5390 }, { "epoch": 2.67631970875393, "grad_norm": 0.3250305950641632, "learning_rate": 3.495876770999729e-07, "loss": 0.3196, "step": 5391 }, { "epoch": 2.676816150918418, "grad_norm": 0.3327906131744385, "learning_rate": 3.4852714525196507e-07, "loss": 0.3199, "step": 5392 }, { "epoch": 2.677312593082906, "grad_norm": 0.29888495802879333, "learning_rate": 3.4746816640432556e-07, "loss": 0.321, "step": 5393 }, { "epoch": 2.6778090352473938, "grad_norm": 0.30715179443359375, "learning_rate": 3.4641074091061545e-07, "loss": 0.3682, "step": 5394 }, { "epoch": 2.6783054774118815, "grad_norm": 0.30133169889450073, "learning_rate": 3.4535486912388115e-07, "loss": 0.3376, "step": 5395 }, { "epoch": 2.6788019195763693, "grad_norm": 0.3104872703552246, "learning_rate": 3.443005513966502e-07, "loss": 0.3336, "step": 5396 }, { "epoch": 2.679298361740857, "grad_norm": 0.3127725422382355, "learning_rate": 3.4324778808092985e-07, "loss": 0.3486, "step": 5397 }, { "epoch": 2.679794803905345, "grad_norm": 0.2921850085258484, "learning_rate": 3.421965795282106e-07, "loss": 0.2958, "step": 5398 }, { "epoch": 2.680291246069833, "grad_norm": 0.31885626912117004, "learning_rate": 3.411469260894601e-07, "loss": 0.3299, "step": 5399 }, { "epoch": 2.6807876882343207, "grad_norm": 0.3056564927101135, "learning_rate": 3.400988281151313e-07, "loss": 0.3089, "step": 5400 }, { "epoch": 2.6812841303988084, "grad_norm": 0.3477216958999634, "learning_rate": 3.3905228595515425e-07, "loss": 0.3781, "step": 5401 }, { "epoch": 2.681780572563296, "grad_norm": 0.3286778926849365, "learning_rate": 3.3800729995894124e-07, "loss": 0.3371, "step": 5402 }, { "epoch": 2.6822770147277843, "grad_norm": 0.3207147419452667, "learning_rate": 3.3696387047538525e-07, "loss": 0.3386, "step": 5403 }, { "epoch": 2.682773456892272, "grad_norm": 0.326035737991333, "learning_rate": 3.359219978528583e-07, "loss": 0.3348, "step": 5404 }, { "epoch": 2.68326989905676, "grad_norm": 0.32537680864334106, "learning_rate": 3.348816824392143e-07, "loss": 0.2971, "step": 5405 }, { "epoch": 2.6837663412212476, "grad_norm": 0.298620343208313, "learning_rate": 3.338429245817848e-07, "loss": 0.2998, "step": 5406 }, { "epoch": 2.6842627833857353, "grad_norm": 0.3147369921207428, "learning_rate": 3.3280572462738415e-07, "loss": 0.3305, "step": 5407 }, { "epoch": 2.6847592255502235, "grad_norm": 0.31971824169158936, "learning_rate": 3.3177008292230415e-07, "loss": 0.3077, "step": 5408 }, { "epoch": 2.6852556677147112, "grad_norm": 0.32614269852638245, "learning_rate": 3.307359998123194e-07, "loss": 0.3351, "step": 5409 }, { "epoch": 2.685752109879199, "grad_norm": 0.32140234112739563, "learning_rate": 3.297034756426787e-07, "loss": 0.3577, "step": 5410 }, { "epoch": 2.686248552043687, "grad_norm": 0.29071927070617676, "learning_rate": 3.286725107581179e-07, "loss": 0.3036, "step": 5411 }, { "epoch": 2.6867449942081745, "grad_norm": 0.29942142963409424, "learning_rate": 3.276431055028445e-07, "loss": 0.3094, "step": 5412 }, { "epoch": 2.6872414363726627, "grad_norm": 0.32465001940727234, "learning_rate": 3.2661526022055135e-07, "loss": 0.354, "step": 5413 }, { "epoch": 2.6877378785371504, "grad_norm": 0.3504467308521271, "learning_rate": 3.255889752544067e-07, "loss": 0.3235, "step": 5414 }, { "epoch": 2.688234320701638, "grad_norm": 0.3265693783760071, "learning_rate": 3.2456425094706034e-07, "loss": 0.3256, "step": 5415 }, { "epoch": 2.6887307628661263, "grad_norm": 0.312694787979126, "learning_rate": 3.2354108764063973e-07, "loss": 0.3242, "step": 5416 }, { "epoch": 2.689227205030614, "grad_norm": 0.3133346438407898, "learning_rate": 3.2251948567674993e-07, "loss": 0.3494, "step": 5417 }, { "epoch": 2.689723647195102, "grad_norm": 0.322666198015213, "learning_rate": 3.214994453964776e-07, "loss": 0.3109, "step": 5418 }, { "epoch": 2.6902200893595896, "grad_norm": 0.328652024269104, "learning_rate": 3.204809671403852e-07, "loss": 0.3094, "step": 5419 }, { "epoch": 2.6907165315240773, "grad_norm": 0.28206872940063477, "learning_rate": 3.194640512485159e-07, "loss": 0.3274, "step": 5420 }, { "epoch": 2.6912129736885655, "grad_norm": 0.31293123960494995, "learning_rate": 3.184486980603907e-07, "loss": 0.3259, "step": 5421 }, { "epoch": 2.6917094158530532, "grad_norm": 0.33148854970932007, "learning_rate": 3.1743490791500577e-07, "loss": 0.3771, "step": 5422 }, { "epoch": 2.692205858017541, "grad_norm": 0.3161206543445587, "learning_rate": 3.1642268115084196e-07, "loss": 0.3428, "step": 5423 }, { "epoch": 2.6927023001820287, "grad_norm": 0.30540478229522705, "learning_rate": 3.1541201810585175e-07, "loss": 0.3019, "step": 5424 }, { "epoch": 2.6931987423465165, "grad_norm": 0.3087798058986664, "learning_rate": 3.14402919117468e-07, "loss": 0.3167, "step": 5425 }, { "epoch": 2.6936951845110046, "grad_norm": 0.30908387899398804, "learning_rate": 3.133953845226029e-07, "loss": 0.3098, "step": 5426 }, { "epoch": 2.6941916266754924, "grad_norm": 0.3515626788139343, "learning_rate": 3.1238941465764337e-07, "loss": 0.3445, "step": 5427 }, { "epoch": 2.69468806883998, "grad_norm": 0.3160986304283142, "learning_rate": 3.1138500985845755e-07, "loss": 0.3352, "step": 5428 }, { "epoch": 2.695184511004468, "grad_norm": 0.3475051820278168, "learning_rate": 3.103821704603854e-07, "loss": 0.3347, "step": 5429 }, { "epoch": 2.6956809531689556, "grad_norm": 0.3133622705936432, "learning_rate": 3.093808967982515e-07, "loss": 0.322, "step": 5430 }, { "epoch": 2.696177395333444, "grad_norm": 0.323030948638916, "learning_rate": 3.08381189206351e-07, "loss": 0.3025, "step": 5431 }, { "epoch": 2.6966738374979315, "grad_norm": 0.3119143545627594, "learning_rate": 3.0738304801846144e-07, "loss": 0.3506, "step": 5432 }, { "epoch": 2.6971702796624193, "grad_norm": 0.325710266828537, "learning_rate": 3.0638647356783236e-07, "loss": 0.3804, "step": 5433 }, { "epoch": 2.697666721826907, "grad_norm": 0.3140539526939392, "learning_rate": 3.0539146618719596e-07, "loss": 0.3714, "step": 5434 }, { "epoch": 2.6981631639913948, "grad_norm": 0.31751778721809387, "learning_rate": 3.043980262087559e-07, "loss": 0.3254, "step": 5435 }, { "epoch": 2.698659606155883, "grad_norm": 0.3255801200866699, "learning_rate": 3.0340615396419524e-07, "loss": 0.3431, "step": 5436 }, { "epoch": 2.6991560483203707, "grad_norm": 0.30462560057640076, "learning_rate": 3.0241584978467354e-07, "loss": 0.3347, "step": 5437 }, { "epoch": 2.6996524904848584, "grad_norm": 0.32576608657836914, "learning_rate": 3.0142711400082626e-07, "loss": 0.367, "step": 5438 }, { "epoch": 2.700148932649346, "grad_norm": 0.2879064381122589, "learning_rate": 3.004399469427666e-07, "loss": 0.2762, "step": 5439 }, { "epoch": 2.700645374813834, "grad_norm": 0.31539860367774963, "learning_rate": 2.994543489400797e-07, "loss": 0.3372, "step": 5440 }, { "epoch": 2.701141816978322, "grad_norm": 0.29874810576438904, "learning_rate": 2.9847032032183366e-07, "loss": 0.3071, "step": 5441 }, { "epoch": 2.70163825914281, "grad_norm": 0.3115823566913605, "learning_rate": 2.974878614165666e-07, "loss": 0.3491, "step": 5442 }, { "epoch": 2.7021347013072976, "grad_norm": 0.28547585010528564, "learning_rate": 2.965069725522951e-07, "loss": 0.338, "step": 5443 }, { "epoch": 2.702631143471786, "grad_norm": 0.3474343419075012, "learning_rate": 2.955276540565122e-07, "loss": 0.3646, "step": 5444 }, { "epoch": 2.7031275856362735, "grad_norm": 0.32367661595344543, "learning_rate": 2.945499062561846e-07, "loss": 0.2833, "step": 5445 }, { "epoch": 2.7036240278007613, "grad_norm": 0.3165450394153595, "learning_rate": 2.9357372947775684e-07, "loss": 0.3364, "step": 5446 }, { "epoch": 2.704120469965249, "grad_norm": 0.30385902523994446, "learning_rate": 2.925991240471471e-07, "loss": 0.3069, "step": 5447 }, { "epoch": 2.7046169121297368, "grad_norm": 0.33303043246269226, "learning_rate": 2.916260902897494e-07, "loss": 0.3533, "step": 5448 }, { "epoch": 2.705113354294225, "grad_norm": 0.30602535605430603, "learning_rate": 2.9065462853043345e-07, "loss": 0.3141, "step": 5449 }, { "epoch": 2.7056097964587127, "grad_norm": 0.32556891441345215, "learning_rate": 2.896847390935442e-07, "loss": 0.2967, "step": 5450 }, { "epoch": 2.7061062386232004, "grad_norm": 0.30759915709495544, "learning_rate": 2.887164223029015e-07, "loss": 0.303, "step": 5451 }, { "epoch": 2.706602680787688, "grad_norm": 0.3466031551361084, "learning_rate": 2.8774967848179956e-07, "loss": 0.3329, "step": 5452 }, { "epoch": 2.707099122952176, "grad_norm": 0.3340030610561371, "learning_rate": 2.8678450795300907e-07, "loss": 0.3442, "step": 5453 }, { "epoch": 2.707595565116664, "grad_norm": 0.32366830110549927, "learning_rate": 2.8582091103877274e-07, "loss": 0.3263, "step": 5454 }, { "epoch": 2.708092007281152, "grad_norm": 0.31396594643592834, "learning_rate": 2.848588880608094e-07, "loss": 0.323, "step": 5455 }, { "epoch": 2.7085884494456396, "grad_norm": 0.30991774797439575, "learning_rate": 2.8389843934031327e-07, "loss": 0.3061, "step": 5456 }, { "epoch": 2.7090848916101273, "grad_norm": 0.2859792411327362, "learning_rate": 2.8293956519795216e-07, "loss": 0.3227, "step": 5457 }, { "epoch": 2.709581333774615, "grad_norm": 0.3072301745414734, "learning_rate": 2.8198226595386736e-07, "loss": 0.34, "step": 5458 }, { "epoch": 2.7100777759391033, "grad_norm": 0.3465893566608429, "learning_rate": 2.810265419276753e-07, "loss": 0.3742, "step": 5459 }, { "epoch": 2.710574218103591, "grad_norm": 0.2965722680091858, "learning_rate": 2.800723934384658e-07, "loss": 0.3102, "step": 5460 }, { "epoch": 2.7110706602680787, "grad_norm": 0.3163202404975891, "learning_rate": 2.79119820804804e-07, "loss": 0.3265, "step": 5461 }, { "epoch": 2.7115671024325665, "grad_norm": 0.3151935935020447, "learning_rate": 2.7816882434472836e-07, "loss": 0.3252, "step": 5462 }, { "epoch": 2.7120635445970542, "grad_norm": 0.3210795223712921, "learning_rate": 2.772194043757481e-07, "loss": 0.3526, "step": 5463 }, { "epoch": 2.7125599867615424, "grad_norm": 0.3186233937740326, "learning_rate": 2.762715612148525e-07, "loss": 0.2993, "step": 5464 }, { "epoch": 2.71305642892603, "grad_norm": 0.34454840421676636, "learning_rate": 2.7532529517849795e-07, "loss": 0.3666, "step": 5465 }, { "epoch": 2.713552871090518, "grad_norm": 0.29780763387680054, "learning_rate": 2.7438060658261825e-07, "loss": 0.2956, "step": 5466 }, { "epoch": 2.7140493132550056, "grad_norm": 0.32143568992614746, "learning_rate": 2.7343749574261836e-07, "loss": 0.3534, "step": 5467 }, { "epoch": 2.7145457554194934, "grad_norm": 0.28986427187919617, "learning_rate": 2.7249596297337755e-07, "loss": 0.2976, "step": 5468 }, { "epoch": 2.7150421975839816, "grad_norm": 0.3152945935726166, "learning_rate": 2.715560085892494e-07, "loss": 0.3615, "step": 5469 }, { "epoch": 2.7155386397484693, "grad_norm": 0.31852424144744873, "learning_rate": 2.7061763290405606e-07, "loss": 0.3303, "step": 5470 }, { "epoch": 2.716035081912957, "grad_norm": 0.32839879393577576, "learning_rate": 2.6968083623109984e-07, "loss": 0.2894, "step": 5471 }, { "epoch": 2.7165315240774452, "grad_norm": 0.32914039492607117, "learning_rate": 2.687456188831483e-07, "loss": 0.3542, "step": 5472 }, { "epoch": 2.7170279662419325, "grad_norm": 0.33081528544425964, "learning_rate": 2.678119811724461e-07, "loss": 0.3645, "step": 5473 }, { "epoch": 2.7175244084064207, "grad_norm": 0.3161695897579193, "learning_rate": 2.6687992341070944e-07, "loss": 0.3146, "step": 5474 }, { "epoch": 2.7180208505709085, "grad_norm": 0.2956677973270416, "learning_rate": 2.6594944590912774e-07, "loss": 0.3237, "step": 5475 }, { "epoch": 2.718517292735396, "grad_norm": 0.2798486053943634, "learning_rate": 2.650205489783625e-07, "loss": 0.322, "step": 5476 }, { "epoch": 2.7190137348998844, "grad_norm": 0.3341737389564514, "learning_rate": 2.6409323292854563e-07, "loss": 0.4037, "step": 5477 }, { "epoch": 2.719510177064372, "grad_norm": 0.31984496116638184, "learning_rate": 2.6316749806928277e-07, "loss": 0.3274, "step": 5478 }, { "epoch": 2.72000661922886, "grad_norm": 0.30979540944099426, "learning_rate": 2.6224334470965284e-07, "loss": 0.314, "step": 5479 }, { "epoch": 2.7205030613933476, "grad_norm": 0.3038616478443146, "learning_rate": 2.613207731582057e-07, "loss": 0.3232, "step": 5480 }, { "epoch": 2.7209995035578354, "grad_norm": 0.3277778625488281, "learning_rate": 2.60399783722961e-07, "loss": 0.3188, "step": 5481 }, { "epoch": 2.7214959457223236, "grad_norm": 0.28182750940322876, "learning_rate": 2.594803767114146e-07, "loss": 0.3142, "step": 5482 }, { "epoch": 2.7219923878868113, "grad_norm": 0.31159040331840515, "learning_rate": 2.5856255243052964e-07, "loss": 0.3821, "step": 5483 }, { "epoch": 2.722488830051299, "grad_norm": 0.32820650935173035, "learning_rate": 2.5764631118674275e-07, "loss": 0.3241, "step": 5484 }, { "epoch": 2.722985272215787, "grad_norm": 0.29457759857177734, "learning_rate": 2.5673165328596315e-07, "loss": 0.3092, "step": 5485 }, { "epoch": 2.7234817143802745, "grad_norm": 0.337430477142334, "learning_rate": 2.5581857903356935e-07, "loss": 0.3436, "step": 5486 }, { "epoch": 2.7239781565447627, "grad_norm": 0.2914576530456543, "learning_rate": 2.5490708873441295e-07, "loss": 0.3267, "step": 5487 }, { "epoch": 2.7244745987092505, "grad_norm": 0.3579276204109192, "learning_rate": 2.5399718269281505e-07, "loss": 0.3351, "step": 5488 }, { "epoch": 2.724971040873738, "grad_norm": 0.2905930280685425, "learning_rate": 2.5308886121256816e-07, "loss": 0.3339, "step": 5489 }, { "epoch": 2.725467483038226, "grad_norm": 0.3048407733440399, "learning_rate": 2.5218212459693636e-07, "loss": 0.3705, "step": 5490 }, { "epoch": 2.7259639252027137, "grad_norm": 0.29062899947166443, "learning_rate": 2.5127697314865475e-07, "loss": 0.323, "step": 5491 }, { "epoch": 2.726460367367202, "grad_norm": 0.2954337000846863, "learning_rate": 2.5037340716992874e-07, "loss": 0.3159, "step": 5492 }, { "epoch": 2.7269568095316896, "grad_norm": 0.3149557411670685, "learning_rate": 2.494714269624343e-07, "loss": 0.3715, "step": 5493 }, { "epoch": 2.7274532516961774, "grad_norm": 0.3072965443134308, "learning_rate": 2.485710328273194e-07, "loss": 0.36, "step": 5494 }, { "epoch": 2.727949693860665, "grad_norm": 0.28738245368003845, "learning_rate": 2.4767222506519863e-07, "loss": 0.3449, "step": 5495 }, { "epoch": 2.728446136025153, "grad_norm": 0.30889105796813965, "learning_rate": 2.467750039761613e-07, "loss": 0.3615, "step": 5496 }, { "epoch": 2.728942578189641, "grad_norm": 0.28501415252685547, "learning_rate": 2.4587936985976445e-07, "loss": 0.2924, "step": 5497 }, { "epoch": 2.7294390203541288, "grad_norm": 0.31857195496559143, "learning_rate": 2.4498532301503563e-07, "loss": 0.3173, "step": 5498 }, { "epoch": 2.7299354625186165, "grad_norm": 0.2935973107814789, "learning_rate": 2.440928637404749e-07, "loss": 0.3204, "step": 5499 }, { "epoch": 2.7304319046831043, "grad_norm": 0.33672669529914856, "learning_rate": 2.4320199233404675e-07, "loss": 0.3357, "step": 5500 }, { "epoch": 2.730928346847592, "grad_norm": 0.30771106481552124, "learning_rate": 2.4231270909319203e-07, "loss": 0.3132, "step": 5501 }, { "epoch": 2.73142478901208, "grad_norm": 0.3124135732650757, "learning_rate": 2.4142501431481613e-07, "loss": 0.3231, "step": 5502 }, { "epoch": 2.731921231176568, "grad_norm": 0.3527693450450897, "learning_rate": 2.4053890829529804e-07, "loss": 0.3486, "step": 5503 }, { "epoch": 2.7324176733410557, "grad_norm": 0.31055232882499695, "learning_rate": 2.396543913304822e-07, "loss": 0.3161, "step": 5504 }, { "epoch": 2.732914115505544, "grad_norm": 0.3138221800327301, "learning_rate": 2.387714637156874e-07, "loss": 0.4049, "step": 5505 }, { "epoch": 2.7334105576700316, "grad_norm": 0.29294922947883606, "learning_rate": 2.3789012574569726e-07, "loss": 0.2481, "step": 5506 }, { "epoch": 2.7339069998345193, "grad_norm": 0.3325761556625366, "learning_rate": 2.3701037771476642e-07, "loss": 0.3175, "step": 5507 }, { "epoch": 2.734403441999007, "grad_norm": 0.3255939483642578, "learning_rate": 2.361322199166205e-07, "loss": 0.363, "step": 5508 }, { "epoch": 2.734899884163495, "grad_norm": 0.3154188394546509, "learning_rate": 2.352556526444516e-07, "loss": 0.3138, "step": 5509 }, { "epoch": 2.735396326327983, "grad_norm": 0.32027098536491394, "learning_rate": 2.3438067619092176e-07, "loss": 0.3554, "step": 5510 }, { "epoch": 2.7358927684924708, "grad_norm": 0.31610798835754395, "learning_rate": 2.335072908481606e-07, "loss": 0.3119, "step": 5511 }, { "epoch": 2.7363892106569585, "grad_norm": 0.3459371030330658, "learning_rate": 2.3263549690777044e-07, "loss": 0.3143, "step": 5512 }, { "epoch": 2.7368856528214462, "grad_norm": 0.3190409541130066, "learning_rate": 2.3176529466081733e-07, "loss": 0.3352, "step": 5513 }, { "epoch": 2.737382094985934, "grad_norm": 0.33176466822624207, "learning_rate": 2.3089668439783885e-07, "loss": 0.3327, "step": 5514 }, { "epoch": 2.737878537150422, "grad_norm": 0.32384106516838074, "learning_rate": 2.3002966640884084e-07, "loss": 0.3478, "step": 5515 }, { "epoch": 2.73837497931491, "grad_norm": 0.30757224559783936, "learning_rate": 2.2916424098329614e-07, "loss": 0.3079, "step": 5516 }, { "epoch": 2.7388714214793977, "grad_norm": 0.3155524730682373, "learning_rate": 2.2830040841014812e-07, "loss": 0.3065, "step": 5517 }, { "epoch": 2.7393678636438854, "grad_norm": 0.3076499402523041, "learning_rate": 2.2743816897780547e-07, "loss": 0.3457, "step": 5518 }, { "epoch": 2.739864305808373, "grad_norm": 0.30658143758773804, "learning_rate": 2.265775229741468e-07, "loss": 0.3172, "step": 5519 }, { "epoch": 2.7403607479728613, "grad_norm": 0.3174297511577606, "learning_rate": 2.2571847068651898e-07, "loss": 0.3454, "step": 5520 }, { "epoch": 2.740857190137349, "grad_norm": 0.313932329416275, "learning_rate": 2.2486101240173585e-07, "loss": 0.3155, "step": 5521 }, { "epoch": 2.741353632301837, "grad_norm": 0.33490973711013794, "learning_rate": 2.2400514840608012e-07, "loss": 0.3268, "step": 5522 }, { "epoch": 2.7418500744663246, "grad_norm": 0.31372499465942383, "learning_rate": 2.231508789853004e-07, "loss": 0.3657, "step": 5523 }, { "epoch": 2.7423465166308123, "grad_norm": 0.3343569040298462, "learning_rate": 2.222982044246158e-07, "loss": 0.3321, "step": 5524 }, { "epoch": 2.7428429587953005, "grad_norm": 0.3245448172092438, "learning_rate": 2.2144712500870913e-07, "loss": 0.3346, "step": 5525 }, { "epoch": 2.7433394009597882, "grad_norm": 0.3131314218044281, "learning_rate": 2.2059764102173364e-07, "loss": 0.3011, "step": 5526 }, { "epoch": 2.743835843124276, "grad_norm": 0.31760287284851074, "learning_rate": 2.1974975274730857e-07, "loss": 0.2948, "step": 5527 }, { "epoch": 2.7443322852887637, "grad_norm": 0.3455687463283539, "learning_rate": 2.1890346046852197e-07, "loss": 0.3274, "step": 5528 }, { "epoch": 2.7448287274532515, "grad_norm": 0.321842759847641, "learning_rate": 2.1805876446792607e-07, "loss": 0.336, "step": 5529 }, { "epoch": 2.7453251696177396, "grad_norm": 0.29839378595352173, "learning_rate": 2.1721566502754255e-07, "loss": 0.3002, "step": 5530 }, { "epoch": 2.7458216117822274, "grad_norm": 0.3080875873565674, "learning_rate": 2.1637416242886012e-07, "loss": 0.3286, "step": 5531 }, { "epoch": 2.746318053946715, "grad_norm": 0.31883618235588074, "learning_rate": 2.1553425695283293e-07, "loss": 0.3, "step": 5532 }, { "epoch": 2.7468144961112033, "grad_norm": 0.3032870888710022, "learning_rate": 2.1469594887988277e-07, "loss": 0.32, "step": 5533 }, { "epoch": 2.7473109382756906, "grad_norm": 0.30962228775024414, "learning_rate": 2.1385923848989797e-07, "loss": 0.32, "step": 5534 }, { "epoch": 2.747807380440179, "grad_norm": 0.34174367785453796, "learning_rate": 2.13024126062234e-07, "loss": 0.3318, "step": 5535 }, { "epoch": 2.7483038226046665, "grad_norm": 0.28254246711730957, "learning_rate": 2.1219061187571056e-07, "loss": 0.294, "step": 5536 }, { "epoch": 2.7488002647691543, "grad_norm": 0.3273497223854065, "learning_rate": 2.1135869620861671e-07, "loss": 0.3779, "step": 5537 }, { "epoch": 2.7492967069336425, "grad_norm": 0.3134206235408783, "learning_rate": 2.1052837933870583e-07, "loss": 0.3247, "step": 5538 }, { "epoch": 2.74979314909813, "grad_norm": 0.30873629450798035, "learning_rate": 2.09699661543199e-07, "loss": 0.3605, "step": 5539 }, { "epoch": 2.750289591262618, "grad_norm": 0.31347087025642395, "learning_rate": 2.0887254309878202e-07, "loss": 0.354, "step": 5540 }, { "epoch": 2.7507860334271057, "grad_norm": 0.2911852300167084, "learning_rate": 2.0804702428160629e-07, "loss": 0.3184, "step": 5541 }, { "epoch": 2.7512824755915934, "grad_norm": 0.3095359802246094, "learning_rate": 2.072231053672924e-07, "loss": 0.3308, "step": 5542 }, { "epoch": 2.7517789177560816, "grad_norm": 0.2953628897666931, "learning_rate": 2.0640078663092256e-07, "loss": 0.3299, "step": 5543 }, { "epoch": 2.7522753599205694, "grad_norm": 0.31827473640441895, "learning_rate": 2.055800683470477e-07, "loss": 0.3273, "step": 5544 }, { "epoch": 2.752771802085057, "grad_norm": 0.3202262222766876, "learning_rate": 2.0476095078968195e-07, "loss": 0.3649, "step": 5545 }, { "epoch": 2.753268244249545, "grad_norm": 0.3133939802646637, "learning_rate": 2.0394343423230824e-07, "loss": 0.3117, "step": 5546 }, { "epoch": 2.7537646864140326, "grad_norm": 0.2986651659011841, "learning_rate": 2.0312751894787208e-07, "loss": 0.3518, "step": 5547 }, { "epoch": 2.754261128578521, "grad_norm": 0.3024437129497528, "learning_rate": 2.0231320520878507e-07, "loss": 0.3168, "step": 5548 }, { "epoch": 2.7547575707430085, "grad_norm": 0.31055518984794617, "learning_rate": 2.0150049328692578e-07, "loss": 0.3656, "step": 5549 }, { "epoch": 2.7552540129074963, "grad_norm": 0.3057727813720703, "learning_rate": 2.0068938345363497e-07, "loss": 0.3631, "step": 5550 }, { "epoch": 2.755750455071984, "grad_norm": 0.30828458070755005, "learning_rate": 1.9987987597972212e-07, "loss": 0.3788, "step": 5551 }, { "epoch": 2.7562468972364718, "grad_norm": 0.32676127552986145, "learning_rate": 1.9907197113545716e-07, "loss": 0.3258, "step": 5552 }, { "epoch": 2.75674333940096, "grad_norm": 0.33828094601631165, "learning_rate": 1.9826566919058043e-07, "loss": 0.2885, "step": 5553 }, { "epoch": 2.7572397815654477, "grad_norm": 0.32747596502304077, "learning_rate": 1.9746097041429212e-07, "loss": 0.3306, "step": 5554 }, { "epoch": 2.7577362237299354, "grad_norm": 0.3117265999317169, "learning_rate": 1.9665787507525958e-07, "loss": 0.3206, "step": 5555 }, { "epoch": 2.758232665894423, "grad_norm": 0.2830089032649994, "learning_rate": 1.958563834416155e-07, "loss": 0.296, "step": 5556 }, { "epoch": 2.758729108058911, "grad_norm": 0.29822859168052673, "learning_rate": 1.9505649578095532e-07, "loss": 0.2987, "step": 5557 }, { "epoch": 2.759225550223399, "grad_norm": 0.3251523971557617, "learning_rate": 1.9425821236034094e-07, "loss": 0.3462, "step": 5558 }, { "epoch": 2.759721992387887, "grad_norm": 0.29427170753479004, "learning_rate": 1.9346153344629583e-07, "loss": 0.3265, "step": 5559 }, { "epoch": 2.7602184345523746, "grad_norm": 0.3051506578922272, "learning_rate": 1.9266645930481053e-07, "loss": 0.3484, "step": 5560 }, { "epoch": 2.7607148767168623, "grad_norm": 0.3220616281032562, "learning_rate": 1.9187299020133775e-07, "loss": 0.3503, "step": 5561 }, { "epoch": 2.76121131888135, "grad_norm": 0.32251766324043274, "learning_rate": 1.910811264007967e-07, "loss": 0.3335, "step": 5562 }, { "epoch": 2.7617077610458383, "grad_norm": 0.30572280287742615, "learning_rate": 1.9029086816756804e-07, "loss": 0.3399, "step": 5563 }, { "epoch": 2.762204203210326, "grad_norm": 0.3070008158683777, "learning_rate": 1.8950221576549743e-07, "loss": 0.3284, "step": 5564 }, { "epoch": 2.7627006453748137, "grad_norm": 0.33133581280708313, "learning_rate": 1.887151694578959e-07, "loss": 0.355, "step": 5565 }, { "epoch": 2.763197087539302, "grad_norm": 0.3239547610282898, "learning_rate": 1.8792972950753495e-07, "loss": 0.3189, "step": 5566 }, { "epoch": 2.7636935297037892, "grad_norm": 0.32022804021835327, "learning_rate": 1.8714589617665314e-07, "loss": 0.318, "step": 5567 }, { "epoch": 2.7641899718682774, "grad_norm": 0.2967100441455841, "learning_rate": 1.8636366972694996e-07, "loss": 0.3163, "step": 5568 }, { "epoch": 2.764686414032765, "grad_norm": 0.3371680974960327, "learning_rate": 1.8558305041958992e-07, "loss": 0.3221, "step": 5569 }, { "epoch": 2.765182856197253, "grad_norm": 0.3391682803630829, "learning_rate": 1.8480403851520167e-07, "loss": 0.3524, "step": 5570 }, { "epoch": 2.765679298361741, "grad_norm": 0.32933974266052246, "learning_rate": 1.840266342738739e-07, "loss": 0.338, "step": 5571 }, { "epoch": 2.766175740526229, "grad_norm": 0.2756885886192322, "learning_rate": 1.832508379551634e-07, "loss": 0.3088, "step": 5572 }, { "epoch": 2.7666721826907166, "grad_norm": 0.3026617467403412, "learning_rate": 1.8247664981808522e-07, "loss": 0.3302, "step": 5573 }, { "epoch": 2.7671686248552043, "grad_norm": 0.30874621868133545, "learning_rate": 1.8170407012112146e-07, "loss": 0.3524, "step": 5574 }, { "epoch": 2.767665067019692, "grad_norm": 0.3340041935443878, "learning_rate": 1.8093309912221302e-07, "loss": 0.346, "step": 5575 }, { "epoch": 2.7681615091841802, "grad_norm": 0.315430611371994, "learning_rate": 1.8016373707876956e-07, "loss": 0.2822, "step": 5576 }, { "epoch": 2.768657951348668, "grad_norm": 0.30734312534332275, "learning_rate": 1.7939598424765726e-07, "loss": 0.3368, "step": 5577 }, { "epoch": 2.7691543935131557, "grad_norm": 0.326242059469223, "learning_rate": 1.7862984088520886e-07, "loss": 0.356, "step": 5578 }, { "epoch": 2.7696508356776435, "grad_norm": 0.3105933368206024, "learning_rate": 1.778653072472203e-07, "loss": 0.3522, "step": 5579 }, { "epoch": 2.770147277842131, "grad_norm": 0.29903870820999146, "learning_rate": 1.7710238358894683e-07, "loss": 0.3259, "step": 5580 }, { "epoch": 2.7706437200066194, "grad_norm": 0.3356019854545593, "learning_rate": 1.763410701651086e-07, "loss": 0.3218, "step": 5581 }, { "epoch": 2.771140162171107, "grad_norm": 0.29897722601890564, "learning_rate": 1.7558136722988617e-07, "loss": 0.3229, "step": 5582 }, { "epoch": 2.771636604335595, "grad_norm": 0.320944607257843, "learning_rate": 1.7482327503692552e-07, "loss": 0.3245, "step": 5583 }, { "epoch": 2.7721330465000826, "grad_norm": 0.31358081102371216, "learning_rate": 1.7406679383933255e-07, "loss": 0.3154, "step": 5584 }, { "epoch": 2.7726294886645704, "grad_norm": 0.3160251975059509, "learning_rate": 1.7331192388967523e-07, "loss": 0.3216, "step": 5585 }, { "epoch": 2.7731259308290586, "grad_norm": 0.33884552121162415, "learning_rate": 1.7255866543998412e-07, "loss": 0.3299, "step": 5586 }, { "epoch": 2.7736223729935463, "grad_norm": 0.2978258728981018, "learning_rate": 1.7180701874175198e-07, "loss": 0.3044, "step": 5587 }, { "epoch": 2.774118815158034, "grad_norm": 0.3247503340244293, "learning_rate": 1.710569840459342e-07, "loss": 0.3501, "step": 5588 }, { "epoch": 2.774615257322522, "grad_norm": 0.31929680705070496, "learning_rate": 1.7030856160294485e-07, "loss": 0.3158, "step": 5589 }, { "epoch": 2.7751116994870095, "grad_norm": 0.32145753502845764, "learning_rate": 1.695617516626641e-07, "loss": 0.3191, "step": 5590 }, { "epoch": 2.7756081416514977, "grad_norm": 0.3215952217578888, "learning_rate": 1.6881655447442968e-07, "loss": 0.3637, "step": 5591 }, { "epoch": 2.7761045838159855, "grad_norm": 0.31591057777404785, "learning_rate": 1.680729702870437e-07, "loss": 0.3573, "step": 5592 }, { "epoch": 2.776601025980473, "grad_norm": 0.2874772250652313, "learning_rate": 1.6733099934876873e-07, "loss": 0.2915, "step": 5593 }, { "epoch": 2.777097468144961, "grad_norm": 0.31941789388656616, "learning_rate": 1.6659064190732764e-07, "loss": 0.3539, "step": 5594 }, { "epoch": 2.7775939103094487, "grad_norm": 0.2898183763027191, "learning_rate": 1.6585189820990776e-07, "loss": 0.3401, "step": 5595 }, { "epoch": 2.778090352473937, "grad_norm": 0.2869662344455719, "learning_rate": 1.6511476850315344e-07, "loss": 0.3107, "step": 5596 }, { "epoch": 2.7785867946384246, "grad_norm": 0.3380005955696106, "learning_rate": 1.643792530331728e-07, "loss": 0.3579, "step": 5597 }, { "epoch": 2.7790832368029124, "grad_norm": 0.3073091208934784, "learning_rate": 1.6364535204553444e-07, "loss": 0.3091, "step": 5598 }, { "epoch": 2.7795796789674005, "grad_norm": 0.31093552708625793, "learning_rate": 1.62913065785269e-07, "loss": 0.311, "step": 5599 }, { "epoch": 2.7800761211318883, "grad_norm": 0.32576116919517517, "learning_rate": 1.621823944968659e-07, "loss": 0.3409, "step": 5600 }, { "epoch": 2.780572563296376, "grad_norm": 0.32196739315986633, "learning_rate": 1.6145333842427612e-07, "loss": 0.3549, "step": 5601 }, { "epoch": 2.7810690054608638, "grad_norm": 0.31272751092910767, "learning_rate": 1.6072589781091274e-07, "loss": 0.3187, "step": 5602 }, { "epoch": 2.7815654476253515, "grad_norm": 0.3180808424949646, "learning_rate": 1.6000007289964815e-07, "loss": 0.3564, "step": 5603 }, { "epoch": 2.7820618897898397, "grad_norm": 0.2998058795928955, "learning_rate": 1.5927586393281458e-07, "loss": 0.3115, "step": 5604 }, { "epoch": 2.7825583319543274, "grad_norm": 0.3138784170150757, "learning_rate": 1.5855327115220698e-07, "loss": 0.3522, "step": 5605 }, { "epoch": 2.783054774118815, "grad_norm": 0.3036832809448242, "learning_rate": 1.57832294799079e-07, "loss": 0.2967, "step": 5606 }, { "epoch": 2.783551216283303, "grad_norm": 0.2992107570171356, "learning_rate": 1.5711293511414482e-07, "loss": 0.3326, "step": 5607 }, { "epoch": 2.7840476584477907, "grad_norm": 0.27259406447410583, "learning_rate": 1.5639519233757895e-07, "loss": 0.3227, "step": 5608 }, { "epoch": 2.784544100612279, "grad_norm": 0.30825868248939514, "learning_rate": 1.556790667090169e-07, "loss": 0.3323, "step": 5609 }, { "epoch": 2.7850405427767666, "grad_norm": 0.30206045508384705, "learning_rate": 1.5496455846755242e-07, "loss": 0.3429, "step": 5610 }, { "epoch": 2.7855369849412543, "grad_norm": 0.3397676348686218, "learning_rate": 1.542516678517425e-07, "loss": 0.3523, "step": 5611 }, { "epoch": 2.786033427105742, "grad_norm": 0.31190988421440125, "learning_rate": 1.5354039509959894e-07, "loss": 0.3474, "step": 5612 }, { "epoch": 2.78652986927023, "grad_norm": 0.2973173260688782, "learning_rate": 1.5283074044859904e-07, "loss": 0.287, "step": 5613 }, { "epoch": 2.787026311434718, "grad_norm": 0.318635493516922, "learning_rate": 1.5212270413567544e-07, "loss": 0.3241, "step": 5614 }, { "epoch": 2.7875227535992058, "grad_norm": 0.3118172585964203, "learning_rate": 1.514162863972235e-07, "loss": 0.3458, "step": 5615 }, { "epoch": 2.7880191957636935, "grad_norm": 0.30130282044410706, "learning_rate": 1.5071148746909569e-07, "loss": 0.3041, "step": 5616 }, { "epoch": 2.7885156379281812, "grad_norm": 0.33482152223587036, "learning_rate": 1.5000830758660656e-07, "loss": 0.3318, "step": 5617 }, { "epoch": 2.789012080092669, "grad_norm": 0.326308012008667, "learning_rate": 1.493067469845283e-07, "loss": 0.3747, "step": 5618 }, { "epoch": 2.789508522257157, "grad_norm": 0.3042612075805664, "learning_rate": 1.486068058970913e-07, "loss": 0.311, "step": 5619 }, { "epoch": 2.790004964421645, "grad_norm": 0.30367225408554077, "learning_rate": 1.479084845579898e-07, "loss": 0.3232, "step": 5620 }, { "epoch": 2.7905014065861327, "grad_norm": 0.31851789355278015, "learning_rate": 1.4721178320037167e-07, "loss": 0.3254, "step": 5621 }, { "epoch": 2.7909978487506204, "grad_norm": 0.2983223795890808, "learning_rate": 1.4651670205684863e-07, "loss": 0.3224, "step": 5622 }, { "epoch": 2.791494290915108, "grad_norm": 0.3033770024776459, "learning_rate": 1.4582324135948734e-07, "loss": 0.3736, "step": 5623 }, { "epoch": 2.7919907330795963, "grad_norm": 0.30762526392936707, "learning_rate": 1.4513140133981752e-07, "loss": 0.3167, "step": 5624 }, { "epoch": 2.792487175244084, "grad_norm": 0.3172612488269806, "learning_rate": 1.4444118222882387e-07, "loss": 0.2972, "step": 5625 }, { "epoch": 2.792983617408572, "grad_norm": 0.31853386759757996, "learning_rate": 1.4375258425695317e-07, "loss": 0.3958, "step": 5626 }, { "epoch": 2.79348005957306, "grad_norm": 0.31515538692474365, "learning_rate": 1.4306560765410925e-07, "loss": 0.3237, "step": 5627 }, { "epoch": 2.7939765017375473, "grad_norm": 0.31198662519454956, "learning_rate": 1.4238025264965428e-07, "loss": 0.3426, "step": 5628 }, { "epoch": 2.7944729439020355, "grad_norm": 0.3242517113685608, "learning_rate": 1.4169651947241069e-07, "loss": 0.3068, "step": 5629 }, { "epoch": 2.7949693860665232, "grad_norm": 0.3116937279701233, "learning_rate": 1.4101440835065705e-07, "loss": 0.315, "step": 5630 }, { "epoch": 2.795465828231011, "grad_norm": 0.32562071084976196, "learning_rate": 1.4033391951213392e-07, "loss": 0.3061, "step": 5631 }, { "epoch": 2.795962270395499, "grad_norm": 0.3099510371685028, "learning_rate": 1.3965505318403572e-07, "loss": 0.3313, "step": 5632 }, { "epoch": 2.796458712559987, "grad_norm": 0.3080188035964966, "learning_rate": 1.389778095930183e-07, "loss": 0.3622, "step": 5633 }, { "epoch": 2.7969551547244746, "grad_norm": 0.28997620940208435, "learning_rate": 1.3830218896519532e-07, "loss": 0.3452, "step": 5634 }, { "epoch": 2.7974515968889624, "grad_norm": 0.2893969416618347, "learning_rate": 1.3762819152613793e-07, "loss": 0.3267, "step": 5635 }, { "epoch": 2.79794803905345, "grad_norm": 0.3011128604412079, "learning_rate": 1.3695581750087562e-07, "loss": 0.283, "step": 5636 }, { "epoch": 2.7984444812179383, "grad_norm": 1.0338088274002075, "learning_rate": 1.3628506711389545e-07, "loss": 0.3435, "step": 5637 }, { "epoch": 2.798940923382426, "grad_norm": 0.3079615533351898, "learning_rate": 1.3561594058914218e-07, "loss": 0.3436, "step": 5638 }, { "epoch": 2.799437365546914, "grad_norm": 0.31256332993507385, "learning_rate": 1.3494843815002047e-07, "loss": 0.3254, "step": 5639 }, { "epoch": 2.7999338077114015, "grad_norm": 0.31225600838661194, "learning_rate": 1.3428256001939034e-07, "loss": 0.3355, "step": 5640 }, { "epoch": 2.8004302498758893, "grad_norm": 0.32893216609954834, "learning_rate": 1.3361830641957118e-07, "loss": 0.3426, "step": 5641 }, { "epoch": 2.8009266920403775, "grad_norm": 0.30460110306739807, "learning_rate": 1.3295567757233729e-07, "loss": 0.2999, "step": 5642 }, { "epoch": 2.801423134204865, "grad_norm": 0.3277698755264282, "learning_rate": 1.3229467369892446e-07, "loss": 0.3338, "step": 5643 }, { "epoch": 2.801919576369353, "grad_norm": 0.31269100308418274, "learning_rate": 1.3163529502002337e-07, "loss": 0.3139, "step": 5644 }, { "epoch": 2.8024160185338407, "grad_norm": 0.29782938957214355, "learning_rate": 1.3097754175578182e-07, "loss": 0.2978, "step": 5645 }, { "epoch": 2.8029124606983284, "grad_norm": 0.29802170395851135, "learning_rate": 1.303214141258069e-07, "loss": 0.309, "step": 5646 }, { "epoch": 2.8034089028628166, "grad_norm": 0.31239089369773865, "learning_rate": 1.2966691234916119e-07, "loss": 0.3249, "step": 5647 }, { "epoch": 2.8039053450273044, "grad_norm": 0.29573002457618713, "learning_rate": 1.290140366443654e-07, "loss": 0.3222, "step": 5648 }, { "epoch": 2.804401787191792, "grad_norm": 0.31957879662513733, "learning_rate": 1.2836278722939576e-07, "loss": 0.3408, "step": 5649 }, { "epoch": 2.80489822935628, "grad_norm": 0.30077284574508667, "learning_rate": 1.2771316432168889e-07, "loss": 0.3171, "step": 5650 }, { "epoch": 2.8053946715207676, "grad_norm": 0.32531559467315674, "learning_rate": 1.270651681381341e-07, "loss": 0.3435, "step": 5651 }, { "epoch": 2.805891113685256, "grad_norm": 0.3144959509372711, "learning_rate": 1.2641879889508158e-07, "loss": 0.2901, "step": 5652 }, { "epoch": 2.8063875558497435, "grad_norm": 0.3102501630783081, "learning_rate": 1.2577405680833433e-07, "loss": 0.3435, "step": 5653 }, { "epoch": 2.8068839980142313, "grad_norm": 0.30547574162483215, "learning_rate": 1.2513094209315625e-07, "loss": 0.3108, "step": 5654 }, { "epoch": 2.807380440178719, "grad_norm": 0.30926504731178284, "learning_rate": 1.24489454964265e-07, "loss": 0.3889, "step": 5655 }, { "epoch": 2.8078768823432068, "grad_norm": 0.27562910318374634, "learning_rate": 1.2384959563583542e-07, "loss": 0.3131, "step": 5656 }, { "epoch": 2.808373324507695, "grad_norm": 0.29931771755218506, "learning_rate": 1.2321136432149938e-07, "loss": 0.3636, "step": 5657 }, { "epoch": 2.8088697666721827, "grad_norm": 0.3020811975002289, "learning_rate": 1.2257476123434474e-07, "loss": 0.3402, "step": 5658 }, { "epoch": 2.8093662088366704, "grad_norm": 0.338810533285141, "learning_rate": 1.2193978658691708e-07, "loss": 0.2934, "step": 5659 }, { "epoch": 2.8098626510011586, "grad_norm": 0.342663437128067, "learning_rate": 1.2130644059121565e-07, "loss": 0.3121, "step": 5660 }, { "epoch": 2.8103590931656464, "grad_norm": 0.3099873661994934, "learning_rate": 1.2067472345869858e-07, "loss": 0.3117, "step": 5661 }, { "epoch": 2.810855535330134, "grad_norm": 0.3303530514240265, "learning_rate": 1.2004463540027822e-07, "loss": 0.3587, "step": 5662 }, { "epoch": 2.811351977494622, "grad_norm": 0.3164670765399933, "learning_rate": 1.1941617662632466e-07, "loss": 0.3402, "step": 5663 }, { "epoch": 2.8118484196591096, "grad_norm": 0.3077276051044464, "learning_rate": 1.1878934734666281e-07, "loss": 0.3634, "step": 5664 }, { "epoch": 2.8123448618235978, "grad_norm": 0.3239983320236206, "learning_rate": 1.1816414777057361e-07, "loss": 0.3089, "step": 5665 }, { "epoch": 2.8128413039880855, "grad_norm": 0.31090787053108215, "learning_rate": 1.1754057810679509e-07, "loss": 0.3309, "step": 5666 }, { "epoch": 2.8133377461525733, "grad_norm": 0.32847607135772705, "learning_rate": 1.1691863856351904e-07, "loss": 0.3286, "step": 5667 }, { "epoch": 2.813834188317061, "grad_norm": 0.3116595447063446, "learning_rate": 1.1629832934839491e-07, "loss": 0.3089, "step": 5668 }, { "epoch": 2.8143306304815487, "grad_norm": 0.31922340393066406, "learning_rate": 1.1567965066852704e-07, "loss": 0.3393, "step": 5669 }, { "epoch": 2.814827072646037, "grad_norm": 0.3321036100387573, "learning_rate": 1.1506260273047576e-07, "loss": 0.2974, "step": 5670 }, { "epoch": 2.8153235148105247, "grad_norm": 0.31755444407463074, "learning_rate": 1.1444718574025516e-07, "loss": 0.3058, "step": 5671 }, { "epoch": 2.8158199569750124, "grad_norm": 0.3226362466812134, "learning_rate": 1.1383339990333753e-07, "loss": 0.3491, "step": 5672 }, { "epoch": 2.8163163991395, "grad_norm": 0.3374142348766327, "learning_rate": 1.1322124542465008e-07, "loss": 0.3502, "step": 5673 }, { "epoch": 2.816812841303988, "grad_norm": 0.27577534317970276, "learning_rate": 1.1261072250857264e-07, "loss": 0.3032, "step": 5674 }, { "epoch": 2.817309283468476, "grad_norm": 0.313730925321579, "learning_rate": 1.1200183135894327e-07, "loss": 0.3417, "step": 5675 }, { "epoch": 2.817805725632964, "grad_norm": 0.2889561057090759, "learning_rate": 1.113945721790538e-07, "loss": 0.327, "step": 5676 }, { "epoch": 2.8183021677974516, "grad_norm": 0.30937331914901733, "learning_rate": 1.1078894517165206e-07, "loss": 0.3229, "step": 5677 }, { "epoch": 2.8187986099619393, "grad_norm": 0.3230600655078888, "learning_rate": 1.1018495053894018e-07, "loss": 0.3275, "step": 5678 }, { "epoch": 2.819295052126427, "grad_norm": 0.2989407777786255, "learning_rate": 1.095825884825752e-07, "loss": 0.3271, "step": 5679 }, { "epoch": 2.8197914942909152, "grad_norm": 0.3083917200565338, "learning_rate": 1.0898185920366954e-07, "loss": 0.3388, "step": 5680 }, { "epoch": 2.820287936455403, "grad_norm": 0.29334548115730286, "learning_rate": 1.0838276290279115e-07, "loss": 0.3441, "step": 5681 }, { "epoch": 2.8207843786198907, "grad_norm": 0.3045148253440857, "learning_rate": 1.0778529977996166e-07, "loss": 0.2842, "step": 5682 }, { "epoch": 2.8212808207843785, "grad_norm": 0.3290744423866272, "learning_rate": 1.0718947003465652e-07, "loss": 0.3291, "step": 5683 }, { "epoch": 2.821777262948866, "grad_norm": 0.3331650495529175, "learning_rate": 1.0659527386580882e-07, "loss": 0.3517, "step": 5684 }, { "epoch": 2.8222737051133544, "grad_norm": 0.33179718255996704, "learning_rate": 1.0600271147180374e-07, "loss": 0.3207, "step": 5685 }, { "epoch": 2.822770147277842, "grad_norm": 0.3121303617954254, "learning_rate": 1.0541178305048139e-07, "loss": 0.3256, "step": 5686 }, { "epoch": 2.82326658944233, "grad_norm": 0.29498612880706787, "learning_rate": 1.0482248879913725e-07, "loss": 0.3054, "step": 5687 }, { "epoch": 2.823763031606818, "grad_norm": 0.30982181429862976, "learning_rate": 1.0423482891452119e-07, "loss": 0.3218, "step": 5688 }, { "epoch": 2.8242594737713054, "grad_norm": 0.29664313793182373, "learning_rate": 1.0364880359283625e-07, "loss": 0.3247, "step": 5689 }, { "epoch": 2.8247559159357936, "grad_norm": 0.3077421188354492, "learning_rate": 1.0306441302973924e-07, "loss": 0.3657, "step": 5690 }, { "epoch": 2.8252523581002813, "grad_norm": 0.2944149076938629, "learning_rate": 1.024816574203441e-07, "loss": 0.3112, "step": 5691 }, { "epoch": 2.825748800264769, "grad_norm": 0.31153547763824463, "learning_rate": 1.0190053695921631e-07, "loss": 0.3344, "step": 5692 }, { "epoch": 2.8262452424292572, "grad_norm": 0.29269683361053467, "learning_rate": 1.0132105184037677e-07, "loss": 0.3383, "step": 5693 }, { "epoch": 2.826741684593745, "grad_norm": 0.3033067286014557, "learning_rate": 1.007432022572985e-07, "loss": 0.3289, "step": 5694 }, { "epoch": 2.8272381267582327, "grad_norm": 0.28854477405548096, "learning_rate": 1.001669884029105e-07, "loss": 0.3361, "step": 5695 }, { "epoch": 2.8277345689227205, "grad_norm": 0.28941604495048523, "learning_rate": 9.959241046959611e-08, "loss": 0.331, "step": 5696 }, { "epoch": 2.828231011087208, "grad_norm": 0.32556310296058655, "learning_rate": 9.90194686491891e-08, "loss": 0.3632, "step": 5697 }, { "epoch": 2.8287274532516964, "grad_norm": 0.29297128319740295, "learning_rate": 9.84481631329809e-08, "loss": 0.2841, "step": 5698 }, { "epoch": 2.829223895416184, "grad_norm": 0.3109973669052124, "learning_rate": 9.787849411171391e-08, "loss": 0.3177, "step": 5699 }, { "epoch": 2.829720337580672, "grad_norm": 0.30785611271858215, "learning_rate": 9.731046177558545e-08, "loss": 0.3516, "step": 5700 }, { "epoch": 2.8302167797451596, "grad_norm": 0.324411004781723, "learning_rate": 9.674406631424549e-08, "loss": 0.3831, "step": 5701 }, { "epoch": 2.8307132219096474, "grad_norm": 0.3018746078014374, "learning_rate": 9.617930791679997e-08, "loss": 0.3217, "step": 5702 }, { "epoch": 2.8312096640741355, "grad_norm": 0.33468735218048096, "learning_rate": 9.561618677180418e-08, "loss": 0.3353, "step": 5703 }, { "epoch": 2.8317061062386233, "grad_norm": 0.3226318955421448, "learning_rate": 9.505470306726994e-08, "loss": 0.3413, "step": 5704 }, { "epoch": 2.832202548403111, "grad_norm": 0.29834654927253723, "learning_rate": 9.449485699066174e-08, "loss": 0.3398, "step": 5705 }, { "epoch": 2.8326989905675988, "grad_norm": 0.29896417260169983, "learning_rate": 9.393664872889619e-08, "loss": 0.3022, "step": 5706 }, { "epoch": 2.8331954327320865, "grad_norm": 0.3206222653388977, "learning_rate": 9.338007846834474e-08, "loss": 0.3498, "step": 5707 }, { "epoch": 2.8336918748965747, "grad_norm": 0.32197052240371704, "learning_rate": 9.282514639482986e-08, "loss": 0.3395, "step": 5708 }, { "epoch": 2.8341883170610624, "grad_norm": 0.3024064004421234, "learning_rate": 9.227185269362893e-08, "loss": 0.3384, "step": 5709 }, { "epoch": 2.83468475922555, "grad_norm": 0.3035600781440735, "learning_rate": 9.172019754947192e-08, "loss": 0.3114, "step": 5710 }, { "epoch": 2.835181201390038, "grad_norm": 0.30061113834381104, "learning_rate": 9.117018114654153e-08, "loss": 0.3106, "step": 5711 }, { "epoch": 2.8356776435545257, "grad_norm": 0.31428638100624084, "learning_rate": 9.062180366847306e-08, "loss": 0.3416, "step": 5712 }, { "epoch": 2.836174085719014, "grad_norm": 0.3191569447517395, "learning_rate": 9.007506529835452e-08, "loss": 0.3128, "step": 5713 }, { "epoch": 2.8366705278835016, "grad_norm": 0.32517746090888977, "learning_rate": 8.952996621872767e-08, "loss": 0.3587, "step": 5714 }, { "epoch": 2.8371669700479893, "grad_norm": 0.2979184687137604, "learning_rate": 8.898650661158582e-08, "loss": 0.3186, "step": 5715 }, { "epoch": 2.837663412212477, "grad_norm": 0.2907969653606415, "learning_rate": 8.844468665837546e-08, "loss": 0.2957, "step": 5716 }, { "epoch": 2.838159854376965, "grad_norm": 0.31449979543685913, "learning_rate": 8.790450653999527e-08, "loss": 0.3466, "step": 5717 }, { "epoch": 2.838656296541453, "grad_norm": 0.31880638003349304, "learning_rate": 8.736596643679762e-08, "loss": 0.3742, "step": 5718 }, { "epoch": 2.8391527387059408, "grad_norm": 0.2792821228504181, "learning_rate": 8.682906652858536e-08, "loss": 0.285, "step": 5719 }, { "epoch": 2.8396491808704285, "grad_norm": 0.29003533720970154, "learning_rate": 8.629380699461453e-08, "loss": 0.3349, "step": 5720 }, { "epoch": 2.8401456230349167, "grad_norm": 0.3204509913921356, "learning_rate": 8.576018801359553e-08, "loss": 0.3344, "step": 5721 }, { "epoch": 2.8406420651994044, "grad_norm": 0.34067076444625854, "learning_rate": 8.52282097636875e-08, "loss": 0.3147, "step": 5722 }, { "epoch": 2.841138507363892, "grad_norm": 0.29338762164115906, "learning_rate": 8.469787242250504e-08, "loss": 0.3057, "step": 5723 }, { "epoch": 2.84163494952838, "grad_norm": 0.3238431215286255, "learning_rate": 8.416917616711095e-08, "loss": 0.3504, "step": 5724 }, { "epoch": 2.8421313916928677, "grad_norm": 0.3115657567977905, "learning_rate": 8.364212117402515e-08, "loss": 0.3096, "step": 5725 }, { "epoch": 2.842627833857356, "grad_norm": 0.30511823296546936, "learning_rate": 8.311670761921576e-08, "loss": 0.3253, "step": 5726 }, { "epoch": 2.8431242760218436, "grad_norm": 0.31349778175354004, "learning_rate": 8.259293567810412e-08, "loss": 0.3231, "step": 5727 }, { "epoch": 2.8436207181863313, "grad_norm": 0.3228025734424591, "learning_rate": 8.207080552556313e-08, "loss": 0.3206, "step": 5728 }, { "epoch": 2.844117160350819, "grad_norm": 0.2949788570404053, "learning_rate": 8.155031733591889e-08, "loss": 0.3088, "step": 5729 }, { "epoch": 2.844613602515307, "grad_norm": 0.3068118989467621, "learning_rate": 8.103147128294742e-08, "loss": 0.3278, "step": 5730 }, { "epoch": 2.845110044679795, "grad_norm": 0.3074490427970886, "learning_rate": 8.051426753987734e-08, "loss": 0.2922, "step": 5731 }, { "epoch": 2.8456064868442827, "grad_norm": 0.32411497831344604, "learning_rate": 7.999870627938944e-08, "loss": 0.3775, "step": 5732 }, { "epoch": 2.8461029290087705, "grad_norm": 0.2910352945327759, "learning_rate": 7.94847876736149e-08, "loss": 0.2988, "step": 5733 }, { "epoch": 2.8465993711732582, "grad_norm": 0.30405598878860474, "learning_rate": 7.897251189413758e-08, "loss": 0.3131, "step": 5734 }, { "epoch": 2.847095813337746, "grad_norm": 0.3108201324939728, "learning_rate": 7.846187911199287e-08, "loss": 0.3498, "step": 5735 }, { "epoch": 2.847592255502234, "grad_norm": 0.33138665556907654, "learning_rate": 7.795288949766611e-08, "loss": 0.3451, "step": 5736 }, { "epoch": 2.848088697666722, "grad_norm": 0.31866568326950073, "learning_rate": 7.744554322109633e-08, "loss": 0.3077, "step": 5737 }, { "epoch": 2.8485851398312096, "grad_norm": 0.297761470079422, "learning_rate": 7.693984045167192e-08, "loss": 0.3326, "step": 5738 }, { "epoch": 2.8490815819956974, "grad_norm": 0.3170928657054901, "learning_rate": 7.643578135823338e-08, "loss": 0.3148, "step": 5739 }, { "epoch": 2.849578024160185, "grad_norm": 0.2813025414943695, "learning_rate": 7.593336610907221e-08, "loss": 0.2897, "step": 5740 }, { "epoch": 2.8500744663246733, "grad_norm": 0.3184492290019989, "learning_rate": 7.543259487193144e-08, "loss": 0.3471, "step": 5741 }, { "epoch": 2.850570908489161, "grad_norm": 0.3006981611251831, "learning_rate": 7.493346781400457e-08, "loss": 0.2909, "step": 5742 }, { "epoch": 2.851067350653649, "grad_norm": 0.32752877473831177, "learning_rate": 7.443598510193716e-08, "loss": 0.3285, "step": 5743 }, { "epoch": 2.8515637928181365, "grad_norm": 0.3082933723926544, "learning_rate": 7.394014690182583e-08, "loss": 0.3527, "step": 5744 }, { "epoch": 2.8520602349826243, "grad_norm": 0.30903735756874084, "learning_rate": 7.344595337921534e-08, "loss": 0.3345, "step": 5745 }, { "epoch": 2.8525566771471125, "grad_norm": 0.30952003598213196, "learning_rate": 7.29534046991054e-08, "loss": 0.32, "step": 5746 }, { "epoch": 2.8530531193116, "grad_norm": 0.31480666995048523, "learning_rate": 7.246250102594332e-08, "loss": 0.3232, "step": 5747 }, { "epoch": 2.853549561476088, "grad_norm": 0.3127814829349518, "learning_rate": 7.197324252362969e-08, "loss": 0.3282, "step": 5748 }, { "epoch": 2.854046003640576, "grad_norm": 0.3135005533695221, "learning_rate": 7.148562935551384e-08, "loss": 0.3244, "step": 5749 }, { "epoch": 2.8545424458050634, "grad_norm": 0.32655069231987, "learning_rate": 7.099966168439665e-08, "loss": 0.3164, "step": 5750 }, { "epoch": 2.8550388879695516, "grad_norm": 0.30641815066337585, "learning_rate": 7.051533967252999e-08, "loss": 0.3374, "step": 5751 }, { "epoch": 2.8555353301340394, "grad_norm": 0.3098739981651306, "learning_rate": 7.003266348161508e-08, "loss": 0.3079, "step": 5752 }, { "epoch": 2.856031772298527, "grad_norm": 0.33899176120758057, "learning_rate": 6.955163327280467e-08, "loss": 0.3539, "step": 5753 }, { "epoch": 2.8565282144630153, "grad_norm": 0.3256663978099823, "learning_rate": 6.907224920670141e-08, "loss": 0.3373, "step": 5754 }, { "epoch": 2.857024656627503, "grad_norm": 0.3031977117061615, "learning_rate": 6.859451144336005e-08, "loss": 0.3297, "step": 5755 }, { "epoch": 2.857521098791991, "grad_norm": 0.32100799679756165, "learning_rate": 6.811842014228243e-08, "loss": 0.363, "step": 5756 }, { "epoch": 2.8580175409564785, "grad_norm": 0.2729087471961975, "learning_rate": 6.764397546242307e-08, "loss": 0.3068, "step": 5757 }, { "epoch": 2.8585139831209663, "grad_norm": 0.28879278898239136, "learning_rate": 6.717117756218639e-08, "loss": 0.3204, "step": 5758 }, { "epoch": 2.8590104252854545, "grad_norm": 0.3181460201740265, "learning_rate": 6.670002659942664e-08, "loss": 0.347, "step": 5759 }, { "epoch": 2.859506867449942, "grad_norm": 0.31808021664619446, "learning_rate": 6.623052273144914e-08, "loss": 0.3695, "step": 5760 }, { "epoch": 2.86000330961443, "grad_norm": 0.30265170335769653, "learning_rate": 6.576266611500681e-08, "loss": 0.3387, "step": 5761 }, { "epoch": 2.8604997517789177, "grad_norm": 0.3159080445766449, "learning_rate": 6.529645690630526e-08, "loss": 0.3527, "step": 5762 }, { "epoch": 2.8609961939434054, "grad_norm": 0.31195518374443054, "learning_rate": 6.483189526099887e-08, "loss": 0.2805, "step": 5763 }, { "epoch": 2.8614926361078936, "grad_norm": 0.32539504766464233, "learning_rate": 6.436898133419301e-08, "loss": 0.3415, "step": 5764 }, { "epoch": 2.8619890782723814, "grad_norm": 0.30869197845458984, "learning_rate": 6.390771528044016e-08, "loss": 0.3215, "step": 5765 }, { "epoch": 2.862485520436869, "grad_norm": 0.3232330083847046, "learning_rate": 6.344809725374601e-08, "loss": 0.3619, "step": 5766 }, { "epoch": 2.862981962601357, "grad_norm": 0.3377227485179901, "learning_rate": 6.29901274075645e-08, "loss": 0.3543, "step": 5767 }, { "epoch": 2.8634784047658446, "grad_norm": 0.2961440682411194, "learning_rate": 6.253380589479829e-08, "loss": 0.3019, "step": 5768 }, { "epoch": 2.8639748469303328, "grad_norm": 0.3096289336681366, "learning_rate": 6.207913286780221e-08, "loss": 0.3028, "step": 5769 }, { "epoch": 2.8644712890948205, "grad_norm": 0.33476853370666504, "learning_rate": 6.162610847837813e-08, "loss": 0.3355, "step": 5770 }, { "epoch": 2.8649677312593083, "grad_norm": 0.3208600580692291, "learning_rate": 6.117473287777897e-08, "loss": 0.3495, "step": 5771 }, { "epoch": 2.865464173423796, "grad_norm": 0.319747656583786, "learning_rate": 6.072500621670585e-08, "loss": 0.3222, "step": 5772 }, { "epoch": 2.8659606155882837, "grad_norm": 0.2939278781414032, "learning_rate": 6.027692864531198e-08, "loss": 0.3882, "step": 5773 }, { "epoch": 2.866457057752772, "grad_norm": 0.3029797375202179, "learning_rate": 5.983050031319714e-08, "loss": 0.3205, "step": 5774 }, { "epoch": 2.8669534999172597, "grad_norm": 0.33994731307029724, "learning_rate": 5.938572136941156e-08, "loss": 0.3486, "step": 5775 }, { "epoch": 2.8674499420817474, "grad_norm": 0.3076452612876892, "learning_rate": 5.8942591962455334e-08, "loss": 0.3183, "step": 5776 }, { "epoch": 2.867946384246235, "grad_norm": 0.3038314878940582, "learning_rate": 5.8501112240277325e-08, "loss": 0.3067, "step": 5777 }, { "epoch": 2.868442826410723, "grad_norm": 0.32604724168777466, "learning_rate": 5.806128235027575e-08, "loss": 0.3194, "step": 5778 }, { "epoch": 2.868939268575211, "grad_norm": 0.2838820219039917, "learning_rate": 5.762310243929703e-08, "loss": 0.3132, "step": 5779 }, { "epoch": 2.869435710739699, "grad_norm": 0.29864129424095154, "learning_rate": 5.718657265363858e-08, "loss": 0.3353, "step": 5780 }, { "epoch": 2.8699321529041866, "grad_norm": 0.31851616501808167, "learning_rate": 5.6751693139044385e-08, "loss": 0.3559, "step": 5781 }, { "epoch": 2.8704285950686748, "grad_norm": 0.3069516718387604, "learning_rate": 5.6318464040710505e-08, "loss": 0.2888, "step": 5782 }, { "epoch": 2.8709250372331625, "grad_norm": 0.30196094512939453, "learning_rate": 5.5886885503279584e-08, "loss": 0.2828, "step": 5783 }, { "epoch": 2.8714214793976502, "grad_norm": 0.301567018032074, "learning_rate": 5.5456957670843584e-08, "loss": 0.3218, "step": 5784 }, { "epoch": 2.871917921562138, "grad_norm": 0.30610033869743347, "learning_rate": 5.502868068694489e-08, "loss": 0.309, "step": 5785 }, { "epoch": 2.8724143637266257, "grad_norm": 0.32572701573371887, "learning_rate": 5.460205469457247e-08, "loss": 0.354, "step": 5786 }, { "epoch": 2.872910805891114, "grad_norm": 0.33454835414886475, "learning_rate": 5.417707983616571e-08, "loss": 0.3227, "step": 5787 }, { "epoch": 2.8734072480556017, "grad_norm": 0.30338671803474426, "learning_rate": 5.375375625361168e-08, "loss": 0.2697, "step": 5788 }, { "epoch": 2.8739036902200894, "grad_norm": 0.3178071081638336, "learning_rate": 5.3332084088247305e-08, "loss": 0.3547, "step": 5789 }, { "epoch": 2.874400132384577, "grad_norm": 0.30787140130996704, "learning_rate": 5.2912063480857204e-08, "loss": 0.3384, "step": 5790 }, { "epoch": 2.874896574549065, "grad_norm": 0.30330169200897217, "learning_rate": 5.2493694571673635e-08, "loss": 0.3219, "step": 5791 }, { "epoch": 2.875393016713553, "grad_norm": 0.2779994606971741, "learning_rate": 5.207697750038099e-08, "loss": 0.3054, "step": 5792 }, { "epoch": 2.875889458878041, "grad_norm": 0.29714128375053406, "learning_rate": 5.166191240610741e-08, "loss": 0.311, "step": 5793 }, { "epoch": 2.8763859010425286, "grad_norm": 0.3114349842071533, "learning_rate": 5.1248499427433704e-08, "loss": 0.3397, "step": 5794 }, { "epoch": 2.8768823432070163, "grad_norm": 0.32061293721199036, "learning_rate": 5.083673870238559e-08, "loss": 0.3298, "step": 5795 }, { "epoch": 2.877378785371504, "grad_norm": 0.287615567445755, "learning_rate": 5.0426630368440314e-08, "loss": 0.3099, "step": 5796 }, { "epoch": 2.8778752275359922, "grad_norm": 0.3219422399997711, "learning_rate": 5.001817456252111e-08, "loss": 0.3326, "step": 5797 }, { "epoch": 2.87837166970048, "grad_norm": 0.32116472721099854, "learning_rate": 4.9611371421000034e-08, "loss": 0.3474, "step": 5798 }, { "epoch": 2.8788681118649677, "grad_norm": 0.3007492423057556, "learning_rate": 4.9206221079698414e-08, "loss": 0.3146, "step": 5799 }, { "epoch": 2.8793645540294555, "grad_norm": 0.31575191020965576, "learning_rate": 4.8802723673884164e-08, "loss": 0.3248, "step": 5800 }, { "epoch": 2.879860996193943, "grad_norm": 0.3252822756767273, "learning_rate": 4.8400879338274534e-08, "loss": 0.3416, "step": 5801 }, { "epoch": 2.8803574383584314, "grad_norm": 0.3075472414493561, "learning_rate": 4.800068820703385e-08, "loss": 0.3257, "step": 5802 }, { "epoch": 2.880853880522919, "grad_norm": 0.30404651165008545, "learning_rate": 4.760215041377636e-08, "loss": 0.2897, "step": 5803 }, { "epoch": 2.881350322687407, "grad_norm": 0.3030191957950592, "learning_rate": 4.7205266091561175e-08, "loss": 0.315, "step": 5804 }, { "epoch": 2.8818467648518946, "grad_norm": 0.2856552302837372, "learning_rate": 4.6810035372898964e-08, "loss": 0.3348, "step": 5805 }, { "epoch": 2.8823432070163824, "grad_norm": 0.30564725399017334, "learning_rate": 4.641645838974473e-08, "loss": 0.3431, "step": 5806 }, { "epoch": 2.8828396491808705, "grad_norm": 0.314155250787735, "learning_rate": 4.602453527350503e-08, "loss": 0.294, "step": 5807 }, { "epoch": 2.8833360913453583, "grad_norm": 0.32501888275146484, "learning_rate": 4.5634266155031304e-08, "loss": 0.3614, "step": 5808 }, { "epoch": 2.883832533509846, "grad_norm": 0.30773991346359253, "learning_rate": 4.524565116462321e-08, "loss": 0.3486, "step": 5809 }, { "epoch": 2.884328975674334, "grad_norm": 0.29579147696495056, "learning_rate": 4.4858690432030285e-08, "loss": 0.2986, "step": 5810 }, { "epoch": 2.8848254178388215, "grad_norm": 0.2993593215942383, "learning_rate": 4.447338408644697e-08, "loss": 0.3102, "step": 5811 }, { "epoch": 2.8853218600033097, "grad_norm": 0.32053491473197937, "learning_rate": 4.4089732256517026e-08, "loss": 0.3244, "step": 5812 }, { "epoch": 2.8858183021677974, "grad_norm": 0.4077499508857727, "learning_rate": 4.370773507033077e-08, "loss": 0.3273, "step": 5813 }, { "epoch": 2.886314744332285, "grad_norm": 0.3154866397380829, "learning_rate": 4.332739265542785e-08, "loss": 0.337, "step": 5814 }, { "epoch": 2.8868111864967734, "grad_norm": 0.28676971793174744, "learning_rate": 4.294870513879335e-08, "loss": 0.34, "step": 5815 }, { "epoch": 2.887307628661261, "grad_norm": 0.29382723569869995, "learning_rate": 4.257167264686113e-08, "loss": 0.3313, "step": 5816 }, { "epoch": 2.887804070825749, "grad_norm": 0.31032368540763855, "learning_rate": 4.219629530551217e-08, "loss": 0.3494, "step": 5817 }, { "epoch": 2.8883005129902366, "grad_norm": 0.30747175216674805, "learning_rate": 4.1822573240073995e-08, "loss": 0.3737, "step": 5818 }, { "epoch": 2.8887969551547243, "grad_norm": 0.2847530245780945, "learning_rate": 4.145050657532346e-08, "loss": 0.3039, "step": 5819 }, { "epoch": 2.8892933973192125, "grad_norm": 0.3081226944923401, "learning_rate": 4.108009543548286e-08, "loss": 0.3161, "step": 5820 }, { "epoch": 2.8897898394837003, "grad_norm": 0.3417774438858032, "learning_rate": 4.071133994422216e-08, "loss": 0.3279, "step": 5821 }, { "epoch": 2.890286281648188, "grad_norm": 0.3259845972061157, "learning_rate": 4.034424022465899e-08, "loss": 0.3571, "step": 5822 }, { "epoch": 2.8907827238126758, "grad_norm": 0.3360610902309418, "learning_rate": 3.9978796399358086e-08, "loss": 0.3051, "step": 5823 }, { "epoch": 2.8912791659771635, "grad_norm": 0.32194390892982483, "learning_rate": 3.961500859033074e-08, "loss": 0.3601, "step": 5824 }, { "epoch": 2.8917756081416517, "grad_norm": 0.2976999282836914, "learning_rate": 3.925287691903701e-08, "loss": 0.3202, "step": 5825 }, { "epoch": 2.8922720503061394, "grad_norm": 0.3012429475784302, "learning_rate": 3.8892401506381846e-08, "loss": 0.3144, "step": 5826 }, { "epoch": 2.892768492470627, "grad_norm": 0.3271055519580841, "learning_rate": 3.8533582472717877e-08, "loss": 0.3768, "step": 5827 }, { "epoch": 2.893264934635115, "grad_norm": 0.292473703622818, "learning_rate": 3.817641993784593e-08, "loss": 0.3079, "step": 5828 }, { "epoch": 2.8937613767996027, "grad_norm": 0.29092535376548767, "learning_rate": 3.782091402101229e-08, "loss": 0.3204, "step": 5829 }, { "epoch": 2.894257818964091, "grad_norm": 0.3148901164531708, "learning_rate": 3.746706484091145e-08, "loss": 0.4119, "step": 5830 }, { "epoch": 2.8947542611285786, "grad_norm": 0.2915846109390259, "learning_rate": 3.711487251568335e-08, "loss": 0.2911, "step": 5831 }, { "epoch": 2.8952507032930663, "grad_norm": 0.31672054529190063, "learning_rate": 3.67643371629145e-08, "loss": 0.3725, "step": 5832 }, { "epoch": 2.895747145457554, "grad_norm": 0.2956191897392273, "learning_rate": 3.641545889964126e-08, "loss": 0.2783, "step": 5833 }, { "epoch": 2.896243587622042, "grad_norm": 0.3035525679588318, "learning_rate": 3.606823784234326e-08, "loss": 0.3326, "step": 5834 }, { "epoch": 2.89674002978653, "grad_norm": 0.3324674069881439, "learning_rate": 3.572267410694885e-08, "loss": 0.3453, "step": 5835 }, { "epoch": 2.8972364719510177, "grad_norm": 0.30910176038742065, "learning_rate": 3.5378767808831315e-08, "loss": 0.2967, "step": 5836 }, { "epoch": 2.8977329141155055, "grad_norm": 0.28564518690109253, "learning_rate": 3.503651906281269e-08, "loss": 0.3172, "step": 5837 }, { "epoch": 2.8982293562799932, "grad_norm": 0.2931228280067444, "learning_rate": 3.469592798316046e-08, "loss": 0.3284, "step": 5838 }, { "epoch": 2.898725798444481, "grad_norm": 0.32074403762817383, "learning_rate": 3.435699468358755e-08, "loss": 0.3753, "step": 5839 }, { "epoch": 2.899222240608969, "grad_norm": 0.32227563858032227, "learning_rate": 3.401971927725623e-08, "loss": 0.3745, "step": 5840 }, { "epoch": 2.899718682773457, "grad_norm": 0.2851843237876892, "learning_rate": 3.368410187677196e-08, "loss": 0.2724, "step": 5841 }, { "epoch": 2.9002151249379446, "grad_norm": 0.29044628143310547, "learning_rate": 3.3350142594190115e-08, "loss": 0.3217, "step": 5842 }, { "epoch": 2.900711567102433, "grad_norm": 0.3106456398963928, "learning_rate": 3.301784154100818e-08, "loss": 0.3109, "step": 5843 }, { "epoch": 2.90120800926692, "grad_norm": 0.2966822683811188, "learning_rate": 3.268719882817517e-08, "loss": 0.3027, "step": 5844 }, { "epoch": 2.9017044514314083, "grad_norm": 0.29523953795433044, "learning_rate": 3.235821456608168e-08, "loss": 0.3249, "step": 5845 }, { "epoch": 2.902200893595896, "grad_norm": 0.32273781299591064, "learning_rate": 3.203088886456762e-08, "loss": 0.3532, "step": 5846 }, { "epoch": 2.902697335760384, "grad_norm": 0.31262972950935364, "learning_rate": 3.17052218329178e-08, "loss": 0.3143, "step": 5847 }, { "epoch": 2.903193777924872, "grad_norm": 0.3046535849571228, "learning_rate": 3.138121357986357e-08, "loss": 0.3106, "step": 5848 }, { "epoch": 2.9036902200893597, "grad_norm": 0.28787654638290405, "learning_rate": 3.105886421358284e-08, "loss": 0.3285, "step": 5849 }, { "epoch": 2.9041866622538475, "grad_norm": 0.3037766218185425, "learning_rate": 3.073817384169841e-08, "loss": 0.328, "step": 5850 }, { "epoch": 2.904683104418335, "grad_norm": 0.28795555233955383, "learning_rate": 3.041914257128131e-08, "loss": 0.3011, "step": 5851 }, { "epoch": 2.905179546582823, "grad_norm": 0.31232765316963196, "learning_rate": 3.010177050884633e-08, "loss": 0.3148, "step": 5852 }, { "epoch": 2.905675988747311, "grad_norm": 0.3126181662082672, "learning_rate": 2.9786057760355925e-08, "loss": 0.3325, "step": 5853 }, { "epoch": 2.906172430911799, "grad_norm": 0.34029489755630493, "learning_rate": 2.9472004431218004e-08, "loss": 0.302, "step": 5854 }, { "epoch": 2.9066688730762866, "grad_norm": 0.304362952709198, "learning_rate": 2.9159610626286472e-08, "loss": 0.3202, "step": 5855 }, { "epoch": 2.9071653152407744, "grad_norm": 0.31800174713134766, "learning_rate": 2.8848876449860673e-08, "loss": 0.331, "step": 5856 }, { "epoch": 2.907661757405262, "grad_norm": 0.27943331003189087, "learning_rate": 2.8539802005687068e-08, "loss": 0.282, "step": 5857 }, { "epoch": 2.9081581995697503, "grad_norm": 0.28549519181251526, "learning_rate": 2.823238739695644e-08, "loss": 0.3218, "step": 5858 }, { "epoch": 2.908654641734238, "grad_norm": 0.31783923506736755, "learning_rate": 2.792663272630669e-08, "loss": 0.3673, "step": 5859 }, { "epoch": 2.909151083898726, "grad_norm": 0.30784013867378235, "learning_rate": 2.7622538095820606e-08, "loss": 0.303, "step": 5860 }, { "epoch": 2.9096475260632135, "grad_norm": 0.2944496273994446, "learning_rate": 2.7320103607027527e-08, "loss": 0.3219, "step": 5861 }, { "epoch": 2.9101439682277013, "grad_norm": 0.3251620829105377, "learning_rate": 2.701932936090168e-08, "loss": 0.3416, "step": 5862 }, { "epoch": 2.9106404103921895, "grad_norm": 0.31784138083457947, "learning_rate": 2.672021545786385e-08, "loss": 0.3698, "step": 5863 }, { "epoch": 2.911136852556677, "grad_norm": 0.31645044684410095, "learning_rate": 2.642276199777971e-08, "loss": 0.2934, "step": 5864 }, { "epoch": 2.911633294721165, "grad_norm": 0.3101307153701782, "learning_rate": 2.612696907996093e-08, "loss": 0.3466, "step": 5865 }, { "epoch": 2.9121297368856527, "grad_norm": 0.294542133808136, "learning_rate": 2.583283680316462e-08, "loss": 0.3461, "step": 5866 }, { "epoch": 2.9126261790501404, "grad_norm": 0.2836979031562805, "learning_rate": 2.5540365265594446e-08, "loss": 0.3098, "step": 5867 }, { "epoch": 2.9131226212146286, "grad_norm": 0.3115215003490448, "learning_rate": 2.5249554564897305e-08, "loss": 0.3622, "step": 5868 }, { "epoch": 2.9136190633791164, "grad_norm": 0.34972482919692993, "learning_rate": 2.496040479816775e-08, "loss": 0.3205, "step": 5869 }, { "epoch": 2.914115505543604, "grad_norm": 0.3082001507282257, "learning_rate": 2.467291606194522e-08, "loss": 0.2989, "step": 5870 }, { "epoch": 2.914611947708092, "grad_norm": 0.3062213361263275, "learning_rate": 2.4387088452214046e-08, "loss": 0.3316, "step": 5871 }, { "epoch": 2.9151083898725796, "grad_norm": 0.33535268902778625, "learning_rate": 2.4102922064404566e-08, "loss": 0.3313, "step": 5872 }, { "epoch": 2.9156048320370678, "grad_norm": 0.28765398263931274, "learning_rate": 2.3820416993391437e-08, "loss": 0.3227, "step": 5873 }, { "epoch": 2.9161012742015555, "grad_norm": 0.2921086549758911, "learning_rate": 2.3539573333496436e-08, "loss": 0.3342, "step": 5874 }, { "epoch": 2.9165977163660433, "grad_norm": 0.2890383005142212, "learning_rate": 2.326039117848511e-08, "loss": 0.2957, "step": 5875 }, { "epoch": 2.9170941585305314, "grad_norm": 0.3174726963043213, "learning_rate": 2.298287062156901e-08, "loss": 0.366, "step": 5876 }, { "epoch": 2.917590600695019, "grad_norm": 0.2731146812438965, "learning_rate": 2.270701175540402e-08, "loss": 0.29, "step": 5877 }, { "epoch": 2.918087042859507, "grad_norm": 0.32601380348205566, "learning_rate": 2.243281467209313e-08, "loss": 0.3392, "step": 5878 }, { "epoch": 2.9185834850239947, "grad_norm": 0.3208707869052887, "learning_rate": 2.2160279463182554e-08, "loss": 0.3303, "step": 5879 }, { "epoch": 2.9190799271884824, "grad_norm": 0.32191935181617737, "learning_rate": 2.1889406219663955e-08, "loss": 0.391, "step": 5880 }, { "epoch": 2.9195763693529706, "grad_norm": 0.29718533158302307, "learning_rate": 2.16201950319761e-08, "loss": 0.2995, "step": 5881 }, { "epoch": 2.9200728115174583, "grad_norm": 0.30364692211151123, "learning_rate": 2.135264598999931e-08, "loss": 0.3178, "step": 5882 }, { "epoch": 2.920569253681946, "grad_norm": 0.29370442032814026, "learning_rate": 2.1086759183062132e-08, "loss": 0.3315, "step": 5883 }, { "epoch": 2.921065695846434, "grad_norm": 0.31359750032424927, "learning_rate": 2.0822534699936892e-08, "loss": 0.3179, "step": 5884 }, { "epoch": 2.9215621380109216, "grad_norm": 0.3221331238746643, "learning_rate": 2.0559972628840795e-08, "loss": 0.3567, "step": 5885 }, { "epoch": 2.9220585801754098, "grad_norm": 0.3022630512714386, "learning_rate": 2.0299073057435946e-08, "loss": 0.3218, "step": 5886 }, { "epoch": 2.9225550223398975, "grad_norm": 0.28812354803085327, "learning_rate": 2.0039836072829888e-08, "loss": 0.3295, "step": 5887 }, { "epoch": 2.9230514645043852, "grad_norm": 0.3022056221961975, "learning_rate": 1.978226176157505e-08, "loss": 0.352, "step": 5888 }, { "epoch": 2.923547906668873, "grad_norm": 0.30860403180122375, "learning_rate": 1.9526350209667645e-08, "loss": 0.3172, "step": 5889 }, { "epoch": 2.9240443488333607, "grad_norm": 0.3049660921096802, "learning_rate": 1.9272101502550432e-08, "loss": 0.3212, "step": 5890 }, { "epoch": 2.924540790997849, "grad_norm": 0.317765474319458, "learning_rate": 1.901951572510996e-08, "loss": 0.3656, "step": 5891 }, { "epoch": 2.9250372331623367, "grad_norm": 0.28203752636909485, "learning_rate": 1.8768592961677655e-08, "loss": 0.3178, "step": 5892 }, { "epoch": 2.9255336753268244, "grad_norm": 0.32049262523651123, "learning_rate": 1.8519333296029286e-08, "loss": 0.3448, "step": 5893 }, { "epoch": 2.926030117491312, "grad_norm": 0.32309281826019287, "learning_rate": 1.827173681138661e-08, "loss": 0.3369, "step": 5894 }, { "epoch": 2.9265265596558, "grad_norm": 0.3210950791835785, "learning_rate": 1.802580359041517e-08, "loss": 0.328, "step": 5895 }, { "epoch": 2.927023001820288, "grad_norm": 0.3002423942089081, "learning_rate": 1.7781533715225952e-08, "loss": 0.3601, "step": 5896 }, { "epoch": 2.927519443984776, "grad_norm": 0.2984199523925781, "learning_rate": 1.7538927267372606e-08, "loss": 0.2838, "step": 5897 }, { "epoch": 2.9280158861492636, "grad_norm": 0.35004207491874695, "learning_rate": 1.7297984327856456e-08, "loss": 0.3524, "step": 5898 }, { "epoch": 2.9285123283137513, "grad_norm": 0.30129867792129517, "learning_rate": 1.7058704977120366e-08, "loss": 0.3175, "step": 5899 }, { "epoch": 2.929008770478239, "grad_norm": 0.2996237277984619, "learning_rate": 1.6821089295053773e-08, "loss": 0.3617, "step": 5900 }, { "epoch": 2.9295052126427272, "grad_norm": 0.31886935234069824, "learning_rate": 1.6585137360990434e-08, "loss": 0.3368, "step": 5901 }, { "epoch": 2.930001654807215, "grad_norm": 0.31499427556991577, "learning_rate": 1.6350849253708444e-08, "loss": 0.3425, "step": 5902 }, { "epoch": 2.9304980969717027, "grad_norm": 0.3245478868484497, "learning_rate": 1.6118225051429125e-08, "loss": 0.3234, "step": 5903 }, { "epoch": 2.930994539136191, "grad_norm": 0.3074646592140198, "learning_rate": 1.5887264831820348e-08, "loss": 0.3097, "step": 5904 }, { "epoch": 2.931490981300678, "grad_norm": 0.30583733320236206, "learning_rate": 1.5657968671993208e-08, "loss": 0.325, "step": 5905 }, { "epoch": 2.9319874234651664, "grad_norm": 0.3401729166507721, "learning_rate": 1.543033664850313e-08, "loss": 0.3505, "step": 5906 }, { "epoch": 2.932483865629654, "grad_norm": 0.31007111072540283, "learning_rate": 1.5204368837350437e-08, "loss": 0.3309, "step": 5907 }, { "epoch": 2.932980307794142, "grad_norm": 0.30145272612571716, "learning_rate": 1.498006531398033e-08, "loss": 0.3122, "step": 5908 }, { "epoch": 2.93347674995863, "grad_norm": 0.305314302444458, "learning_rate": 1.4757426153280685e-08, "loss": 0.3218, "step": 5909 }, { "epoch": 2.933973192123118, "grad_norm": 0.3236352205276489, "learning_rate": 1.4536451429585374e-08, "loss": 0.3424, "step": 5910 }, { "epoch": 2.9344696342876055, "grad_norm": 0.30952391028404236, "learning_rate": 1.4317141216671493e-08, "loss": 0.2982, "step": 5911 }, { "epoch": 2.9349660764520933, "grad_norm": 0.305591881275177, "learning_rate": 1.409949558776047e-08, "loss": 0.3081, "step": 5912 }, { "epoch": 2.935462518616581, "grad_norm": 0.30115699768066406, "learning_rate": 1.3883514615519178e-08, "loss": 0.3413, "step": 5913 }, { "epoch": 2.935958960781069, "grad_norm": 0.30682143568992615, "learning_rate": 1.3669198372056602e-08, "loss": 0.326, "step": 5914 }, { "epoch": 2.936455402945557, "grad_norm": 0.3249044716358185, "learning_rate": 1.3456546928928282e-08, "loss": 0.3532, "step": 5915 }, { "epoch": 2.9369518451100447, "grad_norm": 0.3063971698284149, "learning_rate": 1.324556035713187e-08, "loss": 0.3083, "step": 5916 }, { "epoch": 2.9374482872745324, "grad_norm": 0.3090262711048126, "learning_rate": 1.3036238727110462e-08, "loss": 0.3219, "step": 5917 }, { "epoch": 2.93794472943902, "grad_norm": 0.3064492344856262, "learning_rate": 1.2828582108750376e-08, "loss": 0.3177, "step": 5918 }, { "epoch": 2.9384411716035084, "grad_norm": 0.2841719388961792, "learning_rate": 1.2622590571383376e-08, "loss": 0.3273, "step": 5919 }, { "epoch": 2.938937613767996, "grad_norm": 0.28387898206710815, "learning_rate": 1.241826418378389e-08, "loss": 0.3031, "step": 5920 }, { "epoch": 2.939434055932484, "grad_norm": 0.29712343215942383, "learning_rate": 1.2215603014170685e-08, "loss": 0.3321, "step": 5921 }, { "epoch": 2.9399304980969716, "grad_norm": 0.312551885843277, "learning_rate": 1.2014607130207967e-08, "loss": 0.3335, "step": 5922 }, { "epoch": 2.9404269402614593, "grad_norm": 0.30224940180778503, "learning_rate": 1.1815276599001501e-08, "loss": 0.337, "step": 5923 }, { "epoch": 2.9409233824259475, "grad_norm": 0.3305496275424957, "learning_rate": 1.1617611487103054e-08, "loss": 0.357, "step": 5924 }, { "epoch": 2.9414198245904353, "grad_norm": 0.31217896938323975, "learning_rate": 1.1421611860507054e-08, "loss": 0.3505, "step": 5925 }, { "epoch": 2.941916266754923, "grad_norm": 0.29986950755119324, "learning_rate": 1.1227277784652823e-08, "loss": 0.2942, "step": 5926 }, { "epoch": 2.9424127089194108, "grad_norm": 0.3207274079322815, "learning_rate": 1.1034609324423463e-08, "loss": 0.3624, "step": 5927 }, { "epoch": 2.9429091510838985, "grad_norm": 0.28080886602401733, "learning_rate": 1.084360654414529e-08, "loss": 0.3033, "step": 5928 }, { "epoch": 2.9434055932483867, "grad_norm": 0.3143976926803589, "learning_rate": 1.0654269507589522e-08, "loss": 0.361, "step": 5929 }, { "epoch": 2.9439020354128744, "grad_norm": 0.28652146458625793, "learning_rate": 1.0466598277970031e-08, "loss": 0.2853, "step": 5930 }, { "epoch": 2.944398477577362, "grad_norm": 0.31364214420318604, "learning_rate": 1.0280592917945032e-08, "loss": 0.3287, "step": 5931 }, { "epoch": 2.94489491974185, "grad_norm": 0.2961176931858063, "learning_rate": 1.009625348961707e-08, "loss": 0.3323, "step": 5932 }, { "epoch": 2.9453913619063377, "grad_norm": 0.3105924129486084, "learning_rate": 9.913580054532468e-09, "loss": 0.3533, "step": 5933 }, { "epoch": 2.945887804070826, "grad_norm": 0.3234111964702606, "learning_rate": 9.732572673680218e-09, "loss": 0.3151, "step": 5934 }, { "epoch": 2.9463842462353136, "grad_norm": 0.3280419111251831, "learning_rate": 9.5532314074942e-09, "loss": 0.3329, "step": 5935 }, { "epoch": 2.9468806883998013, "grad_norm": 0.3047204613685608, "learning_rate": 9.375556315850964e-09, "loss": 0.2865, "step": 5936 }, { "epoch": 2.9473771305642895, "grad_norm": 0.31798988580703735, "learning_rate": 9.199547458071945e-09, "loss": 0.3496, "step": 5937 }, { "epoch": 2.9478735727287773, "grad_norm": 0.33470645546913147, "learning_rate": 9.025204892921801e-09, "loss": 0.3301, "step": 5938 }, { "epoch": 2.948370014893265, "grad_norm": 0.3146890103816986, "learning_rate": 8.852528678608418e-09, "loss": 0.3332, "step": 5939 }, { "epoch": 2.9488664570577527, "grad_norm": 0.29763561487197876, "learning_rate": 8.681518872784011e-09, "loss": 0.3267, "step": 5940 }, { "epoch": 2.9493628992222405, "grad_norm": 0.2943956255912781, "learning_rate": 8.512175532543466e-09, "loss": 0.3365, "step": 5941 }, { "epoch": 2.9498593413867287, "grad_norm": 0.3086245656013489, "learning_rate": 8.344498714427107e-09, "loss": 0.3284, "step": 5942 }, { "epoch": 2.9503557835512164, "grad_norm": 0.3060939908027649, "learning_rate": 8.178488474416269e-09, "loss": 0.3052, "step": 5943 }, { "epoch": 2.950852225715704, "grad_norm": 0.3333144187927246, "learning_rate": 8.014144867938279e-09, "loss": 0.3751, "step": 5944 }, { "epoch": 2.951348667880192, "grad_norm": 0.31139710545539856, "learning_rate": 7.851467949862579e-09, "loss": 0.2979, "step": 5945 }, { "epoch": 2.9518451100446796, "grad_norm": 0.3212064802646637, "learning_rate": 7.690457774502947e-09, "loss": 0.3552, "step": 5946 }, { "epoch": 2.952341552209168, "grad_norm": 0.332133948802948, "learning_rate": 7.531114395615823e-09, "loss": 0.3059, "step": 5947 }, { "epoch": 2.9528379943736556, "grad_norm": 0.3243531286716461, "learning_rate": 7.373437866401434e-09, "loss": 0.3658, "step": 5948 }, { "epoch": 2.9533344365381433, "grad_norm": 0.3251253068447113, "learning_rate": 7.2174282395043314e-09, "loss": 0.376, "step": 5949 }, { "epoch": 2.953830878702631, "grad_norm": 0.2993064522743225, "learning_rate": 7.06308556701174e-09, "loss": 0.3062, "step": 5950 }, { "epoch": 2.954327320867119, "grad_norm": 0.2913537323474884, "learning_rate": 6.910409900454107e-09, "loss": 0.2962, "step": 5951 }, { "epoch": 2.954823763031607, "grad_norm": 0.4285197854042053, "learning_rate": 6.759401290806211e-09, "loss": 0.3326, "step": 5952 }, { "epoch": 2.9553202051960947, "grad_norm": 0.2917100191116333, "learning_rate": 6.610059788485501e-09, "loss": 0.2975, "step": 5953 }, { "epoch": 2.9558166473605825, "grad_norm": 0.33828312158584595, "learning_rate": 6.462385443353203e-09, "loss": 0.3844, "step": 5954 }, { "epoch": 2.95631308952507, "grad_norm": 0.3186664879322052, "learning_rate": 6.316378304713211e-09, "loss": 0.327, "step": 5955 }, { "epoch": 2.956809531689558, "grad_norm": 0.34092840552330017, "learning_rate": 6.172038421313753e-09, "loss": 0.3262, "step": 5956 }, { "epoch": 2.957305973854046, "grad_norm": 0.3141838014125824, "learning_rate": 6.029365841345724e-09, "loss": 0.3221, "step": 5957 }, { "epoch": 2.957802416018534, "grad_norm": 0.3098863959312439, "learning_rate": 5.888360612444355e-09, "loss": 0.3495, "step": 5958 }, { "epoch": 2.9582988581830216, "grad_norm": 0.3179643750190735, "learning_rate": 5.749022781686431e-09, "loss": 0.3361, "step": 5959 }, { "epoch": 2.9587953003475094, "grad_norm": 0.32224181294441223, "learning_rate": 5.6113523955941825e-09, "loss": 0.3278, "step": 5960 }, { "epoch": 2.959291742511997, "grad_norm": 0.3038732707500458, "learning_rate": 5.475349500130844e-09, "loss": 0.2924, "step": 5961 }, { "epoch": 2.9597881846764853, "grad_norm": 0.3187200129032135, "learning_rate": 5.341014140705092e-09, "loss": 0.321, "step": 5962 }, { "epoch": 2.960284626840973, "grad_norm": 0.3234194815158844, "learning_rate": 5.208346362167161e-09, "loss": 0.3532, "step": 5963 }, { "epoch": 2.960781069005461, "grad_norm": 0.3000408113002777, "learning_rate": 5.077346208811618e-09, "loss": 0.3299, "step": 5964 }, { "epoch": 2.961277511169949, "grad_norm": 0.29332759976387024, "learning_rate": 4.948013724375145e-09, "loss": 0.3407, "step": 5965 }, { "epoch": 2.9617739533344363, "grad_norm": 0.33273130655288696, "learning_rate": 4.820348952039311e-09, "loss": 0.3065, "step": 5966 }, { "epoch": 2.9622703954989245, "grad_norm": 0.3681550621986389, "learning_rate": 4.694351934427799e-09, "loss": 0.3486, "step": 5967 }, { "epoch": 2.962766837663412, "grad_norm": 0.3095383048057556, "learning_rate": 4.5700227136069585e-09, "loss": 0.2996, "step": 5968 }, { "epoch": 2.9632632798279, "grad_norm": 0.29451149702072144, "learning_rate": 4.447361331087474e-09, "loss": 0.3651, "step": 5969 }, { "epoch": 2.963759721992388, "grad_norm": 0.28505653142929077, "learning_rate": 4.326367827822142e-09, "loss": 0.3045, "step": 5970 }, { "epoch": 2.964256164156876, "grad_norm": 0.2969594895839691, "learning_rate": 4.207042244208092e-09, "loss": 0.3335, "step": 5971 }, { "epoch": 2.9647526063213636, "grad_norm": 0.29782095551490784, "learning_rate": 4.0893846200840135e-09, "loss": 0.3555, "step": 5972 }, { "epoch": 2.9652490484858514, "grad_norm": 0.3274759352207184, "learning_rate": 3.973394994733481e-09, "loss": 0.3297, "step": 5973 }, { "epoch": 2.965745490650339, "grad_norm": 0.30674874782562256, "learning_rate": 3.85907340688163e-09, "loss": 0.345, "step": 5974 }, { "epoch": 2.9662419328148273, "grad_norm": 0.28412994742393494, "learning_rate": 3.746419894697928e-09, "loss": 0.288, "step": 5975 }, { "epoch": 2.966738374979315, "grad_norm": 0.3106635808944702, "learning_rate": 3.635434495793955e-09, "loss": 0.3558, "step": 5976 }, { "epoch": 2.9672348171438028, "grad_norm": 0.3254022002220154, "learning_rate": 3.5261172472245143e-09, "loss": 0.3449, "step": 5977 }, { "epoch": 2.9677312593082905, "grad_norm": 0.3037446439266205, "learning_rate": 3.4184681854876335e-09, "loss": 0.3046, "step": 5978 }, { "epoch": 2.9682277014727783, "grad_norm": 0.2887590825557709, "learning_rate": 3.3124873465251172e-09, "loss": 0.3037, "step": 5979 }, { "epoch": 2.9687241436372664, "grad_norm": 0.3116016387939453, "learning_rate": 3.208174765720329e-09, "loss": 0.3616, "step": 5980 }, { "epoch": 2.969220585801754, "grad_norm": 0.3280855417251587, "learning_rate": 3.1055304779009645e-09, "loss": 0.3194, "step": 5981 }, { "epoch": 2.969717027966242, "grad_norm": 0.3287355601787567, "learning_rate": 3.004554517336833e-09, "loss": 0.375, "step": 5982 }, { "epoch": 2.9702134701307297, "grad_norm": 0.29693248867988586, "learning_rate": 2.905246917740967e-09, "loss": 0.2941, "step": 5983 }, { "epoch": 2.9707099122952174, "grad_norm": 0.30319640040397644, "learning_rate": 2.8076077122696222e-09, "loss": 0.3492, "step": 5984 }, { "epoch": 2.9712063544597056, "grad_norm": 0.2983386218547821, "learning_rate": 2.711636933522277e-09, "loss": 0.3444, "step": 5985 }, { "epoch": 2.9717027966241933, "grad_norm": 0.32061633467674255, "learning_rate": 2.617334613540523e-09, "loss": 0.2883, "step": 5986 }, { "epoch": 2.972199238788681, "grad_norm": 0.3204083740711212, "learning_rate": 2.5247007838091753e-09, "loss": 0.3184, "step": 5987 }, { "epoch": 2.972695680953169, "grad_norm": 0.31628870964050293, "learning_rate": 2.4337354752562714e-09, "loss": 0.361, "step": 5988 }, { "epoch": 2.9731921231176566, "grad_norm": 0.28504833579063416, "learning_rate": 2.3444387182530726e-09, "loss": 0.2997, "step": 5989 }, { "epoch": 2.9736885652821448, "grad_norm": 0.3058261275291443, "learning_rate": 2.256810542612953e-09, "loss": 0.3304, "step": 5990 }, { "epoch": 2.9741850074466325, "grad_norm": 0.3257979154586792, "learning_rate": 2.170850977592509e-09, "loss": 0.342, "step": 5991 }, { "epoch": 2.9746814496111202, "grad_norm": 0.29660844802856445, "learning_rate": 2.0865600518915618e-09, "loss": 0.3366, "step": 5992 }, { "epoch": 2.975177891775608, "grad_norm": 0.29286426305770874, "learning_rate": 2.0039377936525995e-09, "loss": 0.3079, "step": 5993 }, { "epoch": 2.9756743339400957, "grad_norm": 0.3247978389263153, "learning_rate": 1.922984230460778e-09, "loss": 0.3456, "step": 5994 }, { "epoch": 2.976170776104584, "grad_norm": 0.30106034874916077, "learning_rate": 1.8436993893444777e-09, "loss": 0.3498, "step": 5995 }, { "epoch": 2.9766672182690717, "grad_norm": 0.3157024383544922, "learning_rate": 1.7660832967741904e-09, "loss": 0.3382, "step": 5996 }, { "epoch": 2.9771636604335594, "grad_norm": 0.3288118243217468, "learning_rate": 1.6901359786641869e-09, "loss": 0.3303, "step": 5997 }, { "epoch": 2.9776601025980476, "grad_norm": 0.33447355031967163, "learning_rate": 1.615857460371406e-09, "loss": 0.3607, "step": 5998 }, { "epoch": 2.9781565447625353, "grad_norm": 0.2822491526603699, "learning_rate": 1.5432477666954548e-09, "loss": 0.2937, "step": 5999 }, { "epoch": 2.978652986927023, "grad_norm": 0.3269483149051666, "learning_rate": 1.4723069218780528e-09, "loss": 0.3462, "step": 6000 }, { "epoch": 2.979149429091511, "grad_norm": 0.2776494324207306, "learning_rate": 1.403034949605253e-09, "loss": 0.3081, "step": 6001 }, { "epoch": 2.9796458712559986, "grad_norm": 0.31470340490341187, "learning_rate": 1.3354318730052219e-09, "loss": 0.3213, "step": 6002 }, { "epoch": 2.9801423134204867, "grad_norm": 0.30720260739326477, "learning_rate": 1.2694977146476828e-09, "loss": 0.3064, "step": 6003 }, { "epoch": 2.9806387555849745, "grad_norm": 0.31976333260536194, "learning_rate": 1.2052324965466934e-09, "loss": 0.3141, "step": 6004 }, { "epoch": 2.9811351977494622, "grad_norm": 0.3040764331817627, "learning_rate": 1.1426362401595337e-09, "loss": 0.336, "step": 6005 }, { "epoch": 2.98163163991395, "grad_norm": 0.30121561884880066, "learning_rate": 1.0817089663844872e-09, "loss": 0.3337, "step": 6006 }, { "epoch": 2.9821280820784377, "grad_norm": 0.284947007894516, "learning_rate": 1.0224506955636148e-09, "loss": 0.3253, "step": 6007 }, { "epoch": 2.982624524242926, "grad_norm": 0.3047350347042084, "learning_rate": 9.648614474816465e-10, "loss": 0.3432, "step": 6008 }, { "epoch": 2.9831209664074136, "grad_norm": 0.31258782744407654, "learning_rate": 9.089412413665344e-10, "loss": 0.3122, "step": 6009 }, { "epoch": 2.9836174085719014, "grad_norm": 0.3250047266483307, "learning_rate": 8.54690095887789e-10, "loss": 0.3468, "step": 6010 }, { "epoch": 2.984113850736389, "grad_norm": 0.2981475293636322, "learning_rate": 8.021080291592542e-10, "loss": 0.2796, "step": 6011 }, { "epoch": 2.984610292900877, "grad_norm": 0.31087526679039, "learning_rate": 7.511950587357764e-10, "loss": 0.3202, "step": 6012 }, { "epoch": 2.985106735065365, "grad_norm": 0.3147108256816864, "learning_rate": 7.019512016165353e-10, "loss": 0.3479, "step": 6013 }, { "epoch": 2.985603177229853, "grad_norm": 0.30886873602867126, "learning_rate": 6.543764742422687e-10, "loss": 0.336, "step": 6014 }, { "epoch": 2.9860996193943405, "grad_norm": 0.3124079704284668, "learning_rate": 6.084708924969373e-10, "loss": 0.3013, "step": 6015 }, { "epoch": 2.9865960615588283, "grad_norm": 0.3288419544696808, "learning_rate": 5.642344717071702e-10, "loss": 0.3403, "step": 6016 }, { "epoch": 2.987092503723316, "grad_norm": 0.2761222720146179, "learning_rate": 5.21667226642264e-10, "loss": 0.3235, "step": 6017 }, { "epoch": 2.987588945887804, "grad_norm": 0.31467682123184204, "learning_rate": 4.807691715147389e-10, "loss": 0.3877, "step": 6018 }, { "epoch": 2.988085388052292, "grad_norm": 0.3071209490299225, "learning_rate": 4.4154031997867274e-10, "loss": 0.3271, "step": 6019 }, { "epoch": 2.9885818302167797, "grad_norm": 0.311810702085495, "learning_rate": 4.039806851324768e-10, "loss": 0.3085, "step": 6020 }, { "epoch": 2.9890782723812674, "grad_norm": 0.3087822198867798, "learning_rate": 3.6809027951500987e-10, "loss": 0.2983, "step": 6021 }, { "epoch": 2.989574714545755, "grad_norm": 0.34698545932769775, "learning_rate": 3.338691151100193e-10, "loss": 0.3528, "step": 6022 }, { "epoch": 2.9900711567102434, "grad_norm": 0.3150802254676819, "learning_rate": 3.013172033422551e-10, "loss": 0.3513, "step": 6023 }, { "epoch": 2.990567598874731, "grad_norm": 0.4040195643901825, "learning_rate": 2.7043455508080075e-10, "loss": 0.3244, "step": 6024 }, { "epoch": 2.991064041039219, "grad_norm": 0.30927813053131104, "learning_rate": 2.412211806362974e-10, "loss": 0.3293, "step": 6025 }, { "epoch": 2.991560483203707, "grad_norm": 0.30079522728919983, "learning_rate": 2.1367708976205436e-10, "loss": 0.3062, "step": 6026 }, { "epoch": 2.9920569253681943, "grad_norm": 0.3109484016895294, "learning_rate": 1.8780229165404894e-10, "loss": 0.3638, "step": 6027 }, { "epoch": 2.9925533675326825, "grad_norm": 0.3148918151855469, "learning_rate": 1.6359679495148162e-10, "loss": 0.3367, "step": 6028 }, { "epoch": 2.9930498096971703, "grad_norm": 0.3318570852279663, "learning_rate": 1.4106060773622088e-10, "loss": 0.3207, "step": 6029 }, { "epoch": 2.993546251861658, "grad_norm": 0.32658708095550537, "learning_rate": 1.2019373753224816e-10, "loss": 0.3117, "step": 6030 }, { "epoch": 2.994042694026146, "grad_norm": 0.3068765103816986, "learning_rate": 1.0099619130621296e-10, "loss": 0.2921, "step": 6031 }, { "epoch": 2.994539136190634, "grad_norm": 0.31089600920677185, "learning_rate": 8.346797546798791e-11, "loss": 0.3297, "step": 6032 }, { "epoch": 2.9950355783551217, "grad_norm": 0.30726292729377747, "learning_rate": 6.760909586900343e-11, "loss": 0.2869, "step": 6033 }, { "epoch": 2.9955320205196094, "grad_norm": 0.3059562146663666, "learning_rate": 5.3419557805578504e-11, "loss": 0.3307, "step": 6034 }, { "epoch": 2.996028462684097, "grad_norm": 0.32581809163093567, "learning_rate": 4.0899366013924524e-11, "loss": 0.3625, "step": 6035 }, { "epoch": 2.9965249048485854, "grad_norm": 0.31466490030288696, "learning_rate": 3.00485246745863e-11, "loss": 0.315, "step": 6036 }, { "epoch": 2.997021347013073, "grad_norm": 0.33737117052078247, "learning_rate": 2.086703741022156e-11, "loss": 0.291, "step": 6037 }, { "epoch": 2.997517789177561, "grad_norm": 0.30521276593208313, "learning_rate": 1.3354907286711184e-11, "loss": 0.3042, "step": 6038 }, { "epoch": 2.9980142313420486, "grad_norm": 0.31655365228652954, "learning_rate": 7.512136812048987e-12, "loss": 0.3364, "step": 6039 }, { "epoch": 2.9985106735065363, "grad_norm": 0.29727238416671753, "learning_rate": 3.3387279363417123e-12, "loss": 0.2819, "step": 6040 }, { "epoch": 2.9990071156710245, "grad_norm": 0.29813098907470703, "learning_rate": 8.346820540294787e-13, "loss": 0.327, "step": 6041 }, { "epoch": 2.9995035578355123, "grad_norm": 0.3040812611579895, "learning_rate": 0.0, "loss": 0.3297, "step": 6042 }, { "epoch": 2.9995035578355123, "step": 6042, "total_flos": 7155333578227712.0, "train_loss": 0.39068429800936105, "train_runtime": 117986.0124, "train_samples_per_second": 4.916, "train_steps_per_second": 0.051 } ], "logging_steps": 1.0, "max_steps": 6042, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7155333578227712.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }