{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021739130434782608, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6431, "num_tokens": 2090831.0, "step": 1 }, { "epoch": 0.043478260869565216, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6435, "num_tokens": 4183006.0, "step": 2 }, { "epoch": 0.06521739130434782, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6379, "num_tokens": 6272564.0, "step": 3 }, { "epoch": 0.08695652173913043, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6449, "num_tokens": 8366125.0, "step": 4 }, { "epoch": 0.10869565217391304, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6402, "num_tokens": 10461703.0, "step": 5 }, { "epoch": 0.13043478260869565, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6338, "num_tokens": 12555347.0, "step": 6 }, { "epoch": 0.15217391304347827, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6436, "num_tokens": 14648409.0, "step": 7 }, { "epoch": 0.17391304347826086, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6384, "num_tokens": 16741562.0, "step": 8 }, { "epoch": 0.1956521739130435, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6437, "num_tokens": 18831318.0, "step": 9 }, { "epoch": 0.21739130434782608, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.644, "num_tokens": 20922863.0, "step": 10 }, { "epoch": 0.2391304347826087, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6419, "num_tokens": 23014620.0, "step": 11 }, { "epoch": 0.2608695652173913, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6406, "num_tokens": 25105339.0, "step": 12 }, { "epoch": 0.2826086956521739, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6405, "num_tokens": 27197795.0, "step": 13 }, { "epoch": 0.30434782608695654, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6448, "num_tokens": 29291898.0, "step": 14 }, { "epoch": 0.32608695652173914, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6372, "num_tokens": 31383496.0, "step": 15 }, { "epoch": 0.34782608695652173, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.6397, "num_tokens": 33477448.0, "step": 16 }, { "epoch": 0.3695652173913043, "grad_norm": 8.101283473215778, "learning_rate": 0.0, "loss": 0.6359, "num_tokens": 35568911.0, "step": 17 }, { "epoch": 0.391304347826087, "grad_norm": 8.159300793582531, "learning_rate": 2.1739130434782606e-08, "loss": 0.6418, "num_tokens": 37660844.0, "step": 18 }, { "epoch": 0.41304347826086957, "grad_norm": 8.143428019277147, "learning_rate": 4.347826086956521e-08, "loss": 0.6391, "num_tokens": 39755083.0, "step": 19 }, { "epoch": 0.43478260869565216, "grad_norm": 8.104124464792218, "learning_rate": 6.521739130434782e-08, "loss": 0.6442, "num_tokens": 41847436.0, "step": 20 }, { "epoch": 0.45652173913043476, "grad_norm": 8.101986671909064, "learning_rate": 8.695652173913042e-08, "loss": 0.6452, "num_tokens": 43940263.0, "step": 21 }, { "epoch": 0.4782608695652174, "grad_norm": 8.13244304749822, "learning_rate": 1.0869565217391303e-07, "loss": 0.646, "num_tokens": 46033076.0, "step": 22 }, { "epoch": 0.5, "grad_norm": 8.147383007336266, "learning_rate": 1.3043478260869563e-07, "loss": 0.6435, "num_tokens": 48124508.0, "step": 23 }, { "epoch": 0.5217391304347826, "grad_norm": 8.144105003431784, "learning_rate": 1.5217391304347825e-07, "loss": 0.639, "num_tokens": 50217845.0, "step": 24 }, { "epoch": 0.5434782608695652, "grad_norm": 7.999617610004394, "learning_rate": 1.7391304347826085e-07, "loss": 0.6374, "num_tokens": 52311173.0, "step": 25 }, { "epoch": 0.5652173913043478, "grad_norm": 8.109185633091318, "learning_rate": 1.9565217391304347e-07, "loss": 0.6335, "num_tokens": 54401629.0, "step": 26 }, { "epoch": 0.5869565217391305, "grad_norm": 8.109185633091318, "learning_rate": 2.1739130434782607e-07, "loss": 0.6192, "num_tokens": 56492920.0, "step": 27 }, { "epoch": 0.6086956521739131, "grad_norm": 9.27996446234992, "learning_rate": 2.1739130434782607e-07, "loss": 0.6146, "num_tokens": 58586157.0, "step": 28 }, { "epoch": 0.6304347826086957, "grad_norm": 9.159324927898115, "learning_rate": 2.391304347826087e-07, "loss": 0.6175, "num_tokens": 60679798.0, "step": 29 }, { "epoch": 0.6521739130434783, "grad_norm": 9.763710040610643, "learning_rate": 2.6086956521739126e-07, "loss": 0.6155, "num_tokens": 62773329.0, "step": 30 }, { "epoch": 0.6739130434782609, "grad_norm": 9.779733585945591, "learning_rate": 2.8260869565217386e-07, "loss": 0.6135, "num_tokens": 64864578.0, "step": 31 }, { "epoch": 0.6956521739130435, "grad_norm": 9.779733585945591, "learning_rate": 3.043478260869565e-07, "loss": 0.5828, "num_tokens": 66956700.0, "step": 32 }, { "epoch": 0.717391304347826, "grad_norm": 13.430953070069496, "learning_rate": 3.043478260869565e-07, "loss": 0.5856, "num_tokens": 69046410.0, "step": 33 }, { "epoch": 0.7391304347826086, "grad_norm": 13.32547280061291, "learning_rate": 3.260869565217391e-07, "loss": 0.5778, "num_tokens": 71138403.0, "step": 34 }, { "epoch": 0.7608695652173914, "grad_norm": 13.037082670124171, "learning_rate": 3.478260869565217e-07, "loss": 0.5698, "num_tokens": 73231369.0, "step": 35 }, { "epoch": 0.782608695652174, "grad_norm": 11.047732146906828, "learning_rate": 3.695652173913043e-07, "loss": 0.5637, "num_tokens": 75324509.0, "step": 36 }, { "epoch": 0.8043478260869565, "grad_norm": 9.940315680480857, "learning_rate": 3.9130434782608694e-07, "loss": 0.5683, "num_tokens": 77414031.0, "step": 37 }, { "epoch": 0.8260869565217391, "grad_norm": 5.482237002272238, "learning_rate": 4.1304347826086954e-07, "loss": 0.559, "num_tokens": 79506910.0, "step": 38 }, { "epoch": 0.8478260869565217, "grad_norm": 5.078292155102419, "learning_rate": 4.3478260869565214e-07, "loss": 0.5504, "num_tokens": 81597326.0, "step": 39 }, { "epoch": 0.8695652173913043, "grad_norm": 4.758390837071518, "learning_rate": 4.5652173913043473e-07, "loss": 0.5467, "num_tokens": 83691142.0, "step": 40 }, { "epoch": 0.8913043478260869, "grad_norm": 4.533086950846491, "learning_rate": 4.782608695652174e-07, "loss": 0.5368, "num_tokens": 85781965.0, "step": 41 }, { "epoch": 0.9130434782608695, "grad_norm": 4.3523282131985175, "learning_rate": 5e-07, "loss": 0.5356, "num_tokens": 87872195.0, "step": 42 }, { "epoch": 0.9347826086956522, "grad_norm": 4.14903309276372, "learning_rate": 5.217391304347825e-07, "loss": 0.536, "num_tokens": 89964241.0, "step": 43 }, { "epoch": 0.9565217391304348, "grad_norm": 3.9293608373652873, "learning_rate": 5.434782608695652e-07, "loss": 0.537, "num_tokens": 92057253.0, "step": 44 }, { "epoch": 0.9782608695652174, "grad_norm": 3.597560116551251, "learning_rate": 5.652173913043477e-07, "loss": 0.5299, "num_tokens": 94149837.0, "step": 45 }, { "epoch": 1.0, "grad_norm": 3.0088158093378103, "learning_rate": 5.869565217391305e-07, "loss": 0.5214, "num_tokens": 96244823.0, "step": 46 }, { "epoch": 1.0217391304347827, "grad_norm": 2.7490399390700904, "learning_rate": 6.08695652173913e-07, "loss": 0.5175, "num_tokens": 98337212.0, "step": 47 }, { "epoch": 1.0434782608695652, "grad_norm": 2.5946058680255386, "learning_rate": 6.304347826086957e-07, "loss": 0.5159, "num_tokens": 100428766.0, "step": 48 }, { "epoch": 1.065217391304348, "grad_norm": 2.503905065889435, "learning_rate": 6.521739130434782e-07, "loss": 0.5072, "num_tokens": 102520698.0, "step": 49 }, { "epoch": 1.0869565217391304, "grad_norm": 2.456643430738729, "learning_rate": 6.739130434782609e-07, "loss": 0.5018, "num_tokens": 104612511.0, "step": 50 }, { "epoch": 1.108695652173913, "grad_norm": 2.3865628559826857, "learning_rate": 6.956521739130434e-07, "loss": 0.4942, "num_tokens": 106707204.0, "step": 51 }, { "epoch": 1.1304347826086956, "grad_norm": 2.3356298780592293, "learning_rate": 7.17391304347826e-07, "loss": 0.4918, "num_tokens": 108800670.0, "step": 52 }, { "epoch": 1.1521739130434783, "grad_norm": 2.304735371322351, "learning_rate": 7.391304347826086e-07, "loss": 0.4914, "num_tokens": 110894728.0, "step": 53 }, { "epoch": 1.1739130434782608, "grad_norm": 2.2915944727664055, "learning_rate": 7.608695652173913e-07, "loss": 0.4919, "num_tokens": 112989220.0, "step": 54 }, { "epoch": 1.1956521739130435, "grad_norm": 2.2452202598548534, "learning_rate": 7.826086956521739e-07, "loss": 0.4794, "num_tokens": 115080217.0, "step": 55 }, { "epoch": 1.2173913043478262, "grad_norm": 2.179597494769898, "learning_rate": 8.043478260869565e-07, "loss": 0.4857, "num_tokens": 117172369.0, "step": 56 }, { "epoch": 1.2391304347826086, "grad_norm": 2.074697161067729, "learning_rate": 8.260869565217391e-07, "loss": 0.4789, "num_tokens": 119266181.0, "step": 57 }, { "epoch": 1.2608695652173914, "grad_norm": 1.8408360821578929, "learning_rate": 8.478260869565217e-07, "loss": 0.4746, "num_tokens": 121360235.0, "step": 58 }, { "epoch": 1.2826086956521738, "grad_norm": 1.409431046959032, "learning_rate": 8.695652173913043e-07, "loss": 0.4767, "num_tokens": 123452301.0, "step": 59 }, { "epoch": 1.3043478260869565, "grad_norm": 0.8344487221470542, "learning_rate": 8.913043478260869e-07, "loss": 0.4775, "num_tokens": 125544278.0, "step": 60 }, { "epoch": 1.3260869565217392, "grad_norm": 0.4342290656199116, "learning_rate": 9.130434782608695e-07, "loss": 0.4744, "num_tokens": 127634616.0, "step": 61 }, { "epoch": 1.3478260869565217, "grad_norm": 0.27358056826447424, "learning_rate": 9.347826086956522e-07, "loss": 0.4754, "num_tokens": 129729453.0, "step": 62 }, { "epoch": 1.3695652173913042, "grad_norm": 0.23082541661596323, "learning_rate": 9.565217391304349e-07, "loss": 0.4743, "num_tokens": 131820225.0, "step": 63 }, { "epoch": 1.391304347826087, "grad_norm": 0.22354521543660916, "learning_rate": 9.782608695652173e-07, "loss": 0.472, "num_tokens": 133913678.0, "step": 64 }, { "epoch": 1.4130434782608696, "grad_norm": 0.20118073906094502, "learning_rate": 1e-06, "loss": 0.4697, "num_tokens": 136007430.0, "step": 65 }, { "epoch": 1.434782608695652, "grad_norm": 0.20692402483858816, "learning_rate": 9.999870437446958e-07, "loss": 0.4631, "num_tokens": 138101493.0, "step": 66 }, { "epoch": 1.4565217391304348, "grad_norm": 0.1901715221113706, "learning_rate": 9.999481757248477e-07, "loss": 0.4666, "num_tokens": 140193704.0, "step": 67 }, { "epoch": 1.4782608695652173, "grad_norm": 0.19179490787267098, "learning_rate": 9.998833981786071e-07, "loss": 0.464, "num_tokens": 142285250.0, "step": 68 }, { "epoch": 1.5, "grad_norm": 0.18845464387262947, "learning_rate": 9.997927148360823e-07, "loss": 0.4692, "num_tokens": 144378781.0, "step": 69 }, { "epoch": 1.5217391304347827, "grad_norm": 0.1903737563315874, "learning_rate": 9.996761309191247e-07, "loss": 0.4596, "num_tokens": 146471011.0, "step": 70 }, { "epoch": 1.5434782608695652, "grad_norm": 0.1736225981517059, "learning_rate": 9.995336531410273e-07, "loss": 0.4631, "num_tokens": 148562357.0, "step": 71 }, { "epoch": 1.5652173913043477, "grad_norm": 0.17508414588076462, "learning_rate": 9.993652897061393e-07, "loss": 0.4649, "num_tokens": 150656061.0, "step": 72 }, { "epoch": 1.5869565217391304, "grad_norm": 0.17741120682256034, "learning_rate": 9.991710503093922e-07, "loss": 0.4674, "num_tokens": 152748871.0, "step": 73 }, { "epoch": 1.608695652173913, "grad_norm": 0.16540006434719232, "learning_rate": 9.989509461357426e-07, "loss": 0.4642, "num_tokens": 154839927.0, "step": 74 }, { "epoch": 1.6304347826086958, "grad_norm": 0.17667838370848385, "learning_rate": 9.987049898595276e-07, "loss": 0.4651, "num_tokens": 156932138.0, "step": 75 }, { "epoch": 1.6521739130434783, "grad_norm": 0.15619953156457722, "learning_rate": 9.984331956437354e-07, "loss": 0.46, "num_tokens": 159020025.0, "step": 76 }, { "epoch": 1.6739130434782608, "grad_norm": 0.158692399179484, "learning_rate": 9.98135579139189e-07, "loss": 0.4642, "num_tokens": 161110709.0, "step": 77 }, { "epoch": 1.6956521739130435, "grad_norm": 0.16387685015731232, "learning_rate": 9.97812157483646e-07, "loss": 0.4598, "num_tokens": 163201280.0, "step": 78 }, { "epoch": 1.7173913043478262, "grad_norm": 0.16147264627938895, "learning_rate": 9.974629493008114e-07, "loss": 0.4593, "num_tokens": 165293375.0, "step": 79 }, { "epoch": 1.7391304347826086, "grad_norm": 0.14847792515904235, "learning_rate": 9.97087974699264e-07, "loss": 0.4539, "num_tokens": 167386166.0, "step": 80 }, { "epoch": 1.7608695652173914, "grad_norm": 0.14368418963515403, "learning_rate": 9.966872552713004e-07, "loss": 0.4596, "num_tokens": 169477777.0, "step": 81 }, { "epoch": 1.7826086956521738, "grad_norm": 0.14685857026484775, "learning_rate": 9.962608140916905e-07, "loss": 0.4582, "num_tokens": 171567446.0, "step": 82 }, { "epoch": 1.8043478260869565, "grad_norm": 0.15282259101923246, "learning_rate": 9.958086757163488e-07, "loss": 0.4616, "num_tokens": 173655862.0, "step": 83 }, { "epoch": 1.8260869565217392, "grad_norm": 0.15514478856193514, "learning_rate": 9.953308661809207e-07, "loss": 0.457, "num_tokens": 175749514.0, "step": 84 }, { "epoch": 1.8478260869565217, "grad_norm": 0.14228669183096485, "learning_rate": 9.948274129992836e-07, "loss": 0.46, "num_tokens": 177837992.0, "step": 85 }, { "epoch": 1.8695652173913042, "grad_norm": 0.13988219902396246, "learning_rate": 9.942983451619614e-07, "loss": 0.4603, "num_tokens": 179931723.0, "step": 86 }, { "epoch": 1.891304347826087, "grad_norm": 0.1391966834345067, "learning_rate": 9.93743693134456e-07, "loss": 0.4617, "num_tokens": 182023329.0, "step": 87 }, { "epoch": 1.9130434782608696, "grad_norm": 0.14697062115129336, "learning_rate": 9.931634888554935e-07, "loss": 0.4623, "num_tokens": 184117906.0, "step": 88 }, { "epoch": 1.9347826086956523, "grad_norm": 0.13927296968611877, "learning_rate": 9.92557765735184e-07, "loss": 0.4563, "num_tokens": 186211056.0, "step": 89 }, { "epoch": 1.9565217391304348, "grad_norm": 0.14637757594641007, "learning_rate": 9.919265586530975e-07, "loss": 0.4627, "num_tokens": 188304474.0, "step": 90 }, { "epoch": 1.9782608695652173, "grad_norm": 0.142956055977351, "learning_rate": 9.912699039562576e-07, "loss": 0.4579, "num_tokens": 190397770.0, "step": 91 }, { "epoch": 2.0, "grad_norm": 0.14123350435136145, "learning_rate": 9.905878394570453e-07, "loss": 0.4602, "num_tokens": 192489635.0, "step": 92 }, { "epoch": 2.0217391304347827, "grad_norm": 0.1436834934317212, "learning_rate": 9.898804044310245e-07, "loss": 0.4558, "num_tokens": 194583301.0, "step": 93 }, { "epoch": 2.0434782608695654, "grad_norm": 0.1374483001051701, "learning_rate": 9.891476396146784e-07, "loss": 0.452, "num_tokens": 196677306.0, "step": 94 }, { "epoch": 2.0652173913043477, "grad_norm": 0.13722577602290897, "learning_rate": 9.883895872030657e-07, "loss": 0.4509, "num_tokens": 198768835.0, "step": 95 }, { "epoch": 2.0869565217391304, "grad_norm": 0.13783100429148887, "learning_rate": 9.87606290847388e-07, "loss": 0.4572, "num_tokens": 200859264.0, "step": 96 }, { "epoch": 2.108695652173913, "grad_norm": 0.1367807019239086, "learning_rate": 9.867977956524796e-07, "loss": 0.4582, "num_tokens": 202952584.0, "step": 97 }, { "epoch": 2.130434782608696, "grad_norm": 0.14120080132901613, "learning_rate": 9.859641481742077e-07, "loss": 0.4542, "num_tokens": 205045012.0, "step": 98 }, { "epoch": 2.1521739130434785, "grad_norm": 0.1347346802712429, "learning_rate": 9.851053964167927e-07, "loss": 0.4506, "num_tokens": 207137257.0, "step": 99 }, { "epoch": 2.1739130434782608, "grad_norm": 0.13826725695583056, "learning_rate": 9.842215898300433e-07, "loss": 0.4553, "num_tokens": 209229044.0, "step": 100 }, { "epoch": 2.1956521739130435, "grad_norm": 0.14306905673630102, "learning_rate": 9.833127793065097e-07, "loss": 0.4533, "num_tokens": 211323327.0, "step": 101 }, { "epoch": 2.217391304347826, "grad_norm": 0.13752558845525534, "learning_rate": 9.823790171785526e-07, "loss": 0.4556, "num_tokens": 213417950.0, "step": 102 }, { "epoch": 2.239130434782609, "grad_norm": 0.1451178643142076, "learning_rate": 9.814203572153298e-07, "loss": 0.4542, "num_tokens": 215508377.0, "step": 103 }, { "epoch": 2.260869565217391, "grad_norm": 0.1341943658847912, "learning_rate": 9.804368546197006e-07, "loss": 0.4548, "num_tokens": 217599824.0, "step": 104 }, { "epoch": 2.282608695652174, "grad_norm": 0.13948206525149412, "learning_rate": 9.794285660250455e-07, "loss": 0.4549, "num_tokens": 219693642.0, "step": 105 }, { "epoch": 2.3043478260869565, "grad_norm": 0.13795263816003625, "learning_rate": 9.783955494920066e-07, "loss": 0.4548, "num_tokens": 221789678.0, "step": 106 }, { "epoch": 2.3260869565217392, "grad_norm": 0.1409796803835522, "learning_rate": 9.773378645051436e-07, "loss": 0.4562, "num_tokens": 223881586.0, "step": 107 }, { "epoch": 2.3478260869565215, "grad_norm": 0.13044382265443313, "learning_rate": 9.762555719695088e-07, "loss": 0.4461, "num_tokens": 225970167.0, "step": 108 }, { "epoch": 2.369565217391304, "grad_norm": 0.1502600460482269, "learning_rate": 9.751487342071393e-07, "loss": 0.4498, "num_tokens": 228061661.0, "step": 109 }, { "epoch": 2.391304347826087, "grad_norm": 0.1338230534634335, "learning_rate": 9.740174149534692e-07, "loss": 0.4485, "num_tokens": 230153932.0, "step": 110 }, { "epoch": 2.4130434782608696, "grad_norm": 0.1440831587199848, "learning_rate": 9.728616793536587e-07, "loss": 0.4552, "num_tokens": 232244695.0, "step": 111 }, { "epoch": 2.4347826086956523, "grad_norm": 0.13658285859261474, "learning_rate": 9.716815939588436e-07, "loss": 0.4557, "num_tokens": 234336105.0, "step": 112 }, { "epoch": 2.4565217391304346, "grad_norm": 0.14313487538739023, "learning_rate": 9.704772267223019e-07, "loss": 0.4527, "num_tokens": 236428770.0, "step": 113 }, { "epoch": 2.4782608695652173, "grad_norm": 0.1400873818490847, "learning_rate": 9.692486469955424e-07, "loss": 0.4494, "num_tokens": 238521551.0, "step": 114 }, { "epoch": 2.5, "grad_norm": 0.13049717910266437, "learning_rate": 9.6799592552431e-07, "loss": 0.4458, "num_tokens": 240614588.0, "step": 115 }, { "epoch": 2.5217391304347827, "grad_norm": 0.13318259742833988, "learning_rate": 9.667191344445122e-07, "loss": 0.4562, "num_tokens": 242706024.0, "step": 116 }, { "epoch": 2.5434782608695654, "grad_norm": 0.1342513590455262, "learning_rate": 9.654183472780655e-07, "loss": 0.4573, "num_tokens": 244798483.0, "step": 117 }, { "epoch": 2.5652173913043477, "grad_norm": 0.136855839967963, "learning_rate": 9.640936389286615e-07, "loss": 0.4472, "num_tokens": 246891342.0, "step": 118 }, { "epoch": 2.5869565217391304, "grad_norm": 0.137209398518282, "learning_rate": 9.627450856774539e-07, "loss": 0.4468, "num_tokens": 248983299.0, "step": 119 }, { "epoch": 2.608695652173913, "grad_norm": 0.1347427629250892, "learning_rate": 9.613727651786657e-07, "loss": 0.4492, "num_tokens": 251073761.0, "step": 120 }, { "epoch": 2.630434782608696, "grad_norm": 0.14870638200971545, "learning_rate": 9.599767564551183e-07, "loss": 0.4502, "num_tokens": 253168287.0, "step": 121 }, { "epoch": 2.6521739130434785, "grad_norm": 0.13859516453775694, "learning_rate": 9.5855713989368e-07, "loss": 0.4523, "num_tokens": 255260895.0, "step": 122 }, { "epoch": 2.6739130434782608, "grad_norm": 0.15720652496010762, "learning_rate": 9.57113997240638e-07, "loss": 0.4463, "num_tokens": 257351654.0, "step": 123 }, { "epoch": 2.6956521739130435, "grad_norm": 0.1401244780972256, "learning_rate": 9.55647411596991e-07, "loss": 0.4518, "num_tokens": 259440240.0, "step": 124 }, { "epoch": 2.717391304347826, "grad_norm": 0.15018164735793, "learning_rate": 9.541574674136632e-07, "loss": 0.4498, "num_tokens": 261532114.0, "step": 125 }, { "epoch": 2.7391304347826084, "grad_norm": 0.13823614764022513, "learning_rate": 9.526442504866426e-07, "loss": 0.4558, "num_tokens": 263623046.0, "step": 126 }, { "epoch": 2.7608695652173916, "grad_norm": 0.14842418144378133, "learning_rate": 9.511078479520392e-07, "loss": 0.4523, "num_tokens": 265714481.0, "step": 127 }, { "epoch": 2.782608695652174, "grad_norm": 0.13803890404964064, "learning_rate": 9.495483482810687e-07, "loss": 0.451, "num_tokens": 267809891.0, "step": 128 }, { "epoch": 2.8043478260869565, "grad_norm": 0.15595909439991468, "learning_rate": 9.479658412749575e-07, "loss": 0.4535, "num_tokens": 269901433.0, "step": 129 }, { "epoch": 2.8260869565217392, "grad_norm": 0.1368971901595536, "learning_rate": 9.46360418059771e-07, "loss": 0.451, "num_tokens": 271991817.0, "step": 130 }, { "epoch": 2.8478260869565215, "grad_norm": 0.15455191550217653, "learning_rate": 9.447321710811674e-07, "loss": 0.4519, "num_tokens": 274081721.0, "step": 131 }, { "epoch": 2.869565217391304, "grad_norm": 0.13697363499414442, "learning_rate": 9.430811940990734e-07, "loss": 0.4509, "num_tokens": 276177796.0, "step": 132 }, { "epoch": 2.891304347826087, "grad_norm": 0.14942955290721893, "learning_rate": 9.41407582182286e-07, "loss": 0.4472, "num_tokens": 278269125.0, "step": 133 }, { "epoch": 2.9130434782608696, "grad_norm": 0.13839892788575286, "learning_rate": 9.397114317029974e-07, "loss": 0.4559, "num_tokens": 280360480.0, "step": 134 }, { "epoch": 2.9347826086956523, "grad_norm": 0.139804429331633, "learning_rate": 9.37992840331246e-07, "loss": 0.4512, "num_tokens": 282455179.0, "step": 135 }, { "epoch": 2.9565217391304346, "grad_norm": 0.1522025576863328, "learning_rate": 9.362519070292923e-07, "loss": 0.4457, "num_tokens": 284547258.0, "step": 136 }, { "epoch": 2.9782608695652173, "grad_norm": 0.13715855940260502, "learning_rate": 9.344887320459198e-07, "loss": 0.4472, "num_tokens": 286640938.0, "step": 137 }, { "epoch": 3.0, "grad_norm": 0.1457504699757938, "learning_rate": 9.327034169106629e-07, "loss": 0.4465, "num_tokens": 288733869.0, "step": 138 }, { "epoch": 3.0217391304347827, "grad_norm": 0.13583971625499844, "learning_rate": 9.308960644279604e-07, "loss": 0.4507, "num_tokens": 290826317.0, "step": 139 }, { "epoch": 3.0434782608695654, "grad_norm": 0.13589135954621737, "learning_rate": 9.290667786712352e-07, "loss": 0.4435, "num_tokens": 292916931.0, "step": 140 }, { "epoch": 3.0652173913043477, "grad_norm": 0.13858649102418938, "learning_rate": 9.272156649769018e-07, "loss": 0.454, "num_tokens": 295007551.0, "step": 141 }, { "epoch": 3.0869565217391304, "grad_norm": 0.13151545079660656, "learning_rate": 9.253428299383012e-07, "loss": 0.4443, "num_tokens": 297101123.0, "step": 142 }, { "epoch": 3.108695652173913, "grad_norm": 0.1372256717064563, "learning_rate": 9.234483813995613e-07, "loss": 0.4525, "num_tokens": 299194691.0, "step": 143 }, { "epoch": 3.130434782608696, "grad_norm": 0.1362038166818457, "learning_rate": 9.215324284493888e-07, "loss": 0.4441, "num_tokens": 301288963.0, "step": 144 }, { "epoch": 3.1521739130434785, "grad_norm": 0.13366059430402608, "learning_rate": 9.19595081414786e-07, "loss": 0.4499, "num_tokens": 303381802.0, "step": 145 }, { "epoch": 3.1739130434782608, "grad_norm": 0.13964481270629708, "learning_rate": 9.176364518546988e-07, "loss": 0.4484, "num_tokens": 305474609.0, "step": 146 }, { "epoch": 3.1956521739130435, "grad_norm": 0.13321001510853767, "learning_rate": 9.156566525535923e-07, "loss": 0.4452, "num_tokens": 307567147.0, "step": 147 }, { "epoch": 3.217391304347826, "grad_norm": 0.14086672975049896, "learning_rate": 9.136557975149561e-07, "loss": 0.4446, "num_tokens": 309657954.0, "step": 148 }, { "epoch": 3.239130434782609, "grad_norm": 0.1334280312981284, "learning_rate": 9.116340019547401e-07, "loss": 0.4468, "num_tokens": 311749336.0, "step": 149 }, { "epoch": 3.260869565217391, "grad_norm": 0.13486315217872386, "learning_rate": 9.095913822947196e-07, "loss": 0.4495, "num_tokens": 313844697.0, "step": 150 }, { "epoch": 3.282608695652174, "grad_norm": 0.13793460372824046, "learning_rate": 9.075280561557915e-07, "loss": 0.4452, "num_tokens": 315936487.0, "step": 151 }, { "epoch": 3.3043478260869565, "grad_norm": 0.1379832600278911, "learning_rate": 9.054441423512013e-07, "loss": 0.4485, "num_tokens": 318028220.0, "step": 152 }, { "epoch": 3.3260869565217392, "grad_norm": 0.1349244615099038, "learning_rate": 9.033397608797014e-07, "loss": 0.4463, "num_tokens": 320120976.0, "step": 153 }, { "epoch": 3.3478260869565215, "grad_norm": 0.13302146086489272, "learning_rate": 9.012150329186411e-07, "loss": 0.4446, "num_tokens": 322213515.0, "step": 154 }, { "epoch": 3.369565217391304, "grad_norm": 0.13213199888987598, "learning_rate": 8.990700808169889e-07, "loss": 0.4457, "num_tokens": 324306085.0, "step": 155 }, { "epoch": 3.391304347826087, "grad_norm": 0.13576476545257601, "learning_rate": 8.969050280882872e-07, "loss": 0.4435, "num_tokens": 326393943.0, "step": 156 }, { "epoch": 3.4130434782608696, "grad_norm": 0.13269213517048742, "learning_rate": 8.9471999940354e-07, "loss": 0.4421, "num_tokens": 328487650.0, "step": 157 }, { "epoch": 3.4347826086956523, "grad_norm": 0.1397362160827111, "learning_rate": 8.925151205840341e-07, "loss": 0.4434, "num_tokens": 330581600.0, "step": 158 }, { "epoch": 3.4565217391304346, "grad_norm": 0.13448412881240226, "learning_rate": 8.902905185940933e-07, "loss": 0.4392, "num_tokens": 332675243.0, "step": 159 }, { "epoch": 3.4782608695652173, "grad_norm": 0.13581732207346198, "learning_rate": 8.880463215337679e-07, "loss": 0.4412, "num_tokens": 334764586.0, "step": 160 }, { "epoch": 3.5, "grad_norm": 0.1390811493560612, "learning_rate": 8.857826586314586e-07, "loss": 0.4402, "num_tokens": 336856307.0, "step": 161 }, { "epoch": 3.5217391304347827, "grad_norm": 0.13460989761218342, "learning_rate": 8.834996602364736e-07, "loss": 0.4456, "num_tokens": 338950008.0, "step": 162 }, { "epoch": 3.5434782608695654, "grad_norm": 0.13236047844850818, "learning_rate": 8.811974578115248e-07, "loss": 0.4448, "num_tokens": 341042954.0, "step": 163 }, { "epoch": 3.5652173913043477, "grad_norm": 0.13163982832755347, "learning_rate": 8.788761839251558e-07, "loss": 0.4415, "num_tokens": 343134592.0, "step": 164 }, { "epoch": 3.5869565217391304, "grad_norm": 0.14154355839698224, "learning_rate": 8.765359722441094e-07, "loss": 0.4456, "num_tokens": 345226933.0, "step": 165 }, { "epoch": 3.608695652173913, "grad_norm": 0.1355716379437535, "learning_rate": 8.741769575256304e-07, "loss": 0.4539, "num_tokens": 347318986.0, "step": 166 }, { "epoch": 3.630434782608696, "grad_norm": 0.13083494960528463, "learning_rate": 8.717992756097047e-07, "loss": 0.4448, "num_tokens": 349411839.0, "step": 167 }, { "epoch": 3.6521739130434785, "grad_norm": 0.13406159250224642, "learning_rate": 8.694030634112389e-07, "loss": 0.4421, "num_tokens": 351504447.0, "step": 168 }, { "epoch": 3.6739130434782608, "grad_norm": 0.13054451577457893, "learning_rate": 8.669884589121756e-07, "loss": 0.4426, "num_tokens": 353596301.0, "step": 169 }, { "epoch": 3.6956521739130435, "grad_norm": 0.13149102842185534, "learning_rate": 8.645556011535469e-07, "loss": 0.4432, "num_tokens": 355690374.0, "step": 170 }, { "epoch": 3.717391304347826, "grad_norm": 0.139504340331513, "learning_rate": 8.621046302274697e-07, "loss": 0.4464, "num_tokens": 357782173.0, "step": 171 }, { "epoch": 3.7391304347826084, "grad_norm": 0.13974946687867332, "learning_rate": 8.596356872690778e-07, "loss": 0.4441, "num_tokens": 359875585.0, "step": 172 }, { "epoch": 3.7608695652173916, "grad_norm": 0.1334362497712104, "learning_rate": 8.571489144483944e-07, "loss": 0.446, "num_tokens": 361966708.0, "step": 173 }, { "epoch": 3.782608695652174, "grad_norm": 0.1338803708751177, "learning_rate": 8.546444549621466e-07, "loss": 0.4449, "num_tokens": 364059418.0, "step": 174 }, { "epoch": 3.8043478260869565, "grad_norm": 0.13983854438472215, "learning_rate": 8.521224530255185e-07, "loss": 0.4448, "num_tokens": 366152621.0, "step": 175 }, { "epoch": 3.8260869565217392, "grad_norm": 0.13196553812425743, "learning_rate": 8.495830538638481e-07, "loss": 0.4418, "num_tokens": 368245662.0, "step": 176 }, { "epoch": 3.8478260869565215, "grad_norm": 0.13578885820125056, "learning_rate": 8.470264037042638e-07, "loss": 0.441, "num_tokens": 370334430.0, "step": 177 }, { "epoch": 3.869565217391304, "grad_norm": 0.13655268949641716, "learning_rate": 8.44452649767264e-07, "loss": 0.4439, "num_tokens": 372426004.0, "step": 178 }, { "epoch": 3.891304347826087, "grad_norm": 0.13786817781349708, "learning_rate": 8.418619402582402e-07, "loss": 0.4449, "num_tokens": 374517459.0, "step": 179 }, { "epoch": 3.9130434782608696, "grad_norm": 0.13918471016485154, "learning_rate": 8.392544243589427e-07, "loss": 0.4443, "num_tokens": 376607611.0, "step": 180 }, { "epoch": 3.9347826086956523, "grad_norm": 0.1403794476474249, "learning_rate": 8.366302522188902e-07, "loss": 0.4458, "num_tokens": 378702030.0, "step": 181 }, { "epoch": 3.9565217391304346, "grad_norm": 0.13426097626521305, "learning_rate": 8.339895749467237e-07, "loss": 0.4454, "num_tokens": 380793542.0, "step": 182 }, { "epoch": 3.9782608695652173, "grad_norm": 0.13694462770495058, "learning_rate": 8.313325446015051e-07, "loss": 0.4402, "num_tokens": 382884886.0, "step": 183 }, { "epoch": 4.0, "grad_norm": 0.14272951717733043, "learning_rate": 8.286593141839608e-07, "loss": 0.4423, "num_tokens": 384977846.0, "step": 184 }, { "epoch": 4.021739130434782, "grad_norm": 0.1330582820057405, "learning_rate": 8.259700376276723e-07, "loss": 0.444, "num_tokens": 387071368.0, "step": 185 }, { "epoch": 4.043478260869565, "grad_norm": 0.14269491995175726, "learning_rate": 8.232648697902113e-07, "loss": 0.4419, "num_tokens": 389164937.0, "step": 186 }, { "epoch": 4.065217391304348, "grad_norm": 0.14167939313294817, "learning_rate": 8.205439664442229e-07, "loss": 0.4374, "num_tokens": 391258317.0, "step": 187 }, { "epoch": 4.086956521739131, "grad_norm": 0.13301946278649346, "learning_rate": 8.178074842684554e-07, "loss": 0.4369, "num_tokens": 393349237.0, "step": 188 }, { "epoch": 4.108695652173913, "grad_norm": 0.14450522832231033, "learning_rate": 8.150555808387387e-07, "loss": 0.441, "num_tokens": 395440009.0, "step": 189 }, { "epoch": 4.130434782608695, "grad_norm": 0.13877319368593247, "learning_rate": 8.122884146189103e-07, "loss": 0.4374, "num_tokens": 397533811.0, "step": 190 }, { "epoch": 4.1521739130434785, "grad_norm": 0.14318441928820957, "learning_rate": 8.095061449516902e-07, "loss": 0.4427, "num_tokens": 399626066.0, "step": 191 }, { "epoch": 4.173913043478261, "grad_norm": 0.14579072710487548, "learning_rate": 8.067089320495056e-07, "loss": 0.4424, "num_tokens": 401718435.0, "step": 192 }, { "epoch": 4.195652173913044, "grad_norm": 0.1374419102226233, "learning_rate": 8.038969369852654e-07, "loss": 0.443, "num_tokens": 403809789.0, "step": 193 }, { "epoch": 4.217391304347826, "grad_norm": 0.14176995288895658, "learning_rate": 8.010703216830851e-07, "loss": 0.4413, "num_tokens": 405899766.0, "step": 194 }, { "epoch": 4.239130434782608, "grad_norm": 0.13176722868730403, "learning_rate": 7.982292489089621e-07, "loss": 0.4416, "num_tokens": 407993759.0, "step": 195 }, { "epoch": 4.260869565217392, "grad_norm": 0.13658573742727376, "learning_rate": 7.953738822614047e-07, "loss": 0.4391, "num_tokens": 410085571.0, "step": 196 }, { "epoch": 4.282608695652174, "grad_norm": 0.13952183028099105, "learning_rate": 7.92504386162009e-07, "loss": 0.4349, "num_tokens": 412179524.0, "step": 197 }, { "epoch": 4.304347826086957, "grad_norm": 0.13564370570138545, "learning_rate": 7.896209258459932e-07, "loss": 0.4444, "num_tokens": 414268672.0, "step": 198 }, { "epoch": 4.326086956521739, "grad_norm": 0.14161938766244753, "learning_rate": 7.867236673526819e-07, "loss": 0.4437, "num_tokens": 416362080.0, "step": 199 }, { "epoch": 4.3478260869565215, "grad_norm": 0.13684396912767316, "learning_rate": 7.838127775159451e-07, "loss": 0.4436, "num_tokens": 418453097.0, "step": 200 }, { "epoch": 4.369565217391305, "grad_norm": 0.14184406812492364, "learning_rate": 7.808884239545909e-07, "loss": 0.4415, "num_tokens": 420545560.0, "step": 201 }, { "epoch": 4.391304347826087, "grad_norm": 0.1366458217786427, "learning_rate": 7.779507750627144e-07, "loss": 0.4402, "num_tokens": 422636960.0, "step": 202 }, { "epoch": 4.413043478260869, "grad_norm": 0.13773619821700897, "learning_rate": 7.75e-07, "loss": 0.4437, "num_tokens": 424730959.0, "step": 203 }, { "epoch": 4.434782608695652, "grad_norm": 0.13758121811172125, "learning_rate": 7.720362686819813e-07, "loss": 0.44, "num_tokens": 426822955.0, "step": 204 }, { "epoch": 4.456521739130435, "grad_norm": 0.13761175022454988, "learning_rate": 7.690597517702567e-07, "loss": 0.4423, "num_tokens": 428914580.0, "step": 205 }, { "epoch": 4.478260869565218, "grad_norm": 0.13045102347864693, "learning_rate": 7.660706206626619e-07, "loss": 0.4364, "num_tokens": 431007431.0, "step": 206 }, { "epoch": 4.5, "grad_norm": 0.13320915243246825, "learning_rate": 7.630690474834003e-07, "loss": 0.4376, "num_tokens": 433101278.0, "step": 207 }, { "epoch": 4.521739130434782, "grad_norm": 0.135879604945715, "learning_rate": 7.600552050731314e-07, "loss": 0.4392, "num_tokens": 435193373.0, "step": 208 }, { "epoch": 4.543478260869565, "grad_norm": 0.13761527466802642, "learning_rate": 7.570292669790184e-07, "loss": 0.4383, "num_tokens": 437286568.0, "step": 209 }, { "epoch": 4.565217391304348, "grad_norm": 0.1403373341420107, "learning_rate": 7.539914074447348e-07, "loss": 0.4419, "num_tokens": 439379915.0, "step": 210 }, { "epoch": 4.586956521739131, "grad_norm": 0.13604302231295956, "learning_rate": 7.5094180140043e-07, "loss": 0.44, "num_tokens": 441473072.0, "step": 211 }, { "epoch": 4.608695652173913, "grad_norm": 0.1322101614674662, "learning_rate": 7.478806244526576e-07, "loss": 0.4436, "num_tokens": 443567056.0, "step": 212 }, { "epoch": 4.630434782608695, "grad_norm": 0.13669477430314778, "learning_rate": 7.448080528742623e-07, "loss": 0.4398, "num_tokens": 445657660.0, "step": 213 }, { "epoch": 4.6521739130434785, "grad_norm": 0.1415477810417469, "learning_rate": 7.417242635942297e-07, "loss": 0.4394, "num_tokens": 447751320.0, "step": 214 }, { "epoch": 4.673913043478261, "grad_norm": 0.135566768628023, "learning_rate": 7.38629434187499e-07, "loss": 0.4386, "num_tokens": 449843005.0, "step": 215 }, { "epoch": 4.695652173913043, "grad_norm": 0.14123634359786125, "learning_rate": 7.355237428647359e-07, "loss": 0.4415, "num_tokens": 451936377.0, "step": 216 }, { "epoch": 4.717391304347826, "grad_norm": 0.1357872359821091, "learning_rate": 7.324073684620725e-07, "loss": 0.4389, "num_tokens": 454027246.0, "step": 217 }, { "epoch": 4.739130434782608, "grad_norm": 0.13106073468244842, "learning_rate": 7.292804904308086e-07, "loss": 0.4353, "num_tokens": 456118801.0, "step": 218 }, { "epoch": 4.760869565217392, "grad_norm": 0.13664278397915866, "learning_rate": 7.261432888270776e-07, "loss": 0.4436, "num_tokens": 458211696.0, "step": 219 }, { "epoch": 4.782608695652174, "grad_norm": 0.1359534311622986, "learning_rate": 7.229959443014793e-07, "loss": 0.4427, "num_tokens": 460302365.0, "step": 220 }, { "epoch": 4.804347826086957, "grad_norm": 0.1391728706721248, "learning_rate": 7.198386380886764e-07, "loss": 0.4378, "num_tokens": 462395009.0, "step": 221 }, { "epoch": 4.826086956521739, "grad_norm": 0.14471303133933838, "learning_rate": 7.1667155199696e-07, "loss": 0.4393, "num_tokens": 464488113.0, "step": 222 }, { "epoch": 4.8478260869565215, "grad_norm": 0.13340639535871296, "learning_rate": 7.134948683977786e-07, "loss": 0.4403, "num_tokens": 466576826.0, "step": 223 }, { "epoch": 4.869565217391305, "grad_norm": 0.13672161474434347, "learning_rate": 7.103087702152376e-07, "loss": 0.4377, "num_tokens": 468668935.0, "step": 224 }, { "epoch": 4.891304347826087, "grad_norm": 0.1344120648043691, "learning_rate": 7.071134409155658e-07, "loss": 0.4399, "num_tokens": 470761454.0, "step": 225 }, { "epoch": 4.913043478260869, "grad_norm": 0.13711458441843338, "learning_rate": 7.039090644965509e-07, "loss": 0.4432, "num_tokens": 472854948.0, "step": 226 }, { "epoch": 4.934782608695652, "grad_norm": 0.1355562476824205, "learning_rate": 7.006958254769437e-07, "loss": 0.4404, "num_tokens": 474946262.0, "step": 227 }, { "epoch": 4.956521739130435, "grad_norm": 0.13762364563023322, "learning_rate": 6.974739088858337e-07, "loss": 0.439, "num_tokens": 477036875.0, "step": 228 }, { "epoch": 4.978260869565218, "grad_norm": 0.13969976190229713, "learning_rate": 6.942435002519938e-07, "loss": 0.4379, "num_tokens": 479130327.0, "step": 229 }, { "epoch": 5.0, "grad_norm": 0.13408662519831854, "learning_rate": 6.91004785593197e-07, "loss": 0.4466, "num_tokens": 481221569.0, "step": 230 }, { "epoch": 5.021739130434782, "grad_norm": 0.14032592776135683, "learning_rate": 6.877579514055058e-07, "loss": 0.4396, "num_tokens": 483311867.0, "step": 231 }, { "epoch": 5.043478260869565, "grad_norm": 0.13548191720383043, "learning_rate": 6.845031846525321e-07, "loss": 0.4347, "num_tokens": 485403919.0, "step": 232 }, { "epoch": 5.065217391304348, "grad_norm": 0.130439503781961, "learning_rate": 6.812406727546712e-07, "loss": 0.4389, "num_tokens": 487494574.0, "step": 233 }, { "epoch": 5.086956521739131, "grad_norm": 0.13779175423288925, "learning_rate": 6.779706035783104e-07, "loss": 0.4348, "num_tokens": 489585429.0, "step": 234 }, { "epoch": 5.108695652173913, "grad_norm": 0.13497846536624478, "learning_rate": 6.7469316542501e-07, "loss": 0.4371, "num_tokens": 491679660.0, "step": 235 }, { "epoch": 5.130434782608695, "grad_norm": 0.14168178870357873, "learning_rate": 6.714085470206609e-07, "loss": 0.4383, "num_tokens": 493769766.0, "step": 236 }, { "epoch": 5.1521739130434785, "grad_norm": 0.13453405538660843, "learning_rate": 6.681169375046172e-07, "loss": 0.438, "num_tokens": 495862475.0, "step": 237 }, { "epoch": 5.173913043478261, "grad_norm": 0.13202645111151726, "learning_rate": 6.648185264188042e-07, "loss": 0.4381, "num_tokens": 497955918.0, "step": 238 }, { "epoch": 5.195652173913044, "grad_norm": 0.13125723135499598, "learning_rate": 6.615135036968049e-07, "loss": 0.4364, "num_tokens": 500047691.0, "step": 239 }, { "epoch": 5.217391304347826, "grad_norm": 0.1335404532583581, "learning_rate": 6.582020596529223e-07, "loss": 0.4346, "num_tokens": 502139896.0, "step": 240 }, { "epoch": 5.239130434782608, "grad_norm": 0.13521276914397093, "learning_rate": 6.548843849712204e-07, "loss": 0.4402, "num_tokens": 504233584.0, "step": 241 }, { "epoch": 5.260869565217392, "grad_norm": 0.13773428031418447, "learning_rate": 6.515606706945448e-07, "loss": 0.4369, "num_tokens": 506324690.0, "step": 242 }, { "epoch": 5.282608695652174, "grad_norm": 0.13773029409346027, "learning_rate": 6.482311082135207e-07, "loss": 0.4395, "num_tokens": 508417180.0, "step": 243 }, { "epoch": 5.304347826086957, "grad_norm": 0.1356871115442459, "learning_rate": 6.448958892555331e-07, "loss": 0.4365, "num_tokens": 510508979.0, "step": 244 }, { "epoch": 5.326086956521739, "grad_norm": 0.13534102412383026, "learning_rate": 6.415552058736853e-07, "loss": 0.4389, "num_tokens": 512598937.0, "step": 245 }, { "epoch": 5.3478260869565215, "grad_norm": 0.13264202592618046, "learning_rate": 6.382092504357407e-07, "loss": 0.4317, "num_tokens": 514689904.0, "step": 246 }, { "epoch": 5.369565217391305, "grad_norm": 0.1333826695615688, "learning_rate": 6.348582156130461e-07, "loss": 0.4383, "num_tokens": 516783964.0, "step": 247 }, { "epoch": 5.391304347826087, "grad_norm": 0.13899071330031054, "learning_rate": 6.315022943694351e-07, "loss": 0.4403, "num_tokens": 518876985.0, "step": 248 }, { "epoch": 5.413043478260869, "grad_norm": 0.13778710354618337, "learning_rate": 6.281416799501187e-07, "loss": 0.4399, "num_tokens": 520970152.0, "step": 249 }, { "epoch": 5.434782608695652, "grad_norm": 0.13632128488986214, "learning_rate": 6.247765658705564e-07, "loss": 0.4337, "num_tokens": 523061474.0, "step": 250 }, { "epoch": 5.456521739130435, "grad_norm": 0.1462604946144207, "learning_rate": 6.21407145905313e-07, "loss": 0.4404, "num_tokens": 525152838.0, "step": 251 }, { "epoch": 5.478260869565218, "grad_norm": 0.13766959751545696, "learning_rate": 6.180336140769014e-07, "loss": 0.4408, "num_tokens": 527243818.0, "step": 252 }, { "epoch": 5.5, "grad_norm": 0.13758581395499722, "learning_rate": 6.146561646446086e-07, "loss": 0.4369, "num_tokens": 529336853.0, "step": 253 }, { "epoch": 5.521739130434782, "grad_norm": 0.13513710600810164, "learning_rate": 6.11274992093311e-07, "loss": 0.4308, "num_tokens": 531430293.0, "step": 254 }, { "epoch": 5.543478260869565, "grad_norm": 0.13912312404631008, "learning_rate": 6.078902911222739e-07, "loss": 0.4383, "num_tokens": 533522415.0, "step": 255 }, { "epoch": 5.565217391304348, "grad_norm": 0.1388530569147137, "learning_rate": 6.045022566339418e-07, "loss": 0.4376, "num_tokens": 535617186.0, "step": 256 }, { "epoch": 5.586956521739131, "grad_norm": 0.1374779214335327, "learning_rate": 6.011110837227137e-07, "loss": 0.4308, "num_tokens": 537709724.0, "step": 257 }, { "epoch": 5.608695652173913, "grad_norm": 0.1377630412176324, "learning_rate": 5.977169676637097e-07, "loss": 0.4468, "num_tokens": 539801117.0, "step": 258 }, { "epoch": 5.630434782608695, "grad_norm": 0.13653727942572233, "learning_rate": 5.943201039015259e-07, "loss": 0.4388, "num_tokens": 541895241.0, "step": 259 }, { "epoch": 5.6521739130434785, "grad_norm": 0.14282273069692783, "learning_rate": 5.909206880389812e-07, "loss": 0.4377, "num_tokens": 543985798.0, "step": 260 }, { "epoch": 5.673913043478261, "grad_norm": 0.13054693337369536, "learning_rate": 5.87518915825852e-07, "loss": 0.4363, "num_tokens": 546077077.0, "step": 261 }, { "epoch": 5.695652173913043, "grad_norm": 0.13814789552591392, "learning_rate": 5.841149831476024e-07, "loss": 0.4385, "num_tokens": 548170917.0, "step": 262 }, { "epoch": 5.717391304347826, "grad_norm": 0.13844152235488286, "learning_rate": 5.80709086014102e-07, "loss": 0.4333, "num_tokens": 550259924.0, "step": 263 }, { "epoch": 5.739130434782608, "grad_norm": 0.14059296175840055, "learning_rate": 5.773014205483413e-07, "loss": 0.4379, "num_tokens": 552352828.0, "step": 264 }, { "epoch": 5.760869565217392, "grad_norm": 0.13847725520224147, "learning_rate": 5.738921829751373e-07, "loss": 0.442, "num_tokens": 554447030.0, "step": 265 }, { "epoch": 5.782608695652174, "grad_norm": 0.13609874773318442, "learning_rate": 5.704815696098336e-07, "loss": 0.4379, "num_tokens": 556540214.0, "step": 266 }, { "epoch": 5.804347826086957, "grad_norm": 0.13297354961540186, "learning_rate": 5.67069776846997e-07, "loss": 0.4325, "num_tokens": 558631373.0, "step": 267 }, { "epoch": 5.826086956521739, "grad_norm": 0.13567272164852606, "learning_rate": 5.636570011491081e-07, "loss": 0.4388, "num_tokens": 560726058.0, "step": 268 }, { "epoch": 5.8478260869565215, "grad_norm": 0.13162046638977157, "learning_rate": 5.602434390352476e-07, "loss": 0.4329, "num_tokens": 562819414.0, "step": 269 }, { "epoch": 5.869565217391305, "grad_norm": 0.13332396886010534, "learning_rate": 5.568292870697812e-07, "loss": 0.4341, "num_tokens": 564912852.0, "step": 270 }, { "epoch": 5.891304347826087, "grad_norm": 0.13584602864934453, "learning_rate": 5.5341474185104e-07, "loss": 0.4297, "num_tokens": 567005867.0, "step": 271 }, { "epoch": 5.913043478260869, "grad_norm": 0.1369250861070486, "learning_rate": 5.5e-07, "loss": 0.4369, "num_tokens": 569101094.0, "step": 272 }, { "epoch": 5.934782608695652, "grad_norm": 0.1377729350140169, "learning_rate": 5.4658525814896e-07, "loss": 0.4356, "num_tokens": 571193885.0, "step": 273 }, { "epoch": 5.956521739130435, "grad_norm": 0.1365562431007031, "learning_rate": 5.431707129302188e-07, "loss": 0.4363, "num_tokens": 573284677.0, "step": 274 }, { "epoch": 5.978260869565218, "grad_norm": 0.13502269056012137, "learning_rate": 5.397565609647524e-07, "loss": 0.4304, "num_tokens": 575374743.0, "step": 275 }, { "epoch": 6.0, "grad_norm": 0.13363349453152876, "learning_rate": 5.36342998850892e-07, "loss": 0.4368, "num_tokens": 577464868.0, "step": 276 }, { "epoch": 6.021739130434782, "grad_norm": 0.1328059670364238, "learning_rate": 5.329302231530028e-07, "loss": 0.4315, "num_tokens": 579556555.0, "step": 277 }, { "epoch": 6.043478260869565, "grad_norm": 0.13754954688543153, "learning_rate": 5.295184303901664e-07, "loss": 0.4338, "num_tokens": 581648063.0, "step": 278 }, { "epoch": 6.065217391304348, "grad_norm": 0.12821659544185995, "learning_rate": 5.261078170248629e-07, "loss": 0.4355, "num_tokens": 583740874.0, "step": 279 }, { "epoch": 6.086956521739131, "grad_norm": 0.13679137656422818, "learning_rate": 5.226985794516586e-07, "loss": 0.4319, "num_tokens": 585832349.0, "step": 280 }, { "epoch": 6.108695652173913, "grad_norm": 0.13249307568241678, "learning_rate": 5.192909139858981e-07, "loss": 0.4338, "num_tokens": 587921984.0, "step": 281 }, { "epoch": 6.130434782608695, "grad_norm": 0.1357134504296526, "learning_rate": 5.158850168523978e-07, "loss": 0.4406, "num_tokens": 590012104.0, "step": 282 }, { "epoch": 6.1521739130434785, "grad_norm": 0.13359851753248614, "learning_rate": 5.124810841741479e-07, "loss": 0.4367, "num_tokens": 592104332.0, "step": 283 }, { "epoch": 6.173913043478261, "grad_norm": 0.13330128917193781, "learning_rate": 5.090793119610189e-07, "loss": 0.4365, "num_tokens": 594197153.0, "step": 284 }, { "epoch": 6.195652173913044, "grad_norm": 0.1389379881395303, "learning_rate": 5.05679896098474e-07, "loss": 0.4306, "num_tokens": 596290282.0, "step": 285 }, { "epoch": 6.217391304347826, "grad_norm": 0.13502081622963785, "learning_rate": 5.022830323362904e-07, "loss": 0.4339, "num_tokens": 598381737.0, "step": 286 }, { "epoch": 6.239130434782608, "grad_norm": 0.1300343985791597, "learning_rate": 4.988889162772862e-07, "loss": 0.4287, "num_tokens": 600474261.0, "step": 287 }, { "epoch": 6.260869565217392, "grad_norm": 0.1346594845108426, "learning_rate": 4.954977433660582e-07, "loss": 0.4328, "num_tokens": 602567714.0, "step": 288 }, { "epoch": 6.282608695652174, "grad_norm": 0.1347227053154414, "learning_rate": 4.921097088777261e-07, "loss": 0.4279, "num_tokens": 604656969.0, "step": 289 }, { "epoch": 6.304347826086957, "grad_norm": 0.1300345204916007, "learning_rate": 4.887250079066891e-07, "loss": 0.4324, "num_tokens": 606751602.0, "step": 290 }, { "epoch": 6.326086956521739, "grad_norm": 0.1373172335581007, "learning_rate": 4.853438353553913e-07, "loss": 0.4352, "num_tokens": 608844165.0, "step": 291 }, { "epoch": 6.3478260869565215, "grad_norm": 0.1355884963872206, "learning_rate": 4.819663859230986e-07, "loss": 0.4358, "num_tokens": 610938529.0, "step": 292 }, { "epoch": 6.369565217391305, "grad_norm": 0.13411787711864406, "learning_rate": 4.785928540946868e-07, "loss": 0.4353, "num_tokens": 613033101.0, "step": 293 }, { "epoch": 6.391304347826087, "grad_norm": 0.13405503421436976, "learning_rate": 4.752234341294438e-07, "loss": 0.4405, "num_tokens": 615127003.0, "step": 294 }, { "epoch": 6.413043478260869, "grad_norm": 0.13437305820655193, "learning_rate": 4.7185832004988133e-07, "loss": 0.433, "num_tokens": 617218474.0, "step": 295 }, { "epoch": 6.434782608695652, "grad_norm": 0.1321759781171093, "learning_rate": 4.684977056305649e-07, "loss": 0.4391, "num_tokens": 619311600.0, "step": 296 }, { "epoch": 6.456521739130435, "grad_norm": 0.1305282668313758, "learning_rate": 4.6514178438695393e-07, "loss": 0.4388, "num_tokens": 621403757.0, "step": 297 }, { "epoch": 6.478260869565218, "grad_norm": 0.13202954037114603, "learning_rate": 4.6179074956425933e-07, "loss": 0.4292, "num_tokens": 623498451.0, "step": 298 }, { "epoch": 6.5, "grad_norm": 0.13284707048157723, "learning_rate": 4.584447941263149e-07, "loss": 0.4325, "num_tokens": 625591661.0, "step": 299 }, { "epoch": 6.521739130434782, "grad_norm": 0.13602035136541316, "learning_rate": 4.551041107444671e-07, "loss": 0.4392, "num_tokens": 627682691.0, "step": 300 }, { "epoch": 6.543478260869565, "grad_norm": 0.13598544826900796, "learning_rate": 4.517688917864794e-07, "loss": 0.4353, "num_tokens": 629776091.0, "step": 301 }, { "epoch": 6.565217391304348, "grad_norm": 0.1338409318077529, "learning_rate": 4.4843932930545523e-07, "loss": 0.4345, "num_tokens": 631868289.0, "step": 302 }, { "epoch": 6.586956521739131, "grad_norm": 0.1342983907771441, "learning_rate": 4.4511561502877957e-07, "loss": 0.4369, "num_tokens": 633961314.0, "step": 303 }, { "epoch": 6.608695652173913, "grad_norm": 0.13346437927742005, "learning_rate": 4.417979403470777e-07, "loss": 0.431, "num_tokens": 636053320.0, "step": 304 }, { "epoch": 6.630434782608695, "grad_norm": 0.1310788686933386, "learning_rate": 4.384864963031951e-07, "loss": 0.4356, "num_tokens": 638148918.0, "step": 305 }, { "epoch": 6.6521739130434785, "grad_norm": 0.13154555127052447, "learning_rate": 4.3518147358119574e-07, "loss": 0.4339, "num_tokens": 640240048.0, "step": 306 }, { "epoch": 6.673913043478261, "grad_norm": 0.13334619119653454, "learning_rate": 4.3188306249538274e-07, "loss": 0.4314, "num_tokens": 642332226.0, "step": 307 }, { "epoch": 6.695652173913043, "grad_norm": 0.13503339105593512, "learning_rate": 4.285914529793391e-07, "loss": 0.4342, "num_tokens": 644421381.0, "step": 308 }, { "epoch": 6.717391304347826, "grad_norm": 0.13212880134022156, "learning_rate": 4.2530683457499015e-07, "loss": 0.4363, "num_tokens": 646510254.0, "step": 309 }, { "epoch": 6.739130434782608, "grad_norm": 0.13394002235598818, "learning_rate": 4.220293964216898e-07, "loss": 0.4359, "num_tokens": 648602689.0, "step": 310 }, { "epoch": 6.760869565217392, "grad_norm": 0.14660384432956694, "learning_rate": 4.187593272453288e-07, "loss": 0.4365, "num_tokens": 650696691.0, "step": 311 }, { "epoch": 6.782608695652174, "grad_norm": 0.13239093242319802, "learning_rate": 4.154968153474679e-07, "loss": 0.4347, "num_tokens": 652788120.0, "step": 312 }, { "epoch": 6.804347826086957, "grad_norm": 0.1342643540007929, "learning_rate": 4.1224204859449416e-07, "loss": 0.433, "num_tokens": 654880645.0, "step": 313 }, { "epoch": 6.826086956521739, "grad_norm": 0.13275609367078162, "learning_rate": 4.0899521440680306e-07, "loss": 0.4355, "num_tokens": 656971574.0, "step": 314 }, { "epoch": 6.8478260869565215, "grad_norm": 0.13259301897142664, "learning_rate": 4.057564997480063e-07, "loss": 0.4332, "num_tokens": 659063107.0, "step": 315 }, { "epoch": 6.869565217391305, "grad_norm": 0.13223705233189004, "learning_rate": 4.0252609111416633e-07, "loss": 0.4337, "num_tokens": 661155890.0, "step": 316 }, { "epoch": 6.891304347826087, "grad_norm": 0.13224068238361178, "learning_rate": 3.993041745230562e-07, "loss": 0.4309, "num_tokens": 663247212.0, "step": 317 }, { "epoch": 6.913043478260869, "grad_norm": 0.13637009129468228, "learning_rate": 3.9609093550344907e-07, "loss": 0.4321, "num_tokens": 665338057.0, "step": 318 }, { "epoch": 6.934782608695652, "grad_norm": 0.13707779499269707, "learning_rate": 3.9288655908443423e-07, "loss": 0.4329, "num_tokens": 667432205.0, "step": 319 }, { "epoch": 6.956521739130435, "grad_norm": 0.132109300794455, "learning_rate": 3.8969122978476253e-07, "loss": 0.4346, "num_tokens": 669525185.0, "step": 320 }, { "epoch": 6.978260869565218, "grad_norm": 0.13205979491874026, "learning_rate": 3.865051316022214e-07, "loss": 0.4327, "num_tokens": 671616364.0, "step": 321 }, { "epoch": 7.0, "grad_norm": 0.135129481925338, "learning_rate": 3.8332844800303996e-07, "loss": 0.4378, "num_tokens": 673708729.0, "step": 322 }, { "epoch": 7.021739130434782, "grad_norm": 0.1351161665425618, "learning_rate": 3.8016136191132354e-07, "loss": 0.4365, "num_tokens": 675800356.0, "step": 323 }, { "epoch": 7.043478260869565, "grad_norm": 0.1349927733423944, "learning_rate": 3.770040556985208e-07, "loss": 0.4328, "num_tokens": 677892616.0, "step": 324 }, { "epoch": 7.065217391304348, "grad_norm": 0.1352555819729584, "learning_rate": 3.738567111729224e-07, "loss": 0.4334, "num_tokens": 679985493.0, "step": 325 }, { "epoch": 7.086956521739131, "grad_norm": 0.13250294100790336, "learning_rate": 3.707195095691913e-07, "loss": 0.4328, "num_tokens": 682078744.0, "step": 326 }, { "epoch": 7.108695652173913, "grad_norm": 0.13666594640233576, "learning_rate": 3.675926315379274e-07, "loss": 0.4322, "num_tokens": 684171663.0, "step": 327 }, { "epoch": 7.130434782608695, "grad_norm": 0.13676036413293755, "learning_rate": 3.644762571352641e-07, "loss": 0.433, "num_tokens": 686264097.0, "step": 328 }, { "epoch": 7.1521739130434785, "grad_norm": 0.12951987903608786, "learning_rate": 3.6137056581250137e-07, "loss": 0.4365, "num_tokens": 688358020.0, "step": 329 }, { "epoch": 7.173913043478261, "grad_norm": 0.13212843134698485, "learning_rate": 3.5827573640577033e-07, "loss": 0.4333, "num_tokens": 690450259.0, "step": 330 }, { "epoch": 7.195652173913044, "grad_norm": 0.13407226703097985, "learning_rate": 3.5519194712573787e-07, "loss": 0.4371, "num_tokens": 692542688.0, "step": 331 }, { "epoch": 7.217391304347826, "grad_norm": 0.13581794126806007, "learning_rate": 3.521193755473423e-07, "loss": 0.4372, "num_tokens": 694633082.0, "step": 332 }, { "epoch": 7.239130434782608, "grad_norm": 0.1276611741507177, "learning_rate": 3.4905819859957e-07, "loss": 0.4303, "num_tokens": 696724260.0, "step": 333 }, { "epoch": 7.260869565217392, "grad_norm": 0.13106145609895156, "learning_rate": 3.460085925552653e-07, "loss": 0.437, "num_tokens": 698813738.0, "step": 334 }, { "epoch": 7.282608695652174, "grad_norm": 0.1306738813490512, "learning_rate": 3.4297073302098155e-07, "loss": 0.432, "num_tokens": 700905567.0, "step": 335 }, { "epoch": 7.304347826086957, "grad_norm": 0.1327736232372846, "learning_rate": 3.399447949268686e-07, "loss": 0.4285, "num_tokens": 702999204.0, "step": 336 }, { "epoch": 7.326086956521739, "grad_norm": 0.1332013722230655, "learning_rate": 3.369309525165997e-07, "loss": 0.4339, "num_tokens": 705092251.0, "step": 337 }, { "epoch": 7.3478260869565215, "grad_norm": 0.12940464875966667, "learning_rate": 3.33929379337338e-07, "loss": 0.4288, "num_tokens": 707186648.0, "step": 338 }, { "epoch": 7.369565217391305, "grad_norm": 0.13228179281843067, "learning_rate": 3.30940248229743e-07, "loss": 0.4302, "num_tokens": 709278066.0, "step": 339 }, { "epoch": 7.391304347826087, "grad_norm": 0.13503745540474288, "learning_rate": 3.279637313180187e-07, "loss": 0.431, "num_tokens": 711370104.0, "step": 340 }, { "epoch": 7.413043478260869, "grad_norm": 0.14399326300596813, "learning_rate": 3.250000000000001e-07, "loss": 0.4295, "num_tokens": 713463519.0, "step": 341 }, { "epoch": 7.434782608695652, "grad_norm": 0.1335803366951806, "learning_rate": 3.220492249372857e-07, "loss": 0.4333, "num_tokens": 715555950.0, "step": 342 }, { "epoch": 7.456521739130435, "grad_norm": 0.1321490413754198, "learning_rate": 3.191115760454092e-07, "loss": 0.4334, "num_tokens": 717644115.0, "step": 343 }, { "epoch": 7.478260869565218, "grad_norm": 0.1378381482716753, "learning_rate": 3.16187222484055e-07, "loss": 0.4278, "num_tokens": 719737075.0, "step": 344 }, { "epoch": 7.5, "grad_norm": 0.13813454110009424, "learning_rate": 3.1327633264731803e-07, "loss": 0.4354, "num_tokens": 721829111.0, "step": 345 }, { "epoch": 7.521739130434782, "grad_norm": 0.13507583558096983, "learning_rate": 3.103790741540067e-07, "loss": 0.4346, "num_tokens": 723921674.0, "step": 346 }, { "epoch": 7.543478260869565, "grad_norm": 0.13094378688454855, "learning_rate": 3.0749561383799107e-07, "loss": 0.4331, "num_tokens": 726014640.0, "step": 347 }, { "epoch": 7.565217391304348, "grad_norm": 0.1340997929945924, "learning_rate": 3.0462611773859536e-07, "loss": 0.4269, "num_tokens": 728103347.0, "step": 348 }, { "epoch": 7.586956521739131, "grad_norm": 0.13188855397354393, "learning_rate": 3.017707510910378e-07, "loss": 0.4315, "num_tokens": 730196549.0, "step": 349 }, { "epoch": 7.608695652173913, "grad_norm": 0.13420100529200588, "learning_rate": 2.9892967831691504e-07, "loss": 0.4287, "num_tokens": 732290824.0, "step": 350 }, { "epoch": 7.630434782608695, "grad_norm": 0.13059199827577136, "learning_rate": 2.961030630147346e-07, "loss": 0.4353, "num_tokens": 734382209.0, "step": 351 }, { "epoch": 7.6521739130434785, "grad_norm": 0.1317331651303684, "learning_rate": 2.9329106795049443e-07, "loss": 0.4291, "num_tokens": 736476467.0, "step": 352 }, { "epoch": 7.673913043478261, "grad_norm": 0.13586368026615056, "learning_rate": 2.904938550483098e-07, "loss": 0.4361, "num_tokens": 738567121.0, "step": 353 }, { "epoch": 7.695652173913043, "grad_norm": 0.1333568766936638, "learning_rate": 2.8771158538108976e-07, "loss": 0.4316, "num_tokens": 740660970.0, "step": 354 }, { "epoch": 7.717391304347826, "grad_norm": 0.13226567822802499, "learning_rate": 2.849444191612613e-07, "loss": 0.4313, "num_tokens": 742752708.0, "step": 355 }, { "epoch": 7.739130434782608, "grad_norm": 0.13495676434313775, "learning_rate": 2.821925157315447e-07, "loss": 0.4304, "num_tokens": 744844699.0, "step": 356 }, { "epoch": 7.760869565217392, "grad_norm": 0.13174968471614787, "learning_rate": 2.7945603355577707e-07, "loss": 0.4331, "num_tokens": 746938461.0, "step": 357 }, { "epoch": 7.782608695652174, "grad_norm": 0.13315770983596392, "learning_rate": 2.7673513020978866e-07, "loss": 0.4382, "num_tokens": 749030330.0, "step": 358 }, { "epoch": 7.804347826086957, "grad_norm": 0.1308412156094041, "learning_rate": 2.7402996237232757e-07, "loss": 0.4318, "num_tokens": 751124081.0, "step": 359 }, { "epoch": 7.826086956521739, "grad_norm": 0.14307594281989688, "learning_rate": 2.713406858160393e-07, "loss": 0.4257, "num_tokens": 753217342.0, "step": 360 }, { "epoch": 7.8478260869565215, "grad_norm": 0.13412489931327856, "learning_rate": 2.686674553984951e-07, "loss": 0.4337, "num_tokens": 755310255.0, "step": 361 }, { "epoch": 7.869565217391305, "grad_norm": 0.13326162890783366, "learning_rate": 2.6601042505327635e-07, "loss": 0.4278, "num_tokens": 757400781.0, "step": 362 }, { "epoch": 7.891304347826087, "grad_norm": 0.13553749314962396, "learning_rate": 2.6336974778110974e-07, "loss": 0.4335, "num_tokens": 759493454.0, "step": 363 }, { "epoch": 7.913043478260869, "grad_norm": 0.13276724428197223, "learning_rate": 2.6074557564105724e-07, "loss": 0.4297, "num_tokens": 761586571.0, "step": 364 }, { "epoch": 7.934782608695652, "grad_norm": 0.13353006154405495, "learning_rate": 2.5813805974175984e-07, "loss": 0.4366, "num_tokens": 763678682.0, "step": 365 }, { "epoch": 7.956521739130435, "grad_norm": 0.13422033595407565, "learning_rate": 2.55547350232736e-07, "loss": 0.4336, "num_tokens": 765769422.0, "step": 366 }, { "epoch": 7.978260869565218, "grad_norm": 0.13744588281494488, "learning_rate": 2.529735962957361e-07, "loss": 0.432, "num_tokens": 767860704.0, "step": 367 }, { "epoch": 8.0, "grad_norm": 0.13605419812550198, "learning_rate": 2.504169461361518e-07, "loss": 0.4346, "num_tokens": 769952460.0, "step": 368 }, { "epoch": 8.021739130434783, "grad_norm": 0.12913143473579597, "learning_rate": 2.478775469744815e-07, "loss": 0.4325, "num_tokens": 772044426.0, "step": 369 }, { "epoch": 8.043478260869565, "grad_norm": 0.12961630524558168, "learning_rate": 2.453555450378535e-07, "loss": 0.4281, "num_tokens": 774137051.0, "step": 370 }, { "epoch": 8.065217391304348, "grad_norm": 0.12615636919892154, "learning_rate": 2.4285108555160575e-07, "loss": 0.4309, "num_tokens": 776225808.0, "step": 371 }, { "epoch": 8.08695652173913, "grad_norm": 0.13305530149593306, "learning_rate": 2.4036431273092235e-07, "loss": 0.4331, "num_tokens": 778316140.0, "step": 372 }, { "epoch": 8.108695652173912, "grad_norm": 0.13211852945485628, "learning_rate": 2.378953697725303e-07, "loss": 0.4295, "num_tokens": 780410250.0, "step": 373 }, { "epoch": 8.130434782608695, "grad_norm": 0.1292232171342386, "learning_rate": 2.3544439884645314e-07, "loss": 0.4354, "num_tokens": 782502683.0, "step": 374 }, { "epoch": 8.152173913043478, "grad_norm": 0.1313175653486085, "learning_rate": 2.3301154108782453e-07, "loss": 0.4256, "num_tokens": 784596876.0, "step": 375 }, { "epoch": 8.173913043478262, "grad_norm": 0.13116271616904762, "learning_rate": 2.3059693658876094e-07, "loss": 0.434, "num_tokens": 786687028.0, "step": 376 }, { "epoch": 8.195652173913043, "grad_norm": 0.1290464225342815, "learning_rate": 2.2820072439029523e-07, "loss": 0.4307, "num_tokens": 788781709.0, "step": 377 }, { "epoch": 8.217391304347826, "grad_norm": 0.13638944899256616, "learning_rate": 2.2582304247436962e-07, "loss": 0.4305, "num_tokens": 790874628.0, "step": 378 }, { "epoch": 8.23913043478261, "grad_norm": 0.13152310229196626, "learning_rate": 2.2346402775589042e-07, "loss": 0.4353, "num_tokens": 792968522.0, "step": 379 }, { "epoch": 8.26086956521739, "grad_norm": 0.13177637659531652, "learning_rate": 2.2112381607484416e-07, "loss": 0.4335, "num_tokens": 795061203.0, "step": 380 }, { "epoch": 8.282608695652174, "grad_norm": 0.1325848225519375, "learning_rate": 2.1880254218847538e-07, "loss": 0.4309, "num_tokens": 797155596.0, "step": 381 }, { "epoch": 8.304347826086957, "grad_norm": 0.13091228502197777, "learning_rate": 2.1650033976352643e-07, "loss": 0.4273, "num_tokens": 799250052.0, "step": 382 }, { "epoch": 8.326086956521738, "grad_norm": 0.12968816077115408, "learning_rate": 2.1421734136854153e-07, "loss": 0.4283, "num_tokens": 801343245.0, "step": 383 }, { "epoch": 8.347826086956522, "grad_norm": 0.13538549199023636, "learning_rate": 2.1195367846623207e-07, "loss": 0.4336, "num_tokens": 803434640.0, "step": 384 }, { "epoch": 8.369565217391305, "grad_norm": 0.12970848773853702, "learning_rate": 2.0970948140590672e-07, "loss": 0.4258, "num_tokens": 805528060.0, "step": 385 }, { "epoch": 8.391304347826088, "grad_norm": 0.13268823242396818, "learning_rate": 2.0748487941596594e-07, "loss": 0.4325, "num_tokens": 807620018.0, "step": 386 }, { "epoch": 8.41304347826087, "grad_norm": 0.130777558774095, "learning_rate": 2.0528000059645995e-07, "loss": 0.4341, "num_tokens": 809712424.0, "step": 387 }, { "epoch": 8.434782608695652, "grad_norm": 0.13276667916448534, "learning_rate": 2.0309497191171281e-07, "loss": 0.4301, "num_tokens": 811806174.0, "step": 388 }, { "epoch": 8.456521739130435, "grad_norm": 0.13377311658786517, "learning_rate": 2.0092991918301106e-07, "loss": 0.437, "num_tokens": 813898764.0, "step": 389 }, { "epoch": 8.478260869565217, "grad_norm": 0.1341059998288671, "learning_rate": 1.9878496708135884e-07, "loss": 0.4323, "num_tokens": 815990077.0, "step": 390 }, { "epoch": 8.5, "grad_norm": 0.13253582182125295, "learning_rate": 1.9666023912029849e-07, "loss": 0.4402, "num_tokens": 818080579.0, "step": 391 }, { "epoch": 8.521739130434783, "grad_norm": 0.1302260861058939, "learning_rate": 1.9455585764879873e-07, "loss": 0.4338, "num_tokens": 820171456.0, "step": 392 }, { "epoch": 8.543478260869565, "grad_norm": 0.12879496335031015, "learning_rate": 1.924719438442085e-07, "loss": 0.4267, "num_tokens": 822264249.0, "step": 393 }, { "epoch": 8.565217391304348, "grad_norm": 0.13047843266489334, "learning_rate": 1.9040861770528043e-07, "loss": 0.4324, "num_tokens": 824355668.0, "step": 394 }, { "epoch": 8.58695652173913, "grad_norm": 0.13118675730217721, "learning_rate": 1.883659980452598e-07, "loss": 0.4286, "num_tokens": 826448015.0, "step": 395 }, { "epoch": 8.608695652173914, "grad_norm": 0.131767591474556, "learning_rate": 1.863442024850438e-07, "loss": 0.4355, "num_tokens": 828540071.0, "step": 396 }, { "epoch": 8.630434782608695, "grad_norm": 0.13480920458872578, "learning_rate": 1.843433474464076e-07, "loss": 0.4302, "num_tokens": 830635170.0, "step": 397 }, { "epoch": 8.652173913043478, "grad_norm": 0.1315376912303259, "learning_rate": 1.8236354814530112e-07, "loss": 0.4359, "num_tokens": 832727964.0, "step": 398 }, { "epoch": 8.673913043478262, "grad_norm": 0.1332959061250401, "learning_rate": 1.80404918585214e-07, "loss": 0.4344, "num_tokens": 834819503.0, "step": 399 }, { "epoch": 8.695652173913043, "grad_norm": 0.1357750786137828, "learning_rate": 1.7846757155061127e-07, "loss": 0.4312, "num_tokens": 836909543.0, "step": 400 }, { "epoch": 8.717391304347826, "grad_norm": 0.1297667217706855, "learning_rate": 1.765516186004387e-07, "loss": 0.4298, "num_tokens": 839002022.0, "step": 401 }, { "epoch": 8.73913043478261, "grad_norm": 0.13324607592951465, "learning_rate": 1.7465717006169887e-07, "loss": 0.4298, "num_tokens": 841093887.0, "step": 402 }, { "epoch": 8.76086956521739, "grad_norm": 0.1319543530134324, "learning_rate": 1.7278433502309808e-07, "loss": 0.4302, "num_tokens": 843185214.0, "step": 403 }, { "epoch": 8.782608695652174, "grad_norm": 0.1292262569354397, "learning_rate": 1.7093322132876485e-07, "loss": 0.4289, "num_tokens": 845275872.0, "step": 404 }, { "epoch": 8.804347826086957, "grad_norm": 0.13330281500828273, "learning_rate": 1.691039355720396e-07, "loss": 0.4333, "num_tokens": 847364898.0, "step": 405 }, { "epoch": 8.826086956521738, "grad_norm": 0.13157685547158732, "learning_rate": 1.6729658308933703e-07, "loss": 0.432, "num_tokens": 849456452.0, "step": 406 }, { "epoch": 8.847826086956522, "grad_norm": 0.13145264305507653, "learning_rate": 1.6551126795408015e-07, "loss": 0.4265, "num_tokens": 851549867.0, "step": 407 }, { "epoch": 8.869565217391305, "grad_norm": 0.13486432492275377, "learning_rate": 1.6374809297070763e-07, "loss": 0.4329, "num_tokens": 853641176.0, "step": 408 }, { "epoch": 8.891304347826086, "grad_norm": 0.12974827151243915, "learning_rate": 1.6200715966875392e-07, "loss": 0.4213, "num_tokens": 855735273.0, "step": 409 }, { "epoch": 8.91304347826087, "grad_norm": 0.13185441487515778, "learning_rate": 1.6028856829700258e-07, "loss": 0.4271, "num_tokens": 857828511.0, "step": 410 }, { "epoch": 8.934782608695652, "grad_norm": 0.13485608760298035, "learning_rate": 1.5859241781771399e-07, "loss": 0.4308, "num_tokens": 859920415.0, "step": 411 }, { "epoch": 8.956521739130435, "grad_norm": 0.1339411805022494, "learning_rate": 1.5691880590092667e-07, "loss": 0.431, "num_tokens": 862012285.0, "step": 412 }, { "epoch": 8.978260869565217, "grad_norm": 0.1349062172890918, "learning_rate": 1.552678289188326e-07, "loss": 0.435, "num_tokens": 864105715.0, "step": 413 }, { "epoch": 9.0, "grad_norm": 0.13339483975994826, "learning_rate": 1.5363958194022895e-07, "loss": 0.4284, "num_tokens": 866196230.0, "step": 414 }, { "epoch": 9.021739130434783, "grad_norm": 0.13155206773063716, "learning_rate": 1.5203415872504246e-07, "loss": 0.4234, "num_tokens": 868289042.0, "step": 415 }, { "epoch": 9.043478260869565, "grad_norm": 0.1265848518754384, "learning_rate": 1.5045165171893116e-07, "loss": 0.4331, "num_tokens": 870380556.0, "step": 416 }, { "epoch": 9.065217391304348, "grad_norm": 0.12943671669921178, "learning_rate": 1.488921520479608e-07, "loss": 0.4332, "num_tokens": 872469107.0, "step": 417 }, { "epoch": 9.08695652173913, "grad_norm": 0.13125183209740615, "learning_rate": 1.473557495133575e-07, "loss": 0.4294, "num_tokens": 874561955.0, "step": 418 }, { "epoch": 9.108695652173912, "grad_norm": 0.13109229633803124, "learning_rate": 1.4584253258633681e-07, "loss": 0.4251, "num_tokens": 876654532.0, "step": 419 }, { "epoch": 9.130434782608695, "grad_norm": 0.13214384571361726, "learning_rate": 1.4435258840300897e-07, "loss": 0.4336, "num_tokens": 878746295.0, "step": 420 }, { "epoch": 9.152173913043478, "grad_norm": 0.1311401974553015, "learning_rate": 1.4288600275936184e-07, "loss": 0.4274, "num_tokens": 880840455.0, "step": 421 }, { "epoch": 9.173913043478262, "grad_norm": 0.13541450363149224, "learning_rate": 1.4144286010631992e-07, "loss": 0.4307, "num_tokens": 882933892.0, "step": 422 }, { "epoch": 9.195652173913043, "grad_norm": 0.1347209244136294, "learning_rate": 1.4002324354488175e-07, "loss": 0.4321, "num_tokens": 885025725.0, "step": 423 }, { "epoch": 9.217391304347826, "grad_norm": 0.1307731062962624, "learning_rate": 1.3862723482133435e-07, "loss": 0.434, "num_tokens": 887119819.0, "step": 424 }, { "epoch": 9.23913043478261, "grad_norm": 0.13830257430323137, "learning_rate": 1.3725491432254623e-07, "loss": 0.4322, "num_tokens": 889212264.0, "step": 425 }, { "epoch": 9.26086956521739, "grad_norm": 0.13014409193295903, "learning_rate": 1.3590636107133845e-07, "loss": 0.4287, "num_tokens": 891305737.0, "step": 426 }, { "epoch": 9.282608695652174, "grad_norm": 0.13261904746981767, "learning_rate": 1.3458165272193445e-07, "loss": 0.4255, "num_tokens": 893399618.0, "step": 427 }, { "epoch": 9.304347826086957, "grad_norm": 0.1312843101167757, "learning_rate": 1.3328086555548762e-07, "loss": 0.4297, "num_tokens": 895491469.0, "step": 428 }, { "epoch": 9.326086956521738, "grad_norm": 0.1364003552911898, "learning_rate": 1.3200407447568984e-07, "loss": 0.4341, "num_tokens": 897584632.0, "step": 429 }, { "epoch": 9.347826086956522, "grad_norm": 0.1339886261490337, "learning_rate": 1.3075135300445745e-07, "loss": 0.4281, "num_tokens": 899675181.0, "step": 430 }, { "epoch": 9.369565217391305, "grad_norm": 0.1337370078605457, "learning_rate": 1.2952277327769804e-07, "loss": 0.434, "num_tokens": 901770066.0, "step": 431 }, { "epoch": 9.391304347826088, "grad_norm": 0.13055173922308577, "learning_rate": 1.2831840604115645e-07, "loss": 0.4276, "num_tokens": 903859911.0, "step": 432 }, { "epoch": 9.41304347826087, "grad_norm": 0.13139166365646263, "learning_rate": 1.2713832064634125e-07, "loss": 0.428, "num_tokens": 905953157.0, "step": 433 }, { "epoch": 9.434782608695652, "grad_norm": 0.13103415317501538, "learning_rate": 1.259825850465308e-07, "loss": 0.4323, "num_tokens": 908046526.0, "step": 434 }, { "epoch": 9.456521739130435, "grad_norm": 0.1286719245844758, "learning_rate": 1.2485126579286066e-07, "loss": 0.4341, "num_tokens": 910137413.0, "step": 435 }, { "epoch": 9.478260869565217, "grad_norm": 0.12910865959756296, "learning_rate": 1.2374442803049124e-07, "loss": 0.4314, "num_tokens": 912227811.0, "step": 436 }, { "epoch": 9.5, "grad_norm": 0.13356681693096856, "learning_rate": 1.2266213549485637e-07, "loss": 0.4261, "num_tokens": 914319892.0, "step": 437 }, { "epoch": 9.521739130434783, "grad_norm": 0.13245196915807964, "learning_rate": 1.2160445050799345e-07, "loss": 0.4336, "num_tokens": 916412324.0, "step": 438 }, { "epoch": 9.543478260869565, "grad_norm": 0.13247117449980128, "learning_rate": 1.205714339749545e-07, "loss": 0.4288, "num_tokens": 918503434.0, "step": 439 }, { "epoch": 9.565217391304348, "grad_norm": 0.13140776239629576, "learning_rate": 1.1956314538029936e-07, "loss": 0.4286, "num_tokens": 920596471.0, "step": 440 }, { "epoch": 9.58695652173913, "grad_norm": 0.1317686114638022, "learning_rate": 1.1857964278467e-07, "loss": 0.4316, "num_tokens": 922688205.0, "step": 441 }, { "epoch": 9.608695652173914, "grad_norm": 0.1300939804698934, "learning_rate": 1.1762098282144734e-07, "loss": 0.4237, "num_tokens": 924777343.0, "step": 442 }, { "epoch": 9.630434782608695, "grad_norm": 0.13046645210086893, "learning_rate": 1.166872206934904e-07, "loss": 0.4308, "num_tokens": 926870486.0, "step": 443 }, { "epoch": 9.652173913043478, "grad_norm": 0.1339176102367468, "learning_rate": 1.157784101699567e-07, "loss": 0.4333, "num_tokens": 928965703.0, "step": 444 }, { "epoch": 9.673913043478262, "grad_norm": 0.13053722597742878, "learning_rate": 1.1489460358320726e-07, "loss": 0.4315, "num_tokens": 931058607.0, "step": 445 }, { "epoch": 9.695652173913043, "grad_norm": 0.1318797432070931, "learning_rate": 1.1403585182579217e-07, "loss": 0.4328, "num_tokens": 933148486.0, "step": 446 }, { "epoch": 9.717391304347826, "grad_norm": 0.12863606554725282, "learning_rate": 1.1320220434752026e-07, "loss": 0.433, "num_tokens": 935240504.0, "step": 447 }, { "epoch": 9.73913043478261, "grad_norm": 0.13076944288775819, "learning_rate": 1.1239370915261193e-07, "loss": 0.4284, "num_tokens": 937328887.0, "step": 448 }, { "epoch": 9.76086956521739, "grad_norm": 0.13176495576730735, "learning_rate": 1.1161041279693445e-07, "loss": 0.4282, "num_tokens": 939420861.0, "step": 449 }, { "epoch": 9.782608695652174, "grad_norm": 0.1326360118816391, "learning_rate": 1.1085236038532148e-07, "loss": 0.4301, "num_tokens": 941513118.0, "step": 450 }, { "epoch": 9.804347826086957, "grad_norm": 0.13082582487161826, "learning_rate": 1.1011959556897558e-07, "loss": 0.4278, "num_tokens": 943605716.0, "step": 451 }, { "epoch": 9.826086956521738, "grad_norm": 0.1314931923327051, "learning_rate": 1.0941216054295468e-07, "loss": 0.4323, "num_tokens": 945698868.0, "step": 452 }, { "epoch": 9.847826086956522, "grad_norm": 0.13055748678716064, "learning_rate": 1.0873009604374245e-07, "loss": 0.433, "num_tokens": 947792760.0, "step": 453 }, { "epoch": 9.869565217391305, "grad_norm": 0.1305072624250908, "learning_rate": 1.0807344134690236e-07, "loss": 0.4307, "num_tokens": 949886033.0, "step": 454 }, { "epoch": 9.891304347826086, "grad_norm": 0.12983181142765143, "learning_rate": 1.074422342648161e-07, "loss": 0.428, "num_tokens": 951978463.0, "step": 455 }, { "epoch": 9.91304347826087, "grad_norm": 0.12986749329326977, "learning_rate": 1.068365111445064e-07, "loss": 0.4259, "num_tokens": 954071277.0, "step": 456 }, { "epoch": 9.934782608695652, "grad_norm": 0.13307807739369767, "learning_rate": 1.0625630686554389e-07, "loss": 0.4333, "num_tokens": 956162411.0, "step": 457 }, { "epoch": 9.956521739130435, "grad_norm": 0.1298842544540006, "learning_rate": 1.0570165483803867e-07, "loss": 0.4381, "num_tokens": 958256872.0, "step": 458 }, { "epoch": 9.978260869565217, "grad_norm": 0.12872143065997999, "learning_rate": 1.0517258700071639e-07, "loss": 0.4322, "num_tokens": 960347511.0, "step": 459 }, { "epoch": 10.0, "grad_norm": 0.12998360754044697, "learning_rate": 1.0466913381907913e-07, "loss": 0.4357, "num_tokens": 962441279.0, "step": 460 }, { "epoch": 10.0, "step": 460, "total_flos": 856414331338752.0, "train_loss": 0.4584593860351521, "train_runtime": 14198.1717, "train_samples_per_second": 66.018, "train_steps_per_second": 0.032 } ], "logging_steps": 1, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 856414331338752.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }