| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 10.0, | |
| "eval_steps": 500, | |
| "global_step": 460, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.021739130434782608, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6431, | |
| "num_tokens": 2090831.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.043478260869565216, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6435, | |
| "num_tokens": 4183006.0, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.06521739130434782, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6379, | |
| "num_tokens": 6272564.0, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.08695652173913043, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6449, | |
| "num_tokens": 8366125.0, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.10869565217391304, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6402, | |
| "num_tokens": 10461703.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.13043478260869565, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6338, | |
| "num_tokens": 12555347.0, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.15217391304347827, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6436, | |
| "num_tokens": 14648409.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.17391304347826086, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6384, | |
| "num_tokens": 16741562.0, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.1956521739130435, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6437, | |
| "num_tokens": 18831318.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.21739130434782608, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.644, | |
| "num_tokens": 20922863.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.2391304347826087, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6419, | |
| "num_tokens": 23014620.0, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.2608695652173913, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6406, | |
| "num_tokens": 25105339.0, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.2826086956521739, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6405, | |
| "num_tokens": 27197795.0, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.30434782608695654, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6448, | |
| "num_tokens": 29291898.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.32608695652173914, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6372, | |
| "num_tokens": 31383496.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.34782608695652173, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.6397, | |
| "num_tokens": 33477448.0, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.3695652173913043, | |
| "grad_norm": 8.101283473215778, | |
| "learning_rate": 0.0, | |
| "loss": 0.6359, | |
| "num_tokens": 35568911.0, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.391304347826087, | |
| "grad_norm": 8.159300793582531, | |
| "learning_rate": 2.1739130434782606e-08, | |
| "loss": 0.6418, | |
| "num_tokens": 37660844.0, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.41304347826086957, | |
| "grad_norm": 8.143428019277147, | |
| "learning_rate": 4.347826086956521e-08, | |
| "loss": 0.6391, | |
| "num_tokens": 39755083.0, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.43478260869565216, | |
| "grad_norm": 8.104124464792218, | |
| "learning_rate": 6.521739130434782e-08, | |
| "loss": 0.6442, | |
| "num_tokens": 41847436.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.45652173913043476, | |
| "grad_norm": 8.101986671909064, | |
| "learning_rate": 8.695652173913042e-08, | |
| "loss": 0.6452, | |
| "num_tokens": 43940263.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.4782608695652174, | |
| "grad_norm": 8.13244304749822, | |
| "learning_rate": 1.0869565217391303e-07, | |
| "loss": 0.646, | |
| "num_tokens": 46033076.0, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 8.147383007336266, | |
| "learning_rate": 1.3043478260869563e-07, | |
| "loss": 0.6435, | |
| "num_tokens": 48124508.0, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.5217391304347826, | |
| "grad_norm": 8.144105003431784, | |
| "learning_rate": 1.5217391304347825e-07, | |
| "loss": 0.639, | |
| "num_tokens": 50217845.0, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.5434782608695652, | |
| "grad_norm": 7.999617610004394, | |
| "learning_rate": 1.7391304347826085e-07, | |
| "loss": 0.6374, | |
| "num_tokens": 52311173.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.5652173913043478, | |
| "grad_norm": 8.109185633091318, | |
| "learning_rate": 1.9565217391304347e-07, | |
| "loss": 0.6335, | |
| "num_tokens": 54401629.0, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.5869565217391305, | |
| "grad_norm": 8.109185633091318, | |
| "learning_rate": 2.1739130434782607e-07, | |
| "loss": 0.6192, | |
| "num_tokens": 56492920.0, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.6086956521739131, | |
| "grad_norm": 9.27996446234992, | |
| "learning_rate": 2.1739130434782607e-07, | |
| "loss": 0.6146, | |
| "num_tokens": 58586157.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.6304347826086957, | |
| "grad_norm": 9.159324927898115, | |
| "learning_rate": 2.391304347826087e-07, | |
| "loss": 0.6175, | |
| "num_tokens": 60679798.0, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.6521739130434783, | |
| "grad_norm": 9.763710040610643, | |
| "learning_rate": 2.6086956521739126e-07, | |
| "loss": 0.6155, | |
| "num_tokens": 62773329.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.6739130434782609, | |
| "grad_norm": 9.779733585945591, | |
| "learning_rate": 2.8260869565217386e-07, | |
| "loss": 0.6135, | |
| "num_tokens": 64864578.0, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.6956521739130435, | |
| "grad_norm": 9.779733585945591, | |
| "learning_rate": 3.043478260869565e-07, | |
| "loss": 0.5828, | |
| "num_tokens": 66956700.0, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.717391304347826, | |
| "grad_norm": 13.430953070069496, | |
| "learning_rate": 3.043478260869565e-07, | |
| "loss": 0.5856, | |
| "num_tokens": 69046410.0, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.7391304347826086, | |
| "grad_norm": 13.32547280061291, | |
| "learning_rate": 3.260869565217391e-07, | |
| "loss": 0.5778, | |
| "num_tokens": 71138403.0, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.7608695652173914, | |
| "grad_norm": 13.037082670124171, | |
| "learning_rate": 3.478260869565217e-07, | |
| "loss": 0.5698, | |
| "num_tokens": 73231369.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.782608695652174, | |
| "grad_norm": 11.047732146906828, | |
| "learning_rate": 3.695652173913043e-07, | |
| "loss": 0.5637, | |
| "num_tokens": 75324509.0, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.8043478260869565, | |
| "grad_norm": 9.940315680480857, | |
| "learning_rate": 3.9130434782608694e-07, | |
| "loss": 0.5683, | |
| "num_tokens": 77414031.0, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.8260869565217391, | |
| "grad_norm": 5.482237002272238, | |
| "learning_rate": 4.1304347826086954e-07, | |
| "loss": 0.559, | |
| "num_tokens": 79506910.0, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.8478260869565217, | |
| "grad_norm": 5.078292155102419, | |
| "learning_rate": 4.3478260869565214e-07, | |
| "loss": 0.5504, | |
| "num_tokens": 81597326.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 4.758390837071518, | |
| "learning_rate": 4.5652173913043473e-07, | |
| "loss": 0.5467, | |
| "num_tokens": 83691142.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.8913043478260869, | |
| "grad_norm": 4.533086950846491, | |
| "learning_rate": 4.782608695652174e-07, | |
| "loss": 0.5368, | |
| "num_tokens": 85781965.0, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.9130434782608695, | |
| "grad_norm": 4.3523282131985175, | |
| "learning_rate": 5e-07, | |
| "loss": 0.5356, | |
| "num_tokens": 87872195.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.9347826086956522, | |
| "grad_norm": 4.14903309276372, | |
| "learning_rate": 5.217391304347825e-07, | |
| "loss": 0.536, | |
| "num_tokens": 89964241.0, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.9565217391304348, | |
| "grad_norm": 3.9293608373652873, | |
| "learning_rate": 5.434782608695652e-07, | |
| "loss": 0.537, | |
| "num_tokens": 92057253.0, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.9782608695652174, | |
| "grad_norm": 3.597560116551251, | |
| "learning_rate": 5.652173913043477e-07, | |
| "loss": 0.5299, | |
| "num_tokens": 94149837.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 3.0088158093378103, | |
| "learning_rate": 5.869565217391305e-07, | |
| "loss": 0.5214, | |
| "num_tokens": 96244823.0, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 1.0217391304347827, | |
| "grad_norm": 2.7490399390700904, | |
| "learning_rate": 6.08695652173913e-07, | |
| "loss": 0.5175, | |
| "num_tokens": 98337212.0, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 1.0434782608695652, | |
| "grad_norm": 2.5946058680255386, | |
| "learning_rate": 6.304347826086957e-07, | |
| "loss": 0.5159, | |
| "num_tokens": 100428766.0, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 1.065217391304348, | |
| "grad_norm": 2.503905065889435, | |
| "learning_rate": 6.521739130434782e-07, | |
| "loss": 0.5072, | |
| "num_tokens": 102520698.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 1.0869565217391304, | |
| "grad_norm": 2.456643430738729, | |
| "learning_rate": 6.739130434782609e-07, | |
| "loss": 0.5018, | |
| "num_tokens": 104612511.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.108695652173913, | |
| "grad_norm": 2.3865628559826857, | |
| "learning_rate": 6.956521739130434e-07, | |
| "loss": 0.4942, | |
| "num_tokens": 106707204.0, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 1.1304347826086956, | |
| "grad_norm": 2.3356298780592293, | |
| "learning_rate": 7.17391304347826e-07, | |
| "loss": 0.4918, | |
| "num_tokens": 108800670.0, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 1.1521739130434783, | |
| "grad_norm": 2.304735371322351, | |
| "learning_rate": 7.391304347826086e-07, | |
| "loss": 0.4914, | |
| "num_tokens": 110894728.0, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 1.1739130434782608, | |
| "grad_norm": 2.2915944727664055, | |
| "learning_rate": 7.608695652173913e-07, | |
| "loss": 0.4919, | |
| "num_tokens": 112989220.0, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 1.1956521739130435, | |
| "grad_norm": 2.2452202598548534, | |
| "learning_rate": 7.826086956521739e-07, | |
| "loss": 0.4794, | |
| "num_tokens": 115080217.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 1.2173913043478262, | |
| "grad_norm": 2.179597494769898, | |
| "learning_rate": 8.043478260869565e-07, | |
| "loss": 0.4857, | |
| "num_tokens": 117172369.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 1.2391304347826086, | |
| "grad_norm": 2.074697161067729, | |
| "learning_rate": 8.260869565217391e-07, | |
| "loss": 0.4789, | |
| "num_tokens": 119266181.0, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 1.2608695652173914, | |
| "grad_norm": 1.8408360821578929, | |
| "learning_rate": 8.478260869565217e-07, | |
| "loss": 0.4746, | |
| "num_tokens": 121360235.0, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 1.2826086956521738, | |
| "grad_norm": 1.409431046959032, | |
| "learning_rate": 8.695652173913043e-07, | |
| "loss": 0.4767, | |
| "num_tokens": 123452301.0, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 1.3043478260869565, | |
| "grad_norm": 0.8344487221470542, | |
| "learning_rate": 8.913043478260869e-07, | |
| "loss": 0.4775, | |
| "num_tokens": 125544278.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.3260869565217392, | |
| "grad_norm": 0.4342290656199116, | |
| "learning_rate": 9.130434782608695e-07, | |
| "loss": 0.4744, | |
| "num_tokens": 127634616.0, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 1.3478260869565217, | |
| "grad_norm": 0.27358056826447424, | |
| "learning_rate": 9.347826086956522e-07, | |
| "loss": 0.4754, | |
| "num_tokens": 129729453.0, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 1.3695652173913042, | |
| "grad_norm": 0.23082541661596323, | |
| "learning_rate": 9.565217391304349e-07, | |
| "loss": 0.4743, | |
| "num_tokens": 131820225.0, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 1.391304347826087, | |
| "grad_norm": 0.22354521543660916, | |
| "learning_rate": 9.782608695652173e-07, | |
| "loss": 0.472, | |
| "num_tokens": 133913678.0, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 1.4130434782608696, | |
| "grad_norm": 0.20118073906094502, | |
| "learning_rate": 1e-06, | |
| "loss": 0.4697, | |
| "num_tokens": 136007430.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 1.434782608695652, | |
| "grad_norm": 0.20692402483858816, | |
| "learning_rate": 9.999870437446958e-07, | |
| "loss": 0.4631, | |
| "num_tokens": 138101493.0, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 1.4565217391304348, | |
| "grad_norm": 0.1901715221113706, | |
| "learning_rate": 9.999481757248477e-07, | |
| "loss": 0.4666, | |
| "num_tokens": 140193704.0, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 1.4782608695652173, | |
| "grad_norm": 0.19179490787267098, | |
| "learning_rate": 9.998833981786071e-07, | |
| "loss": 0.464, | |
| "num_tokens": 142285250.0, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.18845464387262947, | |
| "learning_rate": 9.997927148360823e-07, | |
| "loss": 0.4692, | |
| "num_tokens": 144378781.0, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 1.5217391304347827, | |
| "grad_norm": 0.1903737563315874, | |
| "learning_rate": 9.996761309191247e-07, | |
| "loss": 0.4596, | |
| "num_tokens": 146471011.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.5434782608695652, | |
| "grad_norm": 0.1736225981517059, | |
| "learning_rate": 9.995336531410273e-07, | |
| "loss": 0.4631, | |
| "num_tokens": 148562357.0, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 1.5652173913043477, | |
| "grad_norm": 0.17508414588076462, | |
| "learning_rate": 9.993652897061393e-07, | |
| "loss": 0.4649, | |
| "num_tokens": 150656061.0, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 1.5869565217391304, | |
| "grad_norm": 0.17741120682256034, | |
| "learning_rate": 9.991710503093922e-07, | |
| "loss": 0.4674, | |
| "num_tokens": 152748871.0, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 1.608695652173913, | |
| "grad_norm": 0.16540006434719232, | |
| "learning_rate": 9.989509461357426e-07, | |
| "loss": 0.4642, | |
| "num_tokens": 154839927.0, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 1.6304347826086958, | |
| "grad_norm": 0.17667838370848385, | |
| "learning_rate": 9.987049898595276e-07, | |
| "loss": 0.4651, | |
| "num_tokens": 156932138.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 1.6521739130434783, | |
| "grad_norm": 0.15619953156457722, | |
| "learning_rate": 9.984331956437354e-07, | |
| "loss": 0.46, | |
| "num_tokens": 159020025.0, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 1.6739130434782608, | |
| "grad_norm": 0.158692399179484, | |
| "learning_rate": 9.98135579139189e-07, | |
| "loss": 0.4642, | |
| "num_tokens": 161110709.0, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 1.6956521739130435, | |
| "grad_norm": 0.16387685015731232, | |
| "learning_rate": 9.97812157483646e-07, | |
| "loss": 0.4598, | |
| "num_tokens": 163201280.0, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 1.7173913043478262, | |
| "grad_norm": 0.16147264627938895, | |
| "learning_rate": 9.974629493008114e-07, | |
| "loss": 0.4593, | |
| "num_tokens": 165293375.0, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 1.7391304347826086, | |
| "grad_norm": 0.14847792515904235, | |
| "learning_rate": 9.97087974699264e-07, | |
| "loss": 0.4539, | |
| "num_tokens": 167386166.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.7608695652173914, | |
| "grad_norm": 0.14368418963515403, | |
| "learning_rate": 9.966872552713004e-07, | |
| "loss": 0.4596, | |
| "num_tokens": 169477777.0, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.7826086956521738, | |
| "grad_norm": 0.14685857026484775, | |
| "learning_rate": 9.962608140916905e-07, | |
| "loss": 0.4582, | |
| "num_tokens": 171567446.0, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 1.8043478260869565, | |
| "grad_norm": 0.15282259101923246, | |
| "learning_rate": 9.958086757163488e-07, | |
| "loss": 0.4616, | |
| "num_tokens": 173655862.0, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 1.8260869565217392, | |
| "grad_norm": 0.15514478856193514, | |
| "learning_rate": 9.953308661809207e-07, | |
| "loss": 0.457, | |
| "num_tokens": 175749514.0, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.8478260869565217, | |
| "grad_norm": 0.14228669183096485, | |
| "learning_rate": 9.948274129992836e-07, | |
| "loss": 0.46, | |
| "num_tokens": 177837992.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.8695652173913042, | |
| "grad_norm": 0.13988219902396246, | |
| "learning_rate": 9.942983451619614e-07, | |
| "loss": 0.4603, | |
| "num_tokens": 179931723.0, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 1.891304347826087, | |
| "grad_norm": 0.1391966834345067, | |
| "learning_rate": 9.93743693134456e-07, | |
| "loss": 0.4617, | |
| "num_tokens": 182023329.0, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.9130434782608696, | |
| "grad_norm": 0.14697062115129336, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.4623, | |
| "num_tokens": 184117906.0, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.9347826086956523, | |
| "grad_norm": 0.13927296968611877, | |
| "learning_rate": 9.92557765735184e-07, | |
| "loss": 0.4563, | |
| "num_tokens": 186211056.0, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.9565217391304348, | |
| "grad_norm": 0.14637757594641007, | |
| "learning_rate": 9.919265586530975e-07, | |
| "loss": 0.4627, | |
| "num_tokens": 188304474.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.9782608695652173, | |
| "grad_norm": 0.142956055977351, | |
| "learning_rate": 9.912699039562576e-07, | |
| "loss": 0.4579, | |
| "num_tokens": 190397770.0, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.14123350435136145, | |
| "learning_rate": 9.905878394570453e-07, | |
| "loss": 0.4602, | |
| "num_tokens": 192489635.0, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 2.0217391304347827, | |
| "grad_norm": 0.1436834934317212, | |
| "learning_rate": 9.898804044310245e-07, | |
| "loss": 0.4558, | |
| "num_tokens": 194583301.0, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 2.0434782608695654, | |
| "grad_norm": 0.1374483001051701, | |
| "learning_rate": 9.891476396146784e-07, | |
| "loss": 0.452, | |
| "num_tokens": 196677306.0, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 2.0652173913043477, | |
| "grad_norm": 0.13722577602290897, | |
| "learning_rate": 9.883895872030657e-07, | |
| "loss": 0.4509, | |
| "num_tokens": 198768835.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 2.0869565217391304, | |
| "grad_norm": 0.13783100429148887, | |
| "learning_rate": 9.87606290847388e-07, | |
| "loss": 0.4572, | |
| "num_tokens": 200859264.0, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 2.108695652173913, | |
| "grad_norm": 0.1367807019239086, | |
| "learning_rate": 9.867977956524796e-07, | |
| "loss": 0.4582, | |
| "num_tokens": 202952584.0, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 2.130434782608696, | |
| "grad_norm": 0.14120080132901613, | |
| "learning_rate": 9.859641481742077e-07, | |
| "loss": 0.4542, | |
| "num_tokens": 205045012.0, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 2.1521739130434785, | |
| "grad_norm": 0.1347346802712429, | |
| "learning_rate": 9.851053964167927e-07, | |
| "loss": 0.4506, | |
| "num_tokens": 207137257.0, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 2.1739130434782608, | |
| "grad_norm": 0.13826725695583056, | |
| "learning_rate": 9.842215898300433e-07, | |
| "loss": 0.4553, | |
| "num_tokens": 209229044.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 2.1956521739130435, | |
| "grad_norm": 0.14306905673630102, | |
| "learning_rate": 9.833127793065097e-07, | |
| "loss": 0.4533, | |
| "num_tokens": 211323327.0, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 2.217391304347826, | |
| "grad_norm": 0.13752558845525534, | |
| "learning_rate": 9.823790171785526e-07, | |
| "loss": 0.4556, | |
| "num_tokens": 213417950.0, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 2.239130434782609, | |
| "grad_norm": 0.1451178643142076, | |
| "learning_rate": 9.814203572153298e-07, | |
| "loss": 0.4542, | |
| "num_tokens": 215508377.0, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 2.260869565217391, | |
| "grad_norm": 0.1341943658847912, | |
| "learning_rate": 9.804368546197006e-07, | |
| "loss": 0.4548, | |
| "num_tokens": 217599824.0, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 2.282608695652174, | |
| "grad_norm": 0.13948206525149412, | |
| "learning_rate": 9.794285660250455e-07, | |
| "loss": 0.4549, | |
| "num_tokens": 219693642.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 2.3043478260869565, | |
| "grad_norm": 0.13795263816003625, | |
| "learning_rate": 9.783955494920066e-07, | |
| "loss": 0.4548, | |
| "num_tokens": 221789678.0, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 2.3260869565217392, | |
| "grad_norm": 0.1409796803835522, | |
| "learning_rate": 9.773378645051436e-07, | |
| "loss": 0.4562, | |
| "num_tokens": 223881586.0, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 2.3478260869565215, | |
| "grad_norm": 0.13044382265443313, | |
| "learning_rate": 9.762555719695088e-07, | |
| "loss": 0.4461, | |
| "num_tokens": 225970167.0, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 2.369565217391304, | |
| "grad_norm": 0.1502600460482269, | |
| "learning_rate": 9.751487342071393e-07, | |
| "loss": 0.4498, | |
| "num_tokens": 228061661.0, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 2.391304347826087, | |
| "grad_norm": 0.1338230534634335, | |
| "learning_rate": 9.740174149534692e-07, | |
| "loss": 0.4485, | |
| "num_tokens": 230153932.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 2.4130434782608696, | |
| "grad_norm": 0.1440831587199848, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.4552, | |
| "num_tokens": 232244695.0, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 2.4347826086956523, | |
| "grad_norm": 0.13658285859261474, | |
| "learning_rate": 9.716815939588436e-07, | |
| "loss": 0.4557, | |
| "num_tokens": 234336105.0, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 2.4565217391304346, | |
| "grad_norm": 0.14313487538739023, | |
| "learning_rate": 9.704772267223019e-07, | |
| "loss": 0.4527, | |
| "num_tokens": 236428770.0, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 2.4782608695652173, | |
| "grad_norm": 0.1400873818490847, | |
| "learning_rate": 9.692486469955424e-07, | |
| "loss": 0.4494, | |
| "num_tokens": 238521551.0, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.13049717910266437, | |
| "learning_rate": 9.6799592552431e-07, | |
| "loss": 0.4458, | |
| "num_tokens": 240614588.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 2.5217391304347827, | |
| "grad_norm": 0.13318259742833988, | |
| "learning_rate": 9.667191344445122e-07, | |
| "loss": 0.4562, | |
| "num_tokens": 242706024.0, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 2.5434782608695654, | |
| "grad_norm": 0.1342513590455262, | |
| "learning_rate": 9.654183472780655e-07, | |
| "loss": 0.4573, | |
| "num_tokens": 244798483.0, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 2.5652173913043477, | |
| "grad_norm": 0.136855839967963, | |
| "learning_rate": 9.640936389286615e-07, | |
| "loss": 0.4472, | |
| "num_tokens": 246891342.0, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 2.5869565217391304, | |
| "grad_norm": 0.137209398518282, | |
| "learning_rate": 9.627450856774539e-07, | |
| "loss": 0.4468, | |
| "num_tokens": 248983299.0, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 2.608695652173913, | |
| "grad_norm": 0.1347427629250892, | |
| "learning_rate": 9.613727651786657e-07, | |
| "loss": 0.4492, | |
| "num_tokens": 251073761.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 2.630434782608696, | |
| "grad_norm": 0.14870638200971545, | |
| "learning_rate": 9.599767564551183e-07, | |
| "loss": 0.4502, | |
| "num_tokens": 253168287.0, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 2.6521739130434785, | |
| "grad_norm": 0.13859516453775694, | |
| "learning_rate": 9.5855713989368e-07, | |
| "loss": 0.4523, | |
| "num_tokens": 255260895.0, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 2.6739130434782608, | |
| "grad_norm": 0.15720652496010762, | |
| "learning_rate": 9.57113997240638e-07, | |
| "loss": 0.4463, | |
| "num_tokens": 257351654.0, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 2.6956521739130435, | |
| "grad_norm": 0.1401244780972256, | |
| "learning_rate": 9.55647411596991e-07, | |
| "loss": 0.4518, | |
| "num_tokens": 259440240.0, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 2.717391304347826, | |
| "grad_norm": 0.15018164735793, | |
| "learning_rate": 9.541574674136632e-07, | |
| "loss": 0.4498, | |
| "num_tokens": 261532114.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 2.7391304347826084, | |
| "grad_norm": 0.13823614764022513, | |
| "learning_rate": 9.526442504866426e-07, | |
| "loss": 0.4558, | |
| "num_tokens": 263623046.0, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 2.7608695652173916, | |
| "grad_norm": 0.14842418144378133, | |
| "learning_rate": 9.511078479520392e-07, | |
| "loss": 0.4523, | |
| "num_tokens": 265714481.0, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 2.782608695652174, | |
| "grad_norm": 0.13803890404964064, | |
| "learning_rate": 9.495483482810687e-07, | |
| "loss": 0.451, | |
| "num_tokens": 267809891.0, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 2.8043478260869565, | |
| "grad_norm": 0.15595909439991468, | |
| "learning_rate": 9.479658412749575e-07, | |
| "loss": 0.4535, | |
| "num_tokens": 269901433.0, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 2.8260869565217392, | |
| "grad_norm": 0.1368971901595536, | |
| "learning_rate": 9.46360418059771e-07, | |
| "loss": 0.451, | |
| "num_tokens": 271991817.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.8478260869565215, | |
| "grad_norm": 0.15455191550217653, | |
| "learning_rate": 9.447321710811674e-07, | |
| "loss": 0.4519, | |
| "num_tokens": 274081721.0, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 2.869565217391304, | |
| "grad_norm": 0.13697363499414442, | |
| "learning_rate": 9.430811940990734e-07, | |
| "loss": 0.4509, | |
| "num_tokens": 276177796.0, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 2.891304347826087, | |
| "grad_norm": 0.14942955290721893, | |
| "learning_rate": 9.41407582182286e-07, | |
| "loss": 0.4472, | |
| "num_tokens": 278269125.0, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 2.9130434782608696, | |
| "grad_norm": 0.13839892788575286, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.4559, | |
| "num_tokens": 280360480.0, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 2.9347826086956523, | |
| "grad_norm": 0.139804429331633, | |
| "learning_rate": 9.37992840331246e-07, | |
| "loss": 0.4512, | |
| "num_tokens": 282455179.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 2.9565217391304346, | |
| "grad_norm": 0.1522025576863328, | |
| "learning_rate": 9.362519070292923e-07, | |
| "loss": 0.4457, | |
| "num_tokens": 284547258.0, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 2.9782608695652173, | |
| "grad_norm": 0.13715855940260502, | |
| "learning_rate": 9.344887320459198e-07, | |
| "loss": 0.4472, | |
| "num_tokens": 286640938.0, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.1457504699757938, | |
| "learning_rate": 9.327034169106629e-07, | |
| "loss": 0.4465, | |
| "num_tokens": 288733869.0, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 3.0217391304347827, | |
| "grad_norm": 0.13583971625499844, | |
| "learning_rate": 9.308960644279604e-07, | |
| "loss": 0.4507, | |
| "num_tokens": 290826317.0, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 3.0434782608695654, | |
| "grad_norm": 0.13589135954621737, | |
| "learning_rate": 9.290667786712352e-07, | |
| "loss": 0.4435, | |
| "num_tokens": 292916931.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 3.0652173913043477, | |
| "grad_norm": 0.13858649102418938, | |
| "learning_rate": 9.272156649769018e-07, | |
| "loss": 0.454, | |
| "num_tokens": 295007551.0, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 3.0869565217391304, | |
| "grad_norm": 0.13151545079660656, | |
| "learning_rate": 9.253428299383012e-07, | |
| "loss": 0.4443, | |
| "num_tokens": 297101123.0, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 3.108695652173913, | |
| "grad_norm": 0.1372256717064563, | |
| "learning_rate": 9.234483813995613e-07, | |
| "loss": 0.4525, | |
| "num_tokens": 299194691.0, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 3.130434782608696, | |
| "grad_norm": 0.1362038166818457, | |
| "learning_rate": 9.215324284493888e-07, | |
| "loss": 0.4441, | |
| "num_tokens": 301288963.0, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 3.1521739130434785, | |
| "grad_norm": 0.13366059430402608, | |
| "learning_rate": 9.19595081414786e-07, | |
| "loss": 0.4499, | |
| "num_tokens": 303381802.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 3.1739130434782608, | |
| "grad_norm": 0.13964481270629708, | |
| "learning_rate": 9.176364518546988e-07, | |
| "loss": 0.4484, | |
| "num_tokens": 305474609.0, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 3.1956521739130435, | |
| "grad_norm": 0.13321001510853767, | |
| "learning_rate": 9.156566525535923e-07, | |
| "loss": 0.4452, | |
| "num_tokens": 307567147.0, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 3.217391304347826, | |
| "grad_norm": 0.14086672975049896, | |
| "learning_rate": 9.136557975149561e-07, | |
| "loss": 0.4446, | |
| "num_tokens": 309657954.0, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 3.239130434782609, | |
| "grad_norm": 0.1334280312981284, | |
| "learning_rate": 9.116340019547401e-07, | |
| "loss": 0.4468, | |
| "num_tokens": 311749336.0, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 3.260869565217391, | |
| "grad_norm": 0.13486315217872386, | |
| "learning_rate": 9.095913822947196e-07, | |
| "loss": 0.4495, | |
| "num_tokens": 313844697.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 3.282608695652174, | |
| "grad_norm": 0.13793460372824046, | |
| "learning_rate": 9.075280561557915e-07, | |
| "loss": 0.4452, | |
| "num_tokens": 315936487.0, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 3.3043478260869565, | |
| "grad_norm": 0.1379832600278911, | |
| "learning_rate": 9.054441423512013e-07, | |
| "loss": 0.4485, | |
| "num_tokens": 318028220.0, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 3.3260869565217392, | |
| "grad_norm": 0.1349244615099038, | |
| "learning_rate": 9.033397608797014e-07, | |
| "loss": 0.4463, | |
| "num_tokens": 320120976.0, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 3.3478260869565215, | |
| "grad_norm": 0.13302146086489272, | |
| "learning_rate": 9.012150329186411e-07, | |
| "loss": 0.4446, | |
| "num_tokens": 322213515.0, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 3.369565217391304, | |
| "grad_norm": 0.13213199888987598, | |
| "learning_rate": 8.990700808169889e-07, | |
| "loss": 0.4457, | |
| "num_tokens": 324306085.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 3.391304347826087, | |
| "grad_norm": 0.13576476545257601, | |
| "learning_rate": 8.969050280882872e-07, | |
| "loss": 0.4435, | |
| "num_tokens": 326393943.0, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 3.4130434782608696, | |
| "grad_norm": 0.13269213517048742, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.4421, | |
| "num_tokens": 328487650.0, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 3.4347826086956523, | |
| "grad_norm": 0.1397362160827111, | |
| "learning_rate": 8.925151205840341e-07, | |
| "loss": 0.4434, | |
| "num_tokens": 330581600.0, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 3.4565217391304346, | |
| "grad_norm": 0.13448412881240226, | |
| "learning_rate": 8.902905185940933e-07, | |
| "loss": 0.4392, | |
| "num_tokens": 332675243.0, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 3.4782608695652173, | |
| "grad_norm": 0.13581732207346198, | |
| "learning_rate": 8.880463215337679e-07, | |
| "loss": 0.4412, | |
| "num_tokens": 334764586.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.1390811493560612, | |
| "learning_rate": 8.857826586314586e-07, | |
| "loss": 0.4402, | |
| "num_tokens": 336856307.0, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 3.5217391304347827, | |
| "grad_norm": 0.13460989761218342, | |
| "learning_rate": 8.834996602364736e-07, | |
| "loss": 0.4456, | |
| "num_tokens": 338950008.0, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 3.5434782608695654, | |
| "grad_norm": 0.13236047844850818, | |
| "learning_rate": 8.811974578115248e-07, | |
| "loss": 0.4448, | |
| "num_tokens": 341042954.0, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 3.5652173913043477, | |
| "grad_norm": 0.13163982832755347, | |
| "learning_rate": 8.788761839251558e-07, | |
| "loss": 0.4415, | |
| "num_tokens": 343134592.0, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 3.5869565217391304, | |
| "grad_norm": 0.14154355839698224, | |
| "learning_rate": 8.765359722441094e-07, | |
| "loss": 0.4456, | |
| "num_tokens": 345226933.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 3.608695652173913, | |
| "grad_norm": 0.1355716379437535, | |
| "learning_rate": 8.741769575256304e-07, | |
| "loss": 0.4539, | |
| "num_tokens": 347318986.0, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 3.630434782608696, | |
| "grad_norm": 0.13083494960528463, | |
| "learning_rate": 8.717992756097047e-07, | |
| "loss": 0.4448, | |
| "num_tokens": 349411839.0, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 3.6521739130434785, | |
| "grad_norm": 0.13406159250224642, | |
| "learning_rate": 8.694030634112389e-07, | |
| "loss": 0.4421, | |
| "num_tokens": 351504447.0, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 3.6739130434782608, | |
| "grad_norm": 0.13054451577457893, | |
| "learning_rate": 8.669884589121756e-07, | |
| "loss": 0.4426, | |
| "num_tokens": 353596301.0, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 3.6956521739130435, | |
| "grad_norm": 0.13149102842185534, | |
| "learning_rate": 8.645556011535469e-07, | |
| "loss": 0.4432, | |
| "num_tokens": 355690374.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 3.717391304347826, | |
| "grad_norm": 0.139504340331513, | |
| "learning_rate": 8.621046302274697e-07, | |
| "loss": 0.4464, | |
| "num_tokens": 357782173.0, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 3.7391304347826084, | |
| "grad_norm": 0.13974946687867332, | |
| "learning_rate": 8.596356872690778e-07, | |
| "loss": 0.4441, | |
| "num_tokens": 359875585.0, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 3.7608695652173916, | |
| "grad_norm": 0.1334362497712104, | |
| "learning_rate": 8.571489144483944e-07, | |
| "loss": 0.446, | |
| "num_tokens": 361966708.0, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 3.782608695652174, | |
| "grad_norm": 0.1338803708751177, | |
| "learning_rate": 8.546444549621466e-07, | |
| "loss": 0.4449, | |
| "num_tokens": 364059418.0, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 3.8043478260869565, | |
| "grad_norm": 0.13983854438472215, | |
| "learning_rate": 8.521224530255185e-07, | |
| "loss": 0.4448, | |
| "num_tokens": 366152621.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 3.8260869565217392, | |
| "grad_norm": 0.13196553812425743, | |
| "learning_rate": 8.495830538638481e-07, | |
| "loss": 0.4418, | |
| "num_tokens": 368245662.0, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 3.8478260869565215, | |
| "grad_norm": 0.13578885820125056, | |
| "learning_rate": 8.470264037042638e-07, | |
| "loss": 0.441, | |
| "num_tokens": 370334430.0, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 3.869565217391304, | |
| "grad_norm": 0.13655268949641716, | |
| "learning_rate": 8.44452649767264e-07, | |
| "loss": 0.4439, | |
| "num_tokens": 372426004.0, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 3.891304347826087, | |
| "grad_norm": 0.13786817781349708, | |
| "learning_rate": 8.418619402582402e-07, | |
| "loss": 0.4449, | |
| "num_tokens": 374517459.0, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 3.9130434782608696, | |
| "grad_norm": 0.13918471016485154, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.4443, | |
| "num_tokens": 376607611.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 3.9347826086956523, | |
| "grad_norm": 0.1403794476474249, | |
| "learning_rate": 8.366302522188902e-07, | |
| "loss": 0.4458, | |
| "num_tokens": 378702030.0, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 3.9565217391304346, | |
| "grad_norm": 0.13426097626521305, | |
| "learning_rate": 8.339895749467237e-07, | |
| "loss": 0.4454, | |
| "num_tokens": 380793542.0, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 3.9782608695652173, | |
| "grad_norm": 0.13694462770495058, | |
| "learning_rate": 8.313325446015051e-07, | |
| "loss": 0.4402, | |
| "num_tokens": 382884886.0, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.14272951717733043, | |
| "learning_rate": 8.286593141839608e-07, | |
| "loss": 0.4423, | |
| "num_tokens": 384977846.0, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 4.021739130434782, | |
| "grad_norm": 0.1330582820057405, | |
| "learning_rate": 8.259700376276723e-07, | |
| "loss": 0.444, | |
| "num_tokens": 387071368.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 4.043478260869565, | |
| "grad_norm": 0.14269491995175726, | |
| "learning_rate": 8.232648697902113e-07, | |
| "loss": 0.4419, | |
| "num_tokens": 389164937.0, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 4.065217391304348, | |
| "grad_norm": 0.14167939313294817, | |
| "learning_rate": 8.205439664442229e-07, | |
| "loss": 0.4374, | |
| "num_tokens": 391258317.0, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 4.086956521739131, | |
| "grad_norm": 0.13301946278649346, | |
| "learning_rate": 8.178074842684554e-07, | |
| "loss": 0.4369, | |
| "num_tokens": 393349237.0, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 4.108695652173913, | |
| "grad_norm": 0.14450522832231033, | |
| "learning_rate": 8.150555808387387e-07, | |
| "loss": 0.441, | |
| "num_tokens": 395440009.0, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 4.130434782608695, | |
| "grad_norm": 0.13877319368593247, | |
| "learning_rate": 8.122884146189103e-07, | |
| "loss": 0.4374, | |
| "num_tokens": 397533811.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 4.1521739130434785, | |
| "grad_norm": 0.14318441928820957, | |
| "learning_rate": 8.095061449516902e-07, | |
| "loss": 0.4427, | |
| "num_tokens": 399626066.0, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 4.173913043478261, | |
| "grad_norm": 0.14579072710487548, | |
| "learning_rate": 8.067089320495056e-07, | |
| "loss": 0.4424, | |
| "num_tokens": 401718435.0, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 4.195652173913044, | |
| "grad_norm": 0.1374419102226233, | |
| "learning_rate": 8.038969369852654e-07, | |
| "loss": 0.443, | |
| "num_tokens": 403809789.0, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 4.217391304347826, | |
| "grad_norm": 0.14176995288895658, | |
| "learning_rate": 8.010703216830851e-07, | |
| "loss": 0.4413, | |
| "num_tokens": 405899766.0, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 4.239130434782608, | |
| "grad_norm": 0.13176722868730403, | |
| "learning_rate": 7.982292489089621e-07, | |
| "loss": 0.4416, | |
| "num_tokens": 407993759.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 4.260869565217392, | |
| "grad_norm": 0.13658573742727376, | |
| "learning_rate": 7.953738822614047e-07, | |
| "loss": 0.4391, | |
| "num_tokens": 410085571.0, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 4.282608695652174, | |
| "grad_norm": 0.13952183028099105, | |
| "learning_rate": 7.92504386162009e-07, | |
| "loss": 0.4349, | |
| "num_tokens": 412179524.0, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 4.304347826086957, | |
| "grad_norm": 0.13564370570138545, | |
| "learning_rate": 7.896209258459932e-07, | |
| "loss": 0.4444, | |
| "num_tokens": 414268672.0, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 4.326086956521739, | |
| "grad_norm": 0.14161938766244753, | |
| "learning_rate": 7.867236673526819e-07, | |
| "loss": 0.4437, | |
| "num_tokens": 416362080.0, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 4.3478260869565215, | |
| "grad_norm": 0.13684396912767316, | |
| "learning_rate": 7.838127775159451e-07, | |
| "loss": 0.4436, | |
| "num_tokens": 418453097.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 4.369565217391305, | |
| "grad_norm": 0.14184406812492364, | |
| "learning_rate": 7.808884239545909e-07, | |
| "loss": 0.4415, | |
| "num_tokens": 420545560.0, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 4.391304347826087, | |
| "grad_norm": 0.1366458217786427, | |
| "learning_rate": 7.779507750627144e-07, | |
| "loss": 0.4402, | |
| "num_tokens": 422636960.0, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 4.413043478260869, | |
| "grad_norm": 0.13773619821700897, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.4437, | |
| "num_tokens": 424730959.0, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 4.434782608695652, | |
| "grad_norm": 0.13758121811172125, | |
| "learning_rate": 7.720362686819813e-07, | |
| "loss": 0.44, | |
| "num_tokens": 426822955.0, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 4.456521739130435, | |
| "grad_norm": 0.13761175022454988, | |
| "learning_rate": 7.690597517702567e-07, | |
| "loss": 0.4423, | |
| "num_tokens": 428914580.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 4.478260869565218, | |
| "grad_norm": 0.13045102347864693, | |
| "learning_rate": 7.660706206626619e-07, | |
| "loss": 0.4364, | |
| "num_tokens": 431007431.0, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 0.13320915243246825, | |
| "learning_rate": 7.630690474834003e-07, | |
| "loss": 0.4376, | |
| "num_tokens": 433101278.0, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 4.521739130434782, | |
| "grad_norm": 0.135879604945715, | |
| "learning_rate": 7.600552050731314e-07, | |
| "loss": 0.4392, | |
| "num_tokens": 435193373.0, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 4.543478260869565, | |
| "grad_norm": 0.13761527466802642, | |
| "learning_rate": 7.570292669790184e-07, | |
| "loss": 0.4383, | |
| "num_tokens": 437286568.0, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 4.565217391304348, | |
| "grad_norm": 0.1403373341420107, | |
| "learning_rate": 7.539914074447348e-07, | |
| "loss": 0.4419, | |
| "num_tokens": 439379915.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 4.586956521739131, | |
| "grad_norm": 0.13604302231295956, | |
| "learning_rate": 7.5094180140043e-07, | |
| "loss": 0.44, | |
| "num_tokens": 441473072.0, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 4.608695652173913, | |
| "grad_norm": 0.1322101614674662, | |
| "learning_rate": 7.478806244526576e-07, | |
| "loss": 0.4436, | |
| "num_tokens": 443567056.0, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 4.630434782608695, | |
| "grad_norm": 0.13669477430314778, | |
| "learning_rate": 7.448080528742623e-07, | |
| "loss": 0.4398, | |
| "num_tokens": 445657660.0, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 4.6521739130434785, | |
| "grad_norm": 0.1415477810417469, | |
| "learning_rate": 7.417242635942297e-07, | |
| "loss": 0.4394, | |
| "num_tokens": 447751320.0, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 4.673913043478261, | |
| "grad_norm": 0.135566768628023, | |
| "learning_rate": 7.38629434187499e-07, | |
| "loss": 0.4386, | |
| "num_tokens": 449843005.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 4.695652173913043, | |
| "grad_norm": 0.14123634359786125, | |
| "learning_rate": 7.355237428647359e-07, | |
| "loss": 0.4415, | |
| "num_tokens": 451936377.0, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 4.717391304347826, | |
| "grad_norm": 0.1357872359821091, | |
| "learning_rate": 7.324073684620725e-07, | |
| "loss": 0.4389, | |
| "num_tokens": 454027246.0, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 4.739130434782608, | |
| "grad_norm": 0.13106073468244842, | |
| "learning_rate": 7.292804904308086e-07, | |
| "loss": 0.4353, | |
| "num_tokens": 456118801.0, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 4.760869565217392, | |
| "grad_norm": 0.13664278397915866, | |
| "learning_rate": 7.261432888270776e-07, | |
| "loss": 0.4436, | |
| "num_tokens": 458211696.0, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 4.782608695652174, | |
| "grad_norm": 0.1359534311622986, | |
| "learning_rate": 7.229959443014793e-07, | |
| "loss": 0.4427, | |
| "num_tokens": 460302365.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 4.804347826086957, | |
| "grad_norm": 0.1391728706721248, | |
| "learning_rate": 7.198386380886764e-07, | |
| "loss": 0.4378, | |
| "num_tokens": 462395009.0, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 4.826086956521739, | |
| "grad_norm": 0.14471303133933838, | |
| "learning_rate": 7.1667155199696e-07, | |
| "loss": 0.4393, | |
| "num_tokens": 464488113.0, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 4.8478260869565215, | |
| "grad_norm": 0.13340639535871296, | |
| "learning_rate": 7.134948683977786e-07, | |
| "loss": 0.4403, | |
| "num_tokens": 466576826.0, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 4.869565217391305, | |
| "grad_norm": 0.13672161474434347, | |
| "learning_rate": 7.103087702152376e-07, | |
| "loss": 0.4377, | |
| "num_tokens": 468668935.0, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 4.891304347826087, | |
| "grad_norm": 0.1344120648043691, | |
| "learning_rate": 7.071134409155658e-07, | |
| "loss": 0.4399, | |
| "num_tokens": 470761454.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 4.913043478260869, | |
| "grad_norm": 0.13711458441843338, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.4432, | |
| "num_tokens": 472854948.0, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 4.934782608695652, | |
| "grad_norm": 0.1355562476824205, | |
| "learning_rate": 7.006958254769437e-07, | |
| "loss": 0.4404, | |
| "num_tokens": 474946262.0, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 4.956521739130435, | |
| "grad_norm": 0.13762364563023322, | |
| "learning_rate": 6.974739088858337e-07, | |
| "loss": 0.439, | |
| "num_tokens": 477036875.0, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 4.978260869565218, | |
| "grad_norm": 0.13969976190229713, | |
| "learning_rate": 6.942435002519938e-07, | |
| "loss": 0.4379, | |
| "num_tokens": 479130327.0, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.13408662519831854, | |
| "learning_rate": 6.91004785593197e-07, | |
| "loss": 0.4466, | |
| "num_tokens": 481221569.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 5.021739130434782, | |
| "grad_norm": 0.14032592776135683, | |
| "learning_rate": 6.877579514055058e-07, | |
| "loss": 0.4396, | |
| "num_tokens": 483311867.0, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 5.043478260869565, | |
| "grad_norm": 0.13548191720383043, | |
| "learning_rate": 6.845031846525321e-07, | |
| "loss": 0.4347, | |
| "num_tokens": 485403919.0, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 5.065217391304348, | |
| "grad_norm": 0.130439503781961, | |
| "learning_rate": 6.812406727546712e-07, | |
| "loss": 0.4389, | |
| "num_tokens": 487494574.0, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 5.086956521739131, | |
| "grad_norm": 0.13779175423288925, | |
| "learning_rate": 6.779706035783104e-07, | |
| "loss": 0.4348, | |
| "num_tokens": 489585429.0, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 5.108695652173913, | |
| "grad_norm": 0.13497846536624478, | |
| "learning_rate": 6.7469316542501e-07, | |
| "loss": 0.4371, | |
| "num_tokens": 491679660.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 5.130434782608695, | |
| "grad_norm": 0.14168178870357873, | |
| "learning_rate": 6.714085470206609e-07, | |
| "loss": 0.4383, | |
| "num_tokens": 493769766.0, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 5.1521739130434785, | |
| "grad_norm": 0.13453405538660843, | |
| "learning_rate": 6.681169375046172e-07, | |
| "loss": 0.438, | |
| "num_tokens": 495862475.0, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 5.173913043478261, | |
| "grad_norm": 0.13202645111151726, | |
| "learning_rate": 6.648185264188042e-07, | |
| "loss": 0.4381, | |
| "num_tokens": 497955918.0, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 5.195652173913044, | |
| "grad_norm": 0.13125723135499598, | |
| "learning_rate": 6.615135036968049e-07, | |
| "loss": 0.4364, | |
| "num_tokens": 500047691.0, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 5.217391304347826, | |
| "grad_norm": 0.1335404532583581, | |
| "learning_rate": 6.582020596529223e-07, | |
| "loss": 0.4346, | |
| "num_tokens": 502139896.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 5.239130434782608, | |
| "grad_norm": 0.13521276914397093, | |
| "learning_rate": 6.548843849712204e-07, | |
| "loss": 0.4402, | |
| "num_tokens": 504233584.0, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 5.260869565217392, | |
| "grad_norm": 0.13773428031418447, | |
| "learning_rate": 6.515606706945448e-07, | |
| "loss": 0.4369, | |
| "num_tokens": 506324690.0, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 5.282608695652174, | |
| "grad_norm": 0.13773029409346027, | |
| "learning_rate": 6.482311082135207e-07, | |
| "loss": 0.4395, | |
| "num_tokens": 508417180.0, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 5.304347826086957, | |
| "grad_norm": 0.1356871115442459, | |
| "learning_rate": 6.448958892555331e-07, | |
| "loss": 0.4365, | |
| "num_tokens": 510508979.0, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 5.326086956521739, | |
| "grad_norm": 0.13534102412383026, | |
| "learning_rate": 6.415552058736853e-07, | |
| "loss": 0.4389, | |
| "num_tokens": 512598937.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 5.3478260869565215, | |
| "grad_norm": 0.13264202592618046, | |
| "learning_rate": 6.382092504357407e-07, | |
| "loss": 0.4317, | |
| "num_tokens": 514689904.0, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 5.369565217391305, | |
| "grad_norm": 0.1333826695615688, | |
| "learning_rate": 6.348582156130461e-07, | |
| "loss": 0.4383, | |
| "num_tokens": 516783964.0, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 5.391304347826087, | |
| "grad_norm": 0.13899071330031054, | |
| "learning_rate": 6.315022943694351e-07, | |
| "loss": 0.4403, | |
| "num_tokens": 518876985.0, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 5.413043478260869, | |
| "grad_norm": 0.13778710354618337, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.4399, | |
| "num_tokens": 520970152.0, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 5.434782608695652, | |
| "grad_norm": 0.13632128488986214, | |
| "learning_rate": 6.247765658705564e-07, | |
| "loss": 0.4337, | |
| "num_tokens": 523061474.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 5.456521739130435, | |
| "grad_norm": 0.1462604946144207, | |
| "learning_rate": 6.21407145905313e-07, | |
| "loss": 0.4404, | |
| "num_tokens": 525152838.0, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 5.478260869565218, | |
| "grad_norm": 0.13766959751545696, | |
| "learning_rate": 6.180336140769014e-07, | |
| "loss": 0.4408, | |
| "num_tokens": 527243818.0, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 0.13758581395499722, | |
| "learning_rate": 6.146561646446086e-07, | |
| "loss": 0.4369, | |
| "num_tokens": 529336853.0, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 5.521739130434782, | |
| "grad_norm": 0.13513710600810164, | |
| "learning_rate": 6.11274992093311e-07, | |
| "loss": 0.4308, | |
| "num_tokens": 531430293.0, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 5.543478260869565, | |
| "grad_norm": 0.13912312404631008, | |
| "learning_rate": 6.078902911222739e-07, | |
| "loss": 0.4383, | |
| "num_tokens": 533522415.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 5.565217391304348, | |
| "grad_norm": 0.1388530569147137, | |
| "learning_rate": 6.045022566339418e-07, | |
| "loss": 0.4376, | |
| "num_tokens": 535617186.0, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 5.586956521739131, | |
| "grad_norm": 0.1374779214335327, | |
| "learning_rate": 6.011110837227137e-07, | |
| "loss": 0.4308, | |
| "num_tokens": 537709724.0, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 5.608695652173913, | |
| "grad_norm": 0.1377630412176324, | |
| "learning_rate": 5.977169676637097e-07, | |
| "loss": 0.4468, | |
| "num_tokens": 539801117.0, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 5.630434782608695, | |
| "grad_norm": 0.13653727942572233, | |
| "learning_rate": 5.943201039015259e-07, | |
| "loss": 0.4388, | |
| "num_tokens": 541895241.0, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 5.6521739130434785, | |
| "grad_norm": 0.14282273069692783, | |
| "learning_rate": 5.909206880389812e-07, | |
| "loss": 0.4377, | |
| "num_tokens": 543985798.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 5.673913043478261, | |
| "grad_norm": 0.13054693337369536, | |
| "learning_rate": 5.87518915825852e-07, | |
| "loss": 0.4363, | |
| "num_tokens": 546077077.0, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 5.695652173913043, | |
| "grad_norm": 0.13814789552591392, | |
| "learning_rate": 5.841149831476024e-07, | |
| "loss": 0.4385, | |
| "num_tokens": 548170917.0, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 5.717391304347826, | |
| "grad_norm": 0.13844152235488286, | |
| "learning_rate": 5.80709086014102e-07, | |
| "loss": 0.4333, | |
| "num_tokens": 550259924.0, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 5.739130434782608, | |
| "grad_norm": 0.14059296175840055, | |
| "learning_rate": 5.773014205483413e-07, | |
| "loss": 0.4379, | |
| "num_tokens": 552352828.0, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 5.760869565217392, | |
| "grad_norm": 0.13847725520224147, | |
| "learning_rate": 5.738921829751373e-07, | |
| "loss": 0.442, | |
| "num_tokens": 554447030.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 5.782608695652174, | |
| "grad_norm": 0.13609874773318442, | |
| "learning_rate": 5.704815696098336e-07, | |
| "loss": 0.4379, | |
| "num_tokens": 556540214.0, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 5.804347826086957, | |
| "grad_norm": 0.13297354961540186, | |
| "learning_rate": 5.67069776846997e-07, | |
| "loss": 0.4325, | |
| "num_tokens": 558631373.0, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 5.826086956521739, | |
| "grad_norm": 0.13567272164852606, | |
| "learning_rate": 5.636570011491081e-07, | |
| "loss": 0.4388, | |
| "num_tokens": 560726058.0, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 5.8478260869565215, | |
| "grad_norm": 0.13162046638977157, | |
| "learning_rate": 5.602434390352476e-07, | |
| "loss": 0.4329, | |
| "num_tokens": 562819414.0, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 5.869565217391305, | |
| "grad_norm": 0.13332396886010534, | |
| "learning_rate": 5.568292870697812e-07, | |
| "loss": 0.4341, | |
| "num_tokens": 564912852.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 5.891304347826087, | |
| "grad_norm": 0.13584602864934453, | |
| "learning_rate": 5.5341474185104e-07, | |
| "loss": 0.4297, | |
| "num_tokens": 567005867.0, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 5.913043478260869, | |
| "grad_norm": 0.1369250861070486, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.4369, | |
| "num_tokens": 569101094.0, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 5.934782608695652, | |
| "grad_norm": 0.1377729350140169, | |
| "learning_rate": 5.4658525814896e-07, | |
| "loss": 0.4356, | |
| "num_tokens": 571193885.0, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 5.956521739130435, | |
| "grad_norm": 0.1365562431007031, | |
| "learning_rate": 5.431707129302188e-07, | |
| "loss": 0.4363, | |
| "num_tokens": 573284677.0, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 5.978260869565218, | |
| "grad_norm": 0.13502269056012137, | |
| "learning_rate": 5.397565609647524e-07, | |
| "loss": 0.4304, | |
| "num_tokens": 575374743.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.13363349453152876, | |
| "learning_rate": 5.36342998850892e-07, | |
| "loss": 0.4368, | |
| "num_tokens": 577464868.0, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 6.021739130434782, | |
| "grad_norm": 0.1328059670364238, | |
| "learning_rate": 5.329302231530028e-07, | |
| "loss": 0.4315, | |
| "num_tokens": 579556555.0, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 6.043478260869565, | |
| "grad_norm": 0.13754954688543153, | |
| "learning_rate": 5.295184303901664e-07, | |
| "loss": 0.4338, | |
| "num_tokens": 581648063.0, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 6.065217391304348, | |
| "grad_norm": 0.12821659544185995, | |
| "learning_rate": 5.261078170248629e-07, | |
| "loss": 0.4355, | |
| "num_tokens": 583740874.0, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 6.086956521739131, | |
| "grad_norm": 0.13679137656422818, | |
| "learning_rate": 5.226985794516586e-07, | |
| "loss": 0.4319, | |
| "num_tokens": 585832349.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 6.108695652173913, | |
| "grad_norm": 0.13249307568241678, | |
| "learning_rate": 5.192909139858981e-07, | |
| "loss": 0.4338, | |
| "num_tokens": 587921984.0, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 6.130434782608695, | |
| "grad_norm": 0.1357134504296526, | |
| "learning_rate": 5.158850168523978e-07, | |
| "loss": 0.4406, | |
| "num_tokens": 590012104.0, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 6.1521739130434785, | |
| "grad_norm": 0.13359851753248614, | |
| "learning_rate": 5.124810841741479e-07, | |
| "loss": 0.4367, | |
| "num_tokens": 592104332.0, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 6.173913043478261, | |
| "grad_norm": 0.13330128917193781, | |
| "learning_rate": 5.090793119610189e-07, | |
| "loss": 0.4365, | |
| "num_tokens": 594197153.0, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 6.195652173913044, | |
| "grad_norm": 0.1389379881395303, | |
| "learning_rate": 5.05679896098474e-07, | |
| "loss": 0.4306, | |
| "num_tokens": 596290282.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 6.217391304347826, | |
| "grad_norm": 0.13502081622963785, | |
| "learning_rate": 5.022830323362904e-07, | |
| "loss": 0.4339, | |
| "num_tokens": 598381737.0, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 6.239130434782608, | |
| "grad_norm": 0.1300343985791597, | |
| "learning_rate": 4.988889162772862e-07, | |
| "loss": 0.4287, | |
| "num_tokens": 600474261.0, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 6.260869565217392, | |
| "grad_norm": 0.1346594845108426, | |
| "learning_rate": 4.954977433660582e-07, | |
| "loss": 0.4328, | |
| "num_tokens": 602567714.0, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 6.282608695652174, | |
| "grad_norm": 0.1347227053154414, | |
| "learning_rate": 4.921097088777261e-07, | |
| "loss": 0.4279, | |
| "num_tokens": 604656969.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 6.304347826086957, | |
| "grad_norm": 0.1300345204916007, | |
| "learning_rate": 4.887250079066891e-07, | |
| "loss": 0.4324, | |
| "num_tokens": 606751602.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 6.326086956521739, | |
| "grad_norm": 0.1373172335581007, | |
| "learning_rate": 4.853438353553913e-07, | |
| "loss": 0.4352, | |
| "num_tokens": 608844165.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 6.3478260869565215, | |
| "grad_norm": 0.1355884963872206, | |
| "learning_rate": 4.819663859230986e-07, | |
| "loss": 0.4358, | |
| "num_tokens": 610938529.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 6.369565217391305, | |
| "grad_norm": 0.13411787711864406, | |
| "learning_rate": 4.785928540946868e-07, | |
| "loss": 0.4353, | |
| "num_tokens": 613033101.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 6.391304347826087, | |
| "grad_norm": 0.13405503421436976, | |
| "learning_rate": 4.752234341294438e-07, | |
| "loss": 0.4405, | |
| "num_tokens": 615127003.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 6.413043478260869, | |
| "grad_norm": 0.13437305820655193, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.433, | |
| "num_tokens": 617218474.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 6.434782608695652, | |
| "grad_norm": 0.1321759781171093, | |
| "learning_rate": 4.684977056305649e-07, | |
| "loss": 0.4391, | |
| "num_tokens": 619311600.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 6.456521739130435, | |
| "grad_norm": 0.1305282668313758, | |
| "learning_rate": 4.6514178438695393e-07, | |
| "loss": 0.4388, | |
| "num_tokens": 621403757.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 6.478260869565218, | |
| "grad_norm": 0.13202954037114603, | |
| "learning_rate": 4.6179074956425933e-07, | |
| "loss": 0.4292, | |
| "num_tokens": 623498451.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "grad_norm": 0.13284707048157723, | |
| "learning_rate": 4.584447941263149e-07, | |
| "loss": 0.4325, | |
| "num_tokens": 625591661.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 6.521739130434782, | |
| "grad_norm": 0.13602035136541316, | |
| "learning_rate": 4.551041107444671e-07, | |
| "loss": 0.4392, | |
| "num_tokens": 627682691.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 6.543478260869565, | |
| "grad_norm": 0.13598544826900796, | |
| "learning_rate": 4.517688917864794e-07, | |
| "loss": 0.4353, | |
| "num_tokens": 629776091.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 6.565217391304348, | |
| "grad_norm": 0.1338409318077529, | |
| "learning_rate": 4.4843932930545523e-07, | |
| "loss": 0.4345, | |
| "num_tokens": 631868289.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 6.586956521739131, | |
| "grad_norm": 0.1342983907771441, | |
| "learning_rate": 4.4511561502877957e-07, | |
| "loss": 0.4369, | |
| "num_tokens": 633961314.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 6.608695652173913, | |
| "grad_norm": 0.13346437927742005, | |
| "learning_rate": 4.417979403470777e-07, | |
| "loss": 0.431, | |
| "num_tokens": 636053320.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 6.630434782608695, | |
| "grad_norm": 0.1310788686933386, | |
| "learning_rate": 4.384864963031951e-07, | |
| "loss": 0.4356, | |
| "num_tokens": 638148918.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 6.6521739130434785, | |
| "grad_norm": 0.13154555127052447, | |
| "learning_rate": 4.3518147358119574e-07, | |
| "loss": 0.4339, | |
| "num_tokens": 640240048.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 6.673913043478261, | |
| "grad_norm": 0.13334619119653454, | |
| "learning_rate": 4.3188306249538274e-07, | |
| "loss": 0.4314, | |
| "num_tokens": 642332226.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 6.695652173913043, | |
| "grad_norm": 0.13503339105593512, | |
| "learning_rate": 4.285914529793391e-07, | |
| "loss": 0.4342, | |
| "num_tokens": 644421381.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 6.717391304347826, | |
| "grad_norm": 0.13212880134022156, | |
| "learning_rate": 4.2530683457499015e-07, | |
| "loss": 0.4363, | |
| "num_tokens": 646510254.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 6.739130434782608, | |
| "grad_norm": 0.13394002235598818, | |
| "learning_rate": 4.220293964216898e-07, | |
| "loss": 0.4359, | |
| "num_tokens": 648602689.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 6.760869565217392, | |
| "grad_norm": 0.14660384432956694, | |
| "learning_rate": 4.187593272453288e-07, | |
| "loss": 0.4365, | |
| "num_tokens": 650696691.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 6.782608695652174, | |
| "grad_norm": 0.13239093242319802, | |
| "learning_rate": 4.154968153474679e-07, | |
| "loss": 0.4347, | |
| "num_tokens": 652788120.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 6.804347826086957, | |
| "grad_norm": 0.1342643540007929, | |
| "learning_rate": 4.1224204859449416e-07, | |
| "loss": 0.433, | |
| "num_tokens": 654880645.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 6.826086956521739, | |
| "grad_norm": 0.13275609367078162, | |
| "learning_rate": 4.0899521440680306e-07, | |
| "loss": 0.4355, | |
| "num_tokens": 656971574.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 6.8478260869565215, | |
| "grad_norm": 0.13259301897142664, | |
| "learning_rate": 4.057564997480063e-07, | |
| "loss": 0.4332, | |
| "num_tokens": 659063107.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 6.869565217391305, | |
| "grad_norm": 0.13223705233189004, | |
| "learning_rate": 4.0252609111416633e-07, | |
| "loss": 0.4337, | |
| "num_tokens": 661155890.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 6.891304347826087, | |
| "grad_norm": 0.13224068238361178, | |
| "learning_rate": 3.993041745230562e-07, | |
| "loss": 0.4309, | |
| "num_tokens": 663247212.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 6.913043478260869, | |
| "grad_norm": 0.13637009129468228, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.4321, | |
| "num_tokens": 665338057.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 6.934782608695652, | |
| "grad_norm": 0.13707779499269707, | |
| "learning_rate": 3.9288655908443423e-07, | |
| "loss": 0.4329, | |
| "num_tokens": 667432205.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 6.956521739130435, | |
| "grad_norm": 0.132109300794455, | |
| "learning_rate": 3.8969122978476253e-07, | |
| "loss": 0.4346, | |
| "num_tokens": 669525185.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 6.978260869565218, | |
| "grad_norm": 0.13205979491874026, | |
| "learning_rate": 3.865051316022214e-07, | |
| "loss": 0.4327, | |
| "num_tokens": 671616364.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.135129481925338, | |
| "learning_rate": 3.8332844800303996e-07, | |
| "loss": 0.4378, | |
| "num_tokens": 673708729.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 7.021739130434782, | |
| "grad_norm": 0.1351161665425618, | |
| "learning_rate": 3.8016136191132354e-07, | |
| "loss": 0.4365, | |
| "num_tokens": 675800356.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 7.043478260869565, | |
| "grad_norm": 0.1349927733423944, | |
| "learning_rate": 3.770040556985208e-07, | |
| "loss": 0.4328, | |
| "num_tokens": 677892616.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 7.065217391304348, | |
| "grad_norm": 0.1352555819729584, | |
| "learning_rate": 3.738567111729224e-07, | |
| "loss": 0.4334, | |
| "num_tokens": 679985493.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 7.086956521739131, | |
| "grad_norm": 0.13250294100790336, | |
| "learning_rate": 3.707195095691913e-07, | |
| "loss": 0.4328, | |
| "num_tokens": 682078744.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 7.108695652173913, | |
| "grad_norm": 0.13666594640233576, | |
| "learning_rate": 3.675926315379274e-07, | |
| "loss": 0.4322, | |
| "num_tokens": 684171663.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 7.130434782608695, | |
| "grad_norm": 0.13676036413293755, | |
| "learning_rate": 3.644762571352641e-07, | |
| "loss": 0.433, | |
| "num_tokens": 686264097.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 7.1521739130434785, | |
| "grad_norm": 0.12951987903608786, | |
| "learning_rate": 3.6137056581250137e-07, | |
| "loss": 0.4365, | |
| "num_tokens": 688358020.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 7.173913043478261, | |
| "grad_norm": 0.13212843134698485, | |
| "learning_rate": 3.5827573640577033e-07, | |
| "loss": 0.4333, | |
| "num_tokens": 690450259.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 7.195652173913044, | |
| "grad_norm": 0.13407226703097985, | |
| "learning_rate": 3.5519194712573787e-07, | |
| "loss": 0.4371, | |
| "num_tokens": 692542688.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 7.217391304347826, | |
| "grad_norm": 0.13581794126806007, | |
| "learning_rate": 3.521193755473423e-07, | |
| "loss": 0.4372, | |
| "num_tokens": 694633082.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 7.239130434782608, | |
| "grad_norm": 0.1276611741507177, | |
| "learning_rate": 3.4905819859957e-07, | |
| "loss": 0.4303, | |
| "num_tokens": 696724260.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 7.260869565217392, | |
| "grad_norm": 0.13106145609895156, | |
| "learning_rate": 3.460085925552653e-07, | |
| "loss": 0.437, | |
| "num_tokens": 698813738.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 7.282608695652174, | |
| "grad_norm": 0.1306738813490512, | |
| "learning_rate": 3.4297073302098155e-07, | |
| "loss": 0.432, | |
| "num_tokens": 700905567.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 7.304347826086957, | |
| "grad_norm": 0.1327736232372846, | |
| "learning_rate": 3.399447949268686e-07, | |
| "loss": 0.4285, | |
| "num_tokens": 702999204.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 7.326086956521739, | |
| "grad_norm": 0.1332013722230655, | |
| "learning_rate": 3.369309525165997e-07, | |
| "loss": 0.4339, | |
| "num_tokens": 705092251.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 7.3478260869565215, | |
| "grad_norm": 0.12940464875966667, | |
| "learning_rate": 3.33929379337338e-07, | |
| "loss": 0.4288, | |
| "num_tokens": 707186648.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 7.369565217391305, | |
| "grad_norm": 0.13228179281843067, | |
| "learning_rate": 3.30940248229743e-07, | |
| "loss": 0.4302, | |
| "num_tokens": 709278066.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 7.391304347826087, | |
| "grad_norm": 0.13503745540474288, | |
| "learning_rate": 3.279637313180187e-07, | |
| "loss": 0.431, | |
| "num_tokens": 711370104.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 7.413043478260869, | |
| "grad_norm": 0.14399326300596813, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.4295, | |
| "num_tokens": 713463519.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 7.434782608695652, | |
| "grad_norm": 0.1335803366951806, | |
| "learning_rate": 3.220492249372857e-07, | |
| "loss": 0.4333, | |
| "num_tokens": 715555950.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 7.456521739130435, | |
| "grad_norm": 0.1321490413754198, | |
| "learning_rate": 3.191115760454092e-07, | |
| "loss": 0.4334, | |
| "num_tokens": 717644115.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 7.478260869565218, | |
| "grad_norm": 0.1378381482716753, | |
| "learning_rate": 3.16187222484055e-07, | |
| "loss": 0.4278, | |
| "num_tokens": 719737075.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 0.13813454110009424, | |
| "learning_rate": 3.1327633264731803e-07, | |
| "loss": 0.4354, | |
| "num_tokens": 721829111.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 7.521739130434782, | |
| "grad_norm": 0.13507583558096983, | |
| "learning_rate": 3.103790741540067e-07, | |
| "loss": 0.4346, | |
| "num_tokens": 723921674.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 7.543478260869565, | |
| "grad_norm": 0.13094378688454855, | |
| "learning_rate": 3.0749561383799107e-07, | |
| "loss": 0.4331, | |
| "num_tokens": 726014640.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 7.565217391304348, | |
| "grad_norm": 0.1340997929945924, | |
| "learning_rate": 3.0462611773859536e-07, | |
| "loss": 0.4269, | |
| "num_tokens": 728103347.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 7.586956521739131, | |
| "grad_norm": 0.13188855397354393, | |
| "learning_rate": 3.017707510910378e-07, | |
| "loss": 0.4315, | |
| "num_tokens": 730196549.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 7.608695652173913, | |
| "grad_norm": 0.13420100529200588, | |
| "learning_rate": 2.9892967831691504e-07, | |
| "loss": 0.4287, | |
| "num_tokens": 732290824.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 7.630434782608695, | |
| "grad_norm": 0.13059199827577136, | |
| "learning_rate": 2.961030630147346e-07, | |
| "loss": 0.4353, | |
| "num_tokens": 734382209.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 7.6521739130434785, | |
| "grad_norm": 0.1317331651303684, | |
| "learning_rate": 2.9329106795049443e-07, | |
| "loss": 0.4291, | |
| "num_tokens": 736476467.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 7.673913043478261, | |
| "grad_norm": 0.13586368026615056, | |
| "learning_rate": 2.904938550483098e-07, | |
| "loss": 0.4361, | |
| "num_tokens": 738567121.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 7.695652173913043, | |
| "grad_norm": 0.1333568766936638, | |
| "learning_rate": 2.8771158538108976e-07, | |
| "loss": 0.4316, | |
| "num_tokens": 740660970.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 7.717391304347826, | |
| "grad_norm": 0.13226567822802499, | |
| "learning_rate": 2.849444191612613e-07, | |
| "loss": 0.4313, | |
| "num_tokens": 742752708.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 7.739130434782608, | |
| "grad_norm": 0.13495676434313775, | |
| "learning_rate": 2.821925157315447e-07, | |
| "loss": 0.4304, | |
| "num_tokens": 744844699.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 7.760869565217392, | |
| "grad_norm": 0.13174968471614787, | |
| "learning_rate": 2.7945603355577707e-07, | |
| "loss": 0.4331, | |
| "num_tokens": 746938461.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 7.782608695652174, | |
| "grad_norm": 0.13315770983596392, | |
| "learning_rate": 2.7673513020978866e-07, | |
| "loss": 0.4382, | |
| "num_tokens": 749030330.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 7.804347826086957, | |
| "grad_norm": 0.1308412156094041, | |
| "learning_rate": 2.7402996237232757e-07, | |
| "loss": 0.4318, | |
| "num_tokens": 751124081.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 7.826086956521739, | |
| "grad_norm": 0.14307594281989688, | |
| "learning_rate": 2.713406858160393e-07, | |
| "loss": 0.4257, | |
| "num_tokens": 753217342.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 7.8478260869565215, | |
| "grad_norm": 0.13412489931327856, | |
| "learning_rate": 2.686674553984951e-07, | |
| "loss": 0.4337, | |
| "num_tokens": 755310255.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 7.869565217391305, | |
| "grad_norm": 0.13326162890783366, | |
| "learning_rate": 2.6601042505327635e-07, | |
| "loss": 0.4278, | |
| "num_tokens": 757400781.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 7.891304347826087, | |
| "grad_norm": 0.13553749314962396, | |
| "learning_rate": 2.6336974778110974e-07, | |
| "loss": 0.4335, | |
| "num_tokens": 759493454.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 7.913043478260869, | |
| "grad_norm": 0.13276724428197223, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.4297, | |
| "num_tokens": 761586571.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 7.934782608695652, | |
| "grad_norm": 0.13353006154405495, | |
| "learning_rate": 2.5813805974175984e-07, | |
| "loss": 0.4366, | |
| "num_tokens": 763678682.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 7.956521739130435, | |
| "grad_norm": 0.13422033595407565, | |
| "learning_rate": 2.55547350232736e-07, | |
| "loss": 0.4336, | |
| "num_tokens": 765769422.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 7.978260869565218, | |
| "grad_norm": 0.13744588281494488, | |
| "learning_rate": 2.529735962957361e-07, | |
| "loss": 0.432, | |
| "num_tokens": 767860704.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.13605419812550198, | |
| "learning_rate": 2.504169461361518e-07, | |
| "loss": 0.4346, | |
| "num_tokens": 769952460.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 8.021739130434783, | |
| "grad_norm": 0.12913143473579597, | |
| "learning_rate": 2.478775469744815e-07, | |
| "loss": 0.4325, | |
| "num_tokens": 772044426.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 8.043478260869565, | |
| "grad_norm": 0.12961630524558168, | |
| "learning_rate": 2.453555450378535e-07, | |
| "loss": 0.4281, | |
| "num_tokens": 774137051.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 8.065217391304348, | |
| "grad_norm": 0.12615636919892154, | |
| "learning_rate": 2.4285108555160575e-07, | |
| "loss": 0.4309, | |
| "num_tokens": 776225808.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 8.08695652173913, | |
| "grad_norm": 0.13305530149593306, | |
| "learning_rate": 2.4036431273092235e-07, | |
| "loss": 0.4331, | |
| "num_tokens": 778316140.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 8.108695652173912, | |
| "grad_norm": 0.13211852945485628, | |
| "learning_rate": 2.378953697725303e-07, | |
| "loss": 0.4295, | |
| "num_tokens": 780410250.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 8.130434782608695, | |
| "grad_norm": 0.1292232171342386, | |
| "learning_rate": 2.3544439884645314e-07, | |
| "loss": 0.4354, | |
| "num_tokens": 782502683.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 8.152173913043478, | |
| "grad_norm": 0.1313175653486085, | |
| "learning_rate": 2.3301154108782453e-07, | |
| "loss": 0.4256, | |
| "num_tokens": 784596876.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 8.173913043478262, | |
| "grad_norm": 0.13116271616904762, | |
| "learning_rate": 2.3059693658876094e-07, | |
| "loss": 0.434, | |
| "num_tokens": 786687028.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 8.195652173913043, | |
| "grad_norm": 0.1290464225342815, | |
| "learning_rate": 2.2820072439029523e-07, | |
| "loss": 0.4307, | |
| "num_tokens": 788781709.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 8.217391304347826, | |
| "grad_norm": 0.13638944899256616, | |
| "learning_rate": 2.2582304247436962e-07, | |
| "loss": 0.4305, | |
| "num_tokens": 790874628.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 8.23913043478261, | |
| "grad_norm": 0.13152310229196626, | |
| "learning_rate": 2.2346402775589042e-07, | |
| "loss": 0.4353, | |
| "num_tokens": 792968522.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 8.26086956521739, | |
| "grad_norm": 0.13177637659531652, | |
| "learning_rate": 2.2112381607484416e-07, | |
| "loss": 0.4335, | |
| "num_tokens": 795061203.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 8.282608695652174, | |
| "grad_norm": 0.1325848225519375, | |
| "learning_rate": 2.1880254218847538e-07, | |
| "loss": 0.4309, | |
| "num_tokens": 797155596.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 8.304347826086957, | |
| "grad_norm": 0.13091228502197777, | |
| "learning_rate": 2.1650033976352643e-07, | |
| "loss": 0.4273, | |
| "num_tokens": 799250052.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 8.326086956521738, | |
| "grad_norm": 0.12968816077115408, | |
| "learning_rate": 2.1421734136854153e-07, | |
| "loss": 0.4283, | |
| "num_tokens": 801343245.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 8.347826086956522, | |
| "grad_norm": 0.13538549199023636, | |
| "learning_rate": 2.1195367846623207e-07, | |
| "loss": 0.4336, | |
| "num_tokens": 803434640.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 8.369565217391305, | |
| "grad_norm": 0.12970848773853702, | |
| "learning_rate": 2.0970948140590672e-07, | |
| "loss": 0.4258, | |
| "num_tokens": 805528060.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 8.391304347826088, | |
| "grad_norm": 0.13268823242396818, | |
| "learning_rate": 2.0748487941596594e-07, | |
| "loss": 0.4325, | |
| "num_tokens": 807620018.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 8.41304347826087, | |
| "grad_norm": 0.130777558774095, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.4341, | |
| "num_tokens": 809712424.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 8.434782608695652, | |
| "grad_norm": 0.13276667916448534, | |
| "learning_rate": 2.0309497191171281e-07, | |
| "loss": 0.4301, | |
| "num_tokens": 811806174.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 8.456521739130435, | |
| "grad_norm": 0.13377311658786517, | |
| "learning_rate": 2.0092991918301106e-07, | |
| "loss": 0.437, | |
| "num_tokens": 813898764.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 8.478260869565217, | |
| "grad_norm": 0.1341059998288671, | |
| "learning_rate": 1.9878496708135884e-07, | |
| "loss": 0.4323, | |
| "num_tokens": 815990077.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 0.13253582182125295, | |
| "learning_rate": 1.9666023912029849e-07, | |
| "loss": 0.4402, | |
| "num_tokens": 818080579.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 8.521739130434783, | |
| "grad_norm": 0.1302260861058939, | |
| "learning_rate": 1.9455585764879873e-07, | |
| "loss": 0.4338, | |
| "num_tokens": 820171456.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 8.543478260869565, | |
| "grad_norm": 0.12879496335031015, | |
| "learning_rate": 1.924719438442085e-07, | |
| "loss": 0.4267, | |
| "num_tokens": 822264249.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 8.565217391304348, | |
| "grad_norm": 0.13047843266489334, | |
| "learning_rate": 1.9040861770528043e-07, | |
| "loss": 0.4324, | |
| "num_tokens": 824355668.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 8.58695652173913, | |
| "grad_norm": 0.13118675730217721, | |
| "learning_rate": 1.883659980452598e-07, | |
| "loss": 0.4286, | |
| "num_tokens": 826448015.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 8.608695652173914, | |
| "grad_norm": 0.131767591474556, | |
| "learning_rate": 1.863442024850438e-07, | |
| "loss": 0.4355, | |
| "num_tokens": 828540071.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 8.630434782608695, | |
| "grad_norm": 0.13480920458872578, | |
| "learning_rate": 1.843433474464076e-07, | |
| "loss": 0.4302, | |
| "num_tokens": 830635170.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 8.652173913043478, | |
| "grad_norm": 0.1315376912303259, | |
| "learning_rate": 1.8236354814530112e-07, | |
| "loss": 0.4359, | |
| "num_tokens": 832727964.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 8.673913043478262, | |
| "grad_norm": 0.1332959061250401, | |
| "learning_rate": 1.80404918585214e-07, | |
| "loss": 0.4344, | |
| "num_tokens": 834819503.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 8.695652173913043, | |
| "grad_norm": 0.1357750786137828, | |
| "learning_rate": 1.7846757155061127e-07, | |
| "loss": 0.4312, | |
| "num_tokens": 836909543.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 8.717391304347826, | |
| "grad_norm": 0.1297667217706855, | |
| "learning_rate": 1.765516186004387e-07, | |
| "loss": 0.4298, | |
| "num_tokens": 839002022.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 8.73913043478261, | |
| "grad_norm": 0.13324607592951465, | |
| "learning_rate": 1.7465717006169887e-07, | |
| "loss": 0.4298, | |
| "num_tokens": 841093887.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 8.76086956521739, | |
| "grad_norm": 0.1319543530134324, | |
| "learning_rate": 1.7278433502309808e-07, | |
| "loss": 0.4302, | |
| "num_tokens": 843185214.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 8.782608695652174, | |
| "grad_norm": 0.1292262569354397, | |
| "learning_rate": 1.7093322132876485e-07, | |
| "loss": 0.4289, | |
| "num_tokens": 845275872.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 8.804347826086957, | |
| "grad_norm": 0.13330281500828273, | |
| "learning_rate": 1.691039355720396e-07, | |
| "loss": 0.4333, | |
| "num_tokens": 847364898.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 8.826086956521738, | |
| "grad_norm": 0.13157685547158732, | |
| "learning_rate": 1.6729658308933703e-07, | |
| "loss": 0.432, | |
| "num_tokens": 849456452.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 8.847826086956522, | |
| "grad_norm": 0.13145264305507653, | |
| "learning_rate": 1.6551126795408015e-07, | |
| "loss": 0.4265, | |
| "num_tokens": 851549867.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 8.869565217391305, | |
| "grad_norm": 0.13486432492275377, | |
| "learning_rate": 1.6374809297070763e-07, | |
| "loss": 0.4329, | |
| "num_tokens": 853641176.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 8.891304347826086, | |
| "grad_norm": 0.12974827151243915, | |
| "learning_rate": 1.6200715966875392e-07, | |
| "loss": 0.4213, | |
| "num_tokens": 855735273.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 8.91304347826087, | |
| "grad_norm": 0.13185441487515778, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.4271, | |
| "num_tokens": 857828511.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 8.934782608695652, | |
| "grad_norm": 0.13485608760298035, | |
| "learning_rate": 1.5859241781771399e-07, | |
| "loss": 0.4308, | |
| "num_tokens": 859920415.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 8.956521739130435, | |
| "grad_norm": 0.1339411805022494, | |
| "learning_rate": 1.5691880590092667e-07, | |
| "loss": 0.431, | |
| "num_tokens": 862012285.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 8.978260869565217, | |
| "grad_norm": 0.1349062172890918, | |
| "learning_rate": 1.552678289188326e-07, | |
| "loss": 0.435, | |
| "num_tokens": 864105715.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.13339483975994826, | |
| "learning_rate": 1.5363958194022895e-07, | |
| "loss": 0.4284, | |
| "num_tokens": 866196230.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 9.021739130434783, | |
| "grad_norm": 0.13155206773063716, | |
| "learning_rate": 1.5203415872504246e-07, | |
| "loss": 0.4234, | |
| "num_tokens": 868289042.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 9.043478260869565, | |
| "grad_norm": 0.1265848518754384, | |
| "learning_rate": 1.5045165171893116e-07, | |
| "loss": 0.4331, | |
| "num_tokens": 870380556.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 9.065217391304348, | |
| "grad_norm": 0.12943671669921178, | |
| "learning_rate": 1.488921520479608e-07, | |
| "loss": 0.4332, | |
| "num_tokens": 872469107.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 9.08695652173913, | |
| "grad_norm": 0.13125183209740615, | |
| "learning_rate": 1.473557495133575e-07, | |
| "loss": 0.4294, | |
| "num_tokens": 874561955.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 9.108695652173912, | |
| "grad_norm": 0.13109229633803124, | |
| "learning_rate": 1.4584253258633681e-07, | |
| "loss": 0.4251, | |
| "num_tokens": 876654532.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 9.130434782608695, | |
| "grad_norm": 0.13214384571361726, | |
| "learning_rate": 1.4435258840300897e-07, | |
| "loss": 0.4336, | |
| "num_tokens": 878746295.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 9.152173913043478, | |
| "grad_norm": 0.1311401974553015, | |
| "learning_rate": 1.4288600275936184e-07, | |
| "loss": 0.4274, | |
| "num_tokens": 880840455.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 9.173913043478262, | |
| "grad_norm": 0.13541450363149224, | |
| "learning_rate": 1.4144286010631992e-07, | |
| "loss": 0.4307, | |
| "num_tokens": 882933892.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 9.195652173913043, | |
| "grad_norm": 0.1347209244136294, | |
| "learning_rate": 1.4002324354488175e-07, | |
| "loss": 0.4321, | |
| "num_tokens": 885025725.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 9.217391304347826, | |
| "grad_norm": 0.1307731062962624, | |
| "learning_rate": 1.3862723482133435e-07, | |
| "loss": 0.434, | |
| "num_tokens": 887119819.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 9.23913043478261, | |
| "grad_norm": 0.13830257430323137, | |
| "learning_rate": 1.3725491432254623e-07, | |
| "loss": 0.4322, | |
| "num_tokens": 889212264.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 9.26086956521739, | |
| "grad_norm": 0.13014409193295903, | |
| "learning_rate": 1.3590636107133845e-07, | |
| "loss": 0.4287, | |
| "num_tokens": 891305737.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 9.282608695652174, | |
| "grad_norm": 0.13261904746981767, | |
| "learning_rate": 1.3458165272193445e-07, | |
| "loss": 0.4255, | |
| "num_tokens": 893399618.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 9.304347826086957, | |
| "grad_norm": 0.1312843101167757, | |
| "learning_rate": 1.3328086555548762e-07, | |
| "loss": 0.4297, | |
| "num_tokens": 895491469.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 9.326086956521738, | |
| "grad_norm": 0.1364003552911898, | |
| "learning_rate": 1.3200407447568984e-07, | |
| "loss": 0.4341, | |
| "num_tokens": 897584632.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 9.347826086956522, | |
| "grad_norm": 0.1339886261490337, | |
| "learning_rate": 1.3075135300445745e-07, | |
| "loss": 0.4281, | |
| "num_tokens": 899675181.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 9.369565217391305, | |
| "grad_norm": 0.1337370078605457, | |
| "learning_rate": 1.2952277327769804e-07, | |
| "loss": 0.434, | |
| "num_tokens": 901770066.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 9.391304347826088, | |
| "grad_norm": 0.13055173922308577, | |
| "learning_rate": 1.2831840604115645e-07, | |
| "loss": 0.4276, | |
| "num_tokens": 903859911.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 9.41304347826087, | |
| "grad_norm": 0.13139166365646263, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.428, | |
| "num_tokens": 905953157.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 9.434782608695652, | |
| "grad_norm": 0.13103415317501538, | |
| "learning_rate": 1.259825850465308e-07, | |
| "loss": 0.4323, | |
| "num_tokens": 908046526.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 9.456521739130435, | |
| "grad_norm": 0.1286719245844758, | |
| "learning_rate": 1.2485126579286066e-07, | |
| "loss": 0.4341, | |
| "num_tokens": 910137413.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 9.478260869565217, | |
| "grad_norm": 0.12910865959756296, | |
| "learning_rate": 1.2374442803049124e-07, | |
| "loss": 0.4314, | |
| "num_tokens": 912227811.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "grad_norm": 0.13356681693096856, | |
| "learning_rate": 1.2266213549485637e-07, | |
| "loss": 0.4261, | |
| "num_tokens": 914319892.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 9.521739130434783, | |
| "grad_norm": 0.13245196915807964, | |
| "learning_rate": 1.2160445050799345e-07, | |
| "loss": 0.4336, | |
| "num_tokens": 916412324.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 9.543478260869565, | |
| "grad_norm": 0.13247117449980128, | |
| "learning_rate": 1.205714339749545e-07, | |
| "loss": 0.4288, | |
| "num_tokens": 918503434.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 9.565217391304348, | |
| "grad_norm": 0.13140776239629576, | |
| "learning_rate": 1.1956314538029936e-07, | |
| "loss": 0.4286, | |
| "num_tokens": 920596471.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 9.58695652173913, | |
| "grad_norm": 0.1317686114638022, | |
| "learning_rate": 1.1857964278467e-07, | |
| "loss": 0.4316, | |
| "num_tokens": 922688205.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 9.608695652173914, | |
| "grad_norm": 0.1300939804698934, | |
| "learning_rate": 1.1762098282144734e-07, | |
| "loss": 0.4237, | |
| "num_tokens": 924777343.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 9.630434782608695, | |
| "grad_norm": 0.13046645210086893, | |
| "learning_rate": 1.166872206934904e-07, | |
| "loss": 0.4308, | |
| "num_tokens": 926870486.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 9.652173913043478, | |
| "grad_norm": 0.1339176102367468, | |
| "learning_rate": 1.157784101699567e-07, | |
| "loss": 0.4333, | |
| "num_tokens": 928965703.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 9.673913043478262, | |
| "grad_norm": 0.13053722597742878, | |
| "learning_rate": 1.1489460358320726e-07, | |
| "loss": 0.4315, | |
| "num_tokens": 931058607.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 9.695652173913043, | |
| "grad_norm": 0.1318797432070931, | |
| "learning_rate": 1.1403585182579217e-07, | |
| "loss": 0.4328, | |
| "num_tokens": 933148486.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 9.717391304347826, | |
| "grad_norm": 0.12863606554725282, | |
| "learning_rate": 1.1320220434752026e-07, | |
| "loss": 0.433, | |
| "num_tokens": 935240504.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 9.73913043478261, | |
| "grad_norm": 0.13076944288775819, | |
| "learning_rate": 1.1239370915261193e-07, | |
| "loss": 0.4284, | |
| "num_tokens": 937328887.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 9.76086956521739, | |
| "grad_norm": 0.13176495576730735, | |
| "learning_rate": 1.1161041279693445e-07, | |
| "loss": 0.4282, | |
| "num_tokens": 939420861.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 9.782608695652174, | |
| "grad_norm": 0.1326360118816391, | |
| "learning_rate": 1.1085236038532148e-07, | |
| "loss": 0.4301, | |
| "num_tokens": 941513118.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 9.804347826086957, | |
| "grad_norm": 0.13082582487161826, | |
| "learning_rate": 1.1011959556897558e-07, | |
| "loss": 0.4278, | |
| "num_tokens": 943605716.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 9.826086956521738, | |
| "grad_norm": 0.1314931923327051, | |
| "learning_rate": 1.0941216054295468e-07, | |
| "loss": 0.4323, | |
| "num_tokens": 945698868.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 9.847826086956522, | |
| "grad_norm": 0.13055748678716064, | |
| "learning_rate": 1.0873009604374245e-07, | |
| "loss": 0.433, | |
| "num_tokens": 947792760.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 9.869565217391305, | |
| "grad_norm": 0.1305072624250908, | |
| "learning_rate": 1.0807344134690236e-07, | |
| "loss": 0.4307, | |
| "num_tokens": 949886033.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 9.891304347826086, | |
| "grad_norm": 0.12983181142765143, | |
| "learning_rate": 1.074422342648161e-07, | |
| "loss": 0.428, | |
| "num_tokens": 951978463.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 9.91304347826087, | |
| "grad_norm": 0.12986749329326977, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.4259, | |
| "num_tokens": 954071277.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 9.934782608695652, | |
| "grad_norm": 0.13307807739369767, | |
| "learning_rate": 1.0625630686554389e-07, | |
| "loss": 0.4333, | |
| "num_tokens": 956162411.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 9.956521739130435, | |
| "grad_norm": 0.1298842544540006, | |
| "learning_rate": 1.0570165483803867e-07, | |
| "loss": 0.4381, | |
| "num_tokens": 958256872.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 9.978260869565217, | |
| "grad_norm": 0.12872143065997999, | |
| "learning_rate": 1.0517258700071639e-07, | |
| "loss": 0.4322, | |
| "num_tokens": 960347511.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.12998360754044697, | |
| "learning_rate": 1.0466913381907913e-07, | |
| "loss": 0.4357, | |
| "num_tokens": 962441279.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "step": 460, | |
| "total_flos": 856414331338752.0, | |
| "train_loss": 0.4584593860351521, | |
| "train_runtime": 14198.1717, | |
| "train_samples_per_second": 66.018, | |
| "train_steps_per_second": 0.032 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 460, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 856414331338752.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |