{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.3325688073394497, "eval_steps": 500, "global_step": 2616, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00038226299694189603, "grad_norm": 0.21810047996490894, "learning_rate": 0.0, "loss": 0.3999, "num_tokens": 451800.0, "step": 1 }, { "epoch": 0.0007645259938837921, "grad_norm": 0.22618320492130675, "learning_rate": 1.2658227848101266e-07, "loss": 0.3816, "num_tokens": 859683.0, "step": 2 }, { "epoch": 0.0011467889908256881, "grad_norm": 0.21791046559310479, "learning_rate": 2.5316455696202533e-07, "loss": 0.3841, "num_tokens": 1288518.0, "step": 3 }, { "epoch": 0.0015290519877675841, "grad_norm": 0.2393639885413405, "learning_rate": 3.79746835443038e-07, "loss": 0.3845, "num_tokens": 1705818.0, "step": 4 }, { "epoch": 0.00191131498470948, "grad_norm": 0.2324716366978111, "learning_rate": 5.063291139240507e-07, "loss": 0.3978, "num_tokens": 2197596.0, "step": 5 }, { "epoch": 0.0022935779816513763, "grad_norm": 0.25843851753079405, "learning_rate": 6.329113924050634e-07, "loss": 0.3975, "num_tokens": 2622476.0, "step": 6 }, { "epoch": 0.002675840978593272, "grad_norm": 0.2315188490221056, "learning_rate": 7.59493670886076e-07, "loss": 0.3779, "num_tokens": 3001993.0, "step": 7 }, { "epoch": 0.0030581039755351682, "grad_norm": 0.23681575988460776, "learning_rate": 8.860759493670887e-07, "loss": 0.4222, "num_tokens": 3415434.0, "step": 8 }, { "epoch": 0.0034403669724770644, "grad_norm": 0.23905098958071425, "learning_rate": 1.0126582278481013e-06, "loss": 0.3978, "num_tokens": 3821257.0, "step": 9 }, { "epoch": 0.00382262996941896, "grad_norm": 0.26184159159239856, "learning_rate": 1.139240506329114e-06, "loss": 0.3847, "num_tokens": 4253873.0, "step": 10 }, { "epoch": 0.004204892966360856, "grad_norm": 0.26006057447778413, "learning_rate": 1.2658227848101267e-06, "loss": 0.4175, "num_tokens": 4670787.0, "step": 11 }, { "epoch": 0.0045871559633027525, "grad_norm": 0.29033047360969433, "learning_rate": 1.3924050632911392e-06, "loss": 0.4179, "num_tokens": 5050931.0, "step": 12 }, { "epoch": 0.004969418960244648, "grad_norm": 0.22946819381085923, "learning_rate": 1.518987341772152e-06, "loss": 0.396, "num_tokens": 5449145.0, "step": 13 }, { "epoch": 0.005351681957186544, "grad_norm": 0.23651528126169233, "learning_rate": 1.6455696202531647e-06, "loss": 0.4167, "num_tokens": 5870528.0, "step": 14 }, { "epoch": 0.005733944954128441, "grad_norm": 0.2214382864278198, "learning_rate": 1.7721518987341774e-06, "loss": 0.3823, "num_tokens": 6271849.0, "step": 15 }, { "epoch": 0.0061162079510703364, "grad_norm": 0.23587934762400245, "learning_rate": 1.8987341772151901e-06, "loss": 0.3882, "num_tokens": 6654431.0, "step": 16 }, { "epoch": 0.006498470948012232, "grad_norm": 0.25429501849461983, "learning_rate": 2.0253164556962026e-06, "loss": 0.4154, "num_tokens": 7085591.0, "step": 17 }, { "epoch": 0.006880733944954129, "grad_norm": 0.24366557685855064, "learning_rate": 2.1518987341772153e-06, "loss": 0.3597, "num_tokens": 7488022.0, "step": 18 }, { "epoch": 0.007262996941896025, "grad_norm": 0.23975901272004702, "learning_rate": 2.278481012658228e-06, "loss": 0.3993, "num_tokens": 7873601.0, "step": 19 }, { "epoch": 0.00764525993883792, "grad_norm": 0.23535810193170292, "learning_rate": 2.4050632911392408e-06, "loss": 0.4204, "num_tokens": 8325413.0, "step": 20 }, { "epoch": 0.008027522935779817, "grad_norm": 0.2452036677236248, "learning_rate": 2.5316455696202535e-06, "loss": 0.386, "num_tokens": 8738589.0, "step": 21 }, { "epoch": 0.008409785932721712, "grad_norm": 0.24165922913315604, "learning_rate": 2.6582278481012658e-06, "loss": 0.3995, "num_tokens": 9161420.0, "step": 22 }, { "epoch": 0.008792048929663608, "grad_norm": 0.26628800861046653, "learning_rate": 2.7848101265822785e-06, "loss": 0.3829, "num_tokens": 9569642.0, "step": 23 }, { "epoch": 0.009174311926605505, "grad_norm": 0.23133751904315147, "learning_rate": 2.9113924050632912e-06, "loss": 0.3918, "num_tokens": 9994839.0, "step": 24 }, { "epoch": 0.0095565749235474, "grad_norm": 0.2227586971283885, "learning_rate": 3.037974683544304e-06, "loss": 0.3997, "num_tokens": 10468238.0, "step": 25 }, { "epoch": 0.009938837920489297, "grad_norm": 0.24385418192481537, "learning_rate": 3.164556962025317e-06, "loss": 0.3972, "num_tokens": 10858797.0, "step": 26 }, { "epoch": 0.010321100917431193, "grad_norm": 0.25515084915475794, "learning_rate": 3.2911392405063294e-06, "loss": 0.3655, "num_tokens": 11251677.0, "step": 27 }, { "epoch": 0.010703363914373088, "grad_norm": 0.24269046503305744, "learning_rate": 3.417721518987342e-06, "loss": 0.3705, "num_tokens": 11638850.0, "step": 28 }, { "epoch": 0.011085626911314985, "grad_norm": 0.2309672930448967, "learning_rate": 3.544303797468355e-06, "loss": 0.3784, "num_tokens": 12040096.0, "step": 29 }, { "epoch": 0.011467889908256881, "grad_norm": 0.2435658585064058, "learning_rate": 3.6708860759493675e-06, "loss": 0.3669, "num_tokens": 12389404.0, "step": 30 }, { "epoch": 0.011850152905198776, "grad_norm": 0.25743490316274564, "learning_rate": 3.7974683544303802e-06, "loss": 0.377, "num_tokens": 12761781.0, "step": 31 }, { "epoch": 0.012232415902140673, "grad_norm": 0.25201610288625453, "learning_rate": 3.924050632911393e-06, "loss": 0.3702, "num_tokens": 13154838.0, "step": 32 }, { "epoch": 0.01261467889908257, "grad_norm": 0.2469634285116853, "learning_rate": 4.050632911392405e-06, "loss": 0.4076, "num_tokens": 13576761.0, "step": 33 }, { "epoch": 0.012996941896024464, "grad_norm": 0.2496549628504463, "learning_rate": 4.177215189873418e-06, "loss": 0.3892, "num_tokens": 13974489.0, "step": 34 }, { "epoch": 0.013379204892966361, "grad_norm": 0.25846070232991863, "learning_rate": 4.303797468354431e-06, "loss": 0.3559, "num_tokens": 14374049.0, "step": 35 }, { "epoch": 0.013761467889908258, "grad_norm": 0.2663423942828327, "learning_rate": 4.430379746835443e-06, "loss": 0.3688, "num_tokens": 14776911.0, "step": 36 }, { "epoch": 0.014143730886850153, "grad_norm": 0.2505114169928893, "learning_rate": 4.556962025316456e-06, "loss": 0.3806, "num_tokens": 15178891.0, "step": 37 }, { "epoch": 0.01452599388379205, "grad_norm": 0.28586301253591584, "learning_rate": 4.683544303797468e-06, "loss": 0.38, "num_tokens": 15608420.0, "step": 38 }, { "epoch": 0.014908256880733946, "grad_norm": 0.2637903406700679, "learning_rate": 4.8101265822784815e-06, "loss": 0.3727, "num_tokens": 15994086.0, "step": 39 }, { "epoch": 0.01529051987767584, "grad_norm": 0.24464885901758707, "learning_rate": 4.936708860759495e-06, "loss": 0.3564, "num_tokens": 16429454.0, "step": 40 }, { "epoch": 0.015672782874617736, "grad_norm": 0.2469314868927337, "learning_rate": 5.063291139240507e-06, "loss": 0.3522, "num_tokens": 16809145.0, "step": 41 }, { "epoch": 0.016055045871559634, "grad_norm": 0.24748811283451091, "learning_rate": 5.189873417721519e-06, "loss": 0.3827, "num_tokens": 17211551.0, "step": 42 }, { "epoch": 0.01643730886850153, "grad_norm": 0.23723819861725134, "learning_rate": 5.3164556962025316e-06, "loss": 0.3826, "num_tokens": 17671901.0, "step": 43 }, { "epoch": 0.016819571865443424, "grad_norm": 0.27105169685529573, "learning_rate": 5.443037974683545e-06, "loss": 0.3815, "num_tokens": 18093728.0, "step": 44 }, { "epoch": 0.017201834862385322, "grad_norm": 0.23791331359443213, "learning_rate": 5.569620253164557e-06, "loss": 0.3451, "num_tokens": 18512366.0, "step": 45 }, { "epoch": 0.017584097859327217, "grad_norm": 0.27793141561269313, "learning_rate": 5.69620253164557e-06, "loss": 0.3627, "num_tokens": 18896121.0, "step": 46 }, { "epoch": 0.017966360856269112, "grad_norm": 0.26524920695661636, "learning_rate": 5.8227848101265824e-06, "loss": 0.3771, "num_tokens": 19301930.0, "step": 47 }, { "epoch": 0.01834862385321101, "grad_norm": 0.29818416745906995, "learning_rate": 5.949367088607595e-06, "loss": 0.3702, "num_tokens": 19715395.0, "step": 48 }, { "epoch": 0.018730886850152905, "grad_norm": 0.30338112047811494, "learning_rate": 6.075949367088608e-06, "loss": 0.3773, "num_tokens": 20131002.0, "step": 49 }, { "epoch": 0.0191131498470948, "grad_norm": 0.30308659699156243, "learning_rate": 6.20253164556962e-06, "loss": 0.3831, "num_tokens": 20563137.0, "step": 50 }, { "epoch": 0.0194954128440367, "grad_norm": 0.2535074196213391, "learning_rate": 6.329113924050634e-06, "loss": 0.3744, "num_tokens": 20972802.0, "step": 51 }, { "epoch": 0.019877675840978593, "grad_norm": 0.2735974167361757, "learning_rate": 6.4556962025316464e-06, "loss": 0.3701, "num_tokens": 21374877.0, "step": 52 }, { "epoch": 0.020259938837920488, "grad_norm": 0.26877877811745887, "learning_rate": 6.582278481012659e-06, "loss": 0.3507, "num_tokens": 21760013.0, "step": 53 }, { "epoch": 0.020642201834862386, "grad_norm": 0.2443412417585162, "learning_rate": 6.708860759493672e-06, "loss": 0.3311, "num_tokens": 22150140.0, "step": 54 }, { "epoch": 0.02102446483180428, "grad_norm": 0.25033632489186186, "learning_rate": 6.835443037974684e-06, "loss": 0.3588, "num_tokens": 22551822.0, "step": 55 }, { "epoch": 0.021406727828746176, "grad_norm": 0.25421424683231686, "learning_rate": 6.962025316455697e-06, "loss": 0.3687, "num_tokens": 22925722.0, "step": 56 }, { "epoch": 0.021788990825688075, "grad_norm": 0.285201645139862, "learning_rate": 7.08860759493671e-06, "loss": 0.3748, "num_tokens": 23302379.0, "step": 57 }, { "epoch": 0.02217125382262997, "grad_norm": 0.2901922327347829, "learning_rate": 7.215189873417722e-06, "loss": 0.3814, "num_tokens": 23714576.0, "step": 58 }, { "epoch": 0.022553516819571864, "grad_norm": 0.3270813662793202, "learning_rate": 7.341772151898735e-06, "loss": 0.3658, "num_tokens": 24107230.0, "step": 59 }, { "epoch": 0.022935779816513763, "grad_norm": 0.2842972823309671, "learning_rate": 7.468354430379747e-06, "loss": 0.3717, "num_tokens": 24524589.0, "step": 60 }, { "epoch": 0.023318042813455658, "grad_norm": 0.2940890615468725, "learning_rate": 7.5949367088607605e-06, "loss": 0.3768, "num_tokens": 24986088.0, "step": 61 }, { "epoch": 0.023700305810397553, "grad_norm": 0.29303219141347625, "learning_rate": 7.721518987341773e-06, "loss": 0.371, "num_tokens": 25387659.0, "step": 62 }, { "epoch": 0.02408256880733945, "grad_norm": 0.2712988604342114, "learning_rate": 7.848101265822786e-06, "loss": 0.3578, "num_tokens": 25804901.0, "step": 63 }, { "epoch": 0.024464831804281346, "grad_norm": 0.2583146632355596, "learning_rate": 7.974683544303799e-06, "loss": 0.3838, "num_tokens": 26208000.0, "step": 64 }, { "epoch": 0.02484709480122324, "grad_norm": 0.3066777920300367, "learning_rate": 8.10126582278481e-06, "loss": 0.3577, "num_tokens": 26566527.0, "step": 65 }, { "epoch": 0.02522935779816514, "grad_norm": 0.2918873290167186, "learning_rate": 8.227848101265824e-06, "loss": 0.4074, "num_tokens": 26984923.0, "step": 66 }, { "epoch": 0.025611620795107034, "grad_norm": 0.24972708510248107, "learning_rate": 8.354430379746837e-06, "loss": 0.3597, "num_tokens": 27394879.0, "step": 67 }, { "epoch": 0.02599388379204893, "grad_norm": 0.2533277702671471, "learning_rate": 8.481012658227848e-06, "loss": 0.3383, "num_tokens": 27796126.0, "step": 68 }, { "epoch": 0.026376146788990827, "grad_norm": 0.2794694608524889, "learning_rate": 8.607594936708861e-06, "loss": 0.3831, "num_tokens": 28223656.0, "step": 69 }, { "epoch": 0.026758409785932722, "grad_norm": 0.30892432798732183, "learning_rate": 8.734177215189874e-06, "loss": 0.3833, "num_tokens": 28636038.0, "step": 70 }, { "epoch": 0.027140672782874617, "grad_norm": 0.2872201553687094, "learning_rate": 8.860759493670886e-06, "loss": 0.3581, "num_tokens": 29037027.0, "step": 71 }, { "epoch": 0.027522935779816515, "grad_norm": 0.26788403211970613, "learning_rate": 8.987341772151899e-06, "loss": 0.3657, "num_tokens": 29436818.0, "step": 72 }, { "epoch": 0.02790519877675841, "grad_norm": 0.2928976601330555, "learning_rate": 9.113924050632912e-06, "loss": 0.3659, "num_tokens": 29835295.0, "step": 73 }, { "epoch": 0.028287461773700305, "grad_norm": 0.27351385486989827, "learning_rate": 9.240506329113925e-06, "loss": 0.3926, "num_tokens": 30275929.0, "step": 74 }, { "epoch": 0.028669724770642203, "grad_norm": 0.24860015314094797, "learning_rate": 9.367088607594937e-06, "loss": 0.3755, "num_tokens": 30713177.0, "step": 75 }, { "epoch": 0.0290519877675841, "grad_norm": 0.3059374767943009, "learning_rate": 9.49367088607595e-06, "loss": 0.376, "num_tokens": 31085359.0, "step": 76 }, { "epoch": 0.029434250764525993, "grad_norm": 0.2779162746474668, "learning_rate": 9.620253164556963e-06, "loss": 0.3549, "num_tokens": 31483591.0, "step": 77 }, { "epoch": 0.02981651376146789, "grad_norm": 0.30027546164974406, "learning_rate": 9.746835443037975e-06, "loss": 0.3661, "num_tokens": 31852456.0, "step": 78 }, { "epoch": 0.030198776758409786, "grad_norm": 0.3563030266321935, "learning_rate": 9.87341772151899e-06, "loss": 0.3815, "num_tokens": 32253510.0, "step": 79 }, { "epoch": 0.03058103975535168, "grad_norm": 0.328592724448693, "learning_rate": 1e-05, "loss": 0.3793, "num_tokens": 32632481.0, "step": 80 }, { "epoch": 0.03096330275229358, "grad_norm": 0.33221793764748714, "learning_rate": 9.999996549823812e-06, "loss": 0.393, "num_tokens": 33036079.0, "step": 81 }, { "epoch": 0.03134556574923547, "grad_norm": 0.2629167220936857, "learning_rate": 9.999986199300538e-06, "loss": 0.3636, "num_tokens": 33465676.0, "step": 82 }, { "epoch": 0.03172782874617737, "grad_norm": 0.3063229560821108, "learning_rate": 9.999968948446047e-06, "loss": 0.382, "num_tokens": 33898302.0, "step": 83 }, { "epoch": 0.03211009174311927, "grad_norm": 0.30754736893216966, "learning_rate": 9.999944797286795e-06, "loss": 0.3781, "num_tokens": 34282661.0, "step": 84 }, { "epoch": 0.03249235474006116, "grad_norm": 0.32776552225107286, "learning_rate": 9.999913745859813e-06, "loss": 0.3431, "num_tokens": 34651832.0, "step": 85 }, { "epoch": 0.03287461773700306, "grad_norm": 0.3174044990343936, "learning_rate": 9.999875794212719e-06, "loss": 0.367, "num_tokens": 35031841.0, "step": 86 }, { "epoch": 0.033256880733944956, "grad_norm": 0.24005371718221158, "learning_rate": 9.999830942403703e-06, "loss": 0.3586, "num_tokens": 35455746.0, "step": 87 }, { "epoch": 0.03363914373088685, "grad_norm": 0.3617154019715443, "learning_rate": 9.999779190501546e-06, "loss": 0.3931, "num_tokens": 35910940.0, "step": 88 }, { "epoch": 0.034021406727828746, "grad_norm": 0.36614336324493635, "learning_rate": 9.999720538585606e-06, "loss": 0.3725, "num_tokens": 36283542.0, "step": 89 }, { "epoch": 0.034403669724770644, "grad_norm": 0.33038980708954735, "learning_rate": 9.999654986745815e-06, "loss": 0.3463, "num_tokens": 36656330.0, "step": 90 }, { "epoch": 0.034785932721712536, "grad_norm": 0.2881952745690317, "learning_rate": 9.999582535082697e-06, "loss": 0.3599, "num_tokens": 37044581.0, "step": 91 }, { "epoch": 0.035168195718654434, "grad_norm": 0.3018344869659633, "learning_rate": 9.999503183707346e-06, "loss": 0.383, "num_tokens": 37484958.0, "step": 92 }, { "epoch": 0.03555045871559633, "grad_norm": 0.3010144740002948, "learning_rate": 9.999416932741441e-06, "loss": 0.3622, "num_tokens": 37878048.0, "step": 93 }, { "epoch": 0.035932721712538224, "grad_norm": 0.3333443305835725, "learning_rate": 9.999323782317242e-06, "loss": 0.3579, "num_tokens": 38270512.0, "step": 94 }, { "epoch": 0.03631498470948012, "grad_norm": 0.2669100929329328, "learning_rate": 9.999223732577585e-06, "loss": 0.3662, "num_tokens": 38649821.0, "step": 95 }, { "epoch": 0.03669724770642202, "grad_norm": 0.2701402260845986, "learning_rate": 9.99911678367589e-06, "loss": 0.3815, "num_tokens": 39073051.0, "step": 96 }, { "epoch": 0.03707951070336391, "grad_norm": 0.3077991029054113, "learning_rate": 9.999002935776151e-06, "loss": 0.3621, "num_tokens": 39508682.0, "step": 97 }, { "epoch": 0.03746177370030581, "grad_norm": 0.3124151218543076, "learning_rate": 9.998882189052944e-06, "loss": 0.3364, "num_tokens": 39885020.0, "step": 98 }, { "epoch": 0.03784403669724771, "grad_norm": 0.2659437648117742, "learning_rate": 9.998754543691425e-06, "loss": 0.3625, "num_tokens": 40279360.0, "step": 99 }, { "epoch": 0.0382262996941896, "grad_norm": 0.3320407836544897, "learning_rate": 9.998619999887325e-06, "loss": 0.3727, "num_tokens": 40707012.0, "step": 100 }, { "epoch": 0.0386085626911315, "grad_norm": 0.3010028394901584, "learning_rate": 9.998478557846959e-06, "loss": 0.3538, "num_tokens": 41118421.0, "step": 101 }, { "epoch": 0.0389908256880734, "grad_norm": 0.34633059384007736, "learning_rate": 9.99833021778721e-06, "loss": 0.3749, "num_tokens": 41454912.0, "step": 102 }, { "epoch": 0.03937308868501529, "grad_norm": 0.3189762698698707, "learning_rate": 9.998174979935548e-06, "loss": 0.3822, "num_tokens": 41868510.0, "step": 103 }, { "epoch": 0.039755351681957186, "grad_norm": 0.3241454062281064, "learning_rate": 9.998012844530015e-06, "loss": 0.3661, "num_tokens": 42255880.0, "step": 104 }, { "epoch": 0.040137614678899085, "grad_norm": 0.3205981920415945, "learning_rate": 9.997843811819233e-06, "loss": 0.3505, "num_tokens": 42643458.0, "step": 105 }, { "epoch": 0.040519877675840976, "grad_norm": 0.2886754997136016, "learning_rate": 9.997667882062399e-06, "loss": 0.3485, "num_tokens": 43036614.0, "step": 106 }, { "epoch": 0.040902140672782875, "grad_norm": 0.2827586254697602, "learning_rate": 9.997485055529284e-06, "loss": 0.3597, "num_tokens": 43434804.0, "step": 107 }, { "epoch": 0.04128440366972477, "grad_norm": 0.3107714425116188, "learning_rate": 9.997295332500235e-06, "loss": 0.3757, "num_tokens": 43866730.0, "step": 108 }, { "epoch": 0.041666666666666664, "grad_norm": 0.3475879363319308, "learning_rate": 9.99709871326618e-06, "loss": 0.3857, "num_tokens": 44278454.0, "step": 109 }, { "epoch": 0.04204892966360856, "grad_norm": 0.3441567304199641, "learning_rate": 9.996895198128611e-06, "loss": 0.3708, "num_tokens": 44661982.0, "step": 110 }, { "epoch": 0.04243119266055046, "grad_norm": 0.3166935566476928, "learning_rate": 9.996684787399607e-06, "loss": 0.3744, "num_tokens": 45074431.0, "step": 111 }, { "epoch": 0.04281345565749235, "grad_norm": 0.2663876326581232, "learning_rate": 9.996467481401812e-06, "loss": 0.3769, "num_tokens": 45548524.0, "step": 112 }, { "epoch": 0.04319571865443425, "grad_norm": 0.33096965103381665, "learning_rate": 9.996243280468445e-06, "loss": 0.3864, "num_tokens": 45974787.0, "step": 113 }, { "epoch": 0.04357798165137615, "grad_norm": 0.2826887984249236, "learning_rate": 9.996012184943296e-06, "loss": 0.3667, "num_tokens": 46387257.0, "step": 114 }, { "epoch": 0.04396024464831804, "grad_norm": 0.29292802715977545, "learning_rate": 9.995774195180734e-06, "loss": 0.3795, "num_tokens": 46839806.0, "step": 115 }, { "epoch": 0.04434250764525994, "grad_norm": 0.30349471334335787, "learning_rate": 9.995529311545691e-06, "loss": 0.3563, "num_tokens": 47251973.0, "step": 116 }, { "epoch": 0.04472477064220184, "grad_norm": 0.27282794281680073, "learning_rate": 9.995277534413679e-06, "loss": 0.3712, "num_tokens": 47657116.0, "step": 117 }, { "epoch": 0.04510703363914373, "grad_norm": 0.27268959033730367, "learning_rate": 9.995018864170771e-06, "loss": 0.3487, "num_tokens": 48058332.0, "step": 118 }, { "epoch": 0.04548929663608563, "grad_norm": 0.30209907729014973, "learning_rate": 9.99475330121362e-06, "loss": 0.3799, "num_tokens": 48489567.0, "step": 119 }, { "epoch": 0.045871559633027525, "grad_norm": 0.31225728938204095, "learning_rate": 9.994480845949439e-06, "loss": 0.3708, "num_tokens": 48912078.0, "step": 120 }, { "epoch": 0.04625382262996942, "grad_norm": 0.3273189575817362, "learning_rate": 9.994201498796016e-06, "loss": 0.3832, "num_tokens": 49337202.0, "step": 121 }, { "epoch": 0.046636085626911315, "grad_norm": 0.2671602528577052, "learning_rate": 9.993915260181706e-06, "loss": 0.3625, "num_tokens": 49781789.0, "step": 122 }, { "epoch": 0.047018348623853214, "grad_norm": 0.32368873694887007, "learning_rate": 9.99362213054543e-06, "loss": 0.3922, "num_tokens": 50230058.0, "step": 123 }, { "epoch": 0.047400611620795105, "grad_norm": 0.3579166193844263, "learning_rate": 9.993322110336673e-06, "loss": 0.3875, "num_tokens": 50629056.0, "step": 124 }, { "epoch": 0.047782874617737, "grad_norm": 0.30383100284797826, "learning_rate": 9.993015200015497e-06, "loss": 0.3636, "num_tokens": 51052633.0, "step": 125 }, { "epoch": 0.0481651376146789, "grad_norm": 0.2805275983840069, "learning_rate": 9.992701400052515e-06, "loss": 0.352, "num_tokens": 51458305.0, "step": 126 }, { "epoch": 0.04854740061162079, "grad_norm": 0.4891914795855784, "learning_rate": 9.992380710928915e-06, "loss": 0.3765, "num_tokens": 51917160.0, "step": 127 }, { "epoch": 0.04892966360856269, "grad_norm": 0.38387755477046753, "learning_rate": 9.992053133136444e-06, "loss": 0.3509, "num_tokens": 52307600.0, "step": 128 }, { "epoch": 0.04931192660550459, "grad_norm": 0.32856095051074563, "learning_rate": 9.991718667177412e-06, "loss": 0.3695, "num_tokens": 52716931.0, "step": 129 }, { "epoch": 0.04969418960244648, "grad_norm": 0.2914171894527055, "learning_rate": 9.991377313564696e-06, "loss": 0.3691, "num_tokens": 53144509.0, "step": 130 }, { "epoch": 0.05007645259938838, "grad_norm": 0.42507072873813, "learning_rate": 9.991029072821732e-06, "loss": 0.3758, "num_tokens": 53508939.0, "step": 131 }, { "epoch": 0.05045871559633028, "grad_norm": 0.38037928356246004, "learning_rate": 9.990673945482513e-06, "loss": 0.375, "num_tokens": 53933117.0, "step": 132 }, { "epoch": 0.05084097859327217, "grad_norm": 0.3220768092935591, "learning_rate": 9.990311932091598e-06, "loss": 0.3717, "num_tokens": 54363509.0, "step": 133 }, { "epoch": 0.05122324159021407, "grad_norm": 0.31290892909963786, "learning_rate": 9.989943033204103e-06, "loss": 0.3716, "num_tokens": 54767385.0, "step": 134 }, { "epoch": 0.051605504587155966, "grad_norm": 0.31645783180029785, "learning_rate": 9.9895672493857e-06, "loss": 0.3766, "num_tokens": 55199177.0, "step": 135 }, { "epoch": 0.05198776758409786, "grad_norm": 0.32448742324934265, "learning_rate": 9.989184581212621e-06, "loss": 0.3477, "num_tokens": 55569572.0, "step": 136 }, { "epoch": 0.052370030581039756, "grad_norm": 0.33787619079802583, "learning_rate": 9.988795029271652e-06, "loss": 0.3729, "num_tokens": 55982545.0, "step": 137 }, { "epoch": 0.052752293577981654, "grad_norm": 0.31022855396474125, "learning_rate": 9.988398594160143e-06, "loss": 0.3526, "num_tokens": 56401354.0, "step": 138 }, { "epoch": 0.053134556574923546, "grad_norm": 0.2741685239111767, "learning_rate": 9.987995276485984e-06, "loss": 0.3856, "num_tokens": 56843122.0, "step": 139 }, { "epoch": 0.053516819571865444, "grad_norm": 0.2774758423136237, "learning_rate": 9.987585076867631e-06, "loss": 0.3706, "num_tokens": 57259050.0, "step": 140 }, { "epoch": 0.05389908256880734, "grad_norm": 0.3030510256051822, "learning_rate": 9.987167995934088e-06, "loss": 0.3561, "num_tokens": 57654563.0, "step": 141 }, { "epoch": 0.054281345565749234, "grad_norm": 0.295612594198623, "learning_rate": 9.986744034324915e-06, "loss": 0.3567, "num_tokens": 58064881.0, "step": 142 }, { "epoch": 0.05466360856269113, "grad_norm": 0.28410601764906807, "learning_rate": 9.986313192690214e-06, "loss": 0.3411, "num_tokens": 58478669.0, "step": 143 }, { "epoch": 0.05504587155963303, "grad_norm": 0.3142195781797062, "learning_rate": 9.985875471690646e-06, "loss": 0.3682, "num_tokens": 58864073.0, "step": 144 }, { "epoch": 0.05542813455657492, "grad_norm": 0.3526499476007933, "learning_rate": 9.985430871997419e-06, "loss": 0.3704, "num_tokens": 59265865.0, "step": 145 }, { "epoch": 0.05581039755351682, "grad_norm": 0.29526552988659543, "learning_rate": 9.984979394292281e-06, "loss": 0.3789, "num_tokens": 59713987.0, "step": 146 }, { "epoch": 0.05619266055045872, "grad_norm": 0.29245778696402475, "learning_rate": 9.984521039267541e-06, "loss": 0.3721, "num_tokens": 60116080.0, "step": 147 }, { "epoch": 0.05657492354740061, "grad_norm": 0.29858161465460215, "learning_rate": 9.98405580762604e-06, "loss": 0.3985, "num_tokens": 60528843.0, "step": 148 }, { "epoch": 0.05695718654434251, "grad_norm": 0.36715593710952876, "learning_rate": 9.983583700081175e-06, "loss": 0.3927, "num_tokens": 60961209.0, "step": 149 }, { "epoch": 0.05733944954128441, "grad_norm": 0.31612664518260053, "learning_rate": 9.983104717356876e-06, "loss": 0.3571, "num_tokens": 61422654.0, "step": 150 }, { "epoch": 0.0577217125382263, "grad_norm": 0.3171890235532633, "learning_rate": 9.982618860187622e-06, "loss": 0.36, "num_tokens": 61882792.0, "step": 151 }, { "epoch": 0.0581039755351682, "grad_norm": 0.3009225032179536, "learning_rate": 9.982126129318434e-06, "loss": 0.3624, "num_tokens": 62291139.0, "step": 152 }, { "epoch": 0.058486238532110095, "grad_norm": 0.344516015270405, "learning_rate": 9.981626525504872e-06, "loss": 0.3947, "num_tokens": 62734489.0, "step": 153 }, { "epoch": 0.058868501529051986, "grad_norm": 0.40734618283334334, "learning_rate": 9.981120049513031e-06, "loss": 0.3809, "num_tokens": 63184842.0, "step": 154 }, { "epoch": 0.059250764525993885, "grad_norm": 0.3718170700195857, "learning_rate": 9.980606702119547e-06, "loss": 0.3899, "num_tokens": 63616995.0, "step": 155 }, { "epoch": 0.05963302752293578, "grad_norm": 0.28053844302417147, "learning_rate": 9.980086484111596e-06, "loss": 0.3449, "num_tokens": 63983629.0, "step": 156 }, { "epoch": 0.060015290519877675, "grad_norm": 0.2979628145729509, "learning_rate": 9.979559396286885e-06, "loss": 0.3677, "num_tokens": 64423985.0, "step": 157 }, { "epoch": 0.06039755351681957, "grad_norm": 0.29871227972915554, "learning_rate": 9.979025439453657e-06, "loss": 0.3433, "num_tokens": 64800081.0, "step": 158 }, { "epoch": 0.06077981651376147, "grad_norm": 0.2612374594559548, "learning_rate": 9.978484614430687e-06, "loss": 0.3466, "num_tokens": 65225739.0, "step": 159 }, { "epoch": 0.06116207951070336, "grad_norm": 0.340237371157313, "learning_rate": 9.977936922047281e-06, "loss": 0.3779, "num_tokens": 65652885.0, "step": 160 }, { "epoch": 0.06154434250764526, "grad_norm": 0.3532518340415294, "learning_rate": 9.97738236314328e-06, "loss": 0.3763, "num_tokens": 66071033.0, "step": 161 }, { "epoch": 0.06192660550458716, "grad_norm": 0.3323977026176976, "learning_rate": 9.976820938569049e-06, "loss": 0.345, "num_tokens": 66464059.0, "step": 162 }, { "epoch": 0.06230886850152905, "grad_norm": 0.25597537187847486, "learning_rate": 9.976252649185482e-06, "loss": 0.3513, "num_tokens": 66875069.0, "step": 163 }, { "epoch": 0.06269113149847094, "grad_norm": 0.3171545451476222, "learning_rate": 9.975677495864003e-06, "loss": 0.3723, "num_tokens": 67303828.0, "step": 164 }, { "epoch": 0.06307339449541284, "grad_norm": 0.31875321182732913, "learning_rate": 9.97509547948656e-06, "loss": 0.3838, "num_tokens": 67752148.0, "step": 165 }, { "epoch": 0.06345565749235474, "grad_norm": 0.31465238313878235, "learning_rate": 9.974506600945618e-06, "loss": 0.3504, "num_tokens": 68119582.0, "step": 166 }, { "epoch": 0.06383792048929664, "grad_norm": 0.288264212411034, "learning_rate": 9.973910861144174e-06, "loss": 0.3521, "num_tokens": 68504939.0, "step": 167 }, { "epoch": 0.06422018348623854, "grad_norm": 0.2861207799339978, "learning_rate": 9.973308260995744e-06, "loss": 0.3751, "num_tokens": 68947507.0, "step": 168 }, { "epoch": 0.06460244648318043, "grad_norm": 0.2937911499736142, "learning_rate": 9.972698801424358e-06, "loss": 0.3895, "num_tokens": 69362481.0, "step": 169 }, { "epoch": 0.06498470948012232, "grad_norm": 0.29251721334730035, "learning_rate": 9.97208248336457e-06, "loss": 0.3643, "num_tokens": 69753637.0, "step": 170 }, { "epoch": 0.06536697247706422, "grad_norm": 0.2755373044472231, "learning_rate": 9.971459307761453e-06, "loss": 0.354, "num_tokens": 70136759.0, "step": 171 }, { "epoch": 0.06574923547400612, "grad_norm": 0.2978821995634767, "learning_rate": 9.970829275570588e-06, "loss": 0.3645, "num_tokens": 70575011.0, "step": 172 }, { "epoch": 0.06613149847094801, "grad_norm": 0.28501932128057794, "learning_rate": 9.970192387758073e-06, "loss": 0.3628, "num_tokens": 70984617.0, "step": 173 }, { "epoch": 0.06651376146788991, "grad_norm": 0.3779580236228422, "learning_rate": 9.969548645300519e-06, "loss": 0.377, "num_tokens": 71402187.0, "step": 174 }, { "epoch": 0.06689602446483181, "grad_norm": 0.3377881609942034, "learning_rate": 9.968898049185052e-06, "loss": 0.3574, "num_tokens": 71795847.0, "step": 175 }, { "epoch": 0.0672782874617737, "grad_norm": 0.2916527024647668, "learning_rate": 9.9682406004093e-06, "loss": 0.4052, "num_tokens": 72247725.0, "step": 176 }, { "epoch": 0.0676605504587156, "grad_norm": 0.3185391140228152, "learning_rate": 9.967576299981403e-06, "loss": 0.3751, "num_tokens": 72693638.0, "step": 177 }, { "epoch": 0.06804281345565749, "grad_norm": 0.30306148441770686, "learning_rate": 9.966905148920008e-06, "loss": 0.349, "num_tokens": 73105643.0, "step": 178 }, { "epoch": 0.06842507645259939, "grad_norm": 0.28914550640848624, "learning_rate": 9.966227148254268e-06, "loss": 0.3726, "num_tokens": 73563316.0, "step": 179 }, { "epoch": 0.06880733944954129, "grad_norm": 0.33456046764024827, "learning_rate": 9.965542299023833e-06, "loss": 0.3809, "num_tokens": 73986398.0, "step": 180 }, { "epoch": 0.06918960244648319, "grad_norm": 0.31433258807661973, "learning_rate": 9.964850602278859e-06, "loss": 0.3653, "num_tokens": 74399655.0, "step": 181 }, { "epoch": 0.06957186544342507, "grad_norm": 0.38708681678799467, "learning_rate": 9.964152059080007e-06, "loss": 0.3582, "num_tokens": 74764330.0, "step": 182 }, { "epoch": 0.06995412844036697, "grad_norm": 0.29860601007162324, "learning_rate": 9.963446670498424e-06, "loss": 0.3432, "num_tokens": 75121653.0, "step": 183 }, { "epoch": 0.07033639143730887, "grad_norm": 0.30810257588043594, "learning_rate": 9.962734437615767e-06, "loss": 0.3542, "num_tokens": 75531744.0, "step": 184 }, { "epoch": 0.07071865443425077, "grad_norm": 0.3265476811541891, "learning_rate": 9.962015361524179e-06, "loss": 0.3638, "num_tokens": 75898551.0, "step": 185 }, { "epoch": 0.07110091743119266, "grad_norm": 0.3236697275100706, "learning_rate": 9.961289443326301e-06, "loss": 0.3678, "num_tokens": 76290706.0, "step": 186 }, { "epoch": 0.07148318042813456, "grad_norm": 0.3121159841744106, "learning_rate": 9.960556684135264e-06, "loss": 0.3667, "num_tokens": 76705391.0, "step": 187 }, { "epoch": 0.07186544342507645, "grad_norm": 0.3106702976767392, "learning_rate": 9.95981708507469e-06, "loss": 0.369, "num_tokens": 77144503.0, "step": 188 }, { "epoch": 0.07224770642201835, "grad_norm": 0.34847417559638644, "learning_rate": 9.959070647278687e-06, "loss": 0.3601, "num_tokens": 77559844.0, "step": 189 }, { "epoch": 0.07262996941896024, "grad_norm": 0.34678058678749113, "learning_rate": 9.958317371891854e-06, "loss": 0.3824, "num_tokens": 77952748.0, "step": 190 }, { "epoch": 0.07301223241590214, "grad_norm": 0.33886797062927976, "learning_rate": 9.957557260069271e-06, "loss": 0.3559, "num_tokens": 78372383.0, "step": 191 }, { "epoch": 0.07339449541284404, "grad_norm": 0.35132336275819737, "learning_rate": 9.956790312976499e-06, "loss": 0.396, "num_tokens": 78785065.0, "step": 192 }, { "epoch": 0.07377675840978594, "grad_norm": 0.31125388580858143, "learning_rate": 9.956016531789591e-06, "loss": 0.3783, "num_tokens": 79206326.0, "step": 193 }, { "epoch": 0.07415902140672782, "grad_norm": 0.2943694036601107, "learning_rate": 9.955235917695065e-06, "loss": 0.3738, "num_tokens": 79638756.0, "step": 194 }, { "epoch": 0.07454128440366972, "grad_norm": 0.2789237268146603, "learning_rate": 9.954448471889928e-06, "loss": 0.3659, "num_tokens": 80079595.0, "step": 195 }, { "epoch": 0.07492354740061162, "grad_norm": 0.30657986744903526, "learning_rate": 9.953654195581658e-06, "loss": 0.3714, "num_tokens": 80445368.0, "step": 196 }, { "epoch": 0.07530581039755352, "grad_norm": 0.334416440173099, "learning_rate": 9.952853089988205e-06, "loss": 0.3656, "num_tokens": 80840969.0, "step": 197 }, { "epoch": 0.07568807339449542, "grad_norm": 0.25643825215514204, "learning_rate": 9.952045156337998e-06, "loss": 0.3606, "num_tokens": 81250567.0, "step": 198 }, { "epoch": 0.07607033639143732, "grad_norm": 0.2639442653647469, "learning_rate": 9.951230395869926e-06, "loss": 0.3831, "num_tokens": 81661857.0, "step": 199 }, { "epoch": 0.0764525993883792, "grad_norm": 0.27388561109626597, "learning_rate": 9.950408809833356e-06, "loss": 0.3556, "num_tokens": 82056537.0, "step": 200 }, { "epoch": 0.0768348623853211, "grad_norm": 0.3120721722829178, "learning_rate": 9.94958039948812e-06, "loss": 0.3404, "num_tokens": 82439860.0, "step": 201 }, { "epoch": 0.077217125382263, "grad_norm": 0.2851615192730009, "learning_rate": 9.948745166104506e-06, "loss": 0.3733, "num_tokens": 82835173.0, "step": 202 }, { "epoch": 0.0775993883792049, "grad_norm": 0.32410400757222513, "learning_rate": 9.947903110963274e-06, "loss": 0.3628, "num_tokens": 83214244.0, "step": 203 }, { "epoch": 0.0779816513761468, "grad_norm": 0.268172309828167, "learning_rate": 9.947054235355642e-06, "loss": 0.379, "num_tokens": 83627811.0, "step": 204 }, { "epoch": 0.07836391437308869, "grad_norm": 0.27959613704165887, "learning_rate": 9.946198540583285e-06, "loss": 0.3652, "num_tokens": 84018182.0, "step": 205 }, { "epoch": 0.07874617737003058, "grad_norm": 0.29369199615662045, "learning_rate": 9.945336027958333e-06, "loss": 0.352, "num_tokens": 84436023.0, "step": 206 }, { "epoch": 0.07912844036697247, "grad_norm": 0.32528975482882166, "learning_rate": 9.944466698803377e-06, "loss": 0.3937, "num_tokens": 84877529.0, "step": 207 }, { "epoch": 0.07951070336391437, "grad_norm": 0.2955847679126208, "learning_rate": 9.943590554451452e-06, "loss": 0.3635, "num_tokens": 85296496.0, "step": 208 }, { "epoch": 0.07989296636085627, "grad_norm": 0.3062034678724796, "learning_rate": 9.942707596246051e-06, "loss": 0.3528, "num_tokens": 85698238.0, "step": 209 }, { "epoch": 0.08027522935779817, "grad_norm": 0.2732791883322538, "learning_rate": 9.941817825541113e-06, "loss": 0.3802, "num_tokens": 86095973.0, "step": 210 }, { "epoch": 0.08065749235474007, "grad_norm": 0.3076491439128888, "learning_rate": 9.940921243701019e-06, "loss": 0.3887, "num_tokens": 86557278.0, "step": 211 }, { "epoch": 0.08103975535168195, "grad_norm": 0.29215917280438924, "learning_rate": 9.940017852100601e-06, "loss": 0.3644, "num_tokens": 86966313.0, "step": 212 }, { "epoch": 0.08142201834862385, "grad_norm": 0.2874027877225882, "learning_rate": 9.93910765212513e-06, "loss": 0.3697, "num_tokens": 87344800.0, "step": 213 }, { "epoch": 0.08180428134556575, "grad_norm": 0.27936172636785955, "learning_rate": 9.938190645170319e-06, "loss": 0.3673, "num_tokens": 87773672.0, "step": 214 }, { "epoch": 0.08218654434250765, "grad_norm": 0.3024658653566097, "learning_rate": 9.937266832642312e-06, "loss": 0.3633, "num_tokens": 88142515.0, "step": 215 }, { "epoch": 0.08256880733944955, "grad_norm": 0.33991108960012917, "learning_rate": 9.936336215957698e-06, "loss": 0.3987, "num_tokens": 88528287.0, "step": 216 }, { "epoch": 0.08295107033639144, "grad_norm": 0.2996116237222055, "learning_rate": 9.935398796543493e-06, "loss": 0.3699, "num_tokens": 88957091.0, "step": 217 }, { "epoch": 0.08333333333333333, "grad_norm": 0.28663608072342417, "learning_rate": 9.934454575837148e-06, "loss": 0.3478, "num_tokens": 89330917.0, "step": 218 }, { "epoch": 0.08371559633027523, "grad_norm": 0.34356264011292487, "learning_rate": 9.933503555286544e-06, "loss": 0.3907, "num_tokens": 89769457.0, "step": 219 }, { "epoch": 0.08409785932721713, "grad_norm": 0.3145420561026299, "learning_rate": 9.932545736349985e-06, "loss": 0.3339, "num_tokens": 90151953.0, "step": 220 }, { "epoch": 0.08448012232415902, "grad_norm": 0.2842239632904358, "learning_rate": 9.9315811204962e-06, "loss": 0.3546, "num_tokens": 90574328.0, "step": 221 }, { "epoch": 0.08486238532110092, "grad_norm": 0.26724416107162097, "learning_rate": 9.930609709204346e-06, "loss": 0.3377, "num_tokens": 90976429.0, "step": 222 }, { "epoch": 0.0852446483180428, "grad_norm": 0.2645691169484764, "learning_rate": 9.929631503963992e-06, "loss": 0.3822, "num_tokens": 91442490.0, "step": 223 }, { "epoch": 0.0856269113149847, "grad_norm": 0.32135587321564485, "learning_rate": 9.928646506275134e-06, "loss": 0.3579, "num_tokens": 91848632.0, "step": 224 }, { "epoch": 0.0860091743119266, "grad_norm": 0.32670637032541, "learning_rate": 9.927654717648176e-06, "loss": 0.3721, "num_tokens": 92276961.0, "step": 225 }, { "epoch": 0.0863914373088685, "grad_norm": 0.2944202684179206, "learning_rate": 9.926656139603939e-06, "loss": 0.3697, "num_tokens": 92684709.0, "step": 226 }, { "epoch": 0.0867737003058104, "grad_norm": 0.3134041406702019, "learning_rate": 9.925650773673654e-06, "loss": 0.3544, "num_tokens": 93064995.0, "step": 227 }, { "epoch": 0.0871559633027523, "grad_norm": 0.295501923469508, "learning_rate": 9.92463862139896e-06, "loss": 0.3528, "num_tokens": 93493922.0, "step": 228 }, { "epoch": 0.08753822629969418, "grad_norm": 0.2970567683956971, "learning_rate": 9.923619684331904e-06, "loss": 0.3479, "num_tokens": 93902963.0, "step": 229 }, { "epoch": 0.08792048929663608, "grad_norm": 0.28293401962950987, "learning_rate": 9.922593964034936e-06, "loss": 0.3948, "num_tokens": 94312112.0, "step": 230 }, { "epoch": 0.08830275229357798, "grad_norm": 0.276251610132844, "learning_rate": 9.921561462080908e-06, "loss": 0.3563, "num_tokens": 94701284.0, "step": 231 }, { "epoch": 0.08868501529051988, "grad_norm": 0.32819585975983306, "learning_rate": 9.92052218005307e-06, "loss": 0.3664, "num_tokens": 95103164.0, "step": 232 }, { "epoch": 0.08906727828746178, "grad_norm": 0.32120398404390094, "learning_rate": 9.919476119545066e-06, "loss": 0.3613, "num_tokens": 95530537.0, "step": 233 }, { "epoch": 0.08944954128440367, "grad_norm": 0.27343332503880463, "learning_rate": 9.918423282160945e-06, "loss": 0.345, "num_tokens": 95941819.0, "step": 234 }, { "epoch": 0.08983180428134556, "grad_norm": 0.294056649449384, "learning_rate": 9.917363669515133e-06, "loss": 0.3477, "num_tokens": 96355176.0, "step": 235 }, { "epoch": 0.09021406727828746, "grad_norm": 0.29456328672560694, "learning_rate": 9.916297283232456e-06, "loss": 0.3618, "num_tokens": 96772162.0, "step": 236 }, { "epoch": 0.09059633027522936, "grad_norm": 0.30400646807334347, "learning_rate": 9.915224124948119e-06, "loss": 0.3963, "num_tokens": 97250778.0, "step": 237 }, { "epoch": 0.09097859327217125, "grad_norm": 0.27433780692105025, "learning_rate": 9.914144196307721e-06, "loss": 0.3454, "num_tokens": 97654025.0, "step": 238 }, { "epoch": 0.09136085626911315, "grad_norm": 0.2762680234947099, "learning_rate": 9.913057498967233e-06, "loss": 0.3801, "num_tokens": 98081943.0, "step": 239 }, { "epoch": 0.09174311926605505, "grad_norm": 0.29782204047001254, "learning_rate": 9.911964034593013e-06, "loss": 0.3629, "num_tokens": 98491222.0, "step": 240 }, { "epoch": 0.09212538226299694, "grad_norm": 0.27543729415384416, "learning_rate": 9.910863804861788e-06, "loss": 0.342, "num_tokens": 98933450.0, "step": 241 }, { "epoch": 0.09250764525993883, "grad_norm": 0.27333664460770063, "learning_rate": 9.909756811460664e-06, "loss": 0.3615, "num_tokens": 99356292.0, "step": 242 }, { "epoch": 0.09288990825688073, "grad_norm": 0.312849706925737, "learning_rate": 9.908643056087121e-06, "loss": 0.384, "num_tokens": 99807715.0, "step": 243 }, { "epoch": 0.09327217125382263, "grad_norm": 0.29137130049453147, "learning_rate": 9.907522540449002e-06, "loss": 0.3499, "num_tokens": 100189698.0, "step": 244 }, { "epoch": 0.09365443425076453, "grad_norm": 0.3151050491963121, "learning_rate": 9.906395266264517e-06, "loss": 0.3781, "num_tokens": 100660902.0, "step": 245 }, { "epoch": 0.09403669724770643, "grad_norm": 0.3483087694005124, "learning_rate": 9.905261235262244e-06, "loss": 0.3656, "num_tokens": 101086006.0, "step": 246 }, { "epoch": 0.09441896024464831, "grad_norm": 0.3183839353077628, "learning_rate": 9.904120449181117e-06, "loss": 0.348, "num_tokens": 101465165.0, "step": 247 }, { "epoch": 0.09480122324159021, "grad_norm": 0.273075235344002, "learning_rate": 9.902972909770433e-06, "loss": 0.3955, "num_tokens": 101888824.0, "step": 248 }, { "epoch": 0.09518348623853211, "grad_norm": 0.30978642198601963, "learning_rate": 9.901818618789841e-06, "loss": 0.3704, "num_tokens": 102306874.0, "step": 249 }, { "epoch": 0.095565749235474, "grad_norm": 0.3035045757508297, "learning_rate": 9.900657578009344e-06, "loss": 0.3611, "num_tokens": 102700798.0, "step": 250 }, { "epoch": 0.0959480122324159, "grad_norm": 0.29426419140397553, "learning_rate": 9.899489789209298e-06, "loss": 0.3667, "num_tokens": 103096621.0, "step": 251 }, { "epoch": 0.0963302752293578, "grad_norm": 0.3485697258697353, "learning_rate": 9.8983152541804e-06, "loss": 0.3662, "num_tokens": 103539608.0, "step": 252 }, { "epoch": 0.09671253822629969, "grad_norm": 0.3363981229241188, "learning_rate": 9.897133974723698e-06, "loss": 0.3815, "num_tokens": 103963335.0, "step": 253 }, { "epoch": 0.09709480122324159, "grad_norm": 0.26769732540098695, "learning_rate": 9.89594595265058e-06, "loss": 0.3661, "num_tokens": 104402413.0, "step": 254 }, { "epoch": 0.09747706422018348, "grad_norm": 0.29052837736053017, "learning_rate": 9.894751189782773e-06, "loss": 0.3698, "num_tokens": 104812212.0, "step": 255 }, { "epoch": 0.09785932721712538, "grad_norm": 0.30185424646274334, "learning_rate": 9.893549687952337e-06, "loss": 0.3696, "num_tokens": 105208103.0, "step": 256 }, { "epoch": 0.09824159021406728, "grad_norm": 0.272791811311962, "learning_rate": 9.892341449001673e-06, "loss": 0.3654, "num_tokens": 105619143.0, "step": 257 }, { "epoch": 0.09862385321100918, "grad_norm": 0.28252810251886107, "learning_rate": 9.891126474783507e-06, "loss": 0.3641, "num_tokens": 106082216.0, "step": 258 }, { "epoch": 0.09900611620795106, "grad_norm": 0.29207208596330475, "learning_rate": 9.889904767160892e-06, "loss": 0.371, "num_tokens": 106536265.0, "step": 259 }, { "epoch": 0.09938837920489296, "grad_norm": 0.290941329782096, "learning_rate": 9.888676328007215e-06, "loss": 0.3628, "num_tokens": 106964217.0, "step": 260 }, { "epoch": 0.09977064220183486, "grad_norm": 0.2817806507691782, "learning_rate": 9.887441159206173e-06, "loss": 0.3444, "num_tokens": 107385148.0, "step": 261 }, { "epoch": 0.10015290519877676, "grad_norm": 0.27229244163180977, "learning_rate": 9.886199262651792e-06, "loss": 0.3602, "num_tokens": 107772298.0, "step": 262 }, { "epoch": 0.10053516819571866, "grad_norm": 0.2610174333568875, "learning_rate": 9.884950640248406e-06, "loss": 0.345, "num_tokens": 108182473.0, "step": 263 }, { "epoch": 0.10091743119266056, "grad_norm": 0.3020670783261359, "learning_rate": 9.883695293910674e-06, "loss": 0.3566, "num_tokens": 108546122.0, "step": 264 }, { "epoch": 0.10129969418960244, "grad_norm": 0.31040224708867337, "learning_rate": 9.882433225563553e-06, "loss": 0.3527, "num_tokens": 108938626.0, "step": 265 }, { "epoch": 0.10168195718654434, "grad_norm": 0.29221050269568544, "learning_rate": 9.881164437142316e-06, "loss": 0.3467, "num_tokens": 109331960.0, "step": 266 }, { "epoch": 0.10206422018348624, "grad_norm": 0.29095939215956507, "learning_rate": 9.879888930592535e-06, "loss": 0.3827, "num_tokens": 109737450.0, "step": 267 }, { "epoch": 0.10244648318042814, "grad_norm": 0.2714268725173217, "learning_rate": 9.87860670787009e-06, "loss": 0.3606, "num_tokens": 110152085.0, "step": 268 }, { "epoch": 0.10282874617737003, "grad_norm": 0.2749502242222535, "learning_rate": 9.877317770941155e-06, "loss": 0.3696, "num_tokens": 110580750.0, "step": 269 }, { "epoch": 0.10321100917431193, "grad_norm": 0.28202363086242993, "learning_rate": 9.8760221217822e-06, "loss": 0.3719, "num_tokens": 110992053.0, "step": 270 }, { "epoch": 0.10359327217125382, "grad_norm": 0.2939169912519912, "learning_rate": 9.874719762379989e-06, "loss": 0.3904, "num_tokens": 111375803.0, "step": 271 }, { "epoch": 0.10397553516819572, "grad_norm": 0.2524585115843448, "learning_rate": 9.873410694731577e-06, "loss": 0.3729, "num_tokens": 111797446.0, "step": 272 }, { "epoch": 0.10435779816513761, "grad_norm": 0.330271338847113, "learning_rate": 9.872094920844301e-06, "loss": 0.3522, "num_tokens": 112210690.0, "step": 273 }, { "epoch": 0.10474006116207951, "grad_norm": 0.3173092968895186, "learning_rate": 9.870772442735786e-06, "loss": 0.4027, "num_tokens": 112663677.0, "step": 274 }, { "epoch": 0.10512232415902141, "grad_norm": 0.2611720509884505, "learning_rate": 9.869443262433934e-06, "loss": 0.3868, "num_tokens": 113117271.0, "step": 275 }, { "epoch": 0.10550458715596331, "grad_norm": 0.2970284419669802, "learning_rate": 9.868107381976923e-06, "loss": 0.3741, "num_tokens": 113518142.0, "step": 276 }, { "epoch": 0.1058868501529052, "grad_norm": 0.29220238684960315, "learning_rate": 9.866764803413215e-06, "loss": 0.3629, "num_tokens": 113909217.0, "step": 277 }, { "epoch": 0.10626911314984709, "grad_norm": 0.3165983400943323, "learning_rate": 9.865415528801527e-06, "loss": 0.3956, "num_tokens": 114297637.0, "step": 278 }, { "epoch": 0.10665137614678899, "grad_norm": 0.27854892660119135, "learning_rate": 9.864059560210858e-06, "loss": 0.366, "num_tokens": 114726773.0, "step": 279 }, { "epoch": 0.10703363914373089, "grad_norm": 0.2761010119890071, "learning_rate": 9.862696899720465e-06, "loss": 0.3523, "num_tokens": 115115080.0, "step": 280 }, { "epoch": 0.10741590214067279, "grad_norm": 0.30788357270337063, "learning_rate": 9.861327549419866e-06, "loss": 0.3495, "num_tokens": 115516715.0, "step": 281 }, { "epoch": 0.10779816513761468, "grad_norm": 0.312954310400862, "learning_rate": 9.85995151140884e-06, "loss": 0.3628, "num_tokens": 115941499.0, "step": 282 }, { "epoch": 0.10818042813455657, "grad_norm": 0.30918221576751287, "learning_rate": 9.85856878779742e-06, "loss": 0.3922, "num_tokens": 116370691.0, "step": 283 }, { "epoch": 0.10856269113149847, "grad_norm": 0.31223777670382713, "learning_rate": 9.857179380705887e-06, "loss": 0.3821, "num_tokens": 116773367.0, "step": 284 }, { "epoch": 0.10894495412844037, "grad_norm": 0.27289945113722036, "learning_rate": 9.855783292264781e-06, "loss": 0.3629, "num_tokens": 117175862.0, "step": 285 }, { "epoch": 0.10932721712538226, "grad_norm": 0.2909587041265393, "learning_rate": 9.854380524614874e-06, "loss": 0.3811, "num_tokens": 117555972.0, "step": 286 }, { "epoch": 0.10970948012232416, "grad_norm": 0.3508899918081021, "learning_rate": 9.852971079907189e-06, "loss": 0.3908, "num_tokens": 117989763.0, "step": 287 }, { "epoch": 0.11009174311926606, "grad_norm": 0.331276353344019, "learning_rate": 9.851554960302982e-06, "loss": 0.3676, "num_tokens": 118433333.0, "step": 288 }, { "epoch": 0.11047400611620795, "grad_norm": 0.2924357482752978, "learning_rate": 9.85013216797375e-06, "loss": 0.3476, "num_tokens": 118829868.0, "step": 289 }, { "epoch": 0.11085626911314984, "grad_norm": 0.2997056422292096, "learning_rate": 9.848702705101222e-06, "loss": 0.3617, "num_tokens": 119211420.0, "step": 290 }, { "epoch": 0.11123853211009174, "grad_norm": 0.3742836379630399, "learning_rate": 9.847266573877346e-06, "loss": 0.3826, "num_tokens": 119640407.0, "step": 291 }, { "epoch": 0.11162079510703364, "grad_norm": 0.3216917969373817, "learning_rate": 9.845823776504308e-06, "loss": 0.3696, "num_tokens": 120048713.0, "step": 292 }, { "epoch": 0.11200305810397554, "grad_norm": 0.3027352030982205, "learning_rate": 9.844374315194508e-06, "loss": 0.3745, "num_tokens": 120422430.0, "step": 293 }, { "epoch": 0.11238532110091744, "grad_norm": 0.3071429341787571, "learning_rate": 9.842918192170567e-06, "loss": 0.3861, "num_tokens": 120812256.0, "step": 294 }, { "epoch": 0.11276758409785932, "grad_norm": 0.29715278251405547, "learning_rate": 9.841455409665322e-06, "loss": 0.3505, "num_tokens": 121203984.0, "step": 295 }, { "epoch": 0.11314984709480122, "grad_norm": 0.26601257484399915, "learning_rate": 9.83998596992182e-06, "loss": 0.3556, "num_tokens": 121636080.0, "step": 296 }, { "epoch": 0.11353211009174312, "grad_norm": 0.29965657210105656, "learning_rate": 9.838509875193317e-06, "loss": 0.3637, "num_tokens": 122061987.0, "step": 297 }, { "epoch": 0.11391437308868502, "grad_norm": 0.27848002752496387, "learning_rate": 9.837027127743275e-06, "loss": 0.3641, "num_tokens": 122410943.0, "step": 298 }, { "epoch": 0.11429663608562692, "grad_norm": 0.2682770864804808, "learning_rate": 9.835537729845352e-06, "loss": 0.3948, "num_tokens": 122848976.0, "step": 299 }, { "epoch": 0.11467889908256881, "grad_norm": 0.28555695033122447, "learning_rate": 9.834041683783413e-06, "loss": 0.3553, "num_tokens": 123232302.0, "step": 300 }, { "epoch": 0.1150611620795107, "grad_norm": 0.29283430135149235, "learning_rate": 9.83253899185151e-06, "loss": 0.3595, "num_tokens": 123620339.0, "step": 301 }, { "epoch": 0.1154434250764526, "grad_norm": 0.3327873933315915, "learning_rate": 9.83102965635389e-06, "loss": 0.3748, "num_tokens": 124011849.0, "step": 302 }, { "epoch": 0.1158256880733945, "grad_norm": 0.2724899431720382, "learning_rate": 9.829513679604983e-06, "loss": 0.3506, "num_tokens": 124450845.0, "step": 303 }, { "epoch": 0.1162079510703364, "grad_norm": 0.2770946104901708, "learning_rate": 9.827991063929407e-06, "loss": 0.3644, "num_tokens": 124862699.0, "step": 304 }, { "epoch": 0.11659021406727829, "grad_norm": 0.2915694326733997, "learning_rate": 9.826461811661959e-06, "loss": 0.3561, "num_tokens": 125249693.0, "step": 305 }, { "epoch": 0.11697247706422019, "grad_norm": 0.28170953782047087, "learning_rate": 9.824925925147611e-06, "loss": 0.3668, "num_tokens": 125640121.0, "step": 306 }, { "epoch": 0.11735474006116207, "grad_norm": 0.3049371740679874, "learning_rate": 9.823383406741511e-06, "loss": 0.3833, "num_tokens": 126033668.0, "step": 307 }, { "epoch": 0.11773700305810397, "grad_norm": 0.3014529343668211, "learning_rate": 9.821834258808973e-06, "loss": 0.3537, "num_tokens": 126459451.0, "step": 308 }, { "epoch": 0.11811926605504587, "grad_norm": 0.30241226769117924, "learning_rate": 9.82027848372548e-06, "loss": 0.3734, "num_tokens": 126894820.0, "step": 309 }, { "epoch": 0.11850152905198777, "grad_norm": 0.2885703862024239, "learning_rate": 9.818716083876672e-06, "loss": 0.3588, "num_tokens": 127274240.0, "step": 310 }, { "epoch": 0.11888379204892967, "grad_norm": 0.2805862007763283, "learning_rate": 9.817147061658357e-06, "loss": 0.3667, "num_tokens": 127664119.0, "step": 311 }, { "epoch": 0.11926605504587157, "grad_norm": 0.2594680894998848, "learning_rate": 9.815571419476488e-06, "loss": 0.3727, "num_tokens": 128072004.0, "step": 312 }, { "epoch": 0.11964831804281345, "grad_norm": 0.26692183256572516, "learning_rate": 9.813989159747173e-06, "loss": 0.3666, "num_tokens": 128516007.0, "step": 313 }, { "epoch": 0.12003058103975535, "grad_norm": 0.284716987141419, "learning_rate": 9.81240028489667e-06, "loss": 0.3596, "num_tokens": 128893511.0, "step": 314 }, { "epoch": 0.12041284403669725, "grad_norm": 0.2712764803922657, "learning_rate": 9.810804797361374e-06, "loss": 0.368, "num_tokens": 129272733.0, "step": 315 }, { "epoch": 0.12079510703363915, "grad_norm": 0.27976400887056985, "learning_rate": 9.809202699587828e-06, "loss": 0.3645, "num_tokens": 129670952.0, "step": 316 }, { "epoch": 0.12117737003058104, "grad_norm": 0.3127815042159146, "learning_rate": 9.807593994032706e-06, "loss": 0.3977, "num_tokens": 130121126.0, "step": 317 }, { "epoch": 0.12155963302752294, "grad_norm": 0.2863769224906313, "learning_rate": 9.805978683162816e-06, "loss": 0.3939, "num_tokens": 130583458.0, "step": 318 }, { "epoch": 0.12194189602446483, "grad_norm": 0.27258847923530244, "learning_rate": 9.804356769455092e-06, "loss": 0.3482, "num_tokens": 131000170.0, "step": 319 }, { "epoch": 0.12232415902140673, "grad_norm": 0.271669391745868, "learning_rate": 9.802728255396602e-06, "loss": 0.3568, "num_tokens": 131386432.0, "step": 320 }, { "epoch": 0.12270642201834862, "grad_norm": 0.33241178596873316, "learning_rate": 9.801093143484521e-06, "loss": 0.3848, "num_tokens": 131754797.0, "step": 321 }, { "epoch": 0.12308868501529052, "grad_norm": 0.34931101181525115, "learning_rate": 9.799451436226151e-06, "loss": 0.3908, "num_tokens": 132160224.0, "step": 322 }, { "epoch": 0.12347094801223242, "grad_norm": 0.2954467177276294, "learning_rate": 9.797803136138907e-06, "loss": 0.3649, "num_tokens": 132581368.0, "step": 323 }, { "epoch": 0.12385321100917432, "grad_norm": 0.30971607929524986, "learning_rate": 9.796148245750313e-06, "loss": 0.3348, "num_tokens": 132951299.0, "step": 324 }, { "epoch": 0.1242354740061162, "grad_norm": 0.3188511135371351, "learning_rate": 9.794486767597992e-06, "loss": 0.3983, "num_tokens": 133400887.0, "step": 325 }, { "epoch": 0.1246177370030581, "grad_norm": 0.27615267996919446, "learning_rate": 9.792818704229677e-06, "loss": 0.3697, "num_tokens": 133856780.0, "step": 326 }, { "epoch": 0.125, "grad_norm": 0.35218491803529917, "learning_rate": 9.791144058203194e-06, "loss": 0.3819, "num_tokens": 134313328.0, "step": 327 }, { "epoch": 0.12538226299694188, "grad_norm": 0.361793076454163, "learning_rate": 9.789462832086468e-06, "loss": 0.3751, "num_tokens": 134706925.0, "step": 328 }, { "epoch": 0.1257645259938838, "grad_norm": 0.28318972146916227, "learning_rate": 9.787775028457506e-06, "loss": 0.3688, "num_tokens": 135121368.0, "step": 329 }, { "epoch": 0.12614678899082568, "grad_norm": 0.2788107106873188, "learning_rate": 9.786080649904409e-06, "loss": 0.368, "num_tokens": 135543059.0, "step": 330 }, { "epoch": 0.1265290519877676, "grad_norm": 0.3551870330735956, "learning_rate": 9.784379699025358e-06, "loss": 0.368, "num_tokens": 135932194.0, "step": 331 }, { "epoch": 0.12691131498470948, "grad_norm": 0.26915650405734415, "learning_rate": 9.782672178428607e-06, "loss": 0.3676, "num_tokens": 136372866.0, "step": 332 }, { "epoch": 0.12729357798165136, "grad_norm": 0.25923800088776544, "learning_rate": 9.78095809073249e-06, "loss": 0.354, "num_tokens": 136783472.0, "step": 333 }, { "epoch": 0.12767584097859327, "grad_norm": 0.2739029527314487, "learning_rate": 9.77923743856541e-06, "loss": 0.352, "num_tokens": 137193968.0, "step": 334 }, { "epoch": 0.12805810397553516, "grad_norm": 0.2768964480829531, "learning_rate": 9.777510224565834e-06, "loss": 0.362, "num_tokens": 137595702.0, "step": 335 }, { "epoch": 0.12844036697247707, "grad_norm": 0.3178493502924472, "learning_rate": 9.775776451382292e-06, "loss": 0.3686, "num_tokens": 138025233.0, "step": 336 }, { "epoch": 0.12882262996941896, "grad_norm": 0.2985756541941514, "learning_rate": 9.774036121673374e-06, "loss": 0.3759, "num_tokens": 138439624.0, "step": 337 }, { "epoch": 0.12920489296636087, "grad_norm": 0.29629539600652666, "learning_rate": 9.772289238107717e-06, "loss": 0.355, "num_tokens": 138860597.0, "step": 338 }, { "epoch": 0.12958715596330275, "grad_norm": 0.2656162002560928, "learning_rate": 9.770535803364014e-06, "loss": 0.3777, "num_tokens": 139267517.0, "step": 339 }, { "epoch": 0.12996941896024464, "grad_norm": 0.3079874135040319, "learning_rate": 9.768775820131008e-06, "loss": 0.3921, "num_tokens": 139694958.0, "step": 340 }, { "epoch": 0.13035168195718655, "grad_norm": 0.2906988544585396, "learning_rate": 9.767009291107471e-06, "loss": 0.3749, "num_tokens": 140079416.0, "step": 341 }, { "epoch": 0.13073394495412843, "grad_norm": 0.29327212809124914, "learning_rate": 9.765236219002223e-06, "loss": 0.3842, "num_tokens": 140523685.0, "step": 342 }, { "epoch": 0.13111620795107035, "grad_norm": 0.28019234806606597, "learning_rate": 9.763456606534112e-06, "loss": 0.3536, "num_tokens": 140950392.0, "step": 343 }, { "epoch": 0.13149847094801223, "grad_norm": 0.24897455668458857, "learning_rate": 9.761670456432016e-06, "loss": 0.3592, "num_tokens": 141380785.0, "step": 344 }, { "epoch": 0.13188073394495411, "grad_norm": 0.3054569832855343, "learning_rate": 9.75987777143484e-06, "loss": 0.379, "num_tokens": 141792563.0, "step": 345 }, { "epoch": 0.13226299694189603, "grad_norm": 0.2659266431374649, "learning_rate": 9.758078554291505e-06, "loss": 0.3612, "num_tokens": 142162144.0, "step": 346 }, { "epoch": 0.1326452599388379, "grad_norm": 0.28333485294806937, "learning_rate": 9.756272807760954e-06, "loss": 0.3574, "num_tokens": 142520857.0, "step": 347 }, { "epoch": 0.13302752293577982, "grad_norm": 0.3734088636202502, "learning_rate": 9.75446053461214e-06, "loss": 0.3676, "num_tokens": 142917677.0, "step": 348 }, { "epoch": 0.1334097859327217, "grad_norm": 0.29278447614583514, "learning_rate": 9.752641737624023e-06, "loss": 0.3812, "num_tokens": 143317782.0, "step": 349 }, { "epoch": 0.13379204892966362, "grad_norm": 0.28446614185529623, "learning_rate": 9.750816419585569e-06, "loss": 0.3708, "num_tokens": 143729772.0, "step": 350 }, { "epoch": 0.1341743119266055, "grad_norm": 0.23898402619912196, "learning_rate": 9.748984583295736e-06, "loss": 0.3644, "num_tokens": 144178116.0, "step": 351 }, { "epoch": 0.1345565749235474, "grad_norm": 0.27653297398599813, "learning_rate": 9.747146231563491e-06, "loss": 0.3704, "num_tokens": 144575763.0, "step": 352 }, { "epoch": 0.1349388379204893, "grad_norm": 0.2887505187465025, "learning_rate": 9.74530136720778e-06, "loss": 0.3718, "num_tokens": 144968040.0, "step": 353 }, { "epoch": 0.1353211009174312, "grad_norm": 0.26690262658958475, "learning_rate": 9.743449993057537e-06, "loss": 0.3987, "num_tokens": 145435439.0, "step": 354 }, { "epoch": 0.1357033639143731, "grad_norm": 0.2856814337231243, "learning_rate": 9.741592111951687e-06, "loss": 0.3693, "num_tokens": 145839015.0, "step": 355 }, { "epoch": 0.13608562691131498, "grad_norm": 0.24839206187457766, "learning_rate": 9.739727726739122e-06, "loss": 0.3769, "num_tokens": 146287347.0, "step": 356 }, { "epoch": 0.13646788990825687, "grad_norm": 0.25246604863974276, "learning_rate": 9.737856840278713e-06, "loss": 0.3612, "num_tokens": 146671063.0, "step": 357 }, { "epoch": 0.13685015290519878, "grad_norm": 0.3233202343500254, "learning_rate": 9.7359794554393e-06, "loss": 0.3692, "num_tokens": 147076604.0, "step": 358 }, { "epoch": 0.13723241590214066, "grad_norm": 0.34497878728322334, "learning_rate": 9.734095575099684e-06, "loss": 0.375, "num_tokens": 147443378.0, "step": 359 }, { "epoch": 0.13761467889908258, "grad_norm": 0.2975012994682014, "learning_rate": 9.732205202148631e-06, "loss": 0.3597, "num_tokens": 147846174.0, "step": 360 }, { "epoch": 0.13799694189602446, "grad_norm": 0.3009871494813123, "learning_rate": 9.730308339484862e-06, "loss": 0.3823, "num_tokens": 148265170.0, "step": 361 }, { "epoch": 0.13837920489296637, "grad_norm": 0.3269887556100033, "learning_rate": 9.728404990017046e-06, "loss": 0.3714, "num_tokens": 148645677.0, "step": 362 }, { "epoch": 0.13876146788990826, "grad_norm": 0.3486884596736548, "learning_rate": 9.726495156663803e-06, "loss": 0.3587, "num_tokens": 149014119.0, "step": 363 }, { "epoch": 0.13914373088685014, "grad_norm": 0.2741666982338165, "learning_rate": 9.724578842353695e-06, "loss": 0.3454, "num_tokens": 149397613.0, "step": 364 }, { "epoch": 0.13952599388379205, "grad_norm": 0.28797219330893853, "learning_rate": 9.722656050025216e-06, "loss": 0.3664, "num_tokens": 149837858.0, "step": 365 }, { "epoch": 0.13990825688073394, "grad_norm": 0.3049664442019357, "learning_rate": 9.720726782626801e-06, "loss": 0.3736, "num_tokens": 150217716.0, "step": 366 }, { "epoch": 0.14029051987767585, "grad_norm": 0.316178060305947, "learning_rate": 9.718791043116812e-06, "loss": 0.386, "num_tokens": 150618548.0, "step": 367 }, { "epoch": 0.14067278287461774, "grad_norm": 0.2959532293284894, "learning_rate": 9.716848834463532e-06, "loss": 0.3678, "num_tokens": 151057762.0, "step": 368 }, { "epoch": 0.14105504587155962, "grad_norm": 0.2957554022117756, "learning_rate": 9.714900159645169e-06, "loss": 0.3779, "num_tokens": 151466318.0, "step": 369 }, { "epoch": 0.14143730886850153, "grad_norm": 0.29755257785291395, "learning_rate": 9.712945021649842e-06, "loss": 0.3558, "num_tokens": 151857490.0, "step": 370 }, { "epoch": 0.14181957186544342, "grad_norm": 0.28830811691023295, "learning_rate": 9.710983423475583e-06, "loss": 0.3759, "num_tokens": 152254952.0, "step": 371 }, { "epoch": 0.14220183486238533, "grad_norm": 0.2927087385898855, "learning_rate": 9.709015368130328e-06, "loss": 0.3551, "num_tokens": 152626944.0, "step": 372 }, { "epoch": 0.1425840978593272, "grad_norm": 0.2863661063037929, "learning_rate": 9.707040858631918e-06, "loss": 0.3872, "num_tokens": 153076826.0, "step": 373 }, { "epoch": 0.14296636085626913, "grad_norm": 0.2969143733041409, "learning_rate": 9.705059898008087e-06, "loss": 0.4003, "num_tokens": 153513246.0, "step": 374 }, { "epoch": 0.143348623853211, "grad_norm": 0.27998416781365243, "learning_rate": 9.703072489296467e-06, "loss": 0.3886, "num_tokens": 153928116.0, "step": 375 }, { "epoch": 0.1437308868501529, "grad_norm": 0.2740751028842675, "learning_rate": 9.70107863554457e-06, "loss": 0.3627, "num_tokens": 154338955.0, "step": 376 }, { "epoch": 0.1441131498470948, "grad_norm": 0.2618085061553179, "learning_rate": 9.699078339809793e-06, "loss": 0.3555, "num_tokens": 154722147.0, "step": 377 }, { "epoch": 0.1444954128440367, "grad_norm": 0.26328965045222535, "learning_rate": 9.697071605159418e-06, "loss": 0.3624, "num_tokens": 155115005.0, "step": 378 }, { "epoch": 0.1448776758409786, "grad_norm": 0.27170767317557487, "learning_rate": 9.69505843467059e-06, "loss": 0.379, "num_tokens": 155552529.0, "step": 379 }, { "epoch": 0.1452599388379205, "grad_norm": 0.2994766149066218, "learning_rate": 9.693038831430332e-06, "loss": 0.3757, "num_tokens": 155956435.0, "step": 380 }, { "epoch": 0.14564220183486237, "grad_norm": 0.3216199872305448, "learning_rate": 9.691012798535524e-06, "loss": 0.3743, "num_tokens": 156358518.0, "step": 381 }, { "epoch": 0.14602446483180428, "grad_norm": 0.2724435265094422, "learning_rate": 9.68898033909291e-06, "loss": 0.3548, "num_tokens": 156802220.0, "step": 382 }, { "epoch": 0.14640672782874617, "grad_norm": 0.2733083458726887, "learning_rate": 9.686941456219088e-06, "loss": 0.3648, "num_tokens": 157181770.0, "step": 383 }, { "epoch": 0.14678899082568808, "grad_norm": 0.2683717534178117, "learning_rate": 9.684896153040504e-06, "loss": 0.3707, "num_tokens": 157631260.0, "step": 384 }, { "epoch": 0.14717125382262997, "grad_norm": 0.2797821278974627, "learning_rate": 9.682844432693447e-06, "loss": 0.3517, "num_tokens": 158081752.0, "step": 385 }, { "epoch": 0.14755351681957188, "grad_norm": 0.3021244746434412, "learning_rate": 9.680786298324054e-06, "loss": 0.385, "num_tokens": 158503533.0, "step": 386 }, { "epoch": 0.14793577981651376, "grad_norm": 0.30084371736775684, "learning_rate": 9.67872175308829e-06, "loss": 0.3587, "num_tokens": 158886791.0, "step": 387 }, { "epoch": 0.14831804281345565, "grad_norm": 0.28898401069489127, "learning_rate": 9.67665080015195e-06, "loss": 0.3944, "num_tokens": 159326374.0, "step": 388 }, { "epoch": 0.14870030581039756, "grad_norm": 0.27620905341201196, "learning_rate": 9.674573442690658e-06, "loss": 0.349, "num_tokens": 159709556.0, "step": 389 }, { "epoch": 0.14908256880733944, "grad_norm": 0.29430023036015573, "learning_rate": 9.672489683889862e-06, "loss": 0.3652, "num_tokens": 160123233.0, "step": 390 }, { "epoch": 0.14946483180428136, "grad_norm": 0.29224260409863767, "learning_rate": 9.67039952694482e-06, "loss": 0.3664, "num_tokens": 160559424.0, "step": 391 }, { "epoch": 0.14984709480122324, "grad_norm": 0.30496832909422966, "learning_rate": 9.6683029750606e-06, "loss": 0.3999, "num_tokens": 160939165.0, "step": 392 }, { "epoch": 0.15022935779816513, "grad_norm": 0.2603623646692436, "learning_rate": 9.666200031452084e-06, "loss": 0.3774, "num_tokens": 161373163.0, "step": 393 }, { "epoch": 0.15061162079510704, "grad_norm": 0.32390633772363503, "learning_rate": 9.664090699343948e-06, "loss": 0.3982, "num_tokens": 161780011.0, "step": 394 }, { "epoch": 0.15099388379204892, "grad_norm": 0.3260661267973067, "learning_rate": 9.661974981970665e-06, "loss": 0.3752, "num_tokens": 162187042.0, "step": 395 }, { "epoch": 0.15137614678899083, "grad_norm": 0.3354818920882661, "learning_rate": 9.659852882576502e-06, "loss": 0.3691, "num_tokens": 162571052.0, "step": 396 }, { "epoch": 0.15175840978593272, "grad_norm": 0.2716963089106391, "learning_rate": 9.65772440441551e-06, "loss": 0.36, "num_tokens": 162997756.0, "step": 397 }, { "epoch": 0.15214067278287463, "grad_norm": 0.2891863031044559, "learning_rate": 9.655589550751525e-06, "loss": 0.383, "num_tokens": 163449837.0, "step": 398 }, { "epoch": 0.15252293577981652, "grad_norm": 0.31749094295800434, "learning_rate": 9.653448324858151e-06, "loss": 0.3969, "num_tokens": 163835806.0, "step": 399 }, { "epoch": 0.1529051987767584, "grad_norm": 0.30000356817837964, "learning_rate": 9.651300730018776e-06, "loss": 0.3836, "num_tokens": 164276300.0, "step": 400 }, { "epoch": 0.1532874617737003, "grad_norm": 0.31268242628477666, "learning_rate": 9.64914676952654e-06, "loss": 0.3938, "num_tokens": 164726852.0, "step": 401 }, { "epoch": 0.1536697247706422, "grad_norm": 0.2551289872017507, "learning_rate": 9.646986446684357e-06, "loss": 0.3529, "num_tokens": 165146972.0, "step": 402 }, { "epoch": 0.1540519877675841, "grad_norm": 0.3207304470525379, "learning_rate": 9.644819764804888e-06, "loss": 0.3937, "num_tokens": 165587373.0, "step": 403 }, { "epoch": 0.154434250764526, "grad_norm": 0.3350743575339698, "learning_rate": 9.642646727210546e-06, "loss": 0.3956, "num_tokens": 165999875.0, "step": 404 }, { "epoch": 0.15481651376146788, "grad_norm": 0.30082830614480544, "learning_rate": 9.640467337233496e-06, "loss": 0.3674, "num_tokens": 166382211.0, "step": 405 }, { "epoch": 0.1551987767584098, "grad_norm": 0.2891372838044858, "learning_rate": 9.638281598215637e-06, "loss": 0.3812, "num_tokens": 166771480.0, "step": 406 }, { "epoch": 0.15558103975535167, "grad_norm": 0.31050020435082976, "learning_rate": 9.636089513508612e-06, "loss": 0.3813, "num_tokens": 167172230.0, "step": 407 }, { "epoch": 0.1559633027522936, "grad_norm": 0.3031677280970331, "learning_rate": 9.633891086473783e-06, "loss": 0.3737, "num_tokens": 167596027.0, "step": 408 }, { "epoch": 0.15634556574923547, "grad_norm": 0.305489363988088, "learning_rate": 9.631686320482245e-06, "loss": 0.3808, "num_tokens": 168008679.0, "step": 409 }, { "epoch": 0.15672782874617738, "grad_norm": 0.27970483552619974, "learning_rate": 9.629475218914816e-06, "loss": 0.3563, "num_tokens": 168407899.0, "step": 410 }, { "epoch": 0.15711009174311927, "grad_norm": 0.26239782662158345, "learning_rate": 9.62725778516202e-06, "loss": 0.3797, "num_tokens": 168853815.0, "step": 411 }, { "epoch": 0.15749235474006115, "grad_norm": 0.26246837740583684, "learning_rate": 9.625034022624097e-06, "loss": 0.3543, "num_tokens": 169223226.0, "step": 412 }, { "epoch": 0.15787461773700306, "grad_norm": 0.26826639661015544, "learning_rate": 9.622803934710993e-06, "loss": 0.361, "num_tokens": 169628949.0, "step": 413 }, { "epoch": 0.15825688073394495, "grad_norm": 0.3089112536180955, "learning_rate": 9.620567524842347e-06, "loss": 0.3629, "num_tokens": 170039146.0, "step": 414 }, { "epoch": 0.15863914373088686, "grad_norm": 0.32512697450744416, "learning_rate": 9.618324796447497e-06, "loss": 0.3917, "num_tokens": 170460368.0, "step": 415 }, { "epoch": 0.15902140672782875, "grad_norm": 0.3078157476487109, "learning_rate": 9.61607575296547e-06, "loss": 0.3685, "num_tokens": 170889449.0, "step": 416 }, { "epoch": 0.15940366972477063, "grad_norm": 0.272300828529101, "learning_rate": 9.613820397844976e-06, "loss": 0.3745, "num_tokens": 171345859.0, "step": 417 }, { "epoch": 0.15978593272171254, "grad_norm": 0.2723811264053706, "learning_rate": 9.6115587345444e-06, "loss": 0.3807, "num_tokens": 171775963.0, "step": 418 }, { "epoch": 0.16016819571865443, "grad_norm": 0.2971335275753423, "learning_rate": 9.609290766531806e-06, "loss": 0.3761, "num_tokens": 172160080.0, "step": 419 }, { "epoch": 0.16055045871559634, "grad_norm": 0.3314012890549614, "learning_rate": 9.60701649728492e-06, "loss": 0.368, "num_tokens": 172567767.0, "step": 420 }, { "epoch": 0.16093272171253822, "grad_norm": 0.33399396700972167, "learning_rate": 9.604735930291135e-06, "loss": 0.3922, "num_tokens": 172972247.0, "step": 421 }, { "epoch": 0.16131498470948014, "grad_norm": 0.27313234738123665, "learning_rate": 9.602449069047497e-06, "loss": 0.3544, "num_tokens": 173379826.0, "step": 422 }, { "epoch": 0.16169724770642202, "grad_norm": 0.3225316836157724, "learning_rate": 9.600155917060707e-06, "loss": 0.3664, "num_tokens": 173785687.0, "step": 423 }, { "epoch": 0.1620795107033639, "grad_norm": 0.3458487622018256, "learning_rate": 9.597856477847111e-06, "loss": 0.3638, "num_tokens": 174213115.0, "step": 424 }, { "epoch": 0.16246177370030582, "grad_norm": 0.30896769571826427, "learning_rate": 9.595550754932693e-06, "loss": 0.3855, "num_tokens": 174592231.0, "step": 425 }, { "epoch": 0.1628440366972477, "grad_norm": 0.2692727375471378, "learning_rate": 9.59323875185308e-06, "loss": 0.3551, "num_tokens": 174982003.0, "step": 426 }, { "epoch": 0.1632262996941896, "grad_norm": 0.3606429103340105, "learning_rate": 9.590920472153522e-06, "loss": 0.3593, "num_tokens": 175409032.0, "step": 427 }, { "epoch": 0.1636085626911315, "grad_norm": 0.29516050888962153, "learning_rate": 9.588595919388897e-06, "loss": 0.3574, "num_tokens": 175827009.0, "step": 428 }, { "epoch": 0.16399082568807338, "grad_norm": 0.3176550911962296, "learning_rate": 9.586265097123699e-06, "loss": 0.3924, "num_tokens": 176221841.0, "step": 429 }, { "epoch": 0.1643730886850153, "grad_norm": 0.31234635489501056, "learning_rate": 9.58392800893204e-06, "loss": 0.3752, "num_tokens": 176657605.0, "step": 430 }, { "epoch": 0.16475535168195718, "grad_norm": 0.3061075043522359, "learning_rate": 9.581584658397637e-06, "loss": 0.3655, "num_tokens": 177053768.0, "step": 431 }, { "epoch": 0.1651376146788991, "grad_norm": 0.2688613494807311, "learning_rate": 9.579235049113812e-06, "loss": 0.3418, "num_tokens": 177460319.0, "step": 432 }, { "epoch": 0.16551987767584098, "grad_norm": 0.2982373514630313, "learning_rate": 9.576879184683483e-06, "loss": 0.3668, "num_tokens": 177891087.0, "step": 433 }, { "epoch": 0.1659021406727829, "grad_norm": 0.26814008512660753, "learning_rate": 9.57451706871916e-06, "loss": 0.3748, "num_tokens": 178309584.0, "step": 434 }, { "epoch": 0.16628440366972477, "grad_norm": 0.3205179162143746, "learning_rate": 9.57214870484294e-06, "loss": 0.3655, "num_tokens": 178686819.0, "step": 435 }, { "epoch": 0.16666666666666666, "grad_norm": 0.3109140650248626, "learning_rate": 9.569774096686498e-06, "loss": 0.3979, "num_tokens": 179105603.0, "step": 436 }, { "epoch": 0.16704892966360857, "grad_norm": 0.4006651074530381, "learning_rate": 9.567393247891087e-06, "loss": 0.3603, "num_tokens": 179498729.0, "step": 437 }, { "epoch": 0.16743119266055045, "grad_norm": 0.3055036826158832, "learning_rate": 9.565006162107527e-06, "loss": 0.3755, "num_tokens": 179916566.0, "step": 438 }, { "epoch": 0.16781345565749237, "grad_norm": 0.3251595737169241, "learning_rate": 9.562612842996203e-06, "loss": 0.3958, "num_tokens": 180366759.0, "step": 439 }, { "epoch": 0.16819571865443425, "grad_norm": 0.33165817753596266, "learning_rate": 9.560213294227061e-06, "loss": 0.3473, "num_tokens": 180733397.0, "step": 440 }, { "epoch": 0.16857798165137614, "grad_norm": 0.30203021428089855, "learning_rate": 9.557807519479595e-06, "loss": 0.3452, "num_tokens": 181125897.0, "step": 441 }, { "epoch": 0.16896024464831805, "grad_norm": 0.2731981459022347, "learning_rate": 9.555395522442847e-06, "loss": 0.3744, "num_tokens": 181560759.0, "step": 442 }, { "epoch": 0.16934250764525993, "grad_norm": 0.2779944882161738, "learning_rate": 9.552977306815403e-06, "loss": 0.3643, "num_tokens": 182003951.0, "step": 443 }, { "epoch": 0.16972477064220184, "grad_norm": 0.3098125330896663, "learning_rate": 9.550552876305383e-06, "loss": 0.3956, "num_tokens": 182450140.0, "step": 444 }, { "epoch": 0.17010703363914373, "grad_norm": 0.2566730804700202, "learning_rate": 9.548122234630438e-06, "loss": 0.3623, "num_tokens": 182894923.0, "step": 445 }, { "epoch": 0.1704892966360856, "grad_norm": 0.2728091076346928, "learning_rate": 9.54568538551774e-06, "loss": 0.3905, "num_tokens": 183304819.0, "step": 446 }, { "epoch": 0.17087155963302753, "grad_norm": 0.24871517814434455, "learning_rate": 9.543242332703983e-06, "loss": 0.3812, "num_tokens": 183733747.0, "step": 447 }, { "epoch": 0.1712538226299694, "grad_norm": 0.2665952290340883, "learning_rate": 9.54079307993537e-06, "loss": 0.3917, "num_tokens": 184190568.0, "step": 448 }, { "epoch": 0.17163608562691132, "grad_norm": 0.29778086934802145, "learning_rate": 9.538337630967618e-06, "loss": 0.3801, "num_tokens": 184572672.0, "step": 449 }, { "epoch": 0.1720183486238532, "grad_norm": 0.274711025201977, "learning_rate": 9.535875989565937e-06, "loss": 0.3564, "num_tokens": 184995877.0, "step": 450 }, { "epoch": 0.17240061162079512, "grad_norm": 0.28852132868265845, "learning_rate": 9.53340815950504e-06, "loss": 0.39, "num_tokens": 185389102.0, "step": 451 }, { "epoch": 0.172782874617737, "grad_norm": 0.2810517685470472, "learning_rate": 9.530934144569126e-06, "loss": 0.3675, "num_tokens": 185814287.0, "step": 452 }, { "epoch": 0.1731651376146789, "grad_norm": 0.29795417179971595, "learning_rate": 9.528453948551874e-06, "loss": 0.3828, "num_tokens": 186211065.0, "step": 453 }, { "epoch": 0.1735474006116208, "grad_norm": 0.2988078418945623, "learning_rate": 9.52596757525645e-06, "loss": 0.4161, "num_tokens": 186618556.0, "step": 454 }, { "epoch": 0.17392966360856268, "grad_norm": 0.3041655819346514, "learning_rate": 9.523475028495487e-06, "loss": 0.3783, "num_tokens": 187044102.0, "step": 455 }, { "epoch": 0.1743119266055046, "grad_norm": 0.29069153764266276, "learning_rate": 9.520976312091085e-06, "loss": 0.3589, "num_tokens": 187386889.0, "step": 456 }, { "epoch": 0.17469418960244648, "grad_norm": 0.3010367164793014, "learning_rate": 9.518471429874804e-06, "loss": 0.3913, "num_tokens": 187834420.0, "step": 457 }, { "epoch": 0.17507645259938837, "grad_norm": 0.28465055015834334, "learning_rate": 9.51596038568766e-06, "loss": 0.3661, "num_tokens": 188251183.0, "step": 458 }, { "epoch": 0.17545871559633028, "grad_norm": 0.30943574129616025, "learning_rate": 9.513443183380116e-06, "loss": 0.3967, "num_tokens": 188689531.0, "step": 459 }, { "epoch": 0.17584097859327216, "grad_norm": 0.34124866740742443, "learning_rate": 9.510919826812081e-06, "loss": 0.398, "num_tokens": 189152497.0, "step": 460 }, { "epoch": 0.17622324159021407, "grad_norm": 0.2911543379474011, "learning_rate": 9.5083903198529e-06, "loss": 0.3761, "num_tokens": 189594914.0, "step": 461 }, { "epoch": 0.17660550458715596, "grad_norm": 0.2642824578844909, "learning_rate": 9.505854666381347e-06, "loss": 0.3492, "num_tokens": 189962468.0, "step": 462 }, { "epoch": 0.17698776758409787, "grad_norm": 0.2997282910158942, "learning_rate": 9.503312870285623e-06, "loss": 0.3711, "num_tokens": 190366781.0, "step": 463 }, { "epoch": 0.17737003058103976, "grad_norm": 0.3013150234420607, "learning_rate": 9.500764935463348e-06, "loss": 0.3927, "num_tokens": 190767753.0, "step": 464 }, { "epoch": 0.17775229357798164, "grad_norm": 0.294207055976542, "learning_rate": 9.498210865821555e-06, "loss": 0.3969, "num_tokens": 191206908.0, "step": 465 }, { "epoch": 0.17813455657492355, "grad_norm": 0.26404247072093273, "learning_rate": 9.495650665276683e-06, "loss": 0.3529, "num_tokens": 191531944.0, "step": 466 }, { "epoch": 0.17851681957186544, "grad_norm": 0.2680184575876437, "learning_rate": 9.493084337754573e-06, "loss": 0.3526, "num_tokens": 191925006.0, "step": 467 }, { "epoch": 0.17889908256880735, "grad_norm": 0.3128348888657449, "learning_rate": 9.490511887190463e-06, "loss": 0.392, "num_tokens": 192349836.0, "step": 468 }, { "epoch": 0.17928134556574923, "grad_norm": 0.3005216178734559, "learning_rate": 9.487933317528979e-06, "loss": 0.3547, "num_tokens": 192716877.0, "step": 469 }, { "epoch": 0.17966360856269112, "grad_norm": 0.30789452207039825, "learning_rate": 9.485348632724128e-06, "loss": 0.3973, "num_tokens": 193172070.0, "step": 470 }, { "epoch": 0.18004587155963303, "grad_norm": 0.2628617754857212, "learning_rate": 9.482757836739297e-06, "loss": 0.345, "num_tokens": 193563030.0, "step": 471 }, { "epoch": 0.18042813455657492, "grad_norm": 0.28054255699602965, "learning_rate": 9.480160933547243e-06, "loss": 0.3544, "num_tokens": 193983277.0, "step": 472 }, { "epoch": 0.18081039755351683, "grad_norm": 0.30902723175488345, "learning_rate": 9.477557927130085e-06, "loss": 0.3776, "num_tokens": 194420042.0, "step": 473 }, { "epoch": 0.1811926605504587, "grad_norm": 0.30420476138288444, "learning_rate": 9.474948821479306e-06, "loss": 0.3899, "num_tokens": 194842062.0, "step": 474 }, { "epoch": 0.18157492354740062, "grad_norm": 0.31719726414849897, "learning_rate": 9.472333620595739e-06, "loss": 0.3944, "num_tokens": 195276393.0, "step": 475 }, { "epoch": 0.1819571865443425, "grad_norm": 0.3020234755697266, "learning_rate": 9.469712328489561e-06, "loss": 0.3736, "num_tokens": 195631344.0, "step": 476 }, { "epoch": 0.1823394495412844, "grad_norm": 0.27797925604136875, "learning_rate": 9.467084949180297e-06, "loss": 0.3497, "num_tokens": 196006323.0, "step": 477 }, { "epoch": 0.1827217125382263, "grad_norm": 0.2624462326952848, "learning_rate": 9.464451486696793e-06, "loss": 0.3727, "num_tokens": 196438035.0, "step": 478 }, { "epoch": 0.1831039755351682, "grad_norm": 0.31834490894055056, "learning_rate": 9.46181194507724e-06, "loss": 0.3497, "num_tokens": 196845173.0, "step": 479 }, { "epoch": 0.1834862385321101, "grad_norm": 0.36455517719545855, "learning_rate": 9.459166328369135e-06, "loss": 0.3878, "num_tokens": 197294639.0, "step": 480 }, { "epoch": 0.183868501529052, "grad_norm": 0.3291352374849635, "learning_rate": 9.4565146406293e-06, "loss": 0.3657, "num_tokens": 197703636.0, "step": 481 }, { "epoch": 0.18425076452599387, "grad_norm": 0.2601859097081127, "learning_rate": 9.453856885923863e-06, "loss": 0.3778, "num_tokens": 198164083.0, "step": 482 }, { "epoch": 0.18463302752293578, "grad_norm": 0.2756351921066936, "learning_rate": 9.451193068328258e-06, "loss": 0.406, "num_tokens": 198571786.0, "step": 483 }, { "epoch": 0.18501529051987767, "grad_norm": 0.2783378167166546, "learning_rate": 9.448523191927212e-06, "loss": 0.3693, "num_tokens": 198961474.0, "step": 484 }, { "epoch": 0.18539755351681958, "grad_norm": 0.341190072775478, "learning_rate": 9.445847260814745e-06, "loss": 0.3747, "num_tokens": 199368667.0, "step": 485 }, { "epoch": 0.18577981651376146, "grad_norm": 0.2890813140609965, "learning_rate": 9.443165279094162e-06, "loss": 0.3831, "num_tokens": 199755260.0, "step": 486 }, { "epoch": 0.18616207951070338, "grad_norm": 0.2942765983318325, "learning_rate": 9.440477250878044e-06, "loss": 0.3789, "num_tokens": 200161470.0, "step": 487 }, { "epoch": 0.18654434250764526, "grad_norm": 0.30637683198183496, "learning_rate": 9.437783180288244e-06, "loss": 0.3698, "num_tokens": 200557032.0, "step": 488 }, { "epoch": 0.18692660550458715, "grad_norm": 0.30610071389048066, "learning_rate": 9.435083071455883e-06, "loss": 0.3666, "num_tokens": 200939220.0, "step": 489 }, { "epoch": 0.18730886850152906, "grad_norm": 0.36545996446324247, "learning_rate": 9.432376928521336e-06, "loss": 0.375, "num_tokens": 201304275.0, "step": 490 }, { "epoch": 0.18769113149847094, "grad_norm": 0.29029407034186155, "learning_rate": 9.429664755634239e-06, "loss": 0.3762, "num_tokens": 201726357.0, "step": 491 }, { "epoch": 0.18807339449541285, "grad_norm": 0.2615171533792097, "learning_rate": 9.426946556953465e-06, "loss": 0.3638, "num_tokens": 202142078.0, "step": 492 }, { "epoch": 0.18845565749235474, "grad_norm": 0.3257785868272309, "learning_rate": 9.424222336647135e-06, "loss": 0.3661, "num_tokens": 202549698.0, "step": 493 }, { "epoch": 0.18883792048929662, "grad_norm": 0.3102744721706698, "learning_rate": 9.421492098892597e-06, "loss": 0.3812, "num_tokens": 202955662.0, "step": 494 }, { "epoch": 0.18922018348623854, "grad_norm": 0.2853986566575885, "learning_rate": 9.418755847876433e-06, "loss": 0.3636, "num_tokens": 203371062.0, "step": 495 }, { "epoch": 0.18960244648318042, "grad_norm": 0.31114186135769983, "learning_rate": 9.416013587794438e-06, "loss": 0.3851, "num_tokens": 203774734.0, "step": 496 }, { "epoch": 0.18998470948012233, "grad_norm": 0.35213527916400017, "learning_rate": 9.413265322851628e-06, "loss": 0.3934, "num_tokens": 204167686.0, "step": 497 }, { "epoch": 0.19036697247706422, "grad_norm": 0.321062587627723, "learning_rate": 9.410511057262223e-06, "loss": 0.3522, "num_tokens": 204525898.0, "step": 498 }, { "epoch": 0.19074923547400613, "grad_norm": 0.29982928855720004, "learning_rate": 9.407750795249649e-06, "loss": 0.3686, "num_tokens": 204947712.0, "step": 499 }, { "epoch": 0.191131498470948, "grad_norm": 0.3385608318058004, "learning_rate": 9.404984541046522e-06, "loss": 0.3919, "num_tokens": 205398921.0, "step": 500 }, { "epoch": 0.1915137614678899, "grad_norm": 0.27399706824379355, "learning_rate": 9.402212298894646e-06, "loss": 0.3524, "num_tokens": 205794406.0, "step": 501 }, { "epoch": 0.1918960244648318, "grad_norm": 0.2934791355479545, "learning_rate": 9.399434073045013e-06, "loss": 0.3734, "num_tokens": 206213008.0, "step": 502 }, { "epoch": 0.1922782874617737, "grad_norm": 0.29943182622013437, "learning_rate": 9.396649867757783e-06, "loss": 0.3825, "num_tokens": 206610088.0, "step": 503 }, { "epoch": 0.1926605504587156, "grad_norm": 0.31339155050792894, "learning_rate": 9.393859687302294e-06, "loss": 0.398, "num_tokens": 207005404.0, "step": 504 }, { "epoch": 0.1930428134556575, "grad_norm": 0.3000940417256956, "learning_rate": 9.391063535957037e-06, "loss": 0.3768, "num_tokens": 207439377.0, "step": 505 }, { "epoch": 0.19342507645259938, "grad_norm": 0.26964691955243364, "learning_rate": 9.388261418009665e-06, "loss": 0.3807, "num_tokens": 207854975.0, "step": 506 }, { "epoch": 0.1938073394495413, "grad_norm": 0.29753787139304366, "learning_rate": 9.385453337756978e-06, "loss": 0.3754, "num_tokens": 208265785.0, "step": 507 }, { "epoch": 0.19418960244648317, "grad_norm": 0.29788616465134843, "learning_rate": 9.382639299504918e-06, "loss": 0.3717, "num_tokens": 208693798.0, "step": 508 }, { "epoch": 0.19457186544342508, "grad_norm": 0.3019401432066651, "learning_rate": 9.379819307568566e-06, "loss": 0.3665, "num_tokens": 209103195.0, "step": 509 }, { "epoch": 0.19495412844036697, "grad_norm": 0.2938523692663089, "learning_rate": 9.376993366272128e-06, "loss": 0.3775, "num_tokens": 209462352.0, "step": 510 }, { "epoch": 0.19533639143730888, "grad_norm": 0.33653884095044895, "learning_rate": 9.374161479948937e-06, "loss": 0.3745, "num_tokens": 209830545.0, "step": 511 }, { "epoch": 0.19571865443425077, "grad_norm": 0.3399122957923152, "learning_rate": 9.371323652941438e-06, "loss": 0.3926, "num_tokens": 210276774.0, "step": 512 }, { "epoch": 0.19610091743119265, "grad_norm": 0.3070891879415521, "learning_rate": 9.368479889601192e-06, "loss": 0.3856, "num_tokens": 210676946.0, "step": 513 }, { "epoch": 0.19648318042813456, "grad_norm": 0.3488060478890113, "learning_rate": 9.365630194288856e-06, "loss": 0.3807, "num_tokens": 211108153.0, "step": 514 }, { "epoch": 0.19686544342507645, "grad_norm": 0.3270346602514676, "learning_rate": 9.362774571374186e-06, "loss": 0.3612, "num_tokens": 211503099.0, "step": 515 }, { "epoch": 0.19724770642201836, "grad_norm": 0.2945665640848005, "learning_rate": 9.359913025236028e-06, "loss": 0.3972, "num_tokens": 211906574.0, "step": 516 }, { "epoch": 0.19762996941896024, "grad_norm": 0.27355521672222194, "learning_rate": 9.35704556026231e-06, "loss": 0.3746, "num_tokens": 212328522.0, "step": 517 }, { "epoch": 0.19801223241590213, "grad_norm": 0.31336251169488294, "learning_rate": 9.354172180850038e-06, "loss": 0.3947, "num_tokens": 212759186.0, "step": 518 }, { "epoch": 0.19839449541284404, "grad_norm": 0.293011750467182, "learning_rate": 9.351292891405281e-06, "loss": 0.3646, "num_tokens": 213127398.0, "step": 519 }, { "epoch": 0.19877675840978593, "grad_norm": 0.29892819305468965, "learning_rate": 9.34840769634318e-06, "loss": 0.3731, "num_tokens": 213515012.0, "step": 520 }, { "epoch": 0.19915902140672784, "grad_norm": 0.30348130681744817, "learning_rate": 9.345516600087923e-06, "loss": 0.3922, "num_tokens": 213932347.0, "step": 521 }, { "epoch": 0.19954128440366972, "grad_norm": 0.3409043437047744, "learning_rate": 9.342619607072751e-06, "loss": 0.3713, "num_tokens": 214347263.0, "step": 522 }, { "epoch": 0.19992354740061163, "grad_norm": 0.31878638055344494, "learning_rate": 9.339716721739949e-06, "loss": 0.3805, "num_tokens": 214757335.0, "step": 523 }, { "epoch": 0.20030581039755352, "grad_norm": 0.3084691124969446, "learning_rate": 9.336807948540836e-06, "loss": 0.3658, "num_tokens": 215243104.0, "step": 524 }, { "epoch": 0.2006880733944954, "grad_norm": 0.29507224019393014, "learning_rate": 9.333893291935755e-06, "loss": 0.4107, "num_tokens": 215656347.0, "step": 525 }, { "epoch": 0.20107033639143732, "grad_norm": 0.3054571803769428, "learning_rate": 9.330972756394075e-06, "loss": 0.3825, "num_tokens": 216068674.0, "step": 526 }, { "epoch": 0.2014525993883792, "grad_norm": 0.34096781493652856, "learning_rate": 9.328046346394182e-06, "loss": 0.3694, "num_tokens": 216456673.0, "step": 527 }, { "epoch": 0.2018348623853211, "grad_norm": 0.2801169707879607, "learning_rate": 9.325114066423465e-06, "loss": 0.3671, "num_tokens": 216873425.0, "step": 528 }, { "epoch": 0.202217125382263, "grad_norm": 0.303338028634198, "learning_rate": 9.322175920978314e-06, "loss": 0.3509, "num_tokens": 217249540.0, "step": 529 }, { "epoch": 0.20259938837920488, "grad_norm": 0.28886781857183486, "learning_rate": 9.319231914564121e-06, "loss": 0.3817, "num_tokens": 217688565.0, "step": 530 }, { "epoch": 0.2029816513761468, "grad_norm": 0.2914412595675159, "learning_rate": 9.316282051695258e-06, "loss": 0.372, "num_tokens": 218064372.0, "step": 531 }, { "epoch": 0.20336391437308868, "grad_norm": 0.36323745270467844, "learning_rate": 9.313326336895075e-06, "loss": 0.4051, "num_tokens": 218466876.0, "step": 532 }, { "epoch": 0.2037461773700306, "grad_norm": 0.29327679268925266, "learning_rate": 9.310364774695901e-06, "loss": 0.3542, "num_tokens": 218880360.0, "step": 533 }, { "epoch": 0.20412844036697247, "grad_norm": 0.2793067231747973, "learning_rate": 9.307397369639036e-06, "loss": 0.3767, "num_tokens": 219286334.0, "step": 534 }, { "epoch": 0.2045107033639144, "grad_norm": 0.24714537611934967, "learning_rate": 9.304424126274724e-06, "loss": 0.3528, "num_tokens": 219714712.0, "step": 535 }, { "epoch": 0.20489296636085627, "grad_norm": 0.3332075476944507, "learning_rate": 9.301445049162177e-06, "loss": 0.3713, "num_tokens": 220137868.0, "step": 536 }, { "epoch": 0.20527522935779816, "grad_norm": 0.3245785325808012, "learning_rate": 9.298460142869548e-06, "loss": 0.3904, "num_tokens": 220539981.0, "step": 537 }, { "epoch": 0.20565749235474007, "grad_norm": 0.2934244498489479, "learning_rate": 9.295469411973921e-06, "loss": 0.3794, "num_tokens": 220975235.0, "step": 538 }, { "epoch": 0.20603975535168195, "grad_norm": 0.2742907283186334, "learning_rate": 9.292472861061322e-06, "loss": 0.3667, "num_tokens": 221363009.0, "step": 539 }, { "epoch": 0.20642201834862386, "grad_norm": 0.3443272821240969, "learning_rate": 9.289470494726694e-06, "loss": 0.3613, "num_tokens": 221788343.0, "step": 540 }, { "epoch": 0.20680428134556575, "grad_norm": 0.3188059870697428, "learning_rate": 9.286462317573905e-06, "loss": 0.3635, "num_tokens": 222189682.0, "step": 541 }, { "epoch": 0.20718654434250763, "grad_norm": 0.25671065930943543, "learning_rate": 9.283448334215724e-06, "loss": 0.3534, "num_tokens": 222612978.0, "step": 542 }, { "epoch": 0.20756880733944955, "grad_norm": 0.26975231412069167, "learning_rate": 9.280428549273832e-06, "loss": 0.3736, "num_tokens": 223047569.0, "step": 543 }, { "epoch": 0.20795107033639143, "grad_norm": 0.29527444734804703, "learning_rate": 9.2774029673788e-06, "loss": 0.384, "num_tokens": 223436356.0, "step": 544 }, { "epoch": 0.20833333333333334, "grad_norm": 0.2757889590848201, "learning_rate": 9.274371593170091e-06, "loss": 0.3868, "num_tokens": 223914909.0, "step": 545 }, { "epoch": 0.20871559633027523, "grad_norm": 0.29282983952309855, "learning_rate": 9.271334431296051e-06, "loss": 0.3865, "num_tokens": 224320364.0, "step": 546 }, { "epoch": 0.2090978593272171, "grad_norm": 0.31434903217338434, "learning_rate": 9.268291486413897e-06, "loss": 0.3807, "num_tokens": 224754045.0, "step": 547 }, { "epoch": 0.20948012232415902, "grad_norm": 0.32996591090353755, "learning_rate": 9.265242763189717e-06, "loss": 0.3969, "num_tokens": 225181957.0, "step": 548 }, { "epoch": 0.2098623853211009, "grad_norm": 0.30214781509406835, "learning_rate": 9.26218826629846e-06, "loss": 0.3638, "num_tokens": 225580134.0, "step": 549 }, { "epoch": 0.21024464831804282, "grad_norm": 0.3045215069795361, "learning_rate": 9.259128000423926e-06, "loss": 0.3787, "num_tokens": 226003739.0, "step": 550 }, { "epoch": 0.2106269113149847, "grad_norm": 0.31662315525612694, "learning_rate": 9.25606197025876e-06, "loss": 0.3876, "num_tokens": 226403672.0, "step": 551 }, { "epoch": 0.21100917431192662, "grad_norm": 0.2881137354328672, "learning_rate": 9.252990180504451e-06, "loss": 0.3935, "num_tokens": 226845604.0, "step": 552 }, { "epoch": 0.2113914373088685, "grad_norm": 0.28552091425507453, "learning_rate": 9.249912635871317e-06, "loss": 0.3635, "num_tokens": 227275127.0, "step": 553 }, { "epoch": 0.2117737003058104, "grad_norm": 0.30307505848139343, "learning_rate": 9.246829341078503e-06, "loss": 0.3729, "num_tokens": 227670815.0, "step": 554 }, { "epoch": 0.2121559633027523, "grad_norm": 0.2659969997238507, "learning_rate": 9.243740300853964e-06, "loss": 0.3737, "num_tokens": 228103306.0, "step": 555 }, { "epoch": 0.21253822629969418, "grad_norm": 0.2883987213896317, "learning_rate": 9.240645519934474e-06, "loss": 0.3917, "num_tokens": 228539059.0, "step": 556 }, { "epoch": 0.2129204892966361, "grad_norm": 0.29965016857636995, "learning_rate": 9.237545003065604e-06, "loss": 0.3831, "num_tokens": 228932781.0, "step": 557 }, { "epoch": 0.21330275229357798, "grad_norm": 0.26783841808904507, "learning_rate": 9.234438755001725e-06, "loss": 0.3953, "num_tokens": 229365703.0, "step": 558 }, { "epoch": 0.21368501529051986, "grad_norm": 0.2979107885593306, "learning_rate": 9.23132678050599e-06, "loss": 0.4011, "num_tokens": 229812926.0, "step": 559 }, { "epoch": 0.21406727828746178, "grad_norm": 0.28567990501460083, "learning_rate": 9.228209084350342e-06, "loss": 0.4132, "num_tokens": 230250144.0, "step": 560 }, { "epoch": 0.21444954128440366, "grad_norm": 0.2792966842770343, "learning_rate": 9.225085671315491e-06, "loss": 0.3629, "num_tokens": 230650438.0, "step": 561 }, { "epoch": 0.21483180428134557, "grad_norm": 0.3151662483193369, "learning_rate": 9.221956546190912e-06, "loss": 0.364, "num_tokens": 231006858.0, "step": 562 }, { "epoch": 0.21521406727828746, "grad_norm": 0.277563403731734, "learning_rate": 9.218821713774842e-06, "loss": 0.3831, "num_tokens": 231442310.0, "step": 563 }, { "epoch": 0.21559633027522937, "grad_norm": 0.29931319887763247, "learning_rate": 9.215681178874275e-06, "loss": 0.3964, "num_tokens": 231891436.0, "step": 564 }, { "epoch": 0.21597859327217125, "grad_norm": 0.273195612599927, "learning_rate": 9.21253494630494e-06, "loss": 0.3903, "num_tokens": 232347844.0, "step": 565 }, { "epoch": 0.21636085626911314, "grad_norm": 0.2834660193372076, "learning_rate": 9.209383020891304e-06, "loss": 0.3756, "num_tokens": 232742857.0, "step": 566 }, { "epoch": 0.21674311926605505, "grad_norm": 0.31449999017472524, "learning_rate": 9.206225407466572e-06, "loss": 0.3945, "num_tokens": 233124849.0, "step": 567 }, { "epoch": 0.21712538226299694, "grad_norm": 0.25810005816552867, "learning_rate": 9.203062110872658e-06, "loss": 0.3455, "num_tokens": 233550300.0, "step": 568 }, { "epoch": 0.21750764525993885, "grad_norm": 0.33107945291173424, "learning_rate": 9.199893135960203e-06, "loss": 0.3668, "num_tokens": 233953546.0, "step": 569 }, { "epoch": 0.21788990825688073, "grad_norm": 0.24018427043904134, "learning_rate": 9.196718487588552e-06, "loss": 0.3547, "num_tokens": 234356061.0, "step": 570 }, { "epoch": 0.21827217125382262, "grad_norm": 0.33465132286658594, "learning_rate": 9.193538170625743e-06, "loss": 0.3893, "num_tokens": 234813492.0, "step": 571 }, { "epoch": 0.21865443425076453, "grad_norm": 0.31411332156760363, "learning_rate": 9.19035218994851e-06, "loss": 0.3484, "num_tokens": 235204088.0, "step": 572 }, { "epoch": 0.2190366972477064, "grad_norm": 0.2883985430085643, "learning_rate": 9.187160550442278e-06, "loss": 0.3619, "num_tokens": 235602795.0, "step": 573 }, { "epoch": 0.21941896024464833, "grad_norm": 0.30208554680643634, "learning_rate": 9.183963257001142e-06, "loss": 0.3875, "num_tokens": 236003226.0, "step": 574 }, { "epoch": 0.2198012232415902, "grad_norm": 0.30105688110725787, "learning_rate": 9.18076031452787e-06, "loss": 0.389, "num_tokens": 236402991.0, "step": 575 }, { "epoch": 0.22018348623853212, "grad_norm": 0.31191025907793485, "learning_rate": 9.177551727933888e-06, "loss": 0.3908, "num_tokens": 236809589.0, "step": 576 }, { "epoch": 0.220565749235474, "grad_norm": 0.2754676790609133, "learning_rate": 9.17433750213928e-06, "loss": 0.3776, "num_tokens": 237193709.0, "step": 577 }, { "epoch": 0.2209480122324159, "grad_norm": 0.289805606978647, "learning_rate": 9.171117642072783e-06, "loss": 0.369, "num_tokens": 237597969.0, "step": 578 }, { "epoch": 0.2213302752293578, "grad_norm": 0.2792412200321901, "learning_rate": 9.167892152671762e-06, "loss": 0.3736, "num_tokens": 238027892.0, "step": 579 }, { "epoch": 0.2217125382262997, "grad_norm": 0.337657201569649, "learning_rate": 9.164661038882223e-06, "loss": 0.3672, "num_tokens": 238417959.0, "step": 580 }, { "epoch": 0.2220948012232416, "grad_norm": 0.27212654173674283, "learning_rate": 9.161424305658792e-06, "loss": 0.3741, "num_tokens": 238838950.0, "step": 581 }, { "epoch": 0.22247706422018348, "grad_norm": 0.3096392932222775, "learning_rate": 9.158181957964713e-06, "loss": 0.398, "num_tokens": 239288447.0, "step": 582 }, { "epoch": 0.22285932721712537, "grad_norm": 0.3249685247146732, "learning_rate": 9.154934000771844e-06, "loss": 0.3702, "num_tokens": 239634933.0, "step": 583 }, { "epoch": 0.22324159021406728, "grad_norm": 0.31423353987151437, "learning_rate": 9.151680439060636e-06, "loss": 0.3733, "num_tokens": 240025937.0, "step": 584 }, { "epoch": 0.22362385321100917, "grad_norm": 0.2834341205029428, "learning_rate": 9.148421277820138e-06, "loss": 0.3642, "num_tokens": 240411794.0, "step": 585 }, { "epoch": 0.22400611620795108, "grad_norm": 0.3457671514107957, "learning_rate": 9.145156522047986e-06, "loss": 0.3717, "num_tokens": 240819969.0, "step": 586 }, { "epoch": 0.22438837920489296, "grad_norm": 0.30436298898732933, "learning_rate": 9.141886176750397e-06, "loss": 0.372, "num_tokens": 241219686.0, "step": 587 }, { "epoch": 0.22477064220183487, "grad_norm": 0.3015407744851569, "learning_rate": 9.138610246942157e-06, "loss": 0.3985, "num_tokens": 241646744.0, "step": 588 }, { "epoch": 0.22515290519877676, "grad_norm": 0.2618779128631558, "learning_rate": 9.135328737646611e-06, "loss": 0.394, "num_tokens": 242085941.0, "step": 589 }, { "epoch": 0.22553516819571864, "grad_norm": 0.315741853237637, "learning_rate": 9.132041653895668e-06, "loss": 0.3823, "num_tokens": 242488824.0, "step": 590 }, { "epoch": 0.22591743119266056, "grad_norm": 0.2776697465493544, "learning_rate": 9.128749000729777e-06, "loss": 0.3565, "num_tokens": 242890501.0, "step": 591 }, { "epoch": 0.22629969418960244, "grad_norm": 0.299578333638845, "learning_rate": 9.125450783197931e-06, "loss": 0.3905, "num_tokens": 243323495.0, "step": 592 }, { "epoch": 0.22668195718654435, "grad_norm": 0.28717242108889424, "learning_rate": 9.122147006357657e-06, "loss": 0.3804, "num_tokens": 243759735.0, "step": 593 }, { "epoch": 0.22706422018348624, "grad_norm": 0.27615249536903175, "learning_rate": 9.118837675275006e-06, "loss": 0.3902, "num_tokens": 244185214.0, "step": 594 }, { "epoch": 0.22744648318042812, "grad_norm": 0.310301735025032, "learning_rate": 9.11552279502454e-06, "loss": 0.3986, "num_tokens": 244603987.0, "step": 595 }, { "epoch": 0.22782874617737003, "grad_norm": 0.33691526759287665, "learning_rate": 9.112202370689337e-06, "loss": 0.4026, "num_tokens": 245001704.0, "step": 596 }, { "epoch": 0.22821100917431192, "grad_norm": 0.2865904313101305, "learning_rate": 9.108876407360976e-06, "loss": 0.3881, "num_tokens": 245407882.0, "step": 597 }, { "epoch": 0.22859327217125383, "grad_norm": 0.2826526928273864, "learning_rate": 9.105544910139527e-06, "loss": 0.357, "num_tokens": 245851231.0, "step": 598 }, { "epoch": 0.22897553516819572, "grad_norm": 0.27531662437234644, "learning_rate": 9.102207884133548e-06, "loss": 0.3882, "num_tokens": 246265295.0, "step": 599 }, { "epoch": 0.22935779816513763, "grad_norm": 0.3089688461620704, "learning_rate": 9.09886533446007e-06, "loss": 0.3807, "num_tokens": 246671259.0, "step": 600 }, { "epoch": 0.2297400611620795, "grad_norm": 0.28978239421247837, "learning_rate": 9.0955172662446e-06, "loss": 0.3876, "num_tokens": 247074096.0, "step": 601 }, { "epoch": 0.2301223241590214, "grad_norm": 0.2645958772906488, "learning_rate": 9.092163684621105e-06, "loss": 0.3725, "num_tokens": 247477563.0, "step": 602 }, { "epoch": 0.2305045871559633, "grad_norm": 0.28568202828981065, "learning_rate": 9.088804594732006e-06, "loss": 0.3928, "num_tokens": 247940093.0, "step": 603 }, { "epoch": 0.2308868501529052, "grad_norm": 0.27585109128936636, "learning_rate": 9.085440001728168e-06, "loss": 0.3704, "num_tokens": 248368764.0, "step": 604 }, { "epoch": 0.2312691131498471, "grad_norm": 0.27653165882309483, "learning_rate": 9.082069910768901e-06, "loss": 0.4143, "num_tokens": 248829463.0, "step": 605 }, { "epoch": 0.231651376146789, "grad_norm": 0.29920389094743655, "learning_rate": 9.078694327021938e-06, "loss": 0.4172, "num_tokens": 249274207.0, "step": 606 }, { "epoch": 0.23203363914373087, "grad_norm": 0.29640280958146953, "learning_rate": 9.07531325566344e-06, "loss": 0.3873, "num_tokens": 249700194.0, "step": 607 }, { "epoch": 0.2324159021406728, "grad_norm": 0.28915016883483846, "learning_rate": 9.071926701877985e-06, "loss": 0.384, "num_tokens": 250143961.0, "step": 608 }, { "epoch": 0.23279816513761467, "grad_norm": 0.3255035345947518, "learning_rate": 9.068534670858547e-06, "loss": 0.38, "num_tokens": 250582590.0, "step": 609 }, { "epoch": 0.23318042813455658, "grad_norm": 0.2877177628243263, "learning_rate": 9.065137167806509e-06, "loss": 0.3714, "num_tokens": 250973246.0, "step": 610 }, { "epoch": 0.23356269113149847, "grad_norm": 0.28550980486061095, "learning_rate": 9.061734197931645e-06, "loss": 0.4022, "num_tokens": 251370710.0, "step": 611 }, { "epoch": 0.23394495412844038, "grad_norm": 0.31122468608101966, "learning_rate": 9.058325766452104e-06, "loss": 0.3705, "num_tokens": 251778758.0, "step": 612 }, { "epoch": 0.23432721712538226, "grad_norm": 0.3399712579669277, "learning_rate": 9.054911878594415e-06, "loss": 0.3746, "num_tokens": 252191372.0, "step": 613 }, { "epoch": 0.23470948012232415, "grad_norm": 0.35218383149346305, "learning_rate": 9.051492539593473e-06, "loss": 0.4096, "num_tokens": 252612141.0, "step": 614 }, { "epoch": 0.23509174311926606, "grad_norm": 0.3077852425607384, "learning_rate": 9.048067754692538e-06, "loss": 0.3799, "num_tokens": 252985316.0, "step": 615 }, { "epoch": 0.23547400611620795, "grad_norm": 0.27869103967534725, "learning_rate": 9.044637529143206e-06, "loss": 0.3884, "num_tokens": 253394455.0, "step": 616 }, { "epoch": 0.23585626911314986, "grad_norm": 0.31922087897709445, "learning_rate": 9.041201868205432e-06, "loss": 0.3662, "num_tokens": 253762601.0, "step": 617 }, { "epoch": 0.23623853211009174, "grad_norm": 0.3520791582706998, "learning_rate": 9.037760777147497e-06, "loss": 0.3836, "num_tokens": 254227575.0, "step": 618 }, { "epoch": 0.23662079510703363, "grad_norm": 0.2809234911336819, "learning_rate": 9.034314261246007e-06, "loss": 0.3833, "num_tokens": 254670426.0, "step": 619 }, { "epoch": 0.23700305810397554, "grad_norm": 0.40222763235101694, "learning_rate": 9.030862325785893e-06, "loss": 0.39, "num_tokens": 255059367.0, "step": 620 }, { "epoch": 0.23738532110091742, "grad_norm": 0.28733549992330404, "learning_rate": 9.02740497606039e-06, "loss": 0.3714, "num_tokens": 255498551.0, "step": 621 }, { "epoch": 0.23776758409785934, "grad_norm": 0.3046221037146164, "learning_rate": 9.023942217371041e-06, "loss": 0.3862, "num_tokens": 255906818.0, "step": 622 }, { "epoch": 0.23814984709480122, "grad_norm": 0.30704559198814274, "learning_rate": 9.02047405502768e-06, "loss": 0.4006, "num_tokens": 256331779.0, "step": 623 }, { "epoch": 0.23853211009174313, "grad_norm": 0.31680852995126063, "learning_rate": 9.017000494348425e-06, "loss": 0.3921, "num_tokens": 256736444.0, "step": 624 }, { "epoch": 0.23891437308868502, "grad_norm": 0.2832308759764621, "learning_rate": 9.013521540659677e-06, "loss": 0.3694, "num_tokens": 257116436.0, "step": 625 }, { "epoch": 0.2392966360856269, "grad_norm": 0.3321699105755607, "learning_rate": 9.010037199296105e-06, "loss": 0.374, "num_tokens": 257515036.0, "step": 626 }, { "epoch": 0.2396788990825688, "grad_norm": 0.3429248615437049, "learning_rate": 9.006547475600636e-06, "loss": 0.3796, "num_tokens": 257929541.0, "step": 627 }, { "epoch": 0.2400611620795107, "grad_norm": 0.3098280266584217, "learning_rate": 9.003052374924454e-06, "loss": 0.3917, "num_tokens": 258348550.0, "step": 628 }, { "epoch": 0.2404434250764526, "grad_norm": 0.3065584693267545, "learning_rate": 8.999551902626984e-06, "loss": 0.399, "num_tokens": 258803936.0, "step": 629 }, { "epoch": 0.2408256880733945, "grad_norm": 0.3034452570346747, "learning_rate": 8.996046064075897e-06, "loss": 0.3532, "num_tokens": 259207365.0, "step": 630 }, { "epoch": 0.24120795107033638, "grad_norm": 0.30606552366250667, "learning_rate": 8.992534864647084e-06, "loss": 0.3789, "num_tokens": 259637116.0, "step": 631 }, { "epoch": 0.2415902140672783, "grad_norm": 0.27598201111213216, "learning_rate": 8.989018309724657e-06, "loss": 0.3723, "num_tokens": 260087175.0, "step": 632 }, { "epoch": 0.24197247706422018, "grad_norm": 0.26110012452673226, "learning_rate": 8.985496404700946e-06, "loss": 0.3603, "num_tokens": 260474561.0, "step": 633 }, { "epoch": 0.2423547400611621, "grad_norm": 0.3001172491176638, "learning_rate": 8.981969154976477e-06, "loss": 0.4067, "num_tokens": 260936606.0, "step": 634 }, { "epoch": 0.24273700305810397, "grad_norm": 0.3154402451656656, "learning_rate": 8.978436565959977e-06, "loss": 0.419, "num_tokens": 261365549.0, "step": 635 }, { "epoch": 0.24311926605504589, "grad_norm": 0.3131213964994004, "learning_rate": 8.974898643068361e-06, "loss": 0.364, "num_tokens": 261772927.0, "step": 636 }, { "epoch": 0.24350152905198777, "grad_norm": 0.29828935570101023, "learning_rate": 8.971355391726721e-06, "loss": 0.3762, "num_tokens": 262161078.0, "step": 637 }, { "epoch": 0.24388379204892965, "grad_norm": 0.3238019631881244, "learning_rate": 8.967806817368319e-06, "loss": 0.3873, "num_tokens": 262577695.0, "step": 638 }, { "epoch": 0.24426605504587157, "grad_norm": 0.28465918320626743, "learning_rate": 8.96425292543458e-06, "loss": 0.4022, "num_tokens": 262984408.0, "step": 639 }, { "epoch": 0.24464831804281345, "grad_norm": 0.31282646244746515, "learning_rate": 8.96069372137508e-06, "loss": 0.3695, "num_tokens": 263410565.0, "step": 640 }, { "epoch": 0.24503058103975536, "grad_norm": 0.30553718935080953, "learning_rate": 8.957129210647552e-06, "loss": 0.385, "num_tokens": 263804366.0, "step": 641 }, { "epoch": 0.24541284403669725, "grad_norm": 0.2836860368792192, "learning_rate": 8.95355939871785e-06, "loss": 0.3804, "num_tokens": 264224929.0, "step": 642 }, { "epoch": 0.24579510703363913, "grad_norm": 0.2871929467004206, "learning_rate": 8.949984291059972e-06, "loss": 0.3736, "num_tokens": 264635109.0, "step": 643 }, { "epoch": 0.24617737003058104, "grad_norm": 0.30694125326333027, "learning_rate": 8.946403893156025e-06, "loss": 0.3937, "num_tokens": 265073843.0, "step": 644 }, { "epoch": 0.24655963302752293, "grad_norm": 0.3222758746803932, "learning_rate": 8.942818210496235e-06, "loss": 0.4032, "num_tokens": 265506600.0, "step": 645 }, { "epoch": 0.24694189602446484, "grad_norm": 0.3311827532533767, "learning_rate": 8.939227248578926e-06, "loss": 0.3727, "num_tokens": 265941006.0, "step": 646 }, { "epoch": 0.24732415902140673, "grad_norm": 0.30706307595819393, "learning_rate": 8.935631012910526e-06, "loss": 0.3565, "num_tokens": 266319248.0, "step": 647 }, { "epoch": 0.24770642201834864, "grad_norm": 0.2886375491110465, "learning_rate": 8.932029509005542e-06, "loss": 0.3983, "num_tokens": 266749309.0, "step": 648 }, { "epoch": 0.24808868501529052, "grad_norm": 0.2735856467287797, "learning_rate": 8.928422742386563e-06, "loss": 0.3829, "num_tokens": 267109122.0, "step": 649 }, { "epoch": 0.2484709480122324, "grad_norm": 0.2830936313318087, "learning_rate": 8.924810718584243e-06, "loss": 0.3828, "num_tokens": 267508737.0, "step": 650 }, { "epoch": 0.24885321100917432, "grad_norm": 0.3283390051413059, "learning_rate": 8.921193443137309e-06, "loss": 0.3782, "num_tokens": 267918305.0, "step": 651 }, { "epoch": 0.2492354740061162, "grad_norm": 0.30335124243703915, "learning_rate": 8.917570921592525e-06, "loss": 0.3908, "num_tokens": 268361915.0, "step": 652 }, { "epoch": 0.24961773700305812, "grad_norm": 0.2571511256162922, "learning_rate": 8.913943159504714e-06, "loss": 0.3641, "num_tokens": 268800634.0, "step": 653 }, { "epoch": 0.25, "grad_norm": 0.2892310075430736, "learning_rate": 8.910310162436722e-06, "loss": 0.3858, "num_tokens": 269222212.0, "step": 654 }, { "epoch": 0.2503822629969419, "grad_norm": 0.3473412958002292, "learning_rate": 8.906671935959436e-06, "loss": 0.3771, "num_tokens": 269623817.0, "step": 655 }, { "epoch": 0.25076452599388377, "grad_norm": 0.3477403643691002, "learning_rate": 8.903028485651752e-06, "loss": 0.3724, "num_tokens": 270011282.0, "step": 656 }, { "epoch": 0.2511467889908257, "grad_norm": 0.3120544578388952, "learning_rate": 8.899379817100579e-06, "loss": 0.3902, "num_tokens": 270430088.0, "step": 657 }, { "epoch": 0.2515290519877676, "grad_norm": 0.2853900889953197, "learning_rate": 8.895725935900827e-06, "loss": 0.3803, "num_tokens": 270801218.0, "step": 658 }, { "epoch": 0.2519113149847095, "grad_norm": 0.31250388044058636, "learning_rate": 8.892066847655402e-06, "loss": 0.3722, "num_tokens": 271282248.0, "step": 659 }, { "epoch": 0.25229357798165136, "grad_norm": 0.30999972693938765, "learning_rate": 8.88840255797519e-06, "loss": 0.3779, "num_tokens": 271674351.0, "step": 660 }, { "epoch": 0.25267584097859325, "grad_norm": 0.28162609553511925, "learning_rate": 8.884733072479058e-06, "loss": 0.3658, "num_tokens": 272075673.0, "step": 661 }, { "epoch": 0.2530581039755352, "grad_norm": 0.3107133601844678, "learning_rate": 8.881058396793837e-06, "loss": 0.3725, "num_tokens": 272498661.0, "step": 662 }, { "epoch": 0.25344036697247707, "grad_norm": 0.2949726295774104, "learning_rate": 8.877378536554314e-06, "loss": 0.347, "num_tokens": 272902959.0, "step": 663 }, { "epoch": 0.25382262996941896, "grad_norm": 0.3050132976467593, "learning_rate": 8.873693497403234e-06, "loss": 0.4074, "num_tokens": 273299202.0, "step": 664 }, { "epoch": 0.25420489296636084, "grad_norm": 0.29903463140677805, "learning_rate": 8.870003284991277e-06, "loss": 0.4042, "num_tokens": 273732060.0, "step": 665 }, { "epoch": 0.2545871559633027, "grad_norm": 0.26181156233855346, "learning_rate": 8.86630790497706e-06, "loss": 0.4113, "num_tokens": 274182873.0, "step": 666 }, { "epoch": 0.25496941896024466, "grad_norm": 0.28227078861755234, "learning_rate": 8.862607363027116e-06, "loss": 0.3495, "num_tokens": 274592194.0, "step": 667 }, { "epoch": 0.25535168195718655, "grad_norm": 0.2988398503823881, "learning_rate": 8.858901664815906e-06, "loss": 0.3784, "num_tokens": 274999863.0, "step": 668 }, { "epoch": 0.25573394495412843, "grad_norm": 0.2822390360268869, "learning_rate": 8.855190816025789e-06, "loss": 0.3737, "num_tokens": 275415536.0, "step": 669 }, { "epoch": 0.2561162079510703, "grad_norm": 0.25818594107637644, "learning_rate": 8.85147482234702e-06, "loss": 0.3865, "num_tokens": 275861883.0, "step": 670 }, { "epoch": 0.25649847094801226, "grad_norm": 0.36337782304390526, "learning_rate": 8.84775368947775e-06, "loss": 0.3957, "num_tokens": 276270259.0, "step": 671 }, { "epoch": 0.25688073394495414, "grad_norm": 0.26464963445527817, "learning_rate": 8.844027423124005e-06, "loss": 0.3608, "num_tokens": 276717203.0, "step": 672 }, { "epoch": 0.257262996941896, "grad_norm": 0.3045674911455366, "learning_rate": 8.840296028999689e-06, "loss": 0.4024, "num_tokens": 277090223.0, "step": 673 }, { "epoch": 0.2576452599388379, "grad_norm": 0.30217072269278383, "learning_rate": 8.836559512826564e-06, "loss": 0.3803, "num_tokens": 277499390.0, "step": 674 }, { "epoch": 0.2580275229357798, "grad_norm": 0.3147968781143223, "learning_rate": 8.832817880334243e-06, "loss": 0.3604, "num_tokens": 277857284.0, "step": 675 }, { "epoch": 0.25840978593272174, "grad_norm": 0.2818145011318806, "learning_rate": 8.829071137260194e-06, "loss": 0.3832, "num_tokens": 278277734.0, "step": 676 }, { "epoch": 0.2587920489296636, "grad_norm": 0.2849777327379941, "learning_rate": 8.825319289349716e-06, "loss": 0.3948, "num_tokens": 278668507.0, "step": 677 }, { "epoch": 0.2591743119266055, "grad_norm": 0.28558531620995986, "learning_rate": 8.821562342355935e-06, "loss": 0.3841, "num_tokens": 279040696.0, "step": 678 }, { "epoch": 0.2595565749235474, "grad_norm": 0.293298477758778, "learning_rate": 8.817800302039798e-06, "loss": 0.3814, "num_tokens": 279479711.0, "step": 679 }, { "epoch": 0.2599388379204893, "grad_norm": 0.2958623840205463, "learning_rate": 8.814033174170058e-06, "loss": 0.3641, "num_tokens": 279887367.0, "step": 680 }, { "epoch": 0.2603211009174312, "grad_norm": 0.3021930547346609, "learning_rate": 8.810260964523278e-06, "loss": 0.3764, "num_tokens": 280274359.0, "step": 681 }, { "epoch": 0.2607033639143731, "grad_norm": 0.27631890791386393, "learning_rate": 8.806483678883803e-06, "loss": 0.3884, "num_tokens": 280722130.0, "step": 682 }, { "epoch": 0.261085626911315, "grad_norm": 0.2787026800159375, "learning_rate": 8.80270132304377e-06, "loss": 0.4102, "num_tokens": 281192235.0, "step": 683 }, { "epoch": 0.26146788990825687, "grad_norm": 0.2875244754303217, "learning_rate": 8.79891390280309e-06, "loss": 0.3643, "num_tokens": 281574004.0, "step": 684 }, { "epoch": 0.26185015290519875, "grad_norm": 0.2871273613039571, "learning_rate": 8.795121423969432e-06, "loss": 0.3787, "num_tokens": 281953301.0, "step": 685 }, { "epoch": 0.2622324159021407, "grad_norm": 0.3093552039074778, "learning_rate": 8.791323892358229e-06, "loss": 0.3553, "num_tokens": 282340687.0, "step": 686 }, { "epoch": 0.2626146788990826, "grad_norm": 0.2795241201777922, "learning_rate": 8.78752131379266e-06, "loss": 0.3908, "num_tokens": 282714927.0, "step": 687 }, { "epoch": 0.26299694189602446, "grad_norm": 0.31152888701621595, "learning_rate": 8.783713694103645e-06, "loss": 0.3932, "num_tokens": 283135623.0, "step": 688 }, { "epoch": 0.26337920489296635, "grad_norm": 0.27282917193935974, "learning_rate": 8.779901039129832e-06, "loss": 0.3662, "num_tokens": 283541357.0, "step": 689 }, { "epoch": 0.26376146788990823, "grad_norm": 0.2860025357707848, "learning_rate": 8.776083354717587e-06, "loss": 0.3926, "num_tokens": 283964609.0, "step": 690 }, { "epoch": 0.26414373088685017, "grad_norm": 0.3006331324866127, "learning_rate": 8.772260646720997e-06, "loss": 0.3921, "num_tokens": 284388926.0, "step": 691 }, { "epoch": 0.26452599388379205, "grad_norm": 0.2717106153240549, "learning_rate": 8.76843292100184e-06, "loss": 0.3831, "num_tokens": 284825910.0, "step": 692 }, { "epoch": 0.26490825688073394, "grad_norm": 0.3313190538803319, "learning_rate": 8.764600183429604e-06, "loss": 0.3802, "num_tokens": 285231246.0, "step": 693 }, { "epoch": 0.2652905198776758, "grad_norm": 0.32448987086454234, "learning_rate": 8.760762439881447e-06, "loss": 0.3873, "num_tokens": 285662807.0, "step": 694 }, { "epoch": 0.26567278287461776, "grad_norm": 0.3013500656474795, "learning_rate": 8.756919696242212e-06, "loss": 0.405, "num_tokens": 286078151.0, "step": 695 }, { "epoch": 0.26605504587155965, "grad_norm": 0.28840308740113485, "learning_rate": 8.753071958404405e-06, "loss": 0.4126, "num_tokens": 286518934.0, "step": 696 }, { "epoch": 0.26643730886850153, "grad_norm": 0.27868315601677723, "learning_rate": 8.749219232268194e-06, "loss": 0.3991, "num_tokens": 286920211.0, "step": 697 }, { "epoch": 0.2668195718654434, "grad_norm": 0.3016691244999727, "learning_rate": 8.745361523741394e-06, "loss": 0.3907, "num_tokens": 287362784.0, "step": 698 }, { "epoch": 0.2672018348623853, "grad_norm": 0.2471065019793865, "learning_rate": 8.741498838739458e-06, "loss": 0.3862, "num_tokens": 287799619.0, "step": 699 }, { "epoch": 0.26758409785932724, "grad_norm": 0.26288091136846714, "learning_rate": 8.737631183185475e-06, "loss": 0.3865, "num_tokens": 288191426.0, "step": 700 }, { "epoch": 0.2679663608562691, "grad_norm": 0.24829923447744354, "learning_rate": 8.733758563010152e-06, "loss": 0.3597, "num_tokens": 288591548.0, "step": 701 }, { "epoch": 0.268348623853211, "grad_norm": 0.27357169562434436, "learning_rate": 8.72988098415181e-06, "loss": 0.3648, "num_tokens": 289013156.0, "step": 702 }, { "epoch": 0.2687308868501529, "grad_norm": 0.2781983568262563, "learning_rate": 8.72599845255637e-06, "loss": 0.3565, "num_tokens": 289389602.0, "step": 703 }, { "epoch": 0.2691131498470948, "grad_norm": 0.2889515173153062, "learning_rate": 8.722110974177356e-06, "loss": 0.3995, "num_tokens": 289865801.0, "step": 704 }, { "epoch": 0.2694954128440367, "grad_norm": 0.27579678422743137, "learning_rate": 8.718218554975872e-06, "loss": 0.4003, "num_tokens": 290333381.0, "step": 705 }, { "epoch": 0.2698776758409786, "grad_norm": 0.2628651853937428, "learning_rate": 8.714321200920596e-06, "loss": 0.3684, "num_tokens": 290762130.0, "step": 706 }, { "epoch": 0.2702599388379205, "grad_norm": 0.30181639238356406, "learning_rate": 8.710418917987779e-06, "loss": 0.3933, "num_tokens": 291186850.0, "step": 707 }, { "epoch": 0.2706422018348624, "grad_norm": 0.2936170240206027, "learning_rate": 8.706511712161225e-06, "loss": 0.3796, "num_tokens": 291616959.0, "step": 708 }, { "epoch": 0.27102446483180426, "grad_norm": 0.278875878813826, "learning_rate": 8.70259958943229e-06, "loss": 0.3936, "num_tokens": 292048270.0, "step": 709 }, { "epoch": 0.2714067278287462, "grad_norm": 0.31209934903950987, "learning_rate": 8.698682555799868e-06, "loss": 0.3909, "num_tokens": 292505278.0, "step": 710 }, { "epoch": 0.2717889908256881, "grad_norm": 0.28429027756426195, "learning_rate": 8.694760617270386e-06, "loss": 0.3744, "num_tokens": 292908293.0, "step": 711 }, { "epoch": 0.27217125382262997, "grad_norm": 0.26581833855855097, "learning_rate": 8.690833779857788e-06, "loss": 0.3687, "num_tokens": 293336209.0, "step": 712 }, { "epoch": 0.27255351681957185, "grad_norm": 0.2769884388765986, "learning_rate": 8.68690204958353e-06, "loss": 0.3796, "num_tokens": 293731923.0, "step": 713 }, { "epoch": 0.27293577981651373, "grad_norm": 0.31417430839562893, "learning_rate": 8.682965432476579e-06, "loss": 0.3638, "num_tokens": 294106658.0, "step": 714 }, { "epoch": 0.2733180428134557, "grad_norm": 0.26027126113633997, "learning_rate": 8.679023934573385e-06, "loss": 0.3581, "num_tokens": 294510431.0, "step": 715 }, { "epoch": 0.27370030581039756, "grad_norm": 0.28723775787094785, "learning_rate": 8.675077561917888e-06, "loss": 0.3719, "num_tokens": 294867776.0, "step": 716 }, { "epoch": 0.27408256880733944, "grad_norm": 0.24922840961693052, "learning_rate": 8.671126320561501e-06, "loss": 0.3482, "num_tokens": 295273414.0, "step": 717 }, { "epoch": 0.27446483180428133, "grad_norm": 0.31043949172171165, "learning_rate": 8.667170216563103e-06, "loss": 0.3803, "num_tokens": 295728699.0, "step": 718 }, { "epoch": 0.27484709480122327, "grad_norm": 0.27963429578181503, "learning_rate": 8.663209255989033e-06, "loss": 0.3453, "num_tokens": 296159538.0, "step": 719 }, { "epoch": 0.27522935779816515, "grad_norm": 0.3165142671130839, "learning_rate": 8.65924344491307e-06, "loss": 0.3637, "num_tokens": 296538139.0, "step": 720 }, { "epoch": 0.27561162079510704, "grad_norm": 0.27727612936600626, "learning_rate": 8.65527278941644e-06, "loss": 0.3646, "num_tokens": 297000994.0, "step": 721 }, { "epoch": 0.2759938837920489, "grad_norm": 0.29897642761759347, "learning_rate": 8.651297295587788e-06, "loss": 0.3626, "num_tokens": 297390031.0, "step": 722 }, { "epoch": 0.2763761467889908, "grad_norm": 0.29935050424760723, "learning_rate": 8.647316969523185e-06, "loss": 0.3714, "num_tokens": 297809046.0, "step": 723 }, { "epoch": 0.27675840978593275, "grad_norm": 0.3003356809500152, "learning_rate": 8.643331817326105e-06, "loss": 0.3742, "num_tokens": 298190175.0, "step": 724 }, { "epoch": 0.27714067278287463, "grad_norm": 0.2866478373834503, "learning_rate": 8.639341845107432e-06, "loss": 0.3831, "num_tokens": 298607049.0, "step": 725 }, { "epoch": 0.2775229357798165, "grad_norm": 0.2766505825818346, "learning_rate": 8.635347058985433e-06, "loss": 0.3666, "num_tokens": 298961466.0, "step": 726 }, { "epoch": 0.2779051987767584, "grad_norm": 0.2765944526538987, "learning_rate": 8.63134746508576e-06, "loss": 0.3779, "num_tokens": 299348078.0, "step": 727 }, { "epoch": 0.2782874617737003, "grad_norm": 0.2803137196937322, "learning_rate": 8.627343069541438e-06, "loss": 0.3832, "num_tokens": 299768200.0, "step": 728 }, { "epoch": 0.2786697247706422, "grad_norm": 0.2810019152920437, "learning_rate": 8.623333878492853e-06, "loss": 0.3687, "num_tokens": 300207021.0, "step": 729 }, { "epoch": 0.2790519877675841, "grad_norm": 0.29718788487212, "learning_rate": 8.619319898087744e-06, "loss": 0.3602, "num_tokens": 300597918.0, "step": 730 }, { "epoch": 0.279434250764526, "grad_norm": 0.26190886460912044, "learning_rate": 8.615301134481196e-06, "loss": 0.38, "num_tokens": 301013416.0, "step": 731 }, { "epoch": 0.2798165137614679, "grad_norm": 0.3021699373984725, "learning_rate": 8.611277593835631e-06, "loss": 0.376, "num_tokens": 301477349.0, "step": 732 }, { "epoch": 0.28019877675840976, "grad_norm": 0.27704132166165096, "learning_rate": 8.60724928232079e-06, "loss": 0.3887, "num_tokens": 301920036.0, "step": 733 }, { "epoch": 0.2805810397553517, "grad_norm": 0.2727866389698373, "learning_rate": 8.603216206113731e-06, "loss": 0.3812, "num_tokens": 302344946.0, "step": 734 }, { "epoch": 0.2809633027522936, "grad_norm": 0.27559165318625345, "learning_rate": 8.599178371398821e-06, "loss": 0.3865, "num_tokens": 302731410.0, "step": 735 }, { "epoch": 0.28134556574923547, "grad_norm": 0.2714954579309321, "learning_rate": 8.595135784367726e-06, "loss": 0.3792, "num_tokens": 303131112.0, "step": 736 }, { "epoch": 0.28172782874617736, "grad_norm": 0.2640815619811983, "learning_rate": 8.591088451219393e-06, "loss": 0.3778, "num_tokens": 303533874.0, "step": 737 }, { "epoch": 0.28211009174311924, "grad_norm": 0.31661686967028485, "learning_rate": 8.58703637816005e-06, "loss": 0.3766, "num_tokens": 303974027.0, "step": 738 }, { "epoch": 0.2824923547400612, "grad_norm": 0.27389367959925154, "learning_rate": 8.582979571403195e-06, "loss": 0.4082, "num_tokens": 304468445.0, "step": 739 }, { "epoch": 0.28287461773700306, "grad_norm": 0.24764984676801477, "learning_rate": 8.57891803716958e-06, "loss": 0.3864, "num_tokens": 304914490.0, "step": 740 }, { "epoch": 0.28325688073394495, "grad_norm": 0.2901084235574226, "learning_rate": 8.57485178168721e-06, "loss": 0.3652, "num_tokens": 305284830.0, "step": 741 }, { "epoch": 0.28363914373088683, "grad_norm": 0.270887607799054, "learning_rate": 8.57078081119133e-06, "loss": 0.3723, "num_tokens": 305699531.0, "step": 742 }, { "epoch": 0.2840214067278288, "grad_norm": 0.2855770410971349, "learning_rate": 8.566705131924413e-06, "loss": 0.3781, "num_tokens": 306133130.0, "step": 743 }, { "epoch": 0.28440366972477066, "grad_norm": 0.24434254572257388, "learning_rate": 8.56262475013615e-06, "loss": 0.3861, "num_tokens": 306579839.0, "step": 744 }, { "epoch": 0.28478593272171254, "grad_norm": 0.2770541943061836, "learning_rate": 8.558539672083448e-06, "loss": 0.374, "num_tokens": 306985329.0, "step": 745 }, { "epoch": 0.2851681957186544, "grad_norm": 0.28714250206636605, "learning_rate": 8.554449904030416e-06, "loss": 0.396, "num_tokens": 307426256.0, "step": 746 }, { "epoch": 0.2855504587155963, "grad_norm": 0.27453262609510837, "learning_rate": 8.550355452248347e-06, "loss": 0.3919, "num_tokens": 307835931.0, "step": 747 }, { "epoch": 0.28593272171253825, "grad_norm": 0.31968552433669295, "learning_rate": 8.546256323015723e-06, "loss": 0.3672, "num_tokens": 308236698.0, "step": 748 }, { "epoch": 0.28631498470948014, "grad_norm": 0.32285894167829227, "learning_rate": 8.542152522618196e-06, "loss": 0.4107, "num_tokens": 308695472.0, "step": 749 }, { "epoch": 0.286697247706422, "grad_norm": 0.32952689427958964, "learning_rate": 8.538044057348585e-06, "loss": 0.386, "num_tokens": 309148218.0, "step": 750 }, { "epoch": 0.2870795107033639, "grad_norm": 0.31927865084718376, "learning_rate": 8.533930933506854e-06, "loss": 0.3685, "num_tokens": 309526917.0, "step": 751 }, { "epoch": 0.2874617737003058, "grad_norm": 0.33372841073858306, "learning_rate": 8.529813157400116e-06, "loss": 0.4004, "num_tokens": 309961971.0, "step": 752 }, { "epoch": 0.28784403669724773, "grad_norm": 0.32449695145638346, "learning_rate": 8.525690735342618e-06, "loss": 0.3837, "num_tokens": 310359029.0, "step": 753 }, { "epoch": 0.2882262996941896, "grad_norm": 0.32159938945060784, "learning_rate": 8.52156367365573e-06, "loss": 0.3939, "num_tokens": 310779312.0, "step": 754 }, { "epoch": 0.2886085626911315, "grad_norm": 0.3457882631189294, "learning_rate": 8.517431978667934e-06, "loss": 0.3934, "num_tokens": 311175439.0, "step": 755 }, { "epoch": 0.2889908256880734, "grad_norm": 0.2988611167440456, "learning_rate": 8.513295656714822e-06, "loss": 0.3784, "num_tokens": 311598407.0, "step": 756 }, { "epoch": 0.28937308868501527, "grad_norm": 0.31001475480325874, "learning_rate": 8.509154714139077e-06, "loss": 0.376, "num_tokens": 311983732.0, "step": 757 }, { "epoch": 0.2897553516819572, "grad_norm": 0.36449002448110834, "learning_rate": 8.50500915729047e-06, "loss": 0.3818, "num_tokens": 312405697.0, "step": 758 }, { "epoch": 0.2901376146788991, "grad_norm": 0.268266230214534, "learning_rate": 8.50085899252584e-06, "loss": 0.4083, "num_tokens": 312870943.0, "step": 759 }, { "epoch": 0.290519877675841, "grad_norm": 0.2930844917493806, "learning_rate": 8.496704226209107e-06, "loss": 0.3789, "num_tokens": 313232099.0, "step": 760 }, { "epoch": 0.29090214067278286, "grad_norm": 0.32348150559034744, "learning_rate": 8.492544864711234e-06, "loss": 0.3851, "num_tokens": 313660300.0, "step": 761 }, { "epoch": 0.29128440366972475, "grad_norm": 0.32745516459013124, "learning_rate": 8.48838091441023e-06, "loss": 0.3893, "num_tokens": 314103038.0, "step": 762 }, { "epoch": 0.2916666666666667, "grad_norm": 0.2661326440165071, "learning_rate": 8.484212381691154e-06, "loss": 0.3703, "num_tokens": 314515318.0, "step": 763 }, { "epoch": 0.29204892966360857, "grad_norm": 0.3050658620939666, "learning_rate": 8.480039272946076e-06, "loss": 0.384, "num_tokens": 314917771.0, "step": 764 }, { "epoch": 0.29243119266055045, "grad_norm": 0.2967550087655357, "learning_rate": 8.47586159457409e-06, "loss": 0.3942, "num_tokens": 315346405.0, "step": 765 }, { "epoch": 0.29281345565749234, "grad_norm": 0.2810427141876906, "learning_rate": 8.471679352981297e-06, "loss": 0.3753, "num_tokens": 315759265.0, "step": 766 }, { "epoch": 0.2931957186544342, "grad_norm": 0.2948213047299878, "learning_rate": 8.467492554580797e-06, "loss": 0.4113, "num_tokens": 316195881.0, "step": 767 }, { "epoch": 0.29357798165137616, "grad_norm": 0.3579677906421847, "learning_rate": 8.463301205792675e-06, "loss": 0.3805, "num_tokens": 316606942.0, "step": 768 }, { "epoch": 0.29396024464831805, "grad_norm": 0.2564193853033097, "learning_rate": 8.45910531304399e-06, "loss": 0.3939, "num_tokens": 317006053.0, "step": 769 }, { "epoch": 0.29434250764525993, "grad_norm": 0.2638439702177443, "learning_rate": 8.45490488276878e-06, "loss": 0.3775, "num_tokens": 317374484.0, "step": 770 }, { "epoch": 0.2947247706422018, "grad_norm": 0.29373435359595196, "learning_rate": 8.450699921408026e-06, "loss": 0.3871, "num_tokens": 317789255.0, "step": 771 }, { "epoch": 0.29510703363914376, "grad_norm": 0.2884764316999055, "learning_rate": 8.44649043540967e-06, "loss": 0.3678, "num_tokens": 318176220.0, "step": 772 }, { "epoch": 0.29548929663608564, "grad_norm": 0.2693005981826586, "learning_rate": 8.442276431228585e-06, "loss": 0.3928, "num_tokens": 318583073.0, "step": 773 }, { "epoch": 0.2958715596330275, "grad_norm": 0.30183890055638374, "learning_rate": 8.438057915326573e-06, "loss": 0.4048, "num_tokens": 319028991.0, "step": 774 }, { "epoch": 0.2962538226299694, "grad_norm": 0.30575919247306826, "learning_rate": 8.433834894172359e-06, "loss": 0.3692, "num_tokens": 319402762.0, "step": 775 }, { "epoch": 0.2966360856269113, "grad_norm": 0.28600415998443984, "learning_rate": 8.429607374241567e-06, "loss": 0.3822, "num_tokens": 319810973.0, "step": 776 }, { "epoch": 0.29701834862385323, "grad_norm": 0.31645424914189696, "learning_rate": 8.425375362016729e-06, "loss": 0.4085, "num_tokens": 320239788.0, "step": 777 }, { "epoch": 0.2974006116207951, "grad_norm": 0.3030857077387875, "learning_rate": 8.421138863987262e-06, "loss": 0.384, "num_tokens": 320647245.0, "step": 778 }, { "epoch": 0.297782874617737, "grad_norm": 0.3428064915470499, "learning_rate": 8.416897886649462e-06, "loss": 0.3671, "num_tokens": 321026601.0, "step": 779 }, { "epoch": 0.2981651376146789, "grad_norm": 0.27782104974462735, "learning_rate": 8.412652436506492e-06, "loss": 0.3837, "num_tokens": 321430085.0, "step": 780 }, { "epoch": 0.2985474006116208, "grad_norm": 0.2745067472708174, "learning_rate": 8.408402520068371e-06, "loss": 0.3669, "num_tokens": 321818195.0, "step": 781 }, { "epoch": 0.2989296636085627, "grad_norm": 0.32885045001492175, "learning_rate": 8.404148143851977e-06, "loss": 0.3954, "num_tokens": 322243468.0, "step": 782 }, { "epoch": 0.2993119266055046, "grad_norm": 0.3230025919038844, "learning_rate": 8.399889314381016e-06, "loss": 0.3628, "num_tokens": 322638724.0, "step": 783 }, { "epoch": 0.2996941896024465, "grad_norm": 0.28737563103911107, "learning_rate": 8.395626038186027e-06, "loss": 0.3793, "num_tokens": 323058170.0, "step": 784 }, { "epoch": 0.30007645259938837, "grad_norm": 0.31428939590706056, "learning_rate": 8.391358321804367e-06, "loss": 0.3726, "num_tokens": 323463264.0, "step": 785 }, { "epoch": 0.30045871559633025, "grad_norm": 0.2823391673320193, "learning_rate": 8.387086171780204e-06, "loss": 0.3838, "num_tokens": 323873845.0, "step": 786 }, { "epoch": 0.3008409785932722, "grad_norm": 0.2889922907590903, "learning_rate": 8.382809594664502e-06, "loss": 0.396, "num_tokens": 324277009.0, "step": 787 }, { "epoch": 0.3012232415902141, "grad_norm": 0.27363369623365413, "learning_rate": 8.378528597015011e-06, "loss": 0.3916, "num_tokens": 324664966.0, "step": 788 }, { "epoch": 0.30160550458715596, "grad_norm": 0.2955633045303292, "learning_rate": 8.374243185396265e-06, "loss": 0.3827, "num_tokens": 325090059.0, "step": 789 }, { "epoch": 0.30198776758409784, "grad_norm": 0.28679650634294424, "learning_rate": 8.369953366379567e-06, "loss": 0.3714, "num_tokens": 325521271.0, "step": 790 }, { "epoch": 0.30237003058103973, "grad_norm": 0.2889791268461004, "learning_rate": 8.365659146542973e-06, "loss": 0.3585, "num_tokens": 325935003.0, "step": 791 }, { "epoch": 0.30275229357798167, "grad_norm": 0.3098637952148606, "learning_rate": 8.361360532471287e-06, "loss": 0.3638, "num_tokens": 326337208.0, "step": 792 }, { "epoch": 0.30313455657492355, "grad_norm": 0.2866305101051465, "learning_rate": 8.357057530756055e-06, "loss": 0.3917, "num_tokens": 326778364.0, "step": 793 }, { "epoch": 0.30351681957186544, "grad_norm": 0.31695790126832774, "learning_rate": 8.352750147995552e-06, "loss": 0.3849, "num_tokens": 327176568.0, "step": 794 }, { "epoch": 0.3038990825688073, "grad_norm": 0.27524735027753755, "learning_rate": 8.34843839079477e-06, "loss": 0.3484, "num_tokens": 327587243.0, "step": 795 }, { "epoch": 0.30428134556574926, "grad_norm": 0.3408168675009145, "learning_rate": 8.344122265765404e-06, "loss": 0.3702, "num_tokens": 327981583.0, "step": 796 }, { "epoch": 0.30466360856269115, "grad_norm": 0.35280196139374664, "learning_rate": 8.33980177952585e-06, "loss": 0.3731, "num_tokens": 328372050.0, "step": 797 }, { "epoch": 0.30504587155963303, "grad_norm": 0.3116024236215618, "learning_rate": 8.335476938701195e-06, "loss": 0.3912, "num_tokens": 328795145.0, "step": 798 }, { "epoch": 0.3054281345565749, "grad_norm": 0.3190795295981246, "learning_rate": 8.331147749923199e-06, "loss": 0.3929, "num_tokens": 329239546.0, "step": 799 }, { "epoch": 0.3058103975535168, "grad_norm": 0.3413594692377354, "learning_rate": 8.326814219830291e-06, "loss": 0.3862, "num_tokens": 329658125.0, "step": 800 }, { "epoch": 0.30619266055045874, "grad_norm": 0.2968394469777086, "learning_rate": 8.322476355067556e-06, "loss": 0.3752, "num_tokens": 330116826.0, "step": 801 }, { "epoch": 0.3065749235474006, "grad_norm": 0.3303435870483519, "learning_rate": 8.318134162286726e-06, "loss": 0.3952, "num_tokens": 330547109.0, "step": 802 }, { "epoch": 0.3069571865443425, "grad_norm": 0.2798893906222307, "learning_rate": 8.31378764814617e-06, "loss": 0.3427, "num_tokens": 330918289.0, "step": 803 }, { "epoch": 0.3073394495412844, "grad_norm": 0.3012914244458347, "learning_rate": 8.309436819310884e-06, "loss": 0.3783, "num_tokens": 331353693.0, "step": 804 }, { "epoch": 0.3077217125382263, "grad_norm": 0.2597769895819, "learning_rate": 8.30508168245248e-06, "loss": 0.3761, "num_tokens": 331747510.0, "step": 805 }, { "epoch": 0.3081039755351682, "grad_norm": 0.30247055960385544, "learning_rate": 8.300722244249174e-06, "loss": 0.3606, "num_tokens": 332105174.0, "step": 806 }, { "epoch": 0.3084862385321101, "grad_norm": 0.29519748138924146, "learning_rate": 8.296358511385778e-06, "loss": 0.3851, "num_tokens": 332527760.0, "step": 807 }, { "epoch": 0.308868501529052, "grad_norm": 0.3112737830105025, "learning_rate": 8.291990490553696e-06, "loss": 0.3873, "num_tokens": 332932733.0, "step": 808 }, { "epoch": 0.30925076452599387, "grad_norm": 0.25537381687849214, "learning_rate": 8.287618188450896e-06, "loss": 0.3805, "num_tokens": 333386922.0, "step": 809 }, { "epoch": 0.30963302752293576, "grad_norm": 0.2954356875281406, "learning_rate": 8.283241611781922e-06, "loss": 0.3712, "num_tokens": 333770824.0, "step": 810 }, { "epoch": 0.3100152905198777, "grad_norm": 0.2759798212966893, "learning_rate": 8.278860767257865e-06, "loss": 0.3651, "num_tokens": 334174271.0, "step": 811 }, { "epoch": 0.3103975535168196, "grad_norm": 0.2827857374899756, "learning_rate": 8.274475661596361e-06, "loss": 0.3718, "num_tokens": 334558524.0, "step": 812 }, { "epoch": 0.31077981651376146, "grad_norm": 0.33900576718314124, "learning_rate": 8.270086301521587e-06, "loss": 0.3894, "num_tokens": 334953337.0, "step": 813 }, { "epoch": 0.31116207951070335, "grad_norm": 0.2801043854468926, "learning_rate": 8.265692693764235e-06, "loss": 0.3755, "num_tokens": 335362712.0, "step": 814 }, { "epoch": 0.31154434250764523, "grad_norm": 0.26568231939707143, "learning_rate": 8.261294845061516e-06, "loss": 0.3883, "num_tokens": 335772396.0, "step": 815 }, { "epoch": 0.3119266055045872, "grad_norm": 0.2669515365575605, "learning_rate": 8.256892762157141e-06, "loss": 0.3886, "num_tokens": 336167450.0, "step": 816 }, { "epoch": 0.31230886850152906, "grad_norm": 0.2928797893508793, "learning_rate": 8.252486451801315e-06, "loss": 0.3475, "num_tokens": 336530899.0, "step": 817 }, { "epoch": 0.31269113149847094, "grad_norm": 0.33902844016401745, "learning_rate": 8.24807592075073e-06, "loss": 0.3771, "num_tokens": 336947791.0, "step": 818 }, { "epoch": 0.3130733944954128, "grad_norm": 0.30928114189623757, "learning_rate": 8.24366117576854e-06, "loss": 0.378, "num_tokens": 337380589.0, "step": 819 }, { "epoch": 0.31345565749235477, "grad_norm": 0.2774468482510209, "learning_rate": 8.23924222362437e-06, "loss": 0.3774, "num_tokens": 337810888.0, "step": 820 }, { "epoch": 0.31383792048929665, "grad_norm": 0.35037212766222336, "learning_rate": 8.234819071094289e-06, "loss": 0.3871, "num_tokens": 338208041.0, "step": 821 }, { "epoch": 0.31422018348623854, "grad_norm": 0.34283706081215903, "learning_rate": 8.230391724960814e-06, "loss": 0.3996, "num_tokens": 338600734.0, "step": 822 }, { "epoch": 0.3146024464831804, "grad_norm": 0.314867977039219, "learning_rate": 8.225960192012887e-06, "loss": 0.4094, "num_tokens": 339007520.0, "step": 823 }, { "epoch": 0.3149847094801223, "grad_norm": 0.301015476763335, "learning_rate": 8.221524479045875e-06, "loss": 0.3955, "num_tokens": 339426478.0, "step": 824 }, { "epoch": 0.31536697247706424, "grad_norm": 0.2928348904269848, "learning_rate": 8.217084592861549e-06, "loss": 0.3654, "num_tokens": 339854612.0, "step": 825 }, { "epoch": 0.31574923547400613, "grad_norm": 0.3087693238588224, "learning_rate": 8.212640540268083e-06, "loss": 0.4166, "num_tokens": 340259865.0, "step": 826 }, { "epoch": 0.316131498470948, "grad_norm": 0.2511929015522724, "learning_rate": 8.208192328080038e-06, "loss": 0.3701, "num_tokens": 340670669.0, "step": 827 }, { "epoch": 0.3165137614678899, "grad_norm": 0.2760545911932065, "learning_rate": 8.203739963118358e-06, "loss": 0.378, "num_tokens": 341086664.0, "step": 828 }, { "epoch": 0.3168960244648318, "grad_norm": 0.27724973865904723, "learning_rate": 8.199283452210346e-06, "loss": 0.3711, "num_tokens": 341465346.0, "step": 829 }, { "epoch": 0.3172782874617737, "grad_norm": 0.2656431822246982, "learning_rate": 8.194822802189671e-06, "loss": 0.3779, "num_tokens": 341876848.0, "step": 830 }, { "epoch": 0.3176605504587156, "grad_norm": 0.27088161710759145, "learning_rate": 8.190358019896347e-06, "loss": 0.3588, "num_tokens": 342250241.0, "step": 831 }, { "epoch": 0.3180428134556575, "grad_norm": 0.2854141493385398, "learning_rate": 8.18588911217672e-06, "loss": 0.4042, "num_tokens": 342716991.0, "step": 832 }, { "epoch": 0.3184250764525994, "grad_norm": 0.288111898140903, "learning_rate": 8.181416085883467e-06, "loss": 0.3937, "num_tokens": 343096531.0, "step": 833 }, { "epoch": 0.31880733944954126, "grad_norm": 0.2872876611851775, "learning_rate": 8.176938947875577e-06, "loss": 0.3968, "num_tokens": 343521356.0, "step": 834 }, { "epoch": 0.3191896024464832, "grad_norm": 0.28721408327635395, "learning_rate": 8.172457705018347e-06, "loss": 0.3801, "num_tokens": 343940406.0, "step": 835 }, { "epoch": 0.3195718654434251, "grad_norm": 0.28987605333774863, "learning_rate": 8.167972364183365e-06, "loss": 0.365, "num_tokens": 344343179.0, "step": 836 }, { "epoch": 0.31995412844036697, "grad_norm": 0.2686670132156234, "learning_rate": 8.163482932248507e-06, "loss": 0.3688, "num_tokens": 344772760.0, "step": 837 }, { "epoch": 0.32033639143730885, "grad_norm": 0.3091785764889582, "learning_rate": 8.15898941609792e-06, "loss": 0.3839, "num_tokens": 345172609.0, "step": 838 }, { "epoch": 0.32071865443425074, "grad_norm": 0.2830381386223011, "learning_rate": 8.154491822622013e-06, "loss": 0.3852, "num_tokens": 345593278.0, "step": 839 }, { "epoch": 0.3211009174311927, "grad_norm": 0.27614061174230775, "learning_rate": 8.149990158717448e-06, "loss": 0.3948, "num_tokens": 346006505.0, "step": 840 }, { "epoch": 0.32148318042813456, "grad_norm": 0.31682696550944794, "learning_rate": 8.14548443128713e-06, "loss": 0.3749, "num_tokens": 346410269.0, "step": 841 }, { "epoch": 0.32186544342507645, "grad_norm": 0.27666500804697863, "learning_rate": 8.140974647240194e-06, "loss": 0.3973, "num_tokens": 346839973.0, "step": 842 }, { "epoch": 0.32224770642201833, "grad_norm": 0.27084010090748667, "learning_rate": 8.136460813491992e-06, "loss": 0.3754, "num_tokens": 347236802.0, "step": 843 }, { "epoch": 0.32262996941896027, "grad_norm": 0.30049596788616667, "learning_rate": 8.131942936964095e-06, "loss": 0.3733, "num_tokens": 347662619.0, "step": 844 }, { "epoch": 0.32301223241590216, "grad_norm": 0.2747454301894514, "learning_rate": 8.127421024584262e-06, "loss": 0.3916, "num_tokens": 348089652.0, "step": 845 }, { "epoch": 0.32339449541284404, "grad_norm": 0.2898628721090736, "learning_rate": 8.122895083286452e-06, "loss": 0.3704, "num_tokens": 348463724.0, "step": 846 }, { "epoch": 0.3237767584097859, "grad_norm": 0.31609406768503884, "learning_rate": 8.11836512001079e-06, "loss": 0.3811, "num_tokens": 348865207.0, "step": 847 }, { "epoch": 0.3241590214067278, "grad_norm": 0.2750981588647409, "learning_rate": 8.113831141703576e-06, "loss": 0.3649, "num_tokens": 349296137.0, "step": 848 }, { "epoch": 0.32454128440366975, "grad_norm": 0.2798833771485608, "learning_rate": 8.109293155317267e-06, "loss": 0.3817, "num_tokens": 349732349.0, "step": 849 }, { "epoch": 0.32492354740061163, "grad_norm": 0.33848303426295, "learning_rate": 8.104751167810463e-06, "loss": 0.3876, "num_tokens": 350150520.0, "step": 850 }, { "epoch": 0.3253058103975535, "grad_norm": 0.29957109777987384, "learning_rate": 8.100205186147899e-06, "loss": 0.3735, "num_tokens": 350512173.0, "step": 851 }, { "epoch": 0.3256880733944954, "grad_norm": 0.29006854786153796, "learning_rate": 8.095655217300439e-06, "loss": 0.3803, "num_tokens": 350938344.0, "step": 852 }, { "epoch": 0.3260703363914373, "grad_norm": 0.28646832984831927, "learning_rate": 8.091101268245057e-06, "loss": 0.3714, "num_tokens": 351355419.0, "step": 853 }, { "epoch": 0.3264525993883792, "grad_norm": 0.2856206241346726, "learning_rate": 8.086543345964833e-06, "loss": 0.3787, "num_tokens": 351791806.0, "step": 854 }, { "epoch": 0.3268348623853211, "grad_norm": 0.2875238330480725, "learning_rate": 8.081981457448935e-06, "loss": 0.3614, "num_tokens": 352219956.0, "step": 855 }, { "epoch": 0.327217125382263, "grad_norm": 0.24624973609663148, "learning_rate": 8.077415609692617e-06, "loss": 0.3785, "num_tokens": 352658717.0, "step": 856 }, { "epoch": 0.3275993883792049, "grad_norm": 0.2953148164584077, "learning_rate": 8.072845809697205e-06, "loss": 0.3805, "num_tokens": 353039804.0, "step": 857 }, { "epoch": 0.32798165137614677, "grad_norm": 0.3364361285995865, "learning_rate": 8.06827206447008e-06, "loss": 0.3818, "num_tokens": 353431576.0, "step": 858 }, { "epoch": 0.3283639143730887, "grad_norm": 0.3013234211312168, "learning_rate": 8.06369438102468e-06, "loss": 0.359, "num_tokens": 353820728.0, "step": 859 }, { "epoch": 0.3287461773700306, "grad_norm": 0.2701597137317539, "learning_rate": 8.059112766380476e-06, "loss": 0.3773, "num_tokens": 354233950.0, "step": 860 }, { "epoch": 0.3291284403669725, "grad_norm": 0.317078526479236, "learning_rate": 8.05452722756297e-06, "loss": 0.3692, "num_tokens": 354663652.0, "step": 861 }, { "epoch": 0.32951070336391436, "grad_norm": 0.3203424189404694, "learning_rate": 8.04993777160368e-06, "loss": 0.38, "num_tokens": 355038922.0, "step": 862 }, { "epoch": 0.32989296636085624, "grad_norm": 0.3205784823288829, "learning_rate": 8.04534440554013e-06, "loss": 0.4008, "num_tokens": 355472124.0, "step": 863 }, { "epoch": 0.3302752293577982, "grad_norm": 0.3187031346293902, "learning_rate": 8.040747136415843e-06, "loss": 0.3845, "num_tokens": 355866387.0, "step": 864 }, { "epoch": 0.33065749235474007, "grad_norm": 0.30219183265667704, "learning_rate": 8.036145971280325e-06, "loss": 0.3776, "num_tokens": 356267929.0, "step": 865 }, { "epoch": 0.33103975535168195, "grad_norm": 0.2581633352096232, "learning_rate": 8.031540917189056e-06, "loss": 0.3733, "num_tokens": 356677738.0, "step": 866 }, { "epoch": 0.33142201834862384, "grad_norm": 0.2749408289166452, "learning_rate": 8.026931981203477e-06, "loss": 0.3715, "num_tokens": 357091895.0, "step": 867 }, { "epoch": 0.3318042813455658, "grad_norm": 0.2922281465058763, "learning_rate": 8.022319170390987e-06, "loss": 0.42, "num_tokens": 357506950.0, "step": 868 }, { "epoch": 0.33218654434250766, "grad_norm": 0.27818724227876357, "learning_rate": 8.017702491824924e-06, "loss": 0.3785, "num_tokens": 357946281.0, "step": 869 }, { "epoch": 0.33256880733944955, "grad_norm": 0.282792537961623, "learning_rate": 8.013081952584555e-06, "loss": 0.3924, "num_tokens": 358373468.0, "step": 870 }, { "epoch": 0.33295107033639143, "grad_norm": 0.2965260117127636, "learning_rate": 8.00845755975507e-06, "loss": 0.4193, "num_tokens": 358804364.0, "step": 871 }, { "epoch": 0.3333333333333333, "grad_norm": 0.28160253414681796, "learning_rate": 8.003829320427564e-06, "loss": 0.3889, "num_tokens": 359247482.0, "step": 872 }, { "epoch": 0.33371559633027525, "grad_norm": 0.4947661139081739, "learning_rate": 7.999197241699035e-06, "loss": 0.3387, "num_tokens": 359327082.0, "step": 873 }, { "epoch": 1.0003822629969419, "grad_norm": 0.2734509081072418, "learning_rate": 7.994561330672367e-06, "loss": 0.3826, "num_tokens": 359778882.0, "step": 874 }, { "epoch": 1.0007645259938838, "grad_norm": 0.24753083362156889, "learning_rate": 7.989921594456318e-06, "loss": 0.3645, "num_tokens": 360186765.0, "step": 875 }, { "epoch": 1.0011467889908257, "grad_norm": 0.24033503142348886, "learning_rate": 7.985278040165519e-06, "loss": 0.3666, "num_tokens": 360615600.0, "step": 876 }, { "epoch": 1.0015290519877675, "grad_norm": 0.26373030832153915, "learning_rate": 7.980630674920445e-06, "loss": 0.3663, "num_tokens": 361032900.0, "step": 877 }, { "epoch": 1.0019113149847094, "grad_norm": 0.2509618885004304, "learning_rate": 7.97597950584742e-06, "loss": 0.3796, "num_tokens": 361524678.0, "step": 878 }, { "epoch": 1.0022935779816513, "grad_norm": 0.2871807042607093, "learning_rate": 7.9713245400786e-06, "loss": 0.3793, "num_tokens": 361949558.0, "step": 879 }, { "epoch": 1.0026758409785932, "grad_norm": 0.2709460830140365, "learning_rate": 7.966665784751969e-06, "loss": 0.3583, "num_tokens": 362329075.0, "step": 880 }, { "epoch": 1.003058103975535, "grad_norm": 0.2583704557425144, "learning_rate": 7.96200324701131e-06, "loss": 0.4021, "num_tokens": 362742516.0, "step": 881 }, { "epoch": 1.003440366972477, "grad_norm": 0.29727131810007523, "learning_rate": 7.957336934006218e-06, "loss": 0.3779, "num_tokens": 363148339.0, "step": 882 }, { "epoch": 1.003822629969419, "grad_norm": 0.28671444584050393, "learning_rate": 7.952666852892069e-06, "loss": 0.3656, "num_tokens": 363580955.0, "step": 883 }, { "epoch": 1.004204892966361, "grad_norm": 0.2991736165753305, "learning_rate": 7.947993010830021e-06, "loss": 0.3972, "num_tokens": 363997869.0, "step": 884 }, { "epoch": 1.0045871559633028, "grad_norm": 0.29466946874459293, "learning_rate": 7.943315414986998e-06, "loss": 0.3966, "num_tokens": 364378013.0, "step": 885 }, { "epoch": 1.0049694189602447, "grad_norm": 0.28255851877647997, "learning_rate": 7.938634072535675e-06, "loss": 0.375, "num_tokens": 364776227.0, "step": 886 }, { "epoch": 1.0053516819571866, "grad_norm": 0.25214491049081006, "learning_rate": 7.933948990654485e-06, "loss": 0.3951, "num_tokens": 365197610.0, "step": 887 }, { "epoch": 1.0057339449541285, "grad_norm": 0.28402684870030326, "learning_rate": 7.92926017652758e-06, "loss": 0.3613, "num_tokens": 365598931.0, "step": 888 }, { "epoch": 1.0061162079510704, "grad_norm": 0.2886360741100843, "learning_rate": 7.924567637344847e-06, "loss": 0.3662, "num_tokens": 365981513.0, "step": 889 }, { "epoch": 1.0064984709480123, "grad_norm": 0.28937066229234853, "learning_rate": 7.919871380301878e-06, "loss": 0.3937, "num_tokens": 366412673.0, "step": 890 }, { "epoch": 1.0068807339449541, "grad_norm": 0.2690398404128952, "learning_rate": 7.91517141259997e-06, "loss": 0.3389, "num_tokens": 366815104.0, "step": 891 }, { "epoch": 1.007262996941896, "grad_norm": 0.2752084701888421, "learning_rate": 7.910467741446106e-06, "loss": 0.377, "num_tokens": 367200683.0, "step": 892 }, { "epoch": 1.007645259938838, "grad_norm": 0.26906854351119347, "learning_rate": 7.905760374052952e-06, "loss": 0.3983, "num_tokens": 367652495.0, "step": 893 }, { "epoch": 1.0080275229357798, "grad_norm": 0.29980489244560266, "learning_rate": 7.901049317638836e-06, "loss": 0.365, "num_tokens": 368065671.0, "step": 894 }, { "epoch": 1.0084097859327217, "grad_norm": 0.3341095053422161, "learning_rate": 7.89633457942775e-06, "loss": 0.3782, "num_tokens": 368488502.0, "step": 895 }, { "epoch": 1.0087920489296636, "grad_norm": 0.3217500819097353, "learning_rate": 7.891616166649329e-06, "loss": 0.3627, "num_tokens": 368896724.0, "step": 896 }, { "epoch": 1.0091743119266054, "grad_norm": 0.250391334834949, "learning_rate": 7.886894086538841e-06, "loss": 0.3694, "num_tokens": 369321921.0, "step": 897 }, { "epoch": 1.0095565749235473, "grad_norm": 0.28692989476923897, "learning_rate": 7.88216834633718e-06, "loss": 0.3771, "num_tokens": 369795320.0, "step": 898 }, { "epoch": 1.0099388379204892, "grad_norm": 0.2918893341159398, "learning_rate": 7.87743895329085e-06, "loss": 0.373, "num_tokens": 370185879.0, "step": 899 }, { "epoch": 1.010321100917431, "grad_norm": 0.28420288318568054, "learning_rate": 7.872705914651955e-06, "loss": 0.3451, "num_tokens": 370578759.0, "step": 900 }, { "epoch": 1.010703363914373, "grad_norm": 0.2812802793741803, "learning_rate": 7.867969237678194e-06, "loss": 0.3484, "num_tokens": 370965932.0, "step": 901 }, { "epoch": 1.011085626911315, "grad_norm": 0.3069653253627977, "learning_rate": 7.863228929632843e-06, "loss": 0.3556, "num_tokens": 371367178.0, "step": 902 }, { "epoch": 1.011467889908257, "grad_norm": 0.2744791427749656, "learning_rate": 7.858484997784745e-06, "loss": 0.3433, "num_tokens": 371716486.0, "step": 903 }, { "epoch": 1.0118501529051989, "grad_norm": 0.27383270760349376, "learning_rate": 7.853737449408301e-06, "loss": 0.355, "num_tokens": 372088863.0, "step": 904 }, { "epoch": 1.0122324159021407, "grad_norm": 0.26813880667511547, "learning_rate": 7.848986291783454e-06, "loss": 0.3483, "num_tokens": 372481920.0, "step": 905 }, { "epoch": 1.0126146788990826, "grad_norm": 0.286545151058124, "learning_rate": 7.844231532195686e-06, "loss": 0.3826, "num_tokens": 372903843.0, "step": 906 }, { "epoch": 1.0129969418960245, "grad_norm": 0.2936452434650887, "learning_rate": 7.839473177936004e-06, "loss": 0.3645, "num_tokens": 373301571.0, "step": 907 }, { "epoch": 1.0133792048929664, "grad_norm": 0.2915841703093686, "learning_rate": 7.83471123630092e-06, "loss": 0.3336, "num_tokens": 373701131.0, "step": 908 }, { "epoch": 1.0137614678899083, "grad_norm": 0.30035560938560874, "learning_rate": 7.82994571459245e-06, "loss": 0.3462, "num_tokens": 374103993.0, "step": 909 }, { "epoch": 1.0141437308868502, "grad_norm": 0.25518956745177324, "learning_rate": 7.825176620118103e-06, "loss": 0.3554, "num_tokens": 374505973.0, "step": 910 }, { "epoch": 1.014525993883792, "grad_norm": 0.24625564275750858, "learning_rate": 7.820403960190862e-06, "loss": 0.3576, "num_tokens": 374935502.0, "step": 911 }, { "epoch": 1.014908256880734, "grad_norm": 0.2857341089305771, "learning_rate": 7.815627742129183e-06, "loss": 0.3492, "num_tokens": 375321168.0, "step": 912 }, { "epoch": 1.0152905198776758, "grad_norm": 0.2705785892172911, "learning_rate": 7.81084797325697e-06, "loss": 0.3328, "num_tokens": 375756536.0, "step": 913 }, { "epoch": 1.0156727828746177, "grad_norm": 0.2698558723883506, "learning_rate": 7.806064660903579e-06, "loss": 0.3272, "num_tokens": 376136227.0, "step": 914 }, { "epoch": 1.0160550458715596, "grad_norm": 0.2854014853385893, "learning_rate": 7.801277812403794e-06, "loss": 0.3559, "num_tokens": 376538633.0, "step": 915 }, { "epoch": 1.0164373088685015, "grad_norm": 0.3001335963169468, "learning_rate": 7.79648743509783e-06, "loss": 0.3564, "num_tokens": 376998983.0, "step": 916 }, { "epoch": 1.0168195718654434, "grad_norm": 0.26179049903477386, "learning_rate": 7.791693536331299e-06, "loss": 0.356, "num_tokens": 377420810.0, "step": 917 }, { "epoch": 1.0172018348623852, "grad_norm": 0.2609701605371836, "learning_rate": 7.786896123455227e-06, "loss": 0.319, "num_tokens": 377839448.0, "step": 918 }, { "epoch": 1.0175840978593271, "grad_norm": 0.2812961344848241, "learning_rate": 7.782095203826022e-06, "loss": 0.3367, "num_tokens": 378223203.0, "step": 919 }, { "epoch": 1.017966360856269, "grad_norm": 0.28434805329784335, "learning_rate": 7.777290784805469e-06, "loss": 0.3492, "num_tokens": 378629012.0, "step": 920 }, { "epoch": 1.018348623853211, "grad_norm": 0.2710089416075033, "learning_rate": 7.77248287376072e-06, "loss": 0.3457, "num_tokens": 379042477.0, "step": 921 }, { "epoch": 1.018730886850153, "grad_norm": 0.2640759914063179, "learning_rate": 7.767671478064282e-06, "loss": 0.3517, "num_tokens": 379458084.0, "step": 922 }, { "epoch": 1.019113149847095, "grad_norm": 0.2532537472995236, "learning_rate": 7.762856605094004e-06, "loss": 0.3578, "num_tokens": 379890219.0, "step": 923 }, { "epoch": 1.0194954128440368, "grad_norm": 0.28121821182530193, "learning_rate": 7.75803826223307e-06, "loss": 0.3447, "num_tokens": 380299884.0, "step": 924 }, { "epoch": 1.0198776758409787, "grad_norm": 0.2621071680462542, "learning_rate": 7.753216456869984e-06, "loss": 0.3421, "num_tokens": 380701959.0, "step": 925 }, { "epoch": 1.0202599388379205, "grad_norm": 0.2569297932469686, "learning_rate": 7.748391196398557e-06, "loss": 0.3226, "num_tokens": 381087095.0, "step": 926 }, { "epoch": 1.0206422018348624, "grad_norm": 0.23416106523920066, "learning_rate": 7.743562488217901e-06, "loss": 0.3034, "num_tokens": 381477222.0, "step": 927 }, { "epoch": 1.0210244648318043, "grad_norm": 0.2896357187748173, "learning_rate": 7.73873033973241e-06, "loss": 0.329, "num_tokens": 381878904.0, "step": 928 }, { "epoch": 1.0214067278287462, "grad_norm": 0.2865746031683812, "learning_rate": 7.733894758351758e-06, "loss": 0.3372, "num_tokens": 382252804.0, "step": 929 }, { "epoch": 1.021788990825688, "grad_norm": 0.28781775267915927, "learning_rate": 7.729055751490882e-06, "loss": 0.3445, "num_tokens": 382629461.0, "step": 930 }, { "epoch": 1.02217125382263, "grad_norm": 0.30186630563608535, "learning_rate": 7.724213326569972e-06, "loss": 0.352, "num_tokens": 383041658.0, "step": 931 }, { "epoch": 1.0225535168195719, "grad_norm": 0.3144099896682969, "learning_rate": 7.71936749101446e-06, "loss": 0.3387, "num_tokens": 383434312.0, "step": 932 }, { "epoch": 1.0229357798165137, "grad_norm": 0.26972322408190974, "learning_rate": 7.714518252255005e-06, "loss": 0.3428, "num_tokens": 383851671.0, "step": 933 }, { "epoch": 1.0233180428134556, "grad_norm": 0.2799243756691746, "learning_rate": 7.709665617727485e-06, "loss": 0.3498, "num_tokens": 384313170.0, "step": 934 }, { "epoch": 1.0237003058103975, "grad_norm": 0.2633832494849653, "learning_rate": 7.704809594872991e-06, "loss": 0.3401, "num_tokens": 384714741.0, "step": 935 }, { "epoch": 1.0240825688073394, "grad_norm": 0.29294206756598, "learning_rate": 7.699950191137798e-06, "loss": 0.3272, "num_tokens": 385131983.0, "step": 936 }, { "epoch": 1.0244648318042813, "grad_norm": 0.2567301436914333, "learning_rate": 7.695087413973377e-06, "loss": 0.3485, "num_tokens": 385535082.0, "step": 937 }, { "epoch": 1.0248470948012232, "grad_norm": 0.2512906138528013, "learning_rate": 7.690221270836366e-06, "loss": 0.3275, "num_tokens": 385893609.0, "step": 938 }, { "epoch": 1.025229357798165, "grad_norm": 0.2844207843120261, "learning_rate": 7.685351769188566e-06, "loss": 0.3723, "num_tokens": 386312005.0, "step": 939 }, { "epoch": 1.025611620795107, "grad_norm": 0.2694868452031438, "learning_rate": 7.680478916496927e-06, "loss": 0.3245, "num_tokens": 386721961.0, "step": 940 }, { "epoch": 1.025993883792049, "grad_norm": 0.24963732233345717, "learning_rate": 7.675602720233537e-06, "loss": 0.3053, "num_tokens": 387123208.0, "step": 941 }, { "epoch": 1.026376146788991, "grad_norm": 0.25437576273061274, "learning_rate": 7.670723187875613e-06, "loss": 0.3494, "num_tokens": 387550738.0, "step": 942 }, { "epoch": 1.0267584097859328, "grad_norm": 0.3254244914560424, "learning_rate": 7.665840326905488e-06, "loss": 0.3527, "num_tokens": 387963120.0, "step": 943 }, { "epoch": 1.0271406727828747, "grad_norm": 0.2768164362835791, "learning_rate": 7.660954144810597e-06, "loss": 0.3258, "num_tokens": 388364109.0, "step": 944 }, { "epoch": 1.0275229357798166, "grad_norm": 0.2592936147856663, "learning_rate": 7.656064649083466e-06, "loss": 0.3315, "num_tokens": 388763900.0, "step": 945 }, { "epoch": 1.0279051987767585, "grad_norm": 0.28398369657063827, "learning_rate": 7.651171847221708e-06, "loss": 0.3328, "num_tokens": 389162377.0, "step": 946 }, { "epoch": 1.0282874617737003, "grad_norm": 0.27232610027878973, "learning_rate": 7.646275746728002e-06, "loss": 0.3566, "num_tokens": 389603011.0, "step": 947 }, { "epoch": 1.0286697247706422, "grad_norm": 0.24364177031427756, "learning_rate": 7.641376355110085e-06, "loss": 0.3396, "num_tokens": 390040259.0, "step": 948 }, { "epoch": 1.029051987767584, "grad_norm": 0.2680865117005245, "learning_rate": 7.636473679880741e-06, "loss": 0.3411, "num_tokens": 390412441.0, "step": 949 }, { "epoch": 1.029434250764526, "grad_norm": 0.26385915833986956, "learning_rate": 7.631567728557788e-06, "loss": 0.3199, "num_tokens": 390810673.0, "step": 950 }, { "epoch": 1.0298165137614679, "grad_norm": 0.2892762005360835, "learning_rate": 7.62665850866407e-06, "loss": 0.3296, "num_tokens": 391179538.0, "step": 951 }, { "epoch": 1.0301987767584098, "grad_norm": 0.3150005886158518, "learning_rate": 7.621746027727442e-06, "loss": 0.3505, "num_tokens": 391580592.0, "step": 952 }, { "epoch": 1.0305810397553516, "grad_norm": 0.2803165087915309, "learning_rate": 7.616830293280758e-06, "loss": 0.3438, "num_tokens": 391959563.0, "step": 953 }, { "epoch": 1.0309633027522935, "grad_norm": 0.3142488912094723, "learning_rate": 7.611911312861865e-06, "loss": 0.3589, "num_tokens": 392363161.0, "step": 954 }, { "epoch": 1.0313455657492354, "grad_norm": 0.26591786809601603, "learning_rate": 7.606989094013583e-06, "loss": 0.3283, "num_tokens": 392792758.0, "step": 955 }, { "epoch": 1.0317278287461773, "grad_norm": 0.27468645724232027, "learning_rate": 7.6020636442837e-06, "loss": 0.3499, "num_tokens": 393225384.0, "step": 956 }, { "epoch": 1.0321100917431192, "grad_norm": 0.2770314893451792, "learning_rate": 7.59713497122496e-06, "loss": 0.3413, "num_tokens": 393609743.0, "step": 957 }, { "epoch": 1.032492354740061, "grad_norm": 0.2823466583279306, "learning_rate": 7.592203082395044e-06, "loss": 0.3105, "num_tokens": 393978914.0, "step": 958 }, { "epoch": 1.032874617737003, "grad_norm": 0.279883822597775, "learning_rate": 7.58726798535657e-06, "loss": 0.3326, "num_tokens": 394358923.0, "step": 959 }, { "epoch": 1.033256880733945, "grad_norm": 0.22973511777141034, "learning_rate": 7.582329687677073e-06, "loss": 0.3196, "num_tokens": 394782828.0, "step": 960 }, { "epoch": 1.033639143730887, "grad_norm": 0.3228999620454348, "learning_rate": 7.5773881969289965e-06, "loss": 0.3632, "num_tokens": 395238022.0, "step": 961 }, { "epoch": 1.0340214067278288, "grad_norm": 0.29840503700608834, "learning_rate": 7.572443520689679e-06, "loss": 0.3408, "num_tokens": 395610624.0, "step": 962 }, { "epoch": 1.0344036697247707, "grad_norm": 0.31720403006364906, "learning_rate": 7.567495666541343e-06, "loss": 0.3152, "num_tokens": 395983412.0, "step": 963 }, { "epoch": 1.0347859327217126, "grad_norm": 0.28276133101593437, "learning_rate": 7.562544642071089e-06, "loss": 0.3244, "num_tokens": 396371663.0, "step": 964 }, { "epoch": 1.0351681957186545, "grad_norm": 0.2862852498925155, "learning_rate": 7.557590454870874e-06, "loss": 0.3487, "num_tokens": 396812040.0, "step": 965 }, { "epoch": 1.0355504587155964, "grad_norm": 0.2594406121328056, "learning_rate": 7.552633112537506e-06, "loss": 0.3277, "num_tokens": 397205130.0, "step": 966 }, { "epoch": 1.0359327217125383, "grad_norm": 0.30884809717977374, "learning_rate": 7.547672622672633e-06, "loss": 0.3263, "num_tokens": 397597594.0, "step": 967 }, { "epoch": 1.0363149847094801, "grad_norm": 0.2570141156560367, "learning_rate": 7.5427089928827255e-06, "loss": 0.3266, "num_tokens": 397976903.0, "step": 968 }, { "epoch": 1.036697247706422, "grad_norm": 0.2611937830544222, "learning_rate": 7.537742230779075e-06, "loss": 0.3451, "num_tokens": 398400133.0, "step": 969 }, { "epoch": 1.037079510703364, "grad_norm": 0.3053857048182125, "learning_rate": 7.532772343977767e-06, "loss": 0.3313, "num_tokens": 398835764.0, "step": 970 }, { "epoch": 1.0374617737003058, "grad_norm": 0.30072622369984425, "learning_rate": 7.527799340099687e-06, "loss": 0.305, "num_tokens": 399212102.0, "step": 971 }, { "epoch": 1.0378440366972477, "grad_norm": 0.25083005192378344, "learning_rate": 7.522823226770497e-06, "loss": 0.3235, "num_tokens": 399606442.0, "step": 972 }, { "epoch": 1.0382262996941896, "grad_norm": 0.3058278156655185, "learning_rate": 7.517844011620628e-06, "loss": 0.3414, "num_tokens": 400034094.0, "step": 973 }, { "epoch": 1.0386085626911314, "grad_norm": 0.303813502122588, "learning_rate": 7.512861702285262e-06, "loss": 0.3202, "num_tokens": 400445503.0, "step": 974 }, { "epoch": 1.0389908256880733, "grad_norm": 0.3601892276904088, "learning_rate": 7.507876306404336e-06, "loss": 0.3372, "num_tokens": 400781994.0, "step": 975 }, { "epoch": 1.0393730886850152, "grad_norm": 0.3252179447507799, "learning_rate": 7.502887831622509e-06, "loss": 0.3482, "num_tokens": 401195592.0, "step": 976 }, { "epoch": 1.039755351681957, "grad_norm": 0.298817915856359, "learning_rate": 7.497896285589171e-06, "loss": 0.3328, "num_tokens": 401582962.0, "step": 977 }, { "epoch": 1.040137614678899, "grad_norm": 0.3028218108837741, "learning_rate": 7.492901675958413e-06, "loss": 0.3178, "num_tokens": 401970540.0, "step": 978 }, { "epoch": 1.040519877675841, "grad_norm": 0.2879379126692796, "learning_rate": 7.48790401038903e-06, "loss": 0.3148, "num_tokens": 402363696.0, "step": 979 }, { "epoch": 1.040902140672783, "grad_norm": 0.31801831475517145, "learning_rate": 7.482903296544499e-06, "loss": 0.3242, "num_tokens": 402761886.0, "step": 980 }, { "epoch": 1.0412844036697249, "grad_norm": 0.2896183268507168, "learning_rate": 7.477899542092975e-06, "loss": 0.3418, "num_tokens": 403193812.0, "step": 981 }, { "epoch": 1.0416666666666667, "grad_norm": 0.3176839263544348, "learning_rate": 7.47289275470727e-06, "loss": 0.3533, "num_tokens": 403605536.0, "step": 982 }, { "epoch": 1.0420489296636086, "grad_norm": 0.3245265992977043, "learning_rate": 7.46788294206485e-06, "loss": 0.3353, "num_tokens": 403989064.0, "step": 983 }, { "epoch": 1.0424311926605505, "grad_norm": 0.28697975569096, "learning_rate": 7.462870111847823e-06, "loss": 0.3386, "num_tokens": 404401513.0, "step": 984 }, { "epoch": 1.0428134556574924, "grad_norm": 0.25372418601934416, "learning_rate": 7.45785427174292e-06, "loss": 0.3415, "num_tokens": 404875606.0, "step": 985 }, { "epoch": 1.0431957186544343, "grad_norm": 0.3026099073947782, "learning_rate": 7.4528354294414885e-06, "loss": 0.354, "num_tokens": 405301869.0, "step": 986 }, { "epoch": 1.0435779816513762, "grad_norm": 0.281992100959585, "learning_rate": 7.447813592639481e-06, "loss": 0.3303, "num_tokens": 405714339.0, "step": 987 }, { "epoch": 1.043960244648318, "grad_norm": 0.2662856762998438, "learning_rate": 7.442788769037439e-06, "loss": 0.3435, "num_tokens": 406166888.0, "step": 988 }, { "epoch": 1.04434250764526, "grad_norm": 0.29760993090936005, "learning_rate": 7.437760966340483e-06, "loss": 0.3232, "num_tokens": 406579055.0, "step": 989 }, { "epoch": 1.0447247706422018, "grad_norm": 0.2839517525387195, "learning_rate": 7.43273019225831e-06, "loss": 0.3318, "num_tokens": 406984198.0, "step": 990 }, { "epoch": 1.0451070336391437, "grad_norm": 0.24744264406023414, "learning_rate": 7.427696454505162e-06, "loss": 0.3119, "num_tokens": 407385414.0, "step": 991 }, { "epoch": 1.0454892966360856, "grad_norm": 0.291270628975271, "learning_rate": 7.422659760799835e-06, "loss": 0.3441, "num_tokens": 407816649.0, "step": 992 }, { "epoch": 1.0458715596330275, "grad_norm": 0.2911709274219056, "learning_rate": 7.417620118865653e-06, "loss": 0.3353, "num_tokens": 408239160.0, "step": 993 }, { "epoch": 1.0462538226299694, "grad_norm": 0.3144480766185622, "learning_rate": 7.4125775364304586e-06, "loss": 0.3488, "num_tokens": 408664284.0, "step": 994 }, { "epoch": 1.0466360856269112, "grad_norm": 0.27495056161252873, "learning_rate": 7.40753202122661e-06, "loss": 0.3274, "num_tokens": 409108871.0, "step": 995 }, { "epoch": 1.0470183486238531, "grad_norm": 0.2739623181688787, "learning_rate": 7.402483580990958e-06, "loss": 0.3593, "num_tokens": 409557140.0, "step": 996 }, { "epoch": 1.047400611620795, "grad_norm": 0.31174056121420535, "learning_rate": 7.3974322234648375e-06, "loss": 0.3534, "num_tokens": 409956138.0, "step": 997 }, { "epoch": 1.0477828746177371, "grad_norm": 0.31339641948473407, "learning_rate": 7.3923779563940616e-06, "loss": 0.3305, "num_tokens": 410379715.0, "step": 998 }, { "epoch": 1.048165137614679, "grad_norm": 0.2740355101361742, "learning_rate": 7.387320787528902e-06, "loss": 0.3155, "num_tokens": 410785387.0, "step": 999 }, { "epoch": 1.0485474006116209, "grad_norm": 0.33580055239639545, "learning_rate": 7.382260724624079e-06, "loss": 0.3528, "num_tokens": 411244242.0, "step": 1000 }, { "epoch": 1.0489296636085628, "grad_norm": 0.3575548281906381, "learning_rate": 7.377197775438752e-06, "loss": 0.3219, "num_tokens": 411634682.0, "step": 1001 }, { "epoch": 1.0493119266055047, "grad_norm": 0.31109545583851445, "learning_rate": 7.372131947736507e-06, "loss": 0.3358, "num_tokens": 412044013.0, "step": 1002 }, { "epoch": 1.0496941896024465, "grad_norm": 0.2465494986141242, "learning_rate": 7.3670632492853435e-06, "loss": 0.3333, "num_tokens": 412471591.0, "step": 1003 }, { "epoch": 1.0500764525993884, "grad_norm": 0.3491131967698, "learning_rate": 7.361991687857662e-06, "loss": 0.3459, "num_tokens": 412836021.0, "step": 1004 }, { "epoch": 1.0504587155963303, "grad_norm": 0.3258010706493882, "learning_rate": 7.356917271230253e-06, "loss": 0.3437, "num_tokens": 413260199.0, "step": 1005 }, { "epoch": 1.0508409785932722, "grad_norm": 0.2891870649721951, "learning_rate": 7.351840007184288e-06, "loss": 0.3391, "num_tokens": 413690591.0, "step": 1006 }, { "epoch": 1.051223241590214, "grad_norm": 0.2615086606826415, "learning_rate": 7.3467599035053005e-06, "loss": 0.3356, "num_tokens": 414094467.0, "step": 1007 }, { "epoch": 1.051605504587156, "grad_norm": 0.24690199397563625, "learning_rate": 7.341676967983182e-06, "loss": 0.3439, "num_tokens": 414526259.0, "step": 1008 }, { "epoch": 1.0519877675840978, "grad_norm": 0.2702869615782497, "learning_rate": 7.336591208412165e-06, "loss": 0.3139, "num_tokens": 414896654.0, "step": 1009 }, { "epoch": 1.0523700305810397, "grad_norm": 0.2647541961108, "learning_rate": 7.331502632590814e-06, "loss": 0.3395, "num_tokens": 415309627.0, "step": 1010 }, { "epoch": 1.0527522935779816, "grad_norm": 0.2939050431827775, "learning_rate": 7.326411248322008e-06, "loss": 0.3203, "num_tokens": 415728436.0, "step": 1011 }, { "epoch": 1.0531345565749235, "grad_norm": 0.26523517485597786, "learning_rate": 7.321317063412935e-06, "loss": 0.3479, "num_tokens": 416170204.0, "step": 1012 }, { "epoch": 1.0535168195718654, "grad_norm": 0.25734848311290703, "learning_rate": 7.316220085675078e-06, "loss": 0.3318, "num_tokens": 416586132.0, "step": 1013 }, { "epoch": 1.0538990825688073, "grad_norm": 0.24521474713180183, "learning_rate": 7.3111203229242e-06, "loss": 0.3199, "num_tokens": 416981645.0, "step": 1014 }, { "epoch": 1.0542813455657492, "grad_norm": 0.27862044652919127, "learning_rate": 7.30601778298034e-06, "loss": 0.3214, "num_tokens": 417391963.0, "step": 1015 }, { "epoch": 1.054663608562691, "grad_norm": 0.2694085842544258, "learning_rate": 7.30091247366779e-06, "loss": 0.3066, "num_tokens": 417805751.0, "step": 1016 }, { "epoch": 1.0550458715596331, "grad_norm": 0.25785893913958374, "learning_rate": 7.295804402815094e-06, "loss": 0.3313, "num_tokens": 418191155.0, "step": 1017 }, { "epoch": 1.055428134556575, "grad_norm": 0.28146510726280544, "learning_rate": 7.2906935782550235e-06, "loss": 0.3362, "num_tokens": 418592947.0, "step": 1018 }, { "epoch": 1.055810397553517, "grad_norm": 0.2590923198365843, "learning_rate": 7.285580007824577e-06, "loss": 0.3436, "num_tokens": 419041069.0, "step": 1019 }, { "epoch": 1.0561926605504588, "grad_norm": 0.2650615562014495, "learning_rate": 7.280463699364963e-06, "loss": 0.3341, "num_tokens": 419443162.0, "step": 1020 }, { "epoch": 1.0565749235474007, "grad_norm": 0.2733208858566881, "learning_rate": 7.27534466072159e-06, "loss": 0.3588, "num_tokens": 419855925.0, "step": 1021 }, { "epoch": 1.0569571865443426, "grad_norm": 0.29360693062281557, "learning_rate": 7.2702228997440494e-06, "loss": 0.359, "num_tokens": 420288291.0, "step": 1022 }, { "epoch": 1.0573394495412844, "grad_norm": 0.250084621080211, "learning_rate": 7.26509842428611e-06, "loss": 0.3244, "num_tokens": 420749736.0, "step": 1023 }, { "epoch": 1.0577217125382263, "grad_norm": 0.27454331462264997, "learning_rate": 7.259971242205702e-06, "loss": 0.329, "num_tokens": 421209874.0, "step": 1024 }, { "epoch": 1.0581039755351682, "grad_norm": 0.27205923554652683, "learning_rate": 7.2548413613649086e-06, "loss": 0.3269, "num_tokens": 421618221.0, "step": 1025 }, { "epoch": 1.05848623853211, "grad_norm": 0.28898461698944294, "learning_rate": 7.249708789629944e-06, "loss": 0.3605, "num_tokens": 422061571.0, "step": 1026 }, { "epoch": 1.058868501529052, "grad_norm": 0.31891925249008934, "learning_rate": 7.2445735348711564e-06, "loss": 0.3516, "num_tokens": 422511924.0, "step": 1027 }, { "epoch": 1.0592507645259939, "grad_norm": 0.2888571653496046, "learning_rate": 7.239435604963004e-06, "loss": 0.359, "num_tokens": 422944077.0, "step": 1028 }, { "epoch": 1.0596330275229358, "grad_norm": 0.25417299656038544, "learning_rate": 7.23429500778405e-06, "loss": 0.3067, "num_tokens": 423310711.0, "step": 1029 }, { "epoch": 1.0600152905198776, "grad_norm": 0.2699266871932084, "learning_rate": 7.229151751216944e-06, "loss": 0.3325, "num_tokens": 423751067.0, "step": 1030 }, { "epoch": 1.0603975535168195, "grad_norm": 0.2932375497886189, "learning_rate": 7.224005843148419e-06, "loss": 0.3078, "num_tokens": 424127163.0, "step": 1031 }, { "epoch": 1.0607798165137614, "grad_norm": 0.2868996918061548, "learning_rate": 7.2188572914692645e-06, "loss": 0.3105, "num_tokens": 424552821.0, "step": 1032 }, { "epoch": 1.0611620795107033, "grad_norm": 0.27593803646172965, "learning_rate": 7.213706104074335e-06, "loss": 0.3453, "num_tokens": 424979967.0, "step": 1033 }, { "epoch": 1.0615443425076452, "grad_norm": 0.2928973784172809, "learning_rate": 7.208552288862519e-06, "loss": 0.3435, "num_tokens": 425398115.0, "step": 1034 }, { "epoch": 1.061926605504587, "grad_norm": 0.3109627358650455, "learning_rate": 7.203395853736736e-06, "loss": 0.3133, "num_tokens": 425791141.0, "step": 1035 }, { "epoch": 1.062308868501529, "grad_norm": 0.3166862036218244, "learning_rate": 7.198236806603923e-06, "loss": 0.3124, "num_tokens": 426202151.0, "step": 1036 }, { "epoch": 1.0626911314984708, "grad_norm": 0.24198574037841442, "learning_rate": 7.193075155375027e-06, "loss": 0.3368, "num_tokens": 426630910.0, "step": 1037 }, { "epoch": 1.063073394495413, "grad_norm": 0.25856614778208076, "learning_rate": 7.187910907964979e-06, "loss": 0.3488, "num_tokens": 427079230.0, "step": 1038 }, { "epoch": 1.0634556574923548, "grad_norm": 0.2858819417264152, "learning_rate": 7.1827440722927e-06, "loss": 0.3143, "num_tokens": 427446664.0, "step": 1039 }, { "epoch": 1.0638379204892967, "grad_norm": 0.2568596426141947, "learning_rate": 7.177574656281075e-06, "loss": 0.3145, "num_tokens": 427832021.0, "step": 1040 }, { "epoch": 1.0642201834862386, "grad_norm": 0.285783214827123, "learning_rate": 7.1724026678569455e-06, "loss": 0.3379, "num_tokens": 428274589.0, "step": 1041 }, { "epoch": 1.0646024464831805, "grad_norm": 0.26642781031203505, "learning_rate": 7.167228114951099e-06, "loss": 0.3511, "num_tokens": 428689563.0, "step": 1042 }, { "epoch": 1.0649847094801224, "grad_norm": 0.24368239933074232, "learning_rate": 7.162051005498256e-06, "loss": 0.3276, "num_tokens": 429080719.0, "step": 1043 }, { "epoch": 1.0653669724770642, "grad_norm": 0.2405054119061207, "learning_rate": 7.156871347437056e-06, "loss": 0.3154, "num_tokens": 429463841.0, "step": 1044 }, { "epoch": 1.0657492354740061, "grad_norm": 0.2669615213950939, "learning_rate": 7.151689148710046e-06, "loss": 0.3311, "num_tokens": 429902093.0, "step": 1045 }, { "epoch": 1.066131498470948, "grad_norm": 0.30028158018143036, "learning_rate": 7.146504417263671e-06, "loss": 0.3258, "num_tokens": 430311699.0, "step": 1046 }, { "epoch": 1.06651376146789, "grad_norm": 0.2835524117938337, "learning_rate": 7.141317161048259e-06, "loss": 0.3452, "num_tokens": 430729269.0, "step": 1047 }, { "epoch": 1.0668960244648318, "grad_norm": 0.28441046173696943, "learning_rate": 7.13612738801801e-06, "loss": 0.3238, "num_tokens": 431122929.0, "step": 1048 }, { "epoch": 1.0672782874617737, "grad_norm": 0.2941622402641557, "learning_rate": 7.130935106130977e-06, "loss": 0.3662, "num_tokens": 431574807.0, "step": 1049 }, { "epoch": 1.0676605504587156, "grad_norm": 0.2645377216026675, "learning_rate": 7.125740323349071e-06, "loss": 0.3415, "num_tokens": 432020720.0, "step": 1050 }, { "epoch": 1.0680428134556574, "grad_norm": 0.2584199445816942, "learning_rate": 7.120543047638031e-06, "loss": 0.3146, "num_tokens": 432432725.0, "step": 1051 }, { "epoch": 1.0684250764525993, "grad_norm": 0.2472446276124087, "learning_rate": 7.11534328696742e-06, "loss": 0.3364, "num_tokens": 432890398.0, "step": 1052 }, { "epoch": 1.0688073394495412, "grad_norm": 0.2857455439767263, "learning_rate": 7.1101410493106096e-06, "loss": 0.347, "num_tokens": 433313480.0, "step": 1053 }, { "epoch": 1.069189602446483, "grad_norm": 0.26309498057198827, "learning_rate": 7.104936342644774e-06, "loss": 0.3297, "num_tokens": 433726737.0, "step": 1054 }, { "epoch": 1.069571865443425, "grad_norm": 0.2768248416790847, "learning_rate": 7.099729174950869e-06, "loss": 0.3258, "num_tokens": 434091412.0, "step": 1055 }, { "epoch": 1.0699541284403669, "grad_norm": 0.24844614730162984, "learning_rate": 7.094519554213629e-06, "loss": 0.3063, "num_tokens": 434448735.0, "step": 1056 }, { "epoch": 1.070336391437309, "grad_norm": 0.2547043213317315, "learning_rate": 7.089307488421544e-06, "loss": 0.3194, "num_tokens": 434858826.0, "step": 1057 }, { "epoch": 1.0707186544342508, "grad_norm": 0.2662362041579405, "learning_rate": 7.084092985566858e-06, "loss": 0.328, "num_tokens": 435225633.0, "step": 1058 }, { "epoch": 1.0711009174311927, "grad_norm": 0.2610068712434011, "learning_rate": 7.078876053645551e-06, "loss": 0.3324, "num_tokens": 435617788.0, "step": 1059 }, { "epoch": 1.0714831804281346, "grad_norm": 0.26580000514352164, "learning_rate": 7.073656700657325e-06, "loss": 0.3302, "num_tokens": 436032473.0, "step": 1060 }, { "epoch": 1.0718654434250765, "grad_norm": 0.30399536846309827, "learning_rate": 7.0684349346056004e-06, "loss": 0.334, "num_tokens": 436471585.0, "step": 1061 }, { "epoch": 1.0722477064220184, "grad_norm": 0.28058977464944784, "learning_rate": 7.063210763497489e-06, "loss": 0.3282, "num_tokens": 436886926.0, "step": 1062 }, { "epoch": 1.0726299694189603, "grad_norm": 0.3149299892495931, "learning_rate": 7.057984195343799e-06, "loss": 0.3471, "num_tokens": 437279830.0, "step": 1063 }, { "epoch": 1.0730122324159022, "grad_norm": 0.30988094289704926, "learning_rate": 7.0527552381590085e-06, "loss": 0.323, "num_tokens": 437699465.0, "step": 1064 }, { "epoch": 1.073394495412844, "grad_norm": 0.33901552174408217, "learning_rate": 7.047523899961264e-06, "loss": 0.3595, "num_tokens": 438112147.0, "step": 1065 }, { "epoch": 1.073776758409786, "grad_norm": 0.2935244288076594, "learning_rate": 7.042290188772358e-06, "loss": 0.3429, "num_tokens": 438533408.0, "step": 1066 }, { "epoch": 1.0741590214067278, "grad_norm": 0.2806315484525417, "learning_rate": 7.037054112617726e-06, "loss": 0.3384, "num_tokens": 438965838.0, "step": 1067 }, { "epoch": 1.0745412844036697, "grad_norm": 0.2589712218203007, "learning_rate": 7.031815679526428e-06, "loss": 0.3286, "num_tokens": 439406677.0, "step": 1068 }, { "epoch": 1.0749235474006116, "grad_norm": 0.29006896402147997, "learning_rate": 7.026574897531137e-06, "loss": 0.3319, "num_tokens": 439772450.0, "step": 1069 }, { "epoch": 1.0753058103975535, "grad_norm": 0.29855377380418946, "learning_rate": 7.02133177466813e-06, "loss": 0.3331, "num_tokens": 440168051.0, "step": 1070 }, { "epoch": 1.0756880733944953, "grad_norm": 0.256686766820325, "learning_rate": 7.016086318977272e-06, "loss": 0.3202, "num_tokens": 440577649.0, "step": 1071 }, { "epoch": 1.0760703363914372, "grad_norm": 0.28061455939121943, "learning_rate": 7.0108385385020065e-06, "loss": 0.3419, "num_tokens": 440988939.0, "step": 1072 }, { "epoch": 1.0764525993883791, "grad_norm": 0.26164118306674927, "learning_rate": 7.005588441289342e-06, "loss": 0.3178, "num_tokens": 441383619.0, "step": 1073 }, { "epoch": 1.076834862385321, "grad_norm": 0.28237693271596354, "learning_rate": 7.000336035389835e-06, "loss": 0.3066, "num_tokens": 441766942.0, "step": 1074 }, { "epoch": 1.0772171253822629, "grad_norm": 0.2918397421691369, "learning_rate": 6.995081328857589e-06, "loss": 0.3332, "num_tokens": 442162255.0, "step": 1075 }, { "epoch": 1.077599388379205, "grad_norm": 0.2937364277329519, "learning_rate": 6.989824329750233e-06, "loss": 0.3265, "num_tokens": 442541326.0, "step": 1076 }, { "epoch": 1.0779816513761469, "grad_norm": 0.26148526680868966, "learning_rate": 6.984565046128907e-06, "loss": 0.3372, "num_tokens": 442954893.0, "step": 1077 }, { "epoch": 1.0783639143730888, "grad_norm": 0.30040769042891446, "learning_rate": 6.979303486058262e-06, "loss": 0.3248, "num_tokens": 443345264.0, "step": 1078 }, { "epoch": 1.0787461773700306, "grad_norm": 0.275720903760283, "learning_rate": 6.974039657606433e-06, "loss": 0.3165, "num_tokens": 443763105.0, "step": 1079 }, { "epoch": 1.0791284403669725, "grad_norm": 0.274254162769265, "learning_rate": 6.968773568845034e-06, "loss": 0.3574, "num_tokens": 444204611.0, "step": 1080 }, { "epoch": 1.0795107033639144, "grad_norm": 0.2706788599110957, "learning_rate": 6.96350522784915e-06, "loss": 0.3277, "num_tokens": 444623578.0, "step": 1081 }, { "epoch": 1.0798929663608563, "grad_norm": 0.28109320432987295, "learning_rate": 6.958234642697317e-06, "loss": 0.3171, "num_tokens": 445025320.0, "step": 1082 }, { "epoch": 1.0802752293577982, "grad_norm": 0.2640973212756067, "learning_rate": 6.952961821471509e-06, "loss": 0.3381, "num_tokens": 445423055.0, "step": 1083 }, { "epoch": 1.08065749235474, "grad_norm": 0.2824450266139343, "learning_rate": 6.9476867722571315e-06, "loss": 0.3534, "num_tokens": 445884360.0, "step": 1084 }, { "epoch": 1.081039755351682, "grad_norm": 0.25219328692226556, "learning_rate": 6.942409503143008e-06, "loss": 0.3268, "num_tokens": 446293395.0, "step": 1085 }, { "epoch": 1.0814220183486238, "grad_norm": 0.3015921590154933, "learning_rate": 6.9371300222213635e-06, "loss": 0.329, "num_tokens": 446671882.0, "step": 1086 }, { "epoch": 1.0818042813455657, "grad_norm": 0.2847840294891782, "learning_rate": 6.931848337587817e-06, "loss": 0.3306, "num_tokens": 447100754.0, "step": 1087 }, { "epoch": 1.0821865443425076, "grad_norm": 0.29123957000494594, "learning_rate": 6.926564457341362e-06, "loss": 0.3257, "num_tokens": 447469597.0, "step": 1088 }, { "epoch": 1.0825688073394495, "grad_norm": 0.3096082604042419, "learning_rate": 6.9212783895843625e-06, "loss": 0.3602, "num_tokens": 447855369.0, "step": 1089 }, { "epoch": 1.0829510703363914, "grad_norm": 0.28343201862453954, "learning_rate": 6.91599014242254e-06, "loss": 0.3343, "num_tokens": 448284173.0, "step": 1090 }, { "epoch": 1.0833333333333333, "grad_norm": 0.27264911732441066, "learning_rate": 6.910699723964951e-06, "loss": 0.3098, "num_tokens": 448657999.0, "step": 1091 }, { "epoch": 1.0837155963302751, "grad_norm": 0.28459364230071305, "learning_rate": 6.905407142323987e-06, "loss": 0.3558, "num_tokens": 449096539.0, "step": 1092 }, { "epoch": 1.084097859327217, "grad_norm": 0.3236254342650189, "learning_rate": 6.900112405615351e-06, "loss": 0.2991, "num_tokens": 449479035.0, "step": 1093 }, { "epoch": 1.084480122324159, "grad_norm": 0.2998871923147371, "learning_rate": 6.894815521958057e-06, "loss": 0.3187, "num_tokens": 449901410.0, "step": 1094 }, { "epoch": 1.084862385321101, "grad_norm": 0.25020509687915393, "learning_rate": 6.889516499474407e-06, "loss": 0.2998, "num_tokens": 450303511.0, "step": 1095 }, { "epoch": 1.085244648318043, "grad_norm": 0.2723295474064236, "learning_rate": 6.884215346289983e-06, "loss": 0.344, "num_tokens": 450769572.0, "step": 1096 }, { "epoch": 1.0856269113149848, "grad_norm": 0.2957926391593118, "learning_rate": 6.878912070533634e-06, "loss": 0.323, "num_tokens": 451175714.0, "step": 1097 }, { "epoch": 1.0860091743119267, "grad_norm": 0.2949835512301882, "learning_rate": 6.873606680337469e-06, "loss": 0.3374, "num_tokens": 451604043.0, "step": 1098 }, { "epoch": 1.0863914373088686, "grad_norm": 0.2733994849516281, "learning_rate": 6.8682991838368305e-06, "loss": 0.3334, "num_tokens": 452011791.0, "step": 1099 }, { "epoch": 1.0867737003058104, "grad_norm": 0.30542115556871163, "learning_rate": 6.862989589170299e-06, "loss": 0.318, "num_tokens": 452392077.0, "step": 1100 }, { "epoch": 1.0871559633027523, "grad_norm": 0.31666497094524143, "learning_rate": 6.857677904479667e-06, "loss": 0.3192, "num_tokens": 452821004.0, "step": 1101 }, { "epoch": 1.0875382262996942, "grad_norm": 0.2866747926901651, "learning_rate": 6.852364137909934e-06, "loss": 0.3138, "num_tokens": 453230045.0, "step": 1102 }, { "epoch": 1.087920489296636, "grad_norm": 0.27947422912321956, "learning_rate": 6.84704829760929e-06, "loss": 0.354, "num_tokens": 453639194.0, "step": 1103 }, { "epoch": 1.088302752293578, "grad_norm": 0.3328205177355447, "learning_rate": 6.841730391729108e-06, "loss": 0.3178, "num_tokens": 454028366.0, "step": 1104 }, { "epoch": 1.0886850152905199, "grad_norm": 0.2938174101831714, "learning_rate": 6.836410428423926e-06, "loss": 0.3325, "num_tokens": 454430246.0, "step": 1105 }, { "epoch": 1.0890672782874617, "grad_norm": 0.28158318697487555, "learning_rate": 6.831088415851438e-06, "loss": 0.3273, "num_tokens": 454857619.0, "step": 1106 }, { "epoch": 1.0894495412844036, "grad_norm": 0.26633238664989395, "learning_rate": 6.82576436217248e-06, "loss": 0.3099, "num_tokens": 455268901.0, "step": 1107 }, { "epoch": 1.0898318042813455, "grad_norm": 0.2705623576027738, "learning_rate": 6.82043827555102e-06, "loss": 0.312, "num_tokens": 455682258.0, "step": 1108 }, { "epoch": 1.0902140672782874, "grad_norm": 0.300674501338249, "learning_rate": 6.815110164154137e-06, "loss": 0.3247, "num_tokens": 456099244.0, "step": 1109 }, { "epoch": 1.0905963302752293, "grad_norm": 0.2948151027329717, "learning_rate": 6.8097800361520225e-06, "loss": 0.3608, "num_tokens": 456577860.0, "step": 1110 }, { "epoch": 1.0909785932721712, "grad_norm": 0.24641383244158463, "learning_rate": 6.804447899717955e-06, "loss": 0.3091, "num_tokens": 456981107.0, "step": 1111 }, { "epoch": 1.091360856269113, "grad_norm": 0.2981663979325173, "learning_rate": 6.799113763028296e-06, "loss": 0.3387, "num_tokens": 457409025.0, "step": 1112 }, { "epoch": 1.091743119266055, "grad_norm": 0.2917083900879846, "learning_rate": 6.793777634262471e-06, "loss": 0.3274, "num_tokens": 457818304.0, "step": 1113 }, { "epoch": 1.092125382262997, "grad_norm": 0.25636005276714496, "learning_rate": 6.788439521602962e-06, "loss": 0.3078, "num_tokens": 458260532.0, "step": 1114 }, { "epoch": 1.092507645259939, "grad_norm": 0.26098346116048976, "learning_rate": 6.783099433235295e-06, "loss": 0.3236, "num_tokens": 458683374.0, "step": 1115 }, { "epoch": 1.0928899082568808, "grad_norm": 0.304653416954866, "learning_rate": 6.777757377348023e-06, "loss": 0.3491, "num_tokens": 459134797.0, "step": 1116 }, { "epoch": 1.0932721712538227, "grad_norm": 0.3043918583481912, "learning_rate": 6.772413362132716e-06, "loss": 0.313, "num_tokens": 459516780.0, "step": 1117 }, { "epoch": 1.0936544342507646, "grad_norm": 0.3482812716597782, "learning_rate": 6.76706739578395e-06, "loss": 0.3451, "num_tokens": 459987984.0, "step": 1118 }, { "epoch": 1.0940366972477065, "grad_norm": 0.3158493862761315, "learning_rate": 6.761719486499288e-06, "loss": 0.3333, "num_tokens": 460413088.0, "step": 1119 }, { "epoch": 1.0944189602446484, "grad_norm": 0.27121800419914927, "learning_rate": 6.7563696424792834e-06, "loss": 0.3131, "num_tokens": 460792247.0, "step": 1120 }, { "epoch": 1.0948012232415902, "grad_norm": 0.28843980654081725, "learning_rate": 6.751017871927445e-06, "loss": 0.3539, "num_tokens": 461215906.0, "step": 1121 }, { "epoch": 1.0951834862385321, "grad_norm": 0.31733394722713354, "learning_rate": 6.745664183050242e-06, "loss": 0.333, "num_tokens": 461633956.0, "step": 1122 }, { "epoch": 1.095565749235474, "grad_norm": 0.26863273468526633, "learning_rate": 6.7403085840570785e-06, "loss": 0.3247, "num_tokens": 462027880.0, "step": 1123 }, { "epoch": 1.095948012232416, "grad_norm": 0.27096029779507264, "learning_rate": 6.7349510831603e-06, "loss": 0.3286, "num_tokens": 462423703.0, "step": 1124 }, { "epoch": 1.0963302752293578, "grad_norm": 0.26480293378303876, "learning_rate": 6.729591688575153e-06, "loss": 0.3351, "num_tokens": 462866690.0, "step": 1125 }, { "epoch": 1.0967125382262997, "grad_norm": 0.2824549658394046, "learning_rate": 6.7242304085198e-06, "loss": 0.3464, "num_tokens": 463290417.0, "step": 1126 }, { "epoch": 1.0970948012232415, "grad_norm": 0.3008875140021378, "learning_rate": 6.718867251215289e-06, "loss": 0.3285, "num_tokens": 463729495.0, "step": 1127 }, { "epoch": 1.0974770642201834, "grad_norm": 0.31595417403811077, "learning_rate": 6.713502224885549e-06, "loss": 0.3305, "num_tokens": 464139294.0, "step": 1128 }, { "epoch": 1.0978593272171253, "grad_norm": 0.25631436510666517, "learning_rate": 6.708135337757372e-06, "loss": 0.3313, "num_tokens": 464535185.0, "step": 1129 }, { "epoch": 1.0982415902140672, "grad_norm": 0.2524615716569441, "learning_rate": 6.702766598060408e-06, "loss": 0.3278, "num_tokens": 464946225.0, "step": 1130 }, { "epoch": 1.098623853211009, "grad_norm": 0.2931682594698415, "learning_rate": 6.697396014027141e-06, "loss": 0.3292, "num_tokens": 465409298.0, "step": 1131 }, { "epoch": 1.099006116207951, "grad_norm": 0.28157768858053606, "learning_rate": 6.692023593892889e-06, "loss": 0.334, "num_tokens": 465863347.0, "step": 1132 }, { "epoch": 1.099388379204893, "grad_norm": 0.2638431557946797, "learning_rate": 6.686649345895786e-06, "loss": 0.3286, "num_tokens": 466291299.0, "step": 1133 }, { "epoch": 1.099770642201835, "grad_norm": 0.2632175472734525, "learning_rate": 6.681273278276762e-06, "loss": 0.3079, "num_tokens": 466712230.0, "step": 1134 }, { "epoch": 1.1001529051987768, "grad_norm": 0.26981361006172583, "learning_rate": 6.675895399279546e-06, "loss": 0.3198, "num_tokens": 467099380.0, "step": 1135 }, { "epoch": 1.1005351681957187, "grad_norm": 0.27521942940893157, "learning_rate": 6.670515717150636e-06, "loss": 0.3067, "num_tokens": 467509555.0, "step": 1136 }, { "epoch": 1.1009174311926606, "grad_norm": 0.2776913545044279, "learning_rate": 6.665134240139302e-06, "loss": 0.3182, "num_tokens": 467873204.0, "step": 1137 }, { "epoch": 1.1012996941896025, "grad_norm": 0.2958732079870988, "learning_rate": 6.6597509764975635e-06, "loss": 0.3169, "num_tokens": 468265708.0, "step": 1138 }, { "epoch": 1.1016819571865444, "grad_norm": 0.2822060921278334, "learning_rate": 6.654365934480177e-06, "loss": 0.3105, "num_tokens": 468659042.0, "step": 1139 }, { "epoch": 1.1020642201834863, "grad_norm": 0.2614068593879412, "learning_rate": 6.648979122344631e-06, "loss": 0.3428, "num_tokens": 469064532.0, "step": 1140 }, { "epoch": 1.1024464831804281, "grad_norm": 0.2511414073536006, "learning_rate": 6.643590548351127e-06, "loss": 0.3215, "num_tokens": 469479167.0, "step": 1141 }, { "epoch": 1.10282874617737, "grad_norm": 0.3196836181893, "learning_rate": 6.638200220762563e-06, "loss": 0.3309, "num_tokens": 469907832.0, "step": 1142 }, { "epoch": 1.103211009174312, "grad_norm": 0.33916588495808236, "learning_rate": 6.632808147844535e-06, "loss": 0.3326, "num_tokens": 470319135.0, "step": 1143 }, { "epoch": 1.1035932721712538, "grad_norm": 0.30402951280202245, "learning_rate": 6.627414337865308e-06, "loss": 0.3492, "num_tokens": 470702885.0, "step": 1144 }, { "epoch": 1.1039755351681957, "grad_norm": 0.24623565724155985, "learning_rate": 6.622018799095811e-06, "loss": 0.3329, "num_tokens": 471124528.0, "step": 1145 }, { "epoch": 1.1043577981651376, "grad_norm": 0.3273334270655994, "learning_rate": 6.616621539809629e-06, "loss": 0.3203, "num_tokens": 471537772.0, "step": 1146 }, { "epoch": 1.1047400611620795, "grad_norm": 0.29241340463709176, "learning_rate": 6.61122256828298e-06, "loss": 0.3681, "num_tokens": 471990759.0, "step": 1147 }, { "epoch": 1.1051223241590213, "grad_norm": 0.26545928082746434, "learning_rate": 6.6058218927947114e-06, "loss": 0.348, "num_tokens": 472444353.0, "step": 1148 }, { "epoch": 1.1055045871559632, "grad_norm": 0.28358846811134375, "learning_rate": 6.600419521626281e-06, "loss": 0.3367, "num_tokens": 472845224.0, "step": 1149 }, { "epoch": 1.105886850152905, "grad_norm": 0.25962161365557807, "learning_rate": 6.595015463061749e-06, "loss": 0.3245, "num_tokens": 473236299.0, "step": 1150 }, { "epoch": 1.106269113149847, "grad_norm": 0.32720067855698187, "learning_rate": 6.58960972538776e-06, "loss": 0.3565, "num_tokens": 473624719.0, "step": 1151 }, { "epoch": 1.106651376146789, "grad_norm": 0.2735487413450729, "learning_rate": 6.584202316893537e-06, "loss": 0.3302, "num_tokens": 474053855.0, "step": 1152 }, { "epoch": 1.107033639143731, "grad_norm": 0.28172345158536743, "learning_rate": 6.5787932458708595e-06, "loss": 0.3136, "num_tokens": 474442162.0, "step": 1153 }, { "epoch": 1.1074159021406729, "grad_norm": 0.25743930040201973, "learning_rate": 6.573382520614065e-06, "loss": 0.3154, "num_tokens": 474843797.0, "step": 1154 }, { "epoch": 1.1077981651376148, "grad_norm": 0.28732220121762747, "learning_rate": 6.567970149420018e-06, "loss": 0.3266, "num_tokens": 475268581.0, "step": 1155 }, { "epoch": 1.1081804281345566, "grad_norm": 0.3337674907751543, "learning_rate": 6.562556140588113e-06, "loss": 0.3551, "num_tokens": 475697773.0, "step": 1156 }, { "epoch": 1.1085626911314985, "grad_norm": 0.2569855269694447, "learning_rate": 6.5571405024202554e-06, "loss": 0.3449, "num_tokens": 476100449.0, "step": 1157 }, { "epoch": 1.1089449541284404, "grad_norm": 0.26382197825690373, "learning_rate": 6.551723243220847e-06, "loss": 0.3236, "num_tokens": 476502944.0, "step": 1158 }, { "epoch": 1.1093272171253823, "grad_norm": 0.2715214893643109, "learning_rate": 6.546304371296775e-06, "loss": 0.3397, "num_tokens": 476883054.0, "step": 1159 }, { "epoch": 1.1097094801223242, "grad_norm": 0.33546685330152093, "learning_rate": 6.540883894957403e-06, "loss": 0.3565, "num_tokens": 477316845.0, "step": 1160 }, { "epoch": 1.110091743119266, "grad_norm": 0.28206080477550555, "learning_rate": 6.535461822514551e-06, "loss": 0.3346, "num_tokens": 477760415.0, "step": 1161 }, { "epoch": 1.110474006116208, "grad_norm": 0.2862144693541876, "learning_rate": 6.530038162282488e-06, "loss": 0.3123, "num_tokens": 478156950.0, "step": 1162 }, { "epoch": 1.1108562691131498, "grad_norm": 0.2679955889780286, "learning_rate": 6.524612922577917e-06, "loss": 0.3231, "num_tokens": 478538502.0, "step": 1163 }, { "epoch": 1.1112385321100917, "grad_norm": 0.3003856727392727, "learning_rate": 6.519186111719967e-06, "loss": 0.35, "num_tokens": 478967489.0, "step": 1164 }, { "epoch": 1.1116207951070336, "grad_norm": 0.3115099084544135, "learning_rate": 6.51375773803017e-06, "loss": 0.3347, "num_tokens": 479375795.0, "step": 1165 }, { "epoch": 1.1120030581039755, "grad_norm": 0.295069804868219, "learning_rate": 6.508327809832457e-06, "loss": 0.3362, "num_tokens": 479749512.0, "step": 1166 }, { "epoch": 1.1123853211009174, "grad_norm": 0.290660739471113, "learning_rate": 6.502896335453144e-06, "loss": 0.3464, "num_tokens": 480139338.0, "step": 1167 }, { "epoch": 1.1127675840978593, "grad_norm": 0.29644139281228193, "learning_rate": 6.497463323220917e-06, "loss": 0.3165, "num_tokens": 480531066.0, "step": 1168 }, { "epoch": 1.1131498470948011, "grad_norm": 0.3002763380856821, "learning_rate": 6.492028781466822e-06, "loss": 0.3184, "num_tokens": 480963162.0, "step": 1169 }, { "epoch": 1.113532110091743, "grad_norm": 0.26656736267127934, "learning_rate": 6.486592718524245e-06, "loss": 0.3289, "num_tokens": 481389069.0, "step": 1170 }, { "epoch": 1.1139143730886851, "grad_norm": 0.2846814708224225, "learning_rate": 6.48115514272891e-06, "loss": 0.3193, "num_tokens": 481738025.0, "step": 1171 }, { "epoch": 1.114296636085627, "grad_norm": 0.2755750639720911, "learning_rate": 6.475716062418861e-06, "loss": 0.3548, "num_tokens": 482176058.0, "step": 1172 }, { "epoch": 1.114678899082569, "grad_norm": 0.2854278914842033, "learning_rate": 6.470275485934443e-06, "loss": 0.3165, "num_tokens": 482559384.0, "step": 1173 }, { "epoch": 1.1150611620795108, "grad_norm": 0.30221699996712104, "learning_rate": 6.464833421618303e-06, "loss": 0.3201, "num_tokens": 482947421.0, "step": 1174 }, { "epoch": 1.1154434250764527, "grad_norm": 0.28618496860417286, "learning_rate": 6.459389877815364e-06, "loss": 0.3382, "num_tokens": 483338931.0, "step": 1175 }, { "epoch": 1.1158256880733946, "grad_norm": 0.23906740324076617, "learning_rate": 6.4539448628728205e-06, "loss": 0.3151, "num_tokens": 483777927.0, "step": 1176 }, { "epoch": 1.1162079510703364, "grad_norm": 0.2905278554547755, "learning_rate": 6.448498385140119e-06, "loss": 0.326, "num_tokens": 484189781.0, "step": 1177 }, { "epoch": 1.1165902140672783, "grad_norm": 0.29751545395143897, "learning_rate": 6.443050452968955e-06, "loss": 0.3181, "num_tokens": 484576775.0, "step": 1178 }, { "epoch": 1.1169724770642202, "grad_norm": 0.3107554427590532, "learning_rate": 6.437601074713249e-06, "loss": 0.3248, "num_tokens": 484967203.0, "step": 1179 }, { "epoch": 1.117354740061162, "grad_norm": 0.29491794915446384, "learning_rate": 6.432150258729142e-06, "loss": 0.3437, "num_tokens": 485360750.0, "step": 1180 }, { "epoch": 1.117737003058104, "grad_norm": 0.28962619219467095, "learning_rate": 6.426698013374979e-06, "loss": 0.3209, "num_tokens": 485786533.0, "step": 1181 }, { "epoch": 1.1181192660550459, "grad_norm": 0.30000894300982334, "learning_rate": 6.4212443470112965e-06, "loss": 0.3387, "num_tokens": 486221902.0, "step": 1182 }, { "epoch": 1.1185015290519877, "grad_norm": 0.30447255271836693, "learning_rate": 6.415789268000809e-06, "loss": 0.3218, "num_tokens": 486601322.0, "step": 1183 }, { "epoch": 1.1188837920489296, "grad_norm": 0.3352709408941912, "learning_rate": 6.4103327847084e-06, "loss": 0.3266, "num_tokens": 486991201.0, "step": 1184 }, { "epoch": 1.1192660550458715, "grad_norm": 0.2787395935714357, "learning_rate": 6.404874905501103e-06, "loss": 0.3323, "num_tokens": 487399086.0, "step": 1185 }, { "epoch": 1.1196483180428134, "grad_norm": 0.29387062799654673, "learning_rate": 6.399415638748093e-06, "loss": 0.3286, "num_tokens": 487843089.0, "step": 1186 }, { "epoch": 1.1200305810397553, "grad_norm": 0.2965124138924178, "learning_rate": 6.393954992820674e-06, "loss": 0.32, "num_tokens": 488220593.0, "step": 1187 }, { "epoch": 1.1204128440366972, "grad_norm": 0.26581801867197025, "learning_rate": 6.388492976092262e-06, "loss": 0.327, "num_tokens": 488599815.0, "step": 1188 }, { "epoch": 1.120795107033639, "grad_norm": 0.28016071705077283, "learning_rate": 6.383029596938381e-06, "loss": 0.3257, "num_tokens": 488998034.0, "step": 1189 }, { "epoch": 1.1211773700305812, "grad_norm": 0.3062308700634103, "learning_rate": 6.377564863736638e-06, "loss": 0.362, "num_tokens": 489448208.0, "step": 1190 }, { "epoch": 1.121559633027523, "grad_norm": 0.33560211872366813, "learning_rate": 6.372098784866719e-06, "loss": 0.356, "num_tokens": 489910540.0, "step": 1191 }, { "epoch": 1.121941896024465, "grad_norm": 0.3043991400671112, "learning_rate": 6.366631368710372e-06, "loss": 0.3126, "num_tokens": 490327252.0, "step": 1192 }, { "epoch": 1.1223241590214068, "grad_norm": 0.26848870857564255, "learning_rate": 6.361162623651398e-06, "loss": 0.3161, "num_tokens": 490713514.0, "step": 1193 }, { "epoch": 1.1227064220183487, "grad_norm": 0.332462798612261, "learning_rate": 6.355692558075633e-06, "loss": 0.3465, "num_tokens": 491081879.0, "step": 1194 }, { "epoch": 1.1230886850152906, "grad_norm": 0.4022524205956525, "learning_rate": 6.35022118037094e-06, "loss": 0.3558, "num_tokens": 491487306.0, "step": 1195 }, { "epoch": 1.1234709480122325, "grad_norm": 0.34500842554827216, "learning_rate": 6.344748498927193e-06, "loss": 0.3291, "num_tokens": 491908450.0, "step": 1196 }, { "epoch": 1.1238532110091743, "grad_norm": 0.25902689651334515, "learning_rate": 6.339274522136264e-06, "loss": 0.3015, "num_tokens": 492278381.0, "step": 1197 }, { "epoch": 1.1242354740061162, "grad_norm": 0.3104938393351937, "learning_rate": 6.333799258392015e-06, "loss": 0.3641, "num_tokens": 492727969.0, "step": 1198 }, { "epoch": 1.1246177370030581, "grad_norm": 0.3525594222887963, "learning_rate": 6.328322716090279e-06, "loss": 0.3351, "num_tokens": 493183862.0, "step": 1199 }, { "epoch": 1.125, "grad_norm": 0.3017488463769553, "learning_rate": 6.322844903628849e-06, "loss": 0.3506, "num_tokens": 493640410.0, "step": 1200 }, { "epoch": 1.1253822629969419, "grad_norm": 0.3107033247060044, "learning_rate": 6.317365829407465e-06, "loss": 0.3414, "num_tokens": 494034007.0, "step": 1201 }, { "epoch": 1.1257645259938838, "grad_norm": 0.2751720787164906, "learning_rate": 6.311885501827805e-06, "loss": 0.332, "num_tokens": 494448450.0, "step": 1202 }, { "epoch": 1.1261467889908257, "grad_norm": 0.29810723432497416, "learning_rate": 6.306403929293466e-06, "loss": 0.3308, "num_tokens": 494870141.0, "step": 1203 }, { "epoch": 1.1265290519877675, "grad_norm": 0.3299972544888789, "learning_rate": 6.300921120209956e-06, "loss": 0.335, "num_tokens": 495259276.0, "step": 1204 }, { "epoch": 1.1269113149847094, "grad_norm": 0.27606262733857034, "learning_rate": 6.29543708298468e-06, "loss": 0.3314, "num_tokens": 495699948.0, "step": 1205 }, { "epoch": 1.1272935779816513, "grad_norm": 0.2941016109721722, "learning_rate": 6.289951826026921e-06, "loss": 0.3156, "num_tokens": 496110554.0, "step": 1206 }, { "epoch": 1.1276758409785932, "grad_norm": 0.26876778526539413, "learning_rate": 6.284465357747839e-06, "loss": 0.3151, "num_tokens": 496521050.0, "step": 1207 }, { "epoch": 1.128058103975535, "grad_norm": 0.24321100529660128, "learning_rate": 6.278977686560445e-06, "loss": 0.3236, "num_tokens": 496922784.0, "step": 1208 }, { "epoch": 1.1284403669724772, "grad_norm": 0.31176559575102497, "learning_rate": 6.2734888208796e-06, "loss": 0.3337, "num_tokens": 497352315.0, "step": 1209 }, { "epoch": 1.1288226299694188, "grad_norm": 0.2984231143503115, "learning_rate": 6.267998769121995e-06, "loss": 0.3397, "num_tokens": 497766706.0, "step": 1210 }, { "epoch": 1.129204892966361, "grad_norm": 0.2700500572485866, "learning_rate": 6.262507539706138e-06, "loss": 0.3214, "num_tokens": 498187679.0, "step": 1211 }, { "epoch": 1.1295871559633028, "grad_norm": 0.2675612219059672, "learning_rate": 6.2570151410523426e-06, "loss": 0.3371, "num_tokens": 498594599.0, "step": 1212 }, { "epoch": 1.1299694189602447, "grad_norm": 0.2722378509521892, "learning_rate": 6.251521581582721e-06, "loss": 0.3551, "num_tokens": 499022040.0, "step": 1213 }, { "epoch": 1.1303516819571866, "grad_norm": 0.2838951082607539, "learning_rate": 6.246026869721159e-06, "loss": 0.3361, "num_tokens": 499406498.0, "step": 1214 }, { "epoch": 1.1307339449541285, "grad_norm": 0.2884680929780221, "learning_rate": 6.240531013893311e-06, "loss": 0.3484, "num_tokens": 499850767.0, "step": 1215 }, { "epoch": 1.1311162079510704, "grad_norm": 0.28582235021244295, "learning_rate": 6.235034022526587e-06, "loss": 0.3188, "num_tokens": 500277474.0, "step": 1216 }, { "epoch": 1.1314984709480123, "grad_norm": 0.2679533425163113, "learning_rate": 6.229535904050137e-06, "loss": 0.3196, "num_tokens": 500707867.0, "step": 1217 }, { "epoch": 1.1318807339449541, "grad_norm": 0.26615841636781934, "learning_rate": 6.22403666689484e-06, "loss": 0.3412, "num_tokens": 501119645.0, "step": 1218 }, { "epoch": 1.132262996941896, "grad_norm": 0.2593678841037097, "learning_rate": 6.2185363194932925e-06, "loss": 0.3187, "num_tokens": 501489226.0, "step": 1219 }, { "epoch": 1.132645259938838, "grad_norm": 0.2682673518599545, "learning_rate": 6.213034870279789e-06, "loss": 0.3175, "num_tokens": 501847939.0, "step": 1220 }, { "epoch": 1.1330275229357798, "grad_norm": 0.2948442974358067, "learning_rate": 6.207532327690314e-06, "loss": 0.3371, "num_tokens": 502244759.0, "step": 1221 }, { "epoch": 1.1334097859327217, "grad_norm": 0.30076076578492916, "learning_rate": 6.202028700162534e-06, "loss": 0.3425, "num_tokens": 502644864.0, "step": 1222 }, { "epoch": 1.1337920489296636, "grad_norm": 0.28180246181164414, "learning_rate": 6.196523996135774e-06, "loss": 0.3324, "num_tokens": 503056854.0, "step": 1223 }, { "epoch": 1.1341743119266054, "grad_norm": 0.22229708505087292, "learning_rate": 6.191018224051011e-06, "loss": 0.3241, "num_tokens": 503505198.0, "step": 1224 }, { "epoch": 1.1345565749235473, "grad_norm": 0.292999787898749, "learning_rate": 6.185511392350861e-06, "loss": 0.3322, "num_tokens": 503902845.0, "step": 1225 }, { "epoch": 1.1349388379204892, "grad_norm": 0.2928001850570755, "learning_rate": 6.180003509479563e-06, "loss": 0.333, "num_tokens": 504295122.0, "step": 1226 }, { "epoch": 1.135321100917431, "grad_norm": 0.29828780462984006, "learning_rate": 6.174494583882969e-06, "loss": 0.3583, "num_tokens": 504762521.0, "step": 1227 }, { "epoch": 1.1357033639143732, "grad_norm": 0.3019800751996685, "learning_rate": 6.168984624008527e-06, "loss": 0.3319, "num_tokens": 505166097.0, "step": 1228 }, { "epoch": 1.1360856269113149, "grad_norm": 0.28631076928255944, "learning_rate": 6.163473638305278e-06, "loss": 0.3373, "num_tokens": 505614429.0, "step": 1229 }, { "epoch": 1.136467889908257, "grad_norm": 0.28527484407587345, "learning_rate": 6.157961635223829e-06, "loss": 0.3202, "num_tokens": 505998145.0, "step": 1230 }, { "epoch": 1.1368501529051989, "grad_norm": 0.29458881015369165, "learning_rate": 6.152448623216351e-06, "loss": 0.3357, "num_tokens": 506403686.0, "step": 1231 }, { "epoch": 1.1372324159021407, "grad_norm": 0.31226283170569014, "learning_rate": 6.146934610736559e-06, "loss": 0.34, "num_tokens": 506770460.0, "step": 1232 }, { "epoch": 1.1376146788990826, "grad_norm": 0.28916373891386943, "learning_rate": 6.141419606239706e-06, "loss": 0.3238, "num_tokens": 507173256.0, "step": 1233 }, { "epoch": 1.1379969418960245, "grad_norm": 0.30002659299961043, "learning_rate": 6.135903618182563e-06, "loss": 0.3467, "num_tokens": 507592252.0, "step": 1234 }, { "epoch": 1.1383792048929664, "grad_norm": 0.2823773806154856, "learning_rate": 6.1303866550234105e-06, "loss": 0.3344, "num_tokens": 507972759.0, "step": 1235 }, { "epoch": 1.1387614678899083, "grad_norm": 0.29375630237253025, "learning_rate": 6.124868725222022e-06, "loss": 0.3234, "num_tokens": 508341201.0, "step": 1236 }, { "epoch": 1.1391437308868502, "grad_norm": 0.24724330360980634, "learning_rate": 6.11934983723966e-06, "loss": 0.3061, "num_tokens": 508724695.0, "step": 1237 }, { "epoch": 1.139525993883792, "grad_norm": 0.322882724078703, "learning_rate": 6.1138299995390474e-06, "loss": 0.3306, "num_tokens": 509164940.0, "step": 1238 }, { "epoch": 1.139908256880734, "grad_norm": 0.31655016049580065, "learning_rate": 6.108309220584368e-06, "loss": 0.335, "num_tokens": 509544798.0, "step": 1239 }, { "epoch": 1.1402905198776758, "grad_norm": 0.2865019074970294, "learning_rate": 6.102787508841249e-06, "loss": 0.3489, "num_tokens": 509945630.0, "step": 1240 }, { "epoch": 1.1406727828746177, "grad_norm": 0.2702364920607733, "learning_rate": 6.097264872776749e-06, "loss": 0.3334, "num_tokens": 510384844.0, "step": 1241 }, { "epoch": 1.1410550458715596, "grad_norm": 0.3010056332952182, "learning_rate": 6.091741320859342e-06, "loss": 0.3412, "num_tokens": 510793400.0, "step": 1242 }, { "epoch": 1.1414373088685015, "grad_norm": 0.2926277386916459, "learning_rate": 6.086216861558906e-06, "loss": 0.3199, "num_tokens": 511184572.0, "step": 1243 }, { "epoch": 1.1418195718654434, "grad_norm": 0.2909789761519127, "learning_rate": 6.0806915033467095e-06, "loss": 0.3373, "num_tokens": 511582034.0, "step": 1244 }, { "epoch": 1.1422018348623852, "grad_norm": 0.27999556858106905, "learning_rate": 6.075165254695404e-06, "loss": 0.3191, "num_tokens": 511954026.0, "step": 1245 }, { "epoch": 1.1425840978593271, "grad_norm": 0.2721584403384464, "learning_rate": 6.069638124079004e-06, "loss": 0.3507, "num_tokens": 512403908.0, "step": 1246 }, { "epoch": 1.1429663608562692, "grad_norm": 0.27835700256202, "learning_rate": 6.0641101199728725e-06, "loss": 0.3625, "num_tokens": 512840328.0, "step": 1247 }, { "epoch": 1.143348623853211, "grad_norm": 0.29029193418958227, "learning_rate": 6.058581250853718e-06, "loss": 0.3496, "num_tokens": 513255198.0, "step": 1248 }, { "epoch": 1.143730886850153, "grad_norm": 0.27777757438066064, "learning_rate": 6.05305152519957e-06, "loss": 0.3256, "num_tokens": 513666037.0, "step": 1249 }, { "epoch": 1.144113149847095, "grad_norm": 0.27010859494181105, "learning_rate": 6.047520951489777e-06, "loss": 0.3149, "num_tokens": 514049229.0, "step": 1250 }, { "epoch": 1.1444954128440368, "grad_norm": 0.27674372396782687, "learning_rate": 6.041989538204985e-06, "loss": 0.3217, "num_tokens": 514442087.0, "step": 1251 }, { "epoch": 1.1448776758409787, "grad_norm": 0.2628172888278004, "learning_rate": 6.036457293827127e-06, "loss": 0.3405, "num_tokens": 514879611.0, "step": 1252 }, { "epoch": 1.1452599388379205, "grad_norm": 0.2619591647094363, "learning_rate": 6.030924226839409e-06, "loss": 0.3402, "num_tokens": 515283517.0, "step": 1253 }, { "epoch": 1.1456422018348624, "grad_norm": 0.2818768513418224, "learning_rate": 6.025390345726303e-06, "loss": 0.3393, "num_tokens": 515685600.0, "step": 1254 }, { "epoch": 1.1460244648318043, "grad_norm": 0.2678219454813258, "learning_rate": 6.019855658973526e-06, "loss": 0.3203, "num_tokens": 516129302.0, "step": 1255 }, { "epoch": 1.1464067278287462, "grad_norm": 0.2797872167750161, "learning_rate": 6.014320175068029e-06, "loss": 0.3244, "num_tokens": 516508852.0, "step": 1256 }, { "epoch": 1.146788990825688, "grad_norm": 0.2696525313413257, "learning_rate": 6.008783902497991e-06, "loss": 0.3351, "num_tokens": 516958342.0, "step": 1257 }, { "epoch": 1.14717125382263, "grad_norm": 0.27308702191417084, "learning_rate": 6.003246849752795e-06, "loss": 0.3182, "num_tokens": 517408834.0, "step": 1258 }, { "epoch": 1.1475535168195719, "grad_norm": 0.26908608253097527, "learning_rate": 5.997709025323022e-06, "loss": 0.3483, "num_tokens": 517830615.0, "step": 1259 }, { "epoch": 1.1479357798165137, "grad_norm": 0.29413778426744663, "learning_rate": 5.992170437700436e-06, "loss": 0.3225, "num_tokens": 518213873.0, "step": 1260 }, { "epoch": 1.1483180428134556, "grad_norm": 0.29358445603734934, "learning_rate": 5.986631095377973e-06, "loss": 0.3582, "num_tokens": 518653456.0, "step": 1261 }, { "epoch": 1.1487003058103975, "grad_norm": 0.24719658419942464, "learning_rate": 5.981091006849723e-06, "loss": 0.3111, "num_tokens": 519036638.0, "step": 1262 }, { "epoch": 1.1490825688073394, "grad_norm": 0.26242070194244943, "learning_rate": 5.975550180610924e-06, "loss": 0.3304, "num_tokens": 519450315.0, "step": 1263 }, { "epoch": 1.1494648318042813, "grad_norm": 0.26895030510807805, "learning_rate": 5.970008625157943e-06, "loss": 0.3321, "num_tokens": 519886506.0, "step": 1264 }, { "epoch": 1.1498470948012232, "grad_norm": 0.3345310359045355, "learning_rate": 5.964466348988265e-06, "loss": 0.3603, "num_tokens": 520266247.0, "step": 1265 }, { "epoch": 1.150229357798165, "grad_norm": 0.29990485369656844, "learning_rate": 5.958923360600483e-06, "loss": 0.3399, "num_tokens": 520700245.0, "step": 1266 }, { "epoch": 1.150611620795107, "grad_norm": 0.2832255897139781, "learning_rate": 5.953379668494277e-06, "loss": 0.3622, "num_tokens": 521107093.0, "step": 1267 }, { "epoch": 1.150993883792049, "grad_norm": 0.2698577207990674, "learning_rate": 5.947835281170411e-06, "loss": 0.3415, "num_tokens": 521514124.0, "step": 1268 }, { "epoch": 1.151376146788991, "grad_norm": 0.3080503410584079, "learning_rate": 5.942290207130711e-06, "loss": 0.3338, "num_tokens": 521898134.0, "step": 1269 }, { "epoch": 1.1517584097859328, "grad_norm": 0.2805786961709091, "learning_rate": 5.9367444548780606e-06, "loss": 0.3231, "num_tokens": 522324838.0, "step": 1270 }, { "epoch": 1.1521406727828747, "grad_norm": 0.2744235940281009, "learning_rate": 5.931198032916378e-06, "loss": 0.3472, "num_tokens": 522776919.0, "step": 1271 }, { "epoch": 1.1525229357798166, "grad_norm": 0.29146604746825633, "learning_rate": 5.925650949750614e-06, "loss": 0.3585, "num_tokens": 523162888.0, "step": 1272 }, { "epoch": 1.1529051987767585, "grad_norm": 0.2822742713670998, "learning_rate": 5.920103213886731e-06, "loss": 0.3493, "num_tokens": 523603382.0, "step": 1273 }, { "epoch": 1.1532874617737003, "grad_norm": 0.2785104838460252, "learning_rate": 5.914554833831688e-06, "loss": 0.3607, "num_tokens": 524053934.0, "step": 1274 }, { "epoch": 1.1536697247706422, "grad_norm": 0.24806833548658463, "learning_rate": 5.909005818093438e-06, "loss": 0.316, "num_tokens": 524474054.0, "step": 1275 }, { "epoch": 1.154051987767584, "grad_norm": 0.2804731172270925, "learning_rate": 5.903456175180906e-06, "loss": 0.3604, "num_tokens": 524914455.0, "step": 1276 }, { "epoch": 1.154434250764526, "grad_norm": 0.3070257737490006, "learning_rate": 5.897905913603981e-06, "loss": 0.361, "num_tokens": 525326957.0, "step": 1277 }, { "epoch": 1.1548165137614679, "grad_norm": 0.31371418974737814, "learning_rate": 5.892355041873495e-06, "loss": 0.3315, "num_tokens": 525709293.0, "step": 1278 }, { "epoch": 1.1551987767584098, "grad_norm": 0.2716223655433006, "learning_rate": 5.88680356850122e-06, "loss": 0.3427, "num_tokens": 526098562.0, "step": 1279 }, { "epoch": 1.1555810397553516, "grad_norm": 0.3030769375277236, "learning_rate": 5.881251501999852e-06, "loss": 0.3442, "num_tokens": 526499312.0, "step": 1280 }, { "epoch": 1.1559633027522935, "grad_norm": 0.2963042171712443, "learning_rate": 5.875698850882994e-06, "loss": 0.3397, "num_tokens": 526923109.0, "step": 1281 }, { "epoch": 1.1563455657492354, "grad_norm": 0.3071982430267412, "learning_rate": 5.870145623665144e-06, "loss": 0.345, "num_tokens": 527335761.0, "step": 1282 }, { "epoch": 1.1567278287461773, "grad_norm": 0.29866114157295515, "learning_rate": 5.864591828861687e-06, "loss": 0.3201, "num_tokens": 527734981.0, "step": 1283 }, { "epoch": 1.1571100917431192, "grad_norm": 0.26266095162245895, "learning_rate": 5.859037474988875e-06, "loss": 0.3437, "num_tokens": 528180897.0, "step": 1284 }, { "epoch": 1.157492354740061, "grad_norm": 0.25819020802003484, "learning_rate": 5.85348257056382e-06, "loss": 0.3126, "num_tokens": 528550308.0, "step": 1285 }, { "epoch": 1.157874617737003, "grad_norm": 0.3043815202372134, "learning_rate": 5.8479271241044765e-06, "loss": 0.3241, "num_tokens": 528956031.0, "step": 1286 }, { "epoch": 1.158256880733945, "grad_norm": 0.27472679986267445, "learning_rate": 5.842371144129635e-06, "loss": 0.3278, "num_tokens": 529366228.0, "step": 1287 }, { "epoch": 1.158639143730887, "grad_norm": 0.2837054028386318, "learning_rate": 5.836814639158892e-06, "loss": 0.3566, "num_tokens": 529787450.0, "step": 1288 }, { "epoch": 1.1590214067278288, "grad_norm": 0.2589050147121643, "learning_rate": 5.831257617712663e-06, "loss": 0.3357, "num_tokens": 530216531.0, "step": 1289 }, { "epoch": 1.1594036697247707, "grad_norm": 0.2827656661346502, "learning_rate": 5.825700088312146e-06, "loss": 0.3378, "num_tokens": 530672941.0, "step": 1290 }, { "epoch": 1.1597859327217126, "grad_norm": 0.28634448435235477, "learning_rate": 5.820142059479325e-06, "loss": 0.3427, "num_tokens": 531103045.0, "step": 1291 }, { "epoch": 1.1601681957186545, "grad_norm": 0.286830346220727, "learning_rate": 5.814583539736941e-06, "loss": 0.3378, "num_tokens": 531487162.0, "step": 1292 }, { "epoch": 1.1605504587155964, "grad_norm": 0.29849931105834676, "learning_rate": 5.809024537608497e-06, "loss": 0.3347, "num_tokens": 531894849.0, "step": 1293 }, { "epoch": 1.1609327217125383, "grad_norm": 0.28084740509433626, "learning_rate": 5.80346506161823e-06, "loss": 0.3578, "num_tokens": 532299329.0, "step": 1294 }, { "epoch": 1.1613149847094801, "grad_norm": 0.23045410370720337, "learning_rate": 5.797905120291105e-06, "loss": 0.3191, "num_tokens": 532706908.0, "step": 1295 }, { "epoch": 1.161697247706422, "grad_norm": 0.2773975644863712, "learning_rate": 5.792344722152802e-06, "loss": 0.3337, "num_tokens": 533112769.0, "step": 1296 }, { "epoch": 1.162079510703364, "grad_norm": 0.28383362137923096, "learning_rate": 5.786783875729698e-06, "loss": 0.3344, "num_tokens": 533540197.0, "step": 1297 }, { "epoch": 1.1624617737003058, "grad_norm": 0.30083915244172127, "learning_rate": 5.7812225895488624e-06, "loss": 0.3467, "num_tokens": 533919313.0, "step": 1298 }, { "epoch": 1.1628440366972477, "grad_norm": 0.24300447493260083, "learning_rate": 5.775660872138035e-06, "loss": 0.3186, "num_tokens": 534309085.0, "step": 1299 }, { "epoch": 1.1632262996941896, "grad_norm": 0.28124353650839806, "learning_rate": 5.770098732025616e-06, "loss": 0.3309, "num_tokens": 534736114.0, "step": 1300 }, { "epoch": 1.1636085626911314, "grad_norm": 0.34338160046999916, "learning_rate": 5.764536177740658e-06, "loss": 0.3251, "num_tokens": 535154091.0, "step": 1301 }, { "epoch": 1.1639908256880733, "grad_norm": 0.2777355796087579, "learning_rate": 5.758973217812847e-06, "loss": 0.3559, "num_tokens": 535548923.0, "step": 1302 }, { "epoch": 1.1643730886850152, "grad_norm": 0.2517192897385153, "learning_rate": 5.7534098607724886e-06, "loss": 0.3415, "num_tokens": 535984687.0, "step": 1303 }, { "epoch": 1.164755351681957, "grad_norm": 0.27387034825726725, "learning_rate": 5.747846115150501e-06, "loss": 0.3309, "num_tokens": 536380850.0, "step": 1304 }, { "epoch": 1.165137614678899, "grad_norm": 0.2796362784976631, "learning_rate": 5.742281989478396e-06, "loss": 0.3072, "num_tokens": 536787401.0, "step": 1305 }, { "epoch": 1.165519877675841, "grad_norm": 0.24184125498369388, "learning_rate": 5.736717492288265e-06, "loss": 0.3335, "num_tokens": 537218169.0, "step": 1306 }, { "epoch": 1.165902140672783, "grad_norm": 0.25232621546366074, "learning_rate": 5.731152632112779e-06, "loss": 0.3355, "num_tokens": 537636666.0, "step": 1307 }, { "epoch": 1.1662844036697249, "grad_norm": 0.29465033041218436, "learning_rate": 5.725587417485157e-06, "loss": 0.3304, "num_tokens": 538013901.0, "step": 1308 }, { "epoch": 1.1666666666666667, "grad_norm": 0.309041606773954, "learning_rate": 5.720021856939162e-06, "loss": 0.3605, "num_tokens": 538432685.0, "step": 1309 }, { "epoch": 1.1670489296636086, "grad_norm": 0.314272404698483, "learning_rate": 5.714455959009091e-06, "loss": 0.3324, "num_tokens": 538825811.0, "step": 1310 }, { "epoch": 1.1674311926605505, "grad_norm": 0.2614159710867707, "learning_rate": 5.708889732229756e-06, "loss": 0.3409, "num_tokens": 539243648.0, "step": 1311 }, { "epoch": 1.1678134556574924, "grad_norm": 0.2443024470663165, "learning_rate": 5.7033231851364755e-06, "loss": 0.3634, "num_tokens": 539693841.0, "step": 1312 }, { "epoch": 1.1681957186544343, "grad_norm": 0.26154104551841184, "learning_rate": 5.6977563262650545e-06, "loss": 0.3154, "num_tokens": 540060479.0, "step": 1313 }, { "epoch": 1.1685779816513762, "grad_norm": 0.2529838628475418, "learning_rate": 5.692189164151783e-06, "loss": 0.3113, "num_tokens": 540452979.0, "step": 1314 }, { "epoch": 1.168960244648318, "grad_norm": 0.25669393135790824, "learning_rate": 5.686621707333407e-06, "loss": 0.3367, "num_tokens": 540887841.0, "step": 1315 }, { "epoch": 1.16934250764526, "grad_norm": 0.2812699969856882, "learning_rate": 5.681053964347136e-06, "loss": 0.3303, "num_tokens": 541331033.0, "step": 1316 }, { "epoch": 1.1697247706422018, "grad_norm": 0.2535974149901525, "learning_rate": 5.675485943730606e-06, "loss": 0.3609, "num_tokens": 541777222.0, "step": 1317 }, { "epoch": 1.1701070336391437, "grad_norm": 0.25840631807611825, "learning_rate": 5.669917654021891e-06, "loss": 0.327, "num_tokens": 542222005.0, "step": 1318 }, { "epoch": 1.1704892966360856, "grad_norm": 0.2697724432843473, "learning_rate": 5.664349103759467e-06, "loss": 0.3489, "num_tokens": 542631901.0, "step": 1319 }, { "epoch": 1.1708715596330275, "grad_norm": 0.27133888701747094, "learning_rate": 5.658780301482212e-06, "loss": 0.3422, "num_tokens": 543060829.0, "step": 1320 }, { "epoch": 1.1712538226299694, "grad_norm": 0.25878320630111323, "learning_rate": 5.653211255729396e-06, "loss": 0.3551, "num_tokens": 543517650.0, "step": 1321 }, { "epoch": 1.1716360856269112, "grad_norm": 0.26819280364944253, "learning_rate": 5.647641975040656e-06, "loss": 0.3424, "num_tokens": 543899754.0, "step": 1322 }, { "epoch": 1.1720183486238531, "grad_norm": 0.2534303978781987, "learning_rate": 5.6420724679559935e-06, "loss": 0.3221, "num_tokens": 544322959.0, "step": 1323 }, { "epoch": 1.172400611620795, "grad_norm": 0.2639307873962133, "learning_rate": 5.6365027430157544e-06, "loss": 0.3526, "num_tokens": 544716184.0, "step": 1324 }, { "epoch": 1.1727828746177371, "grad_norm": 0.30988974363217614, "learning_rate": 5.630932808760622e-06, "loss": 0.3328, "num_tokens": 545141369.0, "step": 1325 }, { "epoch": 1.1731651376146788, "grad_norm": 0.2818917196130082, "learning_rate": 5.625362673731597e-06, "loss": 0.3465, "num_tokens": 545538147.0, "step": 1326 }, { "epoch": 1.1735474006116209, "grad_norm": 0.3410974850927972, "learning_rate": 5.619792346469988e-06, "loss": 0.3768, "num_tokens": 545945638.0, "step": 1327 }, { "epoch": 1.1739296636085628, "grad_norm": 0.27402305325533044, "learning_rate": 5.614221835517401e-06, "loss": 0.3435, "num_tokens": 546371184.0, "step": 1328 }, { "epoch": 1.1743119266055047, "grad_norm": 0.28880305451530414, "learning_rate": 5.60865114941572e-06, "loss": 0.3204, "num_tokens": 546713971.0, "step": 1329 }, { "epoch": 1.1746941896024465, "grad_norm": 0.3228827000346965, "learning_rate": 5.603080296707104e-06, "loss": 0.3582, "num_tokens": 547161502.0, "step": 1330 }, { "epoch": 1.1750764525993884, "grad_norm": 0.30142371376868743, "learning_rate": 5.5975092859339604e-06, "loss": 0.331, "num_tokens": 547578265.0, "step": 1331 }, { "epoch": 1.1754587155963303, "grad_norm": 0.26576757402878626, "learning_rate": 5.591938125638941e-06, "loss": 0.363, "num_tokens": 548016613.0, "step": 1332 }, { "epoch": 1.1758409785932722, "grad_norm": 0.24641365074347962, "learning_rate": 5.586366824364933e-06, "loss": 0.3676, "num_tokens": 548479579.0, "step": 1333 }, { "epoch": 1.176223241590214, "grad_norm": 0.25605426029897516, "learning_rate": 5.5807953906550305e-06, "loss": 0.3435, "num_tokens": 548921996.0, "step": 1334 }, { "epoch": 1.176605504587156, "grad_norm": 0.2728580609167911, "learning_rate": 5.575223833052535e-06, "loss": 0.3086, "num_tokens": 549289550.0, "step": 1335 }, { "epoch": 1.1769877675840978, "grad_norm": 0.256797464366231, "learning_rate": 5.569652160100938e-06, "loss": 0.335, "num_tokens": 549693863.0, "step": 1336 }, { "epoch": 1.1773700305810397, "grad_norm": 0.2865218483460503, "learning_rate": 5.564080380343908e-06, "loss": 0.3553, "num_tokens": 550094835.0, "step": 1337 }, { "epoch": 1.1777522935779816, "grad_norm": 0.2620523294805251, "learning_rate": 5.5585085023252775e-06, "loss": 0.3613, "num_tokens": 550533990.0, "step": 1338 }, { "epoch": 1.1781345565749235, "grad_norm": 0.2620009970726739, "learning_rate": 5.552936534589029e-06, "loss": 0.3087, "num_tokens": 550859026.0, "step": 1339 }, { "epoch": 1.1785168195718654, "grad_norm": 0.27004059994447555, "learning_rate": 5.54736448567928e-06, "loss": 0.3151, "num_tokens": 551252088.0, "step": 1340 }, { "epoch": 1.1788990825688073, "grad_norm": 0.25920432137446164, "learning_rate": 5.5417923641402795e-06, "loss": 0.3578, "num_tokens": 551676918.0, "step": 1341 }, { "epoch": 1.1792813455657492, "grad_norm": 0.273106799089088, "learning_rate": 5.536220178516381e-06, "loss": 0.3197, "num_tokens": 552043959.0, "step": 1342 }, { "epoch": 1.179663608562691, "grad_norm": 0.25208398023175504, "learning_rate": 5.5306479373520385e-06, "loss": 0.3636, "num_tokens": 552499152.0, "step": 1343 }, { "epoch": 1.1800458715596331, "grad_norm": 0.24943968287218565, "learning_rate": 5.525075649191792e-06, "loss": 0.3075, "num_tokens": 552890112.0, "step": 1344 }, { "epoch": 1.1804281345565748, "grad_norm": 0.26257584519004706, "learning_rate": 5.519503322580253e-06, "loss": 0.3195, "num_tokens": 553310359.0, "step": 1345 }, { "epoch": 1.180810397553517, "grad_norm": 0.3364761227589351, "learning_rate": 5.513930966062093e-06, "loss": 0.3452, "num_tokens": 553747124.0, "step": 1346 }, { "epoch": 1.1811926605504588, "grad_norm": 0.2510117631456629, "learning_rate": 5.508358588182027e-06, "loss": 0.3545, "num_tokens": 554169144.0, "step": 1347 }, { "epoch": 1.1815749235474007, "grad_norm": 0.2586205298542163, "learning_rate": 5.502786197484806e-06, "loss": 0.3615, "num_tokens": 554603475.0, "step": 1348 }, { "epoch": 1.1819571865443426, "grad_norm": 0.29712645669110715, "learning_rate": 5.4972138025151955e-06, "loss": 0.3344, "num_tokens": 554958426.0, "step": 1349 }, { "epoch": 1.1823394495412844, "grad_norm": 0.2552510588902102, "learning_rate": 5.491641411817974e-06, "loss": 0.3119, "num_tokens": 555333405.0, "step": 1350 }, { "epoch": 1.1827217125382263, "grad_norm": 0.25438854300226804, "learning_rate": 5.486069033937907e-06, "loss": 0.3353, "num_tokens": 555765117.0, "step": 1351 }, { "epoch": 1.1831039755351682, "grad_norm": 0.27644822120937756, "learning_rate": 5.480496677419749e-06, "loss": 0.3195, "num_tokens": 556172255.0, "step": 1352 }, { "epoch": 1.18348623853211, "grad_norm": 0.26108602086015437, "learning_rate": 5.474924350808209e-06, "loss": 0.3597, "num_tokens": 556621721.0, "step": 1353 }, { "epoch": 1.183868501529052, "grad_norm": 0.30431586492079626, "learning_rate": 5.469352062647964e-06, "loss": 0.3351, "num_tokens": 557030718.0, "step": 1354 }, { "epoch": 1.1842507645259939, "grad_norm": 0.251684584540872, "learning_rate": 5.463779821483622e-06, "loss": 0.3429, "num_tokens": 557491165.0, "step": 1355 }, { "epoch": 1.1846330275229358, "grad_norm": 0.3214276581907004, "learning_rate": 5.4582076358597236e-06, "loss": 0.3664, "num_tokens": 557898868.0, "step": 1356 }, { "epoch": 1.1850152905198776, "grad_norm": 0.2799918136166761, "learning_rate": 5.452635514320721e-06, "loss": 0.3319, "num_tokens": 558288556.0, "step": 1357 }, { "epoch": 1.1853975535168195, "grad_norm": 0.27860450698348965, "learning_rate": 5.4470634654109734e-06, "loss": 0.3433, "num_tokens": 558695749.0, "step": 1358 }, { "epoch": 1.1857798165137614, "grad_norm": 0.32964096494581424, "learning_rate": 5.4414914976747256e-06, "loss": 0.3448, "num_tokens": 559082342.0, "step": 1359 }, { "epoch": 1.1861620795107033, "grad_norm": 0.36620755813583367, "learning_rate": 5.435919619656092e-06, "loss": 0.3422, "num_tokens": 559488552.0, "step": 1360 }, { "epoch": 1.1865443425076452, "grad_norm": 0.3213781316508178, "learning_rate": 5.4303478398990636e-06, "loss": 0.3354, "num_tokens": 559884114.0, "step": 1361 }, { "epoch": 1.186926605504587, "grad_norm": 0.28058421787237275, "learning_rate": 5.424776166947466e-06, "loss": 0.3326, "num_tokens": 560266302.0, "step": 1362 }, { "epoch": 1.1873088685015292, "grad_norm": 0.3026842162831174, "learning_rate": 5.419204609344971e-06, "loss": 0.3428, "num_tokens": 560631357.0, "step": 1363 }, { "epoch": 1.1876911314984708, "grad_norm": 0.2747070776274948, "learning_rate": 5.413633175635069e-06, "loss": 0.3421, "num_tokens": 561053439.0, "step": 1364 }, { "epoch": 1.188073394495413, "grad_norm": 0.26901634762917187, "learning_rate": 5.408061874361059e-06, "loss": 0.3265, "num_tokens": 561469160.0, "step": 1365 }, { "epoch": 1.1884556574923548, "grad_norm": 0.2750589565784775, "learning_rate": 5.402490714066042e-06, "loss": 0.3338, "num_tokens": 561876780.0, "step": 1366 }, { "epoch": 1.1888379204892967, "grad_norm": 0.3069863313161933, "learning_rate": 5.396919703292898e-06, "loss": 0.3473, "num_tokens": 562282744.0, "step": 1367 }, { "epoch": 1.1892201834862386, "grad_norm": 0.2731402493495636, "learning_rate": 5.391348850584283e-06, "loss": 0.3293, "num_tokens": 562698144.0, "step": 1368 }, { "epoch": 1.1896024464831805, "grad_norm": 0.28710289800065253, "learning_rate": 5.385778164482601e-06, "loss": 0.3501, "num_tokens": 563101816.0, "step": 1369 }, { "epoch": 1.1899847094801224, "grad_norm": 0.3212726567400315, "learning_rate": 5.380207653530014e-06, "loss": 0.3603, "num_tokens": 563494768.0, "step": 1370 }, { "epoch": 1.1903669724770642, "grad_norm": 0.3271944057404377, "learning_rate": 5.374637326268405e-06, "loss": 0.3191, "num_tokens": 563852980.0, "step": 1371 }, { "epoch": 1.1907492354740061, "grad_norm": 0.27908977762854204, "learning_rate": 5.36906719123938e-06, "loss": 0.3353, "num_tokens": 564274794.0, "step": 1372 }, { "epoch": 1.191131498470948, "grad_norm": 0.2725219276464799, "learning_rate": 5.363497256984246e-06, "loss": 0.3629, "num_tokens": 564726003.0, "step": 1373 }, { "epoch": 1.19151376146789, "grad_norm": 0.2680686365133286, "learning_rate": 5.357927532044008e-06, "loss": 0.3178, "num_tokens": 565121488.0, "step": 1374 }, { "epoch": 1.1918960244648318, "grad_norm": 0.2515536947449138, "learning_rate": 5.352358024959347e-06, "loss": 0.3394, "num_tokens": 565540090.0, "step": 1375 }, { "epoch": 1.1922782874617737, "grad_norm": 0.25330141166030484, "learning_rate": 5.346788744270606e-06, "loss": 0.3467, "num_tokens": 565937170.0, "step": 1376 }, { "epoch": 1.1926605504587156, "grad_norm": 0.28424405686817317, "learning_rate": 5.34121969851779e-06, "loss": 0.3616, "num_tokens": 566332486.0, "step": 1377 }, { "epoch": 1.1930428134556574, "grad_norm": 0.28078474279921, "learning_rate": 5.3356508962405355e-06, "loss": 0.3429, "num_tokens": 566766459.0, "step": 1378 }, { "epoch": 1.1934250764525993, "grad_norm": 0.26702394113646466, "learning_rate": 5.33008234597811e-06, "loss": 0.3441, "num_tokens": 567182057.0, "step": 1379 }, { "epoch": 1.1938073394495412, "grad_norm": 0.2771604577913791, "learning_rate": 5.3245140562693935e-06, "loss": 0.3412, "num_tokens": 567592867.0, "step": 1380 }, { "epoch": 1.194189602446483, "grad_norm": 0.2300144032980534, "learning_rate": 5.318946035652865e-06, "loss": 0.3382, "num_tokens": 568020880.0, "step": 1381 }, { "epoch": 1.1945718654434252, "grad_norm": 0.24727581085690706, "learning_rate": 5.313378292666593e-06, "loss": 0.3313, "num_tokens": 568430277.0, "step": 1382 }, { "epoch": 1.1949541284403669, "grad_norm": 0.2523918862437742, "learning_rate": 5.3078108358482195e-06, "loss": 0.339, "num_tokens": 568789434.0, "step": 1383 }, { "epoch": 1.195336391437309, "grad_norm": 0.26821998228851673, "learning_rate": 5.302243673734946e-06, "loss": 0.3398, "num_tokens": 569157627.0, "step": 1384 }, { "epoch": 1.1957186544342508, "grad_norm": 0.27095145054063874, "learning_rate": 5.296676814863526e-06, "loss": 0.3616, "num_tokens": 569603856.0, "step": 1385 }, { "epoch": 1.1961009174311927, "grad_norm": 0.26178490868073856, "learning_rate": 5.291110267770246e-06, "loss": 0.3487, "num_tokens": 570004028.0, "step": 1386 }, { "epoch": 1.1964831804281346, "grad_norm": 0.3154005476351008, "learning_rate": 5.285544040990911e-06, "loss": 0.3508, "num_tokens": 570435235.0, "step": 1387 }, { "epoch": 1.1968654434250765, "grad_norm": 0.26614780841414865, "learning_rate": 5.279978143060841e-06, "loss": 0.3302, "num_tokens": 570830181.0, "step": 1388 }, { "epoch": 1.1972477064220184, "grad_norm": 0.2628416320831463, "learning_rate": 5.274412582514845e-06, "loss": 0.3606, "num_tokens": 571233656.0, "step": 1389 }, { "epoch": 1.1976299694189603, "grad_norm": 0.27715431223283626, "learning_rate": 5.268847367887222e-06, "loss": 0.3382, "num_tokens": 571655604.0, "step": 1390 }, { "epoch": 1.1980122324159022, "grad_norm": 0.2765569380362338, "learning_rate": 5.263282507711734e-06, "loss": 0.3606, "num_tokens": 572086268.0, "step": 1391 }, { "epoch": 1.198394495412844, "grad_norm": 0.26061891209710825, "learning_rate": 5.2577180105216075e-06, "loss": 0.3284, "num_tokens": 572454480.0, "step": 1392 }, { "epoch": 1.198776758409786, "grad_norm": 0.2893721214247075, "learning_rate": 5.2521538848495015e-06, "loss": 0.3378, "num_tokens": 572842094.0, "step": 1393 }, { "epoch": 1.1991590214067278, "grad_norm": 0.2665465987385463, "learning_rate": 5.246590139227513e-06, "loss": 0.3586, "num_tokens": 573259429.0, "step": 1394 }, { "epoch": 1.1995412844036697, "grad_norm": 0.25949626519122665, "learning_rate": 5.2410267821871556e-06, "loss": 0.3409, "num_tokens": 573674345.0, "step": 1395 }, { "epoch": 1.1999235474006116, "grad_norm": 0.25292850092251457, "learning_rate": 5.235463822259343e-06, "loss": 0.3485, "num_tokens": 574084417.0, "step": 1396 }, { "epoch": 1.2003058103975535, "grad_norm": 0.2616914259118696, "learning_rate": 5.229901267974386e-06, "loss": 0.338, "num_tokens": 574570186.0, "step": 1397 }, { "epoch": 1.2006880733944953, "grad_norm": 0.27877354549603256, "learning_rate": 5.224339127861967e-06, "loss": 0.3736, "num_tokens": 574983429.0, "step": 1398 }, { "epoch": 1.2010703363914372, "grad_norm": 0.2621265565285058, "learning_rate": 5.21877741045114e-06, "loss": 0.3476, "num_tokens": 575395756.0, "step": 1399 }, { "epoch": 1.2014525993883791, "grad_norm": 0.2742773048549955, "learning_rate": 5.213216124270302e-06, "loss": 0.3371, "num_tokens": 575783755.0, "step": 1400 }, { "epoch": 1.2018348623853212, "grad_norm": 0.27906385898119274, "learning_rate": 5.2076552778472e-06, "loss": 0.3329, "num_tokens": 576200507.0, "step": 1401 }, { "epoch": 1.2022171253822629, "grad_norm": 0.25687283866749777, "learning_rate": 5.2020948797088966e-06, "loss": 0.3179, "num_tokens": 576576622.0, "step": 1402 }, { "epoch": 1.202599388379205, "grad_norm": 0.2460919647336761, "learning_rate": 5.196534938381772e-06, "loss": 0.3479, "num_tokens": 577015647.0, "step": 1403 }, { "epoch": 1.2029816513761469, "grad_norm": 0.2700509491553811, "learning_rate": 5.190975462391505e-06, "loss": 0.3346, "num_tokens": 577391454.0, "step": 1404 }, { "epoch": 1.2033639143730888, "grad_norm": 0.2815930587995079, "learning_rate": 5.185416460263061e-06, "loss": 0.3729, "num_tokens": 577793958.0, "step": 1405 }, { "epoch": 1.2037461773700306, "grad_norm": 0.24162844223122162, "learning_rate": 5.179857940520678e-06, "loss": 0.322, "num_tokens": 578207442.0, "step": 1406 }, { "epoch": 1.2041284403669725, "grad_norm": 0.2765822632686362, "learning_rate": 5.174299911687854e-06, "loss": 0.3419, "num_tokens": 578613416.0, "step": 1407 }, { "epoch": 1.2045107033639144, "grad_norm": 0.24160945120242927, "learning_rate": 5.16874238228734e-06, "loss": 0.3165, "num_tokens": 579041794.0, "step": 1408 }, { "epoch": 1.2048929663608563, "grad_norm": 0.2554094526264656, "learning_rate": 5.16318536084111e-06, "loss": 0.3416, "num_tokens": 579464950.0, "step": 1409 }, { "epoch": 1.2052752293577982, "grad_norm": 0.26599550197518695, "learning_rate": 5.157628855870369e-06, "loss": 0.3571, "num_tokens": 579867063.0, "step": 1410 }, { "epoch": 1.20565749235474, "grad_norm": 0.2516927193627474, "learning_rate": 5.152072875895524e-06, "loss": 0.3473, "num_tokens": 580302317.0, "step": 1411 }, { "epoch": 1.206039755351682, "grad_norm": 0.24855271629994571, "learning_rate": 5.1465174294361815e-06, "loss": 0.3301, "num_tokens": 580690091.0, "step": 1412 }, { "epoch": 1.2064220183486238, "grad_norm": 0.24558978907512305, "learning_rate": 5.1409625250111265e-06, "loss": 0.3331, "num_tokens": 581115425.0, "step": 1413 }, { "epoch": 1.2068042813455657, "grad_norm": 0.2578010603148845, "learning_rate": 5.1354081711383155e-06, "loss": 0.3329, "num_tokens": 581516764.0, "step": 1414 }, { "epoch": 1.2071865443425076, "grad_norm": 0.23975504250036228, "learning_rate": 5.129854376334859e-06, "loss": 0.3191, "num_tokens": 581940060.0, "step": 1415 }, { "epoch": 1.2075688073394495, "grad_norm": 0.2484998360852266, "learning_rate": 5.124301149117008e-06, "loss": 0.3387, "num_tokens": 582374651.0, "step": 1416 }, { "epoch": 1.2079510703363914, "grad_norm": 0.26548934993454854, "learning_rate": 5.11874849800015e-06, "loss": 0.3481, "num_tokens": 582763438.0, "step": 1417 }, { "epoch": 1.2083333333333333, "grad_norm": 0.252163932582216, "learning_rate": 5.113196431498783e-06, "loss": 0.3535, "num_tokens": 583241991.0, "step": 1418 }, { "epoch": 1.2087155963302751, "grad_norm": 0.2783722614542156, "learning_rate": 5.1076449581265084e-06, "loss": 0.3518, "num_tokens": 583647446.0, "step": 1419 }, { "epoch": 1.209097859327217, "grad_norm": 0.2602301446119456, "learning_rate": 5.102094086396021e-06, "loss": 0.3488, "num_tokens": 584081127.0, "step": 1420 }, { "epoch": 1.209480122324159, "grad_norm": 0.27169725102755915, "learning_rate": 5.096543824819096e-06, "loss": 0.3628, "num_tokens": 584509039.0, "step": 1421 }, { "epoch": 1.209862385321101, "grad_norm": 0.2620672668182354, "learning_rate": 5.0909941819065624e-06, "loss": 0.3307, "num_tokens": 584907216.0, "step": 1422 }, { "epoch": 1.210244648318043, "grad_norm": 0.23969063514026234, "learning_rate": 5.085445166168313e-06, "loss": 0.3456, "num_tokens": 585330821.0, "step": 1423 }, { "epoch": 1.2106269113149848, "grad_norm": 0.2700596226005749, "learning_rate": 5.079896786113271e-06, "loss": 0.3551, "num_tokens": 585730754.0, "step": 1424 }, { "epoch": 1.2110091743119267, "grad_norm": 0.2479005234748073, "learning_rate": 5.0743490502493865e-06, "loss": 0.3592, "num_tokens": 586172686.0, "step": 1425 }, { "epoch": 1.2113914373088686, "grad_norm": 0.24222616670337951, "learning_rate": 5.068801967083624e-06, "loss": 0.3311, "num_tokens": 586602209.0, "step": 1426 }, { "epoch": 1.2117737003058104, "grad_norm": 0.299527088448113, "learning_rate": 5.063255545121941e-06, "loss": 0.3388, "num_tokens": 586997897.0, "step": 1427 }, { "epoch": 1.2121559633027523, "grad_norm": 0.24137978692899348, "learning_rate": 5.057709792869291e-06, "loss": 0.3382, "num_tokens": 587430388.0, "step": 1428 }, { "epoch": 1.2125382262996942, "grad_norm": 0.2389468213471267, "learning_rate": 5.052164718829591e-06, "loss": 0.3574, "num_tokens": 587866141.0, "step": 1429 }, { "epoch": 1.212920489296636, "grad_norm": 0.23843445927237836, "learning_rate": 5.046620331505725e-06, "loss": 0.3468, "num_tokens": 588259863.0, "step": 1430 }, { "epoch": 1.213302752293578, "grad_norm": 0.2592158152435474, "learning_rate": 5.0410766393995196e-06, "loss": 0.3598, "num_tokens": 588692785.0, "step": 1431 }, { "epoch": 1.2136850152905199, "grad_norm": 0.3236524856457128, "learning_rate": 5.035533651011737e-06, "loss": 0.3681, "num_tokens": 589140008.0, "step": 1432 }, { "epoch": 1.2140672782874617, "grad_norm": 0.3231716045774789, "learning_rate": 5.029991374842058e-06, "loss": 0.3776, "num_tokens": 589577226.0, "step": 1433 }, { "epoch": 1.2144495412844036, "grad_norm": 0.27330984416902876, "learning_rate": 5.024449819389079e-06, "loss": 0.3283, "num_tokens": 589977520.0, "step": 1434 }, { "epoch": 1.2148318042813455, "grad_norm": 0.27711815419739766, "learning_rate": 5.0189089931502774e-06, "loss": 0.3314, "num_tokens": 590333940.0, "step": 1435 }, { "epoch": 1.2152140672782874, "grad_norm": 0.25082337758936657, "learning_rate": 5.0133689046220305e-06, "loss": 0.35, "num_tokens": 590769392.0, "step": 1436 }, { "epoch": 1.2155963302752293, "grad_norm": 0.24514922403076825, "learning_rate": 5.007829562299567e-06, "loss": 0.3633, "num_tokens": 591218518.0, "step": 1437 }, { "epoch": 1.2159785932721712, "grad_norm": 0.23652350998859914, "learning_rate": 5.00229097467698e-06, "loss": 0.3578, "num_tokens": 591674926.0, "step": 1438 }, { "epoch": 1.216360856269113, "grad_norm": 0.2875587626857516, "learning_rate": 4.996753150247206e-06, "loss": 0.3416, "num_tokens": 592069939.0, "step": 1439 }, { "epoch": 1.216743119266055, "grad_norm": 0.26569620097643265, "learning_rate": 4.991216097502009e-06, "loss": 0.3596, "num_tokens": 592451931.0, "step": 1440 }, { "epoch": 1.217125382262997, "grad_norm": 0.23722529594349642, "learning_rate": 4.985679824931973e-06, "loss": 0.3123, "num_tokens": 592877382.0, "step": 1441 }, { "epoch": 1.217507645259939, "grad_norm": 0.2718248599459754, "learning_rate": 4.980144341026475e-06, "loss": 0.3361, "num_tokens": 593280628.0, "step": 1442 }, { "epoch": 1.2178899082568808, "grad_norm": 0.259969371177258, "learning_rate": 4.974609654273699e-06, "loss": 0.3166, "num_tokens": 593683143.0, "step": 1443 }, { "epoch": 1.2182721712538227, "grad_norm": 0.2858017941992173, "learning_rate": 4.969075773160591e-06, "loss": 0.3604, "num_tokens": 594140574.0, "step": 1444 }, { "epoch": 1.2186544342507646, "grad_norm": 0.25731140585770546, "learning_rate": 4.963542706172875e-06, "loss": 0.317, "num_tokens": 594531170.0, "step": 1445 }, { "epoch": 1.2190366972477065, "grad_norm": 0.26673586482778844, "learning_rate": 4.958010461795015e-06, "loss": 0.3275, "num_tokens": 594929877.0, "step": 1446 }, { "epoch": 1.2194189602446484, "grad_norm": 0.2724948758311067, "learning_rate": 4.9524790485102245e-06, "loss": 0.3538, "num_tokens": 595330308.0, "step": 1447 }, { "epoch": 1.2198012232415902, "grad_norm": 0.2638491077385973, "learning_rate": 4.946948474800433e-06, "loss": 0.3543, "num_tokens": 595730073.0, "step": 1448 }, { "epoch": 1.2201834862385321, "grad_norm": 0.28052594497754146, "learning_rate": 4.941418749146285e-06, "loss": 0.3575, "num_tokens": 596136671.0, "step": 1449 }, { "epoch": 1.220565749235474, "grad_norm": 0.2730903926179031, "learning_rate": 4.935889880027131e-06, "loss": 0.341, "num_tokens": 596520791.0, "step": 1450 }, { "epoch": 1.220948012232416, "grad_norm": 0.27587189339245105, "learning_rate": 4.9303618759209985e-06, "loss": 0.3364, "num_tokens": 596925051.0, "step": 1451 }, { "epoch": 1.2213302752293578, "grad_norm": 0.2331021046558726, "learning_rate": 4.924834745304597e-06, "loss": 0.3408, "num_tokens": 597354974.0, "step": 1452 }, { "epoch": 1.2217125382262997, "grad_norm": 0.26967929153955755, "learning_rate": 4.919308496653291e-06, "loss": 0.3365, "num_tokens": 597745041.0, "step": 1453 }, { "epoch": 1.2220948012232415, "grad_norm": 0.2441304698585968, "learning_rate": 4.913783138441096e-06, "loss": 0.34, "num_tokens": 598166032.0, "step": 1454 }, { "epoch": 1.2224770642201834, "grad_norm": 0.30568369318908595, "learning_rate": 4.90825867914066e-06, "loss": 0.3662, "num_tokens": 598615529.0, "step": 1455 }, { "epoch": 1.2228593272171253, "grad_norm": 0.27108813426412587, "learning_rate": 4.902735127223251e-06, "loss": 0.3357, "num_tokens": 598962015.0, "step": 1456 }, { "epoch": 1.2232415902140672, "grad_norm": 0.26376651550064845, "learning_rate": 4.897212491158753e-06, "loss": 0.3418, "num_tokens": 599353019.0, "step": 1457 }, { "epoch": 1.223623853211009, "grad_norm": 0.267246845896282, "learning_rate": 4.891690779415635e-06, "loss": 0.3292, "num_tokens": 599738876.0, "step": 1458 }, { "epoch": 1.224006116207951, "grad_norm": 0.2762491852336058, "learning_rate": 4.886170000460956e-06, "loss": 0.3429, "num_tokens": 600147051.0, "step": 1459 }, { "epoch": 1.224388379204893, "grad_norm": 0.2701923633567987, "learning_rate": 4.880650162760342e-06, "loss": 0.3381, "num_tokens": 600546768.0, "step": 1460 }, { "epoch": 1.224770642201835, "grad_norm": 0.27426144067796565, "learning_rate": 4.8751312747779784e-06, "loss": 0.3646, "num_tokens": 600973826.0, "step": 1461 }, { "epoch": 1.2251529051987768, "grad_norm": 0.2361830189895042, "learning_rate": 4.869613344976593e-06, "loss": 0.3586, "num_tokens": 601413023.0, "step": 1462 }, { "epoch": 1.2255351681957187, "grad_norm": 0.32601450858801245, "learning_rate": 4.86409638181744e-06, "loss": 0.3493, "num_tokens": 601815906.0, "step": 1463 }, { "epoch": 1.2259174311926606, "grad_norm": 0.27346705309985486, "learning_rate": 4.858580393760295e-06, "loss": 0.3237, "num_tokens": 602217583.0, "step": 1464 }, { "epoch": 1.2262996941896025, "grad_norm": 0.32413952008488867, "learning_rate": 4.853065389263442e-06, "loss": 0.3583, "num_tokens": 602650577.0, "step": 1465 }, { "epoch": 1.2266819571865444, "grad_norm": 0.32723787793552767, "learning_rate": 4.84755137678365e-06, "loss": 0.3475, "num_tokens": 603086817.0, "step": 1466 }, { "epoch": 1.2270642201834863, "grad_norm": 0.268616572995574, "learning_rate": 4.842038364776171e-06, "loss": 0.3561, "num_tokens": 603512296.0, "step": 1467 }, { "epoch": 1.2274464831804281, "grad_norm": 0.2709343214222051, "learning_rate": 4.836526361694724e-06, "loss": 0.3643, "num_tokens": 603931069.0, "step": 1468 }, { "epoch": 1.22782874617737, "grad_norm": 0.3260819738706885, "learning_rate": 4.8310153759914745e-06, "loss": 0.3697, "num_tokens": 604328786.0, "step": 1469 }, { "epoch": 1.228211009174312, "grad_norm": 0.27786297560386686, "learning_rate": 4.825505416117034e-06, "loss": 0.3543, "num_tokens": 604734964.0, "step": 1470 }, { "epoch": 1.2285932721712538, "grad_norm": 0.2604192561524548, "learning_rate": 4.819996490520438e-06, "loss": 0.3271, "num_tokens": 605178313.0, "step": 1471 }, { "epoch": 1.2289755351681957, "grad_norm": 0.26109735350940677, "learning_rate": 4.814488607649141e-06, "loss": 0.3524, "num_tokens": 605592377.0, "step": 1472 }, { "epoch": 1.2293577981651376, "grad_norm": 0.2797810462333863, "learning_rate": 4.808981775948989e-06, "loss": 0.3476, "num_tokens": 605998341.0, "step": 1473 }, { "epoch": 1.2297400611620795, "grad_norm": 0.24640832170391241, "learning_rate": 4.803476003864227e-06, "loss": 0.3535, "num_tokens": 606401178.0, "step": 1474 }, { "epoch": 1.2301223241590213, "grad_norm": 0.2942989088750153, "learning_rate": 4.797971299837466e-06, "loss": 0.336, "num_tokens": 606804645.0, "step": 1475 }, { "epoch": 1.2305045871559632, "grad_norm": 0.28471978921225577, "learning_rate": 4.792467672309686e-06, "loss": 0.3611, "num_tokens": 607267175.0, "step": 1476 }, { "epoch": 1.230886850152905, "grad_norm": 0.2577764435469393, "learning_rate": 4.7869651297202144e-06, "loss": 0.3389, "num_tokens": 607695846.0, "step": 1477 }, { "epoch": 1.231269113149847, "grad_norm": 0.2552033630619719, "learning_rate": 4.78146368050671e-06, "loss": 0.3786, "num_tokens": 608156545.0, "step": 1478 }, { "epoch": 1.231651376146789, "grad_norm": 0.2678851518078223, "learning_rate": 4.775963333105161e-06, "loss": 0.3833, "num_tokens": 608601289.0, "step": 1479 }, { "epoch": 1.2320336391437308, "grad_norm": 0.25369458979380793, "learning_rate": 4.770464095949865e-06, "loss": 0.3541, "num_tokens": 609027276.0, "step": 1480 }, { "epoch": 1.2324159021406729, "grad_norm": 0.2659788503745061, "learning_rate": 4.764965977473416e-06, "loss": 0.3517, "num_tokens": 609471043.0, "step": 1481 }, { "epoch": 1.2327981651376148, "grad_norm": 0.2657842908883709, "learning_rate": 4.7594689861066904e-06, "loss": 0.3515, "num_tokens": 609909672.0, "step": 1482 }, { "epoch": 1.2331804281345566, "grad_norm": 0.25381790180089137, "learning_rate": 4.7539731302788435e-06, "loss": 0.3375, "num_tokens": 610300328.0, "step": 1483 }, { "epoch": 1.2335626911314985, "grad_norm": 0.28007643986816405, "learning_rate": 4.7484784184172796e-06, "loss": 0.3672, "num_tokens": 610697792.0, "step": 1484 }, { "epoch": 1.2339449541284404, "grad_norm": 0.2665383822644655, "learning_rate": 4.742984858947658e-06, "loss": 0.3383, "num_tokens": 611105840.0, "step": 1485 }, { "epoch": 1.2343272171253823, "grad_norm": 0.26259115872856414, "learning_rate": 4.737492460293865e-06, "loss": 0.3452, "num_tokens": 611518454.0, "step": 1486 }, { "epoch": 1.2347094801223242, "grad_norm": 0.2894301650285961, "learning_rate": 4.7320012308780074e-06, "loss": 0.3782, "num_tokens": 611939223.0, "step": 1487 }, { "epoch": 1.235091743119266, "grad_norm": 0.27807499399347135, "learning_rate": 4.726511179120402e-06, "loss": 0.3441, "num_tokens": 612312398.0, "step": 1488 }, { "epoch": 1.235474006116208, "grad_norm": 0.2613156134344048, "learning_rate": 4.721022313439556e-06, "loss": 0.3534, "num_tokens": 612721537.0, "step": 1489 }, { "epoch": 1.2358562691131498, "grad_norm": 0.2903192050933763, "learning_rate": 4.715534642252163e-06, "loss": 0.3337, "num_tokens": 613089683.0, "step": 1490 }, { "epoch": 1.2362385321100917, "grad_norm": 0.283007698889918, "learning_rate": 4.71004817397308e-06, "loss": 0.3579, "num_tokens": 613554657.0, "step": 1491 }, { "epoch": 1.2366207951070336, "grad_norm": 0.2581786248807065, "learning_rate": 4.704562917015321e-06, "loss": 0.3516, "num_tokens": 613997508.0, "step": 1492 }, { "epoch": 1.2370030581039755, "grad_norm": 0.2694670126769742, "learning_rate": 4.6990788797900435e-06, "loss": 0.3618, "num_tokens": 614386449.0, "step": 1493 }, { "epoch": 1.2373853211009174, "grad_norm": 0.2274473701492788, "learning_rate": 4.693596070706535e-06, "loss": 0.3404, "num_tokens": 614825633.0, "step": 1494 }, { "epoch": 1.2377675840978593, "grad_norm": 0.27876404382055525, "learning_rate": 4.688114498172196e-06, "loss": 0.3535, "num_tokens": 615233900.0, "step": 1495 }, { "epoch": 1.2381498470948011, "grad_norm": 0.3242892792938995, "learning_rate": 4.682634170592537e-06, "loss": 0.3682, "num_tokens": 615658861.0, "step": 1496 }, { "epoch": 1.238532110091743, "grad_norm": 0.2787711577260471, "learning_rate": 4.677155096371153e-06, "loss": 0.3611, "num_tokens": 616063526.0, "step": 1497 }, { "epoch": 1.2389143730886851, "grad_norm": 0.2554049172006759, "learning_rate": 4.671677283909722e-06, "loss": 0.3346, "num_tokens": 616443518.0, "step": 1498 }, { "epoch": 1.2392966360856268, "grad_norm": 0.26168405164306663, "learning_rate": 4.666200741607987e-06, "loss": 0.3441, "num_tokens": 616842118.0, "step": 1499 }, { "epoch": 1.239678899082569, "grad_norm": 0.267189197129404, "learning_rate": 4.660725477863738e-06, "loss": 0.3509, "num_tokens": 617256623.0, "step": 1500 }, { "epoch": 1.2400611620795108, "grad_norm": 0.27880337082632484, "learning_rate": 4.65525150107281e-06, "loss": 0.3601, "num_tokens": 617675632.0, "step": 1501 }, { "epoch": 1.2404434250764527, "grad_norm": 0.2773574935020381, "learning_rate": 4.649778819629062e-06, "loss": 0.3688, "num_tokens": 618131018.0, "step": 1502 }, { "epoch": 1.2408256880733946, "grad_norm": 0.24667708079763367, "learning_rate": 4.6443074419243695e-06, "loss": 0.3227, "num_tokens": 618534447.0, "step": 1503 }, { "epoch": 1.2412079510703364, "grad_norm": 0.25792012719285073, "learning_rate": 4.638837376348603e-06, "loss": 0.3488, "num_tokens": 618964198.0, "step": 1504 }, { "epoch": 1.2415902140672783, "grad_norm": 0.2443927818637308, "learning_rate": 4.633368631289628e-06, "loss": 0.3413, "num_tokens": 619414257.0, "step": 1505 }, { "epoch": 1.2419724770642202, "grad_norm": 0.24132322852009902, "learning_rate": 4.6279012151332815e-06, "loss": 0.3257, "num_tokens": 619801643.0, "step": 1506 }, { "epoch": 1.242354740061162, "grad_norm": 0.30312409830588954, "learning_rate": 4.622435136263363e-06, "loss": 0.3748, "num_tokens": 620263688.0, "step": 1507 }, { "epoch": 1.242737003058104, "grad_norm": 0.29370964523192583, "learning_rate": 4.61697040306162e-06, "loss": 0.387, "num_tokens": 620692631.0, "step": 1508 }, { "epoch": 1.2431192660550459, "grad_norm": 0.2570004905550023, "learning_rate": 4.6115070239077385e-06, "loss": 0.3353, "num_tokens": 621100009.0, "step": 1509 }, { "epoch": 1.2435015290519877, "grad_norm": 0.28160061630265976, "learning_rate": 4.6060450071793295e-06, "loss": 0.3435, "num_tokens": 621488160.0, "step": 1510 }, { "epoch": 1.2438837920489296, "grad_norm": 0.2605355082911742, "learning_rate": 4.600584361251909e-06, "loss": 0.3568, "num_tokens": 621904777.0, "step": 1511 }, { "epoch": 1.2442660550458715, "grad_norm": 0.2704803648218184, "learning_rate": 4.595125094498899e-06, "loss": 0.3662, "num_tokens": 622311490.0, "step": 1512 }, { "epoch": 1.2446483180428134, "grad_norm": 0.2567803921692123, "learning_rate": 4.589667215291601e-06, "loss": 0.3404, "num_tokens": 622737647.0, "step": 1513 }, { "epoch": 1.2450305810397553, "grad_norm": 0.2738559274542928, "learning_rate": 4.5842107319991916e-06, "loss": 0.3521, "num_tokens": 623131448.0, "step": 1514 }, { "epoch": 1.2454128440366972, "grad_norm": 0.23455746570437214, "learning_rate": 4.578755652988705e-06, "loss": 0.3469, "num_tokens": 623552011.0, "step": 1515 }, { "epoch": 1.245795107033639, "grad_norm": 0.33530673271263983, "learning_rate": 4.5733019866250215e-06, "loss": 0.342, "num_tokens": 623962191.0, "step": 1516 }, { "epoch": 1.2461773700305812, "grad_norm": 0.3229263283677262, "learning_rate": 4.567849741270858e-06, "loss": 0.3642, "num_tokens": 624400925.0, "step": 1517 }, { "epoch": 1.2465596330275228, "grad_norm": 0.30700483648842336, "learning_rate": 4.562398925286753e-06, "loss": 0.3736, "num_tokens": 624833682.0, "step": 1518 }, { "epoch": 1.246941896024465, "grad_norm": 0.2616954995028591, "learning_rate": 4.556949547031048e-06, "loss": 0.346, "num_tokens": 625268088.0, "step": 1519 }, { "epoch": 1.2473241590214068, "grad_norm": 0.2698282431970166, "learning_rate": 4.551501614859882e-06, "loss": 0.3256, "num_tokens": 625646330.0, "step": 1520 }, { "epoch": 1.2477064220183487, "grad_norm": 0.28539214412216046, "learning_rate": 4.546055137127182e-06, "loss": 0.3658, "num_tokens": 626076391.0, "step": 1521 }, { "epoch": 1.2480886850152906, "grad_norm": 0.30462971972459074, "learning_rate": 4.540610122184637e-06, "loss": 0.344, "num_tokens": 626436204.0, "step": 1522 }, { "epoch": 1.2484709480122325, "grad_norm": 0.2716450498903016, "learning_rate": 4.535166578381699e-06, "loss": 0.3496, "num_tokens": 626835819.0, "step": 1523 }, { "epoch": 1.2488532110091743, "grad_norm": 0.2913944763533766, "learning_rate": 4.529724514065558e-06, "loss": 0.3496, "num_tokens": 627245387.0, "step": 1524 }, { "epoch": 1.2492354740061162, "grad_norm": 0.2671265516175696, "learning_rate": 4.5242839375811405e-06, "loss": 0.3613, "num_tokens": 627688997.0, "step": 1525 }, { "epoch": 1.2496177370030581, "grad_norm": 0.2414177584090394, "learning_rate": 4.51884485727109e-06, "loss": 0.3324, "num_tokens": 628127716.0, "step": 1526 }, { "epoch": 1.25, "grad_norm": 0.2810210338065537, "learning_rate": 4.513407281475757e-06, "loss": 0.354, "num_tokens": 628549294.0, "step": 1527 }, { "epoch": 1.2503822629969419, "grad_norm": 0.2644659255243031, "learning_rate": 4.50797121853318e-06, "loss": 0.3482, "num_tokens": 628950899.0, "step": 1528 }, { "epoch": 1.2507645259938838, "grad_norm": 0.2393720310392607, "learning_rate": 4.502536676779083e-06, "loss": 0.3442, "num_tokens": 629338364.0, "step": 1529 }, { "epoch": 1.2511467889908257, "grad_norm": 0.2528096983787305, "learning_rate": 4.497103664546858e-06, "loss": 0.3594, "num_tokens": 629757170.0, "step": 1530 }, { "epoch": 1.2515290519877675, "grad_norm": 0.26294797201341047, "learning_rate": 4.4916721901675455e-06, "loss": 0.3445, "num_tokens": 630128300.0, "step": 1531 }, { "epoch": 1.2519113149847094, "grad_norm": 0.2276393231641771, "learning_rate": 4.4862422619698335e-06, "loss": 0.3452, "num_tokens": 630609330.0, "step": 1532 }, { "epoch": 1.2522935779816513, "grad_norm": 0.23850573359463312, "learning_rate": 4.480813888280034e-06, "loss": 0.3458, "num_tokens": 631001433.0, "step": 1533 }, { "epoch": 1.2526758409785932, "grad_norm": 0.23984901912773515, "learning_rate": 4.475387077422083e-06, "loss": 0.3339, "num_tokens": 631402755.0, "step": 1534 }, { "epoch": 1.253058103975535, "grad_norm": 0.22831802415425598, "learning_rate": 4.469961837717512e-06, "loss": 0.3439, "num_tokens": 631825743.0, "step": 1535 }, { "epoch": 1.2534403669724772, "grad_norm": 0.24260055909481143, "learning_rate": 4.46453817748545e-06, "loss": 0.3176, "num_tokens": 632230041.0, "step": 1536 }, { "epoch": 1.2538226299694188, "grad_norm": 0.2751155180509015, "learning_rate": 4.459116105042598e-06, "loss": 0.3733, "num_tokens": 632626284.0, "step": 1537 }, { "epoch": 1.254204892966361, "grad_norm": 0.2553636590595797, "learning_rate": 4.453695628703226e-06, "loss": 0.372, "num_tokens": 633059142.0, "step": 1538 }, { "epoch": 1.2545871559633026, "grad_norm": 0.23782395821537206, "learning_rate": 4.448276756779156e-06, "loss": 0.3764, "num_tokens": 633509955.0, "step": 1539 }, { "epoch": 1.2549694189602447, "grad_norm": 0.24711199820104984, "learning_rate": 4.442859497579746e-06, "loss": 0.3191, "num_tokens": 633919276.0, "step": 1540 }, { "epoch": 1.2553516819571866, "grad_norm": 0.2445412621012625, "learning_rate": 4.4374438594118884e-06, "loss": 0.348, "num_tokens": 634326945.0, "step": 1541 }, { "epoch": 1.2557339449541285, "grad_norm": 0.2372231240419333, "learning_rate": 4.432029850579983e-06, "loss": 0.341, "num_tokens": 634742618.0, "step": 1542 }, { "epoch": 1.2561162079510704, "grad_norm": 0.2499743494221262, "learning_rate": 4.4266174793859375e-06, "loss": 0.3539, "num_tokens": 635188965.0, "step": 1543 }, { "epoch": 1.2564984709480123, "grad_norm": 0.304348645689502, "learning_rate": 4.421206754129142e-06, "loss": 0.3677, "num_tokens": 635597341.0, "step": 1544 }, { "epoch": 1.2568807339449541, "grad_norm": 0.2270217774870297, "learning_rate": 4.4157976831064664e-06, "loss": 0.3305, "num_tokens": 636044285.0, "step": 1545 }, { "epoch": 1.257262996941896, "grad_norm": 0.2727457966571208, "learning_rate": 4.410390274612241e-06, "loss": 0.366, "num_tokens": 636417305.0, "step": 1546 }, { "epoch": 1.257645259938838, "grad_norm": 0.26319062485226313, "learning_rate": 4.4049845369382525e-06, "loss": 0.349, "num_tokens": 636826472.0, "step": 1547 }, { "epoch": 1.2580275229357798, "grad_norm": 0.28737450977268864, "learning_rate": 4.3995804783737185e-06, "loss": 0.3298, "num_tokens": 637184366.0, "step": 1548 }, { "epoch": 1.2584097859327217, "grad_norm": 0.276216735971841, "learning_rate": 4.394178107205289e-06, "loss": 0.351, "num_tokens": 637604816.0, "step": 1549 }, { "epoch": 1.2587920489296636, "grad_norm": 0.2843474613126444, "learning_rate": 4.388777431717022e-06, "loss": 0.3602, "num_tokens": 637995589.0, "step": 1550 }, { "epoch": 1.2591743119266054, "grad_norm": 0.24955350478748686, "learning_rate": 4.383378460190373e-06, "loss": 0.3488, "num_tokens": 638367778.0, "step": 1551 }, { "epoch": 1.2595565749235473, "grad_norm": 0.2593543321261272, "learning_rate": 4.377981200904191e-06, "loss": 0.3523, "num_tokens": 638806793.0, "step": 1552 }, { "epoch": 1.2599388379204892, "grad_norm": 0.24658179112136044, "learning_rate": 4.372585662134695e-06, "loss": 0.3339, "num_tokens": 639214449.0, "step": 1553 }, { "epoch": 1.260321100917431, "grad_norm": 0.26999169634474374, "learning_rate": 4.367191852155467e-06, "loss": 0.3442, "num_tokens": 639601441.0, "step": 1554 }, { "epoch": 1.2607033639143732, "grad_norm": 0.2965005020667414, "learning_rate": 4.3617997792374365e-06, "loss": 0.3572, "num_tokens": 640049212.0, "step": 1555 }, { "epoch": 1.2610856269113149, "grad_norm": 0.24517880168622486, "learning_rate": 4.3564094516488755e-06, "loss": 0.3788, "num_tokens": 640519317.0, "step": 1556 }, { "epoch": 1.261467889908257, "grad_norm": 0.2649115105411148, "learning_rate": 4.351020877655369e-06, "loss": 0.3306, "num_tokens": 640901086.0, "step": 1557 }, { "epoch": 1.2618501529051986, "grad_norm": 0.27893059877771775, "learning_rate": 4.345634065519824e-06, "loss": 0.3457, "num_tokens": 641280383.0, "step": 1558 }, { "epoch": 1.2622324159021407, "grad_norm": 0.2529691612743419, "learning_rate": 4.340249023502439e-06, "loss": 0.3268, "num_tokens": 641667769.0, "step": 1559 }, { "epoch": 1.2626146788990826, "grad_norm": 0.276398285482857, "learning_rate": 4.3348657598607004e-06, "loss": 0.3555, "num_tokens": 642042009.0, "step": 1560 }, { "epoch": 1.2629969418960245, "grad_norm": 0.2602587942489892, "learning_rate": 4.329484282849367e-06, "loss": 0.3631, "num_tokens": 642462705.0, "step": 1561 }, { "epoch": 1.2633792048929664, "grad_norm": 0.22366945952893483, "learning_rate": 4.324104600720457e-06, "loss": 0.3336, "num_tokens": 642868439.0, "step": 1562 }, { "epoch": 1.2637614678899083, "grad_norm": 0.23967243185765635, "learning_rate": 4.31872672172324e-06, "loss": 0.361, "num_tokens": 643291691.0, "step": 1563 }, { "epoch": 1.2641437308868502, "grad_norm": 0.26184581121011513, "learning_rate": 4.313350654104215e-06, "loss": 0.3605, "num_tokens": 643716008.0, "step": 1564 }, { "epoch": 1.264525993883792, "grad_norm": 0.24456196635927346, "learning_rate": 4.307976406107112e-06, "loss": 0.3494, "num_tokens": 644152992.0, "step": 1565 }, { "epoch": 1.264908256880734, "grad_norm": 0.2410128098271759, "learning_rate": 4.302603985972861e-06, "loss": 0.3516, "num_tokens": 644558328.0, "step": 1566 }, { "epoch": 1.2652905198776758, "grad_norm": 0.24540840942105183, "learning_rate": 4.297233401939595e-06, "loss": 0.359, "num_tokens": 644989889.0, "step": 1567 }, { "epoch": 1.2656727828746177, "grad_norm": 0.2523268012164393, "learning_rate": 4.291864662242629e-06, "loss": 0.3723, "num_tokens": 645405233.0, "step": 1568 }, { "epoch": 1.2660550458715596, "grad_norm": 0.25049435681540944, "learning_rate": 4.286497775114453e-06, "loss": 0.3803, "num_tokens": 645846016.0, "step": 1569 }, { "epoch": 1.2664373088685015, "grad_norm": 0.2846210627481902, "learning_rate": 4.281132748784714e-06, "loss": 0.3643, "num_tokens": 646247293.0, "step": 1570 }, { "epoch": 1.2668195718654434, "grad_norm": 0.25653061531991805, "learning_rate": 4.275769591480203e-06, "loss": 0.3599, "num_tokens": 646689866.0, "step": 1571 }, { "epoch": 1.2672018348623852, "grad_norm": 0.24902462114208151, "learning_rate": 4.27040831142485e-06, "loss": 0.3513, "num_tokens": 647126701.0, "step": 1572 }, { "epoch": 1.2675840978593271, "grad_norm": 0.26306414560476404, "learning_rate": 4.265048916839703e-06, "loss": 0.3496, "num_tokens": 647518508.0, "step": 1573 }, { "epoch": 1.2679663608562692, "grad_norm": 0.2537441942123504, "learning_rate": 4.259691415942923e-06, "loss": 0.3263, "num_tokens": 647918630.0, "step": 1574 }, { "epoch": 1.268348623853211, "grad_norm": 0.23722274611176833, "learning_rate": 4.2543358169497615e-06, "loss": 0.334, "num_tokens": 648340238.0, "step": 1575 }, { "epoch": 1.268730886850153, "grad_norm": 0.25457027845642066, "learning_rate": 4.2489821280725575e-06, "loss": 0.3254, "num_tokens": 648716684.0, "step": 1576 }, { "epoch": 1.2691131498470947, "grad_norm": 0.27670111132325226, "learning_rate": 4.243630357520717e-06, "loss": 0.3705, "num_tokens": 649192883.0, "step": 1577 }, { "epoch": 1.2694954128440368, "grad_norm": 0.24326388511272504, "learning_rate": 4.238280513500712e-06, "loss": 0.3706, "num_tokens": 649660463.0, "step": 1578 }, { "epoch": 1.2698776758409787, "grad_norm": 0.2425460141036009, "learning_rate": 4.2329326042160525e-06, "loss": 0.3372, "num_tokens": 650089212.0, "step": 1579 }, { "epoch": 1.2702599388379205, "grad_norm": 0.2654402725799945, "learning_rate": 4.227586637867286e-06, "loss": 0.3621, "num_tokens": 650513932.0, "step": 1580 }, { "epoch": 1.2706422018348624, "grad_norm": 0.2699134869489444, "learning_rate": 4.22224262265198e-06, "loss": 0.3502, "num_tokens": 650944041.0, "step": 1581 }, { "epoch": 1.2710244648318043, "grad_norm": 0.2605280217705378, "learning_rate": 4.216900566764706e-06, "loss": 0.3606, "num_tokens": 651375352.0, "step": 1582 }, { "epoch": 1.2714067278287462, "grad_norm": 0.2592068102550442, "learning_rate": 4.2115604783970395e-06, "loss": 0.3634, "num_tokens": 651832360.0, "step": 1583 }, { "epoch": 1.271788990825688, "grad_norm": 0.2636462795464737, "learning_rate": 4.206222365737531e-06, "loss": 0.3432, "num_tokens": 652235375.0, "step": 1584 }, { "epoch": 1.27217125382263, "grad_norm": 0.2652782810774488, "learning_rate": 4.200886236971707e-06, "loss": 0.3377, "num_tokens": 652663291.0, "step": 1585 }, { "epoch": 1.2725535168195719, "grad_norm": 0.284647558387535, "learning_rate": 4.1955521002820455e-06, "loss": 0.3469, "num_tokens": 653059005.0, "step": 1586 }, { "epoch": 1.2729357798165137, "grad_norm": 0.24201335199991722, "learning_rate": 4.190219963847979e-06, "loss": 0.3339, "num_tokens": 653433740.0, "step": 1587 }, { "epoch": 1.2733180428134556, "grad_norm": 0.2389642406696781, "learning_rate": 4.184889835845862e-06, "loss": 0.3265, "num_tokens": 653837513.0, "step": 1588 }, { "epoch": 1.2737003058103975, "grad_norm": 0.3063674740395495, "learning_rate": 4.179561724448982e-06, "loss": 0.3386, "num_tokens": 654194858.0, "step": 1589 }, { "epoch": 1.2740825688073394, "grad_norm": 0.2448844279083538, "learning_rate": 4.174235637827521e-06, "loss": 0.3154, "num_tokens": 654600496.0, "step": 1590 }, { "epoch": 1.2744648318042813, "grad_norm": 0.24337436179560917, "learning_rate": 4.168911584148564e-06, "loss": 0.3529, "num_tokens": 655055781.0, "step": 1591 }, { "epoch": 1.2748470948012232, "grad_norm": 0.24747389328879174, "learning_rate": 4.163589571576076e-06, "loss": 0.3176, "num_tokens": 655486620.0, "step": 1592 }, { "epoch": 1.2752293577981653, "grad_norm": 0.27613753335126656, "learning_rate": 4.158269608270894e-06, "loss": 0.3348, "num_tokens": 655865221.0, "step": 1593 }, { "epoch": 1.275611620795107, "grad_norm": 0.25013078860053084, "learning_rate": 4.152951702390713e-06, "loss": 0.3363, "num_tokens": 656328076.0, "step": 1594 }, { "epoch": 1.275993883792049, "grad_norm": 0.26905231532084006, "learning_rate": 4.147635862090068e-06, "loss": 0.3323, "num_tokens": 656717113.0, "step": 1595 }, { "epoch": 1.2763761467889907, "grad_norm": 0.24051223036770084, "learning_rate": 4.142322095520334e-06, "loss": 0.344, "num_tokens": 657136128.0, "step": 1596 }, { "epoch": 1.2767584097859328, "grad_norm": 0.24697322697358998, "learning_rate": 4.1370104108297025e-06, "loss": 0.3418, "num_tokens": 657517257.0, "step": 1597 }, { "epoch": 1.2771406727828747, "grad_norm": 0.2608386030485975, "learning_rate": 4.13170081616317e-06, "loss": 0.3525, "num_tokens": 657934131.0, "step": 1598 }, { "epoch": 1.2775229357798166, "grad_norm": 0.2569393826707226, "learning_rate": 4.126393319662531e-06, "loss": 0.3309, "num_tokens": 658288548.0, "step": 1599 }, { "epoch": 1.2779051987767585, "grad_norm": 0.25399379302142755, "learning_rate": 4.121087929466366e-06, "loss": 0.3434, "num_tokens": 658675160.0, "step": 1600 }, { "epoch": 1.2782874617737003, "grad_norm": 0.26215250516895544, "learning_rate": 4.11578465371002e-06, "loss": 0.3521, "num_tokens": 659095282.0, "step": 1601 }, { "epoch": 1.2786697247706422, "grad_norm": 0.2592975490564625, "learning_rate": 4.110483500525595e-06, "loss": 0.3397, "num_tokens": 659534103.0, "step": 1602 }, { "epoch": 1.279051987767584, "grad_norm": 0.2648017001314538, "learning_rate": 4.105184478041945e-06, "loss": 0.3311, "num_tokens": 659925000.0, "step": 1603 }, { "epoch": 1.279434250764526, "grad_norm": 0.2877984613405508, "learning_rate": 4.09988759438465e-06, "loss": 0.348, "num_tokens": 660340498.0, "step": 1604 }, { "epoch": 1.2798165137614679, "grad_norm": 0.24806556714119524, "learning_rate": 4.094592857676015e-06, "loss": 0.35, "num_tokens": 660804431.0, "step": 1605 }, { "epoch": 1.2801987767584098, "grad_norm": 0.26673730452209415, "learning_rate": 4.08930027603505e-06, "loss": 0.3584, "num_tokens": 661247118.0, "step": 1606 }, { "epoch": 1.2805810397553516, "grad_norm": 0.2598817801332126, "learning_rate": 4.084009857577462e-06, "loss": 0.3504, "num_tokens": 661672028.0, "step": 1607 }, { "epoch": 1.2809633027522935, "grad_norm": 0.2680424098907622, "learning_rate": 4.078721610415637e-06, "loss": 0.3534, "num_tokens": 662058492.0, "step": 1608 }, { "epoch": 1.2813455657492354, "grad_norm": 0.2966087797441796, "learning_rate": 4.07343554265864e-06, "loss": 0.3457, "num_tokens": 662458194.0, "step": 1609 }, { "epoch": 1.2817278287461773, "grad_norm": 0.2623679594058613, "learning_rate": 4.0681516624121845e-06, "loss": 0.3449, "num_tokens": 662860956.0, "step": 1610 }, { "epoch": 1.2821100917431192, "grad_norm": 0.2528543503408816, "learning_rate": 4.062869977778637e-06, "loss": 0.3507, "num_tokens": 663301109.0, "step": 1611 }, { "epoch": 1.2824923547400613, "grad_norm": 0.2542946493241396, "learning_rate": 4.057590496856993e-06, "loss": 0.3792, "num_tokens": 663795527.0, "step": 1612 }, { "epoch": 1.282874617737003, "grad_norm": 0.28365718103903786, "learning_rate": 4.05231322774287e-06, "loss": 0.3533, "num_tokens": 664241572.0, "step": 1613 }, { "epoch": 1.283256880733945, "grad_norm": 0.2540843990662738, "learning_rate": 4.047038178528494e-06, "loss": 0.3347, "num_tokens": 664611912.0, "step": 1614 }, { "epoch": 1.2836391437308867, "grad_norm": 0.27683863339835385, "learning_rate": 4.041765357302683e-06, "loss": 0.3413, "num_tokens": 665026613.0, "step": 1615 }, { "epoch": 1.2840214067278288, "grad_norm": 0.26511749851422695, "learning_rate": 4.036494772150851e-06, "loss": 0.348, "num_tokens": 665460212.0, "step": 1616 }, { "epoch": 1.2844036697247707, "grad_norm": 0.22757634047917277, "learning_rate": 4.031226431154967e-06, "loss": 0.3523, "num_tokens": 665906921.0, "step": 1617 }, { "epoch": 1.2847859327217126, "grad_norm": 0.29137810688727744, "learning_rate": 4.02596034239357e-06, "loss": 0.3426, "num_tokens": 666312411.0, "step": 1618 }, { "epoch": 1.2851681957186545, "grad_norm": 0.26685115861412273, "learning_rate": 4.0206965139417395e-06, "loss": 0.3651, "num_tokens": 666753338.0, "step": 1619 }, { "epoch": 1.2855504587155964, "grad_norm": 0.28806855548882604, "learning_rate": 4.015434953871094e-06, "loss": 0.3592, "num_tokens": 667163013.0, "step": 1620 }, { "epoch": 1.2859327217125383, "grad_norm": 0.27816067891620727, "learning_rate": 4.01017567024977e-06, "loss": 0.3389, "num_tokens": 667563780.0, "step": 1621 }, { "epoch": 1.2863149847094801, "grad_norm": 0.2772825435086106, "learning_rate": 4.0049186711424125e-06, "loss": 0.383, "num_tokens": 668022554.0, "step": 1622 }, { "epoch": 1.286697247706422, "grad_norm": 0.2649231069827948, "learning_rate": 3.999663964610168e-06, "loss": 0.3606, "num_tokens": 668475300.0, "step": 1623 }, { "epoch": 1.287079510703364, "grad_norm": 0.25321837681915044, "learning_rate": 3.99441155871066e-06, "loss": 0.3389, "num_tokens": 668853999.0, "step": 1624 }, { "epoch": 1.2874617737003058, "grad_norm": 0.32634566345089056, "learning_rate": 3.989161461497996e-06, "loss": 0.3741, "num_tokens": 669289053.0, "step": 1625 }, { "epoch": 1.2878440366972477, "grad_norm": 0.3355777915972197, "learning_rate": 3.9839136810227285e-06, "loss": 0.3554, "num_tokens": 669686111.0, "step": 1626 }, { "epoch": 1.2882262996941896, "grad_norm": 0.284979952256129, "learning_rate": 3.978668225331872e-06, "loss": 0.3652, "num_tokens": 670106394.0, "step": 1627 }, { "epoch": 1.2886085626911314, "grad_norm": 0.24701234632238284, "learning_rate": 3.973425102468864e-06, "loss": 0.3658, "num_tokens": 670502521.0, "step": 1628 }, { "epoch": 1.2889908256880733, "grad_norm": 0.27781475422291285, "learning_rate": 3.968184320473574e-06, "loss": 0.35, "num_tokens": 670925489.0, "step": 1629 }, { "epoch": 1.2893730886850152, "grad_norm": 0.2630524623493376, "learning_rate": 3.962945887382274e-06, "loss": 0.3459, "num_tokens": 671310814.0, "step": 1630 }, { "epoch": 1.2897553516819573, "grad_norm": 0.25808224344515196, "learning_rate": 3.957709811227642e-06, "loss": 0.3575, "num_tokens": 671732779.0, "step": 1631 }, { "epoch": 1.290137614678899, "grad_norm": 0.32067637746651884, "learning_rate": 3.952476100038738e-06, "loss": 0.3786, "num_tokens": 672198025.0, "step": 1632 }, { "epoch": 1.290519877675841, "grad_norm": 0.28015597849764223, "learning_rate": 3.947244761840993e-06, "loss": 0.3467, "num_tokens": 672559181.0, "step": 1633 }, { "epoch": 1.2909021406727827, "grad_norm": 0.2593265417342103, "learning_rate": 3.942015804656204e-06, "loss": 0.3585, "num_tokens": 672987382.0, "step": 1634 }, { "epoch": 1.2912844036697249, "grad_norm": 0.2619726313070248, "learning_rate": 3.936789236502513e-06, "loss": 0.3626, "num_tokens": 673430120.0, "step": 1635 }, { "epoch": 1.2916666666666667, "grad_norm": 0.25399496160010143, "learning_rate": 3.931565065394403e-06, "loss": 0.3388, "num_tokens": 673842400.0, "step": 1636 }, { "epoch": 1.2920489296636086, "grad_norm": 0.25154664823982636, "learning_rate": 3.926343299342675e-06, "loss": 0.3546, "num_tokens": 674244853.0, "step": 1637 }, { "epoch": 1.2924311926605505, "grad_norm": 0.2794727586637354, "learning_rate": 3.92112394635445e-06, "loss": 0.3641, "num_tokens": 674673487.0, "step": 1638 }, { "epoch": 1.2928134556574924, "grad_norm": 0.2665600194334451, "learning_rate": 3.915907014433142e-06, "loss": 0.3452, "num_tokens": 675086347.0, "step": 1639 }, { "epoch": 1.2931957186544343, "grad_norm": 0.2856590786036397, "learning_rate": 3.910692511578458e-06, "loss": 0.3807, "num_tokens": 675522963.0, "step": 1640 }, { "epoch": 1.2935779816513762, "grad_norm": 0.28696819659440775, "learning_rate": 3.905480445786373e-06, "loss": 0.355, "num_tokens": 675934024.0, "step": 1641 }, { "epoch": 1.293960244648318, "grad_norm": 0.2708796914563745, "learning_rate": 3.900270825049133e-06, "loss": 0.3587, "num_tokens": 676333135.0, "step": 1642 }, { "epoch": 1.29434250764526, "grad_norm": 0.2636085512434039, "learning_rate": 3.895063657355228e-06, "loss": 0.3433, "num_tokens": 676701566.0, "step": 1643 }, { "epoch": 1.2947247706422018, "grad_norm": 0.27322129613486396, "learning_rate": 3.889858950689393e-06, "loss": 0.3574, "num_tokens": 677116337.0, "step": 1644 }, { "epoch": 1.2951070336391437, "grad_norm": 0.28211374401538497, "learning_rate": 3.884656713032583e-06, "loss": 0.3374, "num_tokens": 677503302.0, "step": 1645 }, { "epoch": 1.2954892966360856, "grad_norm": 0.244467040344058, "learning_rate": 3.879456952361971e-06, "loss": 0.359, "num_tokens": 677910155.0, "step": 1646 }, { "epoch": 1.2958715596330275, "grad_norm": 0.23604624092040635, "learning_rate": 3.8742596766509314e-06, "loss": 0.376, "num_tokens": 678356073.0, "step": 1647 }, { "epoch": 1.2962538226299694, "grad_norm": 0.26952222492760003, "learning_rate": 3.869064893869023e-06, "loss": 0.3393, "num_tokens": 678729844.0, "step": 1648 }, { "epoch": 1.2966360856269112, "grad_norm": 0.2760789544630471, "learning_rate": 3.863872611981993e-06, "loss": 0.3516, "num_tokens": 679138055.0, "step": 1649 }, { "epoch": 1.2970183486238533, "grad_norm": 0.2788221969576679, "learning_rate": 3.858682838951741e-06, "loss": 0.3797, "num_tokens": 679566870.0, "step": 1650 }, { "epoch": 1.297400611620795, "grad_norm": 0.2966843717167917, "learning_rate": 3.85349558273633e-06, "loss": 0.3548, "num_tokens": 679974327.0, "step": 1651 }, { "epoch": 1.2977828746177371, "grad_norm": 0.2539789814895678, "learning_rate": 3.848310851289956e-06, "loss": 0.3405, "num_tokens": 680353683.0, "step": 1652 }, { "epoch": 1.2981651376146788, "grad_norm": 0.25887665129057746, "learning_rate": 3.8431286525629456e-06, "loss": 0.3522, "num_tokens": 680757167.0, "step": 1653 }, { "epoch": 1.2985474006116209, "grad_norm": 0.29081855181509403, "learning_rate": 3.837948994501746e-06, "loss": 0.3363, "num_tokens": 681145277.0, "step": 1654 }, { "epoch": 1.2989296636085628, "grad_norm": 0.26370598035478027, "learning_rate": 3.832771885048901e-06, "loss": 0.3684, "num_tokens": 681570550.0, "step": 1655 }, { "epoch": 1.2993119266055047, "grad_norm": 0.24263092886430465, "learning_rate": 3.827597332143056e-06, "loss": 0.3364, "num_tokens": 681965806.0, "step": 1656 }, { "epoch": 1.2996941896024465, "grad_norm": 0.2632476653974427, "learning_rate": 3.822425343718926e-06, "loss": 0.3503, "num_tokens": 682385252.0, "step": 1657 }, { "epoch": 1.3000764525993884, "grad_norm": 0.2538478860440045, "learning_rate": 3.817255927707302e-06, "loss": 0.3461, "num_tokens": 682790346.0, "step": 1658 }, { "epoch": 1.3004587155963303, "grad_norm": 0.24209429624581802, "learning_rate": 3.8120890920350207e-06, "loss": 0.3539, "num_tokens": 683200927.0, "step": 1659 }, { "epoch": 1.3008409785932722, "grad_norm": 0.26058844525798086, "learning_rate": 3.806924844624975e-06, "loss": 0.3632, "num_tokens": 683604091.0, "step": 1660 }, { "epoch": 1.301223241590214, "grad_norm": 0.2699157188183783, "learning_rate": 3.8017631933960764e-06, "loss": 0.3569, "num_tokens": 683992048.0, "step": 1661 }, { "epoch": 1.301605504587156, "grad_norm": 0.24002697908250872, "learning_rate": 3.7966041462632665e-06, "loss": 0.354, "num_tokens": 684417141.0, "step": 1662 }, { "epoch": 1.3019877675840978, "grad_norm": 0.24839689242861876, "learning_rate": 3.791447711137484e-06, "loss": 0.3428, "num_tokens": 684848353.0, "step": 1663 }, { "epoch": 1.3023700305810397, "grad_norm": 0.2699450196767002, "learning_rate": 3.7862938959256656e-06, "loss": 0.3301, "num_tokens": 685262085.0, "step": 1664 }, { "epoch": 1.3027522935779816, "grad_norm": 0.2472182995261609, "learning_rate": 3.781142708530736e-06, "loss": 0.3371, "num_tokens": 685664290.0, "step": 1665 }, { "epoch": 1.3031345565749235, "grad_norm": 0.23440243081424475, "learning_rate": 3.7759941568515835e-06, "loss": 0.3621, "num_tokens": 686105446.0, "step": 1666 }, { "epoch": 1.3035168195718654, "grad_norm": 0.26462861733039406, "learning_rate": 3.7708482487830566e-06, "loss": 0.3558, "num_tokens": 686503650.0, "step": 1667 }, { "epoch": 1.3038990825688073, "grad_norm": 0.2609048004264284, "learning_rate": 3.7657049922159507e-06, "loss": 0.3215, "num_tokens": 686914325.0, "step": 1668 }, { "epoch": 1.3042813455657494, "grad_norm": 0.2451725490958671, "learning_rate": 3.7605643950369973e-06, "loss": 0.3444, "num_tokens": 687308665.0, "step": 1669 }, { "epoch": 1.304663608562691, "grad_norm": 0.2496194545521366, "learning_rate": 3.755426465128844e-06, "loss": 0.3478, "num_tokens": 687699132.0, "step": 1670 }, { "epoch": 1.3050458715596331, "grad_norm": 0.24990664239611124, "learning_rate": 3.7502912103700573e-06, "loss": 0.3631, "num_tokens": 688122227.0, "step": 1671 }, { "epoch": 1.3054281345565748, "grad_norm": 0.2809508423841265, "learning_rate": 3.7451586386350937e-06, "loss": 0.3668, "num_tokens": 688566628.0, "step": 1672 }, { "epoch": 1.305810397553517, "grad_norm": 0.25952884156917994, "learning_rate": 3.7400287577942994e-06, "loss": 0.36, "num_tokens": 688985207.0, "step": 1673 }, { "epoch": 1.3061926605504588, "grad_norm": 0.24860883677654053, "learning_rate": 3.734901575713892e-06, "loss": 0.3489, "num_tokens": 689443908.0, "step": 1674 }, { "epoch": 1.3065749235474007, "grad_norm": 0.25107903298566203, "learning_rate": 3.7297771002559524e-06, "loss": 0.3684, "num_tokens": 689874191.0, "step": 1675 }, { "epoch": 1.3069571865443426, "grad_norm": 0.24659106899573477, "learning_rate": 3.7246553392784125e-06, "loss": 0.3132, "num_tokens": 690245371.0, "step": 1676 }, { "epoch": 1.3073394495412844, "grad_norm": 0.2522391218690764, "learning_rate": 3.7195363006350372e-06, "loss": 0.3529, "num_tokens": 690680775.0, "step": 1677 }, { "epoch": 1.3077217125382263, "grad_norm": 0.24318386137949508, "learning_rate": 3.7144199921754252e-06, "loss": 0.3433, "num_tokens": 691074592.0, "step": 1678 }, { "epoch": 1.3081039755351682, "grad_norm": 0.2561114796433777, "learning_rate": 3.7093064217449783e-06, "loss": 0.3314, "num_tokens": 691432256.0, "step": 1679 }, { "epoch": 1.30848623853211, "grad_norm": 0.25192527442953083, "learning_rate": 3.7041955971849065e-06, "loss": 0.3571, "num_tokens": 691854842.0, "step": 1680 }, { "epoch": 1.308868501529052, "grad_norm": 0.25462418470163234, "learning_rate": 3.699087526332209e-06, "loss": 0.3596, "num_tokens": 692259815.0, "step": 1681 }, { "epoch": 1.3092507645259939, "grad_norm": 0.22805146689730482, "learning_rate": 3.6939822170196616e-06, "loss": 0.3515, "num_tokens": 692714004.0, "step": 1682 }, { "epoch": 1.3096330275229358, "grad_norm": 0.24649529687676333, "learning_rate": 3.6888796770758016e-06, "loss": 0.3421, "num_tokens": 693097906.0, "step": 1683 }, { "epoch": 1.3100152905198776, "grad_norm": 0.26031175005142626, "learning_rate": 3.6837799143249244e-06, "loss": 0.335, "num_tokens": 693501353.0, "step": 1684 }, { "epoch": 1.3103975535168195, "grad_norm": 0.24710625441535372, "learning_rate": 3.678682936587068e-06, "loss": 0.3404, "num_tokens": 693885606.0, "step": 1685 }, { "epoch": 1.3107798165137614, "grad_norm": 0.24884736398500612, "learning_rate": 3.6735887516779946e-06, "loss": 0.3629, "num_tokens": 694280419.0, "step": 1686 }, { "epoch": 1.3111620795107033, "grad_norm": 0.22828035151065715, "learning_rate": 3.6684973674091885e-06, "loss": 0.346, "num_tokens": 694689794.0, "step": 1687 }, { "epoch": 1.3115443425076452, "grad_norm": 0.25747512086740254, "learning_rate": 3.6634087915878347e-06, "loss": 0.3578, "num_tokens": 695099478.0, "step": 1688 }, { "epoch": 1.311926605504587, "grad_norm": 0.26365855014073813, "learning_rate": 3.6583230320168194e-06, "loss": 0.3561, "num_tokens": 695494532.0, "step": 1689 }, { "epoch": 1.3123088685015292, "grad_norm": 0.23329354373292693, "learning_rate": 3.6532400964947e-06, "loss": 0.3182, "num_tokens": 695857981.0, "step": 1690 }, { "epoch": 1.3126911314984708, "grad_norm": 0.23915121220716568, "learning_rate": 3.648159992815714e-06, "loss": 0.3524, "num_tokens": 696274873.0, "step": 1691 }, { "epoch": 1.313073394495413, "grad_norm": 0.26791199410756306, "learning_rate": 3.6430827287697466e-06, "loss": 0.3522, "num_tokens": 696707671.0, "step": 1692 }, { "epoch": 1.3134556574923548, "grad_norm": 0.23200952961797544, "learning_rate": 3.6380083121423395e-06, "loss": 0.3494, "num_tokens": 697137970.0, "step": 1693 }, { "epoch": 1.3138379204892967, "grad_norm": 0.26802039122052357, "learning_rate": 3.6329367507146583e-06, "loss": 0.3615, "num_tokens": 697535123.0, "step": 1694 }, { "epoch": 1.3142201834862386, "grad_norm": 0.27258075226736395, "learning_rate": 3.6278680522634948e-06, "loss": 0.3732, "num_tokens": 697927816.0, "step": 1695 }, { "epoch": 1.3146024464831805, "grad_norm": 0.27244744922626585, "learning_rate": 3.6228022245612494e-06, "loss": 0.3806, "num_tokens": 698334602.0, "step": 1696 }, { "epoch": 1.3149847094801224, "grad_norm": 0.25482725022212754, "learning_rate": 3.6177392753759233e-06, "loss": 0.367, "num_tokens": 698753560.0, "step": 1697 }, { "epoch": 1.3153669724770642, "grad_norm": 0.25676648138733404, "learning_rate": 3.6126792124710995e-06, "loss": 0.3393, "num_tokens": 699181694.0, "step": 1698 }, { "epoch": 1.3157492354740061, "grad_norm": 0.27411543729618837, "learning_rate": 3.6076220436059386e-06, "loss": 0.3868, "num_tokens": 699586947.0, "step": 1699 }, { "epoch": 1.316131498470948, "grad_norm": 0.23366188176398012, "learning_rate": 3.602567776535164e-06, "loss": 0.3387, "num_tokens": 699997751.0, "step": 1700 }, { "epoch": 1.31651376146789, "grad_norm": 0.2339723951439109, "learning_rate": 3.5975164190090427e-06, "loss": 0.3485, "num_tokens": 700413746.0, "step": 1701 }, { "epoch": 1.3168960244648318, "grad_norm": 0.24634552093703108, "learning_rate": 3.592467978773392e-06, "loss": 0.3403, "num_tokens": 700792428.0, "step": 1702 }, { "epoch": 1.3172782874617737, "grad_norm": 0.24579856747239542, "learning_rate": 3.5874224635695433e-06, "loss": 0.3476, "num_tokens": 701203930.0, "step": 1703 }, { "epoch": 1.3176605504587156, "grad_norm": 0.2483170996101416, "learning_rate": 3.582379881134349e-06, "loss": 0.3279, "num_tokens": 701577323.0, "step": 1704 }, { "epoch": 1.3180428134556574, "grad_norm": 0.2528532610079257, "learning_rate": 3.5773402392001666e-06, "loss": 0.3766, "num_tokens": 702044073.0, "step": 1705 }, { "epoch": 1.3184250764525993, "grad_norm": 0.25962494185583007, "learning_rate": 3.5723035454948385e-06, "loss": 0.3616, "num_tokens": 702423613.0, "step": 1706 }, { "epoch": 1.3188073394495412, "grad_norm": 0.23479186487362205, "learning_rate": 3.5672698077416913e-06, "loss": 0.367, "num_tokens": 702848438.0, "step": 1707 }, { "epoch": 1.319189602446483, "grad_norm": 0.2672396381349754, "learning_rate": 3.5622390336595168e-06, "loss": 0.3526, "num_tokens": 703267488.0, "step": 1708 }, { "epoch": 1.3195718654434252, "grad_norm": 0.25379542698726226, "learning_rate": 3.557211230962565e-06, "loss": 0.3378, "num_tokens": 703670261.0, "step": 1709 }, { "epoch": 1.3199541284403669, "grad_norm": 0.2408783238066999, "learning_rate": 3.5521864073605197e-06, "loss": 0.341, "num_tokens": 704099842.0, "step": 1710 }, { "epoch": 1.320336391437309, "grad_norm": 0.2711219733211456, "learning_rate": 3.5471645705585125e-06, "loss": 0.3565, "num_tokens": 704499691.0, "step": 1711 }, { "epoch": 1.3207186544342506, "grad_norm": 0.2358202603524401, "learning_rate": 3.5421457282570794e-06, "loss": 0.3559, "num_tokens": 704920360.0, "step": 1712 }, { "epoch": 1.3211009174311927, "grad_norm": 0.23997352472324385, "learning_rate": 3.5371298881521775e-06, "loss": 0.3654, "num_tokens": 705333587.0, "step": 1713 }, { "epoch": 1.3214831804281346, "grad_norm": 0.253045374800109, "learning_rate": 3.5321170579351514e-06, "loss": 0.3496, "num_tokens": 705737351.0, "step": 1714 }, { "epoch": 1.3218654434250765, "grad_norm": 0.23724805464031495, "learning_rate": 3.5271072452927333e-06, "loss": 0.3683, "num_tokens": 706167055.0, "step": 1715 }, { "epoch": 1.3222477064220184, "grad_norm": 0.2495027557367922, "learning_rate": 3.5221004579070295e-06, "loss": 0.3454, "num_tokens": 706563884.0, "step": 1716 }, { "epoch": 1.3226299694189603, "grad_norm": 0.2619542188917075, "learning_rate": 3.5170967034555015e-06, "loss": 0.3475, "num_tokens": 706989701.0, "step": 1717 }, { "epoch": 1.3230122324159022, "grad_norm": 0.23258872139739586, "learning_rate": 3.512095989610972e-06, "loss": 0.3634, "num_tokens": 707416734.0, "step": 1718 }, { "epoch": 1.323394495412844, "grad_norm": 0.24531778588397174, "learning_rate": 3.507098324041587e-06, "loss": 0.3414, "num_tokens": 707790806.0, "step": 1719 }, { "epoch": 1.323776758409786, "grad_norm": 0.25038908760295703, "learning_rate": 3.5021037144108305e-06, "loss": 0.3544, "num_tokens": 708192289.0, "step": 1720 }, { "epoch": 1.3241590214067278, "grad_norm": 0.2437190032053367, "learning_rate": 3.4971121683774913e-06, "loss": 0.3378, "num_tokens": 708623219.0, "step": 1721 }, { "epoch": 1.3245412844036697, "grad_norm": 0.2304499154964965, "learning_rate": 3.492123693595666e-06, "loss": 0.3545, "num_tokens": 709059431.0, "step": 1722 }, { "epoch": 1.3249235474006116, "grad_norm": 0.25332064199962345, "learning_rate": 3.487138297714738e-06, "loss": 0.3622, "num_tokens": 709477602.0, "step": 1723 }, { "epoch": 1.3253058103975535, "grad_norm": 0.26226155182675603, "learning_rate": 3.4821559883793737e-06, "loss": 0.3437, "num_tokens": 709839255.0, "step": 1724 }, { "epoch": 1.3256880733944953, "grad_norm": 0.23792977461652384, "learning_rate": 3.4771767732295047e-06, "loss": 0.3538, "num_tokens": 710265426.0, "step": 1725 }, { "epoch": 1.3260703363914372, "grad_norm": 0.22686093364377113, "learning_rate": 3.4722006599003134e-06, "loss": 0.3432, "num_tokens": 710682501.0, "step": 1726 }, { "epoch": 1.3264525993883791, "grad_norm": 0.22365803653205701, "learning_rate": 3.467227656022236e-06, "loss": 0.3511, "num_tokens": 711118888.0, "step": 1727 }, { "epoch": 1.3268348623853212, "grad_norm": 0.23940006059197358, "learning_rate": 3.462257769220928e-06, "loss": 0.3349, "num_tokens": 711547038.0, "step": 1728 }, { "epoch": 1.3272171253822629, "grad_norm": 0.24654397058057753, "learning_rate": 3.4572910071172755e-06, "loss": 0.3492, "num_tokens": 711985799.0, "step": 1729 }, { "epoch": 1.327599388379205, "grad_norm": 0.2639918645245174, "learning_rate": 3.452327377327369e-06, "loss": 0.3511, "num_tokens": 712366886.0, "step": 1730 }, { "epoch": 1.3279816513761467, "grad_norm": 0.2540100855501442, "learning_rate": 3.4473668874624945e-06, "loss": 0.3562, "num_tokens": 712758658.0, "step": 1731 }, { "epoch": 1.3283639143730888, "grad_norm": 0.25801100106214636, "learning_rate": 3.4424095451291273e-06, "loss": 0.3327, "num_tokens": 713147810.0, "step": 1732 }, { "epoch": 1.3287461773700306, "grad_norm": 0.26082401625287394, "learning_rate": 3.4374553579289117e-06, "loss": 0.3497, "num_tokens": 713561032.0, "step": 1733 }, { "epoch": 1.3291284403669725, "grad_norm": 0.23322128338134585, "learning_rate": 3.43250433345866e-06, "loss": 0.3447, "num_tokens": 713990734.0, "step": 1734 }, { "epoch": 1.3295107033639144, "grad_norm": 0.2757943606287788, "learning_rate": 3.4275564793103226e-06, "loss": 0.3527, "num_tokens": 714366004.0, "step": 1735 }, { "epoch": 1.3298929663608563, "grad_norm": 0.2683112481848239, "learning_rate": 3.4226118030710066e-06, "loss": 0.3745, "num_tokens": 714799206.0, "step": 1736 }, { "epoch": 1.3302752293577982, "grad_norm": 0.2709962268635325, "learning_rate": 3.4176703123229294e-06, "loss": 0.3572, "num_tokens": 715193469.0, "step": 1737 }, { "epoch": 1.33065749235474, "grad_norm": 0.25891456430430126, "learning_rate": 3.412732014643432e-06, "loss": 0.3523, "num_tokens": 715595011.0, "step": 1738 }, { "epoch": 1.331039755351682, "grad_norm": 0.23862958356199837, "learning_rate": 3.4077969176049576e-06, "loss": 0.3435, "num_tokens": 716004820.0, "step": 1739 }, { "epoch": 1.3314220183486238, "grad_norm": 0.23589486284793224, "learning_rate": 3.4028650287750413e-06, "loss": 0.3433, "num_tokens": 716418977.0, "step": 1740 }, { "epoch": 1.3318042813455657, "grad_norm": 0.24345351341480068, "learning_rate": 3.3979363557163e-06, "loss": 0.3898, "num_tokens": 716834032.0, "step": 1741 }, { "epoch": 1.3321865443425076, "grad_norm": 0.24354670691167477, "learning_rate": 3.3930109059864173e-06, "loss": 0.3519, "num_tokens": 717273363.0, "step": 1742 }, { "epoch": 1.3325688073394495, "grad_norm": 0.28913050334476587, "learning_rate": 3.3880886871381358e-06, "loss": 0.3645, "num_tokens": 717700550.0, "step": 1743 }, { "epoch": 1.3329510703363914, "grad_norm": 0.27124550100484784, "learning_rate": 3.3831697067192437e-06, "loss": 0.3911, "num_tokens": 718131446.0, "step": 1744 }, { "epoch": 1.3333333333333333, "grad_norm": 0.24208857222921995, "learning_rate": 3.3782539722725606e-06, "loss": 0.3615, "num_tokens": 718574564.0, "step": 1745 }, { "epoch": 1.3337155963302751, "grad_norm": 0.45601948254369123, "learning_rate": 3.373341491335932e-06, "loss": 0.2743, "num_tokens": 718654164.0, "step": 1746 }, { "epoch": 2.000382262996942, "grad_norm": 0.2605193468540187, "learning_rate": 3.368432271442214e-06, "loss": 0.3556, "num_tokens": 719105964.0, "step": 1747 }, { "epoch": 2.0007645259938838, "grad_norm": 0.2519441102677706, "learning_rate": 3.3635263201192604e-06, "loss": 0.335, "num_tokens": 719513847.0, "step": 1748 }, { "epoch": 2.001146788990826, "grad_norm": 0.2567707267895316, "learning_rate": 3.358623644889916e-06, "loss": 0.3374, "num_tokens": 719942682.0, "step": 1749 }, { "epoch": 2.0015290519877675, "grad_norm": 0.25680848468131723, "learning_rate": 3.3537242532719983e-06, "loss": 0.3374, "num_tokens": 720359982.0, "step": 1750 }, { "epoch": 2.0019113149847096, "grad_norm": 0.21342056302975543, "learning_rate": 3.348828152778291e-06, "loss": 0.3524, "num_tokens": 720851760.0, "step": 1751 }, { "epoch": 2.0022935779816513, "grad_norm": 0.2753737792965614, "learning_rate": 3.343935350916533e-06, "loss": 0.352, "num_tokens": 721276640.0, "step": 1752 }, { "epoch": 2.0026758409785934, "grad_norm": 0.2388606259824953, "learning_rate": 3.3390458551894056e-06, "loss": 0.3277, "num_tokens": 721656157.0, "step": 1753 }, { "epoch": 2.003058103975535, "grad_norm": 0.24593308106541187, "learning_rate": 3.3341596730945114e-06, "loss": 0.3705, "num_tokens": 722069598.0, "step": 1754 }, { "epoch": 2.003440366972477, "grad_norm": 0.23737540411386032, "learning_rate": 3.329276812124388e-06, "loss": 0.3506, "num_tokens": 722475421.0, "step": 1755 }, { "epoch": 2.003822629969419, "grad_norm": 0.244529161580806, "learning_rate": 3.3243972797664647e-06, "loss": 0.3394, "num_tokens": 722908037.0, "step": 1756 }, { "epoch": 2.004204892966361, "grad_norm": 0.25326561274385917, "learning_rate": 3.319521083503075e-06, "loss": 0.3684, "num_tokens": 723324951.0, "step": 1757 }, { "epoch": 2.0045871559633026, "grad_norm": 0.2540233851780087, "learning_rate": 3.314648230811436e-06, "loss": 0.364, "num_tokens": 723705095.0, "step": 1758 }, { "epoch": 2.0049694189602447, "grad_norm": 0.24514828134241698, "learning_rate": 3.3097787291636348e-06, "loss": 0.3465, "num_tokens": 724103309.0, "step": 1759 }, { "epoch": 2.0053516819571864, "grad_norm": 0.31711799608973806, "learning_rate": 3.3049125860266252e-06, "loss": 0.363, "num_tokens": 724524692.0, "step": 1760 }, { "epoch": 2.0057339449541285, "grad_norm": 0.24220622404194303, "learning_rate": 3.300049808862203e-06, "loss": 0.3341, "num_tokens": 724926013.0, "step": 1761 }, { "epoch": 2.00611620795107, "grad_norm": 0.24282703269159878, "learning_rate": 3.2951904051270122e-06, "loss": 0.3375, "num_tokens": 725308595.0, "step": 1762 }, { "epoch": 2.0064984709480123, "grad_norm": 0.2279259878939482, "learning_rate": 3.2903343822725143e-06, "loss": 0.3648, "num_tokens": 725739755.0, "step": 1763 }, { "epoch": 2.006880733944954, "grad_norm": 0.23653094846075276, "learning_rate": 3.285481747744997e-06, "loss": 0.3116, "num_tokens": 726142186.0, "step": 1764 }, { "epoch": 2.007262996941896, "grad_norm": 0.25262850048228397, "learning_rate": 3.2806325089855408e-06, "loss": 0.3451, "num_tokens": 726527765.0, "step": 1765 }, { "epoch": 2.007645259938838, "grad_norm": 0.2519291337921129, "learning_rate": 3.275786673430028e-06, "loss": 0.3683, "num_tokens": 726979577.0, "step": 1766 }, { "epoch": 2.00802752293578, "grad_norm": 0.26062581077908115, "learning_rate": 3.270944248509119e-06, "loss": 0.3386, "num_tokens": 727392753.0, "step": 1767 }, { "epoch": 2.008409785932722, "grad_norm": 0.26258706473535826, "learning_rate": 3.266105241648243e-06, "loss": 0.3541, "num_tokens": 727815584.0, "step": 1768 }, { "epoch": 2.0087920489296636, "grad_norm": 0.2296541171626925, "learning_rate": 3.2612696602675943e-06, "loss": 0.3379, "num_tokens": 728223806.0, "step": 1769 }, { "epoch": 2.0091743119266057, "grad_norm": 0.24104741959317136, "learning_rate": 3.256437511782101e-06, "loss": 0.3395, "num_tokens": 728649003.0, "step": 1770 }, { "epoch": 2.0095565749235473, "grad_norm": 0.27132654885297314, "learning_rate": 3.2516088036014444e-06, "loss": 0.351, "num_tokens": 729122402.0, "step": 1771 }, { "epoch": 2.0099388379204894, "grad_norm": 0.2525735619578914, "learning_rate": 3.246783543130014e-06, "loss": 0.3428, "num_tokens": 729512961.0, "step": 1772 }, { "epoch": 2.010321100917431, "grad_norm": 0.2569081413889353, "learning_rate": 3.24196173776693e-06, "loss": 0.3176, "num_tokens": 729905841.0, "step": 1773 }, { "epoch": 2.010703363914373, "grad_norm": 0.245024897720478, "learning_rate": 3.237143394905996e-06, "loss": 0.3196, "num_tokens": 730293014.0, "step": 1774 }, { "epoch": 2.011085626911315, "grad_norm": 0.2241928592542632, "learning_rate": 3.2323285219357194e-06, "loss": 0.3299, "num_tokens": 730694260.0, "step": 1775 }, { "epoch": 2.011467889908257, "grad_norm": 0.25810531840059814, "learning_rate": 3.2275171262392835e-06, "loss": 0.311, "num_tokens": 731043568.0, "step": 1776 }, { "epoch": 2.0118501529051986, "grad_norm": 0.2873222386701609, "learning_rate": 3.2227092151945327e-06, "loss": 0.3248, "num_tokens": 731415945.0, "step": 1777 }, { "epoch": 2.0122324159021407, "grad_norm": 0.2581797267971073, "learning_rate": 3.2179047961739807e-06, "loss": 0.3184, "num_tokens": 731809002.0, "step": 1778 }, { "epoch": 2.0126146788990824, "grad_norm": 0.263628999936447, "learning_rate": 3.213103876544773e-06, "loss": 0.352, "num_tokens": 732230925.0, "step": 1779 }, { "epoch": 2.0129969418960245, "grad_norm": 0.2670793384235646, "learning_rate": 3.208306463668703e-06, "loss": 0.3343, "num_tokens": 732628653.0, "step": 1780 }, { "epoch": 2.013379204892966, "grad_norm": 0.24439546657845299, "learning_rate": 3.2035125649021733e-06, "loss": 0.3056, "num_tokens": 733028213.0, "step": 1781 }, { "epoch": 2.0137614678899083, "grad_norm": 0.2569393905404549, "learning_rate": 3.198722187596206e-06, "loss": 0.3197, "num_tokens": 733431075.0, "step": 1782 }, { "epoch": 2.01414373088685, "grad_norm": 0.23536067023370272, "learning_rate": 3.1939353390964224e-06, "loss": 0.3231, "num_tokens": 733833055.0, "step": 1783 }, { "epoch": 2.014525993883792, "grad_norm": 0.24503016888061527, "learning_rate": 3.189152026743031e-06, "loss": 0.3251, "num_tokens": 734262584.0, "step": 1784 }, { "epoch": 2.014908256880734, "grad_norm": 0.25207753138399924, "learning_rate": 3.184372257870818e-06, "loss": 0.3197, "num_tokens": 734648250.0, "step": 1785 }, { "epoch": 2.015290519877676, "grad_norm": 0.2405222425834741, "learning_rate": 3.1795960398091373e-06, "loss": 0.3059, "num_tokens": 735083618.0, "step": 1786 }, { "epoch": 2.015672782874618, "grad_norm": 0.27418869787100386, "learning_rate": 3.1748233798818997e-06, "loss": 0.2965, "num_tokens": 735463309.0, "step": 1787 }, { "epoch": 2.0160550458715596, "grad_norm": 0.24639733231677355, "learning_rate": 3.1700542854075508e-06, "loss": 0.3252, "num_tokens": 735865715.0, "step": 1788 }, { "epoch": 2.0164373088685017, "grad_norm": 0.2304158868186682, "learning_rate": 3.165288763699084e-06, "loss": 0.3311, "num_tokens": 736326065.0, "step": 1789 }, { "epoch": 2.0168195718654434, "grad_norm": 0.24822470747781525, "learning_rate": 3.1605268220639986e-06, "loss": 0.324, "num_tokens": 736747892.0, "step": 1790 }, { "epoch": 2.0172018348623855, "grad_norm": 0.23398422506593472, "learning_rate": 3.1557684678043145e-06, "loss": 0.2899, "num_tokens": 737166530.0, "step": 1791 }, { "epoch": 2.017584097859327, "grad_norm": 0.2744979500517675, "learning_rate": 3.1510137082165475e-06, "loss": 0.3061, "num_tokens": 737550285.0, "step": 1792 }, { "epoch": 2.0179663608562692, "grad_norm": 0.2589243971620022, "learning_rate": 3.146262550591701e-06, "loss": 0.3193, "num_tokens": 737956094.0, "step": 1793 }, { "epoch": 2.018348623853211, "grad_norm": 0.25721306935861216, "learning_rate": 3.1415150022152564e-06, "loss": 0.3151, "num_tokens": 738369559.0, "step": 1794 }, { "epoch": 2.018730886850153, "grad_norm": 0.25607224589419947, "learning_rate": 3.136771070367157e-06, "loss": 0.3186, "num_tokens": 738785166.0, "step": 1795 }, { "epoch": 2.0191131498470947, "grad_norm": 0.2646648028661554, "learning_rate": 3.1320307623218075e-06, "loss": 0.3264, "num_tokens": 739217301.0, "step": 1796 }, { "epoch": 2.0194954128440368, "grad_norm": 0.2426999415604464, "learning_rate": 3.1272940853480473e-06, "loss": 0.3142, "num_tokens": 739626966.0, "step": 1797 }, { "epoch": 2.0198776758409784, "grad_norm": 0.27346599109973424, "learning_rate": 3.1225610467091533e-06, "loss": 0.3109, "num_tokens": 740029041.0, "step": 1798 }, { "epoch": 2.0202599388379205, "grad_norm": 0.25062726426604437, "learning_rate": 3.117831653662822e-06, "loss": 0.2905, "num_tokens": 740414177.0, "step": 1799 }, { "epoch": 2.020642201834862, "grad_norm": 0.2286203124903576, "learning_rate": 3.1131059134611595e-06, "loss": 0.2722, "num_tokens": 740804304.0, "step": 1800 }, { "epoch": 2.0210244648318043, "grad_norm": 0.23416956734703526, "learning_rate": 3.1083838333506715e-06, "loss": 0.2998, "num_tokens": 741205986.0, "step": 1801 }, { "epoch": 2.021406727828746, "grad_norm": 0.24339664061892777, "learning_rate": 3.1036654205722503e-06, "loss": 0.3052, "num_tokens": 741579886.0, "step": 1802 }, { "epoch": 2.021788990825688, "grad_norm": 0.24777188271430553, "learning_rate": 3.098950682361166e-06, "loss": 0.3113, "num_tokens": 741956543.0, "step": 1803 }, { "epoch": 2.02217125382263, "grad_norm": 0.2505532161477693, "learning_rate": 3.094239625947051e-06, "loss": 0.3224, "num_tokens": 742368740.0, "step": 1804 }, { "epoch": 2.022553516819572, "grad_norm": 0.2740097837549157, "learning_rate": 3.089532258553895e-06, "loss": 0.3109, "num_tokens": 742761394.0, "step": 1805 }, { "epoch": 2.022935779816514, "grad_norm": 0.25186197910802094, "learning_rate": 3.0848285874000326e-06, "loss": 0.3125, "num_tokens": 743178753.0, "step": 1806 }, { "epoch": 2.0233180428134556, "grad_norm": 0.25063294806680786, "learning_rate": 3.0801286196981234e-06, "loss": 0.3221, "num_tokens": 743640252.0, "step": 1807 }, { "epoch": 2.0237003058103977, "grad_norm": 0.25593016534508156, "learning_rate": 3.0754323626551543e-06, "loss": 0.3053, "num_tokens": 744041823.0, "step": 1808 }, { "epoch": 2.0240825688073394, "grad_norm": 0.22744175225847083, "learning_rate": 3.0707398234724206e-06, "loss": 0.2991, "num_tokens": 744459065.0, "step": 1809 }, { "epoch": 2.0244648318042815, "grad_norm": 0.2446511972336764, "learning_rate": 3.066051009345517e-06, "loss": 0.3144, "num_tokens": 744862164.0, "step": 1810 }, { "epoch": 2.024847094801223, "grad_norm": 0.25524397782622377, "learning_rate": 3.0613659274643255e-06, "loss": 0.2919, "num_tokens": 745220691.0, "step": 1811 }, { "epoch": 2.0252293577981653, "grad_norm": 0.24391027544377664, "learning_rate": 3.0566845850130043e-06, "loss": 0.3383, "num_tokens": 745639087.0, "step": 1812 }, { "epoch": 2.025611620795107, "grad_norm": 0.2486522240362927, "learning_rate": 3.052006989169981e-06, "loss": 0.2932, "num_tokens": 746049043.0, "step": 1813 }, { "epoch": 2.025993883792049, "grad_norm": 0.23771560424583243, "learning_rate": 3.0473331471079307e-06, "loss": 0.2735, "num_tokens": 746450290.0, "step": 1814 }, { "epoch": 2.0263761467889907, "grad_norm": 0.26724819167392405, "learning_rate": 3.0426630659937834e-06, "loss": 0.3156, "num_tokens": 746877820.0, "step": 1815 }, { "epoch": 2.026758409785933, "grad_norm": 0.2765474352632359, "learning_rate": 3.0379967529886904e-06, "loss": 0.3261, "num_tokens": 747290202.0, "step": 1816 }, { "epoch": 2.0271406727828745, "grad_norm": 0.2525481884464414, "learning_rate": 3.0333342152480332e-06, "loss": 0.295, "num_tokens": 747691191.0, "step": 1817 }, { "epoch": 2.0275229357798166, "grad_norm": 0.24171910353744017, "learning_rate": 3.0286754599214007e-06, "loss": 0.2984, "num_tokens": 748090982.0, "step": 1818 }, { "epoch": 2.0279051987767582, "grad_norm": 0.2728246330869588, "learning_rate": 3.0240204941525818e-06, "loss": 0.3023, "num_tokens": 748489459.0, "step": 1819 }, { "epoch": 2.0282874617737003, "grad_norm": 0.27101232699076977, "learning_rate": 3.0193693250795587e-06, "loss": 0.3246, "num_tokens": 748930093.0, "step": 1820 }, { "epoch": 2.028669724770642, "grad_norm": 0.2527746512773829, "learning_rate": 3.0147219598344823e-06, "loss": 0.3073, "num_tokens": 749367341.0, "step": 1821 }, { "epoch": 2.029051987767584, "grad_norm": 0.2668158023550485, "learning_rate": 3.0100784055436818e-06, "loss": 0.3062, "num_tokens": 749739523.0, "step": 1822 }, { "epoch": 2.0294342507645258, "grad_norm": 0.22226742564938898, "learning_rate": 3.005438669327633e-06, "loss": 0.2882, "num_tokens": 750137755.0, "step": 1823 }, { "epoch": 2.029816513761468, "grad_norm": 0.25713361899649634, "learning_rate": 3.000802758300967e-06, "loss": 0.2961, "num_tokens": 750506620.0, "step": 1824 }, { "epoch": 2.03019877675841, "grad_norm": 0.29298680481859035, "learning_rate": 2.9961706795724366e-06, "loss": 0.3216, "num_tokens": 750907674.0, "step": 1825 }, { "epoch": 2.0305810397553516, "grad_norm": 0.24964897621978419, "learning_rate": 2.9915424402449334e-06, "loss": 0.3096, "num_tokens": 751286645.0, "step": 1826 }, { "epoch": 2.0309633027522938, "grad_norm": 0.25105094629201735, "learning_rate": 2.9869180474154473e-06, "loss": 0.3299, "num_tokens": 751690243.0, "step": 1827 }, { "epoch": 2.0313455657492354, "grad_norm": 0.24559835593024112, "learning_rate": 2.9822975081750776e-06, "loss": 0.2992, "num_tokens": 752119840.0, "step": 1828 }, { "epoch": 2.0317278287461775, "grad_norm": 0.24776057343020483, "learning_rate": 2.9776808296090155e-06, "loss": 0.3201, "num_tokens": 752552466.0, "step": 1829 }, { "epoch": 2.032110091743119, "grad_norm": 0.2515573527523867, "learning_rate": 2.9730680187965237e-06, "loss": 0.3068, "num_tokens": 752936825.0, "step": 1830 }, { "epoch": 2.0324923547400613, "grad_norm": 0.2531370379471016, "learning_rate": 2.9684590828109473e-06, "loss": 0.2794, "num_tokens": 753305996.0, "step": 1831 }, { "epoch": 2.032874617737003, "grad_norm": 0.24464212022852835, "learning_rate": 2.963854028719676e-06, "loss": 0.3002, "num_tokens": 753686005.0, "step": 1832 }, { "epoch": 2.033256880733945, "grad_norm": 0.2248890795516489, "learning_rate": 2.959252863584159e-06, "loss": 0.2842, "num_tokens": 754109910.0, "step": 1833 }, { "epoch": 2.0336391437308867, "grad_norm": 0.25739735506399464, "learning_rate": 2.954655594459872e-06, "loss": 0.3363, "num_tokens": 754565104.0, "step": 1834 }, { "epoch": 2.034021406727829, "grad_norm": 0.2726606936521271, "learning_rate": 2.950062228396323e-06, "loss": 0.3092, "num_tokens": 754937706.0, "step": 1835 }, { "epoch": 2.0344036697247705, "grad_norm": 0.248065062932608, "learning_rate": 2.9454727724370325e-06, "loss": 0.2879, "num_tokens": 755310494.0, "step": 1836 }, { "epoch": 2.0347859327217126, "grad_norm": 0.23527074332222286, "learning_rate": 2.9408872336195252e-06, "loss": 0.2931, "num_tokens": 755698745.0, "step": 1837 }, { "epoch": 2.0351681957186543, "grad_norm": 0.2594880311173301, "learning_rate": 2.9363056189753224e-06, "loss": 0.3189, "num_tokens": 756139122.0, "step": 1838 }, { "epoch": 2.0355504587155964, "grad_norm": 0.29091929784036424, "learning_rate": 2.931727935529921e-06, "loss": 0.2956, "num_tokens": 756532212.0, "step": 1839 }, { "epoch": 2.035932721712538, "grad_norm": 0.2378701459873727, "learning_rate": 2.9271541903027984e-06, "loss": 0.2975, "num_tokens": 756924676.0, "step": 1840 }, { "epoch": 2.03631498470948, "grad_norm": 0.24182257083588726, "learning_rate": 2.9225843903073854e-06, "loss": 0.2907, "num_tokens": 757303985.0, "step": 1841 }, { "epoch": 2.036697247706422, "grad_norm": 0.24109892958833565, "learning_rate": 2.9180185425510678e-06, "loss": 0.3131, "num_tokens": 757727215.0, "step": 1842 }, { "epoch": 2.037079510703364, "grad_norm": 0.23183766436989323, "learning_rate": 2.9134566540351695e-06, "loss": 0.3051, "num_tokens": 758162846.0, "step": 1843 }, { "epoch": 2.037461773700306, "grad_norm": 0.24656029234859986, "learning_rate": 2.9088987317549443e-06, "loss": 0.277, "num_tokens": 758539184.0, "step": 1844 }, { "epoch": 2.0378440366972477, "grad_norm": 0.234823666828163, "learning_rate": 2.904344782699562e-06, "loss": 0.2874, "num_tokens": 758933524.0, "step": 1845 }, { "epoch": 2.03822629969419, "grad_norm": 0.24334942235774046, "learning_rate": 2.899794813852102e-06, "loss": 0.3138, "num_tokens": 759361176.0, "step": 1846 }, { "epoch": 2.0386085626911314, "grad_norm": 0.2313774041503865, "learning_rate": 2.895248832189541e-06, "loss": 0.291, "num_tokens": 759772585.0, "step": 1847 }, { "epoch": 2.0389908256880735, "grad_norm": 0.2591648093940398, "learning_rate": 2.8907068446827348e-06, "loss": 0.3053, "num_tokens": 760109076.0, "step": 1848 }, { "epoch": 2.039373088685015, "grad_norm": 0.26922079840987917, "learning_rate": 2.8861688582964263e-06, "loss": 0.3204, "num_tokens": 760522674.0, "step": 1849 }, { "epoch": 2.0397553516819573, "grad_norm": 0.25498507046188246, "learning_rate": 2.8816348799892134e-06, "loss": 0.302, "num_tokens": 760910044.0, "step": 1850 }, { "epoch": 2.040137614678899, "grad_norm": 0.25255352580310136, "learning_rate": 2.8771049167135507e-06, "loss": 0.2887, "num_tokens": 761297622.0, "step": 1851 }, { "epoch": 2.040519877675841, "grad_norm": 0.2525503254488794, "learning_rate": 2.8725789754157385e-06, "loss": 0.2867, "num_tokens": 761690778.0, "step": 1852 }, { "epoch": 2.0409021406727827, "grad_norm": 0.25335127324537604, "learning_rate": 2.868057063035906e-06, "loss": 0.2973, "num_tokens": 762088968.0, "step": 1853 }, { "epoch": 2.041284403669725, "grad_norm": 0.24506273018165037, "learning_rate": 2.8635391865080074e-06, "loss": 0.3124, "num_tokens": 762520894.0, "step": 1854 }, { "epoch": 2.0416666666666665, "grad_norm": 0.2638475241286148, "learning_rate": 2.8590253527598073e-06, "loss": 0.3248, "num_tokens": 762932618.0, "step": 1855 }, { "epoch": 2.0420489296636086, "grad_norm": 0.25979010390983487, "learning_rate": 2.8545155687128706e-06, "loss": 0.3051, "num_tokens": 763316146.0, "step": 1856 }, { "epoch": 2.0424311926605503, "grad_norm": 0.24717900134876009, "learning_rate": 2.850009841282554e-06, "loss": 0.3068, "num_tokens": 763728595.0, "step": 1857 }, { "epoch": 2.0428134556574924, "grad_norm": 0.23594218221231555, "learning_rate": 2.8455081773779893e-06, "loss": 0.3113, "num_tokens": 764202688.0, "step": 1858 }, { "epoch": 2.043195718654434, "grad_norm": 0.2567370431225666, "learning_rate": 2.841010583902082e-06, "loss": 0.3265, "num_tokens": 764628951.0, "step": 1859 }, { "epoch": 2.043577981651376, "grad_norm": 0.2354944113587352, "learning_rate": 2.836517067751494e-06, "loss": 0.3004, "num_tokens": 765041421.0, "step": 1860 }, { "epoch": 2.043960244648318, "grad_norm": 0.24806082804422305, "learning_rate": 2.8320276358166365e-06, "loss": 0.3125, "num_tokens": 765493970.0, "step": 1861 }, { "epoch": 2.04434250764526, "grad_norm": 0.24444157506679715, "learning_rate": 2.8275422949816556e-06, "loss": 0.2959, "num_tokens": 765906137.0, "step": 1862 }, { "epoch": 2.044724770642202, "grad_norm": 0.2510230348240986, "learning_rate": 2.823061052124425e-06, "loss": 0.3016, "num_tokens": 766311280.0, "step": 1863 }, { "epoch": 2.0451070336391437, "grad_norm": 0.23636816214188436, "learning_rate": 2.818583914116535e-06, "loss": 0.2784, "num_tokens": 766712496.0, "step": 1864 }, { "epoch": 2.045489296636086, "grad_norm": 0.2670457603858624, "learning_rate": 2.814110887823281e-06, "loss": 0.314, "num_tokens": 767143731.0, "step": 1865 }, { "epoch": 2.0458715596330275, "grad_norm": 0.25096696615664604, "learning_rate": 2.8096419801036552e-06, "loss": 0.305, "num_tokens": 767566242.0, "step": 1866 }, { "epoch": 2.0462538226299696, "grad_norm": 0.23710405984066324, "learning_rate": 2.805177197810329e-06, "loss": 0.3205, "num_tokens": 767991366.0, "step": 1867 }, { "epoch": 2.0466360856269112, "grad_norm": 0.23647908195758097, "learning_rate": 2.800716547789656e-06, "loss": 0.3, "num_tokens": 768435953.0, "step": 1868 }, { "epoch": 2.0470183486238533, "grad_norm": 0.25090574546474453, "learning_rate": 2.796260036881645e-06, "loss": 0.3293, "num_tokens": 768884222.0, "step": 1869 }, { "epoch": 2.047400611620795, "grad_norm": 0.25437833149929656, "learning_rate": 2.7918076719199626e-06, "loss": 0.3239, "num_tokens": 769283220.0, "step": 1870 }, { "epoch": 2.047782874617737, "grad_norm": 0.25256325876657243, "learning_rate": 2.787359459731919e-06, "loss": 0.3054, "num_tokens": 769706797.0, "step": 1871 }, { "epoch": 2.0481651376146788, "grad_norm": 0.23409241520891302, "learning_rate": 2.7829154071384528e-06, "loss": 0.2852, "num_tokens": 770112469.0, "step": 1872 }, { "epoch": 2.048547400611621, "grad_norm": 0.24157602590040073, "learning_rate": 2.7784755209541283e-06, "loss": 0.3288, "num_tokens": 770571324.0, "step": 1873 }, { "epoch": 2.0489296636085625, "grad_norm": 0.2410693897499373, "learning_rate": 2.7740398079871133e-06, "loss": 0.2974, "num_tokens": 770961764.0, "step": 1874 }, { "epoch": 2.0493119266055047, "grad_norm": 0.2921642399613461, "learning_rate": 2.7696082750391886e-06, "loss": 0.3079, "num_tokens": 771371095.0, "step": 1875 }, { "epoch": 2.0496941896024463, "grad_norm": 0.24826658815863353, "learning_rate": 2.765180928905712e-06, "loss": 0.3009, "num_tokens": 771798673.0, "step": 1876 }, { "epoch": 2.0500764525993884, "grad_norm": 0.2836604359914633, "learning_rate": 2.7607577763756333e-06, "loss": 0.3188, "num_tokens": 772163103.0, "step": 1877 }, { "epoch": 2.05045871559633, "grad_norm": 0.2365639298982091, "learning_rate": 2.7563388242314615e-06, "loss": 0.3171, "num_tokens": 772587281.0, "step": 1878 }, { "epoch": 2.050840978593272, "grad_norm": 0.21631205242485615, "learning_rate": 2.7519240792492717e-06, "loss": 0.3118, "num_tokens": 773017673.0, "step": 1879 }, { "epoch": 2.051223241590214, "grad_norm": 0.22689033386267388, "learning_rate": 2.7475135481986847e-06, "loss": 0.3031, "num_tokens": 773421549.0, "step": 1880 }, { "epoch": 2.051605504587156, "grad_norm": 0.22404085649956276, "learning_rate": 2.7431072378428604e-06, "loss": 0.3132, "num_tokens": 773853341.0, "step": 1881 }, { "epoch": 2.051987767584098, "grad_norm": 0.2613187639618827, "learning_rate": 2.738705154938487e-06, "loss": 0.2824, "num_tokens": 774223736.0, "step": 1882 }, { "epoch": 2.0523700305810397, "grad_norm": 0.23813393871831123, "learning_rate": 2.7343073062357655e-06, "loss": 0.3069, "num_tokens": 774636709.0, "step": 1883 }, { "epoch": 2.052752293577982, "grad_norm": 0.2528773175125097, "learning_rate": 2.7299136984784146e-06, "loss": 0.2937, "num_tokens": 775055518.0, "step": 1884 }, { "epoch": 2.0531345565749235, "grad_norm": 0.25005658199878217, "learning_rate": 2.7255243384036383e-06, "loss": 0.3174, "num_tokens": 775497286.0, "step": 1885 }, { "epoch": 2.0535168195718656, "grad_norm": 0.22476413457932462, "learning_rate": 2.721139232742137e-06, "loss": 0.2983, "num_tokens": 775913214.0, "step": 1886 }, { "epoch": 2.0538990825688073, "grad_norm": 0.23422843440615318, "learning_rate": 2.7167583882180794e-06, "loss": 0.2849, "num_tokens": 776308727.0, "step": 1887 }, { "epoch": 2.0542813455657494, "grad_norm": 0.2367788294580391, "learning_rate": 2.712381811549104e-06, "loss": 0.2918, "num_tokens": 776719045.0, "step": 1888 }, { "epoch": 2.054663608562691, "grad_norm": 0.25154860367862086, "learning_rate": 2.708009509446307e-06, "loss": 0.2772, "num_tokens": 777132833.0, "step": 1889 }, { "epoch": 2.055045871559633, "grad_norm": 0.2853307411961546, "learning_rate": 2.703641488614222e-06, "loss": 0.2947, "num_tokens": 777518237.0, "step": 1890 }, { "epoch": 2.055428134556575, "grad_norm": 0.2892952858160809, "learning_rate": 2.6992777557508287e-06, "loss": 0.3031, "num_tokens": 777920029.0, "step": 1891 }, { "epoch": 2.055810397553517, "grad_norm": 0.23930523401692808, "learning_rate": 2.6949183175475213e-06, "loss": 0.312, "num_tokens": 778368151.0, "step": 1892 }, { "epoch": 2.0561926605504586, "grad_norm": 0.2462465355906849, "learning_rate": 2.6905631806891176e-06, "loss": 0.3006, "num_tokens": 778770244.0, "step": 1893 }, { "epoch": 2.0565749235474007, "grad_norm": 0.2678351029342093, "learning_rate": 2.6862123518538306e-06, "loss": 0.3246, "num_tokens": 779183007.0, "step": 1894 }, { "epoch": 2.0569571865443423, "grad_norm": 0.2602739183230476, "learning_rate": 2.681865837713275e-06, "loss": 0.3279, "num_tokens": 779615373.0, "step": 1895 }, { "epoch": 2.0573394495412844, "grad_norm": 0.22151913278866325, "learning_rate": 2.6775236449324448e-06, "loss": 0.2939, "num_tokens": 780076818.0, "step": 1896 }, { "epoch": 2.057721712538226, "grad_norm": 0.23749909608017644, "learning_rate": 2.6731857801697096e-06, "loss": 0.3013, "num_tokens": 780536956.0, "step": 1897 }, { "epoch": 2.058103975535168, "grad_norm": 0.2809814164220392, "learning_rate": 2.668852250076801e-06, "loss": 0.2972, "num_tokens": 780945303.0, "step": 1898 }, { "epoch": 2.05848623853211, "grad_norm": 0.25984682014967614, "learning_rate": 2.664523061298806e-06, "loss": 0.331, "num_tokens": 781388653.0, "step": 1899 }, { "epoch": 2.058868501529052, "grad_norm": 0.23952194385567244, "learning_rate": 2.6601982204741524e-06, "loss": 0.3258, "num_tokens": 781839006.0, "step": 1900 }, { "epoch": 2.059250764525994, "grad_norm": 0.2556101438240176, "learning_rate": 2.6558777342345982e-06, "loss": 0.3302, "num_tokens": 782271159.0, "step": 1901 }, { "epoch": 2.0596330275229358, "grad_norm": 0.2852184673002273, "learning_rate": 2.6515616092052332e-06, "loss": 0.274, "num_tokens": 782637793.0, "step": 1902 }, { "epoch": 2.060015290519878, "grad_norm": 0.2529499833638473, "learning_rate": 2.647249852004449e-06, "loss": 0.304, "num_tokens": 783078149.0, "step": 1903 }, { "epoch": 2.0603975535168195, "grad_norm": 0.25474573584430266, "learning_rate": 2.6429424692439467e-06, "loss": 0.2794, "num_tokens": 783454245.0, "step": 1904 }, { "epoch": 2.0607798165137616, "grad_norm": 0.2538441741205928, "learning_rate": 2.638639467528715e-06, "loss": 0.2841, "num_tokens": 783879903.0, "step": 1905 }, { "epoch": 2.0611620795107033, "grad_norm": 0.23563383153650205, "learning_rate": 2.6343408534570295e-06, "loss": 0.3146, "num_tokens": 784307049.0, "step": 1906 }, { "epoch": 2.0615443425076454, "grad_norm": 0.2995604393309861, "learning_rate": 2.6300466336204333e-06, "loss": 0.3144, "num_tokens": 784725197.0, "step": 1907 }, { "epoch": 2.061926605504587, "grad_norm": 0.239721608159916, "learning_rate": 2.625756814603734e-06, "loss": 0.2867, "num_tokens": 785118223.0, "step": 1908 }, { "epoch": 2.062308868501529, "grad_norm": 0.236866412604189, "learning_rate": 2.621471402984991e-06, "loss": 0.2878, "num_tokens": 785529233.0, "step": 1909 }, { "epoch": 2.062691131498471, "grad_norm": 0.2505921338189514, "learning_rate": 2.6171904053355012e-06, "loss": 0.3021, "num_tokens": 785957992.0, "step": 1910 }, { "epoch": 2.063073394495413, "grad_norm": 0.25622741884888617, "learning_rate": 2.6129138282197976e-06, "loss": 0.3178, "num_tokens": 786406312.0, "step": 1911 }, { "epoch": 2.0634556574923546, "grad_norm": 0.23381622623250836, "learning_rate": 2.6086416781956342e-06, "loss": 0.2836, "num_tokens": 786773746.0, "step": 1912 }, { "epoch": 2.0638379204892967, "grad_norm": 0.2419129447930029, "learning_rate": 2.6043739618139744e-06, "loss": 0.2818, "num_tokens": 787159103.0, "step": 1913 }, { "epoch": 2.0642201834862384, "grad_norm": 0.2417967267051386, "learning_rate": 2.600110685618985e-06, "loss": 0.3093, "num_tokens": 787601671.0, "step": 1914 }, { "epoch": 2.0646024464831805, "grad_norm": 0.26314094824173706, "learning_rate": 2.595851856148024e-06, "loss": 0.3177, "num_tokens": 788016645.0, "step": 1915 }, { "epoch": 2.064984709480122, "grad_norm": 0.23691810789002488, "learning_rate": 2.591597479931629e-06, "loss": 0.2942, "num_tokens": 788407801.0, "step": 1916 }, { "epoch": 2.0653669724770642, "grad_norm": 0.2603964480266365, "learning_rate": 2.58734756349351e-06, "loss": 0.2809, "num_tokens": 788790923.0, "step": 1917 }, { "epoch": 2.065749235474006, "grad_norm": 0.2573798564504652, "learning_rate": 2.5831021133505385e-06, "loss": 0.3026, "num_tokens": 789229175.0, "step": 1918 }, { "epoch": 2.066131498470948, "grad_norm": 0.29045468874027264, "learning_rate": 2.578861136012739e-06, "loss": 0.2979, "num_tokens": 789638781.0, "step": 1919 }, { "epoch": 2.06651376146789, "grad_norm": 0.2758999303570352, "learning_rate": 2.5746246379832716e-06, "loss": 0.3151, "num_tokens": 790056351.0, "step": 1920 }, { "epoch": 2.066896024464832, "grad_norm": 0.25087129346208803, "learning_rate": 2.5703926257584344e-06, "loss": 0.2939, "num_tokens": 790450011.0, "step": 1921 }, { "epoch": 2.067278287461774, "grad_norm": 0.256337724302696, "learning_rate": 2.566165105827644e-06, "loss": 0.3372, "num_tokens": 790901889.0, "step": 1922 }, { "epoch": 2.0676605504587156, "grad_norm": 0.23369123777508471, "learning_rate": 2.561942084673428e-06, "loss": 0.3118, "num_tokens": 791347802.0, "step": 1923 }, { "epoch": 2.0680428134556577, "grad_norm": 0.22942665179069943, "learning_rate": 2.5577235687714162e-06, "loss": 0.2842, "num_tokens": 791759807.0, "step": 1924 }, { "epoch": 2.0684250764525993, "grad_norm": 0.2383708107043818, "learning_rate": 2.553509564590331e-06, "loss": 0.3055, "num_tokens": 792217480.0, "step": 1925 }, { "epoch": 2.0688073394495414, "grad_norm": 0.28099771231634224, "learning_rate": 2.549300078591975e-06, "loss": 0.3189, "num_tokens": 792640562.0, "step": 1926 }, { "epoch": 2.069189602446483, "grad_norm": 0.26192783135072806, "learning_rate": 2.545095117231221e-06, "loss": 0.2982, "num_tokens": 793053819.0, "step": 1927 }, { "epoch": 2.069571865443425, "grad_norm": 0.27683330662659467, "learning_rate": 2.54089468695601e-06, "loss": 0.2922, "num_tokens": 793418494.0, "step": 1928 }, { "epoch": 2.069954128440367, "grad_norm": 0.24193480713225446, "learning_rate": 2.536698794207327e-06, "loss": 0.2728, "num_tokens": 793775817.0, "step": 1929 }, { "epoch": 2.070336391437309, "grad_norm": 0.2585965594706889, "learning_rate": 2.5325074454192035e-06, "loss": 0.2889, "num_tokens": 794185908.0, "step": 1930 }, { "epoch": 2.0707186544342506, "grad_norm": 0.2578078103759536, "learning_rate": 2.5283206470187034e-06, "loss": 0.2951, "num_tokens": 794552715.0, "step": 1931 }, { "epoch": 2.0711009174311927, "grad_norm": 0.24788166986132384, "learning_rate": 2.5241384054259114e-06, "loss": 0.2995, "num_tokens": 794944870.0, "step": 1932 }, { "epoch": 2.0714831804281344, "grad_norm": 0.24359820048510666, "learning_rate": 2.519960727053927e-06, "loss": 0.2982, "num_tokens": 795359555.0, "step": 1933 }, { "epoch": 2.0718654434250765, "grad_norm": 0.2705659431054324, "learning_rate": 2.515787618308847e-06, "loss": 0.3081, "num_tokens": 795798667.0, "step": 1934 }, { "epoch": 2.072247706422018, "grad_norm": 0.26599523237631695, "learning_rate": 2.5116190855897703e-06, "loss": 0.3, "num_tokens": 796214008.0, "step": 1935 }, { "epoch": 2.0726299694189603, "grad_norm": 0.2981101636985226, "learning_rate": 2.507455135288767e-06, "loss": 0.3183, "num_tokens": 796606912.0, "step": 1936 }, { "epoch": 2.073012232415902, "grad_norm": 0.2521113794732323, "learning_rate": 2.5032957737908946e-06, "loss": 0.2975, "num_tokens": 797026547.0, "step": 1937 }, { "epoch": 2.073394495412844, "grad_norm": 0.2579350055292374, "learning_rate": 2.4991410074741586e-06, "loss": 0.3322, "num_tokens": 797439229.0, "step": 1938 }, { "epoch": 2.073776758409786, "grad_norm": 0.2595412828204378, "learning_rate": 2.494990842709533e-06, "loss": 0.3151, "num_tokens": 797860490.0, "step": 1939 }, { "epoch": 2.074159021406728, "grad_norm": 0.2526795661057503, "learning_rate": 2.4908452858609245e-06, "loss": 0.3105, "num_tokens": 798292920.0, "step": 1940 }, { "epoch": 2.07454128440367, "grad_norm": 0.25075016400492633, "learning_rate": 2.486704343285179e-06, "loss": 0.2994, "num_tokens": 798733759.0, "step": 1941 }, { "epoch": 2.0749235474006116, "grad_norm": 0.2696263996982417, "learning_rate": 2.4825680213320684e-06, "loss": 0.3002, "num_tokens": 799099532.0, "step": 1942 }, { "epoch": 2.0753058103975537, "grad_norm": 0.2390526713381615, "learning_rate": 2.4784363263442716e-06, "loss": 0.3059, "num_tokens": 799495133.0, "step": 1943 }, { "epoch": 2.0756880733944953, "grad_norm": 0.2550420721264511, "learning_rate": 2.474309264657384e-06, "loss": 0.2897, "num_tokens": 799904731.0, "step": 1944 }, { "epoch": 2.0760703363914375, "grad_norm": 0.24725962070993487, "learning_rate": 2.4701868425998844e-06, "loss": 0.3118, "num_tokens": 800316021.0, "step": 1945 }, { "epoch": 2.076452599388379, "grad_norm": 0.23779828671091766, "learning_rate": 2.466069066493148e-06, "loss": 0.2874, "num_tokens": 800710701.0, "step": 1946 }, { "epoch": 2.0768348623853212, "grad_norm": 0.23819682478321458, "learning_rate": 2.4619559426514166e-06, "loss": 0.2776, "num_tokens": 801094024.0, "step": 1947 }, { "epoch": 2.077217125382263, "grad_norm": 0.23609053747481742, "learning_rate": 2.4578474773818037e-06, "loss": 0.3023, "num_tokens": 801489337.0, "step": 1948 }, { "epoch": 2.077599388379205, "grad_norm": 0.26608075426914574, "learning_rate": 2.453743676984278e-06, "loss": 0.2953, "num_tokens": 801868408.0, "step": 1949 }, { "epoch": 2.0779816513761467, "grad_norm": 0.24970868308401248, "learning_rate": 2.4496445477516546e-06, "loss": 0.3031, "num_tokens": 802281975.0, "step": 1950 }, { "epoch": 2.0783639143730888, "grad_norm": 0.2523229839785525, "learning_rate": 2.445550095969587e-06, "loss": 0.2952, "num_tokens": 802672346.0, "step": 1951 }, { "epoch": 2.0787461773700304, "grad_norm": 0.2637241949361023, "learning_rate": 2.4414603279165524e-06, "loss": 0.2875, "num_tokens": 803090187.0, "step": 1952 }, { "epoch": 2.0791284403669725, "grad_norm": 0.2395272340386784, "learning_rate": 2.437375249863852e-06, "loss": 0.3254, "num_tokens": 803531693.0, "step": 1953 }, { "epoch": 2.079510703363914, "grad_norm": 0.23258140931262117, "learning_rate": 2.4332948680755893e-06, "loss": 0.2981, "num_tokens": 803950660.0, "step": 1954 }, { "epoch": 2.0798929663608563, "grad_norm": 0.26122239728892716, "learning_rate": 2.429219188808671e-06, "loss": 0.2871, "num_tokens": 804352402.0, "step": 1955 }, { "epoch": 2.080275229357798, "grad_norm": 0.2688096280805627, "learning_rate": 2.42514821831279e-06, "loss": 0.3044, "num_tokens": 804750137.0, "step": 1956 }, { "epoch": 2.08065749235474, "grad_norm": 0.2509567696378054, "learning_rate": 2.42108196283042e-06, "loss": 0.3245, "num_tokens": 805211442.0, "step": 1957 }, { "epoch": 2.081039755351682, "grad_norm": 0.24655872686391142, "learning_rate": 2.417020428596806e-06, "loss": 0.2929, "num_tokens": 805620477.0, "step": 1958 }, { "epoch": 2.081422018348624, "grad_norm": 0.2640749001798263, "learning_rate": 2.4129636218399497e-06, "loss": 0.2991, "num_tokens": 805998964.0, "step": 1959 }, { "epoch": 2.081804281345566, "grad_norm": 0.2422026631687079, "learning_rate": 2.408911548780609e-06, "loss": 0.3035, "num_tokens": 806427836.0, "step": 1960 }, { "epoch": 2.0821865443425076, "grad_norm": 0.27184362826920394, "learning_rate": 2.4048642156322745e-06, "loss": 0.2957, "num_tokens": 806796679.0, "step": 1961 }, { "epoch": 2.0825688073394497, "grad_norm": 0.26559039268905466, "learning_rate": 2.40082162860118e-06, "loss": 0.328, "num_tokens": 807182451.0, "step": 1962 }, { "epoch": 2.0829510703363914, "grad_norm": 0.23635795940344678, "learning_rate": 2.396783793886272e-06, "loss": 0.3068, "num_tokens": 807611255.0, "step": 1963 }, { "epoch": 2.0833333333333335, "grad_norm": 0.24169401109767216, "learning_rate": 2.392750717679213e-06, "loss": 0.2794, "num_tokens": 807985081.0, "step": 1964 }, { "epoch": 2.083715596330275, "grad_norm": 0.24705460951153824, "learning_rate": 2.388722406164371e-06, "loss": 0.325, "num_tokens": 808423621.0, "step": 1965 }, { "epoch": 2.0840978593272173, "grad_norm": 0.23582888552655434, "learning_rate": 2.3846988655188037e-06, "loss": 0.2736, "num_tokens": 808806117.0, "step": 1966 }, { "epoch": 2.084480122324159, "grad_norm": 0.2437760865097742, "learning_rate": 2.3806801019122562e-06, "loss": 0.2938, "num_tokens": 809228492.0, "step": 1967 }, { "epoch": 2.084862385321101, "grad_norm": 0.24323459057573368, "learning_rate": 2.3766661215071473e-06, "loss": 0.269, "num_tokens": 809630593.0, "step": 1968 }, { "epoch": 2.0852446483180427, "grad_norm": 0.250714448534589, "learning_rate": 2.372656930458562e-06, "loss": 0.316, "num_tokens": 810096654.0, "step": 1969 }, { "epoch": 2.085626911314985, "grad_norm": 0.23347996825839065, "learning_rate": 2.3686525349142415e-06, "loss": 0.2948, "num_tokens": 810502796.0, "step": 1970 }, { "epoch": 2.0860091743119265, "grad_norm": 0.24149363672236024, "learning_rate": 2.3646529410145684e-06, "loss": 0.3086, "num_tokens": 810931125.0, "step": 1971 }, { "epoch": 2.0863914373088686, "grad_norm": 0.251702201892877, "learning_rate": 2.3606581548925696e-06, "loss": 0.3035, "num_tokens": 811338873.0, "step": 1972 }, { "epoch": 2.08677370030581, "grad_norm": 0.2378565025242665, "learning_rate": 2.356668182673896e-06, "loss": 0.2898, "num_tokens": 811719159.0, "step": 1973 }, { "epoch": 2.0871559633027523, "grad_norm": 0.23432044530748078, "learning_rate": 2.3526830304768177e-06, "loss": 0.2961, "num_tokens": 812148086.0, "step": 1974 }, { "epoch": 2.087538226299694, "grad_norm": 0.2239112584765275, "learning_rate": 2.3487027044122134e-06, "loss": 0.2883, "num_tokens": 812557127.0, "step": 1975 }, { "epoch": 2.087920489296636, "grad_norm": 0.25228758430024656, "learning_rate": 2.3447272105835604e-06, "loss": 0.3228, "num_tokens": 812966276.0, "step": 1976 }, { "epoch": 2.088302752293578, "grad_norm": 0.2390794479841412, "learning_rate": 2.340756555086929e-06, "loss": 0.2931, "num_tokens": 813355448.0, "step": 1977 }, { "epoch": 2.08868501529052, "grad_norm": 0.24907755446918084, "learning_rate": 2.336790744010967e-06, "loss": 0.3056, "num_tokens": 813757328.0, "step": 1978 }, { "epoch": 2.089067278287462, "grad_norm": 0.24878801457552382, "learning_rate": 2.332829783436898e-06, "loss": 0.2999, "num_tokens": 814184701.0, "step": 1979 }, { "epoch": 2.0894495412844036, "grad_norm": 0.21905298708359253, "learning_rate": 2.3288736794385e-06, "loss": 0.2834, "num_tokens": 814595983.0, "step": 1980 }, { "epoch": 2.0898318042813457, "grad_norm": 0.22281881033402656, "learning_rate": 2.324922438082114e-06, "loss": 0.2837, "num_tokens": 815009340.0, "step": 1981 }, { "epoch": 2.0902140672782874, "grad_norm": 0.2399234834585078, "learning_rate": 2.320976065426617e-06, "loss": 0.2981, "num_tokens": 815426326.0, "step": 1982 }, { "epoch": 2.0905963302752295, "grad_norm": 0.25795713022908623, "learning_rate": 2.3170345675234225e-06, "loss": 0.3348, "num_tokens": 815904942.0, "step": 1983 }, { "epoch": 2.090978593272171, "grad_norm": 0.23209352874040443, "learning_rate": 2.3130979504164695e-06, "loss": 0.2794, "num_tokens": 816308189.0, "step": 1984 }, { "epoch": 2.0913608562691133, "grad_norm": 0.23854635856560508, "learning_rate": 2.3091662201422136e-06, "loss": 0.3096, "num_tokens": 816736107.0, "step": 1985 }, { "epoch": 2.091743119266055, "grad_norm": 0.22731179585407965, "learning_rate": 2.3052393827296163e-06, "loss": 0.2996, "num_tokens": 817145386.0, "step": 1986 }, { "epoch": 2.092125382262997, "grad_norm": 0.21329199173990762, "learning_rate": 2.3013174442001315e-06, "loss": 0.2807, "num_tokens": 817587614.0, "step": 1987 }, { "epoch": 2.0925076452599387, "grad_norm": 0.22277926937398698, "learning_rate": 2.2974004105677114e-06, "loss": 0.2938, "num_tokens": 818010456.0, "step": 1988 }, { "epoch": 2.092889908256881, "grad_norm": 0.24219572862208105, "learning_rate": 2.2934882878387753e-06, "loss": 0.323, "num_tokens": 818461879.0, "step": 1989 }, { "epoch": 2.0932721712538225, "grad_norm": 0.23487379519997126, "learning_rate": 2.2895810820122225e-06, "loss": 0.2864, "num_tokens": 818843862.0, "step": 1990 }, { "epoch": 2.0936544342507646, "grad_norm": 0.2605759269692881, "learning_rate": 2.2856787990794054e-06, "loss": 0.323, "num_tokens": 819315066.0, "step": 1991 }, { "epoch": 2.0940366972477062, "grad_norm": 0.24434020247001167, "learning_rate": 2.28178144502413e-06, "loss": 0.3077, "num_tokens": 819740170.0, "step": 1992 }, { "epoch": 2.0944189602446484, "grad_norm": 0.2442194190968147, "learning_rate": 2.277889025822645e-06, "loss": 0.283, "num_tokens": 820119329.0, "step": 1993 }, { "epoch": 2.09480122324159, "grad_norm": 0.25180199181834334, "learning_rate": 2.274001547443631e-06, "loss": 0.3244, "num_tokens": 820542988.0, "step": 1994 }, { "epoch": 2.095183486238532, "grad_norm": 0.24554385098163184, "learning_rate": 2.2701190158481935e-06, "loss": 0.3063, "num_tokens": 820961038.0, "step": 1995 }, { "epoch": 2.0955657492354742, "grad_norm": 0.2339533271538118, "learning_rate": 2.2662414369898494e-06, "loss": 0.2949, "num_tokens": 821354962.0, "step": 1996 }, { "epoch": 2.095948012232416, "grad_norm": 0.24496440765255373, "learning_rate": 2.262368816814527e-06, "loss": 0.298, "num_tokens": 821750785.0, "step": 1997 }, { "epoch": 2.096330275229358, "grad_norm": 0.2261868350447137, "learning_rate": 2.2585011612605418e-06, "loss": 0.3062, "num_tokens": 822193772.0, "step": 1998 }, { "epoch": 2.0967125382262997, "grad_norm": 0.26994358754522324, "learning_rate": 2.2546384762586083e-06, "loss": 0.3178, "num_tokens": 822617499.0, "step": 1999 }, { "epoch": 2.0970948012232418, "grad_norm": 0.23462254403417077, "learning_rate": 2.250780767731807e-06, "loss": 0.3044, "num_tokens": 823056577.0, "step": 2000 }, { "epoch": 2.0974770642201834, "grad_norm": 0.23405054098614497, "learning_rate": 2.246928041595596e-06, "loss": 0.3041, "num_tokens": 823466376.0, "step": 2001 }, { "epoch": 2.0978593272171255, "grad_norm": 0.22959013247104987, "learning_rate": 2.2430803037577912e-06, "loss": 0.2983, "num_tokens": 823862267.0, "step": 2002 }, { "epoch": 2.098241590214067, "grad_norm": 0.24849909428549352, "learning_rate": 2.2392375601185545e-06, "loss": 0.2977, "num_tokens": 824273307.0, "step": 2003 }, { "epoch": 2.0986238532110093, "grad_norm": 0.2364683042373293, "learning_rate": 2.2353998165703987e-06, "loss": 0.3048, "num_tokens": 824736380.0, "step": 2004 }, { "epoch": 2.099006116207951, "grad_norm": 0.2255693384447123, "learning_rate": 2.231567078998159e-06, "loss": 0.3063, "num_tokens": 825190429.0, "step": 2005 }, { "epoch": 2.099388379204893, "grad_norm": 0.2213904725353695, "learning_rate": 2.227739353279006e-06, "loss": 0.3011, "num_tokens": 825618381.0, "step": 2006 }, { "epoch": 2.0997706422018347, "grad_norm": 0.23089715070582656, "learning_rate": 2.2239166452824145e-06, "loss": 0.2795, "num_tokens": 826039312.0, "step": 2007 }, { "epoch": 2.100152905198777, "grad_norm": 0.22930414673560054, "learning_rate": 2.2200989608701707e-06, "loss": 0.2892, "num_tokens": 826426462.0, "step": 2008 }, { "epoch": 2.1005351681957185, "grad_norm": 0.2521482277940626, "learning_rate": 2.216286305896356e-06, "loss": 0.2795, "num_tokens": 826836637.0, "step": 2009 }, { "epoch": 2.1009174311926606, "grad_norm": 0.24752548698512802, "learning_rate": 2.2124786862073405e-06, "loss": 0.2868, "num_tokens": 827200286.0, "step": 2010 }, { "epoch": 2.1012996941896023, "grad_norm": 0.2478400220769795, "learning_rate": 2.2086761076417735e-06, "loss": 0.2888, "num_tokens": 827592790.0, "step": 2011 }, { "epoch": 2.1016819571865444, "grad_norm": 0.24880147903011235, "learning_rate": 2.2048785760305695e-06, "loss": 0.2827, "num_tokens": 827986124.0, "step": 2012 }, { "epoch": 2.102064220183486, "grad_norm": 0.25528296902774644, "learning_rate": 2.201086097196913e-06, "loss": 0.3096, "num_tokens": 828391614.0, "step": 2013 }, { "epoch": 2.102446483180428, "grad_norm": 0.2714611599882389, "learning_rate": 2.1972986769562294e-06, "loss": 0.2902, "num_tokens": 828806249.0, "step": 2014 }, { "epoch": 2.1028287461773703, "grad_norm": 0.2555390738130769, "learning_rate": 2.1935163211161986e-06, "loss": 0.3064, "num_tokens": 829234914.0, "step": 2015 }, { "epoch": 2.103211009174312, "grad_norm": 0.2657611151258409, "learning_rate": 2.1897390354767243e-06, "loss": 0.3078, "num_tokens": 829646217.0, "step": 2016 }, { "epoch": 2.103593272171254, "grad_norm": 0.26745704438052575, "learning_rate": 2.1859668258299434e-06, "loss": 0.3199, "num_tokens": 830029967.0, "step": 2017 }, { "epoch": 2.1039755351681957, "grad_norm": 0.23241422555435154, "learning_rate": 2.1821996979602043e-06, "loss": 0.3022, "num_tokens": 830451610.0, "step": 2018 }, { "epoch": 2.104357798165138, "grad_norm": 0.2579197989641137, "learning_rate": 2.1784376576440664e-06, "loss": 0.2975, "num_tokens": 830864854.0, "step": 2019 }, { "epoch": 2.1047400611620795, "grad_norm": 0.2798820665668038, "learning_rate": 2.1746807106502844e-06, "loss": 0.3414, "num_tokens": 831317841.0, "step": 2020 }, { "epoch": 2.1051223241590216, "grad_norm": 0.24866314835968373, "learning_rate": 2.170928862739806e-06, "loss": 0.3199, "num_tokens": 831771435.0, "step": 2021 }, { "epoch": 2.1055045871559632, "grad_norm": 0.26631208259875394, "learning_rate": 2.167182119665759e-06, "loss": 0.3087, "num_tokens": 832172306.0, "step": 2022 }, { "epoch": 2.1058868501529053, "grad_norm": 0.25136749651347573, "learning_rate": 2.1634404871734392e-06, "loss": 0.2941, "num_tokens": 832563381.0, "step": 2023 }, { "epoch": 2.106269113149847, "grad_norm": 0.2583765361800248, "learning_rate": 2.159703971000313e-06, "loss": 0.3297, "num_tokens": 832951801.0, "step": 2024 }, { "epoch": 2.106651376146789, "grad_norm": 0.25263068069704947, "learning_rate": 2.1559725768759966e-06, "loss": 0.3045, "num_tokens": 833380937.0, "step": 2025 }, { "epoch": 2.1070336391437308, "grad_norm": 0.26471568693762637, "learning_rate": 2.1522463105222525e-06, "loss": 0.2856, "num_tokens": 833769244.0, "step": 2026 }, { "epoch": 2.107415902140673, "grad_norm": 0.2501651951026688, "learning_rate": 2.148525177652982e-06, "loss": 0.286, "num_tokens": 834170879.0, "step": 2027 }, { "epoch": 2.1077981651376145, "grad_norm": 0.24737874487181458, "learning_rate": 2.144809183974213e-06, "loss": 0.2987, "num_tokens": 834595663.0, "step": 2028 }, { "epoch": 2.1081804281345566, "grad_norm": 0.24719525513969742, "learning_rate": 2.1410983351840943e-06, "loss": 0.3306, "num_tokens": 835024855.0, "step": 2029 }, { "epoch": 2.1085626911314983, "grad_norm": 0.26438308141806255, "learning_rate": 2.137392636972883e-06, "loss": 0.3128, "num_tokens": 835427531.0, "step": 2030 }, { "epoch": 2.1089449541284404, "grad_norm": 0.2554820156248308, "learning_rate": 2.1336920950229413e-06, "loss": 0.2942, "num_tokens": 835830026.0, "step": 2031 }, { "epoch": 2.109327217125382, "grad_norm": 0.27509783342152216, "learning_rate": 2.129996715008724e-06, "loss": 0.3079, "num_tokens": 836210136.0, "step": 2032 }, { "epoch": 2.109709480122324, "grad_norm": 0.28296166588011973, "learning_rate": 2.1263065025967673e-06, "loss": 0.3321, "num_tokens": 836643927.0, "step": 2033 }, { "epoch": 2.1100917431192663, "grad_norm": 0.25425671111071196, "learning_rate": 2.122621463445687e-06, "loss": 0.3083, "num_tokens": 837087497.0, "step": 2034 }, { "epoch": 2.110474006116208, "grad_norm": 0.23935604428036045, "learning_rate": 2.118941603206166e-06, "loss": 0.2867, "num_tokens": 837484032.0, "step": 2035 }, { "epoch": 2.11085626911315, "grad_norm": 0.251681254033373, "learning_rate": 2.115266927520943e-06, "loss": 0.2918, "num_tokens": 837865584.0, "step": 2036 }, { "epoch": 2.1112385321100917, "grad_norm": 0.255413678319928, "learning_rate": 2.1115974420248105e-06, "loss": 0.3223, "num_tokens": 838294571.0, "step": 2037 }, { "epoch": 2.111620795107034, "grad_norm": 0.26388550275817085, "learning_rate": 2.1079331523445986e-06, "loss": 0.3094, "num_tokens": 838702877.0, "step": 2038 }, { "epoch": 2.1120030581039755, "grad_norm": 0.25089956420069187, "learning_rate": 2.104274064099174e-06, "loss": 0.3072, "num_tokens": 839076594.0, "step": 2039 }, { "epoch": 2.1123853211009176, "grad_norm": 0.25726818546206537, "learning_rate": 2.100620182899421e-06, "loss": 0.3169, "num_tokens": 839466420.0, "step": 2040 }, { "epoch": 2.1127675840978593, "grad_norm": 0.25339202275411654, "learning_rate": 2.096971514348249e-06, "loss": 0.2918, "num_tokens": 839858148.0, "step": 2041 }, { "epoch": 2.1131498470948014, "grad_norm": 0.22848555482716043, "learning_rate": 2.0933280640405645e-06, "loss": 0.2945, "num_tokens": 840290244.0, "step": 2042 }, { "epoch": 2.113532110091743, "grad_norm": 0.23106470839698875, "learning_rate": 2.089689837563278e-06, "loss": 0.3024, "num_tokens": 840716151.0, "step": 2043 }, { "epoch": 2.113914373088685, "grad_norm": 0.2740976401430039, "learning_rate": 2.0860568404952885e-06, "loss": 0.2875, "num_tokens": 841065107.0, "step": 2044 }, { "epoch": 2.114296636085627, "grad_norm": 0.27010890166749135, "learning_rate": 2.082429078407476e-06, "loss": 0.3264, "num_tokens": 841503140.0, "step": 2045 }, { "epoch": 2.114678899082569, "grad_norm": 0.2735882223763793, "learning_rate": 2.0788065568626946e-06, "loss": 0.2883, "num_tokens": 841886466.0, "step": 2046 }, { "epoch": 2.1150611620795106, "grad_norm": 0.24438416388030718, "learning_rate": 2.0751892814157564e-06, "loss": 0.2917, "num_tokens": 842274503.0, "step": 2047 }, { "epoch": 2.1154434250764527, "grad_norm": 0.2375739412144376, "learning_rate": 2.0715772576134397e-06, "loss": 0.3072, "num_tokens": 842666013.0, "step": 2048 }, { "epoch": 2.1158256880733943, "grad_norm": 0.2243074091393917, "learning_rate": 2.0679704909944584e-06, "loss": 0.2856, "num_tokens": 843105009.0, "step": 2049 }, { "epoch": 2.1162079510703364, "grad_norm": 0.25234466827749763, "learning_rate": 2.064368987089475e-06, "loss": 0.2996, "num_tokens": 843516863.0, "step": 2050 }, { "epoch": 2.116590214067278, "grad_norm": 0.24812982032305772, "learning_rate": 2.0607727514210747e-06, "loss": 0.2915, "num_tokens": 843903857.0, "step": 2051 }, { "epoch": 2.11697247706422, "grad_norm": 0.2620049148259815, "learning_rate": 2.0571817895037672e-06, "loss": 0.2967, "num_tokens": 844294285.0, "step": 2052 }, { "epoch": 2.117354740061162, "grad_norm": 0.26645720141380624, "learning_rate": 2.053596106843976e-06, "loss": 0.3135, "num_tokens": 844687832.0, "step": 2053 }, { "epoch": 2.117737003058104, "grad_norm": 0.23271984877675667, "learning_rate": 2.0500157089400288e-06, "loss": 0.297, "num_tokens": 845113615.0, "step": 2054 }, { "epoch": 2.118119266055046, "grad_norm": 0.2550077534661697, "learning_rate": 2.0464406012821507e-06, "loss": 0.3136, "num_tokens": 845548984.0, "step": 2055 }, { "epoch": 2.1185015290519877, "grad_norm": 0.24746712194569306, "learning_rate": 2.0428707893524485e-06, "loss": 0.2959, "num_tokens": 845928404.0, "step": 2056 }, { "epoch": 2.11888379204893, "grad_norm": 0.25403428901855957, "learning_rate": 2.03930627862492e-06, "loss": 0.3017, "num_tokens": 846318283.0, "step": 2057 }, { "epoch": 2.1192660550458715, "grad_norm": 0.24048069695483532, "learning_rate": 2.0357470745654213e-06, "loss": 0.3053, "num_tokens": 846726168.0, "step": 2058 }, { "epoch": 2.1196483180428136, "grad_norm": 0.2309192420389093, "learning_rate": 2.032193182631683e-06, "loss": 0.3039, "num_tokens": 847170171.0, "step": 2059 }, { "epoch": 2.1200305810397553, "grad_norm": 0.25813798578078306, "learning_rate": 2.0286446082732803e-06, "loss": 0.2923, "num_tokens": 847547675.0, "step": 2060 }, { "epoch": 2.1204128440366974, "grad_norm": 0.25284963149916945, "learning_rate": 2.025101356931639e-06, "loss": 0.2974, "num_tokens": 847926897.0, "step": 2061 }, { "epoch": 2.120795107033639, "grad_norm": 0.22831198067575204, "learning_rate": 2.0215634340400235e-06, "loss": 0.2978, "num_tokens": 848325116.0, "step": 2062 }, { "epoch": 2.121177370030581, "grad_norm": 0.23665012171810665, "learning_rate": 2.018030845023525e-06, "loss": 0.336, "num_tokens": 848775290.0, "step": 2063 }, { "epoch": 2.121559633027523, "grad_norm": 0.24903501908121609, "learning_rate": 2.0145035952990572e-06, "loss": 0.3335, "num_tokens": 849237622.0, "step": 2064 }, { "epoch": 2.121941896024465, "grad_norm": 0.23389107693187228, "learning_rate": 2.010981690275343e-06, "loss": 0.29, "num_tokens": 849654334.0, "step": 2065 }, { "epoch": 2.1223241590214066, "grad_norm": 0.24003041927197125, "learning_rate": 2.0074651353529185e-06, "loss": 0.2865, "num_tokens": 850040596.0, "step": 2066 }, { "epoch": 2.1227064220183487, "grad_norm": 0.2973018611005875, "learning_rate": 2.003953935924104e-06, "loss": 0.3187, "num_tokens": 850408961.0, "step": 2067 }, { "epoch": 2.1230886850152904, "grad_norm": 0.26229869108522647, "learning_rate": 2.0004480973730166e-06, "loss": 0.334, "num_tokens": 850814388.0, "step": 2068 }, { "epoch": 2.1234709480122325, "grad_norm": 0.23326672724616832, "learning_rate": 1.9969476250755487e-06, "loss": 0.3073, "num_tokens": 851235532.0, "step": 2069 }, { "epoch": 2.123853211009174, "grad_norm": 0.2391720452457327, "learning_rate": 1.9934525243993665e-06, "loss": 0.2738, "num_tokens": 851605463.0, "step": 2070 }, { "epoch": 2.1242354740061162, "grad_norm": 0.25321180105901725, "learning_rate": 1.989962800703897e-06, "loss": 0.3395, "num_tokens": 852055051.0, "step": 2071 }, { "epoch": 2.124617737003058, "grad_norm": 0.240504103568182, "learning_rate": 1.986478459340323e-06, "loss": 0.316, "num_tokens": 852510944.0, "step": 2072 }, { "epoch": 2.125, "grad_norm": 0.24235255145885043, "learning_rate": 1.9829995056515772e-06, "loss": 0.3263, "num_tokens": 852967492.0, "step": 2073 }, { "epoch": 2.1253822629969417, "grad_norm": 0.23413229550806186, "learning_rate": 1.9795259449723213e-06, "loss": 0.3153, "num_tokens": 853361089.0, "step": 2074 }, { "epoch": 2.1257645259938838, "grad_norm": 0.22275484065719478, "learning_rate": 1.976057782628961e-06, "loss": 0.306, "num_tokens": 853775532.0, "step": 2075 }, { "epoch": 2.126146788990826, "grad_norm": 0.23233734626700345, "learning_rate": 1.9725950239396113e-06, "loss": 0.3066, "num_tokens": 854197223.0, "step": 2076 }, { "epoch": 2.1265290519877675, "grad_norm": 0.2591172338498743, "learning_rate": 1.9691376742141087e-06, "loss": 0.311, "num_tokens": 854586358.0, "step": 2077 }, { "epoch": 2.1269113149847096, "grad_norm": 0.23554163602919484, "learning_rate": 1.9656857387539942e-06, "loss": 0.3069, "num_tokens": 855027030.0, "step": 2078 }, { "epoch": 2.1272935779816513, "grad_norm": 0.25517726667959767, "learning_rate": 1.9622392228525046e-06, "loss": 0.2916, "num_tokens": 855437636.0, "step": 2079 }, { "epoch": 2.1276758409785934, "grad_norm": 0.2548350322680789, "learning_rate": 1.958798131794568e-06, "loss": 0.2889, "num_tokens": 855848132.0, "step": 2080 }, { "epoch": 2.128058103975535, "grad_norm": 0.23371903947443085, "learning_rate": 1.9553624708567937e-06, "loss": 0.2934, "num_tokens": 856249866.0, "step": 2081 }, { "epoch": 2.128440366972477, "grad_norm": 0.234377234637379, "learning_rate": 1.951932245307464e-06, "loss": 0.3091, "num_tokens": 856679397.0, "step": 2082 }, { "epoch": 2.128822629969419, "grad_norm": 0.23379784586195537, "learning_rate": 1.9485074604065276e-06, "loss": 0.3148, "num_tokens": 857093788.0, "step": 2083 }, { "epoch": 2.129204892966361, "grad_norm": 0.22282828980897662, "learning_rate": 1.945088121405588e-06, "loss": 0.2958, "num_tokens": 857514761.0, "step": 2084 }, { "epoch": 2.1295871559633026, "grad_norm": 0.25271048117768247, "learning_rate": 1.941674233547899e-06, "loss": 0.3083, "num_tokens": 857921681.0, "step": 2085 }, { "epoch": 2.1299694189602447, "grad_norm": 0.2695747617883831, "learning_rate": 1.9382658020683572e-06, "loss": 0.3257, "num_tokens": 858349122.0, "step": 2086 }, { "epoch": 2.1303516819571864, "grad_norm": 0.2595980465753796, "learning_rate": 1.934862832193491e-06, "loss": 0.3079, "num_tokens": 858733580.0, "step": 2087 }, { "epoch": 2.1307339449541285, "grad_norm": 0.23111075929869002, "learning_rate": 1.931465329141454e-06, "loss": 0.3226, "num_tokens": 859177849.0, "step": 2088 }, { "epoch": 2.13111620795107, "grad_norm": 0.23130721592097422, "learning_rate": 1.9280732981220165e-06, "loss": 0.2951, "num_tokens": 859604556.0, "step": 2089 }, { "epoch": 2.1314984709480123, "grad_norm": 0.22097454244051973, "learning_rate": 1.924686744336559e-06, "loss": 0.2938, "num_tokens": 860034949.0, "step": 2090 }, { "epoch": 2.131880733944954, "grad_norm": 0.24224798779880968, "learning_rate": 1.921305672978062e-06, "loss": 0.3107, "num_tokens": 860446727.0, "step": 2091 }, { "epoch": 2.132262996941896, "grad_norm": 0.2762927430242535, "learning_rate": 1.9179300892311007e-06, "loss": 0.2878, "num_tokens": 860816308.0, "step": 2092 }, { "epoch": 2.1326452599388377, "grad_norm": 0.26176854757147, "learning_rate": 1.9145599982718317e-06, "loss": 0.2878, "num_tokens": 861175021.0, "step": 2093 }, { "epoch": 2.13302752293578, "grad_norm": 0.2597065954605703, "learning_rate": 1.911195405267996e-06, "loss": 0.3115, "num_tokens": 861571841.0, "step": 2094 }, { "epoch": 2.133409785932722, "grad_norm": 0.2657402495227854, "learning_rate": 1.9078363153788964e-06, "loss": 0.3168, "num_tokens": 861971946.0, "step": 2095 }, { "epoch": 2.1337920489296636, "grad_norm": 0.247429988583406, "learning_rate": 1.9044827337554012e-06, "loss": 0.3057, "num_tokens": 862383936.0, "step": 2096 }, { "epoch": 2.1341743119266057, "grad_norm": 0.22179317488302783, "learning_rate": 1.901134665539931e-06, "loss": 0.2939, "num_tokens": 862832280.0, "step": 2097 }, { "epoch": 2.1345565749235473, "grad_norm": 0.24733400676777484, "learning_rate": 1.8977921158664537e-06, "loss": 0.3071, "num_tokens": 863229927.0, "step": 2098 }, { "epoch": 2.1349388379204894, "grad_norm": 0.2553919717421349, "learning_rate": 1.8944550898604742e-06, "loss": 0.3059, "num_tokens": 863622204.0, "step": 2099 }, { "epoch": 2.135321100917431, "grad_norm": 0.265051457062787, "learning_rate": 1.8911235926390243e-06, "loss": 0.3334, "num_tokens": 864089603.0, "step": 2100 }, { "epoch": 2.135703363914373, "grad_norm": 0.2439916774368985, "learning_rate": 1.8877976293106647e-06, "loss": 0.3069, "num_tokens": 864493179.0, "step": 2101 }, { "epoch": 2.136085626911315, "grad_norm": 0.22513611569333414, "learning_rate": 1.8844772049754614e-06, "loss": 0.3129, "num_tokens": 864941511.0, "step": 2102 }, { "epoch": 2.136467889908257, "grad_norm": 0.23384222393051904, "learning_rate": 1.881162324724997e-06, "loss": 0.2946, "num_tokens": 865325227.0, "step": 2103 }, { "epoch": 2.1368501529051986, "grad_norm": 0.24912012636897485, "learning_rate": 1.877852993642344e-06, "loss": 0.3095, "num_tokens": 865730768.0, "step": 2104 }, { "epoch": 2.1372324159021407, "grad_norm": 0.25216571323223674, "learning_rate": 1.8745492168020695e-06, "loss": 0.313, "num_tokens": 866097542.0, "step": 2105 }, { "epoch": 2.1376146788990824, "grad_norm": 0.2449973160899574, "learning_rate": 1.8712509992702247e-06, "loss": 0.2979, "num_tokens": 866500338.0, "step": 2106 }, { "epoch": 2.1379969418960245, "grad_norm": 0.2287193781804762, "learning_rate": 1.8679583461043333e-06, "loss": 0.3221, "num_tokens": 866919334.0, "step": 2107 }, { "epoch": 2.138379204892966, "grad_norm": 0.24464941355917846, "learning_rate": 1.8646712623533903e-06, "loss": 0.3048, "num_tokens": 867299841.0, "step": 2108 }, { "epoch": 2.1387614678899083, "grad_norm": 0.24585957074572262, "learning_rate": 1.8613897530578437e-06, "loss": 0.2951, "num_tokens": 867668283.0, "step": 2109 }, { "epoch": 2.13914373088685, "grad_norm": 0.299476256903208, "learning_rate": 1.8581138232496038e-06, "loss": 0.2769, "num_tokens": 868051777.0, "step": 2110 }, { "epoch": 2.139525993883792, "grad_norm": 0.23993846809728608, "learning_rate": 1.8548434779520139e-06, "loss": 0.3085, "num_tokens": 868492022.0, "step": 2111 }, { "epoch": 2.1399082568807337, "grad_norm": 0.2548569357821261, "learning_rate": 1.8515787221798641e-06, "loss": 0.3089, "num_tokens": 868871880.0, "step": 2112 }, { "epoch": 2.140290519877676, "grad_norm": 0.25270569893686956, "learning_rate": 1.8483195609393667e-06, "loss": 0.321, "num_tokens": 869272712.0, "step": 2113 }, { "epoch": 2.140672782874618, "grad_norm": 0.24062152292476804, "learning_rate": 1.845065999228158e-06, "loss": 0.3076, "num_tokens": 869711926.0, "step": 2114 }, { "epoch": 2.1410550458715596, "grad_norm": 0.24838880956409964, "learning_rate": 1.8418180420352877e-06, "loss": 0.316, "num_tokens": 870120482.0, "step": 2115 }, { "epoch": 2.1414373088685017, "grad_norm": 0.25354298363481975, "learning_rate": 1.8385756943412086e-06, "loss": 0.2952, "num_tokens": 870511654.0, "step": 2116 }, { "epoch": 2.1418195718654434, "grad_norm": 0.26799249071943454, "learning_rate": 1.8353389611177793e-06, "loss": 0.3106, "num_tokens": 870909116.0, "step": 2117 }, { "epoch": 2.1422018348623855, "grad_norm": 0.270490782295248, "learning_rate": 1.8321078473282386e-06, "loss": 0.2929, "num_tokens": 871281108.0, "step": 2118 }, { "epoch": 2.142584097859327, "grad_norm": 0.33455488248063114, "learning_rate": 1.828882357927219e-06, "loss": 0.3248, "num_tokens": 871730990.0, "step": 2119 }, { "epoch": 2.1429663608562692, "grad_norm": 0.27779858548988456, "learning_rate": 1.8256624978607203e-06, "loss": 0.3354, "num_tokens": 872167410.0, "step": 2120 }, { "epoch": 2.143348623853211, "grad_norm": 0.23346536858247227, "learning_rate": 1.822448272066114e-06, "loss": 0.3239, "num_tokens": 872582280.0, "step": 2121 }, { "epoch": 2.143730886850153, "grad_norm": 0.2276523124022871, "learning_rate": 1.8192396854721323e-06, "loss": 0.3006, "num_tokens": 872993119.0, "step": 2122 }, { "epoch": 2.1441131498470947, "grad_norm": 0.2302734004635181, "learning_rate": 1.8160367429988585e-06, "loss": 0.2876, "num_tokens": 873376311.0, "step": 2123 }, { "epoch": 2.1444954128440368, "grad_norm": 0.24809741208573627, "learning_rate": 1.8128394495577228e-06, "loss": 0.2953, "num_tokens": 873769169.0, "step": 2124 }, { "epoch": 2.1448776758409784, "grad_norm": 0.2711681837130615, "learning_rate": 1.8096478100514897e-06, "loss": 0.3136, "num_tokens": 874206693.0, "step": 2125 }, { "epoch": 2.1452599388379205, "grad_norm": 0.20984038159537788, "learning_rate": 1.8064618293742597e-06, "loss": 0.3125, "num_tokens": 874610599.0, "step": 2126 }, { "epoch": 2.145642201834862, "grad_norm": 0.24482871458709266, "learning_rate": 1.8032815124114488e-06, "loss": 0.312, "num_tokens": 875012682.0, "step": 2127 }, { "epoch": 2.1460244648318043, "grad_norm": 0.23517788175497842, "learning_rate": 1.8001068640397973e-06, "loss": 0.2966, "num_tokens": 875456384.0, "step": 2128 }, { "epoch": 2.146406727828746, "grad_norm": 0.23129295818561682, "learning_rate": 1.7969378891273432e-06, "loss": 0.2971, "num_tokens": 875835934.0, "step": 2129 }, { "epoch": 2.146788990825688, "grad_norm": 0.22280537626236196, "learning_rate": 1.793774592533431e-06, "loss": 0.3115, "num_tokens": 876285424.0, "step": 2130 }, { "epoch": 2.1471712538226297, "grad_norm": 0.22985160745783734, "learning_rate": 1.7906169791086975e-06, "loss": 0.2947, "num_tokens": 876735916.0, "step": 2131 }, { "epoch": 2.147553516819572, "grad_norm": 0.24841327006440816, "learning_rate": 1.7874650536950621e-06, "loss": 0.3208, "num_tokens": 877157697.0, "step": 2132 }, { "epoch": 2.147935779816514, "grad_norm": 0.23966632621868167, "learning_rate": 1.784318821125726e-06, "loss": 0.2975, "num_tokens": 877540955.0, "step": 2133 }, { "epoch": 2.1483180428134556, "grad_norm": 0.24478603037443272, "learning_rate": 1.781178286225157e-06, "loss": 0.3343, "num_tokens": 877980538.0, "step": 2134 }, { "epoch": 2.1487003058103977, "grad_norm": 0.2400158310787067, "learning_rate": 1.7780434538090902e-06, "loss": 0.2824, "num_tokens": 878363720.0, "step": 2135 }, { "epoch": 2.1490825688073394, "grad_norm": 0.23607289093743822, "learning_rate": 1.774914328684511e-06, "loss": 0.3041, "num_tokens": 878777397.0, "step": 2136 }, { "epoch": 2.1494648318042815, "grad_norm": 0.21622115872393688, "learning_rate": 1.7717909156496585e-06, "loss": 0.3074, "num_tokens": 879213588.0, "step": 2137 }, { "epoch": 2.149847094801223, "grad_norm": 0.2417339252321948, "learning_rate": 1.7686732194940098e-06, "loss": 0.3351, "num_tokens": 879593329.0, "step": 2138 }, { "epoch": 2.1502293577981653, "grad_norm": 0.23294002059377833, "learning_rate": 1.7655612449982767e-06, "loss": 0.3183, "num_tokens": 880027327.0, "step": 2139 }, { "epoch": 2.150611620795107, "grad_norm": 0.24035358315138267, "learning_rate": 1.7624549969343962e-06, "loss": 0.3345, "num_tokens": 880434175.0, "step": 2140 }, { "epoch": 2.150993883792049, "grad_norm": 0.25487144822665003, "learning_rate": 1.7593544800655272e-06, "loss": 0.3137, "num_tokens": 880841206.0, "step": 2141 }, { "epoch": 2.1513761467889907, "grad_norm": 0.23784469700114344, "learning_rate": 1.7562596991460368e-06, "loss": 0.3086, "num_tokens": 881225216.0, "step": 2142 }, { "epoch": 2.151758409785933, "grad_norm": 0.22176898973464948, "learning_rate": 1.7531706589214997e-06, "loss": 0.2992, "num_tokens": 881651920.0, "step": 2143 }, { "epoch": 2.1521406727828745, "grad_norm": 0.21977701631691313, "learning_rate": 1.7500873641286826e-06, "loss": 0.3215, "num_tokens": 882104001.0, "step": 2144 }, { "epoch": 2.1525229357798166, "grad_norm": 0.23825845544773375, "learning_rate": 1.7470098194955502e-06, "loss": 0.3306, "num_tokens": 882489970.0, "step": 2145 }, { "epoch": 2.1529051987767582, "grad_norm": 0.24218138523003677, "learning_rate": 1.7439380297412416e-06, "loss": 0.3252, "num_tokens": 882930464.0, "step": 2146 }, { "epoch": 2.1532874617737003, "grad_norm": 0.24005861282488286, "learning_rate": 1.740871999576077e-06, "loss": 0.3362, "num_tokens": 883381016.0, "step": 2147 }, { "epoch": 2.153669724770642, "grad_norm": 0.21511687215090416, "learning_rate": 1.7378117337015421e-06, "loss": 0.2903, "num_tokens": 883801136.0, "step": 2148 }, { "epoch": 2.154051987767584, "grad_norm": 0.23474547948790533, "learning_rate": 1.7347572368102842e-06, "loss": 0.3362, "num_tokens": 884241537.0, "step": 2149 }, { "epoch": 2.1544342507645258, "grad_norm": 0.2734136876791039, "learning_rate": 1.7317085135861042e-06, "loss": 0.3367, "num_tokens": 884654039.0, "step": 2150 }, { "epoch": 2.154816513761468, "grad_norm": 0.2498551895217333, "learning_rate": 1.72866556870395e-06, "loss": 0.3087, "num_tokens": 885036375.0, "step": 2151 }, { "epoch": 2.15519877675841, "grad_norm": 0.2677136733975535, "learning_rate": 1.7256284068299106e-06, "loss": 0.3156, "num_tokens": 885425644.0, "step": 2152 }, { "epoch": 2.1555810397553516, "grad_norm": 0.2392599141536694, "learning_rate": 1.7225970326212003e-06, "loss": 0.3192, "num_tokens": 885826394.0, "step": 2153 }, { "epoch": 2.1559633027522938, "grad_norm": 0.25009117343100296, "learning_rate": 1.719571450726169e-06, "loss": 0.3171, "num_tokens": 886250191.0, "step": 2154 }, { "epoch": 2.1563455657492354, "grad_norm": 0.22542024108272465, "learning_rate": 1.7165516657842768e-06, "loss": 0.3217, "num_tokens": 886662843.0, "step": 2155 }, { "epoch": 2.1567278287461775, "grad_norm": 0.22388441923626276, "learning_rate": 1.7135376824260968e-06, "loss": 0.2972, "num_tokens": 887062063.0, "step": 2156 }, { "epoch": 2.157110091743119, "grad_norm": 0.22413732083869867, "learning_rate": 1.7105295052733061e-06, "loss": 0.3202, "num_tokens": 887507979.0, "step": 2157 }, { "epoch": 2.1574923547400613, "grad_norm": 0.24925762297099385, "learning_rate": 1.70752713893868e-06, "loss": 0.2843, "num_tokens": 887877390.0, "step": 2158 }, { "epoch": 2.157874617737003, "grad_norm": 0.2400224275647967, "learning_rate": 1.7045305880260811e-06, "loss": 0.3023, "num_tokens": 888283113.0, "step": 2159 }, { "epoch": 2.158256880733945, "grad_norm": 0.23544314939670413, "learning_rate": 1.7015398571304543e-06, "loss": 0.3016, "num_tokens": 888693310.0, "step": 2160 }, { "epoch": 2.1586391437308867, "grad_norm": 0.2346447544702893, "learning_rate": 1.698554950837824e-06, "loss": 0.3305, "num_tokens": 889114532.0, "step": 2161 }, { "epoch": 2.159021406727829, "grad_norm": 0.22520061592034207, "learning_rate": 1.695575873725276e-06, "loss": 0.3102, "num_tokens": 889543613.0, "step": 2162 }, { "epoch": 2.1594036697247705, "grad_norm": 0.2374390942659956, "learning_rate": 1.6926026303609666e-06, "loss": 0.3145, "num_tokens": 890000023.0, "step": 2163 }, { "epoch": 2.1597859327217126, "grad_norm": 0.2297304773443682, "learning_rate": 1.6896352253040993e-06, "loss": 0.3191, "num_tokens": 890430127.0, "step": 2164 }, { "epoch": 2.1601681957186543, "grad_norm": 0.26326553296748756, "learning_rate": 1.6866736631049268e-06, "loss": 0.3118, "num_tokens": 890814244.0, "step": 2165 }, { "epoch": 2.1605504587155964, "grad_norm": 0.2655643448713461, "learning_rate": 1.6837179483047444e-06, "loss": 0.311, "num_tokens": 891221931.0, "step": 2166 }, { "epoch": 2.160932721712538, "grad_norm": 0.2743386388651708, "learning_rate": 1.6807680854358794e-06, "loss": 0.331, "num_tokens": 891626411.0, "step": 2167 }, { "epoch": 2.16131498470948, "grad_norm": 0.2240515591428254, "learning_rate": 1.6778240790216862e-06, "loss": 0.2917, "num_tokens": 892033990.0, "step": 2168 }, { "epoch": 2.161697247706422, "grad_norm": 0.22303228559331495, "learning_rate": 1.674885933576536e-06, "loss": 0.3092, "num_tokens": 892439851.0, "step": 2169 }, { "epoch": 2.162079510703364, "grad_norm": 0.22988815312965005, "learning_rate": 1.6719536536058195e-06, "loss": 0.3118, "num_tokens": 892867279.0, "step": 2170 }, { "epoch": 2.162461773700306, "grad_norm": 0.25808832106658236, "learning_rate": 1.6690272436059247e-06, "loss": 0.3207, "num_tokens": 893246395.0, "step": 2171 }, { "epoch": 2.1628440366972477, "grad_norm": 0.2280001549484002, "learning_rate": 1.6661067080642466e-06, "loss": 0.2924, "num_tokens": 893636167.0, "step": 2172 }, { "epoch": 2.16322629969419, "grad_norm": 0.21580364727461712, "learning_rate": 1.6631920514591654e-06, "loss": 0.309, "num_tokens": 894063196.0, "step": 2173 }, { "epoch": 2.1636085626911314, "grad_norm": 0.24763860416960673, "learning_rate": 1.6602832782600509e-06, "loss": 0.3075, "num_tokens": 894481173.0, "step": 2174 }, { "epoch": 2.1639908256880735, "grad_norm": 0.25639163356992056, "learning_rate": 1.6573803929272487e-06, "loss": 0.3293, "num_tokens": 894876005.0, "step": 2175 }, { "epoch": 2.164373088685015, "grad_norm": 0.23846091212504994, "learning_rate": 1.654483399912078e-06, "loss": 0.3158, "num_tokens": 895311769.0, "step": 2176 }, { "epoch": 2.1647553516819573, "grad_norm": 0.23754704691085898, "learning_rate": 1.651592303656822e-06, "loss": 0.3061, "num_tokens": 895707932.0, "step": 2177 }, { "epoch": 2.165137614678899, "grad_norm": 0.23976539419012896, "learning_rate": 1.6487071085947193e-06, "loss": 0.286, "num_tokens": 896114483.0, "step": 2178 }, { "epoch": 2.165519877675841, "grad_norm": 0.21447832330315947, "learning_rate": 1.6458278191499644e-06, "loss": 0.3075, "num_tokens": 896545251.0, "step": 2179 }, { "epoch": 2.1659021406727827, "grad_norm": 0.2299798965965033, "learning_rate": 1.6429544397376911e-06, "loss": 0.3089, "num_tokens": 896963748.0, "step": 2180 }, { "epoch": 2.166284403669725, "grad_norm": 0.23722486935301504, "learning_rate": 1.640086974763973e-06, "loss": 0.3057, "num_tokens": 897340983.0, "step": 2181 }, { "epoch": 2.1666666666666665, "grad_norm": 0.2527094707513443, "learning_rate": 1.6372254286258156e-06, "loss": 0.3354, "num_tokens": 897759767.0, "step": 2182 }, { "epoch": 2.1670489296636086, "grad_norm": 0.26662822816229126, "learning_rate": 1.6343698057111454e-06, "loss": 0.3097, "num_tokens": 898152893.0, "step": 2183 }, { "epoch": 2.1674311926605503, "grad_norm": 0.241867663457883, "learning_rate": 1.6315201103988088e-06, "loss": 0.3146, "num_tokens": 898570730.0, "step": 2184 }, { "epoch": 2.1678134556574924, "grad_norm": 0.26400921040030584, "learning_rate": 1.6286763470585618e-06, "loss": 0.3363, "num_tokens": 899020923.0, "step": 2185 }, { "epoch": 2.168195718654434, "grad_norm": 0.22934436883295536, "learning_rate": 1.6258385200510652e-06, "loss": 0.29, "num_tokens": 899387561.0, "step": 2186 }, { "epoch": 2.168577981651376, "grad_norm": 0.2185352284892116, "learning_rate": 1.6230066337278723e-06, "loss": 0.2851, "num_tokens": 899780061.0, "step": 2187 }, { "epoch": 2.168960244648318, "grad_norm": 0.22267719267504493, "learning_rate": 1.6201806924314356e-06, "loss": 0.3106, "num_tokens": 900214923.0, "step": 2188 }, { "epoch": 2.16934250764526, "grad_norm": 0.2370172982760451, "learning_rate": 1.6173607004950822e-06, "loss": 0.3091, "num_tokens": 900658115.0, "step": 2189 }, { "epoch": 2.169724770642202, "grad_norm": 0.24890328982520848, "learning_rate": 1.6145466622430229e-06, "loss": 0.3345, "num_tokens": 901104304.0, "step": 2190 }, { "epoch": 2.1701070336391437, "grad_norm": 0.23334691754876039, "learning_rate": 1.6117385819903347e-06, "loss": 0.3045, "num_tokens": 901549087.0, "step": 2191 }, { "epoch": 2.170489296636086, "grad_norm": 0.24893520678779027, "learning_rate": 1.608936464042963e-06, "loss": 0.322, "num_tokens": 901958983.0, "step": 2192 }, { "epoch": 2.1708715596330275, "grad_norm": 0.2335239583187043, "learning_rate": 1.6061403126977066e-06, "loss": 0.3185, "num_tokens": 902387911.0, "step": 2193 }, { "epoch": 2.1712538226299696, "grad_norm": 0.22523536140055822, "learning_rate": 1.6033501322422162e-06, "loss": 0.3305, "num_tokens": 902844732.0, "step": 2194 }, { "epoch": 2.1716360856269112, "grad_norm": 0.24817934425633256, "learning_rate": 1.6005659269549882e-06, "loss": 0.3154, "num_tokens": 903226836.0, "step": 2195 }, { "epoch": 2.1720183486238533, "grad_norm": 0.2189113136735565, "learning_rate": 1.5977877011053556e-06, "loss": 0.2978, "num_tokens": 903650041.0, "step": 2196 }, { "epoch": 2.172400611620795, "grad_norm": 0.23439605113917591, "learning_rate": 1.5950154589534806e-06, "loss": 0.3259, "num_tokens": 904043266.0, "step": 2197 }, { "epoch": 2.172782874617737, "grad_norm": 0.2511393392124079, "learning_rate": 1.5922492047503521e-06, "loss": 0.3124, "num_tokens": 904468451.0, "step": 2198 }, { "epoch": 2.1731651376146788, "grad_norm": 0.25283704329209317, "learning_rate": 1.5894889427377768e-06, "loss": 0.3215, "num_tokens": 904865229.0, "step": 2199 }, { "epoch": 2.173547400611621, "grad_norm": 0.268779467281901, "learning_rate": 1.5867346771483732e-06, "loss": 0.3538, "num_tokens": 905272720.0, "step": 2200 }, { "epoch": 2.1739296636085625, "grad_norm": 0.24167880279655754, "learning_rate": 1.583986412205563e-06, "loss": 0.3186, "num_tokens": 905698266.0, "step": 2201 }, { "epoch": 2.1743119266055047, "grad_norm": 0.2592728408756888, "learning_rate": 1.5812441521235686e-06, "loss": 0.2947, "num_tokens": 906041053.0, "step": 2202 }, { "epoch": 2.1746941896024463, "grad_norm": 0.2392668050830515, "learning_rate": 1.578507901107403e-06, "loss": 0.3384, "num_tokens": 906488584.0, "step": 2203 }, { "epoch": 2.1750764525993884, "grad_norm": 0.24248852863353335, "learning_rate": 1.5757776633528654e-06, "loss": 0.3097, "num_tokens": 906905347.0, "step": 2204 }, { "epoch": 2.17545871559633, "grad_norm": 0.2258979972111339, "learning_rate": 1.5730534430465358e-06, "loss": 0.3381, "num_tokens": 907343695.0, "step": 2205 }, { "epoch": 2.175840978593272, "grad_norm": 0.2505783495895386, "learning_rate": 1.5703352443657615e-06, "loss": 0.3423, "num_tokens": 907806661.0, "step": 2206 }, { "epoch": 2.176223241590214, "grad_norm": 0.234528317351922, "learning_rate": 1.5676230714786646e-06, "loss": 0.3204, "num_tokens": 908249078.0, "step": 2207 }, { "epoch": 2.176605504587156, "grad_norm": 0.240582554262787, "learning_rate": 1.564916928544119e-06, "loss": 0.2832, "num_tokens": 908616632.0, "step": 2208 }, { "epoch": 2.176987767584098, "grad_norm": 0.2532827480126124, "learning_rate": 1.562216819711757e-06, "loss": 0.3087, "num_tokens": 909020945.0, "step": 2209 }, { "epoch": 2.1773700305810397, "grad_norm": 0.227986714594487, "learning_rate": 1.5595227491219572e-06, "loss": 0.3314, "num_tokens": 909421917.0, "step": 2210 }, { "epoch": 2.177752293577982, "grad_norm": 0.22561577681881217, "learning_rate": 1.5568347209058387e-06, "loss": 0.3366, "num_tokens": 909861072.0, "step": 2211 }, { "epoch": 2.1781345565749235, "grad_norm": 0.24628669248200902, "learning_rate": 1.5541527391852561e-06, "loss": 0.2797, "num_tokens": 910186108.0, "step": 2212 }, { "epoch": 2.1785168195718656, "grad_norm": 0.22918259509551395, "learning_rate": 1.5514768080727885e-06, "loss": 0.2914, "num_tokens": 910579170.0, "step": 2213 }, { "epoch": 2.1788990825688073, "grad_norm": 0.22790157562790342, "learning_rate": 1.5488069316717437e-06, "loss": 0.3315, "num_tokens": 911004000.0, "step": 2214 }, { "epoch": 2.1792813455657494, "grad_norm": 0.24872085181802045, "learning_rate": 1.5461431140761368e-06, "loss": 0.295, "num_tokens": 911371041.0, "step": 2215 }, { "epoch": 2.179663608562691, "grad_norm": 0.24737913360578787, "learning_rate": 1.5434853593707017e-06, "loss": 0.3361, "num_tokens": 911826234.0, "step": 2216 }, { "epoch": 2.180045871559633, "grad_norm": 0.2287124782176664, "learning_rate": 1.540833671630867e-06, "loss": 0.2819, "num_tokens": 912217194.0, "step": 2217 }, { "epoch": 2.180428134556575, "grad_norm": 0.23422642745576158, "learning_rate": 1.5381880549227622e-06, "loss": 0.2958, "num_tokens": 912637441.0, "step": 2218 }, { "epoch": 2.180810397553517, "grad_norm": 0.24946965050875264, "learning_rate": 1.535548513303207e-06, "loss": 0.326, "num_tokens": 913074206.0, "step": 2219 }, { "epoch": 2.1811926605504586, "grad_norm": 0.22803680271701338, "learning_rate": 1.5329150508197053e-06, "loss": 0.3274, "num_tokens": 913496226.0, "step": 2220 }, { "epoch": 2.1815749235474007, "grad_norm": 0.23459413487022435, "learning_rate": 1.5302876715104397e-06, "loss": 0.3363, "num_tokens": 913930557.0, "step": 2221 }, { "epoch": 2.1819571865443423, "grad_norm": 0.251718183603139, "learning_rate": 1.5276663794042618e-06, "loss": 0.309, "num_tokens": 914285508.0, "step": 2222 }, { "epoch": 2.1823394495412844, "grad_norm": 0.21860858401753172, "learning_rate": 1.525051178520695e-06, "loss": 0.2851, "num_tokens": 914660487.0, "step": 2223 }, { "epoch": 2.182721712538226, "grad_norm": 0.21937232669635792, "learning_rate": 1.5224420728699157e-06, "loss": 0.3105, "num_tokens": 915092199.0, "step": 2224 }, { "epoch": 2.183103975535168, "grad_norm": 0.23911207937321602, "learning_rate": 1.5198390664527595e-06, "loss": 0.2977, "num_tokens": 915499337.0, "step": 2225 }, { "epoch": 2.18348623853211, "grad_norm": 0.23484188018834565, "learning_rate": 1.5172421632607045e-06, "loss": 0.3357, "num_tokens": 915948803.0, "step": 2226 }, { "epoch": 2.183868501529052, "grad_norm": 0.25470772185951285, "learning_rate": 1.5146513672758733e-06, "loss": 0.3146, "num_tokens": 916357800.0, "step": 2227 }, { "epoch": 2.184250764525994, "grad_norm": 0.22091256980267132, "learning_rate": 1.5120666824710233e-06, "loss": 0.3201, "num_tokens": 916818247.0, "step": 2228 }, { "epoch": 2.1846330275229358, "grad_norm": 0.25480031651744306, "learning_rate": 1.509488112809538e-06, "loss": 0.3432, "num_tokens": 917225950.0, "step": 2229 }, { "epoch": 2.185015290519878, "grad_norm": 0.24017436040468665, "learning_rate": 1.5069156622454286e-06, "loss": 0.3081, "num_tokens": 917615638.0, "step": 2230 }, { "epoch": 2.1853975535168195, "grad_norm": 0.2436175156978095, "learning_rate": 1.5043493347233187e-06, "loss": 0.3202, "num_tokens": 918022831.0, "step": 2231 }, { "epoch": 2.1857798165137616, "grad_norm": 0.23740927983326002, "learning_rate": 1.5017891341784471e-06, "loss": 0.3239, "num_tokens": 918409424.0, "step": 2232 }, { "epoch": 2.1861620795107033, "grad_norm": 0.25019661443580676, "learning_rate": 1.4992350645366529e-06, "loss": 0.3234, "num_tokens": 918815634.0, "step": 2233 }, { "epoch": 2.1865443425076454, "grad_norm": 0.23846630836269916, "learning_rate": 1.4966871297143776e-06, "loss": 0.3145, "num_tokens": 919211196.0, "step": 2234 }, { "epoch": 2.186926605504587, "grad_norm": 0.24319562313396528, "learning_rate": 1.4941453336186532e-06, "loss": 0.3089, "num_tokens": 919593384.0, "step": 2235 }, { "epoch": 2.187308868501529, "grad_norm": 0.26326880051970314, "learning_rate": 1.4916096801471002e-06, "loss": 0.3186, "num_tokens": 919958439.0, "step": 2236 }, { "epoch": 2.187691131498471, "grad_norm": 0.21928210405981835, "learning_rate": 1.4890801731879198e-06, "loss": 0.3201, "num_tokens": 920380521.0, "step": 2237 }, { "epoch": 2.188073394495413, "grad_norm": 0.2271918808386219, "learning_rate": 1.4865568166198843e-06, "loss": 0.3039, "num_tokens": 920796242.0, "step": 2238 }, { "epoch": 2.1884556574923546, "grad_norm": 0.22879151647382603, "learning_rate": 1.484039614312342e-06, "loss": 0.3106, "num_tokens": 921203862.0, "step": 2239 }, { "epoch": 2.1888379204892967, "grad_norm": 0.25356331608615573, "learning_rate": 1.481528570125197e-06, "loss": 0.3256, "num_tokens": 921609826.0, "step": 2240 }, { "epoch": 2.1892201834862384, "grad_norm": 0.25670375029848935, "learning_rate": 1.4790236879089161e-06, "loss": 0.3071, "num_tokens": 922025226.0, "step": 2241 }, { "epoch": 2.1896024464831805, "grad_norm": 0.25028564081571036, "learning_rate": 1.4765249715045136e-06, "loss": 0.3266, "num_tokens": 922428898.0, "step": 2242 }, { "epoch": 2.189984709480122, "grad_norm": 0.2457858575478593, "learning_rate": 1.47403242474355e-06, "loss": 0.3381, "num_tokens": 922821850.0, "step": 2243 }, { "epoch": 2.1903669724770642, "grad_norm": 0.24796580458470796, "learning_rate": 1.4715460514481265e-06, "loss": 0.2985, "num_tokens": 923180062.0, "step": 2244 }, { "epoch": 2.190749235474006, "grad_norm": 0.2281638392614198, "learning_rate": 1.4690658554308763e-06, "loss": 0.3124, "num_tokens": 923601876.0, "step": 2245 }, { "epoch": 2.191131498470948, "grad_norm": 0.24819064319689557, "learning_rate": 1.4665918404949609e-06, "loss": 0.3414, "num_tokens": 924053085.0, "step": 2246 }, { "epoch": 2.19151376146789, "grad_norm": 0.221779737860921, "learning_rate": 1.4641240104340635e-06, "loss": 0.2957, "num_tokens": 924448570.0, "step": 2247 }, { "epoch": 2.191896024464832, "grad_norm": 0.2353912156533869, "learning_rate": 1.4616623690323845e-06, "loss": 0.3153, "num_tokens": 924867172.0, "step": 2248 }, { "epoch": 2.192278287461774, "grad_norm": 0.2409162808634134, "learning_rate": 1.4592069200646316e-06, "loss": 0.3207, "num_tokens": 925264252.0, "step": 2249 }, { "epoch": 2.1926605504587156, "grad_norm": 0.2575582061153032, "learning_rate": 1.4567576672960198e-06, "loss": 0.3376, "num_tokens": 925659568.0, "step": 2250 }, { "epoch": 2.1930428134556577, "grad_norm": 0.23666965423824904, "learning_rate": 1.4543146144822623e-06, "loss": 0.321, "num_tokens": 926093541.0, "step": 2251 }, { "epoch": 2.1934250764525993, "grad_norm": 0.22345524625671717, "learning_rate": 1.4518777653695632e-06, "loss": 0.3212, "num_tokens": 926509139.0, "step": 2252 }, { "epoch": 2.1938073394495414, "grad_norm": 0.24472584350893706, "learning_rate": 1.4494471236946172e-06, "loss": 0.3191, "num_tokens": 926919949.0, "step": 2253 }, { "epoch": 2.194189602446483, "grad_norm": 0.2237900397108314, "learning_rate": 1.4470226931845968e-06, "loss": 0.312, "num_tokens": 927347962.0, "step": 2254 }, { "epoch": 2.194571865443425, "grad_norm": 0.24192376693367446, "learning_rate": 1.4446044775571539e-06, "loss": 0.3051, "num_tokens": 927757359.0, "step": 2255 }, { "epoch": 2.194954128440367, "grad_norm": 0.25049196542039615, "learning_rate": 1.4421924805204075e-06, "loss": 0.3114, "num_tokens": 928116516.0, "step": 2256 }, { "epoch": 2.195336391437309, "grad_norm": 0.2740250120932496, "learning_rate": 1.4397867057729401e-06, "loss": 0.3131, "num_tokens": 928484709.0, "step": 2257 }, { "epoch": 2.1957186544342506, "grad_norm": 0.26813796233385834, "learning_rate": 1.4373871570037978e-06, "loss": 0.3388, "num_tokens": 928930938.0, "step": 2258 }, { "epoch": 2.1961009174311927, "grad_norm": 0.2395394764193769, "learning_rate": 1.434993837892475e-06, "loss": 0.3223, "num_tokens": 929331110.0, "step": 2259 }, { "epoch": 2.1964831804281344, "grad_norm": 0.2390888572187172, "learning_rate": 1.4326067521089149e-06, "loss": 0.3314, "num_tokens": 929762317.0, "step": 2260 }, { "epoch": 2.1968654434250765, "grad_norm": 0.23187735590618286, "learning_rate": 1.4302259033135035e-06, "loss": 0.3078, "num_tokens": 930157263.0, "step": 2261 }, { "epoch": 2.197247706422018, "grad_norm": 0.24650218368092827, "learning_rate": 1.4278512951570615e-06, "loss": 0.3355, "num_tokens": 930560738.0, "step": 2262 }, { "epoch": 2.1976299694189603, "grad_norm": 0.24718079585326058, "learning_rate": 1.4254829312808405e-06, "loss": 0.3166, "num_tokens": 930982686.0, "step": 2263 }, { "epoch": 2.198012232415902, "grad_norm": 0.24536463572552442, "learning_rate": 1.4231208153165178e-06, "loss": 0.3372, "num_tokens": 931413350.0, "step": 2264 }, { "epoch": 2.198394495412844, "grad_norm": 0.23812044230751547, "learning_rate": 1.4207649508861893e-06, "loss": 0.3035, "num_tokens": 931781562.0, "step": 2265 }, { "epoch": 2.198776758409786, "grad_norm": 0.23306212887001213, "learning_rate": 1.4184153416023638e-06, "loss": 0.3156, "num_tokens": 932169176.0, "step": 2266 }, { "epoch": 2.199159021406728, "grad_norm": 0.22855772156103168, "learning_rate": 1.4160719910679621e-06, "loss": 0.3354, "num_tokens": 932586511.0, "step": 2267 }, { "epoch": 2.19954128440367, "grad_norm": 0.2554414905313789, "learning_rate": 1.4137349028763031e-06, "loss": 0.3175, "num_tokens": 933001427.0, "step": 2268 }, { "epoch": 2.1999235474006116, "grad_norm": 0.2357468823759153, "learning_rate": 1.4114040806111056e-06, "loss": 0.3245, "num_tokens": 933411499.0, "step": 2269 }, { "epoch": 2.2003058103975537, "grad_norm": 0.2232072191124909, "learning_rate": 1.4090795278464791e-06, "loss": 0.3193, "num_tokens": 933897268.0, "step": 2270 }, { "epoch": 2.2006880733944953, "grad_norm": 0.2421418674687781, "learning_rate": 1.4067612481469209e-06, "loss": 0.3503, "num_tokens": 934310511.0, "step": 2271 }, { "epoch": 2.2010703363914375, "grad_norm": 0.22236266053699993, "learning_rate": 1.404449245067308e-06, "loss": 0.3239, "num_tokens": 934722838.0, "step": 2272 }, { "epoch": 2.201452599388379, "grad_norm": 0.25544114978141097, "learning_rate": 1.4021435221528907e-06, "loss": 0.3139, "num_tokens": 935110837.0, "step": 2273 }, { "epoch": 2.2018348623853212, "grad_norm": 0.22882052149328255, "learning_rate": 1.399844082939295e-06, "loss": 0.3122, "num_tokens": 935527589.0, "step": 2274 }, { "epoch": 2.202217125382263, "grad_norm": 0.23133086508808312, "learning_rate": 1.3975509309525036e-06, "loss": 0.2945, "num_tokens": 935903704.0, "step": 2275 }, { "epoch": 2.202599388379205, "grad_norm": 0.2219750007159546, "learning_rate": 1.3952640697088664e-06, "loss": 0.3246, "num_tokens": 936342729.0, "step": 2276 }, { "epoch": 2.2029816513761467, "grad_norm": 0.23543136760802605, "learning_rate": 1.3929835027150806e-06, "loss": 0.3107, "num_tokens": 936718536.0, "step": 2277 }, { "epoch": 2.2033639143730888, "grad_norm": 0.24465846908175518, "learning_rate": 1.3907092334681954e-06, "loss": 0.3491, "num_tokens": 937121040.0, "step": 2278 }, { "epoch": 2.2037461773700304, "grad_norm": 0.2174748946547011, "learning_rate": 1.3884412654556004e-06, "loss": 0.2991, "num_tokens": 937534524.0, "step": 2279 }, { "epoch": 2.2041284403669725, "grad_norm": 0.22507665843183142, "learning_rate": 1.3861796021550253e-06, "loss": 0.3205, "num_tokens": 937940498.0, "step": 2280 }, { "epoch": 2.204510703363914, "grad_norm": 0.2189591135263967, "learning_rate": 1.383924247034531e-06, "loss": 0.2938, "num_tokens": 938368876.0, "step": 2281 }, { "epoch": 2.2048929663608563, "grad_norm": 0.2379367896549823, "learning_rate": 1.3816752035525035e-06, "loss": 0.319, "num_tokens": 938792032.0, "step": 2282 }, { "epoch": 2.205275229357798, "grad_norm": 0.241070821822433, "learning_rate": 1.3794324751576551e-06, "loss": 0.3328, "num_tokens": 939194145.0, "step": 2283 }, { "epoch": 2.20565749235474, "grad_norm": 0.2217473087425511, "learning_rate": 1.3771960652890086e-06, "loss": 0.3249, "num_tokens": 939629399.0, "step": 2284 }, { "epoch": 2.206039755351682, "grad_norm": 0.2262142179988771, "learning_rate": 1.3749659773759038e-06, "loss": 0.3056, "num_tokens": 940017173.0, "step": 2285 }, { "epoch": 2.206422018348624, "grad_norm": 0.23237353982198988, "learning_rate": 1.3727422148379816e-06, "loss": 0.3101, "num_tokens": 940442507.0, "step": 2286 }, { "epoch": 2.206804281345566, "grad_norm": 0.2348016416938737, "learning_rate": 1.3705247810851857e-06, "loss": 0.3107, "num_tokens": 940843846.0, "step": 2287 }, { "epoch": 2.2071865443425076, "grad_norm": 0.21990185962230782, "learning_rate": 1.3683136795177549e-06, "loss": 0.2973, "num_tokens": 941267142.0, "step": 2288 }, { "epoch": 2.2075688073394497, "grad_norm": 0.22656945532680414, "learning_rate": 1.3661089135262188e-06, "loss": 0.3162, "num_tokens": 941701733.0, "step": 2289 }, { "epoch": 2.2079510703363914, "grad_norm": 0.24345309167467757, "learning_rate": 1.3639104864913908e-06, "loss": 0.3243, "num_tokens": 942090520.0, "step": 2290 }, { "epoch": 2.2083333333333335, "grad_norm": 0.22572103977646577, "learning_rate": 1.3617184017843628e-06, "loss": 0.3319, "num_tokens": 942569073.0, "step": 2291 }, { "epoch": 2.208715596330275, "grad_norm": 0.23471783702654825, "learning_rate": 1.3595326627665061e-06, "loss": 0.33, "num_tokens": 942974528.0, "step": 2292 }, { "epoch": 2.2090978593272173, "grad_norm": 0.25122562058168113, "learning_rate": 1.3573532727894561e-06, "loss": 0.3267, "num_tokens": 943408209.0, "step": 2293 }, { "epoch": 2.209480122324159, "grad_norm": 0.2401020226565427, "learning_rate": 1.3551802351951155e-06, "loss": 0.3388, "num_tokens": 943836121.0, "step": 2294 }, { "epoch": 2.209862385321101, "grad_norm": 0.23346182496953494, "learning_rate": 1.3530135533156449e-06, "loss": 0.3086, "num_tokens": 944234298.0, "step": 2295 }, { "epoch": 2.2102446483180427, "grad_norm": 0.22608818047665813, "learning_rate": 1.3508532304734603e-06, "loss": 0.3215, "num_tokens": 944657903.0, "step": 2296 }, { "epoch": 2.210626911314985, "grad_norm": 0.24261592210672542, "learning_rate": 1.348699269981226e-06, "loss": 0.333, "num_tokens": 945057836.0, "step": 2297 }, { "epoch": 2.2110091743119265, "grad_norm": 0.2261756969915708, "learning_rate": 1.3465516751418489e-06, "loss": 0.3358, "num_tokens": 945499768.0, "step": 2298 }, { "epoch": 2.2113914373088686, "grad_norm": 0.22595008294888091, "learning_rate": 1.3444104492484778e-06, "loss": 0.3091, "num_tokens": 945929291.0, "step": 2299 }, { "epoch": 2.21177370030581, "grad_norm": 0.23402033546061082, "learning_rate": 1.3422755955844904e-06, "loss": 0.3184, "num_tokens": 946324979.0, "step": 2300 }, { "epoch": 2.2121559633027523, "grad_norm": 0.2279214969978577, "learning_rate": 1.3401471174235004e-06, "loss": 0.3153, "num_tokens": 946757470.0, "step": 2301 }, { "epoch": 2.212538226299694, "grad_norm": 0.2222163162909481, "learning_rate": 1.3380250180293368e-06, "loss": 0.3335, "num_tokens": 947193223.0, "step": 2302 }, { "epoch": 2.212920489296636, "grad_norm": 0.23056795436250863, "learning_rate": 1.3359093006560542e-06, "loss": 0.32, "num_tokens": 947586945.0, "step": 2303 }, { "epoch": 2.213302752293578, "grad_norm": 0.25343283225955054, "learning_rate": 1.3337999685479172e-06, "loss": 0.3377, "num_tokens": 948019867.0, "step": 2304 }, { "epoch": 2.21368501529052, "grad_norm": 0.23873458289519447, "learning_rate": 1.3316970249394e-06, "loss": 0.3496, "num_tokens": 948467090.0, "step": 2305 }, { "epoch": 2.214067278287462, "grad_norm": 0.22709695742963218, "learning_rate": 1.3296004730551817e-06, "loss": 0.3585, "num_tokens": 948904308.0, "step": 2306 }, { "epoch": 2.2144495412844036, "grad_norm": 0.22136998928004317, "learning_rate": 1.3275103161101386e-06, "loss": 0.3071, "num_tokens": 949304602.0, "step": 2307 }, { "epoch": 2.2148318042813457, "grad_norm": 0.26575182512318135, "learning_rate": 1.3254265573093417e-06, "loss": 0.3096, "num_tokens": 949661022.0, "step": 2308 }, { "epoch": 2.2152140672782874, "grad_norm": 0.22764252975869667, "learning_rate": 1.3233491998480525e-06, "loss": 0.3285, "num_tokens": 950096474.0, "step": 2309 }, { "epoch": 2.2155963302752295, "grad_norm": 0.21688755798732887, "learning_rate": 1.3212782469117124e-06, "loss": 0.34, "num_tokens": 950545600.0, "step": 2310 }, { "epoch": 2.215978593272171, "grad_norm": 0.22693689396465785, "learning_rate": 1.3192137016759467e-06, "loss": 0.3362, "num_tokens": 951002008.0, "step": 2311 }, { "epoch": 2.2163608562691133, "grad_norm": 0.24767412885410545, "learning_rate": 1.3171555673065528e-06, "loss": 0.3217, "num_tokens": 951397021.0, "step": 2312 }, { "epoch": 2.216743119266055, "grad_norm": 0.24195651311111752, "learning_rate": 1.3151038469594976e-06, "loss": 0.3355, "num_tokens": 951779013.0, "step": 2313 }, { "epoch": 2.217125382262997, "grad_norm": 0.21759711994940784, "learning_rate": 1.313058543780913e-06, "loss": 0.2914, "num_tokens": 952204464.0, "step": 2314 }, { "epoch": 2.2175076452599387, "grad_norm": 0.24907461933173577, "learning_rate": 1.3110196609070905e-06, "loss": 0.3147, "num_tokens": 952607710.0, "step": 2315 }, { "epoch": 2.217889908256881, "grad_norm": 0.22592717975716248, "learning_rate": 1.3089872014644772e-06, "loss": 0.2956, "num_tokens": 953010225.0, "step": 2316 }, { "epoch": 2.2182721712538225, "grad_norm": 0.24232146017803918, "learning_rate": 1.3069611685696698e-06, "loss": 0.3417, "num_tokens": 953467656.0, "step": 2317 }, { "epoch": 2.2186544342507646, "grad_norm": 0.23711273303917801, "learning_rate": 1.3049415653294114e-06, "loss": 0.2952, "num_tokens": 953858252.0, "step": 2318 }, { "epoch": 2.2190366972477062, "grad_norm": 1.1279001716458446, "learning_rate": 1.3029283948405838e-06, "loss": 0.3057, "num_tokens": 954256959.0, "step": 2319 }, { "epoch": 2.2194189602446484, "grad_norm": 0.2627908620020632, "learning_rate": 1.3009216601902081e-06, "loss": 0.3322, "num_tokens": 954657390.0, "step": 2320 }, { "epoch": 2.21980122324159, "grad_norm": 0.25476591817781247, "learning_rate": 1.2989213644554322e-06, "loss": 0.3311, "num_tokens": 955057155.0, "step": 2321 }, { "epoch": 2.220183486238532, "grad_norm": 0.23294470526809133, "learning_rate": 1.2969275107035344e-06, "loss": 0.336, "num_tokens": 955463753.0, "step": 2322 }, { "epoch": 2.2205657492354742, "grad_norm": 0.22848285033664562, "learning_rate": 1.2949401019919122e-06, "loss": 0.3195, "num_tokens": 955847873.0, "step": 2323 }, { "epoch": 2.220948012232416, "grad_norm": 0.2426497760128737, "learning_rate": 1.2929591413680829e-06, "loss": 0.3166, "num_tokens": 956252133.0, "step": 2324 }, { "epoch": 2.221330275229358, "grad_norm": 0.23044559560800626, "learning_rate": 1.2909846318696733e-06, "loss": 0.3183, "num_tokens": 956682056.0, "step": 2325 }, { "epoch": 2.2217125382262997, "grad_norm": 0.24209794611596508, "learning_rate": 1.2890165765244187e-06, "loss": 0.3145, "num_tokens": 957072123.0, "step": 2326 }, { "epoch": 2.2220948012232418, "grad_norm": 0.23379577599053974, "learning_rate": 1.28705497835016e-06, "loss": 0.3178, "num_tokens": 957493114.0, "step": 2327 }, { "epoch": 2.2224770642201834, "grad_norm": 0.25773880165038165, "learning_rate": 1.2850998403548317e-06, "loss": 0.3479, "num_tokens": 957942611.0, "step": 2328 }, { "epoch": 2.2228593272171255, "grad_norm": 0.2552289625550681, "learning_rate": 1.283151165536469e-06, "loss": 0.3118, "num_tokens": 958289097.0, "step": 2329 }, { "epoch": 2.223241590214067, "grad_norm": 0.23588348418555952, "learning_rate": 1.28120895688319e-06, "loss": 0.3201, "num_tokens": 958680101.0, "step": 2330 }, { "epoch": 2.2236238532110093, "grad_norm": 0.24121142920277158, "learning_rate": 1.2792732173732e-06, "loss": 0.3077, "num_tokens": 959065958.0, "step": 2331 }, { "epoch": 2.224006116207951, "grad_norm": 0.23547260481922436, "learning_rate": 1.2773439499747857e-06, "loss": 0.3222, "num_tokens": 959474133.0, "step": 2332 }, { "epoch": 2.224388379204893, "grad_norm": 0.2573138381206103, "learning_rate": 1.2754211576463072e-06, "loss": 0.3163, "num_tokens": 959873850.0, "step": 2333 }, { "epoch": 2.2247706422018347, "grad_norm": 0.2424576405277795, "learning_rate": 1.273504843336198e-06, "loss": 0.3435, "num_tokens": 960300908.0, "step": 2334 }, { "epoch": 2.225152905198777, "grad_norm": 0.2574693554800805, "learning_rate": 1.2715950099829538e-06, "loss": 0.3363, "num_tokens": 960740105.0, "step": 2335 }, { "epoch": 2.2255351681957185, "grad_norm": 0.24031655648209174, "learning_rate": 1.2696916605151393e-06, "loss": 0.331, "num_tokens": 961142988.0, "step": 2336 }, { "epoch": 2.2259174311926606, "grad_norm": 0.23842976302984917, "learning_rate": 1.2677947978513692e-06, "loss": 0.3044, "num_tokens": 961544665.0, "step": 2337 }, { "epoch": 2.2262996941896023, "grad_norm": 0.2666112477759676, "learning_rate": 1.2659044249003177e-06, "loss": 0.3416, "num_tokens": 961977659.0, "step": 2338 }, { "epoch": 2.2266819571865444, "grad_norm": 0.27182505991297184, "learning_rate": 1.2640205445607024e-06, "loss": 0.3312, "num_tokens": 962413899.0, "step": 2339 }, { "epoch": 2.227064220183486, "grad_norm": 0.2405277906069935, "learning_rate": 1.262143159721288e-06, "loss": 0.3361, "num_tokens": 962839378.0, "step": 2340 }, { "epoch": 2.227446483180428, "grad_norm": 0.22899951744301314, "learning_rate": 1.2602722732608797e-06, "loss": 0.3418, "num_tokens": 963258151.0, "step": 2341 }, { "epoch": 2.2278287461773703, "grad_norm": 0.2704219932385218, "learning_rate": 1.2584078880483138e-06, "loss": 0.3504, "num_tokens": 963655868.0, "step": 2342 }, { "epoch": 2.228211009174312, "grad_norm": 0.2501115418240735, "learning_rate": 1.2565500069424627e-06, "loss": 0.3343, "num_tokens": 964062046.0, "step": 2343 }, { "epoch": 2.228593272171254, "grad_norm": 0.2377390548463414, "learning_rate": 1.2546986327922218e-06, "loss": 0.3088, "num_tokens": 964505395.0, "step": 2344 }, { "epoch": 2.2289755351681957, "grad_norm": 0.2514710340773721, "learning_rate": 1.2528537684365103e-06, "loss": 0.3305, "num_tokens": 964919459.0, "step": 2345 }, { "epoch": 2.229357798165138, "grad_norm": 0.2944574806796761, "learning_rate": 1.2510154167042645e-06, "loss": 0.3266, "num_tokens": 965325423.0, "step": 2346 }, { "epoch": 2.2297400611620795, "grad_norm": 0.2255865241598775, "learning_rate": 1.2491835804144337e-06, "loss": 0.3307, "num_tokens": 965728260.0, "step": 2347 }, { "epoch": 2.2301223241590216, "grad_norm": 0.22356574088705075, "learning_rate": 1.2473582623759777e-06, "loss": 0.3171, "num_tokens": 966131727.0, "step": 2348 }, { "epoch": 2.2305045871559632, "grad_norm": 0.250177987823189, "learning_rate": 1.2455394653878605e-06, "loss": 0.3436, "num_tokens": 966594257.0, "step": 2349 }, { "epoch": 2.2308868501529053, "grad_norm": 0.21561272375601487, "learning_rate": 1.243727192239047e-06, "loss": 0.3201, "num_tokens": 967022928.0, "step": 2350 }, { "epoch": 2.231269113149847, "grad_norm": 0.3783810508075721, "learning_rate": 1.2419214457084957e-06, "loss": 0.3569, "num_tokens": 967483627.0, "step": 2351 }, { "epoch": 2.231651376146789, "grad_norm": 0.269147342832729, "learning_rate": 1.240122228565162e-06, "loss": 0.3622, "num_tokens": 967928371.0, "step": 2352 }, { "epoch": 2.2320336391437308, "grad_norm": 0.27442759091246816, "learning_rate": 1.2383295435679845e-06, "loss": 0.3327, "num_tokens": 968354358.0, "step": 2353 }, { "epoch": 2.232415902140673, "grad_norm": 0.23952720590378324, "learning_rate": 1.2365433934658894e-06, "loss": 0.3318, "num_tokens": 968798125.0, "step": 2354 }, { "epoch": 2.2327981651376145, "grad_norm": 0.2218412787475607, "learning_rate": 1.2347637809977778e-06, "loss": 0.3321, "num_tokens": 969236754.0, "step": 2355 }, { "epoch": 2.2331804281345566, "grad_norm": 0.24723712819115837, "learning_rate": 1.2329907088925288e-06, "loss": 0.3162, "num_tokens": 969627410.0, "step": 2356 }, { "epoch": 2.2335626911314983, "grad_norm": 0.23037465157581424, "learning_rate": 1.2312241798689926e-06, "loss": 0.347, "num_tokens": 970024874.0, "step": 2357 }, { "epoch": 2.2339449541284404, "grad_norm": 0.2341244909650061, "learning_rate": 1.2294641966359847e-06, "loss": 0.3174, "num_tokens": 970432922.0, "step": 2358 }, { "epoch": 2.234327217125382, "grad_norm": 0.22461168607791035, "learning_rate": 1.2277107618922845e-06, "loss": 0.3245, "num_tokens": 970845536.0, "step": 2359 }, { "epoch": 2.234709480122324, "grad_norm": 0.24555144689063618, "learning_rate": 1.225963878326628e-06, "loss": 0.3573, "num_tokens": 971266305.0, "step": 2360 }, { "epoch": 2.2350917431192663, "grad_norm": 0.2604432967209313, "learning_rate": 1.2242235486177089e-06, "loss": 0.322, "num_tokens": 971639480.0, "step": 2361 }, { "epoch": 2.235474006116208, "grad_norm": 0.257334344877142, "learning_rate": 1.2224897754341664e-06, "loss": 0.3327, "num_tokens": 972048619.0, "step": 2362 }, { "epoch": 2.23585626911315, "grad_norm": 0.2646046183525611, "learning_rate": 1.2207625614345906e-06, "loss": 0.3138, "num_tokens": 972416765.0, "step": 2363 }, { "epoch": 2.2362385321100917, "grad_norm": 0.24901280762033648, "learning_rate": 1.2190419092675103e-06, "loss": 0.3408, "num_tokens": 972881739.0, "step": 2364 }, { "epoch": 2.236620795107034, "grad_norm": 0.22882767739962545, "learning_rate": 1.217327821571394e-06, "loss": 0.3328, "num_tokens": 973324590.0, "step": 2365 }, { "epoch": 2.2370030581039755, "grad_norm": 0.24258467297047012, "learning_rate": 1.2156203009746435e-06, "loss": 0.3394, "num_tokens": 973713531.0, "step": 2366 }, { "epoch": 2.2373853211009176, "grad_norm": 0.24712962912409794, "learning_rate": 1.2139193500955915e-06, "loss": 0.319, "num_tokens": 974152715.0, "step": 2367 }, { "epoch": 2.2377675840978593, "grad_norm": 0.23290576817608108, "learning_rate": 1.2122249715424946e-06, "loss": 0.3337, "num_tokens": 974560982.0, "step": 2368 }, { "epoch": 2.2381498470948014, "grad_norm": 0.2638009822973547, "learning_rate": 1.2105371679135347e-06, "loss": 0.3515, "num_tokens": 974985943.0, "step": 2369 }, { "epoch": 2.238532110091743, "grad_norm": 0.23306091906775117, "learning_rate": 1.208855941796807e-06, "loss": 0.3415, "num_tokens": 975390608.0, "step": 2370 }, { "epoch": 2.238914373088685, "grad_norm": 0.2294760950209685, "learning_rate": 1.207181295770325e-06, "loss": 0.3132, "num_tokens": 975770600.0, "step": 2371 }, { "epoch": 2.239296636085627, "grad_norm": 0.24913543687894454, "learning_rate": 1.2055132324020097e-06, "loss": 0.3236, "num_tokens": 976169200.0, "step": 2372 }, { "epoch": 2.239678899082569, "grad_norm": 0.24559171160646293, "learning_rate": 1.2038517542496887e-06, "loss": 0.3314, "num_tokens": 976583705.0, "step": 2373 }, { "epoch": 2.2400611620795106, "grad_norm": 0.23009442171228991, "learning_rate": 1.2021968638610923e-06, "loss": 0.3413, "num_tokens": 977002714.0, "step": 2374 }, { "epoch": 2.2404434250764527, "grad_norm": 0.23805876172751517, "learning_rate": 1.2005485637738485e-06, "loss": 0.3511, "num_tokens": 977458100.0, "step": 2375 }, { "epoch": 2.2408256880733943, "grad_norm": 0.3550575328518845, "learning_rate": 1.19890685651548e-06, "loss": 0.3028, "num_tokens": 977861529.0, "step": 2376 }, { "epoch": 2.2412079510703364, "grad_norm": 0.24806454185425694, "learning_rate": 1.1972717446033997e-06, "loss": 0.3293, "num_tokens": 978291280.0, "step": 2377 }, { "epoch": 2.241590214067278, "grad_norm": 0.21584684248480998, "learning_rate": 1.1956432305449083e-06, "loss": 0.3226, "num_tokens": 978741339.0, "step": 2378 }, { "epoch": 2.24197247706422, "grad_norm": 0.23255449153082586, "learning_rate": 1.1940213168371855e-06, "loss": 0.3051, "num_tokens": 979128725.0, "step": 2379 }, { "epoch": 2.2423547400611623, "grad_norm": 0.23109287396888736, "learning_rate": 1.1924060059672956e-06, "loss": 0.3576, "num_tokens": 979590770.0, "step": 2380 }, { "epoch": 2.242737003058104, "grad_norm": 0.24246922535894574, "learning_rate": 1.190797300412174e-06, "loss": 0.3682, "num_tokens": 980019713.0, "step": 2381 }, { "epoch": 2.243119266055046, "grad_norm": 0.20591953580243422, "learning_rate": 1.1891952026386274e-06, "loss": 0.3161, "num_tokens": 980427091.0, "step": 2382 }, { "epoch": 2.2435015290519877, "grad_norm": 0.23444119629223706, "learning_rate": 1.1875997151033323e-06, "loss": 0.324, "num_tokens": 980815242.0, "step": 2383 }, { "epoch": 2.24388379204893, "grad_norm": 0.24435098789599358, "learning_rate": 1.186010840252828e-06, "loss": 0.3364, "num_tokens": 981231859.0, "step": 2384 }, { "epoch": 2.2442660550458715, "grad_norm": 0.2554346704101858, "learning_rate": 1.184428580523514e-06, "loss": 0.3457, "num_tokens": 981638572.0, "step": 2385 }, { "epoch": 2.2446483180428136, "grad_norm": 0.2184947627844497, "learning_rate": 1.182852938341644e-06, "loss": 0.3213, "num_tokens": 982064729.0, "step": 2386 }, { "epoch": 2.2450305810397553, "grad_norm": 0.2360576833754859, "learning_rate": 1.1812839161233283e-06, "loss": 0.3319, "num_tokens": 982458530.0, "step": 2387 }, { "epoch": 2.2454128440366974, "grad_norm": 0.2307922137214105, "learning_rate": 1.1797215162745213e-06, "loss": 0.3253, "num_tokens": 982879093.0, "step": 2388 }, { "epoch": 2.245795107033639, "grad_norm": 0.23752900972508847, "learning_rate": 1.1781657411910283e-06, "loss": 0.3269, "num_tokens": 983289273.0, "step": 2389 }, { "epoch": 2.246177370030581, "grad_norm": 0.24424483910199665, "learning_rate": 1.1766165932584904e-06, "loss": 0.3488, "num_tokens": 983728007.0, "step": 2390 }, { "epoch": 2.246559633027523, "grad_norm": 0.25929355296369944, "learning_rate": 1.1750740748523895e-06, "loss": 0.3568, "num_tokens": 984160764.0, "step": 2391 }, { "epoch": 2.246941896024465, "grad_norm": 0.21736834185746057, "learning_rate": 1.173538188338042e-06, "loss": 0.3279, "num_tokens": 984595170.0, "step": 2392 }, { "epoch": 2.2473241590214066, "grad_norm": 0.23243257558138927, "learning_rate": 1.1720089360705938e-06, "loss": 0.3063, "num_tokens": 984973412.0, "step": 2393 }, { "epoch": 2.2477064220183487, "grad_norm": 0.23678355239935087, "learning_rate": 1.1704863203950187e-06, "loss": 0.3482, "num_tokens": 985403473.0, "step": 2394 }, { "epoch": 2.2480886850152904, "grad_norm": 0.23438387545897227, "learning_rate": 1.1689703436461121e-06, "loss": 0.3246, "num_tokens": 985763286.0, "step": 2395 }, { "epoch": 2.2484709480122325, "grad_norm": 0.22719810461895487, "learning_rate": 1.1674610081484913e-06, "loss": 0.331, "num_tokens": 986162901.0, "step": 2396 }, { "epoch": 2.248853211009174, "grad_norm": 0.2442947173746423, "learning_rate": 1.165958316216588e-06, "loss": 0.3326, "num_tokens": 986572469.0, "step": 2397 }, { "epoch": 2.2492354740061162, "grad_norm": 0.22448501469058643, "learning_rate": 1.1644622701546491e-06, "loss": 0.3433, "num_tokens": 987016079.0, "step": 2398 }, { "epoch": 2.2496177370030583, "grad_norm": 0.22184255826468807, "learning_rate": 1.1629728722567276e-06, "loss": 0.3139, "num_tokens": 987454798.0, "step": 2399 }, { "epoch": 2.25, "grad_norm": 0.28746205382492435, "learning_rate": 1.161490124806684e-06, "loss": 0.3364, "num_tokens": 987876376.0, "step": 2400 }, { "epoch": 2.2503822629969417, "grad_norm": 0.24710237038320823, "learning_rate": 1.160014030078181e-06, "loss": 0.3286, "num_tokens": 988277981.0, "step": 2401 }, { "epoch": 2.2507645259938838, "grad_norm": 0.2267036496230456, "learning_rate": 1.1585445903346784e-06, "loss": 0.3232, "num_tokens": 988665446.0, "step": 2402 }, { "epoch": 2.251146788990826, "grad_norm": 0.22240446421589935, "learning_rate": 1.1570818078294336e-06, "loss": 0.3392, "num_tokens": 989084252.0, "step": 2403 }, { "epoch": 2.2515290519877675, "grad_norm": 0.2457982278835067, "learning_rate": 1.1556256848054923e-06, "loss": 0.3236, "num_tokens": 989455382.0, "step": 2404 }, { "epoch": 2.2519113149847096, "grad_norm": 0.21584865743447265, "learning_rate": 1.1541762234956927e-06, "loss": 0.3263, "num_tokens": 989936412.0, "step": 2405 }, { "epoch": 2.2522935779816513, "grad_norm": 0.2300170346912694, "learning_rate": 1.1527334261226545e-06, "loss": 0.3242, "num_tokens": 990328515.0, "step": 2406 }, { "epoch": 2.2526758409785934, "grad_norm": 0.23039095277682822, "learning_rate": 1.1512972948987801e-06, "loss": 0.3143, "num_tokens": 990729837.0, "step": 2407 }, { "epoch": 2.253058103975535, "grad_norm": 0.22492205125592585, "learning_rate": 1.1498678320262497e-06, "loss": 0.324, "num_tokens": 991152825.0, "step": 2408 }, { "epoch": 2.253440366972477, "grad_norm": 0.2253740070545322, "learning_rate": 1.1484450396970186e-06, "loss": 0.2986, "num_tokens": 991557123.0, "step": 2409 }, { "epoch": 2.253822629969419, "grad_norm": 0.250902049729847, "learning_rate": 1.1470289200928129e-06, "loss": 0.3532, "num_tokens": 991953366.0, "step": 2410 }, { "epoch": 2.254204892966361, "grad_norm": 0.2230592486884478, "learning_rate": 1.1456194753851274e-06, "loss": 0.3518, "num_tokens": 992386224.0, "step": 2411 }, { "epoch": 2.2545871559633026, "grad_norm": 0.2187880536890912, "learning_rate": 1.1442167077352203e-06, "loss": 0.3557, "num_tokens": 992837037.0, "step": 2412 }, { "epoch": 2.2549694189602447, "grad_norm": 0.21920680206903553, "learning_rate": 1.1428206192941121e-06, "loss": 0.3008, "num_tokens": 993246358.0, "step": 2413 }, { "epoch": 2.2553516819571864, "grad_norm": 0.2211077967432299, "learning_rate": 1.1414312122025812e-06, "loss": 0.3289, "num_tokens": 993654027.0, "step": 2414 }, { "epoch": 2.2557339449541285, "grad_norm": 0.23348801568444205, "learning_rate": 1.1400484885911608e-06, "loss": 0.3206, "num_tokens": 994069700.0, "step": 2415 }, { "epoch": 2.25611620795107, "grad_norm": 0.22417454627328656, "learning_rate": 1.1386724505801348e-06, "loss": 0.3357, "num_tokens": 994516047.0, "step": 2416 }, { "epoch": 2.2564984709480123, "grad_norm": 0.25749464587236665, "learning_rate": 1.137303100279536e-06, "loss": 0.3502, "num_tokens": 994924423.0, "step": 2417 }, { "epoch": 2.2568807339449544, "grad_norm": 0.21323452439042662, "learning_rate": 1.1359404397891425e-06, "loss": 0.3124, "num_tokens": 995371367.0, "step": 2418 }, { "epoch": 2.257262996941896, "grad_norm": 0.2626278791586827, "learning_rate": 1.1345844711984736e-06, "loss": 0.3444, "num_tokens": 995744387.0, "step": 2419 }, { "epoch": 2.2576452599388377, "grad_norm": 0.24647406811267328, "learning_rate": 1.133235196586787e-06, "loss": 0.3302, "num_tokens": 996153554.0, "step": 2420 }, { "epoch": 2.25802752293578, "grad_norm": 0.23851547349763427, "learning_rate": 1.1318926180230768e-06, "loss": 0.3128, "num_tokens": 996511448.0, "step": 2421 }, { "epoch": 2.258409785932722, "grad_norm": 0.2275516894358364, "learning_rate": 1.130556737566068e-06, "loss": 0.3336, "num_tokens": 996931898.0, "step": 2422 }, { "epoch": 2.2587920489296636, "grad_norm": 0.2513206093283021, "learning_rate": 1.1292275572642152e-06, "loss": 0.3414, "num_tokens": 997322671.0, "step": 2423 }, { "epoch": 2.2591743119266057, "grad_norm": 0.24373213840456173, "learning_rate": 1.1279050791556998e-06, "loss": 0.3274, "num_tokens": 997694860.0, "step": 2424 }, { "epoch": 2.2595565749235473, "grad_norm": 0.23072164853942825, "learning_rate": 1.1265893052684239e-06, "loss": 0.3349, "num_tokens": 998133875.0, "step": 2425 }, { "epoch": 2.2599388379204894, "grad_norm": 0.21236945439116897, "learning_rate": 1.1252802376200108e-06, "loss": 0.315, "num_tokens": 998541531.0, "step": 2426 }, { "epoch": 2.260321100917431, "grad_norm": 0.22694767316606934, "learning_rate": 1.1239778782178005e-06, "loss": 0.3251, "num_tokens": 998928523.0, "step": 2427 }, { "epoch": 2.260703363914373, "grad_norm": 0.24063553186959405, "learning_rate": 1.1226822290588466e-06, "loss": 0.3417, "num_tokens": 999376294.0, "step": 2428 }, { "epoch": 2.261085626911315, "grad_norm": 0.2220640093426646, "learning_rate": 1.1213932921299111e-06, "loss": 0.3601, "num_tokens": 999846399.0, "step": 2429 }, { "epoch": 2.261467889908257, "grad_norm": 0.2364277592625136, "learning_rate": 1.1201110694074657e-06, "loss": 0.3114, "num_tokens": 1000228168.0, "step": 2430 }, { "epoch": 2.2618501529051986, "grad_norm": 0.24034084515428292, "learning_rate": 1.1188355628576863e-06, "loss": 0.3277, "num_tokens": 1000607465.0, "step": 2431 }, { "epoch": 2.2622324159021407, "grad_norm": 0.22000277276288935, "learning_rate": 1.1175667744364482e-06, "loss": 0.3086, "num_tokens": 1000994851.0, "step": 2432 }, { "epoch": 2.2626146788990824, "grad_norm": 0.24137844973686678, "learning_rate": 1.1163047060893274e-06, "loss": 0.336, "num_tokens": 1001369091.0, "step": 2433 }, { "epoch": 2.2629969418960245, "grad_norm": 0.23021306253268806, "learning_rate": 1.1150493597515936e-06, "loss": 0.3443, "num_tokens": 1001789787.0, "step": 2434 }, { "epoch": 2.263379204892966, "grad_norm": 0.2161987935811964, "learning_rate": 1.1138007373482098e-06, "loss": 0.3124, "num_tokens": 1002195521.0, "step": 2435 }, { "epoch": 2.2637614678899083, "grad_norm": 0.22001589376340233, "learning_rate": 1.1125588407938276e-06, "loss": 0.3411, "num_tokens": 1002618773.0, "step": 2436 }, { "epoch": 2.2641437308868504, "grad_norm": 0.2279997990434341, "learning_rate": 1.1113236719927858e-06, "loss": 0.3416, "num_tokens": 1003043090.0, "step": 2437 }, { "epoch": 2.264525993883792, "grad_norm": 0.2291006277691492, "learning_rate": 1.110095232839108e-06, "loss": 0.3302, "num_tokens": 1003480074.0, "step": 2438 }, { "epoch": 2.2649082568807337, "grad_norm": 0.2545025363657803, "learning_rate": 1.1088735252164943e-06, "loss": 0.3315, "num_tokens": 1003885410.0, "step": 2439 }, { "epoch": 2.265290519877676, "grad_norm": 0.22325642009139007, "learning_rate": 1.1076585509983285e-06, "loss": 0.3401, "num_tokens": 1004316971.0, "step": 2440 }, { "epoch": 2.265672782874618, "grad_norm": 0.2619365379370404, "learning_rate": 1.1064503120476633e-06, "loss": 0.3521, "num_tokens": 1004732315.0, "step": 2441 }, { "epoch": 2.2660550458715596, "grad_norm": 0.22302334345209515, "learning_rate": 1.1052488102172289e-06, "loss": 0.3609, "num_tokens": 1005173098.0, "step": 2442 }, { "epoch": 2.2664373088685017, "grad_norm": 0.24858579207722525, "learning_rate": 1.1040540473494204e-06, "loss": 0.3465, "num_tokens": 1005574375.0, "step": 2443 }, { "epoch": 2.2668195718654434, "grad_norm": 0.22887026474639136, "learning_rate": 1.1028660252763019e-06, "loss": 0.3413, "num_tokens": 1006016948.0, "step": 2444 }, { "epoch": 2.2672018348623855, "grad_norm": 0.23490994931571146, "learning_rate": 1.1016847458196e-06, "loss": 0.3333, "num_tokens": 1006453783.0, "step": 2445 }, { "epoch": 2.267584097859327, "grad_norm": 0.22729442171332015, "learning_rate": 1.100510210790703e-06, "loss": 0.3305, "num_tokens": 1006845590.0, "step": 2446 }, { "epoch": 2.2679663608562692, "grad_norm": 0.23019563326392958, "learning_rate": 1.099342421990656e-06, "loss": 0.3091, "num_tokens": 1007245712.0, "step": 2447 }, { "epoch": 2.268348623853211, "grad_norm": 0.21654243132755285, "learning_rate": 1.0981813812101597e-06, "loss": 0.3156, "num_tokens": 1007667320.0, "step": 2448 }, { "epoch": 2.268730886850153, "grad_norm": 0.24294838728171128, "learning_rate": 1.0970270902295682e-06, "loss": 0.3074, "num_tokens": 1008043766.0, "step": 2449 }, { "epoch": 2.2691131498470947, "grad_norm": 0.23831900831720476, "learning_rate": 1.0958795508188836e-06, "loss": 0.3544, "num_tokens": 1008519965.0, "step": 2450 }, { "epoch": 2.2694954128440368, "grad_norm": 0.22209478460253104, "learning_rate": 1.0947387647377577e-06, "loss": 0.3528, "num_tokens": 1008987545.0, "step": 2451 }, { "epoch": 2.2698776758409784, "grad_norm": 0.2229096082060342, "learning_rate": 1.0936047337354843e-06, "loss": 0.3193, "num_tokens": 1009416294.0, "step": 2452 }, { "epoch": 2.2702599388379205, "grad_norm": 0.2454184509830498, "learning_rate": 1.0924774595509998e-06, "loss": 0.3435, "num_tokens": 1009841014.0, "step": 2453 }, { "epoch": 2.270642201834862, "grad_norm": 0.21828349370561898, "learning_rate": 1.0913569439128803e-06, "loss": 0.3336, "num_tokens": 1010271123.0, "step": 2454 }, { "epoch": 2.2710244648318043, "grad_norm": 0.25708198825260103, "learning_rate": 1.0902431885393359e-06, "loss": 0.3425, "num_tokens": 1010702434.0, "step": 2455 }, { "epoch": 2.2714067278287464, "grad_norm": 0.2654127500719497, "learning_rate": 1.0891361951382135e-06, "loss": 0.3465, "num_tokens": 1011159442.0, "step": 2456 }, { "epoch": 2.271788990825688, "grad_norm": 0.23541331898086507, "learning_rate": 1.0880359654069887e-06, "loss": 0.3259, "num_tokens": 1011562457.0, "step": 2457 }, { "epoch": 2.2721712538226297, "grad_norm": 0.22997285270085244, "learning_rate": 1.0869425010327675e-06, "loss": 0.3213, "num_tokens": 1011990373.0, "step": 2458 }, { "epoch": 2.272553516819572, "grad_norm": 0.23854453314317425, "learning_rate": 1.0858558036922804e-06, "loss": 0.3303, "num_tokens": 1012386087.0, "step": 2459 }, { "epoch": 2.272935779816514, "grad_norm": 0.22850580374894988, "learning_rate": 1.0847758750518818e-06, "loss": 0.3141, "num_tokens": 1012760822.0, "step": 2460 }, { "epoch": 2.2733180428134556, "grad_norm": 0.22177009226263714, "learning_rate": 1.0837027167675467e-06, "loss": 0.3088, "num_tokens": 1013164595.0, "step": 2461 }, { "epoch": 2.2737003058103977, "grad_norm": 0.2471851556767312, "learning_rate": 1.0826363304848686e-06, "loss": 0.3221, "num_tokens": 1013521940.0, "step": 2462 }, { "epoch": 2.2740825688073394, "grad_norm": 0.20770442392940838, "learning_rate": 1.081576717839057e-06, "loss": 0.2981, "num_tokens": 1013927578.0, "step": 2463 }, { "epoch": 2.2744648318042815, "grad_norm": 0.24233006861887452, "learning_rate": 1.0805238804549334e-06, "loss": 0.3355, "num_tokens": 1014382863.0, "step": 2464 }, { "epoch": 2.274847094801223, "grad_norm": 0.21310212218947608, "learning_rate": 1.0794778199469321e-06, "loss": 0.3017, "num_tokens": 1014813702.0, "step": 2465 }, { "epoch": 2.2752293577981653, "grad_norm": 0.22936044493682983, "learning_rate": 1.0784385379190935e-06, "loss": 0.3179, "num_tokens": 1015192303.0, "step": 2466 }, { "epoch": 2.275611620795107, "grad_norm": 0.23366685622894057, "learning_rate": 1.077406035965065e-06, "loss": 0.3205, "num_tokens": 1015655158.0, "step": 2467 }, { "epoch": 2.275993883792049, "grad_norm": 0.24503070845159775, "learning_rate": 1.076380315668097e-06, "loss": 0.3151, "num_tokens": 1016044195.0, "step": 2468 }, { "epoch": 2.2763761467889907, "grad_norm": 0.22621904277630284, "learning_rate": 1.0753613786010414e-06, "loss": 0.3269, "num_tokens": 1016463210.0, "step": 2469 }, { "epoch": 2.276758409785933, "grad_norm": 0.24239948071102563, "learning_rate": 1.0743492263263481e-06, "loss": 0.3217, "num_tokens": 1016844339.0, "step": 2470 }, { "epoch": 2.2771406727828745, "grad_norm": 0.24631978782728897, "learning_rate": 1.0733438603960623e-06, "loss": 0.3353, "num_tokens": 1017261213.0, "step": 2471 }, { "epoch": 2.2775229357798166, "grad_norm": 0.24916868586881602, "learning_rate": 1.072345282351825e-06, "loss": 0.3109, "num_tokens": 1017615630.0, "step": 2472 }, { "epoch": 2.2779051987767582, "grad_norm": 0.24592236476542634, "learning_rate": 1.0713534937248669e-06, "loss": 0.3242, "num_tokens": 1018002242.0, "step": 2473 }, { "epoch": 2.2782874617737003, "grad_norm": 0.24434138236353822, "learning_rate": 1.0703684960360082e-06, "loss": 0.3352, "num_tokens": 1018422364.0, "step": 2474 }, { "epoch": 2.2786697247706424, "grad_norm": 0.24275741522292824, "learning_rate": 1.0693902907956555e-06, "loss": 0.3236, "num_tokens": 1018861185.0, "step": 2475 }, { "epoch": 2.279051987767584, "grad_norm": 0.23454360157374696, "learning_rate": 1.0684188795038004e-06, "loss": 0.3139, "num_tokens": 1019252082.0, "step": 2476 }, { "epoch": 2.2794342507645258, "grad_norm": 0.24633318774513466, "learning_rate": 1.0674542636500164e-06, "loss": 0.3327, "num_tokens": 1019667580.0, "step": 2477 }, { "epoch": 2.279816513761468, "grad_norm": 0.2215715592344735, "learning_rate": 1.066496444713457e-06, "loss": 0.3339, "num_tokens": 1020131513.0, "step": 2478 }, { "epoch": 2.28019877675841, "grad_norm": 0.2369951451708204, "learning_rate": 1.0655454241628516e-06, "loss": 0.3422, "num_tokens": 1020574200.0, "step": 2479 }, { "epoch": 2.2805810397553516, "grad_norm": 0.20555307622572766, "learning_rate": 1.0646012034565075e-06, "loss": 0.3338, "num_tokens": 1020999110.0, "step": 2480 }, { "epoch": 2.2809633027522938, "grad_norm": 0.2386519993262111, "learning_rate": 1.0636637840423036e-06, "loss": 0.3361, "num_tokens": 1021385574.0, "step": 2481 }, { "epoch": 2.2813455657492354, "grad_norm": 0.2325576182407741, "learning_rate": 1.0627331673576897e-06, "loss": 0.3301, "num_tokens": 1021785276.0, "step": 2482 }, { "epoch": 2.2817278287461775, "grad_norm": 0.22632855578489075, "learning_rate": 1.0618093548296832e-06, "loss": 0.3282, "num_tokens": 1022188038.0, "step": 2483 }, { "epoch": 2.282110091743119, "grad_norm": 0.22207231420518883, "learning_rate": 1.0608923478748704e-06, "loss": 0.3348, "num_tokens": 1022628191.0, "step": 2484 }, { "epoch": 2.2824923547400613, "grad_norm": 0.21461671627297577, "learning_rate": 1.0599821478993992e-06, "loss": 0.3631, "num_tokens": 1023122609.0, "step": 2485 }, { "epoch": 2.282874617737003, "grad_norm": 0.2217188580276791, "learning_rate": 1.0590787562989818e-06, "loss": 0.3384, "num_tokens": 1023568654.0, "step": 2486 }, { "epoch": 2.283256880733945, "grad_norm": 0.2233008731424751, "learning_rate": 1.058182174458889e-06, "loss": 0.3168, "num_tokens": 1023938994.0, "step": 2487 }, { "epoch": 2.2836391437308867, "grad_norm": 0.237373223058795, "learning_rate": 1.0572924037539496e-06, "loss": 0.3256, "num_tokens": 1024353695.0, "step": 2488 }, { "epoch": 2.284021406727829, "grad_norm": 0.25126775459473333, "learning_rate": 1.0564094455485487e-06, "loss": 0.3318, "num_tokens": 1024787294.0, "step": 2489 }, { "epoch": 2.2844036697247705, "grad_norm": 0.216384087436434, "learning_rate": 1.0555333011966248e-06, "loss": 0.3341, "num_tokens": 1025234003.0, "step": 2490 }, { "epoch": 2.2847859327217126, "grad_norm": 0.24847245347434235, "learning_rate": 1.054663972041668e-06, "loss": 0.3277, "num_tokens": 1025639493.0, "step": 2491 }, { "epoch": 2.2851681957186543, "grad_norm": 0.24744861784953204, "learning_rate": 1.0538014594167164e-06, "loss": 0.3491, "num_tokens": 1026080420.0, "step": 2492 }, { "epoch": 2.2855504587155964, "grad_norm": 0.38082254245006325, "learning_rate": 1.0529457646443592e-06, "loss": 0.3432, "num_tokens": 1026490095.0, "step": 2493 }, { "epoch": 2.2859327217125385, "grad_norm": 0.23890051881340058, "learning_rate": 1.0520968890367262e-06, "loss": 0.3225, "num_tokens": 1026890862.0, "step": 2494 }, { "epoch": 2.28631498470948, "grad_norm": 0.24995117410668877, "learning_rate": 1.051254833895495e-06, "loss": 0.367, "num_tokens": 1027349636.0, "step": 2495 }, { "epoch": 2.286697247706422, "grad_norm": 0.261554029088189, "learning_rate": 1.0504196005118822e-06, "loss": 0.345, "num_tokens": 1027802382.0, "step": 2496 }, { "epoch": 2.287079510703364, "grad_norm": 0.22765795161334693, "learning_rate": 1.049591190166644e-06, "loss": 0.3202, "num_tokens": 1028181081.0, "step": 2497 }, { "epoch": 2.287461773700306, "grad_norm": 0.23888710057451729, "learning_rate": 1.0487696041300751e-06, "loss": 0.361, "num_tokens": 1028616135.0, "step": 2498 }, { "epoch": 2.2878440366972477, "grad_norm": 0.24111396839299082, "learning_rate": 1.047954843662004e-06, "loss": 0.3417, "num_tokens": 1029013193.0, "step": 2499 }, { "epoch": 2.28822629969419, "grad_norm": 0.25126342407327235, "learning_rate": 1.0471469100117956e-06, "loss": 0.3497, "num_tokens": 1029433476.0, "step": 2500 }, { "epoch": 2.2886085626911314, "grad_norm": 0.24075279450239317, "learning_rate": 1.046345804418343e-06, "loss": 0.3476, "num_tokens": 1029829603.0, "step": 2501 }, { "epoch": 2.2889908256880735, "grad_norm": 0.23261187753686255, "learning_rate": 1.0455515281100723e-06, "loss": 0.3352, "num_tokens": 1030252571.0, "step": 2502 }, { "epoch": 2.289373088685015, "grad_norm": 0.2431561018775167, "learning_rate": 1.0447640823049351e-06, "loss": 0.3286, "num_tokens": 1030637896.0, "step": 2503 }, { "epoch": 2.2897553516819573, "grad_norm": 0.24355417236149104, "learning_rate": 1.0439834682104104e-06, "loss": 0.3415, "num_tokens": 1031059861.0, "step": 2504 }, { "epoch": 2.290137614678899, "grad_norm": 0.2238937945937746, "learning_rate": 1.0432096870235008e-06, "loss": 0.3662, "num_tokens": 1031525107.0, "step": 2505 }, { "epoch": 2.290519877675841, "grad_norm": 0.23742281997755257, "learning_rate": 1.0424427399307311e-06, "loss": 0.3303, "num_tokens": 1031886263.0, "step": 2506 }, { "epoch": 2.2909021406727827, "grad_norm": 0.22219249894177853, "learning_rate": 1.0416826281081475e-06, "loss": 0.3428, "num_tokens": 1032314464.0, "step": 2507 }, { "epoch": 2.291284403669725, "grad_norm": 0.23859566227237639, "learning_rate": 1.0409293527213138e-06, "loss": 0.3469, "num_tokens": 1032757202.0, "step": 2508 }, { "epoch": 2.2916666666666665, "grad_norm": 0.22197096845323003, "learning_rate": 1.0401829149253118e-06, "loss": 0.3226, "num_tokens": 1033169482.0, "step": 2509 }, { "epoch": 2.2920489296636086, "grad_norm": 0.22648885510158728, "learning_rate": 1.0394433158647366e-06, "loss": 0.3375, "num_tokens": 1033571935.0, "step": 2510 }, { "epoch": 2.2924311926605503, "grad_norm": 0.23897672390103608, "learning_rate": 1.0387105566736996e-06, "loss": 0.349, "num_tokens": 1034000569.0, "step": 2511 }, { "epoch": 2.2928134556574924, "grad_norm": 0.22080956936944648, "learning_rate": 1.0379846384758216e-06, "loss": 0.3298, "num_tokens": 1034413429.0, "step": 2512 }, { "epoch": 2.293195718654434, "grad_norm": 0.230914093737077, "learning_rate": 1.037265562384234e-06, "loss": 0.3657, "num_tokens": 1034850045.0, "step": 2513 }, { "epoch": 2.293577981651376, "grad_norm": 0.23314237997544482, "learning_rate": 1.0365533295015762e-06, "loss": 0.3403, "num_tokens": 1035261106.0, "step": 2514 }, { "epoch": 2.293960244648318, "grad_norm": 0.23580618281791355, "learning_rate": 1.0358479409199952e-06, "loss": 0.3422, "num_tokens": 1035660217.0, "step": 2515 }, { "epoch": 2.29434250764526, "grad_norm": 0.22224157880256534, "learning_rate": 1.0351493977211414e-06, "loss": 0.3261, "num_tokens": 1036028648.0, "step": 2516 }, { "epoch": 2.294724770642202, "grad_norm": 0.22964886278721158, "learning_rate": 1.0344577009761687e-06, "loss": 0.3421, "num_tokens": 1036443419.0, "step": 2517 }, { "epoch": 2.2951070336391437, "grad_norm": 0.23296474146998342, "learning_rate": 1.033772851745734e-06, "loss": 0.3224, "num_tokens": 1036830384.0, "step": 2518 }, { "epoch": 2.295489296636086, "grad_norm": 0.23051644311855313, "learning_rate": 1.0330948510799923e-06, "loss": 0.3409, "num_tokens": 1037237237.0, "step": 2519 }, { "epoch": 2.2958715596330275, "grad_norm": 0.22462641573422928, "learning_rate": 1.0324237000185984e-06, "loss": 0.3582, "num_tokens": 1037683155.0, "step": 2520 }, { "epoch": 2.2962538226299696, "grad_norm": 0.2328542438143196, "learning_rate": 1.0317593995907015e-06, "loss": 0.3229, "num_tokens": 1038056926.0, "step": 2521 }, { "epoch": 2.2966360856269112, "grad_norm": 0.2409888331971136, "learning_rate": 1.0311019508149495e-06, "loss": 0.3364, "num_tokens": 1038465137.0, "step": 2522 }, { "epoch": 2.2970183486238533, "grad_norm": 0.2510625211342022, "learning_rate": 1.0304513546994814e-06, "loss": 0.3642, "num_tokens": 1038893952.0, "step": 2523 }, { "epoch": 2.297400611620795, "grad_norm": 0.24607363352809503, "learning_rate": 1.0298076122419289e-06, "loss": 0.3403, "num_tokens": 1039301409.0, "step": 2524 }, { "epoch": 2.297782874617737, "grad_norm": 0.25978387243465945, "learning_rate": 1.0291707244294139e-06, "loss": 0.3237, "num_tokens": 1039680765.0, "step": 2525 }, { "epoch": 2.2981651376146788, "grad_norm": 0.24181156677840934, "learning_rate": 1.0285406922385473e-06, "loss": 0.3359, "num_tokens": 1040084249.0, "step": 2526 }, { "epoch": 2.298547400611621, "grad_norm": 0.24329747091372597, "learning_rate": 1.0279175166354286e-06, "loss": 0.3224, "num_tokens": 1040472359.0, "step": 2527 }, { "epoch": 2.2989296636085625, "grad_norm": 0.24984109042275252, "learning_rate": 1.0273011985756423e-06, "loss": 0.3527, "num_tokens": 1040897632.0, "step": 2528 }, { "epoch": 2.2993119266055047, "grad_norm": 0.23180393663264445, "learning_rate": 1.0266917390042572e-06, "loss": 0.3202, "num_tokens": 1041292888.0, "step": 2529 }, { "epoch": 2.2996941896024463, "grad_norm": 0.22831126182005135, "learning_rate": 1.0260891388558262e-06, "loss": 0.3351, "num_tokens": 1041712334.0, "step": 2530 }, { "epoch": 2.3000764525993884, "grad_norm": 0.2298160715014218, "learning_rate": 1.0254933990543832e-06, "loss": 0.3306, "num_tokens": 1042117428.0, "step": 2531 }, { "epoch": 2.30045871559633, "grad_norm": 0.21852322629232612, "learning_rate": 1.0249045205134426e-06, "loss": 0.337, "num_tokens": 1042528009.0, "step": 2532 }, { "epoch": 2.300840978593272, "grad_norm": 0.29880010296834103, "learning_rate": 1.0243225041359973e-06, "loss": 0.346, "num_tokens": 1042931173.0, "step": 2533 }, { "epoch": 2.301223241590214, "grad_norm": 0.2524327301267761, "learning_rate": 1.0237473508145184e-06, "loss": 0.3396, "num_tokens": 1043319130.0, "step": 2534 }, { "epoch": 2.301605504587156, "grad_norm": 0.22399662240110776, "learning_rate": 1.0231790614309528e-06, "loss": 0.3375, "num_tokens": 1043744223.0, "step": 2535 }, { "epoch": 2.301987767584098, "grad_norm": 0.21996258757076784, "learning_rate": 1.0226176368567209e-06, "loss": 0.3273, "num_tokens": 1044175435.0, "step": 2536 }, { "epoch": 2.3023700305810397, "grad_norm": 0.22040390109681718, "learning_rate": 1.0220630779527195e-06, "loss": 0.3157, "num_tokens": 1044589167.0, "step": 2537 }, { "epoch": 2.302752293577982, "grad_norm": 0.2376891651149006, "learning_rate": 1.021515385569314e-06, "loss": 0.3214, "num_tokens": 1044991372.0, "step": 2538 }, { "epoch": 2.3031345565749235, "grad_norm": 0.22474595409268258, "learning_rate": 1.0209745605463435e-06, "loss": 0.3449, "num_tokens": 1045432528.0, "step": 2539 }, { "epoch": 2.3035168195718656, "grad_norm": 0.2262652063302646, "learning_rate": 1.0204406037131151e-06, "loss": 0.3389, "num_tokens": 1045830732.0, "step": 2540 }, { "epoch": 2.3038990825688073, "grad_norm": 0.22462505511446673, "learning_rate": 1.0199135158884041e-06, "loss": 0.3079, "num_tokens": 1046241407.0, "step": 2541 }, { "epoch": 2.3042813455657494, "grad_norm": 0.22350405477389867, "learning_rate": 1.019393297880453e-06, "loss": 0.3278, "num_tokens": 1046635747.0, "step": 2542 }, { "epoch": 2.304663608562691, "grad_norm": 0.2442778544216497, "learning_rate": 1.0188799504869703e-06, "loss": 0.3315, "num_tokens": 1047026214.0, "step": 2543 }, { "epoch": 2.305045871559633, "grad_norm": 0.22896654353288845, "learning_rate": 1.0183734744951297e-06, "loss": 0.3466, "num_tokens": 1047449309.0, "step": 2544 }, { "epoch": 2.305428134556575, "grad_norm": 0.244732032673048, "learning_rate": 1.0178738706815656e-06, "loss": 0.3532, "num_tokens": 1047893710.0, "step": 2545 }, { "epoch": 2.305810397553517, "grad_norm": 0.2474406684090097, "learning_rate": 1.0173811398123782e-06, "loss": 0.3445, "num_tokens": 1048312289.0, "step": 2546 }, { "epoch": 2.3061926605504586, "grad_norm": 0.24448324306281707, "learning_rate": 1.0168952826431252e-06, "loss": 0.3348, "num_tokens": 1048770990.0, "step": 2547 }, { "epoch": 2.3065749235474007, "grad_norm": 0.23737176881433894, "learning_rate": 1.0164162999188269e-06, "loss": 0.3522, "num_tokens": 1049201273.0, "step": 2548 }, { "epoch": 2.3069571865443423, "grad_norm": 0.23311706203338056, "learning_rate": 1.0159441923739603e-06, "loss": 0.2976, "num_tokens": 1049572453.0, "step": 2549 }, { "epoch": 2.3073394495412844, "grad_norm": 0.21908035996337083, "learning_rate": 1.0154789607324605e-06, "loss": 0.339, "num_tokens": 1050007857.0, "step": 2550 }, { "epoch": 2.307721712538226, "grad_norm": 0.22278710961037473, "learning_rate": 1.0150206057077197e-06, "loss": 0.3264, "num_tokens": 1050401674.0, "step": 2551 }, { "epoch": 2.308103975535168, "grad_norm": 0.22054604231911978, "learning_rate": 1.014569128002584e-06, "loss": 0.315, "num_tokens": 1050759338.0, "step": 2552 }, { "epoch": 2.30848623853211, "grad_norm": 0.22081096541220202, "learning_rate": 1.0141245283093553e-06, "loss": 0.3416, "num_tokens": 1051181924.0, "step": 2553 }, { "epoch": 2.308868501529052, "grad_norm": 0.2510745144149683, "learning_rate": 1.0136868073097877e-06, "loss": 0.3434, "num_tokens": 1051586897.0, "step": 2554 }, { "epoch": 2.309250764525994, "grad_norm": 0.21096862292524723, "learning_rate": 1.0132559656750875e-06, "loss": 0.3361, "num_tokens": 1052041086.0, "step": 2555 }, { "epoch": 2.3096330275229358, "grad_norm": 0.23509415857047336, "learning_rate": 1.0128320040659124e-06, "loss": 0.3259, "num_tokens": 1052424988.0, "step": 2556 }, { "epoch": 2.310015290519878, "grad_norm": 0.2212050536162352, "learning_rate": 1.0124149231323704e-06, "loss": 0.3202, "num_tokens": 1052828435.0, "step": 2557 }, { "epoch": 2.3103975535168195, "grad_norm": 0.23468803635063076, "learning_rate": 1.0120047235140178e-06, "loss": 0.3235, "num_tokens": 1053212688.0, "step": 2558 }, { "epoch": 2.3107798165137616, "grad_norm": 0.23189443243628696, "learning_rate": 1.011601405839859e-06, "loss": 0.346, "num_tokens": 1053607501.0, "step": 2559 }, { "epoch": 2.3111620795107033, "grad_norm": 0.21508531306556702, "learning_rate": 1.0112049707283475e-06, "loss": 0.3291, "num_tokens": 1054016876.0, "step": 2560 }, { "epoch": 2.3115443425076454, "grad_norm": 0.23227188638372645, "learning_rate": 1.0108154187873804e-06, "loss": 0.3429, "num_tokens": 1054426560.0, "step": 2561 }, { "epoch": 2.311926605504587, "grad_norm": 0.23612394657024277, "learning_rate": 1.0104327506143014e-06, "loss": 0.3401, "num_tokens": 1054821614.0, "step": 2562 }, { "epoch": 2.312308868501529, "grad_norm": 0.22920504353452664, "learning_rate": 1.0100569667958982e-06, "loss": 0.3008, "num_tokens": 1055185063.0, "step": 2563 }, { "epoch": 2.312691131498471, "grad_norm": 0.22824922786612356, "learning_rate": 1.0096880679084025e-06, "loss": 0.3367, "num_tokens": 1055601955.0, "step": 2564 }, { "epoch": 2.313073394495413, "grad_norm": 0.2526336087338982, "learning_rate": 1.009326054517488e-06, "loss": 0.3382, "num_tokens": 1056034753.0, "step": 2565 }, { "epoch": 2.3134556574923546, "grad_norm": 0.22288612631581622, "learning_rate": 1.0089709271782696e-06, "loss": 0.334, "num_tokens": 1056465052.0, "step": 2566 }, { "epoch": 2.3138379204892967, "grad_norm": 0.2294912223328932, "learning_rate": 1.008622686435305e-06, "loss": 0.3463, "num_tokens": 1056862205.0, "step": 2567 }, { "epoch": 2.3142201834862384, "grad_norm": 0.24227930904450967, "learning_rate": 1.0082813328225893e-06, "loss": 0.3584, "num_tokens": 1057254898.0, "step": 2568 }, { "epoch": 2.3146024464831805, "grad_norm": 0.23695038104521526, "learning_rate": 1.0079468668635584e-06, "loss": 0.3652, "num_tokens": 1057661684.0, "step": 2569 }, { "epoch": 2.314984709480122, "grad_norm": 0.2709236842675759, "learning_rate": 1.007619289071087e-06, "loss": 0.3518, "num_tokens": 1058080642.0, "step": 2570 }, { "epoch": 2.3153669724770642, "grad_norm": 0.22561965993818303, "learning_rate": 1.007298599947486e-06, "loss": 0.3259, "num_tokens": 1058508776.0, "step": 2571 }, { "epoch": 2.315749235474006, "grad_norm": 0.2509449265816071, "learning_rate": 1.0069847999845041e-06, "loss": 0.3713, "num_tokens": 1058914029.0, "step": 2572 }, { "epoch": 2.316131498470948, "grad_norm": 0.26544896502927867, "learning_rate": 1.006677889663326e-06, "loss": 0.323, "num_tokens": 1059324833.0, "step": 2573 }, { "epoch": 2.31651376146789, "grad_norm": 0.24484947257304102, "learning_rate": 1.0063778694545715e-06, "loss": 0.3327, "num_tokens": 1059740828.0, "step": 2574 }, { "epoch": 2.316896024464832, "grad_norm": 0.2482732124964759, "learning_rate": 1.0060847398182944e-06, "loss": 0.3242, "num_tokens": 1060119510.0, "step": 2575 }, { "epoch": 2.317278287461774, "grad_norm": 0.22607380741711375, "learning_rate": 1.0057985012039843e-06, "loss": 0.3324, "num_tokens": 1060531012.0, "step": 2576 }, { "epoch": 2.3176605504587156, "grad_norm": 0.23654994191862802, "learning_rate": 1.0055191540505613e-06, "loss": 0.3118, "num_tokens": 1060904405.0, "step": 2577 }, { "epoch": 2.3180428134556577, "grad_norm": 0.21409976253788449, "learning_rate": 1.005246698786381e-06, "loss": 0.3622, "num_tokens": 1061371155.0, "step": 2578 }, { "epoch": 2.3184250764525993, "grad_norm": 0.2309483797904588, "learning_rate": 1.004981135829229e-06, "loss": 0.345, "num_tokens": 1061750695.0, "step": 2579 }, { "epoch": 2.3188073394495414, "grad_norm": 0.2209584863444376, "learning_rate": 1.004722465586322e-06, "loss": 0.35, "num_tokens": 1062175520.0, "step": 2580 }, { "epoch": 2.319189602446483, "grad_norm": 0.23909379587847404, "learning_rate": 1.004470688454309e-06, "loss": 0.3392, "num_tokens": 1062594570.0, "step": 2581 }, { "epoch": 2.319571865443425, "grad_norm": 0.22906900138976263, "learning_rate": 1.004225804819267e-06, "loss": 0.3235, "num_tokens": 1062997343.0, "step": 2582 }, { "epoch": 2.319954128440367, "grad_norm": 0.20983776796263418, "learning_rate": 1.003987815056704e-06, "loss": 0.3265, "num_tokens": 1063426924.0, "step": 2583 }, { "epoch": 2.320336391437309, "grad_norm": 0.24436396259085938, "learning_rate": 1.0037567195315564e-06, "loss": 0.3423, "num_tokens": 1063826773.0, "step": 2584 }, { "epoch": 2.3207186544342506, "grad_norm": 0.23346624228085094, "learning_rate": 1.0035325185981882e-06, "loss": 0.34, "num_tokens": 1064247442.0, "step": 2585 }, { "epoch": 2.3211009174311927, "grad_norm": 0.2148594761325381, "learning_rate": 1.0033152126003927e-06, "loss": 0.3498, "num_tokens": 1064660669.0, "step": 2586 }, { "epoch": 2.3214831804281344, "grad_norm": 0.23159282734808287, "learning_rate": 1.0031048018713885e-06, "loss": 0.3354, "num_tokens": 1065064433.0, "step": 2587 }, { "epoch": 2.3218654434250765, "grad_norm": 0.22804428060431975, "learning_rate": 1.0029012867338217e-06, "loss": 0.3528, "num_tokens": 1065494137.0, "step": 2588 }, { "epoch": 2.322247706422018, "grad_norm": 0.22346420028074468, "learning_rate": 1.0027046674997656e-06, "loss": 0.3305, "num_tokens": 1065890966.0, "step": 2589 }, { "epoch": 2.3226299694189603, "grad_norm": 0.24306989857321962, "learning_rate": 1.002514944470718e-06, "loss": 0.3341, "num_tokens": 1066316783.0, "step": 2590 }, { "epoch": 2.323012232415902, "grad_norm": 0.22381326387345193, "learning_rate": 1.0023321179376029e-06, "loss": 0.3483, "num_tokens": 1066743816.0, "step": 2591 }, { "epoch": 2.323394495412844, "grad_norm": 0.2420401955009923, "learning_rate": 1.0021561881807676e-06, "loss": 0.3263, "num_tokens": 1067117888.0, "step": 2592 }, { "epoch": 2.323776758409786, "grad_norm": 0.23509563537049455, "learning_rate": 1.0019871554699855e-06, "loss": 0.3393, "num_tokens": 1067519371.0, "step": 2593 }, { "epoch": 2.324159021406728, "grad_norm": 0.23558611403571264, "learning_rate": 1.001825020064453e-06, "loss": 0.3238, "num_tokens": 1067950301.0, "step": 2594 }, { "epoch": 2.32454128440367, "grad_norm": 0.21200595149700782, "learning_rate": 1.0016697822127914e-06, "loss": 0.3397, "num_tokens": 1068386513.0, "step": 2595 }, { "epoch": 2.3249235474006116, "grad_norm": 0.2243911394866069, "learning_rate": 1.001521442153043e-06, "loss": 0.3473, "num_tokens": 1068804684.0, "step": 2596 }, { "epoch": 2.3253058103975537, "grad_norm": 0.24157224102672215, "learning_rate": 1.001380000112675e-06, "loss": 0.3275, "num_tokens": 1069166337.0, "step": 2597 }, { "epoch": 2.3256880733944953, "grad_norm": 0.22273570512325785, "learning_rate": 1.0012454563085758e-06, "loss": 0.3393, "num_tokens": 1069592508.0, "step": 2598 }, { "epoch": 2.3260703363914375, "grad_norm": 0.2234569969951848, "learning_rate": 1.0011178109470566e-06, "loss": 0.327, "num_tokens": 1070009583.0, "step": 2599 }, { "epoch": 2.326452599388379, "grad_norm": 0.21476705658140316, "learning_rate": 1.00099706422385e-06, "loss": 0.3356, "num_tokens": 1070445970.0, "step": 2600 }, { "epoch": 2.3268348623853212, "grad_norm": 0.23114710206420533, "learning_rate": 1.000883216324111e-06, "loss": 0.3208, "num_tokens": 1070874120.0, "step": 2601 }, { "epoch": 2.327217125382263, "grad_norm": 0.22901004329693428, "learning_rate": 1.0007762674224153e-06, "loss": 0.3358, "num_tokens": 1071312881.0, "step": 2602 }, { "epoch": 2.327599388379205, "grad_norm": 0.25549508600903154, "learning_rate": 1.0006762176827586e-06, "loss": 0.3363, "num_tokens": 1071693968.0, "step": 2603 }, { "epoch": 2.3279816513761467, "grad_norm": 0.23640053974141, "learning_rate": 1.0005830672585594e-06, "loss": 0.3412, "num_tokens": 1072085740.0, "step": 2604 }, { "epoch": 2.3283639143730888, "grad_norm": 0.2323327589925819, "learning_rate": 1.000496816292655e-06, "loss": 0.3189, "num_tokens": 1072474892.0, "step": 2605 }, { "epoch": 2.3287461773700304, "grad_norm": 0.2466367038767514, "learning_rate": 1.000417464917304e-06, "loss": 0.3368, "num_tokens": 1072888114.0, "step": 2606 }, { "epoch": 2.3291284403669725, "grad_norm": 0.22164749243531737, "learning_rate": 1.0003450132541852e-06, "loss": 0.3302, "num_tokens": 1073317816.0, "step": 2607 }, { "epoch": 2.329510703363914, "grad_norm": 0.2351053279203124, "learning_rate": 1.0002794614143953e-06, "loss": 0.3388, "num_tokens": 1073693086.0, "step": 2608 }, { "epoch": 2.3298929663608563, "grad_norm": 0.26298485086037265, "learning_rate": 1.0002208094984539e-06, "loss": 0.3607, "num_tokens": 1074126288.0, "step": 2609 }, { "epoch": 2.330275229357798, "grad_norm": 0.2490011840136058, "learning_rate": 1.0001690575962977e-06, "loss": 0.3431, "num_tokens": 1074520551.0, "step": 2610 }, { "epoch": 2.33065749235474, "grad_norm": 0.22911822715821784, "learning_rate": 1.0001242057872839e-06, "loss": 0.3392, "num_tokens": 1074922093.0, "step": 2611 }, { "epoch": 2.331039755351682, "grad_norm": 0.22918798351783345, "learning_rate": 1.0000862541401885e-06, "loss": 0.329, "num_tokens": 1075331902.0, "step": 2612 }, { "epoch": 2.331422018348624, "grad_norm": 0.22857812030033325, "learning_rate": 1.0000552027132067e-06, "loss": 0.3287, "num_tokens": 1075746059.0, "step": 2613 }, { "epoch": 2.331804281345566, "grad_norm": 0.23829378961735942, "learning_rate": 1.0000310515539543e-06, "loss": 0.3738, "num_tokens": 1076161114.0, "step": 2614 }, { "epoch": 2.3321865443425076, "grad_norm": 0.314099427435107, "learning_rate": 1.000013800699464e-06, "loss": 0.3384, "num_tokens": 1076600445.0, "step": 2615 }, { "epoch": 2.3325688073394497, "grad_norm": 0.25738694977595783, "learning_rate": 1.000003450176189e-06, "loss": 0.3525, "num_tokens": 1077027632.0, "step": 2616 } ], "logging_steps": 1, "max_steps": 2616, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5212247128866816.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }