diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20962 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.3325688073394497, + "eval_steps": 500, + "global_step": 2616, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00038226299694189603, + "grad_norm": 0.21810047996490894, + "learning_rate": 0.0, + "loss": 0.3999, + "num_tokens": 451800.0, + "step": 1 + }, + { + "epoch": 0.0007645259938837921, + "grad_norm": 0.22618320492130675, + "learning_rate": 1.2658227848101266e-07, + "loss": 0.3816, + "num_tokens": 859683.0, + "step": 2 + }, + { + "epoch": 0.0011467889908256881, + "grad_norm": 0.21791046559310479, + "learning_rate": 2.5316455696202533e-07, + "loss": 0.3841, + "num_tokens": 1288518.0, + "step": 3 + }, + { + "epoch": 0.0015290519877675841, + "grad_norm": 0.2393639885413405, + "learning_rate": 3.79746835443038e-07, + "loss": 0.3845, + "num_tokens": 1705818.0, + "step": 4 + }, + { + "epoch": 0.00191131498470948, + "grad_norm": 0.2324716366978111, + "learning_rate": 5.063291139240507e-07, + "loss": 0.3978, + "num_tokens": 2197596.0, + "step": 5 + }, + { + "epoch": 0.0022935779816513763, + "grad_norm": 0.25843851753079405, + "learning_rate": 6.329113924050634e-07, + "loss": 0.3975, + "num_tokens": 2622476.0, + "step": 6 + }, + { + "epoch": 0.002675840978593272, + "grad_norm": 0.2315188490221056, + "learning_rate": 7.59493670886076e-07, + "loss": 0.3779, + "num_tokens": 3001993.0, + "step": 7 + }, + { + "epoch": 0.0030581039755351682, + "grad_norm": 0.23681575988460776, + "learning_rate": 8.860759493670887e-07, + "loss": 0.4222, + "num_tokens": 3415434.0, + "step": 8 + }, + { + "epoch": 0.0034403669724770644, + "grad_norm": 0.23905098958071425, + "learning_rate": 1.0126582278481013e-06, + "loss": 0.3978, + "num_tokens": 3821257.0, + "step": 9 + }, + { + "epoch": 0.00382262996941896, + "grad_norm": 0.26184159159239856, + "learning_rate": 1.139240506329114e-06, + "loss": 0.3847, + "num_tokens": 4253873.0, + "step": 10 + }, + { + "epoch": 0.004204892966360856, + "grad_norm": 0.26006057447778413, + "learning_rate": 1.2658227848101267e-06, + "loss": 0.4175, + "num_tokens": 4670787.0, + "step": 11 + }, + { + "epoch": 0.0045871559633027525, + "grad_norm": 0.29033047360969433, + "learning_rate": 1.3924050632911392e-06, + "loss": 0.4179, + "num_tokens": 5050931.0, + "step": 12 + }, + { + "epoch": 0.004969418960244648, + "grad_norm": 0.22946819381085923, + "learning_rate": 1.518987341772152e-06, + "loss": 0.396, + "num_tokens": 5449145.0, + "step": 13 + }, + { + "epoch": 0.005351681957186544, + "grad_norm": 0.23651528126169233, + "learning_rate": 1.6455696202531647e-06, + "loss": 0.4167, + "num_tokens": 5870528.0, + "step": 14 + }, + { + "epoch": 0.005733944954128441, + "grad_norm": 0.2214382864278198, + "learning_rate": 1.7721518987341774e-06, + "loss": 0.3823, + "num_tokens": 6271849.0, + "step": 15 + }, + { + "epoch": 0.0061162079510703364, + "grad_norm": 0.23587934762400245, + "learning_rate": 1.8987341772151901e-06, + "loss": 0.3882, + "num_tokens": 6654431.0, + "step": 16 + }, + { + "epoch": 0.006498470948012232, + "grad_norm": 0.25429501849461983, + "learning_rate": 2.0253164556962026e-06, + "loss": 0.4154, + "num_tokens": 7085591.0, + "step": 17 + }, + { + "epoch": 0.006880733944954129, + "grad_norm": 0.24366557685855064, + "learning_rate": 2.1518987341772153e-06, + "loss": 0.3597, + "num_tokens": 7488022.0, + "step": 18 + }, + { + "epoch": 0.007262996941896025, + "grad_norm": 0.23975901272004702, + "learning_rate": 2.278481012658228e-06, + "loss": 0.3993, + "num_tokens": 7873601.0, + "step": 19 + }, + { + "epoch": 0.00764525993883792, + "grad_norm": 0.23535810193170292, + "learning_rate": 2.4050632911392408e-06, + "loss": 0.4204, + "num_tokens": 8325413.0, + "step": 20 + }, + { + "epoch": 0.008027522935779817, + "grad_norm": 0.2452036677236248, + "learning_rate": 2.5316455696202535e-06, + "loss": 0.386, + "num_tokens": 8738589.0, + "step": 21 + }, + { + "epoch": 0.008409785932721712, + "grad_norm": 0.24165922913315604, + "learning_rate": 2.6582278481012658e-06, + "loss": 0.3995, + "num_tokens": 9161420.0, + "step": 22 + }, + { + "epoch": 0.008792048929663608, + "grad_norm": 0.26628800861046653, + "learning_rate": 2.7848101265822785e-06, + "loss": 0.3829, + "num_tokens": 9569642.0, + "step": 23 + }, + { + "epoch": 0.009174311926605505, + "grad_norm": 0.23133751904315147, + "learning_rate": 2.9113924050632912e-06, + "loss": 0.3918, + "num_tokens": 9994839.0, + "step": 24 + }, + { + "epoch": 0.0095565749235474, + "grad_norm": 0.2227586971283885, + "learning_rate": 3.037974683544304e-06, + "loss": 0.3997, + "num_tokens": 10468238.0, + "step": 25 + }, + { + "epoch": 0.009938837920489297, + "grad_norm": 0.24385418192481537, + "learning_rate": 3.164556962025317e-06, + "loss": 0.3972, + "num_tokens": 10858797.0, + "step": 26 + }, + { + "epoch": 0.010321100917431193, + "grad_norm": 0.25515084915475794, + "learning_rate": 3.2911392405063294e-06, + "loss": 0.3655, + "num_tokens": 11251677.0, + "step": 27 + }, + { + "epoch": 0.010703363914373088, + "grad_norm": 0.24269046503305744, + "learning_rate": 3.417721518987342e-06, + "loss": 0.3705, + "num_tokens": 11638850.0, + "step": 28 + }, + { + "epoch": 0.011085626911314985, + "grad_norm": 0.2309672930448967, + "learning_rate": 3.544303797468355e-06, + "loss": 0.3784, + "num_tokens": 12040096.0, + "step": 29 + }, + { + "epoch": 0.011467889908256881, + "grad_norm": 0.2435658585064058, + "learning_rate": 3.6708860759493675e-06, + "loss": 0.3669, + "num_tokens": 12389404.0, + "step": 30 + }, + { + "epoch": 0.011850152905198776, + "grad_norm": 0.25743490316274564, + "learning_rate": 3.7974683544303802e-06, + "loss": 0.377, + "num_tokens": 12761781.0, + "step": 31 + }, + { + "epoch": 0.012232415902140673, + "grad_norm": 0.25201610288625453, + "learning_rate": 3.924050632911393e-06, + "loss": 0.3702, + "num_tokens": 13154838.0, + "step": 32 + }, + { + "epoch": 0.01261467889908257, + "grad_norm": 0.2469634285116853, + "learning_rate": 4.050632911392405e-06, + "loss": 0.4076, + "num_tokens": 13576761.0, + "step": 33 + }, + { + "epoch": 0.012996941896024464, + "grad_norm": 0.2496549628504463, + "learning_rate": 4.177215189873418e-06, + "loss": 0.3892, + "num_tokens": 13974489.0, + "step": 34 + }, + { + "epoch": 0.013379204892966361, + "grad_norm": 0.25846070232991863, + "learning_rate": 4.303797468354431e-06, + "loss": 0.3559, + "num_tokens": 14374049.0, + "step": 35 + }, + { + "epoch": 0.013761467889908258, + "grad_norm": 0.2663423942828327, + "learning_rate": 4.430379746835443e-06, + "loss": 0.3688, + "num_tokens": 14776911.0, + "step": 36 + }, + { + "epoch": 0.014143730886850153, + "grad_norm": 0.2505114169928893, + "learning_rate": 4.556962025316456e-06, + "loss": 0.3806, + "num_tokens": 15178891.0, + "step": 37 + }, + { + "epoch": 0.01452599388379205, + "grad_norm": 0.28586301253591584, + "learning_rate": 4.683544303797468e-06, + "loss": 0.38, + "num_tokens": 15608420.0, + "step": 38 + }, + { + "epoch": 0.014908256880733946, + "grad_norm": 0.2637903406700679, + "learning_rate": 4.8101265822784815e-06, + "loss": 0.3727, + "num_tokens": 15994086.0, + "step": 39 + }, + { + "epoch": 0.01529051987767584, + "grad_norm": 0.24464885901758707, + "learning_rate": 4.936708860759495e-06, + "loss": 0.3564, + "num_tokens": 16429454.0, + "step": 40 + }, + { + "epoch": 0.015672782874617736, + "grad_norm": 0.2469314868927337, + "learning_rate": 5.063291139240507e-06, + "loss": 0.3522, + "num_tokens": 16809145.0, + "step": 41 + }, + { + "epoch": 0.016055045871559634, + "grad_norm": 0.24748811283451091, + "learning_rate": 5.189873417721519e-06, + "loss": 0.3827, + "num_tokens": 17211551.0, + "step": 42 + }, + { + "epoch": 0.01643730886850153, + "grad_norm": 0.23723819861725134, + "learning_rate": 5.3164556962025316e-06, + "loss": 0.3826, + "num_tokens": 17671901.0, + "step": 43 + }, + { + "epoch": 0.016819571865443424, + "grad_norm": 0.27105169685529573, + "learning_rate": 5.443037974683545e-06, + "loss": 0.3815, + "num_tokens": 18093728.0, + "step": 44 + }, + { + "epoch": 0.017201834862385322, + "grad_norm": 0.23791331359443213, + "learning_rate": 5.569620253164557e-06, + "loss": 0.3451, + "num_tokens": 18512366.0, + "step": 45 + }, + { + "epoch": 0.017584097859327217, + "grad_norm": 0.27793141561269313, + "learning_rate": 5.69620253164557e-06, + "loss": 0.3627, + "num_tokens": 18896121.0, + "step": 46 + }, + { + "epoch": 0.017966360856269112, + "grad_norm": 0.26524920695661636, + "learning_rate": 5.8227848101265824e-06, + "loss": 0.3771, + "num_tokens": 19301930.0, + "step": 47 + }, + { + "epoch": 0.01834862385321101, + "grad_norm": 0.29818416745906995, + "learning_rate": 5.949367088607595e-06, + "loss": 0.3702, + "num_tokens": 19715395.0, + "step": 48 + }, + { + "epoch": 0.018730886850152905, + "grad_norm": 0.30338112047811494, + "learning_rate": 6.075949367088608e-06, + "loss": 0.3773, + "num_tokens": 20131002.0, + "step": 49 + }, + { + "epoch": 0.0191131498470948, + "grad_norm": 0.30308659699156243, + "learning_rate": 6.20253164556962e-06, + "loss": 0.3831, + "num_tokens": 20563137.0, + "step": 50 + }, + { + "epoch": 0.0194954128440367, + "grad_norm": 0.2535074196213391, + "learning_rate": 6.329113924050634e-06, + "loss": 0.3744, + "num_tokens": 20972802.0, + "step": 51 + }, + { + "epoch": 0.019877675840978593, + "grad_norm": 0.2735974167361757, + "learning_rate": 6.4556962025316464e-06, + "loss": 0.3701, + "num_tokens": 21374877.0, + "step": 52 + }, + { + "epoch": 0.020259938837920488, + "grad_norm": 0.26877877811745887, + "learning_rate": 6.582278481012659e-06, + "loss": 0.3507, + "num_tokens": 21760013.0, + "step": 53 + }, + { + "epoch": 0.020642201834862386, + "grad_norm": 0.2443412417585162, + "learning_rate": 6.708860759493672e-06, + "loss": 0.3311, + "num_tokens": 22150140.0, + "step": 54 + }, + { + "epoch": 0.02102446483180428, + "grad_norm": 0.25033632489186186, + "learning_rate": 6.835443037974684e-06, + "loss": 0.3588, + "num_tokens": 22551822.0, + "step": 55 + }, + { + "epoch": 0.021406727828746176, + "grad_norm": 0.25421424683231686, + "learning_rate": 6.962025316455697e-06, + "loss": 0.3687, + "num_tokens": 22925722.0, + "step": 56 + }, + { + "epoch": 0.021788990825688075, + "grad_norm": 0.285201645139862, + "learning_rate": 7.08860759493671e-06, + "loss": 0.3748, + "num_tokens": 23302379.0, + "step": 57 + }, + { + "epoch": 0.02217125382262997, + "grad_norm": 0.2901922327347829, + "learning_rate": 7.215189873417722e-06, + "loss": 0.3814, + "num_tokens": 23714576.0, + "step": 58 + }, + { + "epoch": 0.022553516819571864, + "grad_norm": 0.3270813662793202, + "learning_rate": 7.341772151898735e-06, + "loss": 0.3658, + "num_tokens": 24107230.0, + "step": 59 + }, + { + "epoch": 0.022935779816513763, + "grad_norm": 0.2842972823309671, + "learning_rate": 7.468354430379747e-06, + "loss": 0.3717, + "num_tokens": 24524589.0, + "step": 60 + }, + { + "epoch": 0.023318042813455658, + "grad_norm": 0.2940890615468725, + "learning_rate": 7.5949367088607605e-06, + "loss": 0.3768, + "num_tokens": 24986088.0, + "step": 61 + }, + { + "epoch": 0.023700305810397553, + "grad_norm": 0.29303219141347625, + "learning_rate": 7.721518987341773e-06, + "loss": 0.371, + "num_tokens": 25387659.0, + "step": 62 + }, + { + "epoch": 0.02408256880733945, + "grad_norm": 0.2712988604342114, + "learning_rate": 7.848101265822786e-06, + "loss": 0.3578, + "num_tokens": 25804901.0, + "step": 63 + }, + { + "epoch": 0.024464831804281346, + "grad_norm": 0.2583146632355596, + "learning_rate": 7.974683544303799e-06, + "loss": 0.3838, + "num_tokens": 26208000.0, + "step": 64 + }, + { + "epoch": 0.02484709480122324, + "grad_norm": 0.3066777920300367, + "learning_rate": 8.10126582278481e-06, + "loss": 0.3577, + "num_tokens": 26566527.0, + "step": 65 + }, + { + "epoch": 0.02522935779816514, + "grad_norm": 0.2918873290167186, + "learning_rate": 8.227848101265824e-06, + "loss": 0.4074, + "num_tokens": 26984923.0, + "step": 66 + }, + { + "epoch": 0.025611620795107034, + "grad_norm": 0.24972708510248107, + "learning_rate": 8.354430379746837e-06, + "loss": 0.3597, + "num_tokens": 27394879.0, + "step": 67 + }, + { + "epoch": 0.02599388379204893, + "grad_norm": 0.2533277702671471, + "learning_rate": 8.481012658227848e-06, + "loss": 0.3383, + "num_tokens": 27796126.0, + "step": 68 + }, + { + "epoch": 0.026376146788990827, + "grad_norm": 0.2794694608524889, + "learning_rate": 8.607594936708861e-06, + "loss": 0.3831, + "num_tokens": 28223656.0, + "step": 69 + }, + { + "epoch": 0.026758409785932722, + "grad_norm": 0.30892432798732183, + "learning_rate": 8.734177215189874e-06, + "loss": 0.3833, + "num_tokens": 28636038.0, + "step": 70 + }, + { + "epoch": 0.027140672782874617, + "grad_norm": 0.2872201553687094, + "learning_rate": 8.860759493670886e-06, + "loss": 0.3581, + "num_tokens": 29037027.0, + "step": 71 + }, + { + "epoch": 0.027522935779816515, + "grad_norm": 0.26788403211970613, + "learning_rate": 8.987341772151899e-06, + "loss": 0.3657, + "num_tokens": 29436818.0, + "step": 72 + }, + { + "epoch": 0.02790519877675841, + "grad_norm": 0.2928976601330555, + "learning_rate": 9.113924050632912e-06, + "loss": 0.3659, + "num_tokens": 29835295.0, + "step": 73 + }, + { + "epoch": 0.028287461773700305, + "grad_norm": 0.27351385486989827, + "learning_rate": 9.240506329113925e-06, + "loss": 0.3926, + "num_tokens": 30275929.0, + "step": 74 + }, + { + "epoch": 0.028669724770642203, + "grad_norm": 0.24860015314094797, + "learning_rate": 9.367088607594937e-06, + "loss": 0.3755, + "num_tokens": 30713177.0, + "step": 75 + }, + { + "epoch": 0.0290519877675841, + "grad_norm": 0.3059374767943009, + "learning_rate": 9.49367088607595e-06, + "loss": 0.376, + "num_tokens": 31085359.0, + "step": 76 + }, + { + "epoch": 0.029434250764525993, + "grad_norm": 0.2779162746474668, + "learning_rate": 9.620253164556963e-06, + "loss": 0.3549, + "num_tokens": 31483591.0, + "step": 77 + }, + { + "epoch": 0.02981651376146789, + "grad_norm": 0.30027546164974406, + "learning_rate": 9.746835443037975e-06, + "loss": 0.3661, + "num_tokens": 31852456.0, + "step": 78 + }, + { + "epoch": 0.030198776758409786, + "grad_norm": 0.3563030266321935, + "learning_rate": 9.87341772151899e-06, + "loss": 0.3815, + "num_tokens": 32253510.0, + "step": 79 + }, + { + "epoch": 0.03058103975535168, + "grad_norm": 0.328592724448693, + "learning_rate": 1e-05, + "loss": 0.3793, + "num_tokens": 32632481.0, + "step": 80 + }, + { + "epoch": 0.03096330275229358, + "grad_norm": 0.33221793764748714, + "learning_rate": 9.999996549823812e-06, + "loss": 0.393, + "num_tokens": 33036079.0, + "step": 81 + }, + { + "epoch": 0.03134556574923547, + "grad_norm": 0.2629167220936857, + "learning_rate": 9.999986199300538e-06, + "loss": 0.3636, + "num_tokens": 33465676.0, + "step": 82 + }, + { + "epoch": 0.03172782874617737, + "grad_norm": 0.3063229560821108, + "learning_rate": 9.999968948446047e-06, + "loss": 0.382, + "num_tokens": 33898302.0, + "step": 83 + }, + { + "epoch": 0.03211009174311927, + "grad_norm": 0.30754736893216966, + "learning_rate": 9.999944797286795e-06, + "loss": 0.3781, + "num_tokens": 34282661.0, + "step": 84 + }, + { + "epoch": 0.03249235474006116, + "grad_norm": 0.32776552225107286, + "learning_rate": 9.999913745859813e-06, + "loss": 0.3431, + "num_tokens": 34651832.0, + "step": 85 + }, + { + "epoch": 0.03287461773700306, + "grad_norm": 0.3174044990343936, + "learning_rate": 9.999875794212719e-06, + "loss": 0.367, + "num_tokens": 35031841.0, + "step": 86 + }, + { + "epoch": 0.033256880733944956, + "grad_norm": 0.24005371718221158, + "learning_rate": 9.999830942403703e-06, + "loss": 0.3586, + "num_tokens": 35455746.0, + "step": 87 + }, + { + "epoch": 0.03363914373088685, + "grad_norm": 0.3617154019715443, + "learning_rate": 9.999779190501546e-06, + "loss": 0.3931, + "num_tokens": 35910940.0, + "step": 88 + }, + { + "epoch": 0.034021406727828746, + "grad_norm": 0.36614336324493635, + "learning_rate": 9.999720538585606e-06, + "loss": 0.3725, + "num_tokens": 36283542.0, + "step": 89 + }, + { + "epoch": 0.034403669724770644, + "grad_norm": 0.33038980708954735, + "learning_rate": 9.999654986745815e-06, + "loss": 0.3463, + "num_tokens": 36656330.0, + "step": 90 + }, + { + "epoch": 0.034785932721712536, + "grad_norm": 0.2881952745690317, + "learning_rate": 9.999582535082697e-06, + "loss": 0.3599, + "num_tokens": 37044581.0, + "step": 91 + }, + { + "epoch": 0.035168195718654434, + "grad_norm": 0.3018344869659633, + "learning_rate": 9.999503183707346e-06, + "loss": 0.383, + "num_tokens": 37484958.0, + "step": 92 + }, + { + "epoch": 0.03555045871559633, + "grad_norm": 0.3010144740002948, + "learning_rate": 9.999416932741441e-06, + "loss": 0.3622, + "num_tokens": 37878048.0, + "step": 93 + }, + { + "epoch": 0.035932721712538224, + "grad_norm": 0.3333443305835725, + "learning_rate": 9.999323782317242e-06, + "loss": 0.3579, + "num_tokens": 38270512.0, + "step": 94 + }, + { + "epoch": 0.03631498470948012, + "grad_norm": 0.2669100929329328, + "learning_rate": 9.999223732577585e-06, + "loss": 0.3662, + "num_tokens": 38649821.0, + "step": 95 + }, + { + "epoch": 0.03669724770642202, + "grad_norm": 0.2701402260845986, + "learning_rate": 9.99911678367589e-06, + "loss": 0.3815, + "num_tokens": 39073051.0, + "step": 96 + }, + { + "epoch": 0.03707951070336391, + "grad_norm": 0.3077991029054113, + "learning_rate": 9.999002935776151e-06, + "loss": 0.3621, + "num_tokens": 39508682.0, + "step": 97 + }, + { + "epoch": 0.03746177370030581, + "grad_norm": 0.3124151218543076, + "learning_rate": 9.998882189052944e-06, + "loss": 0.3364, + "num_tokens": 39885020.0, + "step": 98 + }, + { + "epoch": 0.03784403669724771, + "grad_norm": 0.2659437648117742, + "learning_rate": 9.998754543691425e-06, + "loss": 0.3625, + "num_tokens": 40279360.0, + "step": 99 + }, + { + "epoch": 0.0382262996941896, + "grad_norm": 0.3320407836544897, + "learning_rate": 9.998619999887325e-06, + "loss": 0.3727, + "num_tokens": 40707012.0, + "step": 100 + }, + { + "epoch": 0.0386085626911315, + "grad_norm": 0.3010028394901584, + "learning_rate": 9.998478557846959e-06, + "loss": 0.3538, + "num_tokens": 41118421.0, + "step": 101 + }, + { + "epoch": 0.0389908256880734, + "grad_norm": 0.34633059384007736, + "learning_rate": 9.99833021778721e-06, + "loss": 0.3749, + "num_tokens": 41454912.0, + "step": 102 + }, + { + "epoch": 0.03937308868501529, + "grad_norm": 0.3189762698698707, + "learning_rate": 9.998174979935548e-06, + "loss": 0.3822, + "num_tokens": 41868510.0, + "step": 103 + }, + { + "epoch": 0.039755351681957186, + "grad_norm": 0.3241454062281064, + "learning_rate": 9.998012844530015e-06, + "loss": 0.3661, + "num_tokens": 42255880.0, + "step": 104 + }, + { + "epoch": 0.040137614678899085, + "grad_norm": 0.3205981920415945, + "learning_rate": 9.997843811819233e-06, + "loss": 0.3505, + "num_tokens": 42643458.0, + "step": 105 + }, + { + "epoch": 0.040519877675840976, + "grad_norm": 0.2886754997136016, + "learning_rate": 9.997667882062399e-06, + "loss": 0.3485, + "num_tokens": 43036614.0, + "step": 106 + }, + { + "epoch": 0.040902140672782875, + "grad_norm": 0.2827586254697602, + "learning_rate": 9.997485055529284e-06, + "loss": 0.3597, + "num_tokens": 43434804.0, + "step": 107 + }, + { + "epoch": 0.04128440366972477, + "grad_norm": 0.3107714425116188, + "learning_rate": 9.997295332500235e-06, + "loss": 0.3757, + "num_tokens": 43866730.0, + "step": 108 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 0.3475879363319308, + "learning_rate": 9.99709871326618e-06, + "loss": 0.3857, + "num_tokens": 44278454.0, + "step": 109 + }, + { + "epoch": 0.04204892966360856, + "grad_norm": 0.3441567304199641, + "learning_rate": 9.996895198128611e-06, + "loss": 0.3708, + "num_tokens": 44661982.0, + "step": 110 + }, + { + "epoch": 0.04243119266055046, + "grad_norm": 0.3166935566476928, + "learning_rate": 9.996684787399607e-06, + "loss": 0.3744, + "num_tokens": 45074431.0, + "step": 111 + }, + { + "epoch": 0.04281345565749235, + "grad_norm": 0.2663876326581232, + "learning_rate": 9.996467481401812e-06, + "loss": 0.3769, + "num_tokens": 45548524.0, + "step": 112 + }, + { + "epoch": 0.04319571865443425, + "grad_norm": 0.33096965103381665, + "learning_rate": 9.996243280468445e-06, + "loss": 0.3864, + "num_tokens": 45974787.0, + "step": 113 + }, + { + "epoch": 0.04357798165137615, + "grad_norm": 0.2826887984249236, + "learning_rate": 9.996012184943296e-06, + "loss": 0.3667, + "num_tokens": 46387257.0, + "step": 114 + }, + { + "epoch": 0.04396024464831804, + "grad_norm": 0.29292802715977545, + "learning_rate": 9.995774195180734e-06, + "loss": 0.3795, + "num_tokens": 46839806.0, + "step": 115 + }, + { + "epoch": 0.04434250764525994, + "grad_norm": 0.30349471334335787, + "learning_rate": 9.995529311545691e-06, + "loss": 0.3563, + "num_tokens": 47251973.0, + "step": 116 + }, + { + "epoch": 0.04472477064220184, + "grad_norm": 0.27282794281680073, + "learning_rate": 9.995277534413679e-06, + "loss": 0.3712, + "num_tokens": 47657116.0, + "step": 117 + }, + { + "epoch": 0.04510703363914373, + "grad_norm": 0.27268959033730367, + "learning_rate": 9.995018864170771e-06, + "loss": 0.3487, + "num_tokens": 48058332.0, + "step": 118 + }, + { + "epoch": 0.04548929663608563, + "grad_norm": 0.30209907729014973, + "learning_rate": 9.99475330121362e-06, + "loss": 0.3799, + "num_tokens": 48489567.0, + "step": 119 + }, + { + "epoch": 0.045871559633027525, + "grad_norm": 0.31225728938204095, + "learning_rate": 9.994480845949439e-06, + "loss": 0.3708, + "num_tokens": 48912078.0, + "step": 120 + }, + { + "epoch": 0.04625382262996942, + "grad_norm": 0.3273189575817362, + "learning_rate": 9.994201498796016e-06, + "loss": 0.3832, + "num_tokens": 49337202.0, + "step": 121 + }, + { + "epoch": 0.046636085626911315, + "grad_norm": 0.2671602528577052, + "learning_rate": 9.993915260181706e-06, + "loss": 0.3625, + "num_tokens": 49781789.0, + "step": 122 + }, + { + "epoch": 0.047018348623853214, + "grad_norm": 0.32368873694887007, + "learning_rate": 9.99362213054543e-06, + "loss": 0.3922, + "num_tokens": 50230058.0, + "step": 123 + }, + { + "epoch": 0.047400611620795105, + "grad_norm": 0.3579166193844263, + "learning_rate": 9.993322110336673e-06, + "loss": 0.3875, + "num_tokens": 50629056.0, + "step": 124 + }, + { + "epoch": 0.047782874617737, + "grad_norm": 0.30383100284797826, + "learning_rate": 9.993015200015497e-06, + "loss": 0.3636, + "num_tokens": 51052633.0, + "step": 125 + }, + { + "epoch": 0.0481651376146789, + "grad_norm": 0.2805275983840069, + "learning_rate": 9.992701400052515e-06, + "loss": 0.352, + "num_tokens": 51458305.0, + "step": 126 + }, + { + "epoch": 0.04854740061162079, + "grad_norm": 0.4891914795855784, + "learning_rate": 9.992380710928915e-06, + "loss": 0.3765, + "num_tokens": 51917160.0, + "step": 127 + }, + { + "epoch": 0.04892966360856269, + "grad_norm": 0.38387755477046753, + "learning_rate": 9.992053133136444e-06, + "loss": 0.3509, + "num_tokens": 52307600.0, + "step": 128 + }, + { + "epoch": 0.04931192660550459, + "grad_norm": 0.32856095051074563, + "learning_rate": 9.991718667177412e-06, + "loss": 0.3695, + "num_tokens": 52716931.0, + "step": 129 + }, + { + "epoch": 0.04969418960244648, + "grad_norm": 0.2914171894527055, + "learning_rate": 9.991377313564696e-06, + "loss": 0.3691, + "num_tokens": 53144509.0, + "step": 130 + }, + { + "epoch": 0.05007645259938838, + "grad_norm": 0.42507072873813, + "learning_rate": 9.991029072821732e-06, + "loss": 0.3758, + "num_tokens": 53508939.0, + "step": 131 + }, + { + "epoch": 0.05045871559633028, + "grad_norm": 0.38037928356246004, + "learning_rate": 9.990673945482513e-06, + "loss": 0.375, + "num_tokens": 53933117.0, + "step": 132 + }, + { + "epoch": 0.05084097859327217, + "grad_norm": 0.3220768092935591, + "learning_rate": 9.990311932091598e-06, + "loss": 0.3717, + "num_tokens": 54363509.0, + "step": 133 + }, + { + "epoch": 0.05122324159021407, + "grad_norm": 0.31290892909963786, + "learning_rate": 9.989943033204103e-06, + "loss": 0.3716, + "num_tokens": 54767385.0, + "step": 134 + }, + { + "epoch": 0.051605504587155966, + "grad_norm": 0.31645783180029785, + "learning_rate": 9.9895672493857e-06, + "loss": 0.3766, + "num_tokens": 55199177.0, + "step": 135 + }, + { + "epoch": 0.05198776758409786, + "grad_norm": 0.32448742324934265, + "learning_rate": 9.989184581212621e-06, + "loss": 0.3477, + "num_tokens": 55569572.0, + "step": 136 + }, + { + "epoch": 0.052370030581039756, + "grad_norm": 0.33787619079802583, + "learning_rate": 9.988795029271652e-06, + "loss": 0.3729, + "num_tokens": 55982545.0, + "step": 137 + }, + { + "epoch": 0.052752293577981654, + "grad_norm": 0.31022855396474125, + "learning_rate": 9.988398594160143e-06, + "loss": 0.3526, + "num_tokens": 56401354.0, + "step": 138 + }, + { + "epoch": 0.053134556574923546, + "grad_norm": 0.2741685239111767, + "learning_rate": 9.987995276485984e-06, + "loss": 0.3856, + "num_tokens": 56843122.0, + "step": 139 + }, + { + "epoch": 0.053516819571865444, + "grad_norm": 0.2774758423136237, + "learning_rate": 9.987585076867631e-06, + "loss": 0.3706, + "num_tokens": 57259050.0, + "step": 140 + }, + { + "epoch": 0.05389908256880734, + "grad_norm": 0.3030510256051822, + "learning_rate": 9.987167995934088e-06, + "loss": 0.3561, + "num_tokens": 57654563.0, + "step": 141 + }, + { + "epoch": 0.054281345565749234, + "grad_norm": 0.295612594198623, + "learning_rate": 9.986744034324915e-06, + "loss": 0.3567, + "num_tokens": 58064881.0, + "step": 142 + }, + { + "epoch": 0.05466360856269113, + "grad_norm": 0.28410601764906807, + "learning_rate": 9.986313192690214e-06, + "loss": 0.3411, + "num_tokens": 58478669.0, + "step": 143 + }, + { + "epoch": 0.05504587155963303, + "grad_norm": 0.3142195781797062, + "learning_rate": 9.985875471690646e-06, + "loss": 0.3682, + "num_tokens": 58864073.0, + "step": 144 + }, + { + "epoch": 0.05542813455657492, + "grad_norm": 0.3526499476007933, + "learning_rate": 9.985430871997419e-06, + "loss": 0.3704, + "num_tokens": 59265865.0, + "step": 145 + }, + { + "epoch": 0.05581039755351682, + "grad_norm": 0.29526552988659543, + "learning_rate": 9.984979394292281e-06, + "loss": 0.3789, + "num_tokens": 59713987.0, + "step": 146 + }, + { + "epoch": 0.05619266055045872, + "grad_norm": 0.29245778696402475, + "learning_rate": 9.984521039267541e-06, + "loss": 0.3721, + "num_tokens": 60116080.0, + "step": 147 + }, + { + "epoch": 0.05657492354740061, + "grad_norm": 0.29858161465460215, + "learning_rate": 9.98405580762604e-06, + "loss": 0.3985, + "num_tokens": 60528843.0, + "step": 148 + }, + { + "epoch": 0.05695718654434251, + "grad_norm": 0.36715593710952876, + "learning_rate": 9.983583700081175e-06, + "loss": 0.3927, + "num_tokens": 60961209.0, + "step": 149 + }, + { + "epoch": 0.05733944954128441, + "grad_norm": 0.31612664518260053, + "learning_rate": 9.983104717356876e-06, + "loss": 0.3571, + "num_tokens": 61422654.0, + "step": 150 + }, + { + "epoch": 0.0577217125382263, + "grad_norm": 0.3171890235532633, + "learning_rate": 9.982618860187622e-06, + "loss": 0.36, + "num_tokens": 61882792.0, + "step": 151 + }, + { + "epoch": 0.0581039755351682, + "grad_norm": 0.3009225032179536, + "learning_rate": 9.982126129318434e-06, + "loss": 0.3624, + "num_tokens": 62291139.0, + "step": 152 + }, + { + "epoch": 0.058486238532110095, + "grad_norm": 0.344516015270405, + "learning_rate": 9.981626525504872e-06, + "loss": 0.3947, + "num_tokens": 62734489.0, + "step": 153 + }, + { + "epoch": 0.058868501529051986, + "grad_norm": 0.40734618283334334, + "learning_rate": 9.981120049513031e-06, + "loss": 0.3809, + "num_tokens": 63184842.0, + "step": 154 + }, + { + "epoch": 0.059250764525993885, + "grad_norm": 0.3718170700195857, + "learning_rate": 9.980606702119547e-06, + "loss": 0.3899, + "num_tokens": 63616995.0, + "step": 155 + }, + { + "epoch": 0.05963302752293578, + "grad_norm": 0.28053844302417147, + "learning_rate": 9.980086484111596e-06, + "loss": 0.3449, + "num_tokens": 63983629.0, + "step": 156 + }, + { + "epoch": 0.060015290519877675, + "grad_norm": 0.2979628145729509, + "learning_rate": 9.979559396286885e-06, + "loss": 0.3677, + "num_tokens": 64423985.0, + "step": 157 + }, + { + "epoch": 0.06039755351681957, + "grad_norm": 0.29871227972915554, + "learning_rate": 9.979025439453657e-06, + "loss": 0.3433, + "num_tokens": 64800081.0, + "step": 158 + }, + { + "epoch": 0.06077981651376147, + "grad_norm": 0.2612374594559548, + "learning_rate": 9.978484614430687e-06, + "loss": 0.3466, + "num_tokens": 65225739.0, + "step": 159 + }, + { + "epoch": 0.06116207951070336, + "grad_norm": 0.340237371157313, + "learning_rate": 9.977936922047281e-06, + "loss": 0.3779, + "num_tokens": 65652885.0, + "step": 160 + }, + { + "epoch": 0.06154434250764526, + "grad_norm": 0.3532518340415294, + "learning_rate": 9.97738236314328e-06, + "loss": 0.3763, + "num_tokens": 66071033.0, + "step": 161 + }, + { + "epoch": 0.06192660550458716, + "grad_norm": 0.3323977026176976, + "learning_rate": 9.976820938569049e-06, + "loss": 0.345, + "num_tokens": 66464059.0, + "step": 162 + }, + { + "epoch": 0.06230886850152905, + "grad_norm": 0.25597537187847486, + "learning_rate": 9.976252649185482e-06, + "loss": 0.3513, + "num_tokens": 66875069.0, + "step": 163 + }, + { + "epoch": 0.06269113149847094, + "grad_norm": 0.3171545451476222, + "learning_rate": 9.975677495864003e-06, + "loss": 0.3723, + "num_tokens": 67303828.0, + "step": 164 + }, + { + "epoch": 0.06307339449541284, + "grad_norm": 0.31875321182732913, + "learning_rate": 9.97509547948656e-06, + "loss": 0.3838, + "num_tokens": 67752148.0, + "step": 165 + }, + { + "epoch": 0.06345565749235474, + "grad_norm": 0.31465238313878235, + "learning_rate": 9.974506600945618e-06, + "loss": 0.3504, + "num_tokens": 68119582.0, + "step": 166 + }, + { + "epoch": 0.06383792048929664, + "grad_norm": 0.288264212411034, + "learning_rate": 9.973910861144174e-06, + "loss": 0.3521, + "num_tokens": 68504939.0, + "step": 167 + }, + { + "epoch": 0.06422018348623854, + "grad_norm": 0.2861207799339978, + "learning_rate": 9.973308260995744e-06, + "loss": 0.3751, + "num_tokens": 68947507.0, + "step": 168 + }, + { + "epoch": 0.06460244648318043, + "grad_norm": 0.2937911499736142, + "learning_rate": 9.972698801424358e-06, + "loss": 0.3895, + "num_tokens": 69362481.0, + "step": 169 + }, + { + "epoch": 0.06498470948012232, + "grad_norm": 0.29251721334730035, + "learning_rate": 9.97208248336457e-06, + "loss": 0.3643, + "num_tokens": 69753637.0, + "step": 170 + }, + { + "epoch": 0.06536697247706422, + "grad_norm": 0.2755373044472231, + "learning_rate": 9.971459307761453e-06, + "loss": 0.354, + "num_tokens": 70136759.0, + "step": 171 + }, + { + "epoch": 0.06574923547400612, + "grad_norm": 0.2978821995634767, + "learning_rate": 9.970829275570588e-06, + "loss": 0.3645, + "num_tokens": 70575011.0, + "step": 172 + }, + { + "epoch": 0.06613149847094801, + "grad_norm": 0.28501932128057794, + "learning_rate": 9.970192387758073e-06, + "loss": 0.3628, + "num_tokens": 70984617.0, + "step": 173 + }, + { + "epoch": 0.06651376146788991, + "grad_norm": 0.3779580236228422, + "learning_rate": 9.969548645300519e-06, + "loss": 0.377, + "num_tokens": 71402187.0, + "step": 174 + }, + { + "epoch": 0.06689602446483181, + "grad_norm": 0.3377881609942034, + "learning_rate": 9.968898049185052e-06, + "loss": 0.3574, + "num_tokens": 71795847.0, + "step": 175 + }, + { + "epoch": 0.0672782874617737, + "grad_norm": 0.2916527024647668, + "learning_rate": 9.9682406004093e-06, + "loss": 0.4052, + "num_tokens": 72247725.0, + "step": 176 + }, + { + "epoch": 0.0676605504587156, + "grad_norm": 0.3185391140228152, + "learning_rate": 9.967576299981403e-06, + "loss": 0.3751, + "num_tokens": 72693638.0, + "step": 177 + }, + { + "epoch": 0.06804281345565749, + "grad_norm": 0.30306148441770686, + "learning_rate": 9.966905148920008e-06, + "loss": 0.349, + "num_tokens": 73105643.0, + "step": 178 + }, + { + "epoch": 0.06842507645259939, + "grad_norm": 0.28914550640848624, + "learning_rate": 9.966227148254268e-06, + "loss": 0.3726, + "num_tokens": 73563316.0, + "step": 179 + }, + { + "epoch": 0.06880733944954129, + "grad_norm": 0.33456046764024827, + "learning_rate": 9.965542299023833e-06, + "loss": 0.3809, + "num_tokens": 73986398.0, + "step": 180 + }, + { + "epoch": 0.06918960244648319, + "grad_norm": 0.31433258807661973, + "learning_rate": 9.964850602278859e-06, + "loss": 0.3653, + "num_tokens": 74399655.0, + "step": 181 + }, + { + "epoch": 0.06957186544342507, + "grad_norm": 0.38708681678799467, + "learning_rate": 9.964152059080007e-06, + "loss": 0.3582, + "num_tokens": 74764330.0, + "step": 182 + }, + { + "epoch": 0.06995412844036697, + "grad_norm": 0.29860601007162324, + "learning_rate": 9.963446670498424e-06, + "loss": 0.3432, + "num_tokens": 75121653.0, + "step": 183 + }, + { + "epoch": 0.07033639143730887, + "grad_norm": 0.30810257588043594, + "learning_rate": 9.962734437615767e-06, + "loss": 0.3542, + "num_tokens": 75531744.0, + "step": 184 + }, + { + "epoch": 0.07071865443425077, + "grad_norm": 0.3265476811541891, + "learning_rate": 9.962015361524179e-06, + "loss": 0.3638, + "num_tokens": 75898551.0, + "step": 185 + }, + { + "epoch": 0.07110091743119266, + "grad_norm": 0.3236697275100706, + "learning_rate": 9.961289443326301e-06, + "loss": 0.3678, + "num_tokens": 76290706.0, + "step": 186 + }, + { + "epoch": 0.07148318042813456, + "grad_norm": 0.3121159841744106, + "learning_rate": 9.960556684135264e-06, + "loss": 0.3667, + "num_tokens": 76705391.0, + "step": 187 + }, + { + "epoch": 0.07186544342507645, + "grad_norm": 0.3106702976767392, + "learning_rate": 9.95981708507469e-06, + "loss": 0.369, + "num_tokens": 77144503.0, + "step": 188 + }, + { + "epoch": 0.07224770642201835, + "grad_norm": 0.34847417559638644, + "learning_rate": 9.959070647278687e-06, + "loss": 0.3601, + "num_tokens": 77559844.0, + "step": 189 + }, + { + "epoch": 0.07262996941896024, + "grad_norm": 0.34678058678749113, + "learning_rate": 9.958317371891854e-06, + "loss": 0.3824, + "num_tokens": 77952748.0, + "step": 190 + }, + { + "epoch": 0.07301223241590214, + "grad_norm": 0.33886797062927976, + "learning_rate": 9.957557260069271e-06, + "loss": 0.3559, + "num_tokens": 78372383.0, + "step": 191 + }, + { + "epoch": 0.07339449541284404, + "grad_norm": 0.35132336275819737, + "learning_rate": 9.956790312976499e-06, + "loss": 0.396, + "num_tokens": 78785065.0, + "step": 192 + }, + { + "epoch": 0.07377675840978594, + "grad_norm": 0.31125388580858143, + "learning_rate": 9.956016531789591e-06, + "loss": 0.3783, + "num_tokens": 79206326.0, + "step": 193 + }, + { + "epoch": 0.07415902140672782, + "grad_norm": 0.2943694036601107, + "learning_rate": 9.955235917695065e-06, + "loss": 0.3738, + "num_tokens": 79638756.0, + "step": 194 + }, + { + "epoch": 0.07454128440366972, + "grad_norm": 0.2789237268146603, + "learning_rate": 9.954448471889928e-06, + "loss": 0.3659, + "num_tokens": 80079595.0, + "step": 195 + }, + { + "epoch": 0.07492354740061162, + "grad_norm": 0.30657986744903526, + "learning_rate": 9.953654195581658e-06, + "loss": 0.3714, + "num_tokens": 80445368.0, + "step": 196 + }, + { + "epoch": 0.07530581039755352, + "grad_norm": 0.334416440173099, + "learning_rate": 9.952853089988205e-06, + "loss": 0.3656, + "num_tokens": 80840969.0, + "step": 197 + }, + { + "epoch": 0.07568807339449542, + "grad_norm": 0.25643825215514204, + "learning_rate": 9.952045156337998e-06, + "loss": 0.3606, + "num_tokens": 81250567.0, + "step": 198 + }, + { + "epoch": 0.07607033639143732, + "grad_norm": 0.2639442653647469, + "learning_rate": 9.951230395869926e-06, + "loss": 0.3831, + "num_tokens": 81661857.0, + "step": 199 + }, + { + "epoch": 0.0764525993883792, + "grad_norm": 0.27388561109626597, + "learning_rate": 9.950408809833356e-06, + "loss": 0.3556, + "num_tokens": 82056537.0, + "step": 200 + }, + { + "epoch": 0.0768348623853211, + "grad_norm": 0.3120721722829178, + "learning_rate": 9.94958039948812e-06, + "loss": 0.3404, + "num_tokens": 82439860.0, + "step": 201 + }, + { + "epoch": 0.077217125382263, + "grad_norm": 0.2851615192730009, + "learning_rate": 9.948745166104506e-06, + "loss": 0.3733, + "num_tokens": 82835173.0, + "step": 202 + }, + { + "epoch": 0.0775993883792049, + "grad_norm": 0.32410400757222513, + "learning_rate": 9.947903110963274e-06, + "loss": 0.3628, + "num_tokens": 83214244.0, + "step": 203 + }, + { + "epoch": 0.0779816513761468, + "grad_norm": 0.268172309828167, + "learning_rate": 9.947054235355642e-06, + "loss": 0.379, + "num_tokens": 83627811.0, + "step": 204 + }, + { + "epoch": 0.07836391437308869, + "grad_norm": 0.27959613704165887, + "learning_rate": 9.946198540583285e-06, + "loss": 0.3652, + "num_tokens": 84018182.0, + "step": 205 + }, + { + "epoch": 0.07874617737003058, + "grad_norm": 0.29369199615662045, + "learning_rate": 9.945336027958333e-06, + "loss": 0.352, + "num_tokens": 84436023.0, + "step": 206 + }, + { + "epoch": 0.07912844036697247, + "grad_norm": 0.32528975482882166, + "learning_rate": 9.944466698803377e-06, + "loss": 0.3937, + "num_tokens": 84877529.0, + "step": 207 + }, + { + "epoch": 0.07951070336391437, + "grad_norm": 0.2955847679126208, + "learning_rate": 9.943590554451452e-06, + "loss": 0.3635, + "num_tokens": 85296496.0, + "step": 208 + }, + { + "epoch": 0.07989296636085627, + "grad_norm": 0.3062034678724796, + "learning_rate": 9.942707596246051e-06, + "loss": 0.3528, + "num_tokens": 85698238.0, + "step": 209 + }, + { + "epoch": 0.08027522935779817, + "grad_norm": 0.2732791883322538, + "learning_rate": 9.941817825541113e-06, + "loss": 0.3802, + "num_tokens": 86095973.0, + "step": 210 + }, + { + "epoch": 0.08065749235474007, + "grad_norm": 0.3076491439128888, + "learning_rate": 9.940921243701019e-06, + "loss": 0.3887, + "num_tokens": 86557278.0, + "step": 211 + }, + { + "epoch": 0.08103975535168195, + "grad_norm": 0.29215917280438924, + "learning_rate": 9.940017852100601e-06, + "loss": 0.3644, + "num_tokens": 86966313.0, + "step": 212 + }, + { + "epoch": 0.08142201834862385, + "grad_norm": 0.2874027877225882, + "learning_rate": 9.93910765212513e-06, + "loss": 0.3697, + "num_tokens": 87344800.0, + "step": 213 + }, + { + "epoch": 0.08180428134556575, + "grad_norm": 0.27936172636785955, + "learning_rate": 9.938190645170319e-06, + "loss": 0.3673, + "num_tokens": 87773672.0, + "step": 214 + }, + { + "epoch": 0.08218654434250765, + "grad_norm": 0.3024658653566097, + "learning_rate": 9.937266832642312e-06, + "loss": 0.3633, + "num_tokens": 88142515.0, + "step": 215 + }, + { + "epoch": 0.08256880733944955, + "grad_norm": 0.33991108960012917, + "learning_rate": 9.936336215957698e-06, + "loss": 0.3987, + "num_tokens": 88528287.0, + "step": 216 + }, + { + "epoch": 0.08295107033639144, + "grad_norm": 0.2996116237222055, + "learning_rate": 9.935398796543493e-06, + "loss": 0.3699, + "num_tokens": 88957091.0, + "step": 217 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.28663608072342417, + "learning_rate": 9.934454575837148e-06, + "loss": 0.3478, + "num_tokens": 89330917.0, + "step": 218 + }, + { + "epoch": 0.08371559633027523, + "grad_norm": 0.34356264011292487, + "learning_rate": 9.933503555286544e-06, + "loss": 0.3907, + "num_tokens": 89769457.0, + "step": 219 + }, + { + "epoch": 0.08409785932721713, + "grad_norm": 0.3145420561026299, + "learning_rate": 9.932545736349985e-06, + "loss": 0.3339, + "num_tokens": 90151953.0, + "step": 220 + }, + { + "epoch": 0.08448012232415902, + "grad_norm": 0.2842239632904358, + "learning_rate": 9.9315811204962e-06, + "loss": 0.3546, + "num_tokens": 90574328.0, + "step": 221 + }, + { + "epoch": 0.08486238532110092, + "grad_norm": 0.26724416107162097, + "learning_rate": 9.930609709204346e-06, + "loss": 0.3377, + "num_tokens": 90976429.0, + "step": 222 + }, + { + "epoch": 0.0852446483180428, + "grad_norm": 0.2645691169484764, + "learning_rate": 9.929631503963992e-06, + "loss": 0.3822, + "num_tokens": 91442490.0, + "step": 223 + }, + { + "epoch": 0.0856269113149847, + "grad_norm": 0.32135587321564485, + "learning_rate": 9.928646506275134e-06, + "loss": 0.3579, + "num_tokens": 91848632.0, + "step": 224 + }, + { + "epoch": 0.0860091743119266, + "grad_norm": 0.32670637032541, + "learning_rate": 9.927654717648176e-06, + "loss": 0.3721, + "num_tokens": 92276961.0, + "step": 225 + }, + { + "epoch": 0.0863914373088685, + "grad_norm": 0.2944202684179206, + "learning_rate": 9.926656139603939e-06, + "loss": 0.3697, + "num_tokens": 92684709.0, + "step": 226 + }, + { + "epoch": 0.0867737003058104, + "grad_norm": 0.3134041406702019, + "learning_rate": 9.925650773673654e-06, + "loss": 0.3544, + "num_tokens": 93064995.0, + "step": 227 + }, + { + "epoch": 0.0871559633027523, + "grad_norm": 0.295501923469508, + "learning_rate": 9.92463862139896e-06, + "loss": 0.3528, + "num_tokens": 93493922.0, + "step": 228 + }, + { + "epoch": 0.08753822629969418, + "grad_norm": 0.2970567683956971, + "learning_rate": 9.923619684331904e-06, + "loss": 0.3479, + "num_tokens": 93902963.0, + "step": 229 + }, + { + "epoch": 0.08792048929663608, + "grad_norm": 0.28293401962950987, + "learning_rate": 9.922593964034936e-06, + "loss": 0.3948, + "num_tokens": 94312112.0, + "step": 230 + }, + { + "epoch": 0.08830275229357798, + "grad_norm": 0.276251610132844, + "learning_rate": 9.921561462080908e-06, + "loss": 0.3563, + "num_tokens": 94701284.0, + "step": 231 + }, + { + "epoch": 0.08868501529051988, + "grad_norm": 0.32819585975983306, + "learning_rate": 9.92052218005307e-06, + "loss": 0.3664, + "num_tokens": 95103164.0, + "step": 232 + }, + { + "epoch": 0.08906727828746178, + "grad_norm": 0.32120398404390094, + "learning_rate": 9.919476119545066e-06, + "loss": 0.3613, + "num_tokens": 95530537.0, + "step": 233 + }, + { + "epoch": 0.08944954128440367, + "grad_norm": 0.27343332503880463, + "learning_rate": 9.918423282160945e-06, + "loss": 0.345, + "num_tokens": 95941819.0, + "step": 234 + }, + { + "epoch": 0.08983180428134556, + "grad_norm": 0.294056649449384, + "learning_rate": 9.917363669515133e-06, + "loss": 0.3477, + "num_tokens": 96355176.0, + "step": 235 + }, + { + "epoch": 0.09021406727828746, + "grad_norm": 0.29456328672560694, + "learning_rate": 9.916297283232456e-06, + "loss": 0.3618, + "num_tokens": 96772162.0, + "step": 236 + }, + { + "epoch": 0.09059633027522936, + "grad_norm": 0.30400646807334347, + "learning_rate": 9.915224124948119e-06, + "loss": 0.3963, + "num_tokens": 97250778.0, + "step": 237 + }, + { + "epoch": 0.09097859327217125, + "grad_norm": 0.27433780692105025, + "learning_rate": 9.914144196307721e-06, + "loss": 0.3454, + "num_tokens": 97654025.0, + "step": 238 + }, + { + "epoch": 0.09136085626911315, + "grad_norm": 0.2762680234947099, + "learning_rate": 9.913057498967233e-06, + "loss": 0.3801, + "num_tokens": 98081943.0, + "step": 239 + }, + { + "epoch": 0.09174311926605505, + "grad_norm": 0.29782204047001254, + "learning_rate": 9.911964034593013e-06, + "loss": 0.3629, + "num_tokens": 98491222.0, + "step": 240 + }, + { + "epoch": 0.09212538226299694, + "grad_norm": 0.27543729415384416, + "learning_rate": 9.910863804861788e-06, + "loss": 0.342, + "num_tokens": 98933450.0, + "step": 241 + }, + { + "epoch": 0.09250764525993883, + "grad_norm": 0.27333664460770063, + "learning_rate": 9.909756811460664e-06, + "loss": 0.3615, + "num_tokens": 99356292.0, + "step": 242 + }, + { + "epoch": 0.09288990825688073, + "grad_norm": 0.312849706925737, + "learning_rate": 9.908643056087121e-06, + "loss": 0.384, + "num_tokens": 99807715.0, + "step": 243 + }, + { + "epoch": 0.09327217125382263, + "grad_norm": 0.29137130049453147, + "learning_rate": 9.907522540449002e-06, + "loss": 0.3499, + "num_tokens": 100189698.0, + "step": 244 + }, + { + "epoch": 0.09365443425076453, + "grad_norm": 0.3151050491963121, + "learning_rate": 9.906395266264517e-06, + "loss": 0.3781, + "num_tokens": 100660902.0, + "step": 245 + }, + { + "epoch": 0.09403669724770643, + "grad_norm": 0.3483087694005124, + "learning_rate": 9.905261235262244e-06, + "loss": 0.3656, + "num_tokens": 101086006.0, + "step": 246 + }, + { + "epoch": 0.09441896024464831, + "grad_norm": 0.3183839353077628, + "learning_rate": 9.904120449181117e-06, + "loss": 0.348, + "num_tokens": 101465165.0, + "step": 247 + }, + { + "epoch": 0.09480122324159021, + "grad_norm": 0.273075235344002, + "learning_rate": 9.902972909770433e-06, + "loss": 0.3955, + "num_tokens": 101888824.0, + "step": 248 + }, + { + "epoch": 0.09518348623853211, + "grad_norm": 0.30978642198601963, + "learning_rate": 9.901818618789841e-06, + "loss": 0.3704, + "num_tokens": 102306874.0, + "step": 249 + }, + { + "epoch": 0.095565749235474, + "grad_norm": 0.3035045757508297, + "learning_rate": 9.900657578009344e-06, + "loss": 0.3611, + "num_tokens": 102700798.0, + "step": 250 + }, + { + "epoch": 0.0959480122324159, + "grad_norm": 0.29426419140397553, + "learning_rate": 9.899489789209298e-06, + "loss": 0.3667, + "num_tokens": 103096621.0, + "step": 251 + }, + { + "epoch": 0.0963302752293578, + "grad_norm": 0.3485697258697353, + "learning_rate": 9.8983152541804e-06, + "loss": 0.3662, + "num_tokens": 103539608.0, + "step": 252 + }, + { + "epoch": 0.09671253822629969, + "grad_norm": 0.3363981229241188, + "learning_rate": 9.897133974723698e-06, + "loss": 0.3815, + "num_tokens": 103963335.0, + "step": 253 + }, + { + "epoch": 0.09709480122324159, + "grad_norm": 0.26769732540098695, + "learning_rate": 9.89594595265058e-06, + "loss": 0.3661, + "num_tokens": 104402413.0, + "step": 254 + }, + { + "epoch": 0.09747706422018348, + "grad_norm": 0.29052837736053017, + "learning_rate": 9.894751189782773e-06, + "loss": 0.3698, + "num_tokens": 104812212.0, + "step": 255 + }, + { + "epoch": 0.09785932721712538, + "grad_norm": 0.30185424646274334, + "learning_rate": 9.893549687952337e-06, + "loss": 0.3696, + "num_tokens": 105208103.0, + "step": 256 + }, + { + "epoch": 0.09824159021406728, + "grad_norm": 0.272791811311962, + "learning_rate": 9.892341449001673e-06, + "loss": 0.3654, + "num_tokens": 105619143.0, + "step": 257 + }, + { + "epoch": 0.09862385321100918, + "grad_norm": 0.28252810251886107, + "learning_rate": 9.891126474783507e-06, + "loss": 0.3641, + "num_tokens": 106082216.0, + "step": 258 + }, + { + "epoch": 0.09900611620795106, + "grad_norm": 0.29207208596330475, + "learning_rate": 9.889904767160892e-06, + "loss": 0.371, + "num_tokens": 106536265.0, + "step": 259 + }, + { + "epoch": 0.09938837920489296, + "grad_norm": 0.290941329782096, + "learning_rate": 9.888676328007215e-06, + "loss": 0.3628, + "num_tokens": 106964217.0, + "step": 260 + }, + { + "epoch": 0.09977064220183486, + "grad_norm": 0.2817806507691782, + "learning_rate": 9.887441159206173e-06, + "loss": 0.3444, + "num_tokens": 107385148.0, + "step": 261 + }, + { + "epoch": 0.10015290519877676, + "grad_norm": 0.27229244163180977, + "learning_rate": 9.886199262651792e-06, + "loss": 0.3602, + "num_tokens": 107772298.0, + "step": 262 + }, + { + "epoch": 0.10053516819571866, + "grad_norm": 0.2610174333568875, + "learning_rate": 9.884950640248406e-06, + "loss": 0.345, + "num_tokens": 108182473.0, + "step": 263 + }, + { + "epoch": 0.10091743119266056, + "grad_norm": 0.3020670783261359, + "learning_rate": 9.883695293910674e-06, + "loss": 0.3566, + "num_tokens": 108546122.0, + "step": 264 + }, + { + "epoch": 0.10129969418960244, + "grad_norm": 0.31040224708867337, + "learning_rate": 9.882433225563553e-06, + "loss": 0.3527, + "num_tokens": 108938626.0, + "step": 265 + }, + { + "epoch": 0.10168195718654434, + "grad_norm": 0.29221050269568544, + "learning_rate": 9.881164437142316e-06, + "loss": 0.3467, + "num_tokens": 109331960.0, + "step": 266 + }, + { + "epoch": 0.10206422018348624, + "grad_norm": 0.29095939215956507, + "learning_rate": 9.879888930592535e-06, + "loss": 0.3827, + "num_tokens": 109737450.0, + "step": 267 + }, + { + "epoch": 0.10244648318042814, + "grad_norm": 0.2714268725173217, + "learning_rate": 9.87860670787009e-06, + "loss": 0.3606, + "num_tokens": 110152085.0, + "step": 268 + }, + { + "epoch": 0.10282874617737003, + "grad_norm": 0.2749502242222535, + "learning_rate": 9.877317770941155e-06, + "loss": 0.3696, + "num_tokens": 110580750.0, + "step": 269 + }, + { + "epoch": 0.10321100917431193, + "grad_norm": 0.28202363086242993, + "learning_rate": 9.8760221217822e-06, + "loss": 0.3719, + "num_tokens": 110992053.0, + "step": 270 + }, + { + "epoch": 0.10359327217125382, + "grad_norm": 0.2939169912519912, + "learning_rate": 9.874719762379989e-06, + "loss": 0.3904, + "num_tokens": 111375803.0, + "step": 271 + }, + { + "epoch": 0.10397553516819572, + "grad_norm": 0.2524585115843448, + "learning_rate": 9.873410694731577e-06, + "loss": 0.3729, + "num_tokens": 111797446.0, + "step": 272 + }, + { + "epoch": 0.10435779816513761, + "grad_norm": 0.330271338847113, + "learning_rate": 9.872094920844301e-06, + "loss": 0.3522, + "num_tokens": 112210690.0, + "step": 273 + }, + { + "epoch": 0.10474006116207951, + "grad_norm": 0.3173092968895186, + "learning_rate": 9.870772442735786e-06, + "loss": 0.4027, + "num_tokens": 112663677.0, + "step": 274 + }, + { + "epoch": 0.10512232415902141, + "grad_norm": 0.2611720509884505, + "learning_rate": 9.869443262433934e-06, + "loss": 0.3868, + "num_tokens": 113117271.0, + "step": 275 + }, + { + "epoch": 0.10550458715596331, + "grad_norm": 0.2970284419669802, + "learning_rate": 9.868107381976923e-06, + "loss": 0.3741, + "num_tokens": 113518142.0, + "step": 276 + }, + { + "epoch": 0.1058868501529052, + "grad_norm": 0.29220238684960315, + "learning_rate": 9.866764803413215e-06, + "loss": 0.3629, + "num_tokens": 113909217.0, + "step": 277 + }, + { + "epoch": 0.10626911314984709, + "grad_norm": 0.3165983400943323, + "learning_rate": 9.865415528801527e-06, + "loss": 0.3956, + "num_tokens": 114297637.0, + "step": 278 + }, + { + "epoch": 0.10665137614678899, + "grad_norm": 0.27854892660119135, + "learning_rate": 9.864059560210858e-06, + "loss": 0.366, + "num_tokens": 114726773.0, + "step": 279 + }, + { + "epoch": 0.10703363914373089, + "grad_norm": 0.2761010119890071, + "learning_rate": 9.862696899720465e-06, + "loss": 0.3523, + "num_tokens": 115115080.0, + "step": 280 + }, + { + "epoch": 0.10741590214067279, + "grad_norm": 0.30788357270337063, + "learning_rate": 9.861327549419866e-06, + "loss": 0.3495, + "num_tokens": 115516715.0, + "step": 281 + }, + { + "epoch": 0.10779816513761468, + "grad_norm": 0.312954310400862, + "learning_rate": 9.85995151140884e-06, + "loss": 0.3628, + "num_tokens": 115941499.0, + "step": 282 + }, + { + "epoch": 0.10818042813455657, + "grad_norm": 0.30918221576751287, + "learning_rate": 9.85856878779742e-06, + "loss": 0.3922, + "num_tokens": 116370691.0, + "step": 283 + }, + { + "epoch": 0.10856269113149847, + "grad_norm": 0.31223777670382713, + "learning_rate": 9.857179380705887e-06, + "loss": 0.3821, + "num_tokens": 116773367.0, + "step": 284 + }, + { + "epoch": 0.10894495412844037, + "grad_norm": 0.27289945113722036, + "learning_rate": 9.855783292264781e-06, + "loss": 0.3629, + "num_tokens": 117175862.0, + "step": 285 + }, + { + "epoch": 0.10932721712538226, + "grad_norm": 0.2909587041265393, + "learning_rate": 9.854380524614874e-06, + "loss": 0.3811, + "num_tokens": 117555972.0, + "step": 286 + }, + { + "epoch": 0.10970948012232416, + "grad_norm": 0.3508899918081021, + "learning_rate": 9.852971079907189e-06, + "loss": 0.3908, + "num_tokens": 117989763.0, + "step": 287 + }, + { + "epoch": 0.11009174311926606, + "grad_norm": 0.331276353344019, + "learning_rate": 9.851554960302982e-06, + "loss": 0.3676, + "num_tokens": 118433333.0, + "step": 288 + }, + { + "epoch": 0.11047400611620795, + "grad_norm": 0.2924357482752978, + "learning_rate": 9.85013216797375e-06, + "loss": 0.3476, + "num_tokens": 118829868.0, + "step": 289 + }, + { + "epoch": 0.11085626911314984, + "grad_norm": 0.2997056422292096, + "learning_rate": 9.848702705101222e-06, + "loss": 0.3617, + "num_tokens": 119211420.0, + "step": 290 + }, + { + "epoch": 0.11123853211009174, + "grad_norm": 0.3742836379630399, + "learning_rate": 9.847266573877346e-06, + "loss": 0.3826, + "num_tokens": 119640407.0, + "step": 291 + }, + { + "epoch": 0.11162079510703364, + "grad_norm": 0.3216917969373817, + "learning_rate": 9.845823776504308e-06, + "loss": 0.3696, + "num_tokens": 120048713.0, + "step": 292 + }, + { + "epoch": 0.11200305810397554, + "grad_norm": 0.3027352030982205, + "learning_rate": 9.844374315194508e-06, + "loss": 0.3745, + "num_tokens": 120422430.0, + "step": 293 + }, + { + "epoch": 0.11238532110091744, + "grad_norm": 0.3071429341787571, + "learning_rate": 9.842918192170567e-06, + "loss": 0.3861, + "num_tokens": 120812256.0, + "step": 294 + }, + { + "epoch": 0.11276758409785932, + "grad_norm": 0.29715278251405547, + "learning_rate": 9.841455409665322e-06, + "loss": 0.3505, + "num_tokens": 121203984.0, + "step": 295 + }, + { + "epoch": 0.11314984709480122, + "grad_norm": 0.26601257484399915, + "learning_rate": 9.83998596992182e-06, + "loss": 0.3556, + "num_tokens": 121636080.0, + "step": 296 + }, + { + "epoch": 0.11353211009174312, + "grad_norm": 0.29965657210105656, + "learning_rate": 9.838509875193317e-06, + "loss": 0.3637, + "num_tokens": 122061987.0, + "step": 297 + }, + { + "epoch": 0.11391437308868502, + "grad_norm": 0.27848002752496387, + "learning_rate": 9.837027127743275e-06, + "loss": 0.3641, + "num_tokens": 122410943.0, + "step": 298 + }, + { + "epoch": 0.11429663608562692, + "grad_norm": 0.2682770864804808, + "learning_rate": 9.835537729845352e-06, + "loss": 0.3948, + "num_tokens": 122848976.0, + "step": 299 + }, + { + "epoch": 0.11467889908256881, + "grad_norm": 0.28555695033122447, + "learning_rate": 9.834041683783413e-06, + "loss": 0.3553, + "num_tokens": 123232302.0, + "step": 300 + }, + { + "epoch": 0.1150611620795107, + "grad_norm": 0.29283430135149235, + "learning_rate": 9.83253899185151e-06, + "loss": 0.3595, + "num_tokens": 123620339.0, + "step": 301 + }, + { + "epoch": 0.1154434250764526, + "grad_norm": 0.3327873933315915, + "learning_rate": 9.83102965635389e-06, + "loss": 0.3748, + "num_tokens": 124011849.0, + "step": 302 + }, + { + "epoch": 0.1158256880733945, + "grad_norm": 0.2724899431720382, + "learning_rate": 9.829513679604983e-06, + "loss": 0.3506, + "num_tokens": 124450845.0, + "step": 303 + }, + { + "epoch": 0.1162079510703364, + "grad_norm": 0.2770946104901708, + "learning_rate": 9.827991063929407e-06, + "loss": 0.3644, + "num_tokens": 124862699.0, + "step": 304 + }, + { + "epoch": 0.11659021406727829, + "grad_norm": 0.2915694326733997, + "learning_rate": 9.826461811661959e-06, + "loss": 0.3561, + "num_tokens": 125249693.0, + "step": 305 + }, + { + "epoch": 0.11697247706422019, + "grad_norm": 0.28170953782047087, + "learning_rate": 9.824925925147611e-06, + "loss": 0.3668, + "num_tokens": 125640121.0, + "step": 306 + }, + { + "epoch": 0.11735474006116207, + "grad_norm": 0.3049371740679874, + "learning_rate": 9.823383406741511e-06, + "loss": 0.3833, + "num_tokens": 126033668.0, + "step": 307 + }, + { + "epoch": 0.11773700305810397, + "grad_norm": 0.3014529343668211, + "learning_rate": 9.821834258808973e-06, + "loss": 0.3537, + "num_tokens": 126459451.0, + "step": 308 + }, + { + "epoch": 0.11811926605504587, + "grad_norm": 0.30241226769117924, + "learning_rate": 9.82027848372548e-06, + "loss": 0.3734, + "num_tokens": 126894820.0, + "step": 309 + }, + { + "epoch": 0.11850152905198777, + "grad_norm": 0.2885703862024239, + "learning_rate": 9.818716083876672e-06, + "loss": 0.3588, + "num_tokens": 127274240.0, + "step": 310 + }, + { + "epoch": 0.11888379204892967, + "grad_norm": 0.2805862007763283, + "learning_rate": 9.817147061658357e-06, + "loss": 0.3667, + "num_tokens": 127664119.0, + "step": 311 + }, + { + "epoch": 0.11926605504587157, + "grad_norm": 0.2594680894998848, + "learning_rate": 9.815571419476488e-06, + "loss": 0.3727, + "num_tokens": 128072004.0, + "step": 312 + }, + { + "epoch": 0.11964831804281345, + "grad_norm": 0.26692183256572516, + "learning_rate": 9.813989159747173e-06, + "loss": 0.3666, + "num_tokens": 128516007.0, + "step": 313 + }, + { + "epoch": 0.12003058103975535, + "grad_norm": 0.284716987141419, + "learning_rate": 9.81240028489667e-06, + "loss": 0.3596, + "num_tokens": 128893511.0, + "step": 314 + }, + { + "epoch": 0.12041284403669725, + "grad_norm": 0.2712764803922657, + "learning_rate": 9.810804797361374e-06, + "loss": 0.368, + "num_tokens": 129272733.0, + "step": 315 + }, + { + "epoch": 0.12079510703363915, + "grad_norm": 0.27976400887056985, + "learning_rate": 9.809202699587828e-06, + "loss": 0.3645, + "num_tokens": 129670952.0, + "step": 316 + }, + { + "epoch": 0.12117737003058104, + "grad_norm": 0.3127815042159146, + "learning_rate": 9.807593994032706e-06, + "loss": 0.3977, + "num_tokens": 130121126.0, + "step": 317 + }, + { + "epoch": 0.12155963302752294, + "grad_norm": 0.2863769224906313, + "learning_rate": 9.805978683162816e-06, + "loss": 0.3939, + "num_tokens": 130583458.0, + "step": 318 + }, + { + "epoch": 0.12194189602446483, + "grad_norm": 0.27258847923530244, + "learning_rate": 9.804356769455092e-06, + "loss": 0.3482, + "num_tokens": 131000170.0, + "step": 319 + }, + { + "epoch": 0.12232415902140673, + "grad_norm": 0.271669391745868, + "learning_rate": 9.802728255396602e-06, + "loss": 0.3568, + "num_tokens": 131386432.0, + "step": 320 + }, + { + "epoch": 0.12270642201834862, + "grad_norm": 0.33241178596873316, + "learning_rate": 9.801093143484521e-06, + "loss": 0.3848, + "num_tokens": 131754797.0, + "step": 321 + }, + { + "epoch": 0.12308868501529052, + "grad_norm": 0.34931101181525115, + "learning_rate": 9.799451436226151e-06, + "loss": 0.3908, + "num_tokens": 132160224.0, + "step": 322 + }, + { + "epoch": 0.12347094801223242, + "grad_norm": 0.2954467177276294, + "learning_rate": 9.797803136138907e-06, + "loss": 0.3649, + "num_tokens": 132581368.0, + "step": 323 + }, + { + "epoch": 0.12385321100917432, + "grad_norm": 0.30971607929524986, + "learning_rate": 9.796148245750313e-06, + "loss": 0.3348, + "num_tokens": 132951299.0, + "step": 324 + }, + { + "epoch": 0.1242354740061162, + "grad_norm": 0.3188511135371351, + "learning_rate": 9.794486767597992e-06, + "loss": 0.3983, + "num_tokens": 133400887.0, + "step": 325 + }, + { + "epoch": 0.1246177370030581, + "grad_norm": 0.27615267996919446, + "learning_rate": 9.792818704229677e-06, + "loss": 0.3697, + "num_tokens": 133856780.0, + "step": 326 + }, + { + "epoch": 0.125, + "grad_norm": 0.35218491803529917, + "learning_rate": 9.791144058203194e-06, + "loss": 0.3819, + "num_tokens": 134313328.0, + "step": 327 + }, + { + "epoch": 0.12538226299694188, + "grad_norm": 0.361793076454163, + "learning_rate": 9.789462832086468e-06, + "loss": 0.3751, + "num_tokens": 134706925.0, + "step": 328 + }, + { + "epoch": 0.1257645259938838, + "grad_norm": 0.28318972146916227, + "learning_rate": 9.787775028457506e-06, + "loss": 0.3688, + "num_tokens": 135121368.0, + "step": 329 + }, + { + "epoch": 0.12614678899082568, + "grad_norm": 0.2788107106873188, + "learning_rate": 9.786080649904409e-06, + "loss": 0.368, + "num_tokens": 135543059.0, + "step": 330 + }, + { + "epoch": 0.1265290519877676, + "grad_norm": 0.3551870330735956, + "learning_rate": 9.784379699025358e-06, + "loss": 0.368, + "num_tokens": 135932194.0, + "step": 331 + }, + { + "epoch": 0.12691131498470948, + "grad_norm": 0.26915650405734415, + "learning_rate": 9.782672178428607e-06, + "loss": 0.3676, + "num_tokens": 136372866.0, + "step": 332 + }, + { + "epoch": 0.12729357798165136, + "grad_norm": 0.25923800088776544, + "learning_rate": 9.78095809073249e-06, + "loss": 0.354, + "num_tokens": 136783472.0, + "step": 333 + }, + { + "epoch": 0.12767584097859327, + "grad_norm": 0.2739029527314487, + "learning_rate": 9.77923743856541e-06, + "loss": 0.352, + "num_tokens": 137193968.0, + "step": 334 + }, + { + "epoch": 0.12805810397553516, + "grad_norm": 0.2768964480829531, + "learning_rate": 9.777510224565834e-06, + "loss": 0.362, + "num_tokens": 137595702.0, + "step": 335 + }, + { + "epoch": 0.12844036697247707, + "grad_norm": 0.3178493502924472, + "learning_rate": 9.775776451382292e-06, + "loss": 0.3686, + "num_tokens": 138025233.0, + "step": 336 + }, + { + "epoch": 0.12882262996941896, + "grad_norm": 0.2985756541941514, + "learning_rate": 9.774036121673374e-06, + "loss": 0.3759, + "num_tokens": 138439624.0, + "step": 337 + }, + { + "epoch": 0.12920489296636087, + "grad_norm": 0.29629539600652666, + "learning_rate": 9.772289238107717e-06, + "loss": 0.355, + "num_tokens": 138860597.0, + "step": 338 + }, + { + "epoch": 0.12958715596330275, + "grad_norm": 0.2656162002560928, + "learning_rate": 9.770535803364014e-06, + "loss": 0.3777, + "num_tokens": 139267517.0, + "step": 339 + }, + { + "epoch": 0.12996941896024464, + "grad_norm": 0.3079874135040319, + "learning_rate": 9.768775820131008e-06, + "loss": 0.3921, + "num_tokens": 139694958.0, + "step": 340 + }, + { + "epoch": 0.13035168195718655, + "grad_norm": 0.2906988544585396, + "learning_rate": 9.767009291107471e-06, + "loss": 0.3749, + "num_tokens": 140079416.0, + "step": 341 + }, + { + "epoch": 0.13073394495412843, + "grad_norm": 0.29327212809124914, + "learning_rate": 9.765236219002223e-06, + "loss": 0.3842, + "num_tokens": 140523685.0, + "step": 342 + }, + { + "epoch": 0.13111620795107035, + "grad_norm": 0.28019234806606597, + "learning_rate": 9.763456606534112e-06, + "loss": 0.3536, + "num_tokens": 140950392.0, + "step": 343 + }, + { + "epoch": 0.13149847094801223, + "grad_norm": 0.24897455668458857, + "learning_rate": 9.761670456432016e-06, + "loss": 0.3592, + "num_tokens": 141380785.0, + "step": 344 + }, + { + "epoch": 0.13188073394495411, + "grad_norm": 0.3054569832855343, + "learning_rate": 9.75987777143484e-06, + "loss": 0.379, + "num_tokens": 141792563.0, + "step": 345 + }, + { + "epoch": 0.13226299694189603, + "grad_norm": 0.2659266431374649, + "learning_rate": 9.758078554291505e-06, + "loss": 0.3612, + "num_tokens": 142162144.0, + "step": 346 + }, + { + "epoch": 0.1326452599388379, + "grad_norm": 0.28333485294806937, + "learning_rate": 9.756272807760954e-06, + "loss": 0.3574, + "num_tokens": 142520857.0, + "step": 347 + }, + { + "epoch": 0.13302752293577982, + "grad_norm": 0.3734088636202502, + "learning_rate": 9.75446053461214e-06, + "loss": 0.3676, + "num_tokens": 142917677.0, + "step": 348 + }, + { + "epoch": 0.1334097859327217, + "grad_norm": 0.29278447614583514, + "learning_rate": 9.752641737624023e-06, + "loss": 0.3812, + "num_tokens": 143317782.0, + "step": 349 + }, + { + "epoch": 0.13379204892966362, + "grad_norm": 0.28446614185529623, + "learning_rate": 9.750816419585569e-06, + "loss": 0.3708, + "num_tokens": 143729772.0, + "step": 350 + }, + { + "epoch": 0.1341743119266055, + "grad_norm": 0.23898402619912196, + "learning_rate": 9.748984583295736e-06, + "loss": 0.3644, + "num_tokens": 144178116.0, + "step": 351 + }, + { + "epoch": 0.1345565749235474, + "grad_norm": 0.27653297398599813, + "learning_rate": 9.747146231563491e-06, + "loss": 0.3704, + "num_tokens": 144575763.0, + "step": 352 + }, + { + "epoch": 0.1349388379204893, + "grad_norm": 0.2887505187465025, + "learning_rate": 9.74530136720778e-06, + "loss": 0.3718, + "num_tokens": 144968040.0, + "step": 353 + }, + { + "epoch": 0.1353211009174312, + "grad_norm": 0.26690262658958475, + "learning_rate": 9.743449993057537e-06, + "loss": 0.3987, + "num_tokens": 145435439.0, + "step": 354 + }, + { + "epoch": 0.1357033639143731, + "grad_norm": 0.2856814337231243, + "learning_rate": 9.741592111951687e-06, + "loss": 0.3693, + "num_tokens": 145839015.0, + "step": 355 + }, + { + "epoch": 0.13608562691131498, + "grad_norm": 0.24839206187457766, + "learning_rate": 9.739727726739122e-06, + "loss": 0.3769, + "num_tokens": 146287347.0, + "step": 356 + }, + { + "epoch": 0.13646788990825687, + "grad_norm": 0.25246604863974276, + "learning_rate": 9.737856840278713e-06, + "loss": 0.3612, + "num_tokens": 146671063.0, + "step": 357 + }, + { + "epoch": 0.13685015290519878, + "grad_norm": 0.3233202343500254, + "learning_rate": 9.7359794554393e-06, + "loss": 0.3692, + "num_tokens": 147076604.0, + "step": 358 + }, + { + "epoch": 0.13723241590214066, + "grad_norm": 0.34497878728322334, + "learning_rate": 9.734095575099684e-06, + "loss": 0.375, + "num_tokens": 147443378.0, + "step": 359 + }, + { + "epoch": 0.13761467889908258, + "grad_norm": 0.2975012994682014, + "learning_rate": 9.732205202148631e-06, + "loss": 0.3597, + "num_tokens": 147846174.0, + "step": 360 + }, + { + "epoch": 0.13799694189602446, + "grad_norm": 0.3009871494813123, + "learning_rate": 9.730308339484862e-06, + "loss": 0.3823, + "num_tokens": 148265170.0, + "step": 361 + }, + { + "epoch": 0.13837920489296637, + "grad_norm": 0.3269887556100033, + "learning_rate": 9.728404990017046e-06, + "loss": 0.3714, + "num_tokens": 148645677.0, + "step": 362 + }, + { + "epoch": 0.13876146788990826, + "grad_norm": 0.3486884596736548, + "learning_rate": 9.726495156663803e-06, + "loss": 0.3587, + "num_tokens": 149014119.0, + "step": 363 + }, + { + "epoch": 0.13914373088685014, + "grad_norm": 0.2741666982338165, + "learning_rate": 9.724578842353695e-06, + "loss": 0.3454, + "num_tokens": 149397613.0, + "step": 364 + }, + { + "epoch": 0.13952599388379205, + "grad_norm": 0.28797219330893853, + "learning_rate": 9.722656050025216e-06, + "loss": 0.3664, + "num_tokens": 149837858.0, + "step": 365 + }, + { + "epoch": 0.13990825688073394, + "grad_norm": 0.3049664442019357, + "learning_rate": 9.720726782626801e-06, + "loss": 0.3736, + "num_tokens": 150217716.0, + "step": 366 + }, + { + "epoch": 0.14029051987767585, + "grad_norm": 0.316178060305947, + "learning_rate": 9.718791043116812e-06, + "loss": 0.386, + "num_tokens": 150618548.0, + "step": 367 + }, + { + "epoch": 0.14067278287461774, + "grad_norm": 0.2959532293284894, + "learning_rate": 9.716848834463532e-06, + "loss": 0.3678, + "num_tokens": 151057762.0, + "step": 368 + }, + { + "epoch": 0.14105504587155962, + "grad_norm": 0.2957554022117756, + "learning_rate": 9.714900159645169e-06, + "loss": 0.3779, + "num_tokens": 151466318.0, + "step": 369 + }, + { + "epoch": 0.14143730886850153, + "grad_norm": 0.29755257785291395, + "learning_rate": 9.712945021649842e-06, + "loss": 0.3558, + "num_tokens": 151857490.0, + "step": 370 + }, + { + "epoch": 0.14181957186544342, + "grad_norm": 0.28830811691023295, + "learning_rate": 9.710983423475583e-06, + "loss": 0.3759, + "num_tokens": 152254952.0, + "step": 371 + }, + { + "epoch": 0.14220183486238533, + "grad_norm": 0.2927087385898855, + "learning_rate": 9.709015368130328e-06, + "loss": 0.3551, + "num_tokens": 152626944.0, + "step": 372 + }, + { + "epoch": 0.1425840978593272, + "grad_norm": 0.2863661063037929, + "learning_rate": 9.707040858631918e-06, + "loss": 0.3872, + "num_tokens": 153076826.0, + "step": 373 + }, + { + "epoch": 0.14296636085626913, + "grad_norm": 0.2969143733041409, + "learning_rate": 9.705059898008087e-06, + "loss": 0.4003, + "num_tokens": 153513246.0, + "step": 374 + }, + { + "epoch": 0.143348623853211, + "grad_norm": 0.27998416781365243, + "learning_rate": 9.703072489296467e-06, + "loss": 0.3886, + "num_tokens": 153928116.0, + "step": 375 + }, + { + "epoch": 0.1437308868501529, + "grad_norm": 0.2740751028842675, + "learning_rate": 9.70107863554457e-06, + "loss": 0.3627, + "num_tokens": 154338955.0, + "step": 376 + }, + { + "epoch": 0.1441131498470948, + "grad_norm": 0.2618085061553179, + "learning_rate": 9.699078339809793e-06, + "loss": 0.3555, + "num_tokens": 154722147.0, + "step": 377 + }, + { + "epoch": 0.1444954128440367, + "grad_norm": 0.26328965045222535, + "learning_rate": 9.697071605159418e-06, + "loss": 0.3624, + "num_tokens": 155115005.0, + "step": 378 + }, + { + "epoch": 0.1448776758409786, + "grad_norm": 0.27170767317557487, + "learning_rate": 9.69505843467059e-06, + "loss": 0.379, + "num_tokens": 155552529.0, + "step": 379 + }, + { + "epoch": 0.1452599388379205, + "grad_norm": 0.2994766149066218, + "learning_rate": 9.693038831430332e-06, + "loss": 0.3757, + "num_tokens": 155956435.0, + "step": 380 + }, + { + "epoch": 0.14564220183486237, + "grad_norm": 0.3216199872305448, + "learning_rate": 9.691012798535524e-06, + "loss": 0.3743, + "num_tokens": 156358518.0, + "step": 381 + }, + { + "epoch": 0.14602446483180428, + "grad_norm": 0.2724435265094422, + "learning_rate": 9.68898033909291e-06, + "loss": 0.3548, + "num_tokens": 156802220.0, + "step": 382 + }, + { + "epoch": 0.14640672782874617, + "grad_norm": 0.2733083458726887, + "learning_rate": 9.686941456219088e-06, + "loss": 0.3648, + "num_tokens": 157181770.0, + "step": 383 + }, + { + "epoch": 0.14678899082568808, + "grad_norm": 0.2683717534178117, + "learning_rate": 9.684896153040504e-06, + "loss": 0.3707, + "num_tokens": 157631260.0, + "step": 384 + }, + { + "epoch": 0.14717125382262997, + "grad_norm": 0.2797821278974627, + "learning_rate": 9.682844432693447e-06, + "loss": 0.3517, + "num_tokens": 158081752.0, + "step": 385 + }, + { + "epoch": 0.14755351681957188, + "grad_norm": 0.3021244746434412, + "learning_rate": 9.680786298324054e-06, + "loss": 0.385, + "num_tokens": 158503533.0, + "step": 386 + }, + { + "epoch": 0.14793577981651376, + "grad_norm": 0.30084371736775684, + "learning_rate": 9.67872175308829e-06, + "loss": 0.3587, + "num_tokens": 158886791.0, + "step": 387 + }, + { + "epoch": 0.14831804281345565, + "grad_norm": 0.28898401069489127, + "learning_rate": 9.67665080015195e-06, + "loss": 0.3944, + "num_tokens": 159326374.0, + "step": 388 + }, + { + "epoch": 0.14870030581039756, + "grad_norm": 0.27620905341201196, + "learning_rate": 9.674573442690658e-06, + "loss": 0.349, + "num_tokens": 159709556.0, + "step": 389 + }, + { + "epoch": 0.14908256880733944, + "grad_norm": 0.29430023036015573, + "learning_rate": 9.672489683889862e-06, + "loss": 0.3652, + "num_tokens": 160123233.0, + "step": 390 + }, + { + "epoch": 0.14946483180428136, + "grad_norm": 0.29224260409863767, + "learning_rate": 9.67039952694482e-06, + "loss": 0.3664, + "num_tokens": 160559424.0, + "step": 391 + }, + { + "epoch": 0.14984709480122324, + "grad_norm": 0.30496832909422966, + "learning_rate": 9.6683029750606e-06, + "loss": 0.3999, + "num_tokens": 160939165.0, + "step": 392 + }, + { + "epoch": 0.15022935779816513, + "grad_norm": 0.2603623646692436, + "learning_rate": 9.666200031452084e-06, + "loss": 0.3774, + "num_tokens": 161373163.0, + "step": 393 + }, + { + "epoch": 0.15061162079510704, + "grad_norm": 0.32390633772363503, + "learning_rate": 9.664090699343948e-06, + "loss": 0.3982, + "num_tokens": 161780011.0, + "step": 394 + }, + { + "epoch": 0.15099388379204892, + "grad_norm": 0.3260661267973067, + "learning_rate": 9.661974981970665e-06, + "loss": 0.3752, + "num_tokens": 162187042.0, + "step": 395 + }, + { + "epoch": 0.15137614678899083, + "grad_norm": 0.3354818920882661, + "learning_rate": 9.659852882576502e-06, + "loss": 0.3691, + "num_tokens": 162571052.0, + "step": 396 + }, + { + "epoch": 0.15175840978593272, + "grad_norm": 0.2716963089106391, + "learning_rate": 9.65772440441551e-06, + "loss": 0.36, + "num_tokens": 162997756.0, + "step": 397 + }, + { + "epoch": 0.15214067278287463, + "grad_norm": 0.2891863031044559, + "learning_rate": 9.655589550751525e-06, + "loss": 0.383, + "num_tokens": 163449837.0, + "step": 398 + }, + { + "epoch": 0.15252293577981652, + "grad_norm": 0.31749094295800434, + "learning_rate": 9.653448324858151e-06, + "loss": 0.3969, + "num_tokens": 163835806.0, + "step": 399 + }, + { + "epoch": 0.1529051987767584, + "grad_norm": 0.30000356817837964, + "learning_rate": 9.651300730018776e-06, + "loss": 0.3836, + "num_tokens": 164276300.0, + "step": 400 + }, + { + "epoch": 0.1532874617737003, + "grad_norm": 0.31268242628477666, + "learning_rate": 9.64914676952654e-06, + "loss": 0.3938, + "num_tokens": 164726852.0, + "step": 401 + }, + { + "epoch": 0.1536697247706422, + "grad_norm": 0.2551289872017507, + "learning_rate": 9.646986446684357e-06, + "loss": 0.3529, + "num_tokens": 165146972.0, + "step": 402 + }, + { + "epoch": 0.1540519877675841, + "grad_norm": 0.3207304470525379, + "learning_rate": 9.644819764804888e-06, + "loss": 0.3937, + "num_tokens": 165587373.0, + "step": 403 + }, + { + "epoch": 0.154434250764526, + "grad_norm": 0.3350743575339698, + "learning_rate": 9.642646727210546e-06, + "loss": 0.3956, + "num_tokens": 165999875.0, + "step": 404 + }, + { + "epoch": 0.15481651376146788, + "grad_norm": 0.30082830614480544, + "learning_rate": 9.640467337233496e-06, + "loss": 0.3674, + "num_tokens": 166382211.0, + "step": 405 + }, + { + "epoch": 0.1551987767584098, + "grad_norm": 0.2891372838044858, + "learning_rate": 9.638281598215637e-06, + "loss": 0.3812, + "num_tokens": 166771480.0, + "step": 406 + }, + { + "epoch": 0.15558103975535167, + "grad_norm": 0.31050020435082976, + "learning_rate": 9.636089513508612e-06, + "loss": 0.3813, + "num_tokens": 167172230.0, + "step": 407 + }, + { + "epoch": 0.1559633027522936, + "grad_norm": 0.3031677280970331, + "learning_rate": 9.633891086473783e-06, + "loss": 0.3737, + "num_tokens": 167596027.0, + "step": 408 + }, + { + "epoch": 0.15634556574923547, + "grad_norm": 0.305489363988088, + "learning_rate": 9.631686320482245e-06, + "loss": 0.3808, + "num_tokens": 168008679.0, + "step": 409 + }, + { + "epoch": 0.15672782874617738, + "grad_norm": 0.27970483552619974, + "learning_rate": 9.629475218914816e-06, + "loss": 0.3563, + "num_tokens": 168407899.0, + "step": 410 + }, + { + "epoch": 0.15711009174311927, + "grad_norm": 0.26239782662158345, + "learning_rate": 9.62725778516202e-06, + "loss": 0.3797, + "num_tokens": 168853815.0, + "step": 411 + }, + { + "epoch": 0.15749235474006115, + "grad_norm": 0.26246837740583684, + "learning_rate": 9.625034022624097e-06, + "loss": 0.3543, + "num_tokens": 169223226.0, + "step": 412 + }, + { + "epoch": 0.15787461773700306, + "grad_norm": 0.26826639661015544, + "learning_rate": 9.622803934710993e-06, + "loss": 0.361, + "num_tokens": 169628949.0, + "step": 413 + }, + { + "epoch": 0.15825688073394495, + "grad_norm": 0.3089112536180955, + "learning_rate": 9.620567524842347e-06, + "loss": 0.3629, + "num_tokens": 170039146.0, + "step": 414 + }, + { + "epoch": 0.15863914373088686, + "grad_norm": 0.32512697450744416, + "learning_rate": 9.618324796447497e-06, + "loss": 0.3917, + "num_tokens": 170460368.0, + "step": 415 + }, + { + "epoch": 0.15902140672782875, + "grad_norm": 0.3078157476487109, + "learning_rate": 9.61607575296547e-06, + "loss": 0.3685, + "num_tokens": 170889449.0, + "step": 416 + }, + { + "epoch": 0.15940366972477063, + "grad_norm": 0.272300828529101, + "learning_rate": 9.613820397844976e-06, + "loss": 0.3745, + "num_tokens": 171345859.0, + "step": 417 + }, + { + "epoch": 0.15978593272171254, + "grad_norm": 0.2723811264053706, + "learning_rate": 9.6115587345444e-06, + "loss": 0.3807, + "num_tokens": 171775963.0, + "step": 418 + }, + { + "epoch": 0.16016819571865443, + "grad_norm": 0.2971335275753423, + "learning_rate": 9.609290766531806e-06, + "loss": 0.3761, + "num_tokens": 172160080.0, + "step": 419 + }, + { + "epoch": 0.16055045871559634, + "grad_norm": 0.3314012890549614, + "learning_rate": 9.60701649728492e-06, + "loss": 0.368, + "num_tokens": 172567767.0, + "step": 420 + }, + { + "epoch": 0.16093272171253822, + "grad_norm": 0.33399396700972167, + "learning_rate": 9.604735930291135e-06, + "loss": 0.3922, + "num_tokens": 172972247.0, + "step": 421 + }, + { + "epoch": 0.16131498470948014, + "grad_norm": 0.27313234738123665, + "learning_rate": 9.602449069047497e-06, + "loss": 0.3544, + "num_tokens": 173379826.0, + "step": 422 + }, + { + "epoch": 0.16169724770642202, + "grad_norm": 0.3225316836157724, + "learning_rate": 9.600155917060707e-06, + "loss": 0.3664, + "num_tokens": 173785687.0, + "step": 423 + }, + { + "epoch": 0.1620795107033639, + "grad_norm": 0.3458487622018256, + "learning_rate": 9.597856477847111e-06, + "loss": 0.3638, + "num_tokens": 174213115.0, + "step": 424 + }, + { + "epoch": 0.16246177370030582, + "grad_norm": 0.30896769571826427, + "learning_rate": 9.595550754932693e-06, + "loss": 0.3855, + "num_tokens": 174592231.0, + "step": 425 + }, + { + "epoch": 0.1628440366972477, + "grad_norm": 0.2692727375471378, + "learning_rate": 9.59323875185308e-06, + "loss": 0.3551, + "num_tokens": 174982003.0, + "step": 426 + }, + { + "epoch": 0.1632262996941896, + "grad_norm": 0.3606429103340105, + "learning_rate": 9.590920472153522e-06, + "loss": 0.3593, + "num_tokens": 175409032.0, + "step": 427 + }, + { + "epoch": 0.1636085626911315, + "grad_norm": 0.29516050888962153, + "learning_rate": 9.588595919388897e-06, + "loss": 0.3574, + "num_tokens": 175827009.0, + "step": 428 + }, + { + "epoch": 0.16399082568807338, + "grad_norm": 0.3176550911962296, + "learning_rate": 9.586265097123699e-06, + "loss": 0.3924, + "num_tokens": 176221841.0, + "step": 429 + }, + { + "epoch": 0.1643730886850153, + "grad_norm": 0.31234635489501056, + "learning_rate": 9.58392800893204e-06, + "loss": 0.3752, + "num_tokens": 176657605.0, + "step": 430 + }, + { + "epoch": 0.16475535168195718, + "grad_norm": 0.3061075043522359, + "learning_rate": 9.581584658397637e-06, + "loss": 0.3655, + "num_tokens": 177053768.0, + "step": 431 + }, + { + "epoch": 0.1651376146788991, + "grad_norm": 0.2688613494807311, + "learning_rate": 9.579235049113812e-06, + "loss": 0.3418, + "num_tokens": 177460319.0, + "step": 432 + }, + { + "epoch": 0.16551987767584098, + "grad_norm": 0.2982373514630313, + "learning_rate": 9.576879184683483e-06, + "loss": 0.3668, + "num_tokens": 177891087.0, + "step": 433 + }, + { + "epoch": 0.1659021406727829, + "grad_norm": 0.26814008512660753, + "learning_rate": 9.57451706871916e-06, + "loss": 0.3748, + "num_tokens": 178309584.0, + "step": 434 + }, + { + "epoch": 0.16628440366972477, + "grad_norm": 0.3205179162143746, + "learning_rate": 9.57214870484294e-06, + "loss": 0.3655, + "num_tokens": 178686819.0, + "step": 435 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.3109140650248626, + "learning_rate": 9.569774096686498e-06, + "loss": 0.3979, + "num_tokens": 179105603.0, + "step": 436 + }, + { + "epoch": 0.16704892966360857, + "grad_norm": 0.4006651074530381, + "learning_rate": 9.567393247891087e-06, + "loss": 0.3603, + "num_tokens": 179498729.0, + "step": 437 + }, + { + "epoch": 0.16743119266055045, + "grad_norm": 0.3055036826158832, + "learning_rate": 9.565006162107527e-06, + "loss": 0.3755, + "num_tokens": 179916566.0, + "step": 438 + }, + { + "epoch": 0.16781345565749237, + "grad_norm": 0.3251595737169241, + "learning_rate": 9.562612842996203e-06, + "loss": 0.3958, + "num_tokens": 180366759.0, + "step": 439 + }, + { + "epoch": 0.16819571865443425, + "grad_norm": 0.33165817753596266, + "learning_rate": 9.560213294227061e-06, + "loss": 0.3473, + "num_tokens": 180733397.0, + "step": 440 + }, + { + "epoch": 0.16857798165137614, + "grad_norm": 0.30203021428089855, + "learning_rate": 9.557807519479595e-06, + "loss": 0.3452, + "num_tokens": 181125897.0, + "step": 441 + }, + { + "epoch": 0.16896024464831805, + "grad_norm": 0.2731981459022347, + "learning_rate": 9.555395522442847e-06, + "loss": 0.3744, + "num_tokens": 181560759.0, + "step": 442 + }, + { + "epoch": 0.16934250764525993, + "grad_norm": 0.2779944882161738, + "learning_rate": 9.552977306815403e-06, + "loss": 0.3643, + "num_tokens": 182003951.0, + "step": 443 + }, + { + "epoch": 0.16972477064220184, + "grad_norm": 0.3098125330896663, + "learning_rate": 9.550552876305383e-06, + "loss": 0.3956, + "num_tokens": 182450140.0, + "step": 444 + }, + { + "epoch": 0.17010703363914373, + "grad_norm": 0.2566730804700202, + "learning_rate": 9.548122234630438e-06, + "loss": 0.3623, + "num_tokens": 182894923.0, + "step": 445 + }, + { + "epoch": 0.1704892966360856, + "grad_norm": 0.2728091076346928, + "learning_rate": 9.54568538551774e-06, + "loss": 0.3905, + "num_tokens": 183304819.0, + "step": 446 + }, + { + "epoch": 0.17087155963302753, + "grad_norm": 0.24871517814434455, + "learning_rate": 9.543242332703983e-06, + "loss": 0.3812, + "num_tokens": 183733747.0, + "step": 447 + }, + { + "epoch": 0.1712538226299694, + "grad_norm": 0.2665952290340883, + "learning_rate": 9.54079307993537e-06, + "loss": 0.3917, + "num_tokens": 184190568.0, + "step": 448 + }, + { + "epoch": 0.17163608562691132, + "grad_norm": 0.29778086934802145, + "learning_rate": 9.538337630967618e-06, + "loss": 0.3801, + "num_tokens": 184572672.0, + "step": 449 + }, + { + "epoch": 0.1720183486238532, + "grad_norm": 0.274711025201977, + "learning_rate": 9.535875989565937e-06, + "loss": 0.3564, + "num_tokens": 184995877.0, + "step": 450 + }, + { + "epoch": 0.17240061162079512, + "grad_norm": 0.28852132868265845, + "learning_rate": 9.53340815950504e-06, + "loss": 0.39, + "num_tokens": 185389102.0, + "step": 451 + }, + { + "epoch": 0.172782874617737, + "grad_norm": 0.2810517685470472, + "learning_rate": 9.530934144569126e-06, + "loss": 0.3675, + "num_tokens": 185814287.0, + "step": 452 + }, + { + "epoch": 0.1731651376146789, + "grad_norm": 0.29795417179971595, + "learning_rate": 9.528453948551874e-06, + "loss": 0.3828, + "num_tokens": 186211065.0, + "step": 453 + }, + { + "epoch": 0.1735474006116208, + "grad_norm": 0.2988078418945623, + "learning_rate": 9.52596757525645e-06, + "loss": 0.4161, + "num_tokens": 186618556.0, + "step": 454 + }, + { + "epoch": 0.17392966360856268, + "grad_norm": 0.3041655819346514, + "learning_rate": 9.523475028495487e-06, + "loss": 0.3783, + "num_tokens": 187044102.0, + "step": 455 + }, + { + "epoch": 0.1743119266055046, + "grad_norm": 0.29069153764266276, + "learning_rate": 9.520976312091085e-06, + "loss": 0.3589, + "num_tokens": 187386889.0, + "step": 456 + }, + { + "epoch": 0.17469418960244648, + "grad_norm": 0.3010367164793014, + "learning_rate": 9.518471429874804e-06, + "loss": 0.3913, + "num_tokens": 187834420.0, + "step": 457 + }, + { + "epoch": 0.17507645259938837, + "grad_norm": 0.28465055015834334, + "learning_rate": 9.51596038568766e-06, + "loss": 0.3661, + "num_tokens": 188251183.0, + "step": 458 + }, + { + "epoch": 0.17545871559633028, + "grad_norm": 0.30943574129616025, + "learning_rate": 9.513443183380116e-06, + "loss": 0.3967, + "num_tokens": 188689531.0, + "step": 459 + }, + { + "epoch": 0.17584097859327216, + "grad_norm": 0.34124866740742443, + "learning_rate": 9.510919826812081e-06, + "loss": 0.398, + "num_tokens": 189152497.0, + "step": 460 + }, + { + "epoch": 0.17622324159021407, + "grad_norm": 0.2911543379474011, + "learning_rate": 9.5083903198529e-06, + "loss": 0.3761, + "num_tokens": 189594914.0, + "step": 461 + }, + { + "epoch": 0.17660550458715596, + "grad_norm": 0.2642824578844909, + "learning_rate": 9.505854666381347e-06, + "loss": 0.3492, + "num_tokens": 189962468.0, + "step": 462 + }, + { + "epoch": 0.17698776758409787, + "grad_norm": 0.2997282910158942, + "learning_rate": 9.503312870285623e-06, + "loss": 0.3711, + "num_tokens": 190366781.0, + "step": 463 + }, + { + "epoch": 0.17737003058103976, + "grad_norm": 0.3013150234420607, + "learning_rate": 9.500764935463348e-06, + "loss": 0.3927, + "num_tokens": 190767753.0, + "step": 464 + }, + { + "epoch": 0.17775229357798164, + "grad_norm": 0.294207055976542, + "learning_rate": 9.498210865821555e-06, + "loss": 0.3969, + "num_tokens": 191206908.0, + "step": 465 + }, + { + "epoch": 0.17813455657492355, + "grad_norm": 0.26404247072093273, + "learning_rate": 9.495650665276683e-06, + "loss": 0.3529, + "num_tokens": 191531944.0, + "step": 466 + }, + { + "epoch": 0.17851681957186544, + "grad_norm": 0.2680184575876437, + "learning_rate": 9.493084337754573e-06, + "loss": 0.3526, + "num_tokens": 191925006.0, + "step": 467 + }, + { + "epoch": 0.17889908256880735, + "grad_norm": 0.3128348888657449, + "learning_rate": 9.490511887190463e-06, + "loss": 0.392, + "num_tokens": 192349836.0, + "step": 468 + }, + { + "epoch": 0.17928134556574923, + "grad_norm": 0.3005216178734559, + "learning_rate": 9.487933317528979e-06, + "loss": 0.3547, + "num_tokens": 192716877.0, + "step": 469 + }, + { + "epoch": 0.17966360856269112, + "grad_norm": 0.30789452207039825, + "learning_rate": 9.485348632724128e-06, + "loss": 0.3973, + "num_tokens": 193172070.0, + "step": 470 + }, + { + "epoch": 0.18004587155963303, + "grad_norm": 0.2628617754857212, + "learning_rate": 9.482757836739297e-06, + "loss": 0.345, + "num_tokens": 193563030.0, + "step": 471 + }, + { + "epoch": 0.18042813455657492, + "grad_norm": 0.28054255699602965, + "learning_rate": 9.480160933547243e-06, + "loss": 0.3544, + "num_tokens": 193983277.0, + "step": 472 + }, + { + "epoch": 0.18081039755351683, + "grad_norm": 0.30902723175488345, + "learning_rate": 9.477557927130085e-06, + "loss": 0.3776, + "num_tokens": 194420042.0, + "step": 473 + }, + { + "epoch": 0.1811926605504587, + "grad_norm": 0.30420476138288444, + "learning_rate": 9.474948821479306e-06, + "loss": 0.3899, + "num_tokens": 194842062.0, + "step": 474 + }, + { + "epoch": 0.18157492354740062, + "grad_norm": 0.31719726414849897, + "learning_rate": 9.472333620595739e-06, + "loss": 0.3944, + "num_tokens": 195276393.0, + "step": 475 + }, + { + "epoch": 0.1819571865443425, + "grad_norm": 0.3020234755697266, + "learning_rate": 9.469712328489561e-06, + "loss": 0.3736, + "num_tokens": 195631344.0, + "step": 476 + }, + { + "epoch": 0.1823394495412844, + "grad_norm": 0.27797925604136875, + "learning_rate": 9.467084949180297e-06, + "loss": 0.3497, + "num_tokens": 196006323.0, + "step": 477 + }, + { + "epoch": 0.1827217125382263, + "grad_norm": 0.2624462326952848, + "learning_rate": 9.464451486696793e-06, + "loss": 0.3727, + "num_tokens": 196438035.0, + "step": 478 + }, + { + "epoch": 0.1831039755351682, + "grad_norm": 0.31834490894055056, + "learning_rate": 9.46181194507724e-06, + "loss": 0.3497, + "num_tokens": 196845173.0, + "step": 479 + }, + { + "epoch": 0.1834862385321101, + "grad_norm": 0.36455517719545855, + "learning_rate": 9.459166328369135e-06, + "loss": 0.3878, + "num_tokens": 197294639.0, + "step": 480 + }, + { + "epoch": 0.183868501529052, + "grad_norm": 0.3291352374849635, + "learning_rate": 9.4565146406293e-06, + "loss": 0.3657, + "num_tokens": 197703636.0, + "step": 481 + }, + { + "epoch": 0.18425076452599387, + "grad_norm": 0.2601859097081127, + "learning_rate": 9.453856885923863e-06, + "loss": 0.3778, + "num_tokens": 198164083.0, + "step": 482 + }, + { + "epoch": 0.18463302752293578, + "grad_norm": 0.2756351921066936, + "learning_rate": 9.451193068328258e-06, + "loss": 0.406, + "num_tokens": 198571786.0, + "step": 483 + }, + { + "epoch": 0.18501529051987767, + "grad_norm": 0.2783378167166546, + "learning_rate": 9.448523191927212e-06, + "loss": 0.3693, + "num_tokens": 198961474.0, + "step": 484 + }, + { + "epoch": 0.18539755351681958, + "grad_norm": 0.341190072775478, + "learning_rate": 9.445847260814745e-06, + "loss": 0.3747, + "num_tokens": 199368667.0, + "step": 485 + }, + { + "epoch": 0.18577981651376146, + "grad_norm": 0.2890813140609965, + "learning_rate": 9.443165279094162e-06, + "loss": 0.3831, + "num_tokens": 199755260.0, + "step": 486 + }, + { + "epoch": 0.18616207951070338, + "grad_norm": 0.2942765983318325, + "learning_rate": 9.440477250878044e-06, + "loss": 0.3789, + "num_tokens": 200161470.0, + "step": 487 + }, + { + "epoch": 0.18654434250764526, + "grad_norm": 0.30637683198183496, + "learning_rate": 9.437783180288244e-06, + "loss": 0.3698, + "num_tokens": 200557032.0, + "step": 488 + }, + { + "epoch": 0.18692660550458715, + "grad_norm": 0.30610071389048066, + "learning_rate": 9.435083071455883e-06, + "loss": 0.3666, + "num_tokens": 200939220.0, + "step": 489 + }, + { + "epoch": 0.18730886850152906, + "grad_norm": 0.36545996446324247, + "learning_rate": 9.432376928521336e-06, + "loss": 0.375, + "num_tokens": 201304275.0, + "step": 490 + }, + { + "epoch": 0.18769113149847094, + "grad_norm": 0.29029407034186155, + "learning_rate": 9.429664755634239e-06, + "loss": 0.3762, + "num_tokens": 201726357.0, + "step": 491 + }, + { + "epoch": 0.18807339449541285, + "grad_norm": 0.2615171533792097, + "learning_rate": 9.426946556953465e-06, + "loss": 0.3638, + "num_tokens": 202142078.0, + "step": 492 + }, + { + "epoch": 0.18845565749235474, + "grad_norm": 0.3257785868272309, + "learning_rate": 9.424222336647135e-06, + "loss": 0.3661, + "num_tokens": 202549698.0, + "step": 493 + }, + { + "epoch": 0.18883792048929662, + "grad_norm": 0.3102744721706698, + "learning_rate": 9.421492098892597e-06, + "loss": 0.3812, + "num_tokens": 202955662.0, + "step": 494 + }, + { + "epoch": 0.18922018348623854, + "grad_norm": 0.2853986566575885, + "learning_rate": 9.418755847876433e-06, + "loss": 0.3636, + "num_tokens": 203371062.0, + "step": 495 + }, + { + "epoch": 0.18960244648318042, + "grad_norm": 0.31114186135769983, + "learning_rate": 9.416013587794438e-06, + "loss": 0.3851, + "num_tokens": 203774734.0, + "step": 496 + }, + { + "epoch": 0.18998470948012233, + "grad_norm": 0.35213527916400017, + "learning_rate": 9.413265322851628e-06, + "loss": 0.3934, + "num_tokens": 204167686.0, + "step": 497 + }, + { + "epoch": 0.19036697247706422, + "grad_norm": 0.321062587627723, + "learning_rate": 9.410511057262223e-06, + "loss": 0.3522, + "num_tokens": 204525898.0, + "step": 498 + }, + { + "epoch": 0.19074923547400613, + "grad_norm": 0.29982928855720004, + "learning_rate": 9.407750795249649e-06, + "loss": 0.3686, + "num_tokens": 204947712.0, + "step": 499 + }, + { + "epoch": 0.191131498470948, + "grad_norm": 0.3385608318058004, + "learning_rate": 9.404984541046522e-06, + "loss": 0.3919, + "num_tokens": 205398921.0, + "step": 500 + }, + { + "epoch": 0.1915137614678899, + "grad_norm": 0.27399706824379355, + "learning_rate": 9.402212298894646e-06, + "loss": 0.3524, + "num_tokens": 205794406.0, + "step": 501 + }, + { + "epoch": 0.1918960244648318, + "grad_norm": 0.2934791355479545, + "learning_rate": 9.399434073045013e-06, + "loss": 0.3734, + "num_tokens": 206213008.0, + "step": 502 + }, + { + "epoch": 0.1922782874617737, + "grad_norm": 0.29943182622013437, + "learning_rate": 9.396649867757783e-06, + "loss": 0.3825, + "num_tokens": 206610088.0, + "step": 503 + }, + { + "epoch": 0.1926605504587156, + "grad_norm": 0.31339155050792894, + "learning_rate": 9.393859687302294e-06, + "loss": 0.398, + "num_tokens": 207005404.0, + "step": 504 + }, + { + "epoch": 0.1930428134556575, + "grad_norm": 0.3000940417256956, + "learning_rate": 9.391063535957037e-06, + "loss": 0.3768, + "num_tokens": 207439377.0, + "step": 505 + }, + { + "epoch": 0.19342507645259938, + "grad_norm": 0.26964691955243364, + "learning_rate": 9.388261418009665e-06, + "loss": 0.3807, + "num_tokens": 207854975.0, + "step": 506 + }, + { + "epoch": 0.1938073394495413, + "grad_norm": 0.29753787139304366, + "learning_rate": 9.385453337756978e-06, + "loss": 0.3754, + "num_tokens": 208265785.0, + "step": 507 + }, + { + "epoch": 0.19418960244648317, + "grad_norm": 0.29788616465134843, + "learning_rate": 9.382639299504918e-06, + "loss": 0.3717, + "num_tokens": 208693798.0, + "step": 508 + }, + { + "epoch": 0.19457186544342508, + "grad_norm": 0.3019401432066651, + "learning_rate": 9.379819307568566e-06, + "loss": 0.3665, + "num_tokens": 209103195.0, + "step": 509 + }, + { + "epoch": 0.19495412844036697, + "grad_norm": 0.2938523692663089, + "learning_rate": 9.376993366272128e-06, + "loss": 0.3775, + "num_tokens": 209462352.0, + "step": 510 + }, + { + "epoch": 0.19533639143730888, + "grad_norm": 0.33653884095044895, + "learning_rate": 9.374161479948937e-06, + "loss": 0.3745, + "num_tokens": 209830545.0, + "step": 511 + }, + { + "epoch": 0.19571865443425077, + "grad_norm": 0.3399122957923152, + "learning_rate": 9.371323652941438e-06, + "loss": 0.3926, + "num_tokens": 210276774.0, + "step": 512 + }, + { + "epoch": 0.19610091743119265, + "grad_norm": 0.3070891879415521, + "learning_rate": 9.368479889601192e-06, + "loss": 0.3856, + "num_tokens": 210676946.0, + "step": 513 + }, + { + "epoch": 0.19648318042813456, + "grad_norm": 0.3488060478890113, + "learning_rate": 9.365630194288856e-06, + "loss": 0.3807, + "num_tokens": 211108153.0, + "step": 514 + }, + { + "epoch": 0.19686544342507645, + "grad_norm": 0.3270346602514676, + "learning_rate": 9.362774571374186e-06, + "loss": 0.3612, + "num_tokens": 211503099.0, + "step": 515 + }, + { + "epoch": 0.19724770642201836, + "grad_norm": 0.2945665640848005, + "learning_rate": 9.359913025236028e-06, + "loss": 0.3972, + "num_tokens": 211906574.0, + "step": 516 + }, + { + "epoch": 0.19762996941896024, + "grad_norm": 0.27355521672222194, + "learning_rate": 9.35704556026231e-06, + "loss": 0.3746, + "num_tokens": 212328522.0, + "step": 517 + }, + { + "epoch": 0.19801223241590213, + "grad_norm": 0.31336251169488294, + "learning_rate": 9.354172180850038e-06, + "loss": 0.3947, + "num_tokens": 212759186.0, + "step": 518 + }, + { + "epoch": 0.19839449541284404, + "grad_norm": 0.293011750467182, + "learning_rate": 9.351292891405281e-06, + "loss": 0.3646, + "num_tokens": 213127398.0, + "step": 519 + }, + { + "epoch": 0.19877675840978593, + "grad_norm": 0.29892819305468965, + "learning_rate": 9.34840769634318e-06, + "loss": 0.3731, + "num_tokens": 213515012.0, + "step": 520 + }, + { + "epoch": 0.19915902140672784, + "grad_norm": 0.30348130681744817, + "learning_rate": 9.345516600087923e-06, + "loss": 0.3922, + "num_tokens": 213932347.0, + "step": 521 + }, + { + "epoch": 0.19954128440366972, + "grad_norm": 0.3409043437047744, + "learning_rate": 9.342619607072751e-06, + "loss": 0.3713, + "num_tokens": 214347263.0, + "step": 522 + }, + { + "epoch": 0.19992354740061163, + "grad_norm": 0.31878638055344494, + "learning_rate": 9.339716721739949e-06, + "loss": 0.3805, + "num_tokens": 214757335.0, + "step": 523 + }, + { + "epoch": 0.20030581039755352, + "grad_norm": 0.3084691124969446, + "learning_rate": 9.336807948540836e-06, + "loss": 0.3658, + "num_tokens": 215243104.0, + "step": 524 + }, + { + "epoch": 0.2006880733944954, + "grad_norm": 0.29507224019393014, + "learning_rate": 9.333893291935755e-06, + "loss": 0.4107, + "num_tokens": 215656347.0, + "step": 525 + }, + { + "epoch": 0.20107033639143732, + "grad_norm": 0.3054571803769428, + "learning_rate": 9.330972756394075e-06, + "loss": 0.3825, + "num_tokens": 216068674.0, + "step": 526 + }, + { + "epoch": 0.2014525993883792, + "grad_norm": 0.34096781493652856, + "learning_rate": 9.328046346394182e-06, + "loss": 0.3694, + "num_tokens": 216456673.0, + "step": 527 + }, + { + "epoch": 0.2018348623853211, + "grad_norm": 0.2801169707879607, + "learning_rate": 9.325114066423465e-06, + "loss": 0.3671, + "num_tokens": 216873425.0, + "step": 528 + }, + { + "epoch": 0.202217125382263, + "grad_norm": 0.303338028634198, + "learning_rate": 9.322175920978314e-06, + "loss": 0.3509, + "num_tokens": 217249540.0, + "step": 529 + }, + { + "epoch": 0.20259938837920488, + "grad_norm": 0.28886781857183486, + "learning_rate": 9.319231914564121e-06, + "loss": 0.3817, + "num_tokens": 217688565.0, + "step": 530 + }, + { + "epoch": 0.2029816513761468, + "grad_norm": 0.2914412595675159, + "learning_rate": 9.316282051695258e-06, + "loss": 0.372, + "num_tokens": 218064372.0, + "step": 531 + }, + { + "epoch": 0.20336391437308868, + "grad_norm": 0.36323745270467844, + "learning_rate": 9.313326336895075e-06, + "loss": 0.4051, + "num_tokens": 218466876.0, + "step": 532 + }, + { + "epoch": 0.2037461773700306, + "grad_norm": 0.29327679268925266, + "learning_rate": 9.310364774695901e-06, + "loss": 0.3542, + "num_tokens": 218880360.0, + "step": 533 + }, + { + "epoch": 0.20412844036697247, + "grad_norm": 0.2793067231747973, + "learning_rate": 9.307397369639036e-06, + "loss": 0.3767, + "num_tokens": 219286334.0, + "step": 534 + }, + { + "epoch": 0.2045107033639144, + "grad_norm": 0.24714537611934967, + "learning_rate": 9.304424126274724e-06, + "loss": 0.3528, + "num_tokens": 219714712.0, + "step": 535 + }, + { + "epoch": 0.20489296636085627, + "grad_norm": 0.3332075476944507, + "learning_rate": 9.301445049162177e-06, + "loss": 0.3713, + "num_tokens": 220137868.0, + "step": 536 + }, + { + "epoch": 0.20527522935779816, + "grad_norm": 0.3245785325808012, + "learning_rate": 9.298460142869548e-06, + "loss": 0.3904, + "num_tokens": 220539981.0, + "step": 537 + }, + { + "epoch": 0.20565749235474007, + "grad_norm": 0.2934244498489479, + "learning_rate": 9.295469411973921e-06, + "loss": 0.3794, + "num_tokens": 220975235.0, + "step": 538 + }, + { + "epoch": 0.20603975535168195, + "grad_norm": 0.2742907283186334, + "learning_rate": 9.292472861061322e-06, + "loss": 0.3667, + "num_tokens": 221363009.0, + "step": 539 + }, + { + "epoch": 0.20642201834862386, + "grad_norm": 0.3443272821240969, + "learning_rate": 9.289470494726694e-06, + "loss": 0.3613, + "num_tokens": 221788343.0, + "step": 540 + }, + { + "epoch": 0.20680428134556575, + "grad_norm": 0.3188059870697428, + "learning_rate": 9.286462317573905e-06, + "loss": 0.3635, + "num_tokens": 222189682.0, + "step": 541 + }, + { + "epoch": 0.20718654434250763, + "grad_norm": 0.25671065930943543, + "learning_rate": 9.283448334215724e-06, + "loss": 0.3534, + "num_tokens": 222612978.0, + "step": 542 + }, + { + "epoch": 0.20756880733944955, + "grad_norm": 0.26975231412069167, + "learning_rate": 9.280428549273832e-06, + "loss": 0.3736, + "num_tokens": 223047569.0, + "step": 543 + }, + { + "epoch": 0.20795107033639143, + "grad_norm": 0.29527444734804703, + "learning_rate": 9.2774029673788e-06, + "loss": 0.384, + "num_tokens": 223436356.0, + "step": 544 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.2757889590848201, + "learning_rate": 9.274371593170091e-06, + "loss": 0.3868, + "num_tokens": 223914909.0, + "step": 545 + }, + { + "epoch": 0.20871559633027523, + "grad_norm": 0.29282983952309855, + "learning_rate": 9.271334431296051e-06, + "loss": 0.3865, + "num_tokens": 224320364.0, + "step": 546 + }, + { + "epoch": 0.2090978593272171, + "grad_norm": 0.31434903217338434, + "learning_rate": 9.268291486413897e-06, + "loss": 0.3807, + "num_tokens": 224754045.0, + "step": 547 + }, + { + "epoch": 0.20948012232415902, + "grad_norm": 0.32996591090353755, + "learning_rate": 9.265242763189717e-06, + "loss": 0.3969, + "num_tokens": 225181957.0, + "step": 548 + }, + { + "epoch": 0.2098623853211009, + "grad_norm": 0.30214781509406835, + "learning_rate": 9.26218826629846e-06, + "loss": 0.3638, + "num_tokens": 225580134.0, + "step": 549 + }, + { + "epoch": 0.21024464831804282, + "grad_norm": 0.3045215069795361, + "learning_rate": 9.259128000423926e-06, + "loss": 0.3787, + "num_tokens": 226003739.0, + "step": 550 + }, + { + "epoch": 0.2106269113149847, + "grad_norm": 0.31662315525612694, + "learning_rate": 9.25606197025876e-06, + "loss": 0.3876, + "num_tokens": 226403672.0, + "step": 551 + }, + { + "epoch": 0.21100917431192662, + "grad_norm": 0.2881137354328672, + "learning_rate": 9.252990180504451e-06, + "loss": 0.3935, + "num_tokens": 226845604.0, + "step": 552 + }, + { + "epoch": 0.2113914373088685, + "grad_norm": 0.28552091425507453, + "learning_rate": 9.249912635871317e-06, + "loss": 0.3635, + "num_tokens": 227275127.0, + "step": 553 + }, + { + "epoch": 0.2117737003058104, + "grad_norm": 0.30307505848139343, + "learning_rate": 9.246829341078503e-06, + "loss": 0.3729, + "num_tokens": 227670815.0, + "step": 554 + }, + { + "epoch": 0.2121559633027523, + "grad_norm": 0.2659969997238507, + "learning_rate": 9.243740300853964e-06, + "loss": 0.3737, + "num_tokens": 228103306.0, + "step": 555 + }, + { + "epoch": 0.21253822629969418, + "grad_norm": 0.2883987213896317, + "learning_rate": 9.240645519934474e-06, + "loss": 0.3917, + "num_tokens": 228539059.0, + "step": 556 + }, + { + "epoch": 0.2129204892966361, + "grad_norm": 0.29965016857636995, + "learning_rate": 9.237545003065604e-06, + "loss": 0.3831, + "num_tokens": 228932781.0, + "step": 557 + }, + { + "epoch": 0.21330275229357798, + "grad_norm": 0.26783841808904507, + "learning_rate": 9.234438755001725e-06, + "loss": 0.3953, + "num_tokens": 229365703.0, + "step": 558 + }, + { + "epoch": 0.21368501529051986, + "grad_norm": 0.2979107885593306, + "learning_rate": 9.23132678050599e-06, + "loss": 0.4011, + "num_tokens": 229812926.0, + "step": 559 + }, + { + "epoch": 0.21406727828746178, + "grad_norm": 0.28567990501460083, + "learning_rate": 9.228209084350342e-06, + "loss": 0.4132, + "num_tokens": 230250144.0, + "step": 560 + }, + { + "epoch": 0.21444954128440366, + "grad_norm": 0.2792966842770343, + "learning_rate": 9.225085671315491e-06, + "loss": 0.3629, + "num_tokens": 230650438.0, + "step": 561 + }, + { + "epoch": 0.21483180428134557, + "grad_norm": 0.3151662483193369, + "learning_rate": 9.221956546190912e-06, + "loss": 0.364, + "num_tokens": 231006858.0, + "step": 562 + }, + { + "epoch": 0.21521406727828746, + "grad_norm": 0.277563403731734, + "learning_rate": 9.218821713774842e-06, + "loss": 0.3831, + "num_tokens": 231442310.0, + "step": 563 + }, + { + "epoch": 0.21559633027522937, + "grad_norm": 0.29931319887763247, + "learning_rate": 9.215681178874275e-06, + "loss": 0.3964, + "num_tokens": 231891436.0, + "step": 564 + }, + { + "epoch": 0.21597859327217125, + "grad_norm": 0.273195612599927, + "learning_rate": 9.21253494630494e-06, + "loss": 0.3903, + "num_tokens": 232347844.0, + "step": 565 + }, + { + "epoch": 0.21636085626911314, + "grad_norm": 0.2834660193372076, + "learning_rate": 9.209383020891304e-06, + "loss": 0.3756, + "num_tokens": 232742857.0, + "step": 566 + }, + { + "epoch": 0.21674311926605505, + "grad_norm": 0.31449999017472524, + "learning_rate": 9.206225407466572e-06, + "loss": 0.3945, + "num_tokens": 233124849.0, + "step": 567 + }, + { + "epoch": 0.21712538226299694, + "grad_norm": 0.25810005816552867, + "learning_rate": 9.203062110872658e-06, + "loss": 0.3455, + "num_tokens": 233550300.0, + "step": 568 + }, + { + "epoch": 0.21750764525993885, + "grad_norm": 0.33107945291173424, + "learning_rate": 9.199893135960203e-06, + "loss": 0.3668, + "num_tokens": 233953546.0, + "step": 569 + }, + { + "epoch": 0.21788990825688073, + "grad_norm": 0.24018427043904134, + "learning_rate": 9.196718487588552e-06, + "loss": 0.3547, + "num_tokens": 234356061.0, + "step": 570 + }, + { + "epoch": 0.21827217125382262, + "grad_norm": 0.33465132286658594, + "learning_rate": 9.193538170625743e-06, + "loss": 0.3893, + "num_tokens": 234813492.0, + "step": 571 + }, + { + "epoch": 0.21865443425076453, + "grad_norm": 0.31411332156760363, + "learning_rate": 9.19035218994851e-06, + "loss": 0.3484, + "num_tokens": 235204088.0, + "step": 572 + }, + { + "epoch": 0.2190366972477064, + "grad_norm": 0.2883985430085643, + "learning_rate": 9.187160550442278e-06, + "loss": 0.3619, + "num_tokens": 235602795.0, + "step": 573 + }, + { + "epoch": 0.21941896024464833, + "grad_norm": 0.30208554680643634, + "learning_rate": 9.183963257001142e-06, + "loss": 0.3875, + "num_tokens": 236003226.0, + "step": 574 + }, + { + "epoch": 0.2198012232415902, + "grad_norm": 0.30105688110725787, + "learning_rate": 9.18076031452787e-06, + "loss": 0.389, + "num_tokens": 236402991.0, + "step": 575 + }, + { + "epoch": 0.22018348623853212, + "grad_norm": 0.31191025907793485, + "learning_rate": 9.177551727933888e-06, + "loss": 0.3908, + "num_tokens": 236809589.0, + "step": 576 + }, + { + "epoch": 0.220565749235474, + "grad_norm": 0.2754676790609133, + "learning_rate": 9.17433750213928e-06, + "loss": 0.3776, + "num_tokens": 237193709.0, + "step": 577 + }, + { + "epoch": 0.2209480122324159, + "grad_norm": 0.289805606978647, + "learning_rate": 9.171117642072783e-06, + "loss": 0.369, + "num_tokens": 237597969.0, + "step": 578 + }, + { + "epoch": 0.2213302752293578, + "grad_norm": 0.2792412200321901, + "learning_rate": 9.167892152671762e-06, + "loss": 0.3736, + "num_tokens": 238027892.0, + "step": 579 + }, + { + "epoch": 0.2217125382262997, + "grad_norm": 0.337657201569649, + "learning_rate": 9.164661038882223e-06, + "loss": 0.3672, + "num_tokens": 238417959.0, + "step": 580 + }, + { + "epoch": 0.2220948012232416, + "grad_norm": 0.27212654173674283, + "learning_rate": 9.161424305658792e-06, + "loss": 0.3741, + "num_tokens": 238838950.0, + "step": 581 + }, + { + "epoch": 0.22247706422018348, + "grad_norm": 0.3096392932222775, + "learning_rate": 9.158181957964713e-06, + "loss": 0.398, + "num_tokens": 239288447.0, + "step": 582 + }, + { + "epoch": 0.22285932721712537, + "grad_norm": 0.3249685247146732, + "learning_rate": 9.154934000771844e-06, + "loss": 0.3702, + "num_tokens": 239634933.0, + "step": 583 + }, + { + "epoch": 0.22324159021406728, + "grad_norm": 0.31423353987151437, + "learning_rate": 9.151680439060636e-06, + "loss": 0.3733, + "num_tokens": 240025937.0, + "step": 584 + }, + { + "epoch": 0.22362385321100917, + "grad_norm": 0.2834341205029428, + "learning_rate": 9.148421277820138e-06, + "loss": 0.3642, + "num_tokens": 240411794.0, + "step": 585 + }, + { + "epoch": 0.22400611620795108, + "grad_norm": 0.3457671514107957, + "learning_rate": 9.145156522047986e-06, + "loss": 0.3717, + "num_tokens": 240819969.0, + "step": 586 + }, + { + "epoch": 0.22438837920489296, + "grad_norm": 0.30436298898732933, + "learning_rate": 9.141886176750397e-06, + "loss": 0.372, + "num_tokens": 241219686.0, + "step": 587 + }, + { + "epoch": 0.22477064220183487, + "grad_norm": 0.3015407744851569, + "learning_rate": 9.138610246942157e-06, + "loss": 0.3985, + "num_tokens": 241646744.0, + "step": 588 + }, + { + "epoch": 0.22515290519877676, + "grad_norm": 0.2618779128631558, + "learning_rate": 9.135328737646611e-06, + "loss": 0.394, + "num_tokens": 242085941.0, + "step": 589 + }, + { + "epoch": 0.22553516819571864, + "grad_norm": 0.315741853237637, + "learning_rate": 9.132041653895668e-06, + "loss": 0.3823, + "num_tokens": 242488824.0, + "step": 590 + }, + { + "epoch": 0.22591743119266056, + "grad_norm": 0.2776697465493544, + "learning_rate": 9.128749000729777e-06, + "loss": 0.3565, + "num_tokens": 242890501.0, + "step": 591 + }, + { + "epoch": 0.22629969418960244, + "grad_norm": 0.299578333638845, + "learning_rate": 9.125450783197931e-06, + "loss": 0.3905, + "num_tokens": 243323495.0, + "step": 592 + }, + { + "epoch": 0.22668195718654435, + "grad_norm": 0.28717242108889424, + "learning_rate": 9.122147006357657e-06, + "loss": 0.3804, + "num_tokens": 243759735.0, + "step": 593 + }, + { + "epoch": 0.22706422018348624, + "grad_norm": 0.27615249536903175, + "learning_rate": 9.118837675275006e-06, + "loss": 0.3902, + "num_tokens": 244185214.0, + "step": 594 + }, + { + "epoch": 0.22744648318042812, + "grad_norm": 0.310301735025032, + "learning_rate": 9.11552279502454e-06, + "loss": 0.3986, + "num_tokens": 244603987.0, + "step": 595 + }, + { + "epoch": 0.22782874617737003, + "grad_norm": 0.33691526759287665, + "learning_rate": 9.112202370689337e-06, + "loss": 0.4026, + "num_tokens": 245001704.0, + "step": 596 + }, + { + "epoch": 0.22821100917431192, + "grad_norm": 0.2865904313101305, + "learning_rate": 9.108876407360976e-06, + "loss": 0.3881, + "num_tokens": 245407882.0, + "step": 597 + }, + { + "epoch": 0.22859327217125383, + "grad_norm": 0.2826526928273864, + "learning_rate": 9.105544910139527e-06, + "loss": 0.357, + "num_tokens": 245851231.0, + "step": 598 + }, + { + "epoch": 0.22897553516819572, + "grad_norm": 0.27531662437234644, + "learning_rate": 9.102207884133548e-06, + "loss": 0.3882, + "num_tokens": 246265295.0, + "step": 599 + }, + { + "epoch": 0.22935779816513763, + "grad_norm": 0.3089688461620704, + "learning_rate": 9.09886533446007e-06, + "loss": 0.3807, + "num_tokens": 246671259.0, + "step": 600 + }, + { + "epoch": 0.2297400611620795, + "grad_norm": 0.28978239421247837, + "learning_rate": 9.0955172662446e-06, + "loss": 0.3876, + "num_tokens": 247074096.0, + "step": 601 + }, + { + "epoch": 0.2301223241590214, + "grad_norm": 0.2645958772906488, + "learning_rate": 9.092163684621105e-06, + "loss": 0.3725, + "num_tokens": 247477563.0, + "step": 602 + }, + { + "epoch": 0.2305045871559633, + "grad_norm": 0.28568202828981065, + "learning_rate": 9.088804594732006e-06, + "loss": 0.3928, + "num_tokens": 247940093.0, + "step": 603 + }, + { + "epoch": 0.2308868501529052, + "grad_norm": 0.27585109128936636, + "learning_rate": 9.085440001728168e-06, + "loss": 0.3704, + "num_tokens": 248368764.0, + "step": 604 + }, + { + "epoch": 0.2312691131498471, + "grad_norm": 0.27653165882309483, + "learning_rate": 9.082069910768901e-06, + "loss": 0.4143, + "num_tokens": 248829463.0, + "step": 605 + }, + { + "epoch": 0.231651376146789, + "grad_norm": 0.29920389094743655, + "learning_rate": 9.078694327021938e-06, + "loss": 0.4172, + "num_tokens": 249274207.0, + "step": 606 + }, + { + "epoch": 0.23203363914373087, + "grad_norm": 0.29640280958146953, + "learning_rate": 9.07531325566344e-06, + "loss": 0.3873, + "num_tokens": 249700194.0, + "step": 607 + }, + { + "epoch": 0.2324159021406728, + "grad_norm": 0.28915016883483846, + "learning_rate": 9.071926701877985e-06, + "loss": 0.384, + "num_tokens": 250143961.0, + "step": 608 + }, + { + "epoch": 0.23279816513761467, + "grad_norm": 0.3255035345947518, + "learning_rate": 9.068534670858547e-06, + "loss": 0.38, + "num_tokens": 250582590.0, + "step": 609 + }, + { + "epoch": 0.23318042813455658, + "grad_norm": 0.2877177628243263, + "learning_rate": 9.065137167806509e-06, + "loss": 0.3714, + "num_tokens": 250973246.0, + "step": 610 + }, + { + "epoch": 0.23356269113149847, + "grad_norm": 0.28550980486061095, + "learning_rate": 9.061734197931645e-06, + "loss": 0.4022, + "num_tokens": 251370710.0, + "step": 611 + }, + { + "epoch": 0.23394495412844038, + "grad_norm": 0.31122468608101966, + "learning_rate": 9.058325766452104e-06, + "loss": 0.3705, + "num_tokens": 251778758.0, + "step": 612 + }, + { + "epoch": 0.23432721712538226, + "grad_norm": 0.3399712579669277, + "learning_rate": 9.054911878594415e-06, + "loss": 0.3746, + "num_tokens": 252191372.0, + "step": 613 + }, + { + "epoch": 0.23470948012232415, + "grad_norm": 0.35218383149346305, + "learning_rate": 9.051492539593473e-06, + "loss": 0.4096, + "num_tokens": 252612141.0, + "step": 614 + }, + { + "epoch": 0.23509174311926606, + "grad_norm": 0.3077852425607384, + "learning_rate": 9.048067754692538e-06, + "loss": 0.3799, + "num_tokens": 252985316.0, + "step": 615 + }, + { + "epoch": 0.23547400611620795, + "grad_norm": 0.27869103967534725, + "learning_rate": 9.044637529143206e-06, + "loss": 0.3884, + "num_tokens": 253394455.0, + "step": 616 + }, + { + "epoch": 0.23585626911314986, + "grad_norm": 0.31922087897709445, + "learning_rate": 9.041201868205432e-06, + "loss": 0.3662, + "num_tokens": 253762601.0, + "step": 617 + }, + { + "epoch": 0.23623853211009174, + "grad_norm": 0.3520791582706998, + "learning_rate": 9.037760777147497e-06, + "loss": 0.3836, + "num_tokens": 254227575.0, + "step": 618 + }, + { + "epoch": 0.23662079510703363, + "grad_norm": 0.2809234911336819, + "learning_rate": 9.034314261246007e-06, + "loss": 0.3833, + "num_tokens": 254670426.0, + "step": 619 + }, + { + "epoch": 0.23700305810397554, + "grad_norm": 0.40222763235101694, + "learning_rate": 9.030862325785893e-06, + "loss": 0.39, + "num_tokens": 255059367.0, + "step": 620 + }, + { + "epoch": 0.23738532110091742, + "grad_norm": 0.28733549992330404, + "learning_rate": 9.02740497606039e-06, + "loss": 0.3714, + "num_tokens": 255498551.0, + "step": 621 + }, + { + "epoch": 0.23776758409785934, + "grad_norm": 0.3046221037146164, + "learning_rate": 9.023942217371041e-06, + "loss": 0.3862, + "num_tokens": 255906818.0, + "step": 622 + }, + { + "epoch": 0.23814984709480122, + "grad_norm": 0.30704559198814274, + "learning_rate": 9.02047405502768e-06, + "loss": 0.4006, + "num_tokens": 256331779.0, + "step": 623 + }, + { + "epoch": 0.23853211009174313, + "grad_norm": 0.31680852995126063, + "learning_rate": 9.017000494348425e-06, + "loss": 0.3921, + "num_tokens": 256736444.0, + "step": 624 + }, + { + "epoch": 0.23891437308868502, + "grad_norm": 0.2832308759764621, + "learning_rate": 9.013521540659677e-06, + "loss": 0.3694, + "num_tokens": 257116436.0, + "step": 625 + }, + { + "epoch": 0.2392966360856269, + "grad_norm": 0.3321699105755607, + "learning_rate": 9.010037199296105e-06, + "loss": 0.374, + "num_tokens": 257515036.0, + "step": 626 + }, + { + "epoch": 0.2396788990825688, + "grad_norm": 0.3429248615437049, + "learning_rate": 9.006547475600636e-06, + "loss": 0.3796, + "num_tokens": 257929541.0, + "step": 627 + }, + { + "epoch": 0.2400611620795107, + "grad_norm": 0.3098280266584217, + "learning_rate": 9.003052374924454e-06, + "loss": 0.3917, + "num_tokens": 258348550.0, + "step": 628 + }, + { + "epoch": 0.2404434250764526, + "grad_norm": 0.3065584693267545, + "learning_rate": 8.999551902626984e-06, + "loss": 0.399, + "num_tokens": 258803936.0, + "step": 629 + }, + { + "epoch": 0.2408256880733945, + "grad_norm": 0.3034452570346747, + "learning_rate": 8.996046064075897e-06, + "loss": 0.3532, + "num_tokens": 259207365.0, + "step": 630 + }, + { + "epoch": 0.24120795107033638, + "grad_norm": 0.30606552366250667, + "learning_rate": 8.992534864647084e-06, + "loss": 0.3789, + "num_tokens": 259637116.0, + "step": 631 + }, + { + "epoch": 0.2415902140672783, + "grad_norm": 0.27598201111213216, + "learning_rate": 8.989018309724657e-06, + "loss": 0.3723, + "num_tokens": 260087175.0, + "step": 632 + }, + { + "epoch": 0.24197247706422018, + "grad_norm": 0.26110012452673226, + "learning_rate": 8.985496404700946e-06, + "loss": 0.3603, + "num_tokens": 260474561.0, + "step": 633 + }, + { + "epoch": 0.2423547400611621, + "grad_norm": 0.3001172491176638, + "learning_rate": 8.981969154976477e-06, + "loss": 0.4067, + "num_tokens": 260936606.0, + "step": 634 + }, + { + "epoch": 0.24273700305810397, + "grad_norm": 0.3154402451656656, + "learning_rate": 8.978436565959977e-06, + "loss": 0.419, + "num_tokens": 261365549.0, + "step": 635 + }, + { + "epoch": 0.24311926605504589, + "grad_norm": 0.3131213964994004, + "learning_rate": 8.974898643068361e-06, + "loss": 0.364, + "num_tokens": 261772927.0, + "step": 636 + }, + { + "epoch": 0.24350152905198777, + "grad_norm": 0.29828935570101023, + "learning_rate": 8.971355391726721e-06, + "loss": 0.3762, + "num_tokens": 262161078.0, + "step": 637 + }, + { + "epoch": 0.24388379204892965, + "grad_norm": 0.3238019631881244, + "learning_rate": 8.967806817368319e-06, + "loss": 0.3873, + "num_tokens": 262577695.0, + "step": 638 + }, + { + "epoch": 0.24426605504587157, + "grad_norm": 0.28465918320626743, + "learning_rate": 8.96425292543458e-06, + "loss": 0.4022, + "num_tokens": 262984408.0, + "step": 639 + }, + { + "epoch": 0.24464831804281345, + "grad_norm": 0.31282646244746515, + "learning_rate": 8.96069372137508e-06, + "loss": 0.3695, + "num_tokens": 263410565.0, + "step": 640 + }, + { + "epoch": 0.24503058103975536, + "grad_norm": 0.30553718935080953, + "learning_rate": 8.957129210647552e-06, + "loss": 0.385, + "num_tokens": 263804366.0, + "step": 641 + }, + { + "epoch": 0.24541284403669725, + "grad_norm": 0.2836860368792192, + "learning_rate": 8.95355939871785e-06, + "loss": 0.3804, + "num_tokens": 264224929.0, + "step": 642 + }, + { + "epoch": 0.24579510703363913, + "grad_norm": 0.2871929467004206, + "learning_rate": 8.949984291059972e-06, + "loss": 0.3736, + "num_tokens": 264635109.0, + "step": 643 + }, + { + "epoch": 0.24617737003058104, + "grad_norm": 0.30694125326333027, + "learning_rate": 8.946403893156025e-06, + "loss": 0.3937, + "num_tokens": 265073843.0, + "step": 644 + }, + { + "epoch": 0.24655963302752293, + "grad_norm": 0.3222758746803932, + "learning_rate": 8.942818210496235e-06, + "loss": 0.4032, + "num_tokens": 265506600.0, + "step": 645 + }, + { + "epoch": 0.24694189602446484, + "grad_norm": 0.3311827532533767, + "learning_rate": 8.939227248578926e-06, + "loss": 0.3727, + "num_tokens": 265941006.0, + "step": 646 + }, + { + "epoch": 0.24732415902140673, + "grad_norm": 0.30706307595819393, + "learning_rate": 8.935631012910526e-06, + "loss": 0.3565, + "num_tokens": 266319248.0, + "step": 647 + }, + { + "epoch": 0.24770642201834864, + "grad_norm": 0.2886375491110465, + "learning_rate": 8.932029509005542e-06, + "loss": 0.3983, + "num_tokens": 266749309.0, + "step": 648 + }, + { + "epoch": 0.24808868501529052, + "grad_norm": 0.2735856467287797, + "learning_rate": 8.928422742386563e-06, + "loss": 0.3829, + "num_tokens": 267109122.0, + "step": 649 + }, + { + "epoch": 0.2484709480122324, + "grad_norm": 0.2830936313318087, + "learning_rate": 8.924810718584243e-06, + "loss": 0.3828, + "num_tokens": 267508737.0, + "step": 650 + }, + { + "epoch": 0.24885321100917432, + "grad_norm": 0.3283390051413059, + "learning_rate": 8.921193443137309e-06, + "loss": 0.3782, + "num_tokens": 267918305.0, + "step": 651 + }, + { + "epoch": 0.2492354740061162, + "grad_norm": 0.30335124243703915, + "learning_rate": 8.917570921592525e-06, + "loss": 0.3908, + "num_tokens": 268361915.0, + "step": 652 + }, + { + "epoch": 0.24961773700305812, + "grad_norm": 0.2571511256162922, + "learning_rate": 8.913943159504714e-06, + "loss": 0.3641, + "num_tokens": 268800634.0, + "step": 653 + }, + { + "epoch": 0.25, + "grad_norm": 0.2892310075430736, + "learning_rate": 8.910310162436722e-06, + "loss": 0.3858, + "num_tokens": 269222212.0, + "step": 654 + }, + { + "epoch": 0.2503822629969419, + "grad_norm": 0.3473412958002292, + "learning_rate": 8.906671935959436e-06, + "loss": 0.3771, + "num_tokens": 269623817.0, + "step": 655 + }, + { + "epoch": 0.25076452599388377, + "grad_norm": 0.3477403643691002, + "learning_rate": 8.903028485651752e-06, + "loss": 0.3724, + "num_tokens": 270011282.0, + "step": 656 + }, + { + "epoch": 0.2511467889908257, + "grad_norm": 0.3120544578388952, + "learning_rate": 8.899379817100579e-06, + "loss": 0.3902, + "num_tokens": 270430088.0, + "step": 657 + }, + { + "epoch": 0.2515290519877676, + "grad_norm": 0.2853900889953197, + "learning_rate": 8.895725935900827e-06, + "loss": 0.3803, + "num_tokens": 270801218.0, + "step": 658 + }, + { + "epoch": 0.2519113149847095, + "grad_norm": 0.31250388044058636, + "learning_rate": 8.892066847655402e-06, + "loss": 0.3722, + "num_tokens": 271282248.0, + "step": 659 + }, + { + "epoch": 0.25229357798165136, + "grad_norm": 0.30999972693938765, + "learning_rate": 8.88840255797519e-06, + "loss": 0.3779, + "num_tokens": 271674351.0, + "step": 660 + }, + { + "epoch": 0.25267584097859325, + "grad_norm": 0.28162609553511925, + "learning_rate": 8.884733072479058e-06, + "loss": 0.3658, + "num_tokens": 272075673.0, + "step": 661 + }, + { + "epoch": 0.2530581039755352, + "grad_norm": 0.3107133601844678, + "learning_rate": 8.881058396793837e-06, + "loss": 0.3725, + "num_tokens": 272498661.0, + "step": 662 + }, + { + "epoch": 0.25344036697247707, + "grad_norm": 0.2949726295774104, + "learning_rate": 8.877378536554314e-06, + "loss": 0.347, + "num_tokens": 272902959.0, + "step": 663 + }, + { + "epoch": 0.25382262996941896, + "grad_norm": 0.3050132976467593, + "learning_rate": 8.873693497403234e-06, + "loss": 0.4074, + "num_tokens": 273299202.0, + "step": 664 + }, + { + "epoch": 0.25420489296636084, + "grad_norm": 0.29903463140677805, + "learning_rate": 8.870003284991277e-06, + "loss": 0.4042, + "num_tokens": 273732060.0, + "step": 665 + }, + { + "epoch": 0.2545871559633027, + "grad_norm": 0.26181156233855346, + "learning_rate": 8.86630790497706e-06, + "loss": 0.4113, + "num_tokens": 274182873.0, + "step": 666 + }, + { + "epoch": 0.25496941896024466, + "grad_norm": 0.28227078861755234, + "learning_rate": 8.862607363027116e-06, + "loss": 0.3495, + "num_tokens": 274592194.0, + "step": 667 + }, + { + "epoch": 0.25535168195718655, + "grad_norm": 0.2988398503823881, + "learning_rate": 8.858901664815906e-06, + "loss": 0.3784, + "num_tokens": 274999863.0, + "step": 668 + }, + { + "epoch": 0.25573394495412843, + "grad_norm": 0.2822390360268869, + "learning_rate": 8.855190816025789e-06, + "loss": 0.3737, + "num_tokens": 275415536.0, + "step": 669 + }, + { + "epoch": 0.2561162079510703, + "grad_norm": 0.25818594107637644, + "learning_rate": 8.85147482234702e-06, + "loss": 0.3865, + "num_tokens": 275861883.0, + "step": 670 + }, + { + "epoch": 0.25649847094801226, + "grad_norm": 0.36337782304390526, + "learning_rate": 8.84775368947775e-06, + "loss": 0.3957, + "num_tokens": 276270259.0, + "step": 671 + }, + { + "epoch": 0.25688073394495414, + "grad_norm": 0.26464963445527817, + "learning_rate": 8.844027423124005e-06, + "loss": 0.3608, + "num_tokens": 276717203.0, + "step": 672 + }, + { + "epoch": 0.257262996941896, + "grad_norm": 0.3045674911455366, + "learning_rate": 8.840296028999689e-06, + "loss": 0.4024, + "num_tokens": 277090223.0, + "step": 673 + }, + { + "epoch": 0.2576452599388379, + "grad_norm": 0.30217072269278383, + "learning_rate": 8.836559512826564e-06, + "loss": 0.3803, + "num_tokens": 277499390.0, + "step": 674 + }, + { + "epoch": 0.2580275229357798, + "grad_norm": 0.3147968781143223, + "learning_rate": 8.832817880334243e-06, + "loss": 0.3604, + "num_tokens": 277857284.0, + "step": 675 + }, + { + "epoch": 0.25840978593272174, + "grad_norm": 0.2818145011318806, + "learning_rate": 8.829071137260194e-06, + "loss": 0.3832, + "num_tokens": 278277734.0, + "step": 676 + }, + { + "epoch": 0.2587920489296636, + "grad_norm": 0.2849777327379941, + "learning_rate": 8.825319289349716e-06, + "loss": 0.3948, + "num_tokens": 278668507.0, + "step": 677 + }, + { + "epoch": 0.2591743119266055, + "grad_norm": 0.28558531620995986, + "learning_rate": 8.821562342355935e-06, + "loss": 0.3841, + "num_tokens": 279040696.0, + "step": 678 + }, + { + "epoch": 0.2595565749235474, + "grad_norm": 0.293298477758778, + "learning_rate": 8.817800302039798e-06, + "loss": 0.3814, + "num_tokens": 279479711.0, + "step": 679 + }, + { + "epoch": 0.2599388379204893, + "grad_norm": 0.2958623840205463, + "learning_rate": 8.814033174170058e-06, + "loss": 0.3641, + "num_tokens": 279887367.0, + "step": 680 + }, + { + "epoch": 0.2603211009174312, + "grad_norm": 0.3021930547346609, + "learning_rate": 8.810260964523278e-06, + "loss": 0.3764, + "num_tokens": 280274359.0, + "step": 681 + }, + { + "epoch": 0.2607033639143731, + "grad_norm": 0.27631890791386393, + "learning_rate": 8.806483678883803e-06, + "loss": 0.3884, + "num_tokens": 280722130.0, + "step": 682 + }, + { + "epoch": 0.261085626911315, + "grad_norm": 0.2787026800159375, + "learning_rate": 8.80270132304377e-06, + "loss": 0.4102, + "num_tokens": 281192235.0, + "step": 683 + }, + { + "epoch": 0.26146788990825687, + "grad_norm": 0.2875244754303217, + "learning_rate": 8.79891390280309e-06, + "loss": 0.3643, + "num_tokens": 281574004.0, + "step": 684 + }, + { + "epoch": 0.26185015290519875, + "grad_norm": 0.2871273613039571, + "learning_rate": 8.795121423969432e-06, + "loss": 0.3787, + "num_tokens": 281953301.0, + "step": 685 + }, + { + "epoch": 0.2622324159021407, + "grad_norm": 0.3093552039074778, + "learning_rate": 8.791323892358229e-06, + "loss": 0.3553, + "num_tokens": 282340687.0, + "step": 686 + }, + { + "epoch": 0.2626146788990826, + "grad_norm": 0.2795241201777922, + "learning_rate": 8.78752131379266e-06, + "loss": 0.3908, + "num_tokens": 282714927.0, + "step": 687 + }, + { + "epoch": 0.26299694189602446, + "grad_norm": 0.31152888701621595, + "learning_rate": 8.783713694103645e-06, + "loss": 0.3932, + "num_tokens": 283135623.0, + "step": 688 + }, + { + "epoch": 0.26337920489296635, + "grad_norm": 0.27282917193935974, + "learning_rate": 8.779901039129832e-06, + "loss": 0.3662, + "num_tokens": 283541357.0, + "step": 689 + }, + { + "epoch": 0.26376146788990823, + "grad_norm": 0.2860025357707848, + "learning_rate": 8.776083354717587e-06, + "loss": 0.3926, + "num_tokens": 283964609.0, + "step": 690 + }, + { + "epoch": 0.26414373088685017, + "grad_norm": 0.3006331324866127, + "learning_rate": 8.772260646720997e-06, + "loss": 0.3921, + "num_tokens": 284388926.0, + "step": 691 + }, + { + "epoch": 0.26452599388379205, + "grad_norm": 0.2717106153240549, + "learning_rate": 8.76843292100184e-06, + "loss": 0.3831, + "num_tokens": 284825910.0, + "step": 692 + }, + { + "epoch": 0.26490825688073394, + "grad_norm": 0.3313190538803319, + "learning_rate": 8.764600183429604e-06, + "loss": 0.3802, + "num_tokens": 285231246.0, + "step": 693 + }, + { + "epoch": 0.2652905198776758, + "grad_norm": 0.32448987086454234, + "learning_rate": 8.760762439881447e-06, + "loss": 0.3873, + "num_tokens": 285662807.0, + "step": 694 + }, + { + "epoch": 0.26567278287461776, + "grad_norm": 0.3013500656474795, + "learning_rate": 8.756919696242212e-06, + "loss": 0.405, + "num_tokens": 286078151.0, + "step": 695 + }, + { + "epoch": 0.26605504587155965, + "grad_norm": 0.28840308740113485, + "learning_rate": 8.753071958404405e-06, + "loss": 0.4126, + "num_tokens": 286518934.0, + "step": 696 + }, + { + "epoch": 0.26643730886850153, + "grad_norm": 0.27868315601677723, + "learning_rate": 8.749219232268194e-06, + "loss": 0.3991, + "num_tokens": 286920211.0, + "step": 697 + }, + { + "epoch": 0.2668195718654434, + "grad_norm": 0.3016691244999727, + "learning_rate": 8.745361523741394e-06, + "loss": 0.3907, + "num_tokens": 287362784.0, + "step": 698 + }, + { + "epoch": 0.2672018348623853, + "grad_norm": 0.2471065019793865, + "learning_rate": 8.741498838739458e-06, + "loss": 0.3862, + "num_tokens": 287799619.0, + "step": 699 + }, + { + "epoch": 0.26758409785932724, + "grad_norm": 0.26288091136846714, + "learning_rate": 8.737631183185475e-06, + "loss": 0.3865, + "num_tokens": 288191426.0, + "step": 700 + }, + { + "epoch": 0.2679663608562691, + "grad_norm": 0.24829923447744354, + "learning_rate": 8.733758563010152e-06, + "loss": 0.3597, + "num_tokens": 288591548.0, + "step": 701 + }, + { + "epoch": 0.268348623853211, + "grad_norm": 0.27357169562434436, + "learning_rate": 8.72988098415181e-06, + "loss": 0.3648, + "num_tokens": 289013156.0, + "step": 702 + }, + { + "epoch": 0.2687308868501529, + "grad_norm": 0.2781983568262563, + "learning_rate": 8.72599845255637e-06, + "loss": 0.3565, + "num_tokens": 289389602.0, + "step": 703 + }, + { + "epoch": 0.2691131498470948, + "grad_norm": 0.2889515173153062, + "learning_rate": 8.722110974177356e-06, + "loss": 0.3995, + "num_tokens": 289865801.0, + "step": 704 + }, + { + "epoch": 0.2694954128440367, + "grad_norm": 0.27579678422743137, + "learning_rate": 8.718218554975872e-06, + "loss": 0.4003, + "num_tokens": 290333381.0, + "step": 705 + }, + { + "epoch": 0.2698776758409786, + "grad_norm": 0.2628651853937428, + "learning_rate": 8.714321200920596e-06, + "loss": 0.3684, + "num_tokens": 290762130.0, + "step": 706 + }, + { + "epoch": 0.2702599388379205, + "grad_norm": 0.30181639238356406, + "learning_rate": 8.710418917987779e-06, + "loss": 0.3933, + "num_tokens": 291186850.0, + "step": 707 + }, + { + "epoch": 0.2706422018348624, + "grad_norm": 0.2936170240206027, + "learning_rate": 8.706511712161225e-06, + "loss": 0.3796, + "num_tokens": 291616959.0, + "step": 708 + }, + { + "epoch": 0.27102446483180426, + "grad_norm": 0.278875878813826, + "learning_rate": 8.70259958943229e-06, + "loss": 0.3936, + "num_tokens": 292048270.0, + "step": 709 + }, + { + "epoch": 0.2714067278287462, + "grad_norm": 0.31209934903950987, + "learning_rate": 8.698682555799868e-06, + "loss": 0.3909, + "num_tokens": 292505278.0, + "step": 710 + }, + { + "epoch": 0.2717889908256881, + "grad_norm": 0.28429027756426195, + "learning_rate": 8.694760617270386e-06, + "loss": 0.3744, + "num_tokens": 292908293.0, + "step": 711 + }, + { + "epoch": 0.27217125382262997, + "grad_norm": 0.26581833855855097, + "learning_rate": 8.690833779857788e-06, + "loss": 0.3687, + "num_tokens": 293336209.0, + "step": 712 + }, + { + "epoch": 0.27255351681957185, + "grad_norm": 0.2769884388765986, + "learning_rate": 8.68690204958353e-06, + "loss": 0.3796, + "num_tokens": 293731923.0, + "step": 713 + }, + { + "epoch": 0.27293577981651373, + "grad_norm": 0.31417430839562893, + "learning_rate": 8.682965432476579e-06, + "loss": 0.3638, + "num_tokens": 294106658.0, + "step": 714 + }, + { + "epoch": 0.2733180428134557, + "grad_norm": 0.26027126113633997, + "learning_rate": 8.679023934573385e-06, + "loss": 0.3581, + "num_tokens": 294510431.0, + "step": 715 + }, + { + "epoch": 0.27370030581039756, + "grad_norm": 0.28723775787094785, + "learning_rate": 8.675077561917888e-06, + "loss": 0.3719, + "num_tokens": 294867776.0, + "step": 716 + }, + { + "epoch": 0.27408256880733944, + "grad_norm": 0.24922840961693052, + "learning_rate": 8.671126320561501e-06, + "loss": 0.3482, + "num_tokens": 295273414.0, + "step": 717 + }, + { + "epoch": 0.27446483180428133, + "grad_norm": 0.31043949172171165, + "learning_rate": 8.667170216563103e-06, + "loss": 0.3803, + "num_tokens": 295728699.0, + "step": 718 + }, + { + "epoch": 0.27484709480122327, + "grad_norm": 0.27963429578181503, + "learning_rate": 8.663209255989033e-06, + "loss": 0.3453, + "num_tokens": 296159538.0, + "step": 719 + }, + { + "epoch": 0.27522935779816515, + "grad_norm": 0.3165142671130839, + "learning_rate": 8.65924344491307e-06, + "loss": 0.3637, + "num_tokens": 296538139.0, + "step": 720 + }, + { + "epoch": 0.27561162079510704, + "grad_norm": 0.27727612936600626, + "learning_rate": 8.65527278941644e-06, + "loss": 0.3646, + "num_tokens": 297000994.0, + "step": 721 + }, + { + "epoch": 0.2759938837920489, + "grad_norm": 0.29897642761759347, + "learning_rate": 8.651297295587788e-06, + "loss": 0.3626, + "num_tokens": 297390031.0, + "step": 722 + }, + { + "epoch": 0.2763761467889908, + "grad_norm": 0.29935050424760723, + "learning_rate": 8.647316969523185e-06, + "loss": 0.3714, + "num_tokens": 297809046.0, + "step": 723 + }, + { + "epoch": 0.27675840978593275, + "grad_norm": 0.3003356809500152, + "learning_rate": 8.643331817326105e-06, + "loss": 0.3742, + "num_tokens": 298190175.0, + "step": 724 + }, + { + "epoch": 0.27714067278287463, + "grad_norm": 0.2866478373834503, + "learning_rate": 8.639341845107432e-06, + "loss": 0.3831, + "num_tokens": 298607049.0, + "step": 725 + }, + { + "epoch": 0.2775229357798165, + "grad_norm": 0.2766505825818346, + "learning_rate": 8.635347058985433e-06, + "loss": 0.3666, + "num_tokens": 298961466.0, + "step": 726 + }, + { + "epoch": 0.2779051987767584, + "grad_norm": 0.2765944526538987, + "learning_rate": 8.63134746508576e-06, + "loss": 0.3779, + "num_tokens": 299348078.0, + "step": 727 + }, + { + "epoch": 0.2782874617737003, + "grad_norm": 0.2803137196937322, + "learning_rate": 8.627343069541438e-06, + "loss": 0.3832, + "num_tokens": 299768200.0, + "step": 728 + }, + { + "epoch": 0.2786697247706422, + "grad_norm": 0.2810019152920437, + "learning_rate": 8.623333878492853e-06, + "loss": 0.3687, + "num_tokens": 300207021.0, + "step": 729 + }, + { + "epoch": 0.2790519877675841, + "grad_norm": 0.29718788487212, + "learning_rate": 8.619319898087744e-06, + "loss": 0.3602, + "num_tokens": 300597918.0, + "step": 730 + }, + { + "epoch": 0.279434250764526, + "grad_norm": 0.26190886460912044, + "learning_rate": 8.615301134481196e-06, + "loss": 0.38, + "num_tokens": 301013416.0, + "step": 731 + }, + { + "epoch": 0.2798165137614679, + "grad_norm": 0.3021699373984725, + "learning_rate": 8.611277593835631e-06, + "loss": 0.376, + "num_tokens": 301477349.0, + "step": 732 + }, + { + "epoch": 0.28019877675840976, + "grad_norm": 0.27704132166165096, + "learning_rate": 8.60724928232079e-06, + "loss": 0.3887, + "num_tokens": 301920036.0, + "step": 733 + }, + { + "epoch": 0.2805810397553517, + "grad_norm": 0.2727866389698373, + "learning_rate": 8.603216206113731e-06, + "loss": 0.3812, + "num_tokens": 302344946.0, + "step": 734 + }, + { + "epoch": 0.2809633027522936, + "grad_norm": 0.27559165318625345, + "learning_rate": 8.599178371398821e-06, + "loss": 0.3865, + "num_tokens": 302731410.0, + "step": 735 + }, + { + "epoch": 0.28134556574923547, + "grad_norm": 0.2714954579309321, + "learning_rate": 8.595135784367726e-06, + "loss": 0.3792, + "num_tokens": 303131112.0, + "step": 736 + }, + { + "epoch": 0.28172782874617736, + "grad_norm": 0.2640815619811983, + "learning_rate": 8.591088451219393e-06, + "loss": 0.3778, + "num_tokens": 303533874.0, + "step": 737 + }, + { + "epoch": 0.28211009174311924, + "grad_norm": 0.31661686967028485, + "learning_rate": 8.58703637816005e-06, + "loss": 0.3766, + "num_tokens": 303974027.0, + "step": 738 + }, + { + "epoch": 0.2824923547400612, + "grad_norm": 0.27389367959925154, + "learning_rate": 8.582979571403195e-06, + "loss": 0.4082, + "num_tokens": 304468445.0, + "step": 739 + }, + { + "epoch": 0.28287461773700306, + "grad_norm": 0.24764984676801477, + "learning_rate": 8.57891803716958e-06, + "loss": 0.3864, + "num_tokens": 304914490.0, + "step": 740 + }, + { + "epoch": 0.28325688073394495, + "grad_norm": 0.2901084235574226, + "learning_rate": 8.57485178168721e-06, + "loss": 0.3652, + "num_tokens": 305284830.0, + "step": 741 + }, + { + "epoch": 0.28363914373088683, + "grad_norm": 0.270887607799054, + "learning_rate": 8.57078081119133e-06, + "loss": 0.3723, + "num_tokens": 305699531.0, + "step": 742 + }, + { + "epoch": 0.2840214067278288, + "grad_norm": 0.2855770410971349, + "learning_rate": 8.566705131924413e-06, + "loss": 0.3781, + "num_tokens": 306133130.0, + "step": 743 + }, + { + "epoch": 0.28440366972477066, + "grad_norm": 0.24434254572257388, + "learning_rate": 8.56262475013615e-06, + "loss": 0.3861, + "num_tokens": 306579839.0, + "step": 744 + }, + { + "epoch": 0.28478593272171254, + "grad_norm": 0.2770541943061836, + "learning_rate": 8.558539672083448e-06, + "loss": 0.374, + "num_tokens": 306985329.0, + "step": 745 + }, + { + "epoch": 0.2851681957186544, + "grad_norm": 0.28714250206636605, + "learning_rate": 8.554449904030416e-06, + "loss": 0.396, + "num_tokens": 307426256.0, + "step": 746 + }, + { + "epoch": 0.2855504587155963, + "grad_norm": 0.27453262609510837, + "learning_rate": 8.550355452248347e-06, + "loss": 0.3919, + "num_tokens": 307835931.0, + "step": 747 + }, + { + "epoch": 0.28593272171253825, + "grad_norm": 0.31968552433669295, + "learning_rate": 8.546256323015723e-06, + "loss": 0.3672, + "num_tokens": 308236698.0, + "step": 748 + }, + { + "epoch": 0.28631498470948014, + "grad_norm": 0.32285894167829227, + "learning_rate": 8.542152522618196e-06, + "loss": 0.4107, + "num_tokens": 308695472.0, + "step": 749 + }, + { + "epoch": 0.286697247706422, + "grad_norm": 0.32952689427958964, + "learning_rate": 8.538044057348585e-06, + "loss": 0.386, + "num_tokens": 309148218.0, + "step": 750 + }, + { + "epoch": 0.2870795107033639, + "grad_norm": 0.31927865084718376, + "learning_rate": 8.533930933506854e-06, + "loss": 0.3685, + "num_tokens": 309526917.0, + "step": 751 + }, + { + "epoch": 0.2874617737003058, + "grad_norm": 0.33372841073858306, + "learning_rate": 8.529813157400116e-06, + "loss": 0.4004, + "num_tokens": 309961971.0, + "step": 752 + }, + { + "epoch": 0.28784403669724773, + "grad_norm": 0.32449695145638346, + "learning_rate": 8.525690735342618e-06, + "loss": 0.3837, + "num_tokens": 310359029.0, + "step": 753 + }, + { + "epoch": 0.2882262996941896, + "grad_norm": 0.32159938945060784, + "learning_rate": 8.52156367365573e-06, + "loss": 0.3939, + "num_tokens": 310779312.0, + "step": 754 + }, + { + "epoch": 0.2886085626911315, + "grad_norm": 0.3457882631189294, + "learning_rate": 8.517431978667934e-06, + "loss": 0.3934, + "num_tokens": 311175439.0, + "step": 755 + }, + { + "epoch": 0.2889908256880734, + "grad_norm": 0.2988611167440456, + "learning_rate": 8.513295656714822e-06, + "loss": 0.3784, + "num_tokens": 311598407.0, + "step": 756 + }, + { + "epoch": 0.28937308868501527, + "grad_norm": 0.31001475480325874, + "learning_rate": 8.509154714139077e-06, + "loss": 0.376, + "num_tokens": 311983732.0, + "step": 757 + }, + { + "epoch": 0.2897553516819572, + "grad_norm": 0.36449002448110834, + "learning_rate": 8.50500915729047e-06, + "loss": 0.3818, + "num_tokens": 312405697.0, + "step": 758 + }, + { + "epoch": 0.2901376146788991, + "grad_norm": 0.268266230214534, + "learning_rate": 8.50085899252584e-06, + "loss": 0.4083, + "num_tokens": 312870943.0, + "step": 759 + }, + { + "epoch": 0.290519877675841, + "grad_norm": 0.2930844917493806, + "learning_rate": 8.496704226209107e-06, + "loss": 0.3789, + "num_tokens": 313232099.0, + "step": 760 + }, + { + "epoch": 0.29090214067278286, + "grad_norm": 0.32348150559034744, + "learning_rate": 8.492544864711234e-06, + "loss": 0.3851, + "num_tokens": 313660300.0, + "step": 761 + }, + { + "epoch": 0.29128440366972475, + "grad_norm": 0.32745516459013124, + "learning_rate": 8.48838091441023e-06, + "loss": 0.3893, + "num_tokens": 314103038.0, + "step": 762 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 0.2661326440165071, + "learning_rate": 8.484212381691154e-06, + "loss": 0.3703, + "num_tokens": 314515318.0, + "step": 763 + }, + { + "epoch": 0.29204892966360857, + "grad_norm": 0.3050658620939666, + "learning_rate": 8.480039272946076e-06, + "loss": 0.384, + "num_tokens": 314917771.0, + "step": 764 + }, + { + "epoch": 0.29243119266055045, + "grad_norm": 0.2967550087655357, + "learning_rate": 8.47586159457409e-06, + "loss": 0.3942, + "num_tokens": 315346405.0, + "step": 765 + }, + { + "epoch": 0.29281345565749234, + "grad_norm": 0.2810427141876906, + "learning_rate": 8.471679352981297e-06, + "loss": 0.3753, + "num_tokens": 315759265.0, + "step": 766 + }, + { + "epoch": 0.2931957186544342, + "grad_norm": 0.2948213047299878, + "learning_rate": 8.467492554580797e-06, + "loss": 0.4113, + "num_tokens": 316195881.0, + "step": 767 + }, + { + "epoch": 0.29357798165137616, + "grad_norm": 0.3579677906421847, + "learning_rate": 8.463301205792675e-06, + "loss": 0.3805, + "num_tokens": 316606942.0, + "step": 768 + }, + { + "epoch": 0.29396024464831805, + "grad_norm": 0.2564193853033097, + "learning_rate": 8.45910531304399e-06, + "loss": 0.3939, + "num_tokens": 317006053.0, + "step": 769 + }, + { + "epoch": 0.29434250764525993, + "grad_norm": 0.2638439702177443, + "learning_rate": 8.45490488276878e-06, + "loss": 0.3775, + "num_tokens": 317374484.0, + "step": 770 + }, + { + "epoch": 0.2947247706422018, + "grad_norm": 0.29373435359595196, + "learning_rate": 8.450699921408026e-06, + "loss": 0.3871, + "num_tokens": 317789255.0, + "step": 771 + }, + { + "epoch": 0.29510703363914376, + "grad_norm": 0.2884764316999055, + "learning_rate": 8.44649043540967e-06, + "loss": 0.3678, + "num_tokens": 318176220.0, + "step": 772 + }, + { + "epoch": 0.29548929663608564, + "grad_norm": 0.2693005981826586, + "learning_rate": 8.442276431228585e-06, + "loss": 0.3928, + "num_tokens": 318583073.0, + "step": 773 + }, + { + "epoch": 0.2958715596330275, + "grad_norm": 0.30183890055638374, + "learning_rate": 8.438057915326573e-06, + "loss": 0.4048, + "num_tokens": 319028991.0, + "step": 774 + }, + { + "epoch": 0.2962538226299694, + "grad_norm": 0.30575919247306826, + "learning_rate": 8.433834894172359e-06, + "loss": 0.3692, + "num_tokens": 319402762.0, + "step": 775 + }, + { + "epoch": 0.2966360856269113, + "grad_norm": 0.28600415998443984, + "learning_rate": 8.429607374241567e-06, + "loss": 0.3822, + "num_tokens": 319810973.0, + "step": 776 + }, + { + "epoch": 0.29701834862385323, + "grad_norm": 0.31645424914189696, + "learning_rate": 8.425375362016729e-06, + "loss": 0.4085, + "num_tokens": 320239788.0, + "step": 777 + }, + { + "epoch": 0.2974006116207951, + "grad_norm": 0.3030857077387875, + "learning_rate": 8.421138863987262e-06, + "loss": 0.384, + "num_tokens": 320647245.0, + "step": 778 + }, + { + "epoch": 0.297782874617737, + "grad_norm": 0.3428064915470499, + "learning_rate": 8.416897886649462e-06, + "loss": 0.3671, + "num_tokens": 321026601.0, + "step": 779 + }, + { + "epoch": 0.2981651376146789, + "grad_norm": 0.27782104974462735, + "learning_rate": 8.412652436506492e-06, + "loss": 0.3837, + "num_tokens": 321430085.0, + "step": 780 + }, + { + "epoch": 0.2985474006116208, + "grad_norm": 0.2745067472708174, + "learning_rate": 8.408402520068371e-06, + "loss": 0.3669, + "num_tokens": 321818195.0, + "step": 781 + }, + { + "epoch": 0.2989296636085627, + "grad_norm": 0.32885045001492175, + "learning_rate": 8.404148143851977e-06, + "loss": 0.3954, + "num_tokens": 322243468.0, + "step": 782 + }, + { + "epoch": 0.2993119266055046, + "grad_norm": 0.3230025919038844, + "learning_rate": 8.399889314381016e-06, + "loss": 0.3628, + "num_tokens": 322638724.0, + "step": 783 + }, + { + "epoch": 0.2996941896024465, + "grad_norm": 0.28737563103911107, + "learning_rate": 8.395626038186027e-06, + "loss": 0.3793, + "num_tokens": 323058170.0, + "step": 784 + }, + { + "epoch": 0.30007645259938837, + "grad_norm": 0.31428939590706056, + "learning_rate": 8.391358321804367e-06, + "loss": 0.3726, + "num_tokens": 323463264.0, + "step": 785 + }, + { + "epoch": 0.30045871559633025, + "grad_norm": 0.2823391673320193, + "learning_rate": 8.387086171780204e-06, + "loss": 0.3838, + "num_tokens": 323873845.0, + "step": 786 + }, + { + "epoch": 0.3008409785932722, + "grad_norm": 0.2889922907590903, + "learning_rate": 8.382809594664502e-06, + "loss": 0.396, + "num_tokens": 324277009.0, + "step": 787 + }, + { + "epoch": 0.3012232415902141, + "grad_norm": 0.27363369623365413, + "learning_rate": 8.378528597015011e-06, + "loss": 0.3916, + "num_tokens": 324664966.0, + "step": 788 + }, + { + "epoch": 0.30160550458715596, + "grad_norm": 0.2955633045303292, + "learning_rate": 8.374243185396265e-06, + "loss": 0.3827, + "num_tokens": 325090059.0, + "step": 789 + }, + { + "epoch": 0.30198776758409784, + "grad_norm": 0.28679650634294424, + "learning_rate": 8.369953366379567e-06, + "loss": 0.3714, + "num_tokens": 325521271.0, + "step": 790 + }, + { + "epoch": 0.30237003058103973, + "grad_norm": 0.2889791268461004, + "learning_rate": 8.365659146542973e-06, + "loss": 0.3585, + "num_tokens": 325935003.0, + "step": 791 + }, + { + "epoch": 0.30275229357798167, + "grad_norm": 0.3098637952148606, + "learning_rate": 8.361360532471287e-06, + "loss": 0.3638, + "num_tokens": 326337208.0, + "step": 792 + }, + { + "epoch": 0.30313455657492355, + "grad_norm": 0.2866305101051465, + "learning_rate": 8.357057530756055e-06, + "loss": 0.3917, + "num_tokens": 326778364.0, + "step": 793 + }, + { + "epoch": 0.30351681957186544, + "grad_norm": 0.31695790126832774, + "learning_rate": 8.352750147995552e-06, + "loss": 0.3849, + "num_tokens": 327176568.0, + "step": 794 + }, + { + "epoch": 0.3038990825688073, + "grad_norm": 0.27524735027753755, + "learning_rate": 8.34843839079477e-06, + "loss": 0.3484, + "num_tokens": 327587243.0, + "step": 795 + }, + { + "epoch": 0.30428134556574926, + "grad_norm": 0.3408168675009145, + "learning_rate": 8.344122265765404e-06, + "loss": 0.3702, + "num_tokens": 327981583.0, + "step": 796 + }, + { + "epoch": 0.30466360856269115, + "grad_norm": 0.35280196139374664, + "learning_rate": 8.33980177952585e-06, + "loss": 0.3731, + "num_tokens": 328372050.0, + "step": 797 + }, + { + "epoch": 0.30504587155963303, + "grad_norm": 0.3116024236215618, + "learning_rate": 8.335476938701195e-06, + "loss": 0.3912, + "num_tokens": 328795145.0, + "step": 798 + }, + { + "epoch": 0.3054281345565749, + "grad_norm": 0.3190795295981246, + "learning_rate": 8.331147749923199e-06, + "loss": 0.3929, + "num_tokens": 329239546.0, + "step": 799 + }, + { + "epoch": 0.3058103975535168, + "grad_norm": 0.3413594692377354, + "learning_rate": 8.326814219830291e-06, + "loss": 0.3862, + "num_tokens": 329658125.0, + "step": 800 + }, + { + "epoch": 0.30619266055045874, + "grad_norm": 0.2968394469777086, + "learning_rate": 8.322476355067556e-06, + "loss": 0.3752, + "num_tokens": 330116826.0, + "step": 801 + }, + { + "epoch": 0.3065749235474006, + "grad_norm": 0.3303435870483519, + "learning_rate": 8.318134162286726e-06, + "loss": 0.3952, + "num_tokens": 330547109.0, + "step": 802 + }, + { + "epoch": 0.3069571865443425, + "grad_norm": 0.2798893906222307, + "learning_rate": 8.31378764814617e-06, + "loss": 0.3427, + "num_tokens": 330918289.0, + "step": 803 + }, + { + "epoch": 0.3073394495412844, + "grad_norm": 0.3012914244458347, + "learning_rate": 8.309436819310884e-06, + "loss": 0.3783, + "num_tokens": 331353693.0, + "step": 804 + }, + { + "epoch": 0.3077217125382263, + "grad_norm": 0.2597769895819, + "learning_rate": 8.30508168245248e-06, + "loss": 0.3761, + "num_tokens": 331747510.0, + "step": 805 + }, + { + "epoch": 0.3081039755351682, + "grad_norm": 0.30247055960385544, + "learning_rate": 8.300722244249174e-06, + "loss": 0.3606, + "num_tokens": 332105174.0, + "step": 806 + }, + { + "epoch": 0.3084862385321101, + "grad_norm": 0.29519748138924146, + "learning_rate": 8.296358511385778e-06, + "loss": 0.3851, + "num_tokens": 332527760.0, + "step": 807 + }, + { + "epoch": 0.308868501529052, + "grad_norm": 0.3112737830105025, + "learning_rate": 8.291990490553696e-06, + "loss": 0.3873, + "num_tokens": 332932733.0, + "step": 808 + }, + { + "epoch": 0.30925076452599387, + "grad_norm": 0.25537381687849214, + "learning_rate": 8.287618188450896e-06, + "loss": 0.3805, + "num_tokens": 333386922.0, + "step": 809 + }, + { + "epoch": 0.30963302752293576, + "grad_norm": 0.2954356875281406, + "learning_rate": 8.283241611781922e-06, + "loss": 0.3712, + "num_tokens": 333770824.0, + "step": 810 + }, + { + "epoch": 0.3100152905198777, + "grad_norm": 0.2759798212966893, + "learning_rate": 8.278860767257865e-06, + "loss": 0.3651, + "num_tokens": 334174271.0, + "step": 811 + }, + { + "epoch": 0.3103975535168196, + "grad_norm": 0.2827857374899756, + "learning_rate": 8.274475661596361e-06, + "loss": 0.3718, + "num_tokens": 334558524.0, + "step": 812 + }, + { + "epoch": 0.31077981651376146, + "grad_norm": 0.33900576718314124, + "learning_rate": 8.270086301521587e-06, + "loss": 0.3894, + "num_tokens": 334953337.0, + "step": 813 + }, + { + "epoch": 0.31116207951070335, + "grad_norm": 0.2801043854468926, + "learning_rate": 8.265692693764235e-06, + "loss": 0.3755, + "num_tokens": 335362712.0, + "step": 814 + }, + { + "epoch": 0.31154434250764523, + "grad_norm": 0.26568231939707143, + "learning_rate": 8.261294845061516e-06, + "loss": 0.3883, + "num_tokens": 335772396.0, + "step": 815 + }, + { + "epoch": 0.3119266055045872, + "grad_norm": 0.2669515365575605, + "learning_rate": 8.256892762157141e-06, + "loss": 0.3886, + "num_tokens": 336167450.0, + "step": 816 + }, + { + "epoch": 0.31230886850152906, + "grad_norm": 0.2928797893508793, + "learning_rate": 8.252486451801315e-06, + "loss": 0.3475, + "num_tokens": 336530899.0, + "step": 817 + }, + { + "epoch": 0.31269113149847094, + "grad_norm": 0.33902844016401745, + "learning_rate": 8.24807592075073e-06, + "loss": 0.3771, + "num_tokens": 336947791.0, + "step": 818 + }, + { + "epoch": 0.3130733944954128, + "grad_norm": 0.30928114189623757, + "learning_rate": 8.24366117576854e-06, + "loss": 0.378, + "num_tokens": 337380589.0, + "step": 819 + }, + { + "epoch": 0.31345565749235477, + "grad_norm": 0.2774468482510209, + "learning_rate": 8.23924222362437e-06, + "loss": 0.3774, + "num_tokens": 337810888.0, + "step": 820 + }, + { + "epoch": 0.31383792048929665, + "grad_norm": 0.35037212766222336, + "learning_rate": 8.234819071094289e-06, + "loss": 0.3871, + "num_tokens": 338208041.0, + "step": 821 + }, + { + "epoch": 0.31422018348623854, + "grad_norm": 0.34283706081215903, + "learning_rate": 8.230391724960814e-06, + "loss": 0.3996, + "num_tokens": 338600734.0, + "step": 822 + }, + { + "epoch": 0.3146024464831804, + "grad_norm": 0.314867977039219, + "learning_rate": 8.225960192012887e-06, + "loss": 0.4094, + "num_tokens": 339007520.0, + "step": 823 + }, + { + "epoch": 0.3149847094801223, + "grad_norm": 0.301015476763335, + "learning_rate": 8.221524479045875e-06, + "loss": 0.3955, + "num_tokens": 339426478.0, + "step": 824 + }, + { + "epoch": 0.31536697247706424, + "grad_norm": 0.2928348904269848, + "learning_rate": 8.217084592861549e-06, + "loss": 0.3654, + "num_tokens": 339854612.0, + "step": 825 + }, + { + "epoch": 0.31574923547400613, + "grad_norm": 0.3087693238588224, + "learning_rate": 8.212640540268083e-06, + "loss": 0.4166, + "num_tokens": 340259865.0, + "step": 826 + }, + { + "epoch": 0.316131498470948, + "grad_norm": 0.2511929015522724, + "learning_rate": 8.208192328080038e-06, + "loss": 0.3701, + "num_tokens": 340670669.0, + "step": 827 + }, + { + "epoch": 0.3165137614678899, + "grad_norm": 0.2760545911932065, + "learning_rate": 8.203739963118358e-06, + "loss": 0.378, + "num_tokens": 341086664.0, + "step": 828 + }, + { + "epoch": 0.3168960244648318, + "grad_norm": 0.27724973865904723, + "learning_rate": 8.199283452210346e-06, + "loss": 0.3711, + "num_tokens": 341465346.0, + "step": 829 + }, + { + "epoch": 0.3172782874617737, + "grad_norm": 0.2656431822246982, + "learning_rate": 8.194822802189671e-06, + "loss": 0.3779, + "num_tokens": 341876848.0, + "step": 830 + }, + { + "epoch": 0.3176605504587156, + "grad_norm": 0.27088161710759145, + "learning_rate": 8.190358019896347e-06, + "loss": 0.3588, + "num_tokens": 342250241.0, + "step": 831 + }, + { + "epoch": 0.3180428134556575, + "grad_norm": 0.2854141493385398, + "learning_rate": 8.18588911217672e-06, + "loss": 0.4042, + "num_tokens": 342716991.0, + "step": 832 + }, + { + "epoch": 0.3184250764525994, + "grad_norm": 0.288111898140903, + "learning_rate": 8.181416085883467e-06, + "loss": 0.3937, + "num_tokens": 343096531.0, + "step": 833 + }, + { + "epoch": 0.31880733944954126, + "grad_norm": 0.2872876611851775, + "learning_rate": 8.176938947875577e-06, + "loss": 0.3968, + "num_tokens": 343521356.0, + "step": 834 + }, + { + "epoch": 0.3191896024464832, + "grad_norm": 0.28721408327635395, + "learning_rate": 8.172457705018347e-06, + "loss": 0.3801, + "num_tokens": 343940406.0, + "step": 835 + }, + { + "epoch": 0.3195718654434251, + "grad_norm": 0.28987605333774863, + "learning_rate": 8.167972364183365e-06, + "loss": 0.365, + "num_tokens": 344343179.0, + "step": 836 + }, + { + "epoch": 0.31995412844036697, + "grad_norm": 0.2686670132156234, + "learning_rate": 8.163482932248507e-06, + "loss": 0.3688, + "num_tokens": 344772760.0, + "step": 837 + }, + { + "epoch": 0.32033639143730885, + "grad_norm": 0.3091785764889582, + "learning_rate": 8.15898941609792e-06, + "loss": 0.3839, + "num_tokens": 345172609.0, + "step": 838 + }, + { + "epoch": 0.32071865443425074, + "grad_norm": 0.2830381386223011, + "learning_rate": 8.154491822622013e-06, + "loss": 0.3852, + "num_tokens": 345593278.0, + "step": 839 + }, + { + "epoch": 0.3211009174311927, + "grad_norm": 0.27614061174230775, + "learning_rate": 8.149990158717448e-06, + "loss": 0.3948, + "num_tokens": 346006505.0, + "step": 840 + }, + { + "epoch": 0.32148318042813456, + "grad_norm": 0.31682696550944794, + "learning_rate": 8.14548443128713e-06, + "loss": 0.3749, + "num_tokens": 346410269.0, + "step": 841 + }, + { + "epoch": 0.32186544342507645, + "grad_norm": 0.27666500804697863, + "learning_rate": 8.140974647240194e-06, + "loss": 0.3973, + "num_tokens": 346839973.0, + "step": 842 + }, + { + "epoch": 0.32224770642201833, + "grad_norm": 0.27084010090748667, + "learning_rate": 8.136460813491992e-06, + "loss": 0.3754, + "num_tokens": 347236802.0, + "step": 843 + }, + { + "epoch": 0.32262996941896027, + "grad_norm": 0.30049596788616667, + "learning_rate": 8.131942936964095e-06, + "loss": 0.3733, + "num_tokens": 347662619.0, + "step": 844 + }, + { + "epoch": 0.32301223241590216, + "grad_norm": 0.2747454301894514, + "learning_rate": 8.127421024584262e-06, + "loss": 0.3916, + "num_tokens": 348089652.0, + "step": 845 + }, + { + "epoch": 0.32339449541284404, + "grad_norm": 0.2898628721090736, + "learning_rate": 8.122895083286452e-06, + "loss": 0.3704, + "num_tokens": 348463724.0, + "step": 846 + }, + { + "epoch": 0.3237767584097859, + "grad_norm": 0.31609406768503884, + "learning_rate": 8.11836512001079e-06, + "loss": 0.3811, + "num_tokens": 348865207.0, + "step": 847 + }, + { + "epoch": 0.3241590214067278, + "grad_norm": 0.2750981588647409, + "learning_rate": 8.113831141703576e-06, + "loss": 0.3649, + "num_tokens": 349296137.0, + "step": 848 + }, + { + "epoch": 0.32454128440366975, + "grad_norm": 0.2798833771485608, + "learning_rate": 8.109293155317267e-06, + "loss": 0.3817, + "num_tokens": 349732349.0, + "step": 849 + }, + { + "epoch": 0.32492354740061163, + "grad_norm": 0.33848303426295, + "learning_rate": 8.104751167810463e-06, + "loss": 0.3876, + "num_tokens": 350150520.0, + "step": 850 + }, + { + "epoch": 0.3253058103975535, + "grad_norm": 0.29957109777987384, + "learning_rate": 8.100205186147899e-06, + "loss": 0.3735, + "num_tokens": 350512173.0, + "step": 851 + }, + { + "epoch": 0.3256880733944954, + "grad_norm": 0.29006854786153796, + "learning_rate": 8.095655217300439e-06, + "loss": 0.3803, + "num_tokens": 350938344.0, + "step": 852 + }, + { + "epoch": 0.3260703363914373, + "grad_norm": 0.28646832984831927, + "learning_rate": 8.091101268245057e-06, + "loss": 0.3714, + "num_tokens": 351355419.0, + "step": 853 + }, + { + "epoch": 0.3264525993883792, + "grad_norm": 0.2856206241346726, + "learning_rate": 8.086543345964833e-06, + "loss": 0.3787, + "num_tokens": 351791806.0, + "step": 854 + }, + { + "epoch": 0.3268348623853211, + "grad_norm": 0.2875238330480725, + "learning_rate": 8.081981457448935e-06, + "loss": 0.3614, + "num_tokens": 352219956.0, + "step": 855 + }, + { + "epoch": 0.327217125382263, + "grad_norm": 0.24624973609663148, + "learning_rate": 8.077415609692617e-06, + "loss": 0.3785, + "num_tokens": 352658717.0, + "step": 856 + }, + { + "epoch": 0.3275993883792049, + "grad_norm": 0.2953148164584077, + "learning_rate": 8.072845809697205e-06, + "loss": 0.3805, + "num_tokens": 353039804.0, + "step": 857 + }, + { + "epoch": 0.32798165137614677, + "grad_norm": 0.3364361285995865, + "learning_rate": 8.06827206447008e-06, + "loss": 0.3818, + "num_tokens": 353431576.0, + "step": 858 + }, + { + "epoch": 0.3283639143730887, + "grad_norm": 0.3013234211312168, + "learning_rate": 8.06369438102468e-06, + "loss": 0.359, + "num_tokens": 353820728.0, + "step": 859 + }, + { + "epoch": 0.3287461773700306, + "grad_norm": 0.2701597137317539, + "learning_rate": 8.059112766380476e-06, + "loss": 0.3773, + "num_tokens": 354233950.0, + "step": 860 + }, + { + "epoch": 0.3291284403669725, + "grad_norm": 0.317078526479236, + "learning_rate": 8.05452722756297e-06, + "loss": 0.3692, + "num_tokens": 354663652.0, + "step": 861 + }, + { + "epoch": 0.32951070336391436, + "grad_norm": 0.3203424189404694, + "learning_rate": 8.04993777160368e-06, + "loss": 0.38, + "num_tokens": 355038922.0, + "step": 862 + }, + { + "epoch": 0.32989296636085624, + "grad_norm": 0.3205784823288829, + "learning_rate": 8.04534440554013e-06, + "loss": 0.4008, + "num_tokens": 355472124.0, + "step": 863 + }, + { + "epoch": 0.3302752293577982, + "grad_norm": 0.3187031346293902, + "learning_rate": 8.040747136415843e-06, + "loss": 0.3845, + "num_tokens": 355866387.0, + "step": 864 + }, + { + "epoch": 0.33065749235474007, + "grad_norm": 0.30219183265667704, + "learning_rate": 8.036145971280325e-06, + "loss": 0.3776, + "num_tokens": 356267929.0, + "step": 865 + }, + { + "epoch": 0.33103975535168195, + "grad_norm": 0.2581633352096232, + "learning_rate": 8.031540917189056e-06, + "loss": 0.3733, + "num_tokens": 356677738.0, + "step": 866 + }, + { + "epoch": 0.33142201834862384, + "grad_norm": 0.2749408289166452, + "learning_rate": 8.026931981203477e-06, + "loss": 0.3715, + "num_tokens": 357091895.0, + "step": 867 + }, + { + "epoch": 0.3318042813455658, + "grad_norm": 0.2922281465058763, + "learning_rate": 8.022319170390987e-06, + "loss": 0.42, + "num_tokens": 357506950.0, + "step": 868 + }, + { + "epoch": 0.33218654434250766, + "grad_norm": 0.27818724227876357, + "learning_rate": 8.017702491824924e-06, + "loss": 0.3785, + "num_tokens": 357946281.0, + "step": 869 + }, + { + "epoch": 0.33256880733944955, + "grad_norm": 0.282792537961623, + "learning_rate": 8.013081952584555e-06, + "loss": 0.3924, + "num_tokens": 358373468.0, + "step": 870 + }, + { + "epoch": 0.33295107033639143, + "grad_norm": 0.2965260117127636, + "learning_rate": 8.00845755975507e-06, + "loss": 0.4193, + "num_tokens": 358804364.0, + "step": 871 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.28160253414681796, + "learning_rate": 8.003829320427564e-06, + "loss": 0.3889, + "num_tokens": 359247482.0, + "step": 872 + }, + { + "epoch": 0.33371559633027525, + "grad_norm": 0.4947661139081739, + "learning_rate": 7.999197241699035e-06, + "loss": 0.3387, + "num_tokens": 359327082.0, + "step": 873 + }, + { + "epoch": 1.0003822629969419, + "grad_norm": 0.2734509081072418, + "learning_rate": 7.994561330672367e-06, + "loss": 0.3826, + "num_tokens": 359778882.0, + "step": 874 + }, + { + "epoch": 1.0007645259938838, + "grad_norm": 0.24753083362156889, + "learning_rate": 7.989921594456318e-06, + "loss": 0.3645, + "num_tokens": 360186765.0, + "step": 875 + }, + { + "epoch": 1.0011467889908257, + "grad_norm": 0.24033503142348886, + "learning_rate": 7.985278040165519e-06, + "loss": 0.3666, + "num_tokens": 360615600.0, + "step": 876 + }, + { + "epoch": 1.0015290519877675, + "grad_norm": 0.26373030832153915, + "learning_rate": 7.980630674920445e-06, + "loss": 0.3663, + "num_tokens": 361032900.0, + "step": 877 + }, + { + "epoch": 1.0019113149847094, + "grad_norm": 0.2509618885004304, + "learning_rate": 7.97597950584742e-06, + "loss": 0.3796, + "num_tokens": 361524678.0, + "step": 878 + }, + { + "epoch": 1.0022935779816513, + "grad_norm": 0.2871807042607093, + "learning_rate": 7.9713245400786e-06, + "loss": 0.3793, + "num_tokens": 361949558.0, + "step": 879 + }, + { + "epoch": 1.0026758409785932, + "grad_norm": 0.2709460830140365, + "learning_rate": 7.966665784751969e-06, + "loss": 0.3583, + "num_tokens": 362329075.0, + "step": 880 + }, + { + "epoch": 1.003058103975535, + "grad_norm": 0.2583704557425144, + "learning_rate": 7.96200324701131e-06, + "loss": 0.4021, + "num_tokens": 362742516.0, + "step": 881 + }, + { + "epoch": 1.003440366972477, + "grad_norm": 0.29727131810007523, + "learning_rate": 7.957336934006218e-06, + "loss": 0.3779, + "num_tokens": 363148339.0, + "step": 882 + }, + { + "epoch": 1.003822629969419, + "grad_norm": 0.28671444584050393, + "learning_rate": 7.952666852892069e-06, + "loss": 0.3656, + "num_tokens": 363580955.0, + "step": 883 + }, + { + "epoch": 1.004204892966361, + "grad_norm": 0.2991736165753305, + "learning_rate": 7.947993010830021e-06, + "loss": 0.3972, + "num_tokens": 363997869.0, + "step": 884 + }, + { + "epoch": 1.0045871559633028, + "grad_norm": 0.29466946874459293, + "learning_rate": 7.943315414986998e-06, + "loss": 0.3966, + "num_tokens": 364378013.0, + "step": 885 + }, + { + "epoch": 1.0049694189602447, + "grad_norm": 0.28255851877647997, + "learning_rate": 7.938634072535675e-06, + "loss": 0.375, + "num_tokens": 364776227.0, + "step": 886 + }, + { + "epoch": 1.0053516819571866, + "grad_norm": 0.25214491049081006, + "learning_rate": 7.933948990654485e-06, + "loss": 0.3951, + "num_tokens": 365197610.0, + "step": 887 + }, + { + "epoch": 1.0057339449541285, + "grad_norm": 0.28402684870030326, + "learning_rate": 7.92926017652758e-06, + "loss": 0.3613, + "num_tokens": 365598931.0, + "step": 888 + }, + { + "epoch": 1.0061162079510704, + "grad_norm": 0.2886360741100843, + "learning_rate": 7.924567637344847e-06, + "loss": 0.3662, + "num_tokens": 365981513.0, + "step": 889 + }, + { + "epoch": 1.0064984709480123, + "grad_norm": 0.28937066229234853, + "learning_rate": 7.919871380301878e-06, + "loss": 0.3937, + "num_tokens": 366412673.0, + "step": 890 + }, + { + "epoch": 1.0068807339449541, + "grad_norm": 0.2690398404128952, + "learning_rate": 7.91517141259997e-06, + "loss": 0.3389, + "num_tokens": 366815104.0, + "step": 891 + }, + { + "epoch": 1.007262996941896, + "grad_norm": 0.2752084701888421, + "learning_rate": 7.910467741446106e-06, + "loss": 0.377, + "num_tokens": 367200683.0, + "step": 892 + }, + { + "epoch": 1.007645259938838, + "grad_norm": 0.26906854351119347, + "learning_rate": 7.905760374052952e-06, + "loss": 0.3983, + "num_tokens": 367652495.0, + "step": 893 + }, + { + "epoch": 1.0080275229357798, + "grad_norm": 0.29980489244560266, + "learning_rate": 7.901049317638836e-06, + "loss": 0.365, + "num_tokens": 368065671.0, + "step": 894 + }, + { + "epoch": 1.0084097859327217, + "grad_norm": 0.3341095053422161, + "learning_rate": 7.89633457942775e-06, + "loss": 0.3782, + "num_tokens": 368488502.0, + "step": 895 + }, + { + "epoch": 1.0087920489296636, + "grad_norm": 0.3217500819097353, + "learning_rate": 7.891616166649329e-06, + "loss": 0.3627, + "num_tokens": 368896724.0, + "step": 896 + }, + { + "epoch": 1.0091743119266054, + "grad_norm": 0.250391334834949, + "learning_rate": 7.886894086538841e-06, + "loss": 0.3694, + "num_tokens": 369321921.0, + "step": 897 + }, + { + "epoch": 1.0095565749235473, + "grad_norm": 0.28692989476923897, + "learning_rate": 7.88216834633718e-06, + "loss": 0.3771, + "num_tokens": 369795320.0, + "step": 898 + }, + { + "epoch": 1.0099388379204892, + "grad_norm": 0.2918893341159398, + "learning_rate": 7.87743895329085e-06, + "loss": 0.373, + "num_tokens": 370185879.0, + "step": 899 + }, + { + "epoch": 1.010321100917431, + "grad_norm": 0.28420288318568054, + "learning_rate": 7.872705914651955e-06, + "loss": 0.3451, + "num_tokens": 370578759.0, + "step": 900 + }, + { + "epoch": 1.010703363914373, + "grad_norm": 0.2812802793741803, + "learning_rate": 7.867969237678194e-06, + "loss": 0.3484, + "num_tokens": 370965932.0, + "step": 901 + }, + { + "epoch": 1.011085626911315, + "grad_norm": 0.3069653253627977, + "learning_rate": 7.863228929632843e-06, + "loss": 0.3556, + "num_tokens": 371367178.0, + "step": 902 + }, + { + "epoch": 1.011467889908257, + "grad_norm": 0.2744791427749656, + "learning_rate": 7.858484997784745e-06, + "loss": 0.3433, + "num_tokens": 371716486.0, + "step": 903 + }, + { + "epoch": 1.0118501529051989, + "grad_norm": 0.27383270760349376, + "learning_rate": 7.853737449408301e-06, + "loss": 0.355, + "num_tokens": 372088863.0, + "step": 904 + }, + { + "epoch": 1.0122324159021407, + "grad_norm": 0.26813880667511547, + "learning_rate": 7.848986291783454e-06, + "loss": 0.3483, + "num_tokens": 372481920.0, + "step": 905 + }, + { + "epoch": 1.0126146788990826, + "grad_norm": 0.286545151058124, + "learning_rate": 7.844231532195686e-06, + "loss": 0.3826, + "num_tokens": 372903843.0, + "step": 906 + }, + { + "epoch": 1.0129969418960245, + "grad_norm": 0.2936452434650887, + "learning_rate": 7.839473177936004e-06, + "loss": 0.3645, + "num_tokens": 373301571.0, + "step": 907 + }, + { + "epoch": 1.0133792048929664, + "grad_norm": 0.2915841703093686, + "learning_rate": 7.83471123630092e-06, + "loss": 0.3336, + "num_tokens": 373701131.0, + "step": 908 + }, + { + "epoch": 1.0137614678899083, + "grad_norm": 0.30035560938560874, + "learning_rate": 7.82994571459245e-06, + "loss": 0.3462, + "num_tokens": 374103993.0, + "step": 909 + }, + { + "epoch": 1.0141437308868502, + "grad_norm": 0.25518956745177324, + "learning_rate": 7.825176620118103e-06, + "loss": 0.3554, + "num_tokens": 374505973.0, + "step": 910 + }, + { + "epoch": 1.014525993883792, + "grad_norm": 0.24625564275750858, + "learning_rate": 7.820403960190862e-06, + "loss": 0.3576, + "num_tokens": 374935502.0, + "step": 911 + }, + { + "epoch": 1.014908256880734, + "grad_norm": 0.2857341089305771, + "learning_rate": 7.815627742129183e-06, + "loss": 0.3492, + "num_tokens": 375321168.0, + "step": 912 + }, + { + "epoch": 1.0152905198776758, + "grad_norm": 0.2705785892172911, + "learning_rate": 7.81084797325697e-06, + "loss": 0.3328, + "num_tokens": 375756536.0, + "step": 913 + }, + { + "epoch": 1.0156727828746177, + "grad_norm": 0.2698558723883506, + "learning_rate": 7.806064660903579e-06, + "loss": 0.3272, + "num_tokens": 376136227.0, + "step": 914 + }, + { + "epoch": 1.0160550458715596, + "grad_norm": 0.2854014853385893, + "learning_rate": 7.801277812403794e-06, + "loss": 0.3559, + "num_tokens": 376538633.0, + "step": 915 + }, + { + "epoch": 1.0164373088685015, + "grad_norm": 0.3001335963169468, + "learning_rate": 7.79648743509783e-06, + "loss": 0.3564, + "num_tokens": 376998983.0, + "step": 916 + }, + { + "epoch": 1.0168195718654434, + "grad_norm": 0.26179049903477386, + "learning_rate": 7.791693536331299e-06, + "loss": 0.356, + "num_tokens": 377420810.0, + "step": 917 + }, + { + "epoch": 1.0172018348623852, + "grad_norm": 0.2609701605371836, + "learning_rate": 7.786896123455227e-06, + "loss": 0.319, + "num_tokens": 377839448.0, + "step": 918 + }, + { + "epoch": 1.0175840978593271, + "grad_norm": 0.2812961344848241, + "learning_rate": 7.782095203826022e-06, + "loss": 0.3367, + "num_tokens": 378223203.0, + "step": 919 + }, + { + "epoch": 1.017966360856269, + "grad_norm": 0.28434805329784335, + "learning_rate": 7.777290784805469e-06, + "loss": 0.3492, + "num_tokens": 378629012.0, + "step": 920 + }, + { + "epoch": 1.018348623853211, + "grad_norm": 0.2710089416075033, + "learning_rate": 7.77248287376072e-06, + "loss": 0.3457, + "num_tokens": 379042477.0, + "step": 921 + }, + { + "epoch": 1.018730886850153, + "grad_norm": 0.2640759914063179, + "learning_rate": 7.767671478064282e-06, + "loss": 0.3517, + "num_tokens": 379458084.0, + "step": 922 + }, + { + "epoch": 1.019113149847095, + "grad_norm": 0.2532537472995236, + "learning_rate": 7.762856605094004e-06, + "loss": 0.3578, + "num_tokens": 379890219.0, + "step": 923 + }, + { + "epoch": 1.0194954128440368, + "grad_norm": 0.28121821182530193, + "learning_rate": 7.75803826223307e-06, + "loss": 0.3447, + "num_tokens": 380299884.0, + "step": 924 + }, + { + "epoch": 1.0198776758409787, + "grad_norm": 0.2621071680462542, + "learning_rate": 7.753216456869984e-06, + "loss": 0.3421, + "num_tokens": 380701959.0, + "step": 925 + }, + { + "epoch": 1.0202599388379205, + "grad_norm": 0.2569297932469686, + "learning_rate": 7.748391196398557e-06, + "loss": 0.3226, + "num_tokens": 381087095.0, + "step": 926 + }, + { + "epoch": 1.0206422018348624, + "grad_norm": 0.23416106523920066, + "learning_rate": 7.743562488217901e-06, + "loss": 0.3034, + "num_tokens": 381477222.0, + "step": 927 + }, + { + "epoch": 1.0210244648318043, + "grad_norm": 0.2896357187748173, + "learning_rate": 7.73873033973241e-06, + "loss": 0.329, + "num_tokens": 381878904.0, + "step": 928 + }, + { + "epoch": 1.0214067278287462, + "grad_norm": 0.2865746031683812, + "learning_rate": 7.733894758351758e-06, + "loss": 0.3372, + "num_tokens": 382252804.0, + "step": 929 + }, + { + "epoch": 1.021788990825688, + "grad_norm": 0.28781775267915927, + "learning_rate": 7.729055751490882e-06, + "loss": 0.3445, + "num_tokens": 382629461.0, + "step": 930 + }, + { + "epoch": 1.02217125382263, + "grad_norm": 0.30186630563608535, + "learning_rate": 7.724213326569972e-06, + "loss": 0.352, + "num_tokens": 383041658.0, + "step": 931 + }, + { + "epoch": 1.0225535168195719, + "grad_norm": 0.3144099896682969, + "learning_rate": 7.71936749101446e-06, + "loss": 0.3387, + "num_tokens": 383434312.0, + "step": 932 + }, + { + "epoch": 1.0229357798165137, + "grad_norm": 0.26972322408190974, + "learning_rate": 7.714518252255005e-06, + "loss": 0.3428, + "num_tokens": 383851671.0, + "step": 933 + }, + { + "epoch": 1.0233180428134556, + "grad_norm": 0.2799243756691746, + "learning_rate": 7.709665617727485e-06, + "loss": 0.3498, + "num_tokens": 384313170.0, + "step": 934 + }, + { + "epoch": 1.0237003058103975, + "grad_norm": 0.2633832494849653, + "learning_rate": 7.704809594872991e-06, + "loss": 0.3401, + "num_tokens": 384714741.0, + "step": 935 + }, + { + "epoch": 1.0240825688073394, + "grad_norm": 0.29294206756598, + "learning_rate": 7.699950191137798e-06, + "loss": 0.3272, + "num_tokens": 385131983.0, + "step": 936 + }, + { + "epoch": 1.0244648318042813, + "grad_norm": 0.2567301436914333, + "learning_rate": 7.695087413973377e-06, + "loss": 0.3485, + "num_tokens": 385535082.0, + "step": 937 + }, + { + "epoch": 1.0248470948012232, + "grad_norm": 0.2512906138528013, + "learning_rate": 7.690221270836366e-06, + "loss": 0.3275, + "num_tokens": 385893609.0, + "step": 938 + }, + { + "epoch": 1.025229357798165, + "grad_norm": 0.2844207843120261, + "learning_rate": 7.685351769188566e-06, + "loss": 0.3723, + "num_tokens": 386312005.0, + "step": 939 + }, + { + "epoch": 1.025611620795107, + "grad_norm": 0.2694868452031438, + "learning_rate": 7.680478916496927e-06, + "loss": 0.3245, + "num_tokens": 386721961.0, + "step": 940 + }, + { + "epoch": 1.025993883792049, + "grad_norm": 0.24963732233345717, + "learning_rate": 7.675602720233537e-06, + "loss": 0.3053, + "num_tokens": 387123208.0, + "step": 941 + }, + { + "epoch": 1.026376146788991, + "grad_norm": 0.25437576273061274, + "learning_rate": 7.670723187875613e-06, + "loss": 0.3494, + "num_tokens": 387550738.0, + "step": 942 + }, + { + "epoch": 1.0267584097859328, + "grad_norm": 0.3254244914560424, + "learning_rate": 7.665840326905488e-06, + "loss": 0.3527, + "num_tokens": 387963120.0, + "step": 943 + }, + { + "epoch": 1.0271406727828747, + "grad_norm": 0.2768164362835791, + "learning_rate": 7.660954144810597e-06, + "loss": 0.3258, + "num_tokens": 388364109.0, + "step": 944 + }, + { + "epoch": 1.0275229357798166, + "grad_norm": 0.2592936147856663, + "learning_rate": 7.656064649083466e-06, + "loss": 0.3315, + "num_tokens": 388763900.0, + "step": 945 + }, + { + "epoch": 1.0279051987767585, + "grad_norm": 0.28398369657063827, + "learning_rate": 7.651171847221708e-06, + "loss": 0.3328, + "num_tokens": 389162377.0, + "step": 946 + }, + { + "epoch": 1.0282874617737003, + "grad_norm": 0.27232610027878973, + "learning_rate": 7.646275746728002e-06, + "loss": 0.3566, + "num_tokens": 389603011.0, + "step": 947 + }, + { + "epoch": 1.0286697247706422, + "grad_norm": 0.24364177031427756, + "learning_rate": 7.641376355110085e-06, + "loss": 0.3396, + "num_tokens": 390040259.0, + "step": 948 + }, + { + "epoch": 1.029051987767584, + "grad_norm": 0.2680865117005245, + "learning_rate": 7.636473679880741e-06, + "loss": 0.3411, + "num_tokens": 390412441.0, + "step": 949 + }, + { + "epoch": 1.029434250764526, + "grad_norm": 0.26385915833986956, + "learning_rate": 7.631567728557788e-06, + "loss": 0.3199, + "num_tokens": 390810673.0, + "step": 950 + }, + { + "epoch": 1.0298165137614679, + "grad_norm": 0.2892762005360835, + "learning_rate": 7.62665850866407e-06, + "loss": 0.3296, + "num_tokens": 391179538.0, + "step": 951 + }, + { + "epoch": 1.0301987767584098, + "grad_norm": 0.3150005886158518, + "learning_rate": 7.621746027727442e-06, + "loss": 0.3505, + "num_tokens": 391580592.0, + "step": 952 + }, + { + "epoch": 1.0305810397553516, + "grad_norm": 0.2803165087915309, + "learning_rate": 7.616830293280758e-06, + "loss": 0.3438, + "num_tokens": 391959563.0, + "step": 953 + }, + { + "epoch": 1.0309633027522935, + "grad_norm": 0.3142488912094723, + "learning_rate": 7.611911312861865e-06, + "loss": 0.3589, + "num_tokens": 392363161.0, + "step": 954 + }, + { + "epoch": 1.0313455657492354, + "grad_norm": 0.26591786809601603, + "learning_rate": 7.606989094013583e-06, + "loss": 0.3283, + "num_tokens": 392792758.0, + "step": 955 + }, + { + "epoch": 1.0317278287461773, + "grad_norm": 0.27468645724232027, + "learning_rate": 7.6020636442837e-06, + "loss": 0.3499, + "num_tokens": 393225384.0, + "step": 956 + }, + { + "epoch": 1.0321100917431192, + "grad_norm": 0.2770314893451792, + "learning_rate": 7.59713497122496e-06, + "loss": 0.3413, + "num_tokens": 393609743.0, + "step": 957 + }, + { + "epoch": 1.032492354740061, + "grad_norm": 0.2823466583279306, + "learning_rate": 7.592203082395044e-06, + "loss": 0.3105, + "num_tokens": 393978914.0, + "step": 958 + }, + { + "epoch": 1.032874617737003, + "grad_norm": 0.279883822597775, + "learning_rate": 7.58726798535657e-06, + "loss": 0.3326, + "num_tokens": 394358923.0, + "step": 959 + }, + { + "epoch": 1.033256880733945, + "grad_norm": 0.22973511777141034, + "learning_rate": 7.582329687677073e-06, + "loss": 0.3196, + "num_tokens": 394782828.0, + "step": 960 + }, + { + "epoch": 1.033639143730887, + "grad_norm": 0.3228999620454348, + "learning_rate": 7.5773881969289965e-06, + "loss": 0.3632, + "num_tokens": 395238022.0, + "step": 961 + }, + { + "epoch": 1.0340214067278288, + "grad_norm": 0.29840503700608834, + "learning_rate": 7.572443520689679e-06, + "loss": 0.3408, + "num_tokens": 395610624.0, + "step": 962 + }, + { + "epoch": 1.0344036697247707, + "grad_norm": 0.31720403006364906, + "learning_rate": 7.567495666541343e-06, + "loss": 0.3152, + "num_tokens": 395983412.0, + "step": 963 + }, + { + "epoch": 1.0347859327217126, + "grad_norm": 0.28276133101593437, + "learning_rate": 7.562544642071089e-06, + "loss": 0.3244, + "num_tokens": 396371663.0, + "step": 964 + }, + { + "epoch": 1.0351681957186545, + "grad_norm": 0.2862852498925155, + "learning_rate": 7.557590454870874e-06, + "loss": 0.3487, + "num_tokens": 396812040.0, + "step": 965 + }, + { + "epoch": 1.0355504587155964, + "grad_norm": 0.2594406121328056, + "learning_rate": 7.552633112537506e-06, + "loss": 0.3277, + "num_tokens": 397205130.0, + "step": 966 + }, + { + "epoch": 1.0359327217125383, + "grad_norm": 0.30884809717977374, + "learning_rate": 7.547672622672633e-06, + "loss": 0.3263, + "num_tokens": 397597594.0, + "step": 967 + }, + { + "epoch": 1.0363149847094801, + "grad_norm": 0.2570141156560367, + "learning_rate": 7.5427089928827255e-06, + "loss": 0.3266, + "num_tokens": 397976903.0, + "step": 968 + }, + { + "epoch": 1.036697247706422, + "grad_norm": 0.2611937830544222, + "learning_rate": 7.537742230779075e-06, + "loss": 0.3451, + "num_tokens": 398400133.0, + "step": 969 + }, + { + "epoch": 1.037079510703364, + "grad_norm": 0.3053857048182125, + "learning_rate": 7.532772343977767e-06, + "loss": 0.3313, + "num_tokens": 398835764.0, + "step": 970 + }, + { + "epoch": 1.0374617737003058, + "grad_norm": 0.30072622369984425, + "learning_rate": 7.527799340099687e-06, + "loss": 0.305, + "num_tokens": 399212102.0, + "step": 971 + }, + { + "epoch": 1.0378440366972477, + "grad_norm": 0.25083005192378344, + "learning_rate": 7.522823226770497e-06, + "loss": 0.3235, + "num_tokens": 399606442.0, + "step": 972 + }, + { + "epoch": 1.0382262996941896, + "grad_norm": 0.3058278156655185, + "learning_rate": 7.517844011620628e-06, + "loss": 0.3414, + "num_tokens": 400034094.0, + "step": 973 + }, + { + "epoch": 1.0386085626911314, + "grad_norm": 0.303813502122588, + "learning_rate": 7.512861702285262e-06, + "loss": 0.3202, + "num_tokens": 400445503.0, + "step": 974 + }, + { + "epoch": 1.0389908256880733, + "grad_norm": 0.3601892276904088, + "learning_rate": 7.507876306404336e-06, + "loss": 0.3372, + "num_tokens": 400781994.0, + "step": 975 + }, + { + "epoch": 1.0393730886850152, + "grad_norm": 0.3252179447507799, + "learning_rate": 7.502887831622509e-06, + "loss": 0.3482, + "num_tokens": 401195592.0, + "step": 976 + }, + { + "epoch": 1.039755351681957, + "grad_norm": 0.298817915856359, + "learning_rate": 7.497896285589171e-06, + "loss": 0.3328, + "num_tokens": 401582962.0, + "step": 977 + }, + { + "epoch": 1.040137614678899, + "grad_norm": 0.3028218108837741, + "learning_rate": 7.492901675958413e-06, + "loss": 0.3178, + "num_tokens": 401970540.0, + "step": 978 + }, + { + "epoch": 1.040519877675841, + "grad_norm": 0.2879379126692796, + "learning_rate": 7.48790401038903e-06, + "loss": 0.3148, + "num_tokens": 402363696.0, + "step": 979 + }, + { + "epoch": 1.040902140672783, + "grad_norm": 0.31801831475517145, + "learning_rate": 7.482903296544499e-06, + "loss": 0.3242, + "num_tokens": 402761886.0, + "step": 980 + }, + { + "epoch": 1.0412844036697249, + "grad_norm": 0.2896183268507168, + "learning_rate": 7.477899542092975e-06, + "loss": 0.3418, + "num_tokens": 403193812.0, + "step": 981 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.3176839263544348, + "learning_rate": 7.47289275470727e-06, + "loss": 0.3533, + "num_tokens": 403605536.0, + "step": 982 + }, + { + "epoch": 1.0420489296636086, + "grad_norm": 0.3245265992977043, + "learning_rate": 7.46788294206485e-06, + "loss": 0.3353, + "num_tokens": 403989064.0, + "step": 983 + }, + { + "epoch": 1.0424311926605505, + "grad_norm": 0.28697975569096, + "learning_rate": 7.462870111847823e-06, + "loss": 0.3386, + "num_tokens": 404401513.0, + "step": 984 + }, + { + "epoch": 1.0428134556574924, + "grad_norm": 0.25372418601934416, + "learning_rate": 7.45785427174292e-06, + "loss": 0.3415, + "num_tokens": 404875606.0, + "step": 985 + }, + { + "epoch": 1.0431957186544343, + "grad_norm": 0.3026099073947782, + "learning_rate": 7.4528354294414885e-06, + "loss": 0.354, + "num_tokens": 405301869.0, + "step": 986 + }, + { + "epoch": 1.0435779816513762, + "grad_norm": 0.281992100959585, + "learning_rate": 7.447813592639481e-06, + "loss": 0.3303, + "num_tokens": 405714339.0, + "step": 987 + }, + { + "epoch": 1.043960244648318, + "grad_norm": 0.2662856762998438, + "learning_rate": 7.442788769037439e-06, + "loss": 0.3435, + "num_tokens": 406166888.0, + "step": 988 + }, + { + "epoch": 1.04434250764526, + "grad_norm": 0.29760993090936005, + "learning_rate": 7.437760966340483e-06, + "loss": 0.3232, + "num_tokens": 406579055.0, + "step": 989 + }, + { + "epoch": 1.0447247706422018, + "grad_norm": 0.2839517525387195, + "learning_rate": 7.43273019225831e-06, + "loss": 0.3318, + "num_tokens": 406984198.0, + "step": 990 + }, + { + "epoch": 1.0451070336391437, + "grad_norm": 0.24744264406023414, + "learning_rate": 7.427696454505162e-06, + "loss": 0.3119, + "num_tokens": 407385414.0, + "step": 991 + }, + { + "epoch": 1.0454892966360856, + "grad_norm": 0.291270628975271, + "learning_rate": 7.422659760799835e-06, + "loss": 0.3441, + "num_tokens": 407816649.0, + "step": 992 + }, + { + "epoch": 1.0458715596330275, + "grad_norm": 0.2911709274219056, + "learning_rate": 7.417620118865653e-06, + "loss": 0.3353, + "num_tokens": 408239160.0, + "step": 993 + }, + { + "epoch": 1.0462538226299694, + "grad_norm": 0.3144480766185622, + "learning_rate": 7.4125775364304586e-06, + "loss": 0.3488, + "num_tokens": 408664284.0, + "step": 994 + }, + { + "epoch": 1.0466360856269112, + "grad_norm": 0.27495056161252873, + "learning_rate": 7.40753202122661e-06, + "loss": 0.3274, + "num_tokens": 409108871.0, + "step": 995 + }, + { + "epoch": 1.0470183486238531, + "grad_norm": 0.2739623181688787, + "learning_rate": 7.402483580990958e-06, + "loss": 0.3593, + "num_tokens": 409557140.0, + "step": 996 + }, + { + "epoch": 1.047400611620795, + "grad_norm": 0.31174056121420535, + "learning_rate": 7.3974322234648375e-06, + "loss": 0.3534, + "num_tokens": 409956138.0, + "step": 997 + }, + { + "epoch": 1.0477828746177371, + "grad_norm": 0.31339641948473407, + "learning_rate": 7.3923779563940616e-06, + "loss": 0.3305, + "num_tokens": 410379715.0, + "step": 998 + }, + { + "epoch": 1.048165137614679, + "grad_norm": 0.2740355101361742, + "learning_rate": 7.387320787528902e-06, + "loss": 0.3155, + "num_tokens": 410785387.0, + "step": 999 + }, + { + "epoch": 1.0485474006116209, + "grad_norm": 0.33580055239639545, + "learning_rate": 7.382260724624079e-06, + "loss": 0.3528, + "num_tokens": 411244242.0, + "step": 1000 + }, + { + "epoch": 1.0489296636085628, + "grad_norm": 0.3575548281906381, + "learning_rate": 7.377197775438752e-06, + "loss": 0.3219, + "num_tokens": 411634682.0, + "step": 1001 + }, + { + "epoch": 1.0493119266055047, + "grad_norm": 0.31109545583851445, + "learning_rate": 7.372131947736507e-06, + "loss": 0.3358, + "num_tokens": 412044013.0, + "step": 1002 + }, + { + "epoch": 1.0496941896024465, + "grad_norm": 0.2465494986141242, + "learning_rate": 7.3670632492853435e-06, + "loss": 0.3333, + "num_tokens": 412471591.0, + "step": 1003 + }, + { + "epoch": 1.0500764525993884, + "grad_norm": 0.3491131967698, + "learning_rate": 7.361991687857662e-06, + "loss": 0.3459, + "num_tokens": 412836021.0, + "step": 1004 + }, + { + "epoch": 1.0504587155963303, + "grad_norm": 0.3258010706493882, + "learning_rate": 7.356917271230253e-06, + "loss": 0.3437, + "num_tokens": 413260199.0, + "step": 1005 + }, + { + "epoch": 1.0508409785932722, + "grad_norm": 0.2891870649721951, + "learning_rate": 7.351840007184288e-06, + "loss": 0.3391, + "num_tokens": 413690591.0, + "step": 1006 + }, + { + "epoch": 1.051223241590214, + "grad_norm": 0.2615086606826415, + "learning_rate": 7.3467599035053005e-06, + "loss": 0.3356, + "num_tokens": 414094467.0, + "step": 1007 + }, + { + "epoch": 1.051605504587156, + "grad_norm": 0.24690199397563625, + "learning_rate": 7.341676967983182e-06, + "loss": 0.3439, + "num_tokens": 414526259.0, + "step": 1008 + }, + { + "epoch": 1.0519877675840978, + "grad_norm": 0.2702869615782497, + "learning_rate": 7.336591208412165e-06, + "loss": 0.3139, + "num_tokens": 414896654.0, + "step": 1009 + }, + { + "epoch": 1.0523700305810397, + "grad_norm": 0.2647541961108, + "learning_rate": 7.331502632590814e-06, + "loss": 0.3395, + "num_tokens": 415309627.0, + "step": 1010 + }, + { + "epoch": 1.0527522935779816, + "grad_norm": 0.2939050431827775, + "learning_rate": 7.326411248322008e-06, + "loss": 0.3203, + "num_tokens": 415728436.0, + "step": 1011 + }, + { + "epoch": 1.0531345565749235, + "grad_norm": 0.26523517485597786, + "learning_rate": 7.321317063412935e-06, + "loss": 0.3479, + "num_tokens": 416170204.0, + "step": 1012 + }, + { + "epoch": 1.0535168195718654, + "grad_norm": 0.25734848311290703, + "learning_rate": 7.316220085675078e-06, + "loss": 0.3318, + "num_tokens": 416586132.0, + "step": 1013 + }, + { + "epoch": 1.0538990825688073, + "grad_norm": 0.24521474713180183, + "learning_rate": 7.3111203229242e-06, + "loss": 0.3199, + "num_tokens": 416981645.0, + "step": 1014 + }, + { + "epoch": 1.0542813455657492, + "grad_norm": 0.27862044652919127, + "learning_rate": 7.30601778298034e-06, + "loss": 0.3214, + "num_tokens": 417391963.0, + "step": 1015 + }, + { + "epoch": 1.054663608562691, + "grad_norm": 0.2694085842544258, + "learning_rate": 7.30091247366779e-06, + "loss": 0.3066, + "num_tokens": 417805751.0, + "step": 1016 + }, + { + "epoch": 1.0550458715596331, + "grad_norm": 0.25785893913958374, + "learning_rate": 7.295804402815094e-06, + "loss": 0.3313, + "num_tokens": 418191155.0, + "step": 1017 + }, + { + "epoch": 1.055428134556575, + "grad_norm": 0.28146510726280544, + "learning_rate": 7.2906935782550235e-06, + "loss": 0.3362, + "num_tokens": 418592947.0, + "step": 1018 + }, + { + "epoch": 1.055810397553517, + "grad_norm": 0.2590923198365843, + "learning_rate": 7.285580007824577e-06, + "loss": 0.3436, + "num_tokens": 419041069.0, + "step": 1019 + }, + { + "epoch": 1.0561926605504588, + "grad_norm": 0.2650615562014495, + "learning_rate": 7.280463699364963e-06, + "loss": 0.3341, + "num_tokens": 419443162.0, + "step": 1020 + }, + { + "epoch": 1.0565749235474007, + "grad_norm": 0.2733208858566881, + "learning_rate": 7.27534466072159e-06, + "loss": 0.3588, + "num_tokens": 419855925.0, + "step": 1021 + }, + { + "epoch": 1.0569571865443426, + "grad_norm": 0.29360693062281557, + "learning_rate": 7.2702228997440494e-06, + "loss": 0.359, + "num_tokens": 420288291.0, + "step": 1022 + }, + { + "epoch": 1.0573394495412844, + "grad_norm": 0.250084621080211, + "learning_rate": 7.26509842428611e-06, + "loss": 0.3244, + "num_tokens": 420749736.0, + "step": 1023 + }, + { + "epoch": 1.0577217125382263, + "grad_norm": 0.27454331462264997, + "learning_rate": 7.259971242205702e-06, + "loss": 0.329, + "num_tokens": 421209874.0, + "step": 1024 + }, + { + "epoch": 1.0581039755351682, + "grad_norm": 0.27205923554652683, + "learning_rate": 7.2548413613649086e-06, + "loss": 0.3269, + "num_tokens": 421618221.0, + "step": 1025 + }, + { + "epoch": 1.05848623853211, + "grad_norm": 0.28898461698944294, + "learning_rate": 7.249708789629944e-06, + "loss": 0.3605, + "num_tokens": 422061571.0, + "step": 1026 + }, + { + "epoch": 1.058868501529052, + "grad_norm": 0.31891925249008934, + "learning_rate": 7.2445735348711564e-06, + "loss": 0.3516, + "num_tokens": 422511924.0, + "step": 1027 + }, + { + "epoch": 1.0592507645259939, + "grad_norm": 0.2888571653496046, + "learning_rate": 7.239435604963004e-06, + "loss": 0.359, + "num_tokens": 422944077.0, + "step": 1028 + }, + { + "epoch": 1.0596330275229358, + "grad_norm": 0.25417299656038544, + "learning_rate": 7.23429500778405e-06, + "loss": 0.3067, + "num_tokens": 423310711.0, + "step": 1029 + }, + { + "epoch": 1.0600152905198776, + "grad_norm": 0.2699266871932084, + "learning_rate": 7.229151751216944e-06, + "loss": 0.3325, + "num_tokens": 423751067.0, + "step": 1030 + }, + { + "epoch": 1.0603975535168195, + "grad_norm": 0.2932375497886189, + "learning_rate": 7.224005843148419e-06, + "loss": 0.3078, + "num_tokens": 424127163.0, + "step": 1031 + }, + { + "epoch": 1.0607798165137614, + "grad_norm": 0.2868996918061548, + "learning_rate": 7.2188572914692645e-06, + "loss": 0.3105, + "num_tokens": 424552821.0, + "step": 1032 + }, + { + "epoch": 1.0611620795107033, + "grad_norm": 0.27593803646172965, + "learning_rate": 7.213706104074335e-06, + "loss": 0.3453, + "num_tokens": 424979967.0, + "step": 1033 + }, + { + "epoch": 1.0615443425076452, + "grad_norm": 0.2928973784172809, + "learning_rate": 7.208552288862519e-06, + "loss": 0.3435, + "num_tokens": 425398115.0, + "step": 1034 + }, + { + "epoch": 1.061926605504587, + "grad_norm": 0.3109627358650455, + "learning_rate": 7.203395853736736e-06, + "loss": 0.3133, + "num_tokens": 425791141.0, + "step": 1035 + }, + { + "epoch": 1.062308868501529, + "grad_norm": 0.3166862036218244, + "learning_rate": 7.198236806603923e-06, + "loss": 0.3124, + "num_tokens": 426202151.0, + "step": 1036 + }, + { + "epoch": 1.0626911314984708, + "grad_norm": 0.24198574037841442, + "learning_rate": 7.193075155375027e-06, + "loss": 0.3368, + "num_tokens": 426630910.0, + "step": 1037 + }, + { + "epoch": 1.063073394495413, + "grad_norm": 0.25856614778208076, + "learning_rate": 7.187910907964979e-06, + "loss": 0.3488, + "num_tokens": 427079230.0, + "step": 1038 + }, + { + "epoch": 1.0634556574923548, + "grad_norm": 0.2858819417264152, + "learning_rate": 7.1827440722927e-06, + "loss": 0.3143, + "num_tokens": 427446664.0, + "step": 1039 + }, + { + "epoch": 1.0638379204892967, + "grad_norm": 0.2568596426141947, + "learning_rate": 7.177574656281075e-06, + "loss": 0.3145, + "num_tokens": 427832021.0, + "step": 1040 + }, + { + "epoch": 1.0642201834862386, + "grad_norm": 0.285783214827123, + "learning_rate": 7.1724026678569455e-06, + "loss": 0.3379, + "num_tokens": 428274589.0, + "step": 1041 + }, + { + "epoch": 1.0646024464831805, + "grad_norm": 0.26642781031203505, + "learning_rate": 7.167228114951099e-06, + "loss": 0.3511, + "num_tokens": 428689563.0, + "step": 1042 + }, + { + "epoch": 1.0649847094801224, + "grad_norm": 0.24368239933074232, + "learning_rate": 7.162051005498256e-06, + "loss": 0.3276, + "num_tokens": 429080719.0, + "step": 1043 + }, + { + "epoch": 1.0653669724770642, + "grad_norm": 0.2405054119061207, + "learning_rate": 7.156871347437056e-06, + "loss": 0.3154, + "num_tokens": 429463841.0, + "step": 1044 + }, + { + "epoch": 1.0657492354740061, + "grad_norm": 0.2669615213950939, + "learning_rate": 7.151689148710046e-06, + "loss": 0.3311, + "num_tokens": 429902093.0, + "step": 1045 + }, + { + "epoch": 1.066131498470948, + "grad_norm": 0.30028158018143036, + "learning_rate": 7.146504417263671e-06, + "loss": 0.3258, + "num_tokens": 430311699.0, + "step": 1046 + }, + { + "epoch": 1.06651376146789, + "grad_norm": 0.2835524117938337, + "learning_rate": 7.141317161048259e-06, + "loss": 0.3452, + "num_tokens": 430729269.0, + "step": 1047 + }, + { + "epoch": 1.0668960244648318, + "grad_norm": 0.28441046173696943, + "learning_rate": 7.13612738801801e-06, + "loss": 0.3238, + "num_tokens": 431122929.0, + "step": 1048 + }, + { + "epoch": 1.0672782874617737, + "grad_norm": 0.2941622402641557, + "learning_rate": 7.130935106130977e-06, + "loss": 0.3662, + "num_tokens": 431574807.0, + "step": 1049 + }, + { + "epoch": 1.0676605504587156, + "grad_norm": 0.2645377216026675, + "learning_rate": 7.125740323349071e-06, + "loss": 0.3415, + "num_tokens": 432020720.0, + "step": 1050 + }, + { + "epoch": 1.0680428134556574, + "grad_norm": 0.2584199445816942, + "learning_rate": 7.120543047638031e-06, + "loss": 0.3146, + "num_tokens": 432432725.0, + "step": 1051 + }, + { + "epoch": 1.0684250764525993, + "grad_norm": 0.2472446276124087, + "learning_rate": 7.11534328696742e-06, + "loss": 0.3364, + "num_tokens": 432890398.0, + "step": 1052 + }, + { + "epoch": 1.0688073394495412, + "grad_norm": 0.2857455439767263, + "learning_rate": 7.1101410493106096e-06, + "loss": 0.347, + "num_tokens": 433313480.0, + "step": 1053 + }, + { + "epoch": 1.069189602446483, + "grad_norm": 0.26309498057198827, + "learning_rate": 7.104936342644774e-06, + "loss": 0.3297, + "num_tokens": 433726737.0, + "step": 1054 + }, + { + "epoch": 1.069571865443425, + "grad_norm": 0.2768248416790847, + "learning_rate": 7.099729174950869e-06, + "loss": 0.3258, + "num_tokens": 434091412.0, + "step": 1055 + }, + { + "epoch": 1.0699541284403669, + "grad_norm": 0.24844614730162984, + "learning_rate": 7.094519554213629e-06, + "loss": 0.3063, + "num_tokens": 434448735.0, + "step": 1056 + }, + { + "epoch": 1.070336391437309, + "grad_norm": 0.2547043213317315, + "learning_rate": 7.089307488421544e-06, + "loss": 0.3194, + "num_tokens": 434858826.0, + "step": 1057 + }, + { + "epoch": 1.0707186544342508, + "grad_norm": 0.2662362041579405, + "learning_rate": 7.084092985566858e-06, + "loss": 0.328, + "num_tokens": 435225633.0, + "step": 1058 + }, + { + "epoch": 1.0711009174311927, + "grad_norm": 0.2610068712434011, + "learning_rate": 7.078876053645551e-06, + "loss": 0.3324, + "num_tokens": 435617788.0, + "step": 1059 + }, + { + "epoch": 1.0714831804281346, + "grad_norm": 0.26580000514352164, + "learning_rate": 7.073656700657325e-06, + "loss": 0.3302, + "num_tokens": 436032473.0, + "step": 1060 + }, + { + "epoch": 1.0718654434250765, + "grad_norm": 0.30399536846309827, + "learning_rate": 7.0684349346056004e-06, + "loss": 0.334, + "num_tokens": 436471585.0, + "step": 1061 + }, + { + "epoch": 1.0722477064220184, + "grad_norm": 0.28058977464944784, + "learning_rate": 7.063210763497489e-06, + "loss": 0.3282, + "num_tokens": 436886926.0, + "step": 1062 + }, + { + "epoch": 1.0726299694189603, + "grad_norm": 0.3149299892495931, + "learning_rate": 7.057984195343799e-06, + "loss": 0.3471, + "num_tokens": 437279830.0, + "step": 1063 + }, + { + "epoch": 1.0730122324159022, + "grad_norm": 0.30988094289704926, + "learning_rate": 7.0527552381590085e-06, + "loss": 0.323, + "num_tokens": 437699465.0, + "step": 1064 + }, + { + "epoch": 1.073394495412844, + "grad_norm": 0.33901552174408217, + "learning_rate": 7.047523899961264e-06, + "loss": 0.3595, + "num_tokens": 438112147.0, + "step": 1065 + }, + { + "epoch": 1.073776758409786, + "grad_norm": 0.2935244288076594, + "learning_rate": 7.042290188772358e-06, + "loss": 0.3429, + "num_tokens": 438533408.0, + "step": 1066 + }, + { + "epoch": 1.0741590214067278, + "grad_norm": 0.2806315484525417, + "learning_rate": 7.037054112617726e-06, + "loss": 0.3384, + "num_tokens": 438965838.0, + "step": 1067 + }, + { + "epoch": 1.0745412844036697, + "grad_norm": 0.2589712218203007, + "learning_rate": 7.031815679526428e-06, + "loss": 0.3286, + "num_tokens": 439406677.0, + "step": 1068 + }, + { + "epoch": 1.0749235474006116, + "grad_norm": 0.29006896402147997, + "learning_rate": 7.026574897531137e-06, + "loss": 0.3319, + "num_tokens": 439772450.0, + "step": 1069 + }, + { + "epoch": 1.0753058103975535, + "grad_norm": 0.29855377380418946, + "learning_rate": 7.02133177466813e-06, + "loss": 0.3331, + "num_tokens": 440168051.0, + "step": 1070 + }, + { + "epoch": 1.0756880733944953, + "grad_norm": 0.256686766820325, + "learning_rate": 7.016086318977272e-06, + "loss": 0.3202, + "num_tokens": 440577649.0, + "step": 1071 + }, + { + "epoch": 1.0760703363914372, + "grad_norm": 0.28061455939121943, + "learning_rate": 7.0108385385020065e-06, + "loss": 0.3419, + "num_tokens": 440988939.0, + "step": 1072 + }, + { + "epoch": 1.0764525993883791, + "grad_norm": 0.26164118306674927, + "learning_rate": 7.005588441289342e-06, + "loss": 0.3178, + "num_tokens": 441383619.0, + "step": 1073 + }, + { + "epoch": 1.076834862385321, + "grad_norm": 0.28237693271596354, + "learning_rate": 7.000336035389835e-06, + "loss": 0.3066, + "num_tokens": 441766942.0, + "step": 1074 + }, + { + "epoch": 1.0772171253822629, + "grad_norm": 0.2918397421691369, + "learning_rate": 6.995081328857589e-06, + "loss": 0.3332, + "num_tokens": 442162255.0, + "step": 1075 + }, + { + "epoch": 1.077599388379205, + "grad_norm": 0.2937364277329519, + "learning_rate": 6.989824329750233e-06, + "loss": 0.3265, + "num_tokens": 442541326.0, + "step": 1076 + }, + { + "epoch": 1.0779816513761469, + "grad_norm": 0.26148526680868966, + "learning_rate": 6.984565046128907e-06, + "loss": 0.3372, + "num_tokens": 442954893.0, + "step": 1077 + }, + { + "epoch": 1.0783639143730888, + "grad_norm": 0.30040769042891446, + "learning_rate": 6.979303486058262e-06, + "loss": 0.3248, + "num_tokens": 443345264.0, + "step": 1078 + }, + { + "epoch": 1.0787461773700306, + "grad_norm": 0.275720903760283, + "learning_rate": 6.974039657606433e-06, + "loss": 0.3165, + "num_tokens": 443763105.0, + "step": 1079 + }, + { + "epoch": 1.0791284403669725, + "grad_norm": 0.274254162769265, + "learning_rate": 6.968773568845034e-06, + "loss": 0.3574, + "num_tokens": 444204611.0, + "step": 1080 + }, + { + "epoch": 1.0795107033639144, + "grad_norm": 0.2706788599110957, + "learning_rate": 6.96350522784915e-06, + "loss": 0.3277, + "num_tokens": 444623578.0, + "step": 1081 + }, + { + "epoch": 1.0798929663608563, + "grad_norm": 0.28109320432987295, + "learning_rate": 6.958234642697317e-06, + "loss": 0.3171, + "num_tokens": 445025320.0, + "step": 1082 + }, + { + "epoch": 1.0802752293577982, + "grad_norm": 0.2640973212756067, + "learning_rate": 6.952961821471509e-06, + "loss": 0.3381, + "num_tokens": 445423055.0, + "step": 1083 + }, + { + "epoch": 1.08065749235474, + "grad_norm": 0.2824450266139343, + "learning_rate": 6.9476867722571315e-06, + "loss": 0.3534, + "num_tokens": 445884360.0, + "step": 1084 + }, + { + "epoch": 1.081039755351682, + "grad_norm": 0.25219328692226556, + "learning_rate": 6.942409503143008e-06, + "loss": 0.3268, + "num_tokens": 446293395.0, + "step": 1085 + }, + { + "epoch": 1.0814220183486238, + "grad_norm": 0.3015921590154933, + "learning_rate": 6.9371300222213635e-06, + "loss": 0.329, + "num_tokens": 446671882.0, + "step": 1086 + }, + { + "epoch": 1.0818042813455657, + "grad_norm": 0.2847840294891782, + "learning_rate": 6.931848337587817e-06, + "loss": 0.3306, + "num_tokens": 447100754.0, + "step": 1087 + }, + { + "epoch": 1.0821865443425076, + "grad_norm": 0.29123957000494594, + "learning_rate": 6.926564457341362e-06, + "loss": 0.3257, + "num_tokens": 447469597.0, + "step": 1088 + }, + { + "epoch": 1.0825688073394495, + "grad_norm": 0.3096082604042419, + "learning_rate": 6.9212783895843625e-06, + "loss": 0.3602, + "num_tokens": 447855369.0, + "step": 1089 + }, + { + "epoch": 1.0829510703363914, + "grad_norm": 0.28343201862453954, + "learning_rate": 6.91599014242254e-06, + "loss": 0.3343, + "num_tokens": 448284173.0, + "step": 1090 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 0.27264911732441066, + "learning_rate": 6.910699723964951e-06, + "loss": 0.3098, + "num_tokens": 448657999.0, + "step": 1091 + }, + { + "epoch": 1.0837155963302751, + "grad_norm": 0.28459364230071305, + "learning_rate": 6.905407142323987e-06, + "loss": 0.3558, + "num_tokens": 449096539.0, + "step": 1092 + }, + { + "epoch": 1.084097859327217, + "grad_norm": 0.3236254342650189, + "learning_rate": 6.900112405615351e-06, + "loss": 0.2991, + "num_tokens": 449479035.0, + "step": 1093 + }, + { + "epoch": 1.084480122324159, + "grad_norm": 0.2998871923147371, + "learning_rate": 6.894815521958057e-06, + "loss": 0.3187, + "num_tokens": 449901410.0, + "step": 1094 + }, + { + "epoch": 1.084862385321101, + "grad_norm": 0.25020509687915393, + "learning_rate": 6.889516499474407e-06, + "loss": 0.2998, + "num_tokens": 450303511.0, + "step": 1095 + }, + { + "epoch": 1.085244648318043, + "grad_norm": 0.2723295474064236, + "learning_rate": 6.884215346289983e-06, + "loss": 0.344, + "num_tokens": 450769572.0, + "step": 1096 + }, + { + "epoch": 1.0856269113149848, + "grad_norm": 0.2957926391593118, + "learning_rate": 6.878912070533634e-06, + "loss": 0.323, + "num_tokens": 451175714.0, + "step": 1097 + }, + { + "epoch": 1.0860091743119267, + "grad_norm": 0.2949835512301882, + "learning_rate": 6.873606680337469e-06, + "loss": 0.3374, + "num_tokens": 451604043.0, + "step": 1098 + }, + { + "epoch": 1.0863914373088686, + "grad_norm": 0.2733994849516281, + "learning_rate": 6.8682991838368305e-06, + "loss": 0.3334, + "num_tokens": 452011791.0, + "step": 1099 + }, + { + "epoch": 1.0867737003058104, + "grad_norm": 0.30542115556871163, + "learning_rate": 6.862989589170299e-06, + "loss": 0.318, + "num_tokens": 452392077.0, + "step": 1100 + }, + { + "epoch": 1.0871559633027523, + "grad_norm": 0.31666497094524143, + "learning_rate": 6.857677904479667e-06, + "loss": 0.3192, + "num_tokens": 452821004.0, + "step": 1101 + }, + { + "epoch": 1.0875382262996942, + "grad_norm": 0.2866747926901651, + "learning_rate": 6.852364137909934e-06, + "loss": 0.3138, + "num_tokens": 453230045.0, + "step": 1102 + }, + { + "epoch": 1.087920489296636, + "grad_norm": 0.27947422912321956, + "learning_rate": 6.84704829760929e-06, + "loss": 0.354, + "num_tokens": 453639194.0, + "step": 1103 + }, + { + "epoch": 1.088302752293578, + "grad_norm": 0.3328205177355447, + "learning_rate": 6.841730391729108e-06, + "loss": 0.3178, + "num_tokens": 454028366.0, + "step": 1104 + }, + { + "epoch": 1.0886850152905199, + "grad_norm": 0.2938174101831714, + "learning_rate": 6.836410428423926e-06, + "loss": 0.3325, + "num_tokens": 454430246.0, + "step": 1105 + }, + { + "epoch": 1.0890672782874617, + "grad_norm": 0.28158318697487555, + "learning_rate": 6.831088415851438e-06, + "loss": 0.3273, + "num_tokens": 454857619.0, + "step": 1106 + }, + { + "epoch": 1.0894495412844036, + "grad_norm": 0.26633238664989395, + "learning_rate": 6.82576436217248e-06, + "loss": 0.3099, + "num_tokens": 455268901.0, + "step": 1107 + }, + { + "epoch": 1.0898318042813455, + "grad_norm": 0.2705623576027738, + "learning_rate": 6.82043827555102e-06, + "loss": 0.312, + "num_tokens": 455682258.0, + "step": 1108 + }, + { + "epoch": 1.0902140672782874, + "grad_norm": 0.300674501338249, + "learning_rate": 6.815110164154137e-06, + "loss": 0.3247, + "num_tokens": 456099244.0, + "step": 1109 + }, + { + "epoch": 1.0905963302752293, + "grad_norm": 0.2948151027329717, + "learning_rate": 6.8097800361520225e-06, + "loss": 0.3608, + "num_tokens": 456577860.0, + "step": 1110 + }, + { + "epoch": 1.0909785932721712, + "grad_norm": 0.24641383244158463, + "learning_rate": 6.804447899717955e-06, + "loss": 0.3091, + "num_tokens": 456981107.0, + "step": 1111 + }, + { + "epoch": 1.091360856269113, + "grad_norm": 0.2981663979325173, + "learning_rate": 6.799113763028296e-06, + "loss": 0.3387, + "num_tokens": 457409025.0, + "step": 1112 + }, + { + "epoch": 1.091743119266055, + "grad_norm": 0.2917083900879846, + "learning_rate": 6.793777634262471e-06, + "loss": 0.3274, + "num_tokens": 457818304.0, + "step": 1113 + }, + { + "epoch": 1.092125382262997, + "grad_norm": 0.25636005276714496, + "learning_rate": 6.788439521602962e-06, + "loss": 0.3078, + "num_tokens": 458260532.0, + "step": 1114 + }, + { + "epoch": 1.092507645259939, + "grad_norm": 0.26098346116048976, + "learning_rate": 6.783099433235295e-06, + "loss": 0.3236, + "num_tokens": 458683374.0, + "step": 1115 + }, + { + "epoch": 1.0928899082568808, + "grad_norm": 0.304653416954866, + "learning_rate": 6.777757377348023e-06, + "loss": 0.3491, + "num_tokens": 459134797.0, + "step": 1116 + }, + { + "epoch": 1.0932721712538227, + "grad_norm": 0.3043918583481912, + "learning_rate": 6.772413362132716e-06, + "loss": 0.313, + "num_tokens": 459516780.0, + "step": 1117 + }, + { + "epoch": 1.0936544342507646, + "grad_norm": 0.3482812716597782, + "learning_rate": 6.76706739578395e-06, + "loss": 0.3451, + "num_tokens": 459987984.0, + "step": 1118 + }, + { + "epoch": 1.0940366972477065, + "grad_norm": 0.3158493862761315, + "learning_rate": 6.761719486499288e-06, + "loss": 0.3333, + "num_tokens": 460413088.0, + "step": 1119 + }, + { + "epoch": 1.0944189602446484, + "grad_norm": 0.27121800419914927, + "learning_rate": 6.7563696424792834e-06, + "loss": 0.3131, + "num_tokens": 460792247.0, + "step": 1120 + }, + { + "epoch": 1.0948012232415902, + "grad_norm": 0.28843980654081725, + "learning_rate": 6.751017871927445e-06, + "loss": 0.3539, + "num_tokens": 461215906.0, + "step": 1121 + }, + { + "epoch": 1.0951834862385321, + "grad_norm": 0.31733394722713354, + "learning_rate": 6.745664183050242e-06, + "loss": 0.333, + "num_tokens": 461633956.0, + "step": 1122 + }, + { + "epoch": 1.095565749235474, + "grad_norm": 0.26863273468526633, + "learning_rate": 6.7403085840570785e-06, + "loss": 0.3247, + "num_tokens": 462027880.0, + "step": 1123 + }, + { + "epoch": 1.095948012232416, + "grad_norm": 0.27096029779507264, + "learning_rate": 6.7349510831603e-06, + "loss": 0.3286, + "num_tokens": 462423703.0, + "step": 1124 + }, + { + "epoch": 1.0963302752293578, + "grad_norm": 0.26480293378303876, + "learning_rate": 6.729591688575153e-06, + "loss": 0.3351, + "num_tokens": 462866690.0, + "step": 1125 + }, + { + "epoch": 1.0967125382262997, + "grad_norm": 0.2824549658394046, + "learning_rate": 6.7242304085198e-06, + "loss": 0.3464, + "num_tokens": 463290417.0, + "step": 1126 + }, + { + "epoch": 1.0970948012232415, + "grad_norm": 0.3008875140021378, + "learning_rate": 6.718867251215289e-06, + "loss": 0.3285, + "num_tokens": 463729495.0, + "step": 1127 + }, + { + "epoch": 1.0974770642201834, + "grad_norm": 0.31595417403811077, + "learning_rate": 6.713502224885549e-06, + "loss": 0.3305, + "num_tokens": 464139294.0, + "step": 1128 + }, + { + "epoch": 1.0978593272171253, + "grad_norm": 0.25631436510666517, + "learning_rate": 6.708135337757372e-06, + "loss": 0.3313, + "num_tokens": 464535185.0, + "step": 1129 + }, + { + "epoch": 1.0982415902140672, + "grad_norm": 0.2524615716569441, + "learning_rate": 6.702766598060408e-06, + "loss": 0.3278, + "num_tokens": 464946225.0, + "step": 1130 + }, + { + "epoch": 1.098623853211009, + "grad_norm": 0.2931682594698415, + "learning_rate": 6.697396014027141e-06, + "loss": 0.3292, + "num_tokens": 465409298.0, + "step": 1131 + }, + { + "epoch": 1.099006116207951, + "grad_norm": 0.28157768858053606, + "learning_rate": 6.692023593892889e-06, + "loss": 0.334, + "num_tokens": 465863347.0, + "step": 1132 + }, + { + "epoch": 1.099388379204893, + "grad_norm": 0.2638431557946797, + "learning_rate": 6.686649345895786e-06, + "loss": 0.3286, + "num_tokens": 466291299.0, + "step": 1133 + }, + { + "epoch": 1.099770642201835, + "grad_norm": 0.2632175472734525, + "learning_rate": 6.681273278276762e-06, + "loss": 0.3079, + "num_tokens": 466712230.0, + "step": 1134 + }, + { + "epoch": 1.1001529051987768, + "grad_norm": 0.26981361006172583, + "learning_rate": 6.675895399279546e-06, + "loss": 0.3198, + "num_tokens": 467099380.0, + "step": 1135 + }, + { + "epoch": 1.1005351681957187, + "grad_norm": 0.27521942940893157, + "learning_rate": 6.670515717150636e-06, + "loss": 0.3067, + "num_tokens": 467509555.0, + "step": 1136 + }, + { + "epoch": 1.1009174311926606, + "grad_norm": 0.2776913545044279, + "learning_rate": 6.665134240139302e-06, + "loss": 0.3182, + "num_tokens": 467873204.0, + "step": 1137 + }, + { + "epoch": 1.1012996941896025, + "grad_norm": 0.2958732079870988, + "learning_rate": 6.6597509764975635e-06, + "loss": 0.3169, + "num_tokens": 468265708.0, + "step": 1138 + }, + { + "epoch": 1.1016819571865444, + "grad_norm": 0.2822060921278334, + "learning_rate": 6.654365934480177e-06, + "loss": 0.3105, + "num_tokens": 468659042.0, + "step": 1139 + }, + { + "epoch": 1.1020642201834863, + "grad_norm": 0.2614068593879412, + "learning_rate": 6.648979122344631e-06, + "loss": 0.3428, + "num_tokens": 469064532.0, + "step": 1140 + }, + { + "epoch": 1.1024464831804281, + "grad_norm": 0.2511414073536006, + "learning_rate": 6.643590548351127e-06, + "loss": 0.3215, + "num_tokens": 469479167.0, + "step": 1141 + }, + { + "epoch": 1.10282874617737, + "grad_norm": 0.3196836181893, + "learning_rate": 6.638200220762563e-06, + "loss": 0.3309, + "num_tokens": 469907832.0, + "step": 1142 + }, + { + "epoch": 1.103211009174312, + "grad_norm": 0.33916588495808236, + "learning_rate": 6.632808147844535e-06, + "loss": 0.3326, + "num_tokens": 470319135.0, + "step": 1143 + }, + { + "epoch": 1.1035932721712538, + "grad_norm": 0.30402951280202245, + "learning_rate": 6.627414337865308e-06, + "loss": 0.3492, + "num_tokens": 470702885.0, + "step": 1144 + }, + { + "epoch": 1.1039755351681957, + "grad_norm": 0.24623565724155985, + "learning_rate": 6.622018799095811e-06, + "loss": 0.3329, + "num_tokens": 471124528.0, + "step": 1145 + }, + { + "epoch": 1.1043577981651376, + "grad_norm": 0.3273334270655994, + "learning_rate": 6.616621539809629e-06, + "loss": 0.3203, + "num_tokens": 471537772.0, + "step": 1146 + }, + { + "epoch": 1.1047400611620795, + "grad_norm": 0.29241340463709176, + "learning_rate": 6.61122256828298e-06, + "loss": 0.3681, + "num_tokens": 471990759.0, + "step": 1147 + }, + { + "epoch": 1.1051223241590213, + "grad_norm": 0.26545928082746434, + "learning_rate": 6.6058218927947114e-06, + "loss": 0.348, + "num_tokens": 472444353.0, + "step": 1148 + }, + { + "epoch": 1.1055045871559632, + "grad_norm": 0.28358846811134375, + "learning_rate": 6.600419521626281e-06, + "loss": 0.3367, + "num_tokens": 472845224.0, + "step": 1149 + }, + { + "epoch": 1.105886850152905, + "grad_norm": 0.25962161365557807, + "learning_rate": 6.595015463061749e-06, + "loss": 0.3245, + "num_tokens": 473236299.0, + "step": 1150 + }, + { + "epoch": 1.106269113149847, + "grad_norm": 0.32720067855698187, + "learning_rate": 6.58960972538776e-06, + "loss": 0.3565, + "num_tokens": 473624719.0, + "step": 1151 + }, + { + "epoch": 1.106651376146789, + "grad_norm": 0.2735487413450729, + "learning_rate": 6.584202316893537e-06, + "loss": 0.3302, + "num_tokens": 474053855.0, + "step": 1152 + }, + { + "epoch": 1.107033639143731, + "grad_norm": 0.28172345158536743, + "learning_rate": 6.5787932458708595e-06, + "loss": 0.3136, + "num_tokens": 474442162.0, + "step": 1153 + }, + { + "epoch": 1.1074159021406729, + "grad_norm": 0.25743930040201973, + "learning_rate": 6.573382520614065e-06, + "loss": 0.3154, + "num_tokens": 474843797.0, + "step": 1154 + }, + { + "epoch": 1.1077981651376148, + "grad_norm": 0.28732220121762747, + "learning_rate": 6.567970149420018e-06, + "loss": 0.3266, + "num_tokens": 475268581.0, + "step": 1155 + }, + { + "epoch": 1.1081804281345566, + "grad_norm": 0.3337674907751543, + "learning_rate": 6.562556140588113e-06, + "loss": 0.3551, + "num_tokens": 475697773.0, + "step": 1156 + }, + { + "epoch": 1.1085626911314985, + "grad_norm": 0.2569855269694447, + "learning_rate": 6.5571405024202554e-06, + "loss": 0.3449, + "num_tokens": 476100449.0, + "step": 1157 + }, + { + "epoch": 1.1089449541284404, + "grad_norm": 0.26382197825690373, + "learning_rate": 6.551723243220847e-06, + "loss": 0.3236, + "num_tokens": 476502944.0, + "step": 1158 + }, + { + "epoch": 1.1093272171253823, + "grad_norm": 0.2715214893643109, + "learning_rate": 6.546304371296775e-06, + "loss": 0.3397, + "num_tokens": 476883054.0, + "step": 1159 + }, + { + "epoch": 1.1097094801223242, + "grad_norm": 0.33546685330152093, + "learning_rate": 6.540883894957403e-06, + "loss": 0.3565, + "num_tokens": 477316845.0, + "step": 1160 + }, + { + "epoch": 1.110091743119266, + "grad_norm": 0.28206080477550555, + "learning_rate": 6.535461822514551e-06, + "loss": 0.3346, + "num_tokens": 477760415.0, + "step": 1161 + }, + { + "epoch": 1.110474006116208, + "grad_norm": 0.2862144693541876, + "learning_rate": 6.530038162282488e-06, + "loss": 0.3123, + "num_tokens": 478156950.0, + "step": 1162 + }, + { + "epoch": 1.1108562691131498, + "grad_norm": 0.2679955889780286, + "learning_rate": 6.524612922577917e-06, + "loss": 0.3231, + "num_tokens": 478538502.0, + "step": 1163 + }, + { + "epoch": 1.1112385321100917, + "grad_norm": 0.3003856727392727, + "learning_rate": 6.519186111719967e-06, + "loss": 0.35, + "num_tokens": 478967489.0, + "step": 1164 + }, + { + "epoch": 1.1116207951070336, + "grad_norm": 0.3115099084544135, + "learning_rate": 6.51375773803017e-06, + "loss": 0.3347, + "num_tokens": 479375795.0, + "step": 1165 + }, + { + "epoch": 1.1120030581039755, + "grad_norm": 0.295069804868219, + "learning_rate": 6.508327809832457e-06, + "loss": 0.3362, + "num_tokens": 479749512.0, + "step": 1166 + }, + { + "epoch": 1.1123853211009174, + "grad_norm": 0.290660739471113, + "learning_rate": 6.502896335453144e-06, + "loss": 0.3464, + "num_tokens": 480139338.0, + "step": 1167 + }, + { + "epoch": 1.1127675840978593, + "grad_norm": 0.29644139281228193, + "learning_rate": 6.497463323220917e-06, + "loss": 0.3165, + "num_tokens": 480531066.0, + "step": 1168 + }, + { + "epoch": 1.1131498470948011, + "grad_norm": 0.3002763380856821, + "learning_rate": 6.492028781466822e-06, + "loss": 0.3184, + "num_tokens": 480963162.0, + "step": 1169 + }, + { + "epoch": 1.113532110091743, + "grad_norm": 0.26656736267127934, + "learning_rate": 6.486592718524245e-06, + "loss": 0.3289, + "num_tokens": 481389069.0, + "step": 1170 + }, + { + "epoch": 1.1139143730886851, + "grad_norm": 0.2846814708224225, + "learning_rate": 6.48115514272891e-06, + "loss": 0.3193, + "num_tokens": 481738025.0, + "step": 1171 + }, + { + "epoch": 1.114296636085627, + "grad_norm": 0.2755750639720911, + "learning_rate": 6.475716062418861e-06, + "loss": 0.3548, + "num_tokens": 482176058.0, + "step": 1172 + }, + { + "epoch": 1.114678899082569, + "grad_norm": 0.2854278914842033, + "learning_rate": 6.470275485934443e-06, + "loss": 0.3165, + "num_tokens": 482559384.0, + "step": 1173 + }, + { + "epoch": 1.1150611620795108, + "grad_norm": 0.30221699996712104, + "learning_rate": 6.464833421618303e-06, + "loss": 0.3201, + "num_tokens": 482947421.0, + "step": 1174 + }, + { + "epoch": 1.1154434250764527, + "grad_norm": 0.28618496860417286, + "learning_rate": 6.459389877815364e-06, + "loss": 0.3382, + "num_tokens": 483338931.0, + "step": 1175 + }, + { + "epoch": 1.1158256880733946, + "grad_norm": 0.23906740324076617, + "learning_rate": 6.4539448628728205e-06, + "loss": 0.3151, + "num_tokens": 483777927.0, + "step": 1176 + }, + { + "epoch": 1.1162079510703364, + "grad_norm": 0.2905278554547755, + "learning_rate": 6.448498385140119e-06, + "loss": 0.326, + "num_tokens": 484189781.0, + "step": 1177 + }, + { + "epoch": 1.1165902140672783, + "grad_norm": 0.29751545395143897, + "learning_rate": 6.443050452968955e-06, + "loss": 0.3181, + "num_tokens": 484576775.0, + "step": 1178 + }, + { + "epoch": 1.1169724770642202, + "grad_norm": 0.3107554427590532, + "learning_rate": 6.437601074713249e-06, + "loss": 0.3248, + "num_tokens": 484967203.0, + "step": 1179 + }, + { + "epoch": 1.117354740061162, + "grad_norm": 0.29491794915446384, + "learning_rate": 6.432150258729142e-06, + "loss": 0.3437, + "num_tokens": 485360750.0, + "step": 1180 + }, + { + "epoch": 1.117737003058104, + "grad_norm": 0.28962619219467095, + "learning_rate": 6.426698013374979e-06, + "loss": 0.3209, + "num_tokens": 485786533.0, + "step": 1181 + }, + { + "epoch": 1.1181192660550459, + "grad_norm": 0.30000894300982334, + "learning_rate": 6.4212443470112965e-06, + "loss": 0.3387, + "num_tokens": 486221902.0, + "step": 1182 + }, + { + "epoch": 1.1185015290519877, + "grad_norm": 0.30447255271836693, + "learning_rate": 6.415789268000809e-06, + "loss": 0.3218, + "num_tokens": 486601322.0, + "step": 1183 + }, + { + "epoch": 1.1188837920489296, + "grad_norm": 0.3352709408941912, + "learning_rate": 6.4103327847084e-06, + "loss": 0.3266, + "num_tokens": 486991201.0, + "step": 1184 + }, + { + "epoch": 1.1192660550458715, + "grad_norm": 0.2787395935714357, + "learning_rate": 6.404874905501103e-06, + "loss": 0.3323, + "num_tokens": 487399086.0, + "step": 1185 + }, + { + "epoch": 1.1196483180428134, + "grad_norm": 0.29387062799654673, + "learning_rate": 6.399415638748093e-06, + "loss": 0.3286, + "num_tokens": 487843089.0, + "step": 1186 + }, + { + "epoch": 1.1200305810397553, + "grad_norm": 0.2965124138924178, + "learning_rate": 6.393954992820674e-06, + "loss": 0.32, + "num_tokens": 488220593.0, + "step": 1187 + }, + { + "epoch": 1.1204128440366972, + "grad_norm": 0.26581801867197025, + "learning_rate": 6.388492976092262e-06, + "loss": 0.327, + "num_tokens": 488599815.0, + "step": 1188 + }, + { + "epoch": 1.120795107033639, + "grad_norm": 0.28016071705077283, + "learning_rate": 6.383029596938381e-06, + "loss": 0.3257, + "num_tokens": 488998034.0, + "step": 1189 + }, + { + "epoch": 1.1211773700305812, + "grad_norm": 0.3062308700634103, + "learning_rate": 6.377564863736638e-06, + "loss": 0.362, + "num_tokens": 489448208.0, + "step": 1190 + }, + { + "epoch": 1.121559633027523, + "grad_norm": 0.33560211872366813, + "learning_rate": 6.372098784866719e-06, + "loss": 0.356, + "num_tokens": 489910540.0, + "step": 1191 + }, + { + "epoch": 1.121941896024465, + "grad_norm": 0.3043991400671112, + "learning_rate": 6.366631368710372e-06, + "loss": 0.3126, + "num_tokens": 490327252.0, + "step": 1192 + }, + { + "epoch": 1.1223241590214068, + "grad_norm": 0.26848870857564255, + "learning_rate": 6.361162623651398e-06, + "loss": 0.3161, + "num_tokens": 490713514.0, + "step": 1193 + }, + { + "epoch": 1.1227064220183487, + "grad_norm": 0.332462798612261, + "learning_rate": 6.355692558075633e-06, + "loss": 0.3465, + "num_tokens": 491081879.0, + "step": 1194 + }, + { + "epoch": 1.1230886850152906, + "grad_norm": 0.4022524205956525, + "learning_rate": 6.35022118037094e-06, + "loss": 0.3558, + "num_tokens": 491487306.0, + "step": 1195 + }, + { + "epoch": 1.1234709480122325, + "grad_norm": 0.34500842554827216, + "learning_rate": 6.344748498927193e-06, + "loss": 0.3291, + "num_tokens": 491908450.0, + "step": 1196 + }, + { + "epoch": 1.1238532110091743, + "grad_norm": 0.25902689651334515, + "learning_rate": 6.339274522136264e-06, + "loss": 0.3015, + "num_tokens": 492278381.0, + "step": 1197 + }, + { + "epoch": 1.1242354740061162, + "grad_norm": 0.3104938393351937, + "learning_rate": 6.333799258392015e-06, + "loss": 0.3641, + "num_tokens": 492727969.0, + "step": 1198 + }, + { + "epoch": 1.1246177370030581, + "grad_norm": 0.3525594222887963, + "learning_rate": 6.328322716090279e-06, + "loss": 0.3351, + "num_tokens": 493183862.0, + "step": 1199 + }, + { + "epoch": 1.125, + "grad_norm": 0.3017488463769553, + "learning_rate": 6.322844903628849e-06, + "loss": 0.3506, + "num_tokens": 493640410.0, + "step": 1200 + }, + { + "epoch": 1.1253822629969419, + "grad_norm": 0.3107033247060044, + "learning_rate": 6.317365829407465e-06, + "loss": 0.3414, + "num_tokens": 494034007.0, + "step": 1201 + }, + { + "epoch": 1.1257645259938838, + "grad_norm": 0.2751720787164906, + "learning_rate": 6.311885501827805e-06, + "loss": 0.332, + "num_tokens": 494448450.0, + "step": 1202 + }, + { + "epoch": 1.1261467889908257, + "grad_norm": 0.29810723432497416, + "learning_rate": 6.306403929293466e-06, + "loss": 0.3308, + "num_tokens": 494870141.0, + "step": 1203 + }, + { + "epoch": 1.1265290519877675, + "grad_norm": 0.3299972544888789, + "learning_rate": 6.300921120209956e-06, + "loss": 0.335, + "num_tokens": 495259276.0, + "step": 1204 + }, + { + "epoch": 1.1269113149847094, + "grad_norm": 0.27606262733857034, + "learning_rate": 6.29543708298468e-06, + "loss": 0.3314, + "num_tokens": 495699948.0, + "step": 1205 + }, + { + "epoch": 1.1272935779816513, + "grad_norm": 0.2941016109721722, + "learning_rate": 6.289951826026921e-06, + "loss": 0.3156, + "num_tokens": 496110554.0, + "step": 1206 + }, + { + "epoch": 1.1276758409785932, + "grad_norm": 0.26876778526539413, + "learning_rate": 6.284465357747839e-06, + "loss": 0.3151, + "num_tokens": 496521050.0, + "step": 1207 + }, + { + "epoch": 1.128058103975535, + "grad_norm": 0.24321100529660128, + "learning_rate": 6.278977686560445e-06, + "loss": 0.3236, + "num_tokens": 496922784.0, + "step": 1208 + }, + { + "epoch": 1.1284403669724772, + "grad_norm": 0.31176559575102497, + "learning_rate": 6.2734888208796e-06, + "loss": 0.3337, + "num_tokens": 497352315.0, + "step": 1209 + }, + { + "epoch": 1.1288226299694188, + "grad_norm": 0.2984231143503115, + "learning_rate": 6.267998769121995e-06, + "loss": 0.3397, + "num_tokens": 497766706.0, + "step": 1210 + }, + { + "epoch": 1.129204892966361, + "grad_norm": 0.2700500572485866, + "learning_rate": 6.262507539706138e-06, + "loss": 0.3214, + "num_tokens": 498187679.0, + "step": 1211 + }, + { + "epoch": 1.1295871559633028, + "grad_norm": 0.2675612219059672, + "learning_rate": 6.2570151410523426e-06, + "loss": 0.3371, + "num_tokens": 498594599.0, + "step": 1212 + }, + { + "epoch": 1.1299694189602447, + "grad_norm": 0.2722378509521892, + "learning_rate": 6.251521581582721e-06, + "loss": 0.3551, + "num_tokens": 499022040.0, + "step": 1213 + }, + { + "epoch": 1.1303516819571866, + "grad_norm": 0.2838951082607539, + "learning_rate": 6.246026869721159e-06, + "loss": 0.3361, + "num_tokens": 499406498.0, + "step": 1214 + }, + { + "epoch": 1.1307339449541285, + "grad_norm": 0.2884680929780221, + "learning_rate": 6.240531013893311e-06, + "loss": 0.3484, + "num_tokens": 499850767.0, + "step": 1215 + }, + { + "epoch": 1.1311162079510704, + "grad_norm": 0.28582235021244295, + "learning_rate": 6.235034022526587e-06, + "loss": 0.3188, + "num_tokens": 500277474.0, + "step": 1216 + }, + { + "epoch": 1.1314984709480123, + "grad_norm": 0.2679533425163113, + "learning_rate": 6.229535904050137e-06, + "loss": 0.3196, + "num_tokens": 500707867.0, + "step": 1217 + }, + { + "epoch": 1.1318807339449541, + "grad_norm": 0.26615841636781934, + "learning_rate": 6.22403666689484e-06, + "loss": 0.3412, + "num_tokens": 501119645.0, + "step": 1218 + }, + { + "epoch": 1.132262996941896, + "grad_norm": 0.2593678841037097, + "learning_rate": 6.2185363194932925e-06, + "loss": 0.3187, + "num_tokens": 501489226.0, + "step": 1219 + }, + { + "epoch": 1.132645259938838, + "grad_norm": 0.2682673518599545, + "learning_rate": 6.213034870279789e-06, + "loss": 0.3175, + "num_tokens": 501847939.0, + "step": 1220 + }, + { + "epoch": 1.1330275229357798, + "grad_norm": 0.2948442974358067, + "learning_rate": 6.207532327690314e-06, + "loss": 0.3371, + "num_tokens": 502244759.0, + "step": 1221 + }, + { + "epoch": 1.1334097859327217, + "grad_norm": 0.30076076578492916, + "learning_rate": 6.202028700162534e-06, + "loss": 0.3425, + "num_tokens": 502644864.0, + "step": 1222 + }, + { + "epoch": 1.1337920489296636, + "grad_norm": 0.28180246181164414, + "learning_rate": 6.196523996135774e-06, + "loss": 0.3324, + "num_tokens": 503056854.0, + "step": 1223 + }, + { + "epoch": 1.1341743119266054, + "grad_norm": 0.22229708505087292, + "learning_rate": 6.191018224051011e-06, + "loss": 0.3241, + "num_tokens": 503505198.0, + "step": 1224 + }, + { + "epoch": 1.1345565749235473, + "grad_norm": 0.292999787898749, + "learning_rate": 6.185511392350861e-06, + "loss": 0.3322, + "num_tokens": 503902845.0, + "step": 1225 + }, + { + "epoch": 1.1349388379204892, + "grad_norm": 0.2928001850570755, + "learning_rate": 6.180003509479563e-06, + "loss": 0.333, + "num_tokens": 504295122.0, + "step": 1226 + }, + { + "epoch": 1.135321100917431, + "grad_norm": 0.29828780462984006, + "learning_rate": 6.174494583882969e-06, + "loss": 0.3583, + "num_tokens": 504762521.0, + "step": 1227 + }, + { + "epoch": 1.1357033639143732, + "grad_norm": 0.3019800751996685, + "learning_rate": 6.168984624008527e-06, + "loss": 0.3319, + "num_tokens": 505166097.0, + "step": 1228 + }, + { + "epoch": 1.1360856269113149, + "grad_norm": 0.28631076928255944, + "learning_rate": 6.163473638305278e-06, + "loss": 0.3373, + "num_tokens": 505614429.0, + "step": 1229 + }, + { + "epoch": 1.136467889908257, + "grad_norm": 0.28527484407587345, + "learning_rate": 6.157961635223829e-06, + "loss": 0.3202, + "num_tokens": 505998145.0, + "step": 1230 + }, + { + "epoch": 1.1368501529051989, + "grad_norm": 0.29458881015369165, + "learning_rate": 6.152448623216351e-06, + "loss": 0.3357, + "num_tokens": 506403686.0, + "step": 1231 + }, + { + "epoch": 1.1372324159021407, + "grad_norm": 0.31226283170569014, + "learning_rate": 6.146934610736559e-06, + "loss": 0.34, + "num_tokens": 506770460.0, + "step": 1232 + }, + { + "epoch": 1.1376146788990826, + "grad_norm": 0.28916373891386943, + "learning_rate": 6.141419606239706e-06, + "loss": 0.3238, + "num_tokens": 507173256.0, + "step": 1233 + }, + { + "epoch": 1.1379969418960245, + "grad_norm": 0.30002659299961043, + "learning_rate": 6.135903618182563e-06, + "loss": 0.3467, + "num_tokens": 507592252.0, + "step": 1234 + }, + { + "epoch": 1.1383792048929664, + "grad_norm": 0.2823773806154856, + "learning_rate": 6.1303866550234105e-06, + "loss": 0.3344, + "num_tokens": 507972759.0, + "step": 1235 + }, + { + "epoch": 1.1387614678899083, + "grad_norm": 0.29375630237253025, + "learning_rate": 6.124868725222022e-06, + "loss": 0.3234, + "num_tokens": 508341201.0, + "step": 1236 + }, + { + "epoch": 1.1391437308868502, + "grad_norm": 0.24724330360980634, + "learning_rate": 6.11934983723966e-06, + "loss": 0.3061, + "num_tokens": 508724695.0, + "step": 1237 + }, + { + "epoch": 1.139525993883792, + "grad_norm": 0.322882724078703, + "learning_rate": 6.1138299995390474e-06, + "loss": 0.3306, + "num_tokens": 509164940.0, + "step": 1238 + }, + { + "epoch": 1.139908256880734, + "grad_norm": 0.31655016049580065, + "learning_rate": 6.108309220584368e-06, + "loss": 0.335, + "num_tokens": 509544798.0, + "step": 1239 + }, + { + "epoch": 1.1402905198776758, + "grad_norm": 0.2865019074970294, + "learning_rate": 6.102787508841249e-06, + "loss": 0.3489, + "num_tokens": 509945630.0, + "step": 1240 + }, + { + "epoch": 1.1406727828746177, + "grad_norm": 0.2702364920607733, + "learning_rate": 6.097264872776749e-06, + "loss": 0.3334, + "num_tokens": 510384844.0, + "step": 1241 + }, + { + "epoch": 1.1410550458715596, + "grad_norm": 0.3010056332952182, + "learning_rate": 6.091741320859342e-06, + "loss": 0.3412, + "num_tokens": 510793400.0, + "step": 1242 + }, + { + "epoch": 1.1414373088685015, + "grad_norm": 0.2926277386916459, + "learning_rate": 6.086216861558906e-06, + "loss": 0.3199, + "num_tokens": 511184572.0, + "step": 1243 + }, + { + "epoch": 1.1418195718654434, + "grad_norm": 0.2909789761519127, + "learning_rate": 6.0806915033467095e-06, + "loss": 0.3373, + "num_tokens": 511582034.0, + "step": 1244 + }, + { + "epoch": 1.1422018348623852, + "grad_norm": 0.27999556858106905, + "learning_rate": 6.075165254695404e-06, + "loss": 0.3191, + "num_tokens": 511954026.0, + "step": 1245 + }, + { + "epoch": 1.1425840978593271, + "grad_norm": 0.2721584403384464, + "learning_rate": 6.069638124079004e-06, + "loss": 0.3507, + "num_tokens": 512403908.0, + "step": 1246 + }, + { + "epoch": 1.1429663608562692, + "grad_norm": 0.27835700256202, + "learning_rate": 6.0641101199728725e-06, + "loss": 0.3625, + "num_tokens": 512840328.0, + "step": 1247 + }, + { + "epoch": 1.143348623853211, + "grad_norm": 0.29029193418958227, + "learning_rate": 6.058581250853718e-06, + "loss": 0.3496, + "num_tokens": 513255198.0, + "step": 1248 + }, + { + "epoch": 1.143730886850153, + "grad_norm": 0.27777757438066064, + "learning_rate": 6.05305152519957e-06, + "loss": 0.3256, + "num_tokens": 513666037.0, + "step": 1249 + }, + { + "epoch": 1.144113149847095, + "grad_norm": 0.27010859494181105, + "learning_rate": 6.047520951489777e-06, + "loss": 0.3149, + "num_tokens": 514049229.0, + "step": 1250 + }, + { + "epoch": 1.1444954128440368, + "grad_norm": 0.27674372396782687, + "learning_rate": 6.041989538204985e-06, + "loss": 0.3217, + "num_tokens": 514442087.0, + "step": 1251 + }, + { + "epoch": 1.1448776758409787, + "grad_norm": 0.2628172888278004, + "learning_rate": 6.036457293827127e-06, + "loss": 0.3405, + "num_tokens": 514879611.0, + "step": 1252 + }, + { + "epoch": 1.1452599388379205, + "grad_norm": 0.2619591647094363, + "learning_rate": 6.030924226839409e-06, + "loss": 0.3402, + "num_tokens": 515283517.0, + "step": 1253 + }, + { + "epoch": 1.1456422018348624, + "grad_norm": 0.2818768513418224, + "learning_rate": 6.025390345726303e-06, + "loss": 0.3393, + "num_tokens": 515685600.0, + "step": 1254 + }, + { + "epoch": 1.1460244648318043, + "grad_norm": 0.2678219454813258, + "learning_rate": 6.019855658973526e-06, + "loss": 0.3203, + "num_tokens": 516129302.0, + "step": 1255 + }, + { + "epoch": 1.1464067278287462, + "grad_norm": 0.2797872167750161, + "learning_rate": 6.014320175068029e-06, + "loss": 0.3244, + "num_tokens": 516508852.0, + "step": 1256 + }, + { + "epoch": 1.146788990825688, + "grad_norm": 0.2696525313413257, + "learning_rate": 6.008783902497991e-06, + "loss": 0.3351, + "num_tokens": 516958342.0, + "step": 1257 + }, + { + "epoch": 1.14717125382263, + "grad_norm": 0.27308702191417084, + "learning_rate": 6.003246849752795e-06, + "loss": 0.3182, + "num_tokens": 517408834.0, + "step": 1258 + }, + { + "epoch": 1.1475535168195719, + "grad_norm": 0.26908608253097527, + "learning_rate": 5.997709025323022e-06, + "loss": 0.3483, + "num_tokens": 517830615.0, + "step": 1259 + }, + { + "epoch": 1.1479357798165137, + "grad_norm": 0.29413778426744663, + "learning_rate": 5.992170437700436e-06, + "loss": 0.3225, + "num_tokens": 518213873.0, + "step": 1260 + }, + { + "epoch": 1.1483180428134556, + "grad_norm": 0.29358445603734934, + "learning_rate": 5.986631095377973e-06, + "loss": 0.3582, + "num_tokens": 518653456.0, + "step": 1261 + }, + { + "epoch": 1.1487003058103975, + "grad_norm": 0.24719658419942464, + "learning_rate": 5.981091006849723e-06, + "loss": 0.3111, + "num_tokens": 519036638.0, + "step": 1262 + }, + { + "epoch": 1.1490825688073394, + "grad_norm": 0.26242070194244943, + "learning_rate": 5.975550180610924e-06, + "loss": 0.3304, + "num_tokens": 519450315.0, + "step": 1263 + }, + { + "epoch": 1.1494648318042813, + "grad_norm": 0.26895030510807805, + "learning_rate": 5.970008625157943e-06, + "loss": 0.3321, + "num_tokens": 519886506.0, + "step": 1264 + }, + { + "epoch": 1.1498470948012232, + "grad_norm": 0.3345310359045355, + "learning_rate": 5.964466348988265e-06, + "loss": 0.3603, + "num_tokens": 520266247.0, + "step": 1265 + }, + { + "epoch": 1.150229357798165, + "grad_norm": 0.29990485369656844, + "learning_rate": 5.958923360600483e-06, + "loss": 0.3399, + "num_tokens": 520700245.0, + "step": 1266 + }, + { + "epoch": 1.150611620795107, + "grad_norm": 0.2832255897139781, + "learning_rate": 5.953379668494277e-06, + "loss": 0.3622, + "num_tokens": 521107093.0, + "step": 1267 + }, + { + "epoch": 1.150993883792049, + "grad_norm": 0.2698577207990674, + "learning_rate": 5.947835281170411e-06, + "loss": 0.3415, + "num_tokens": 521514124.0, + "step": 1268 + }, + { + "epoch": 1.151376146788991, + "grad_norm": 0.3080503410584079, + "learning_rate": 5.942290207130711e-06, + "loss": 0.3338, + "num_tokens": 521898134.0, + "step": 1269 + }, + { + "epoch": 1.1517584097859328, + "grad_norm": 0.2805786961709091, + "learning_rate": 5.9367444548780606e-06, + "loss": 0.3231, + "num_tokens": 522324838.0, + "step": 1270 + }, + { + "epoch": 1.1521406727828747, + "grad_norm": 0.2744235940281009, + "learning_rate": 5.931198032916378e-06, + "loss": 0.3472, + "num_tokens": 522776919.0, + "step": 1271 + }, + { + "epoch": 1.1525229357798166, + "grad_norm": 0.29146604746825633, + "learning_rate": 5.925650949750614e-06, + "loss": 0.3585, + "num_tokens": 523162888.0, + "step": 1272 + }, + { + "epoch": 1.1529051987767585, + "grad_norm": 0.2822742713670998, + "learning_rate": 5.920103213886731e-06, + "loss": 0.3493, + "num_tokens": 523603382.0, + "step": 1273 + }, + { + "epoch": 1.1532874617737003, + "grad_norm": 0.2785104838460252, + "learning_rate": 5.914554833831688e-06, + "loss": 0.3607, + "num_tokens": 524053934.0, + "step": 1274 + }, + { + "epoch": 1.1536697247706422, + "grad_norm": 0.24806833548658463, + "learning_rate": 5.909005818093438e-06, + "loss": 0.316, + "num_tokens": 524474054.0, + "step": 1275 + }, + { + "epoch": 1.154051987767584, + "grad_norm": 0.2804731172270925, + "learning_rate": 5.903456175180906e-06, + "loss": 0.3604, + "num_tokens": 524914455.0, + "step": 1276 + }, + { + "epoch": 1.154434250764526, + "grad_norm": 0.3070257737490006, + "learning_rate": 5.897905913603981e-06, + "loss": 0.361, + "num_tokens": 525326957.0, + "step": 1277 + }, + { + "epoch": 1.1548165137614679, + "grad_norm": 0.31371418974737814, + "learning_rate": 5.892355041873495e-06, + "loss": 0.3315, + "num_tokens": 525709293.0, + "step": 1278 + }, + { + "epoch": 1.1551987767584098, + "grad_norm": 0.2716223655433006, + "learning_rate": 5.88680356850122e-06, + "loss": 0.3427, + "num_tokens": 526098562.0, + "step": 1279 + }, + { + "epoch": 1.1555810397553516, + "grad_norm": 0.3030769375277236, + "learning_rate": 5.881251501999852e-06, + "loss": 0.3442, + "num_tokens": 526499312.0, + "step": 1280 + }, + { + "epoch": 1.1559633027522935, + "grad_norm": 0.2963042171712443, + "learning_rate": 5.875698850882994e-06, + "loss": 0.3397, + "num_tokens": 526923109.0, + "step": 1281 + }, + { + "epoch": 1.1563455657492354, + "grad_norm": 0.3071982430267412, + "learning_rate": 5.870145623665144e-06, + "loss": 0.345, + "num_tokens": 527335761.0, + "step": 1282 + }, + { + "epoch": 1.1567278287461773, + "grad_norm": 0.29866114157295515, + "learning_rate": 5.864591828861687e-06, + "loss": 0.3201, + "num_tokens": 527734981.0, + "step": 1283 + }, + { + "epoch": 1.1571100917431192, + "grad_norm": 0.26266095162245895, + "learning_rate": 5.859037474988875e-06, + "loss": 0.3437, + "num_tokens": 528180897.0, + "step": 1284 + }, + { + "epoch": 1.157492354740061, + "grad_norm": 0.25819020802003484, + "learning_rate": 5.85348257056382e-06, + "loss": 0.3126, + "num_tokens": 528550308.0, + "step": 1285 + }, + { + "epoch": 1.157874617737003, + "grad_norm": 0.3043815202372134, + "learning_rate": 5.8479271241044765e-06, + "loss": 0.3241, + "num_tokens": 528956031.0, + "step": 1286 + }, + { + "epoch": 1.158256880733945, + "grad_norm": 0.27472679986267445, + "learning_rate": 5.842371144129635e-06, + "loss": 0.3278, + "num_tokens": 529366228.0, + "step": 1287 + }, + { + "epoch": 1.158639143730887, + "grad_norm": 0.2837054028386318, + "learning_rate": 5.836814639158892e-06, + "loss": 0.3566, + "num_tokens": 529787450.0, + "step": 1288 + }, + { + "epoch": 1.1590214067278288, + "grad_norm": 0.2589050147121643, + "learning_rate": 5.831257617712663e-06, + "loss": 0.3357, + "num_tokens": 530216531.0, + "step": 1289 + }, + { + "epoch": 1.1594036697247707, + "grad_norm": 0.2827656661346502, + "learning_rate": 5.825700088312146e-06, + "loss": 0.3378, + "num_tokens": 530672941.0, + "step": 1290 + }, + { + "epoch": 1.1597859327217126, + "grad_norm": 0.28634448435235477, + "learning_rate": 5.820142059479325e-06, + "loss": 0.3427, + "num_tokens": 531103045.0, + "step": 1291 + }, + { + "epoch": 1.1601681957186545, + "grad_norm": 0.286830346220727, + "learning_rate": 5.814583539736941e-06, + "loss": 0.3378, + "num_tokens": 531487162.0, + "step": 1292 + }, + { + "epoch": 1.1605504587155964, + "grad_norm": 0.29849931105834676, + "learning_rate": 5.809024537608497e-06, + "loss": 0.3347, + "num_tokens": 531894849.0, + "step": 1293 + }, + { + "epoch": 1.1609327217125383, + "grad_norm": 0.28084740509433626, + "learning_rate": 5.80346506161823e-06, + "loss": 0.3578, + "num_tokens": 532299329.0, + "step": 1294 + }, + { + "epoch": 1.1613149847094801, + "grad_norm": 0.23045410370720337, + "learning_rate": 5.797905120291105e-06, + "loss": 0.3191, + "num_tokens": 532706908.0, + "step": 1295 + }, + { + "epoch": 1.161697247706422, + "grad_norm": 0.2773975644863712, + "learning_rate": 5.792344722152802e-06, + "loss": 0.3337, + "num_tokens": 533112769.0, + "step": 1296 + }, + { + "epoch": 1.162079510703364, + "grad_norm": 0.28383362137923096, + "learning_rate": 5.786783875729698e-06, + "loss": 0.3344, + "num_tokens": 533540197.0, + "step": 1297 + }, + { + "epoch": 1.1624617737003058, + "grad_norm": 0.30083915244172127, + "learning_rate": 5.7812225895488624e-06, + "loss": 0.3467, + "num_tokens": 533919313.0, + "step": 1298 + }, + { + "epoch": 1.1628440366972477, + "grad_norm": 0.24300447493260083, + "learning_rate": 5.775660872138035e-06, + "loss": 0.3186, + "num_tokens": 534309085.0, + "step": 1299 + }, + { + "epoch": 1.1632262996941896, + "grad_norm": 0.28124353650839806, + "learning_rate": 5.770098732025616e-06, + "loss": 0.3309, + "num_tokens": 534736114.0, + "step": 1300 + }, + { + "epoch": 1.1636085626911314, + "grad_norm": 0.34338160046999916, + "learning_rate": 5.764536177740658e-06, + "loss": 0.3251, + "num_tokens": 535154091.0, + "step": 1301 + }, + { + "epoch": 1.1639908256880733, + "grad_norm": 0.2777355796087579, + "learning_rate": 5.758973217812847e-06, + "loss": 0.3559, + "num_tokens": 535548923.0, + "step": 1302 + }, + { + "epoch": 1.1643730886850152, + "grad_norm": 0.2517192897385153, + "learning_rate": 5.7534098607724886e-06, + "loss": 0.3415, + "num_tokens": 535984687.0, + "step": 1303 + }, + { + "epoch": 1.164755351681957, + "grad_norm": 0.27387034825726725, + "learning_rate": 5.747846115150501e-06, + "loss": 0.3309, + "num_tokens": 536380850.0, + "step": 1304 + }, + { + "epoch": 1.165137614678899, + "grad_norm": 0.2796362784976631, + "learning_rate": 5.742281989478396e-06, + "loss": 0.3072, + "num_tokens": 536787401.0, + "step": 1305 + }, + { + "epoch": 1.165519877675841, + "grad_norm": 0.24184125498369388, + "learning_rate": 5.736717492288265e-06, + "loss": 0.3335, + "num_tokens": 537218169.0, + "step": 1306 + }, + { + "epoch": 1.165902140672783, + "grad_norm": 0.25232621546366074, + "learning_rate": 5.731152632112779e-06, + "loss": 0.3355, + "num_tokens": 537636666.0, + "step": 1307 + }, + { + "epoch": 1.1662844036697249, + "grad_norm": 0.29465033041218436, + "learning_rate": 5.725587417485157e-06, + "loss": 0.3304, + "num_tokens": 538013901.0, + "step": 1308 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.309041606773954, + "learning_rate": 5.720021856939162e-06, + "loss": 0.3605, + "num_tokens": 538432685.0, + "step": 1309 + }, + { + "epoch": 1.1670489296636086, + "grad_norm": 0.314272404698483, + "learning_rate": 5.714455959009091e-06, + "loss": 0.3324, + "num_tokens": 538825811.0, + "step": 1310 + }, + { + "epoch": 1.1674311926605505, + "grad_norm": 0.2614159710867707, + "learning_rate": 5.708889732229756e-06, + "loss": 0.3409, + "num_tokens": 539243648.0, + "step": 1311 + }, + { + "epoch": 1.1678134556574924, + "grad_norm": 0.2443024470663165, + "learning_rate": 5.7033231851364755e-06, + "loss": 0.3634, + "num_tokens": 539693841.0, + "step": 1312 + }, + { + "epoch": 1.1681957186544343, + "grad_norm": 0.26154104551841184, + "learning_rate": 5.6977563262650545e-06, + "loss": 0.3154, + "num_tokens": 540060479.0, + "step": 1313 + }, + { + "epoch": 1.1685779816513762, + "grad_norm": 0.2529838628475418, + "learning_rate": 5.692189164151783e-06, + "loss": 0.3113, + "num_tokens": 540452979.0, + "step": 1314 + }, + { + "epoch": 1.168960244648318, + "grad_norm": 0.25669393135790824, + "learning_rate": 5.686621707333407e-06, + "loss": 0.3367, + "num_tokens": 540887841.0, + "step": 1315 + }, + { + "epoch": 1.16934250764526, + "grad_norm": 0.2812699969856882, + "learning_rate": 5.681053964347136e-06, + "loss": 0.3303, + "num_tokens": 541331033.0, + "step": 1316 + }, + { + "epoch": 1.1697247706422018, + "grad_norm": 0.2535974149901525, + "learning_rate": 5.675485943730606e-06, + "loss": 0.3609, + "num_tokens": 541777222.0, + "step": 1317 + }, + { + "epoch": 1.1701070336391437, + "grad_norm": 0.25840631807611825, + "learning_rate": 5.669917654021891e-06, + "loss": 0.327, + "num_tokens": 542222005.0, + "step": 1318 + }, + { + "epoch": 1.1704892966360856, + "grad_norm": 0.2697724432843473, + "learning_rate": 5.664349103759467e-06, + "loss": 0.3489, + "num_tokens": 542631901.0, + "step": 1319 + }, + { + "epoch": 1.1708715596330275, + "grad_norm": 0.27133888701747094, + "learning_rate": 5.658780301482212e-06, + "loss": 0.3422, + "num_tokens": 543060829.0, + "step": 1320 + }, + { + "epoch": 1.1712538226299694, + "grad_norm": 0.25878320630111323, + "learning_rate": 5.653211255729396e-06, + "loss": 0.3551, + "num_tokens": 543517650.0, + "step": 1321 + }, + { + "epoch": 1.1716360856269112, + "grad_norm": 0.26819280364944253, + "learning_rate": 5.647641975040656e-06, + "loss": 0.3424, + "num_tokens": 543899754.0, + "step": 1322 + }, + { + "epoch": 1.1720183486238531, + "grad_norm": 0.2534303978781987, + "learning_rate": 5.6420724679559935e-06, + "loss": 0.3221, + "num_tokens": 544322959.0, + "step": 1323 + }, + { + "epoch": 1.172400611620795, + "grad_norm": 0.2639307873962133, + "learning_rate": 5.6365027430157544e-06, + "loss": 0.3526, + "num_tokens": 544716184.0, + "step": 1324 + }, + { + "epoch": 1.1727828746177371, + "grad_norm": 0.30988974363217614, + "learning_rate": 5.630932808760622e-06, + "loss": 0.3328, + "num_tokens": 545141369.0, + "step": 1325 + }, + { + "epoch": 1.1731651376146788, + "grad_norm": 0.2818917196130082, + "learning_rate": 5.625362673731597e-06, + "loss": 0.3465, + "num_tokens": 545538147.0, + "step": 1326 + }, + { + "epoch": 1.1735474006116209, + "grad_norm": 0.3410974850927972, + "learning_rate": 5.619792346469988e-06, + "loss": 0.3768, + "num_tokens": 545945638.0, + "step": 1327 + }, + { + "epoch": 1.1739296636085628, + "grad_norm": 0.27402305325533044, + "learning_rate": 5.614221835517401e-06, + "loss": 0.3435, + "num_tokens": 546371184.0, + "step": 1328 + }, + { + "epoch": 1.1743119266055047, + "grad_norm": 0.28880305451530414, + "learning_rate": 5.60865114941572e-06, + "loss": 0.3204, + "num_tokens": 546713971.0, + "step": 1329 + }, + { + "epoch": 1.1746941896024465, + "grad_norm": 0.3228827000346965, + "learning_rate": 5.603080296707104e-06, + "loss": 0.3582, + "num_tokens": 547161502.0, + "step": 1330 + }, + { + "epoch": 1.1750764525993884, + "grad_norm": 0.30142371376868743, + "learning_rate": 5.5975092859339604e-06, + "loss": 0.331, + "num_tokens": 547578265.0, + "step": 1331 + }, + { + "epoch": 1.1754587155963303, + "grad_norm": 0.26576757402878626, + "learning_rate": 5.591938125638941e-06, + "loss": 0.363, + "num_tokens": 548016613.0, + "step": 1332 + }, + { + "epoch": 1.1758409785932722, + "grad_norm": 0.24641365074347962, + "learning_rate": 5.586366824364933e-06, + "loss": 0.3676, + "num_tokens": 548479579.0, + "step": 1333 + }, + { + "epoch": 1.176223241590214, + "grad_norm": 0.25605426029897516, + "learning_rate": 5.5807953906550305e-06, + "loss": 0.3435, + "num_tokens": 548921996.0, + "step": 1334 + }, + { + "epoch": 1.176605504587156, + "grad_norm": 0.2728580609167911, + "learning_rate": 5.575223833052535e-06, + "loss": 0.3086, + "num_tokens": 549289550.0, + "step": 1335 + }, + { + "epoch": 1.1769877675840978, + "grad_norm": 0.256797464366231, + "learning_rate": 5.569652160100938e-06, + "loss": 0.335, + "num_tokens": 549693863.0, + "step": 1336 + }, + { + "epoch": 1.1773700305810397, + "grad_norm": 0.2865218483460503, + "learning_rate": 5.564080380343908e-06, + "loss": 0.3553, + "num_tokens": 550094835.0, + "step": 1337 + }, + { + "epoch": 1.1777522935779816, + "grad_norm": 0.2620523294805251, + "learning_rate": 5.5585085023252775e-06, + "loss": 0.3613, + "num_tokens": 550533990.0, + "step": 1338 + }, + { + "epoch": 1.1781345565749235, + "grad_norm": 0.2620009970726739, + "learning_rate": 5.552936534589029e-06, + "loss": 0.3087, + "num_tokens": 550859026.0, + "step": 1339 + }, + { + "epoch": 1.1785168195718654, + "grad_norm": 0.27004059994447555, + "learning_rate": 5.54736448567928e-06, + "loss": 0.3151, + "num_tokens": 551252088.0, + "step": 1340 + }, + { + "epoch": 1.1788990825688073, + "grad_norm": 0.25920432137446164, + "learning_rate": 5.5417923641402795e-06, + "loss": 0.3578, + "num_tokens": 551676918.0, + "step": 1341 + }, + { + "epoch": 1.1792813455657492, + "grad_norm": 0.273106799089088, + "learning_rate": 5.536220178516381e-06, + "loss": 0.3197, + "num_tokens": 552043959.0, + "step": 1342 + }, + { + "epoch": 1.179663608562691, + "grad_norm": 0.25208398023175504, + "learning_rate": 5.5306479373520385e-06, + "loss": 0.3636, + "num_tokens": 552499152.0, + "step": 1343 + }, + { + "epoch": 1.1800458715596331, + "grad_norm": 0.24943968287218565, + "learning_rate": 5.525075649191792e-06, + "loss": 0.3075, + "num_tokens": 552890112.0, + "step": 1344 + }, + { + "epoch": 1.1804281345565748, + "grad_norm": 0.26257584519004706, + "learning_rate": 5.519503322580253e-06, + "loss": 0.3195, + "num_tokens": 553310359.0, + "step": 1345 + }, + { + "epoch": 1.180810397553517, + "grad_norm": 0.3364761227589351, + "learning_rate": 5.513930966062093e-06, + "loss": 0.3452, + "num_tokens": 553747124.0, + "step": 1346 + }, + { + "epoch": 1.1811926605504588, + "grad_norm": 0.2510117631456629, + "learning_rate": 5.508358588182027e-06, + "loss": 0.3545, + "num_tokens": 554169144.0, + "step": 1347 + }, + { + "epoch": 1.1815749235474007, + "grad_norm": 0.2586205298542163, + "learning_rate": 5.502786197484806e-06, + "loss": 0.3615, + "num_tokens": 554603475.0, + "step": 1348 + }, + { + "epoch": 1.1819571865443426, + "grad_norm": 0.29712645669110715, + "learning_rate": 5.4972138025151955e-06, + "loss": 0.3344, + "num_tokens": 554958426.0, + "step": 1349 + }, + { + "epoch": 1.1823394495412844, + "grad_norm": 0.2552510588902102, + "learning_rate": 5.491641411817974e-06, + "loss": 0.3119, + "num_tokens": 555333405.0, + "step": 1350 + }, + { + "epoch": 1.1827217125382263, + "grad_norm": 0.25438854300226804, + "learning_rate": 5.486069033937907e-06, + "loss": 0.3353, + "num_tokens": 555765117.0, + "step": 1351 + }, + { + "epoch": 1.1831039755351682, + "grad_norm": 0.27644822120937756, + "learning_rate": 5.480496677419749e-06, + "loss": 0.3195, + "num_tokens": 556172255.0, + "step": 1352 + }, + { + "epoch": 1.18348623853211, + "grad_norm": 0.26108602086015437, + "learning_rate": 5.474924350808209e-06, + "loss": 0.3597, + "num_tokens": 556621721.0, + "step": 1353 + }, + { + "epoch": 1.183868501529052, + "grad_norm": 0.30431586492079626, + "learning_rate": 5.469352062647964e-06, + "loss": 0.3351, + "num_tokens": 557030718.0, + "step": 1354 + }, + { + "epoch": 1.1842507645259939, + "grad_norm": 0.251684584540872, + "learning_rate": 5.463779821483622e-06, + "loss": 0.3429, + "num_tokens": 557491165.0, + "step": 1355 + }, + { + "epoch": 1.1846330275229358, + "grad_norm": 0.3214276581907004, + "learning_rate": 5.4582076358597236e-06, + "loss": 0.3664, + "num_tokens": 557898868.0, + "step": 1356 + }, + { + "epoch": 1.1850152905198776, + "grad_norm": 0.2799918136166761, + "learning_rate": 5.452635514320721e-06, + "loss": 0.3319, + "num_tokens": 558288556.0, + "step": 1357 + }, + { + "epoch": 1.1853975535168195, + "grad_norm": 0.27860450698348965, + "learning_rate": 5.4470634654109734e-06, + "loss": 0.3433, + "num_tokens": 558695749.0, + "step": 1358 + }, + { + "epoch": 1.1857798165137614, + "grad_norm": 0.32964096494581424, + "learning_rate": 5.4414914976747256e-06, + "loss": 0.3448, + "num_tokens": 559082342.0, + "step": 1359 + }, + { + "epoch": 1.1861620795107033, + "grad_norm": 0.36620755813583367, + "learning_rate": 5.435919619656092e-06, + "loss": 0.3422, + "num_tokens": 559488552.0, + "step": 1360 + }, + { + "epoch": 1.1865443425076452, + "grad_norm": 0.3213781316508178, + "learning_rate": 5.4303478398990636e-06, + "loss": 0.3354, + "num_tokens": 559884114.0, + "step": 1361 + }, + { + "epoch": 1.186926605504587, + "grad_norm": 0.28058421787237275, + "learning_rate": 5.424776166947466e-06, + "loss": 0.3326, + "num_tokens": 560266302.0, + "step": 1362 + }, + { + "epoch": 1.1873088685015292, + "grad_norm": 0.3026842162831174, + "learning_rate": 5.419204609344971e-06, + "loss": 0.3428, + "num_tokens": 560631357.0, + "step": 1363 + }, + { + "epoch": 1.1876911314984708, + "grad_norm": 0.2747070776274948, + "learning_rate": 5.413633175635069e-06, + "loss": 0.3421, + "num_tokens": 561053439.0, + "step": 1364 + }, + { + "epoch": 1.188073394495413, + "grad_norm": 0.26901634762917187, + "learning_rate": 5.408061874361059e-06, + "loss": 0.3265, + "num_tokens": 561469160.0, + "step": 1365 + }, + { + "epoch": 1.1884556574923548, + "grad_norm": 0.2750589565784775, + "learning_rate": 5.402490714066042e-06, + "loss": 0.3338, + "num_tokens": 561876780.0, + "step": 1366 + }, + { + "epoch": 1.1888379204892967, + "grad_norm": 0.3069863313161933, + "learning_rate": 5.396919703292898e-06, + "loss": 0.3473, + "num_tokens": 562282744.0, + "step": 1367 + }, + { + "epoch": 1.1892201834862386, + "grad_norm": 0.2731402493495636, + "learning_rate": 5.391348850584283e-06, + "loss": 0.3293, + "num_tokens": 562698144.0, + "step": 1368 + }, + { + "epoch": 1.1896024464831805, + "grad_norm": 0.28710289800065253, + "learning_rate": 5.385778164482601e-06, + "loss": 0.3501, + "num_tokens": 563101816.0, + "step": 1369 + }, + { + "epoch": 1.1899847094801224, + "grad_norm": 0.3212726567400315, + "learning_rate": 5.380207653530014e-06, + "loss": 0.3603, + "num_tokens": 563494768.0, + "step": 1370 + }, + { + "epoch": 1.1903669724770642, + "grad_norm": 0.3271944057404377, + "learning_rate": 5.374637326268405e-06, + "loss": 0.3191, + "num_tokens": 563852980.0, + "step": 1371 + }, + { + "epoch": 1.1907492354740061, + "grad_norm": 0.27908977762854204, + "learning_rate": 5.36906719123938e-06, + "loss": 0.3353, + "num_tokens": 564274794.0, + "step": 1372 + }, + { + "epoch": 1.191131498470948, + "grad_norm": 0.2725219276464799, + "learning_rate": 5.363497256984246e-06, + "loss": 0.3629, + "num_tokens": 564726003.0, + "step": 1373 + }, + { + "epoch": 1.19151376146789, + "grad_norm": 0.2680686365133286, + "learning_rate": 5.357927532044008e-06, + "loss": 0.3178, + "num_tokens": 565121488.0, + "step": 1374 + }, + { + "epoch": 1.1918960244648318, + "grad_norm": 0.2515536947449138, + "learning_rate": 5.352358024959347e-06, + "loss": 0.3394, + "num_tokens": 565540090.0, + "step": 1375 + }, + { + "epoch": 1.1922782874617737, + "grad_norm": 0.25330141166030484, + "learning_rate": 5.346788744270606e-06, + "loss": 0.3467, + "num_tokens": 565937170.0, + "step": 1376 + }, + { + "epoch": 1.1926605504587156, + "grad_norm": 0.28424405686817317, + "learning_rate": 5.34121969851779e-06, + "loss": 0.3616, + "num_tokens": 566332486.0, + "step": 1377 + }, + { + "epoch": 1.1930428134556574, + "grad_norm": 0.28078474279921, + "learning_rate": 5.3356508962405355e-06, + "loss": 0.3429, + "num_tokens": 566766459.0, + "step": 1378 + }, + { + "epoch": 1.1934250764525993, + "grad_norm": 0.26702394113646466, + "learning_rate": 5.33008234597811e-06, + "loss": 0.3441, + "num_tokens": 567182057.0, + "step": 1379 + }, + { + "epoch": 1.1938073394495412, + "grad_norm": 0.2771604577913791, + "learning_rate": 5.3245140562693935e-06, + "loss": 0.3412, + "num_tokens": 567592867.0, + "step": 1380 + }, + { + "epoch": 1.194189602446483, + "grad_norm": 0.2300144032980534, + "learning_rate": 5.318946035652865e-06, + "loss": 0.3382, + "num_tokens": 568020880.0, + "step": 1381 + }, + { + "epoch": 1.1945718654434252, + "grad_norm": 0.24727581085690706, + "learning_rate": 5.313378292666593e-06, + "loss": 0.3313, + "num_tokens": 568430277.0, + "step": 1382 + }, + { + "epoch": 1.1949541284403669, + "grad_norm": 0.2523918862437742, + "learning_rate": 5.3078108358482195e-06, + "loss": 0.339, + "num_tokens": 568789434.0, + "step": 1383 + }, + { + "epoch": 1.195336391437309, + "grad_norm": 0.26821998228851673, + "learning_rate": 5.302243673734946e-06, + "loss": 0.3398, + "num_tokens": 569157627.0, + "step": 1384 + }, + { + "epoch": 1.1957186544342508, + "grad_norm": 0.27095145054063874, + "learning_rate": 5.296676814863526e-06, + "loss": 0.3616, + "num_tokens": 569603856.0, + "step": 1385 + }, + { + "epoch": 1.1961009174311927, + "grad_norm": 0.26178490868073856, + "learning_rate": 5.291110267770246e-06, + "loss": 0.3487, + "num_tokens": 570004028.0, + "step": 1386 + }, + { + "epoch": 1.1964831804281346, + "grad_norm": 0.3154005476351008, + "learning_rate": 5.285544040990911e-06, + "loss": 0.3508, + "num_tokens": 570435235.0, + "step": 1387 + }, + { + "epoch": 1.1968654434250765, + "grad_norm": 0.26614780841414865, + "learning_rate": 5.279978143060841e-06, + "loss": 0.3302, + "num_tokens": 570830181.0, + "step": 1388 + }, + { + "epoch": 1.1972477064220184, + "grad_norm": 0.2628416320831463, + "learning_rate": 5.274412582514845e-06, + "loss": 0.3606, + "num_tokens": 571233656.0, + "step": 1389 + }, + { + "epoch": 1.1976299694189603, + "grad_norm": 0.27715431223283626, + "learning_rate": 5.268847367887222e-06, + "loss": 0.3382, + "num_tokens": 571655604.0, + "step": 1390 + }, + { + "epoch": 1.1980122324159022, + "grad_norm": 0.2765569380362338, + "learning_rate": 5.263282507711734e-06, + "loss": 0.3606, + "num_tokens": 572086268.0, + "step": 1391 + }, + { + "epoch": 1.198394495412844, + "grad_norm": 0.26061891209710825, + "learning_rate": 5.2577180105216075e-06, + "loss": 0.3284, + "num_tokens": 572454480.0, + "step": 1392 + }, + { + "epoch": 1.198776758409786, + "grad_norm": 0.2893721214247075, + "learning_rate": 5.2521538848495015e-06, + "loss": 0.3378, + "num_tokens": 572842094.0, + "step": 1393 + }, + { + "epoch": 1.1991590214067278, + "grad_norm": 0.2665465987385463, + "learning_rate": 5.246590139227513e-06, + "loss": 0.3586, + "num_tokens": 573259429.0, + "step": 1394 + }, + { + "epoch": 1.1995412844036697, + "grad_norm": 0.25949626519122665, + "learning_rate": 5.2410267821871556e-06, + "loss": 0.3409, + "num_tokens": 573674345.0, + "step": 1395 + }, + { + "epoch": 1.1999235474006116, + "grad_norm": 0.25292850092251457, + "learning_rate": 5.235463822259343e-06, + "loss": 0.3485, + "num_tokens": 574084417.0, + "step": 1396 + }, + { + "epoch": 1.2003058103975535, + "grad_norm": 0.2616914259118696, + "learning_rate": 5.229901267974386e-06, + "loss": 0.338, + "num_tokens": 574570186.0, + "step": 1397 + }, + { + "epoch": 1.2006880733944953, + "grad_norm": 0.27877354549603256, + "learning_rate": 5.224339127861967e-06, + "loss": 0.3736, + "num_tokens": 574983429.0, + "step": 1398 + }, + { + "epoch": 1.2010703363914372, + "grad_norm": 0.2621265565285058, + "learning_rate": 5.21877741045114e-06, + "loss": 0.3476, + "num_tokens": 575395756.0, + "step": 1399 + }, + { + "epoch": 1.2014525993883791, + "grad_norm": 0.2742773048549955, + "learning_rate": 5.213216124270302e-06, + "loss": 0.3371, + "num_tokens": 575783755.0, + "step": 1400 + }, + { + "epoch": 1.2018348623853212, + "grad_norm": 0.27906385898119274, + "learning_rate": 5.2076552778472e-06, + "loss": 0.3329, + "num_tokens": 576200507.0, + "step": 1401 + }, + { + "epoch": 1.2022171253822629, + "grad_norm": 0.25687283866749777, + "learning_rate": 5.2020948797088966e-06, + "loss": 0.3179, + "num_tokens": 576576622.0, + "step": 1402 + }, + { + "epoch": 1.202599388379205, + "grad_norm": 0.2460919647336761, + "learning_rate": 5.196534938381772e-06, + "loss": 0.3479, + "num_tokens": 577015647.0, + "step": 1403 + }, + { + "epoch": 1.2029816513761469, + "grad_norm": 0.2700509491553811, + "learning_rate": 5.190975462391505e-06, + "loss": 0.3346, + "num_tokens": 577391454.0, + "step": 1404 + }, + { + "epoch": 1.2033639143730888, + "grad_norm": 0.2815930587995079, + "learning_rate": 5.185416460263061e-06, + "loss": 0.3729, + "num_tokens": 577793958.0, + "step": 1405 + }, + { + "epoch": 1.2037461773700306, + "grad_norm": 0.24162844223122162, + "learning_rate": 5.179857940520678e-06, + "loss": 0.322, + "num_tokens": 578207442.0, + "step": 1406 + }, + { + "epoch": 1.2041284403669725, + "grad_norm": 0.2765822632686362, + "learning_rate": 5.174299911687854e-06, + "loss": 0.3419, + "num_tokens": 578613416.0, + "step": 1407 + }, + { + "epoch": 1.2045107033639144, + "grad_norm": 0.24160945120242927, + "learning_rate": 5.16874238228734e-06, + "loss": 0.3165, + "num_tokens": 579041794.0, + "step": 1408 + }, + { + "epoch": 1.2048929663608563, + "grad_norm": 0.2554094526264656, + "learning_rate": 5.16318536084111e-06, + "loss": 0.3416, + "num_tokens": 579464950.0, + "step": 1409 + }, + { + "epoch": 1.2052752293577982, + "grad_norm": 0.26599550197518695, + "learning_rate": 5.157628855870369e-06, + "loss": 0.3571, + "num_tokens": 579867063.0, + "step": 1410 + }, + { + "epoch": 1.20565749235474, + "grad_norm": 0.2516927193627474, + "learning_rate": 5.152072875895524e-06, + "loss": 0.3473, + "num_tokens": 580302317.0, + "step": 1411 + }, + { + "epoch": 1.206039755351682, + "grad_norm": 0.24855271629994571, + "learning_rate": 5.1465174294361815e-06, + "loss": 0.3301, + "num_tokens": 580690091.0, + "step": 1412 + }, + { + "epoch": 1.2064220183486238, + "grad_norm": 0.24558978907512305, + "learning_rate": 5.1409625250111265e-06, + "loss": 0.3331, + "num_tokens": 581115425.0, + "step": 1413 + }, + { + "epoch": 1.2068042813455657, + "grad_norm": 0.2578010603148845, + "learning_rate": 5.1354081711383155e-06, + "loss": 0.3329, + "num_tokens": 581516764.0, + "step": 1414 + }, + { + "epoch": 1.2071865443425076, + "grad_norm": 0.23975504250036228, + "learning_rate": 5.129854376334859e-06, + "loss": 0.3191, + "num_tokens": 581940060.0, + "step": 1415 + }, + { + "epoch": 1.2075688073394495, + "grad_norm": 0.2484998360852266, + "learning_rate": 5.124301149117008e-06, + "loss": 0.3387, + "num_tokens": 582374651.0, + "step": 1416 + }, + { + "epoch": 1.2079510703363914, + "grad_norm": 0.26548934993454854, + "learning_rate": 5.11874849800015e-06, + "loss": 0.3481, + "num_tokens": 582763438.0, + "step": 1417 + }, + { + "epoch": 1.2083333333333333, + "grad_norm": 0.252163932582216, + "learning_rate": 5.113196431498783e-06, + "loss": 0.3535, + "num_tokens": 583241991.0, + "step": 1418 + }, + { + "epoch": 1.2087155963302751, + "grad_norm": 0.2783722614542156, + "learning_rate": 5.1076449581265084e-06, + "loss": 0.3518, + "num_tokens": 583647446.0, + "step": 1419 + }, + { + "epoch": 1.209097859327217, + "grad_norm": 0.2602301446119456, + "learning_rate": 5.102094086396021e-06, + "loss": 0.3488, + "num_tokens": 584081127.0, + "step": 1420 + }, + { + "epoch": 1.209480122324159, + "grad_norm": 0.27169725102755915, + "learning_rate": 5.096543824819096e-06, + "loss": 0.3628, + "num_tokens": 584509039.0, + "step": 1421 + }, + { + "epoch": 1.209862385321101, + "grad_norm": 0.2620672668182354, + "learning_rate": 5.0909941819065624e-06, + "loss": 0.3307, + "num_tokens": 584907216.0, + "step": 1422 + }, + { + "epoch": 1.210244648318043, + "grad_norm": 0.23969063514026234, + "learning_rate": 5.085445166168313e-06, + "loss": 0.3456, + "num_tokens": 585330821.0, + "step": 1423 + }, + { + "epoch": 1.2106269113149848, + "grad_norm": 0.2700596226005749, + "learning_rate": 5.079896786113271e-06, + "loss": 0.3551, + "num_tokens": 585730754.0, + "step": 1424 + }, + { + "epoch": 1.2110091743119267, + "grad_norm": 0.2479005234748073, + "learning_rate": 5.0743490502493865e-06, + "loss": 0.3592, + "num_tokens": 586172686.0, + "step": 1425 + }, + { + "epoch": 1.2113914373088686, + "grad_norm": 0.24222616670337951, + "learning_rate": 5.068801967083624e-06, + "loss": 0.3311, + "num_tokens": 586602209.0, + "step": 1426 + }, + { + "epoch": 1.2117737003058104, + "grad_norm": 0.299527088448113, + "learning_rate": 5.063255545121941e-06, + "loss": 0.3388, + "num_tokens": 586997897.0, + "step": 1427 + }, + { + "epoch": 1.2121559633027523, + "grad_norm": 0.24137978692899348, + "learning_rate": 5.057709792869291e-06, + "loss": 0.3382, + "num_tokens": 587430388.0, + "step": 1428 + }, + { + "epoch": 1.2125382262996942, + "grad_norm": 0.2389468213471267, + "learning_rate": 5.052164718829591e-06, + "loss": 0.3574, + "num_tokens": 587866141.0, + "step": 1429 + }, + { + "epoch": 1.212920489296636, + "grad_norm": 0.23843445927237836, + "learning_rate": 5.046620331505725e-06, + "loss": 0.3468, + "num_tokens": 588259863.0, + "step": 1430 + }, + { + "epoch": 1.213302752293578, + "grad_norm": 0.2592158152435474, + "learning_rate": 5.0410766393995196e-06, + "loss": 0.3598, + "num_tokens": 588692785.0, + "step": 1431 + }, + { + "epoch": 1.2136850152905199, + "grad_norm": 0.3236524856457128, + "learning_rate": 5.035533651011737e-06, + "loss": 0.3681, + "num_tokens": 589140008.0, + "step": 1432 + }, + { + "epoch": 1.2140672782874617, + "grad_norm": 0.3231716045774789, + "learning_rate": 5.029991374842058e-06, + "loss": 0.3776, + "num_tokens": 589577226.0, + "step": 1433 + }, + { + "epoch": 1.2144495412844036, + "grad_norm": 0.27330984416902876, + "learning_rate": 5.024449819389079e-06, + "loss": 0.3283, + "num_tokens": 589977520.0, + "step": 1434 + }, + { + "epoch": 1.2148318042813455, + "grad_norm": 0.27711815419739766, + "learning_rate": 5.0189089931502774e-06, + "loss": 0.3314, + "num_tokens": 590333940.0, + "step": 1435 + }, + { + "epoch": 1.2152140672782874, + "grad_norm": 0.25082337758936657, + "learning_rate": 5.0133689046220305e-06, + "loss": 0.35, + "num_tokens": 590769392.0, + "step": 1436 + }, + { + "epoch": 1.2155963302752293, + "grad_norm": 0.24514922403076825, + "learning_rate": 5.007829562299567e-06, + "loss": 0.3633, + "num_tokens": 591218518.0, + "step": 1437 + }, + { + "epoch": 1.2159785932721712, + "grad_norm": 0.23652350998859914, + "learning_rate": 5.00229097467698e-06, + "loss": 0.3578, + "num_tokens": 591674926.0, + "step": 1438 + }, + { + "epoch": 1.216360856269113, + "grad_norm": 0.2875587626857516, + "learning_rate": 4.996753150247206e-06, + "loss": 0.3416, + "num_tokens": 592069939.0, + "step": 1439 + }, + { + "epoch": 1.216743119266055, + "grad_norm": 0.26569620097643265, + "learning_rate": 4.991216097502009e-06, + "loss": 0.3596, + "num_tokens": 592451931.0, + "step": 1440 + }, + { + "epoch": 1.217125382262997, + "grad_norm": 0.23722529594349642, + "learning_rate": 4.985679824931973e-06, + "loss": 0.3123, + "num_tokens": 592877382.0, + "step": 1441 + }, + { + "epoch": 1.217507645259939, + "grad_norm": 0.2718248599459754, + "learning_rate": 4.980144341026475e-06, + "loss": 0.3361, + "num_tokens": 593280628.0, + "step": 1442 + }, + { + "epoch": 1.2178899082568808, + "grad_norm": 0.259969371177258, + "learning_rate": 4.974609654273699e-06, + "loss": 0.3166, + "num_tokens": 593683143.0, + "step": 1443 + }, + { + "epoch": 1.2182721712538227, + "grad_norm": 0.2858017941992173, + "learning_rate": 4.969075773160591e-06, + "loss": 0.3604, + "num_tokens": 594140574.0, + "step": 1444 + }, + { + "epoch": 1.2186544342507646, + "grad_norm": 0.25731140585770546, + "learning_rate": 4.963542706172875e-06, + "loss": 0.317, + "num_tokens": 594531170.0, + "step": 1445 + }, + { + "epoch": 1.2190366972477065, + "grad_norm": 0.26673586482778844, + "learning_rate": 4.958010461795015e-06, + "loss": 0.3275, + "num_tokens": 594929877.0, + "step": 1446 + }, + { + "epoch": 1.2194189602446484, + "grad_norm": 0.2724948758311067, + "learning_rate": 4.9524790485102245e-06, + "loss": 0.3538, + "num_tokens": 595330308.0, + "step": 1447 + }, + { + "epoch": 1.2198012232415902, + "grad_norm": 0.2638491077385973, + "learning_rate": 4.946948474800433e-06, + "loss": 0.3543, + "num_tokens": 595730073.0, + "step": 1448 + }, + { + "epoch": 1.2201834862385321, + "grad_norm": 0.28052594497754146, + "learning_rate": 4.941418749146285e-06, + "loss": 0.3575, + "num_tokens": 596136671.0, + "step": 1449 + }, + { + "epoch": 1.220565749235474, + "grad_norm": 0.2730903926179031, + "learning_rate": 4.935889880027131e-06, + "loss": 0.341, + "num_tokens": 596520791.0, + "step": 1450 + }, + { + "epoch": 1.220948012232416, + "grad_norm": 0.27587189339245105, + "learning_rate": 4.9303618759209985e-06, + "loss": 0.3364, + "num_tokens": 596925051.0, + "step": 1451 + }, + { + "epoch": 1.2213302752293578, + "grad_norm": 0.2331021046558726, + "learning_rate": 4.924834745304597e-06, + "loss": 0.3408, + "num_tokens": 597354974.0, + "step": 1452 + }, + { + "epoch": 1.2217125382262997, + "grad_norm": 0.26967929153955755, + "learning_rate": 4.919308496653291e-06, + "loss": 0.3365, + "num_tokens": 597745041.0, + "step": 1453 + }, + { + "epoch": 1.2220948012232415, + "grad_norm": 0.2441304698585968, + "learning_rate": 4.913783138441096e-06, + "loss": 0.34, + "num_tokens": 598166032.0, + "step": 1454 + }, + { + "epoch": 1.2224770642201834, + "grad_norm": 0.30568369318908595, + "learning_rate": 4.90825867914066e-06, + "loss": 0.3662, + "num_tokens": 598615529.0, + "step": 1455 + }, + { + "epoch": 1.2228593272171253, + "grad_norm": 0.27108813426412587, + "learning_rate": 4.902735127223251e-06, + "loss": 0.3357, + "num_tokens": 598962015.0, + "step": 1456 + }, + { + "epoch": 1.2232415902140672, + "grad_norm": 0.26376651550064845, + "learning_rate": 4.897212491158753e-06, + "loss": 0.3418, + "num_tokens": 599353019.0, + "step": 1457 + }, + { + "epoch": 1.223623853211009, + "grad_norm": 0.267246845896282, + "learning_rate": 4.891690779415635e-06, + "loss": 0.3292, + "num_tokens": 599738876.0, + "step": 1458 + }, + { + "epoch": 1.224006116207951, + "grad_norm": 0.2762491852336058, + "learning_rate": 4.886170000460956e-06, + "loss": 0.3429, + "num_tokens": 600147051.0, + "step": 1459 + }, + { + "epoch": 1.224388379204893, + "grad_norm": 0.2701923633567987, + "learning_rate": 4.880650162760342e-06, + "loss": 0.3381, + "num_tokens": 600546768.0, + "step": 1460 + }, + { + "epoch": 1.224770642201835, + "grad_norm": 0.27426144067796565, + "learning_rate": 4.8751312747779784e-06, + "loss": 0.3646, + "num_tokens": 600973826.0, + "step": 1461 + }, + { + "epoch": 1.2251529051987768, + "grad_norm": 0.2361830189895042, + "learning_rate": 4.869613344976593e-06, + "loss": 0.3586, + "num_tokens": 601413023.0, + "step": 1462 + }, + { + "epoch": 1.2255351681957187, + "grad_norm": 0.32601450858801245, + "learning_rate": 4.86409638181744e-06, + "loss": 0.3493, + "num_tokens": 601815906.0, + "step": 1463 + }, + { + "epoch": 1.2259174311926606, + "grad_norm": 0.27346705309985486, + "learning_rate": 4.858580393760295e-06, + "loss": 0.3237, + "num_tokens": 602217583.0, + "step": 1464 + }, + { + "epoch": 1.2262996941896025, + "grad_norm": 0.32413952008488867, + "learning_rate": 4.853065389263442e-06, + "loss": 0.3583, + "num_tokens": 602650577.0, + "step": 1465 + }, + { + "epoch": 1.2266819571865444, + "grad_norm": 0.32723787793552767, + "learning_rate": 4.84755137678365e-06, + "loss": 0.3475, + "num_tokens": 603086817.0, + "step": 1466 + }, + { + "epoch": 1.2270642201834863, + "grad_norm": 0.268616572995574, + "learning_rate": 4.842038364776171e-06, + "loss": 0.3561, + "num_tokens": 603512296.0, + "step": 1467 + }, + { + "epoch": 1.2274464831804281, + "grad_norm": 0.2709343214222051, + "learning_rate": 4.836526361694724e-06, + "loss": 0.3643, + "num_tokens": 603931069.0, + "step": 1468 + }, + { + "epoch": 1.22782874617737, + "grad_norm": 0.3260819738706885, + "learning_rate": 4.8310153759914745e-06, + "loss": 0.3697, + "num_tokens": 604328786.0, + "step": 1469 + }, + { + "epoch": 1.228211009174312, + "grad_norm": 0.27786297560386686, + "learning_rate": 4.825505416117034e-06, + "loss": 0.3543, + "num_tokens": 604734964.0, + "step": 1470 + }, + { + "epoch": 1.2285932721712538, + "grad_norm": 0.2604192561524548, + "learning_rate": 4.819996490520438e-06, + "loss": 0.3271, + "num_tokens": 605178313.0, + "step": 1471 + }, + { + "epoch": 1.2289755351681957, + "grad_norm": 0.26109735350940677, + "learning_rate": 4.814488607649141e-06, + "loss": 0.3524, + "num_tokens": 605592377.0, + "step": 1472 + }, + { + "epoch": 1.2293577981651376, + "grad_norm": 0.2797810462333863, + "learning_rate": 4.808981775948989e-06, + "loss": 0.3476, + "num_tokens": 605998341.0, + "step": 1473 + }, + { + "epoch": 1.2297400611620795, + "grad_norm": 0.24640832170391241, + "learning_rate": 4.803476003864227e-06, + "loss": 0.3535, + "num_tokens": 606401178.0, + "step": 1474 + }, + { + "epoch": 1.2301223241590213, + "grad_norm": 0.2942989088750153, + "learning_rate": 4.797971299837466e-06, + "loss": 0.336, + "num_tokens": 606804645.0, + "step": 1475 + }, + { + "epoch": 1.2305045871559632, + "grad_norm": 0.28471978921225577, + "learning_rate": 4.792467672309686e-06, + "loss": 0.3611, + "num_tokens": 607267175.0, + "step": 1476 + }, + { + "epoch": 1.230886850152905, + "grad_norm": 0.2577764435469393, + "learning_rate": 4.7869651297202144e-06, + "loss": 0.3389, + "num_tokens": 607695846.0, + "step": 1477 + }, + { + "epoch": 1.231269113149847, + "grad_norm": 0.2552033630619719, + "learning_rate": 4.78146368050671e-06, + "loss": 0.3786, + "num_tokens": 608156545.0, + "step": 1478 + }, + { + "epoch": 1.231651376146789, + "grad_norm": 0.2678851518078223, + "learning_rate": 4.775963333105161e-06, + "loss": 0.3833, + "num_tokens": 608601289.0, + "step": 1479 + }, + { + "epoch": 1.2320336391437308, + "grad_norm": 0.25369458979380793, + "learning_rate": 4.770464095949865e-06, + "loss": 0.3541, + "num_tokens": 609027276.0, + "step": 1480 + }, + { + "epoch": 1.2324159021406729, + "grad_norm": 0.2659788503745061, + "learning_rate": 4.764965977473416e-06, + "loss": 0.3517, + "num_tokens": 609471043.0, + "step": 1481 + }, + { + "epoch": 1.2327981651376148, + "grad_norm": 0.2657842908883709, + "learning_rate": 4.7594689861066904e-06, + "loss": 0.3515, + "num_tokens": 609909672.0, + "step": 1482 + }, + { + "epoch": 1.2331804281345566, + "grad_norm": 0.25381790180089137, + "learning_rate": 4.7539731302788435e-06, + "loss": 0.3375, + "num_tokens": 610300328.0, + "step": 1483 + }, + { + "epoch": 1.2335626911314985, + "grad_norm": 0.28007643986816405, + "learning_rate": 4.7484784184172796e-06, + "loss": 0.3672, + "num_tokens": 610697792.0, + "step": 1484 + }, + { + "epoch": 1.2339449541284404, + "grad_norm": 0.2665383822644655, + "learning_rate": 4.742984858947658e-06, + "loss": 0.3383, + "num_tokens": 611105840.0, + "step": 1485 + }, + { + "epoch": 1.2343272171253823, + "grad_norm": 0.26259115872856414, + "learning_rate": 4.737492460293865e-06, + "loss": 0.3452, + "num_tokens": 611518454.0, + "step": 1486 + }, + { + "epoch": 1.2347094801223242, + "grad_norm": 0.2894301650285961, + "learning_rate": 4.7320012308780074e-06, + "loss": 0.3782, + "num_tokens": 611939223.0, + "step": 1487 + }, + { + "epoch": 1.235091743119266, + "grad_norm": 0.27807499399347135, + "learning_rate": 4.726511179120402e-06, + "loss": 0.3441, + "num_tokens": 612312398.0, + "step": 1488 + }, + { + "epoch": 1.235474006116208, + "grad_norm": 0.2613156134344048, + "learning_rate": 4.721022313439556e-06, + "loss": 0.3534, + "num_tokens": 612721537.0, + "step": 1489 + }, + { + "epoch": 1.2358562691131498, + "grad_norm": 0.2903192050933763, + "learning_rate": 4.715534642252163e-06, + "loss": 0.3337, + "num_tokens": 613089683.0, + "step": 1490 + }, + { + "epoch": 1.2362385321100917, + "grad_norm": 0.283007698889918, + "learning_rate": 4.71004817397308e-06, + "loss": 0.3579, + "num_tokens": 613554657.0, + "step": 1491 + }, + { + "epoch": 1.2366207951070336, + "grad_norm": 0.2581786248807065, + "learning_rate": 4.704562917015321e-06, + "loss": 0.3516, + "num_tokens": 613997508.0, + "step": 1492 + }, + { + "epoch": 1.2370030581039755, + "grad_norm": 0.2694670126769742, + "learning_rate": 4.6990788797900435e-06, + "loss": 0.3618, + "num_tokens": 614386449.0, + "step": 1493 + }, + { + "epoch": 1.2373853211009174, + "grad_norm": 0.2274473701492788, + "learning_rate": 4.693596070706535e-06, + "loss": 0.3404, + "num_tokens": 614825633.0, + "step": 1494 + }, + { + "epoch": 1.2377675840978593, + "grad_norm": 0.27876404382055525, + "learning_rate": 4.688114498172196e-06, + "loss": 0.3535, + "num_tokens": 615233900.0, + "step": 1495 + }, + { + "epoch": 1.2381498470948011, + "grad_norm": 0.3242892792938995, + "learning_rate": 4.682634170592537e-06, + "loss": 0.3682, + "num_tokens": 615658861.0, + "step": 1496 + }, + { + "epoch": 1.238532110091743, + "grad_norm": 0.2787711577260471, + "learning_rate": 4.677155096371153e-06, + "loss": 0.3611, + "num_tokens": 616063526.0, + "step": 1497 + }, + { + "epoch": 1.2389143730886851, + "grad_norm": 0.2554049172006759, + "learning_rate": 4.671677283909722e-06, + "loss": 0.3346, + "num_tokens": 616443518.0, + "step": 1498 + }, + { + "epoch": 1.2392966360856268, + "grad_norm": 0.26168405164306663, + "learning_rate": 4.666200741607987e-06, + "loss": 0.3441, + "num_tokens": 616842118.0, + "step": 1499 + }, + { + "epoch": 1.239678899082569, + "grad_norm": 0.267189197129404, + "learning_rate": 4.660725477863738e-06, + "loss": 0.3509, + "num_tokens": 617256623.0, + "step": 1500 + }, + { + "epoch": 1.2400611620795108, + "grad_norm": 0.27880337082632484, + "learning_rate": 4.65525150107281e-06, + "loss": 0.3601, + "num_tokens": 617675632.0, + "step": 1501 + }, + { + "epoch": 1.2404434250764527, + "grad_norm": 0.2773574935020381, + "learning_rate": 4.649778819629062e-06, + "loss": 0.3688, + "num_tokens": 618131018.0, + "step": 1502 + }, + { + "epoch": 1.2408256880733946, + "grad_norm": 0.24667708079763367, + "learning_rate": 4.6443074419243695e-06, + "loss": 0.3227, + "num_tokens": 618534447.0, + "step": 1503 + }, + { + "epoch": 1.2412079510703364, + "grad_norm": 0.25792012719285073, + "learning_rate": 4.638837376348603e-06, + "loss": 0.3488, + "num_tokens": 618964198.0, + "step": 1504 + }, + { + "epoch": 1.2415902140672783, + "grad_norm": 0.2443927818637308, + "learning_rate": 4.633368631289628e-06, + "loss": 0.3413, + "num_tokens": 619414257.0, + "step": 1505 + }, + { + "epoch": 1.2419724770642202, + "grad_norm": 0.24132322852009902, + "learning_rate": 4.6279012151332815e-06, + "loss": 0.3257, + "num_tokens": 619801643.0, + "step": 1506 + }, + { + "epoch": 1.242354740061162, + "grad_norm": 0.30312409830588954, + "learning_rate": 4.622435136263363e-06, + "loss": 0.3748, + "num_tokens": 620263688.0, + "step": 1507 + }, + { + "epoch": 1.242737003058104, + "grad_norm": 0.29370964523192583, + "learning_rate": 4.61697040306162e-06, + "loss": 0.387, + "num_tokens": 620692631.0, + "step": 1508 + }, + { + "epoch": 1.2431192660550459, + "grad_norm": 0.2570004905550023, + "learning_rate": 4.6115070239077385e-06, + "loss": 0.3353, + "num_tokens": 621100009.0, + "step": 1509 + }, + { + "epoch": 1.2435015290519877, + "grad_norm": 0.28160061630265976, + "learning_rate": 4.6060450071793295e-06, + "loss": 0.3435, + "num_tokens": 621488160.0, + "step": 1510 + }, + { + "epoch": 1.2438837920489296, + "grad_norm": 0.2605355082911742, + "learning_rate": 4.600584361251909e-06, + "loss": 0.3568, + "num_tokens": 621904777.0, + "step": 1511 + }, + { + "epoch": 1.2442660550458715, + "grad_norm": 0.2704803648218184, + "learning_rate": 4.595125094498899e-06, + "loss": 0.3662, + "num_tokens": 622311490.0, + "step": 1512 + }, + { + "epoch": 1.2446483180428134, + "grad_norm": 0.2567803921692123, + "learning_rate": 4.589667215291601e-06, + "loss": 0.3404, + "num_tokens": 622737647.0, + "step": 1513 + }, + { + "epoch": 1.2450305810397553, + "grad_norm": 0.2738559274542928, + "learning_rate": 4.5842107319991916e-06, + "loss": 0.3521, + "num_tokens": 623131448.0, + "step": 1514 + }, + { + "epoch": 1.2454128440366972, + "grad_norm": 0.23455746570437214, + "learning_rate": 4.578755652988705e-06, + "loss": 0.3469, + "num_tokens": 623552011.0, + "step": 1515 + }, + { + "epoch": 1.245795107033639, + "grad_norm": 0.33530673271263983, + "learning_rate": 4.5733019866250215e-06, + "loss": 0.342, + "num_tokens": 623962191.0, + "step": 1516 + }, + { + "epoch": 1.2461773700305812, + "grad_norm": 0.3229263283677262, + "learning_rate": 4.567849741270858e-06, + "loss": 0.3642, + "num_tokens": 624400925.0, + "step": 1517 + }, + { + "epoch": 1.2465596330275228, + "grad_norm": 0.30700483648842336, + "learning_rate": 4.562398925286753e-06, + "loss": 0.3736, + "num_tokens": 624833682.0, + "step": 1518 + }, + { + "epoch": 1.246941896024465, + "grad_norm": 0.2616954995028591, + "learning_rate": 4.556949547031048e-06, + "loss": 0.346, + "num_tokens": 625268088.0, + "step": 1519 + }, + { + "epoch": 1.2473241590214068, + "grad_norm": 0.2698282431970166, + "learning_rate": 4.551501614859882e-06, + "loss": 0.3256, + "num_tokens": 625646330.0, + "step": 1520 + }, + { + "epoch": 1.2477064220183487, + "grad_norm": 0.28539214412216046, + "learning_rate": 4.546055137127182e-06, + "loss": 0.3658, + "num_tokens": 626076391.0, + "step": 1521 + }, + { + "epoch": 1.2480886850152906, + "grad_norm": 0.30462971972459074, + "learning_rate": 4.540610122184637e-06, + "loss": 0.344, + "num_tokens": 626436204.0, + "step": 1522 + }, + { + "epoch": 1.2484709480122325, + "grad_norm": 0.2716450498903016, + "learning_rate": 4.535166578381699e-06, + "loss": 0.3496, + "num_tokens": 626835819.0, + "step": 1523 + }, + { + "epoch": 1.2488532110091743, + "grad_norm": 0.2913944763533766, + "learning_rate": 4.529724514065558e-06, + "loss": 0.3496, + "num_tokens": 627245387.0, + "step": 1524 + }, + { + "epoch": 1.2492354740061162, + "grad_norm": 0.2671265516175696, + "learning_rate": 4.5242839375811405e-06, + "loss": 0.3613, + "num_tokens": 627688997.0, + "step": 1525 + }, + { + "epoch": 1.2496177370030581, + "grad_norm": 0.2414177584090394, + "learning_rate": 4.51884485727109e-06, + "loss": 0.3324, + "num_tokens": 628127716.0, + "step": 1526 + }, + { + "epoch": 1.25, + "grad_norm": 0.2810210338065537, + "learning_rate": 4.513407281475757e-06, + "loss": 0.354, + "num_tokens": 628549294.0, + "step": 1527 + }, + { + "epoch": 1.2503822629969419, + "grad_norm": 0.2644659255243031, + "learning_rate": 4.50797121853318e-06, + "loss": 0.3482, + "num_tokens": 628950899.0, + "step": 1528 + }, + { + "epoch": 1.2507645259938838, + "grad_norm": 0.2393720310392607, + "learning_rate": 4.502536676779083e-06, + "loss": 0.3442, + "num_tokens": 629338364.0, + "step": 1529 + }, + { + "epoch": 1.2511467889908257, + "grad_norm": 0.2528096983787305, + "learning_rate": 4.497103664546858e-06, + "loss": 0.3594, + "num_tokens": 629757170.0, + "step": 1530 + }, + { + "epoch": 1.2515290519877675, + "grad_norm": 0.26294797201341047, + "learning_rate": 4.4916721901675455e-06, + "loss": 0.3445, + "num_tokens": 630128300.0, + "step": 1531 + }, + { + "epoch": 1.2519113149847094, + "grad_norm": 0.2276393231641771, + "learning_rate": 4.4862422619698335e-06, + "loss": 0.3452, + "num_tokens": 630609330.0, + "step": 1532 + }, + { + "epoch": 1.2522935779816513, + "grad_norm": 0.23850573359463312, + "learning_rate": 4.480813888280034e-06, + "loss": 0.3458, + "num_tokens": 631001433.0, + "step": 1533 + }, + { + "epoch": 1.2526758409785932, + "grad_norm": 0.23984901912773515, + "learning_rate": 4.475387077422083e-06, + "loss": 0.3339, + "num_tokens": 631402755.0, + "step": 1534 + }, + { + "epoch": 1.253058103975535, + "grad_norm": 0.22831802415425598, + "learning_rate": 4.469961837717512e-06, + "loss": 0.3439, + "num_tokens": 631825743.0, + "step": 1535 + }, + { + "epoch": 1.2534403669724772, + "grad_norm": 0.24260055909481143, + "learning_rate": 4.46453817748545e-06, + "loss": 0.3176, + "num_tokens": 632230041.0, + "step": 1536 + }, + { + "epoch": 1.2538226299694188, + "grad_norm": 0.2751155180509015, + "learning_rate": 4.459116105042598e-06, + "loss": 0.3733, + "num_tokens": 632626284.0, + "step": 1537 + }, + { + "epoch": 1.254204892966361, + "grad_norm": 0.2553636590595797, + "learning_rate": 4.453695628703226e-06, + "loss": 0.372, + "num_tokens": 633059142.0, + "step": 1538 + }, + { + "epoch": 1.2545871559633026, + "grad_norm": 0.23782395821537206, + "learning_rate": 4.448276756779156e-06, + "loss": 0.3764, + "num_tokens": 633509955.0, + "step": 1539 + }, + { + "epoch": 1.2549694189602447, + "grad_norm": 0.24711199820104984, + "learning_rate": 4.442859497579746e-06, + "loss": 0.3191, + "num_tokens": 633919276.0, + "step": 1540 + }, + { + "epoch": 1.2553516819571866, + "grad_norm": 0.2445412621012625, + "learning_rate": 4.4374438594118884e-06, + "loss": 0.348, + "num_tokens": 634326945.0, + "step": 1541 + }, + { + "epoch": 1.2557339449541285, + "grad_norm": 0.2372231240419333, + "learning_rate": 4.432029850579983e-06, + "loss": 0.341, + "num_tokens": 634742618.0, + "step": 1542 + }, + { + "epoch": 1.2561162079510704, + "grad_norm": 0.2499743494221262, + "learning_rate": 4.4266174793859375e-06, + "loss": 0.3539, + "num_tokens": 635188965.0, + "step": 1543 + }, + { + "epoch": 1.2564984709480123, + "grad_norm": 0.304348645689502, + "learning_rate": 4.421206754129142e-06, + "loss": 0.3677, + "num_tokens": 635597341.0, + "step": 1544 + }, + { + "epoch": 1.2568807339449541, + "grad_norm": 0.2270217774870297, + "learning_rate": 4.4157976831064664e-06, + "loss": 0.3305, + "num_tokens": 636044285.0, + "step": 1545 + }, + { + "epoch": 1.257262996941896, + "grad_norm": 0.2727457966571208, + "learning_rate": 4.410390274612241e-06, + "loss": 0.366, + "num_tokens": 636417305.0, + "step": 1546 + }, + { + "epoch": 1.257645259938838, + "grad_norm": 0.26319062485226313, + "learning_rate": 4.4049845369382525e-06, + "loss": 0.349, + "num_tokens": 636826472.0, + "step": 1547 + }, + { + "epoch": 1.2580275229357798, + "grad_norm": 0.28737450977268864, + "learning_rate": 4.3995804783737185e-06, + "loss": 0.3298, + "num_tokens": 637184366.0, + "step": 1548 + }, + { + "epoch": 1.2584097859327217, + "grad_norm": 0.276216735971841, + "learning_rate": 4.394178107205289e-06, + "loss": 0.351, + "num_tokens": 637604816.0, + "step": 1549 + }, + { + "epoch": 1.2587920489296636, + "grad_norm": 0.2843474613126444, + "learning_rate": 4.388777431717022e-06, + "loss": 0.3602, + "num_tokens": 637995589.0, + "step": 1550 + }, + { + "epoch": 1.2591743119266054, + "grad_norm": 0.24955350478748686, + "learning_rate": 4.383378460190373e-06, + "loss": 0.3488, + "num_tokens": 638367778.0, + "step": 1551 + }, + { + "epoch": 1.2595565749235473, + "grad_norm": 0.2593543321261272, + "learning_rate": 4.377981200904191e-06, + "loss": 0.3523, + "num_tokens": 638806793.0, + "step": 1552 + }, + { + "epoch": 1.2599388379204892, + "grad_norm": 0.24658179112136044, + "learning_rate": 4.372585662134695e-06, + "loss": 0.3339, + "num_tokens": 639214449.0, + "step": 1553 + }, + { + "epoch": 1.260321100917431, + "grad_norm": 0.26999169634474374, + "learning_rate": 4.367191852155467e-06, + "loss": 0.3442, + "num_tokens": 639601441.0, + "step": 1554 + }, + { + "epoch": 1.2607033639143732, + "grad_norm": 0.2965005020667414, + "learning_rate": 4.3617997792374365e-06, + "loss": 0.3572, + "num_tokens": 640049212.0, + "step": 1555 + }, + { + "epoch": 1.2610856269113149, + "grad_norm": 0.24517880168622486, + "learning_rate": 4.3564094516488755e-06, + "loss": 0.3788, + "num_tokens": 640519317.0, + "step": 1556 + }, + { + "epoch": 1.261467889908257, + "grad_norm": 0.2649115105411148, + "learning_rate": 4.351020877655369e-06, + "loss": 0.3306, + "num_tokens": 640901086.0, + "step": 1557 + }, + { + "epoch": 1.2618501529051986, + "grad_norm": 0.27893059877771775, + "learning_rate": 4.345634065519824e-06, + "loss": 0.3457, + "num_tokens": 641280383.0, + "step": 1558 + }, + { + "epoch": 1.2622324159021407, + "grad_norm": 0.2529691612743419, + "learning_rate": 4.340249023502439e-06, + "loss": 0.3268, + "num_tokens": 641667769.0, + "step": 1559 + }, + { + "epoch": 1.2626146788990826, + "grad_norm": 0.276398285482857, + "learning_rate": 4.3348657598607004e-06, + "loss": 0.3555, + "num_tokens": 642042009.0, + "step": 1560 + }, + { + "epoch": 1.2629969418960245, + "grad_norm": 0.2602587942489892, + "learning_rate": 4.329484282849367e-06, + "loss": 0.3631, + "num_tokens": 642462705.0, + "step": 1561 + }, + { + "epoch": 1.2633792048929664, + "grad_norm": 0.22366945952893483, + "learning_rate": 4.324104600720457e-06, + "loss": 0.3336, + "num_tokens": 642868439.0, + "step": 1562 + }, + { + "epoch": 1.2637614678899083, + "grad_norm": 0.23967243185765635, + "learning_rate": 4.31872672172324e-06, + "loss": 0.361, + "num_tokens": 643291691.0, + "step": 1563 + }, + { + "epoch": 1.2641437308868502, + "grad_norm": 0.26184581121011513, + "learning_rate": 4.313350654104215e-06, + "loss": 0.3605, + "num_tokens": 643716008.0, + "step": 1564 + }, + { + "epoch": 1.264525993883792, + "grad_norm": 0.24456196635927346, + "learning_rate": 4.307976406107112e-06, + "loss": 0.3494, + "num_tokens": 644152992.0, + "step": 1565 + }, + { + "epoch": 1.264908256880734, + "grad_norm": 0.2410128098271759, + "learning_rate": 4.302603985972861e-06, + "loss": 0.3516, + "num_tokens": 644558328.0, + "step": 1566 + }, + { + "epoch": 1.2652905198776758, + "grad_norm": 0.24540840942105183, + "learning_rate": 4.297233401939595e-06, + "loss": 0.359, + "num_tokens": 644989889.0, + "step": 1567 + }, + { + "epoch": 1.2656727828746177, + "grad_norm": 0.2523268012164393, + "learning_rate": 4.291864662242629e-06, + "loss": 0.3723, + "num_tokens": 645405233.0, + "step": 1568 + }, + { + "epoch": 1.2660550458715596, + "grad_norm": 0.25049435681540944, + "learning_rate": 4.286497775114453e-06, + "loss": 0.3803, + "num_tokens": 645846016.0, + "step": 1569 + }, + { + "epoch": 1.2664373088685015, + "grad_norm": 0.2846210627481902, + "learning_rate": 4.281132748784714e-06, + "loss": 0.3643, + "num_tokens": 646247293.0, + "step": 1570 + }, + { + "epoch": 1.2668195718654434, + "grad_norm": 0.25653061531991805, + "learning_rate": 4.275769591480203e-06, + "loss": 0.3599, + "num_tokens": 646689866.0, + "step": 1571 + }, + { + "epoch": 1.2672018348623852, + "grad_norm": 0.24902462114208151, + "learning_rate": 4.27040831142485e-06, + "loss": 0.3513, + "num_tokens": 647126701.0, + "step": 1572 + }, + { + "epoch": 1.2675840978593271, + "grad_norm": 0.26306414560476404, + "learning_rate": 4.265048916839703e-06, + "loss": 0.3496, + "num_tokens": 647518508.0, + "step": 1573 + }, + { + "epoch": 1.2679663608562692, + "grad_norm": 0.2537441942123504, + "learning_rate": 4.259691415942923e-06, + "loss": 0.3263, + "num_tokens": 647918630.0, + "step": 1574 + }, + { + "epoch": 1.268348623853211, + "grad_norm": 0.23722274611176833, + "learning_rate": 4.2543358169497615e-06, + "loss": 0.334, + "num_tokens": 648340238.0, + "step": 1575 + }, + { + "epoch": 1.268730886850153, + "grad_norm": 0.25457027845642066, + "learning_rate": 4.2489821280725575e-06, + "loss": 0.3254, + "num_tokens": 648716684.0, + "step": 1576 + }, + { + "epoch": 1.2691131498470947, + "grad_norm": 0.27670111132325226, + "learning_rate": 4.243630357520717e-06, + "loss": 0.3705, + "num_tokens": 649192883.0, + "step": 1577 + }, + { + "epoch": 1.2694954128440368, + "grad_norm": 0.24326388511272504, + "learning_rate": 4.238280513500712e-06, + "loss": 0.3706, + "num_tokens": 649660463.0, + "step": 1578 + }, + { + "epoch": 1.2698776758409787, + "grad_norm": 0.2425460141036009, + "learning_rate": 4.2329326042160525e-06, + "loss": 0.3372, + "num_tokens": 650089212.0, + "step": 1579 + }, + { + "epoch": 1.2702599388379205, + "grad_norm": 0.2654402725799945, + "learning_rate": 4.227586637867286e-06, + "loss": 0.3621, + "num_tokens": 650513932.0, + "step": 1580 + }, + { + "epoch": 1.2706422018348624, + "grad_norm": 0.2699134869489444, + "learning_rate": 4.22224262265198e-06, + "loss": 0.3502, + "num_tokens": 650944041.0, + "step": 1581 + }, + { + "epoch": 1.2710244648318043, + "grad_norm": 0.2605280217705378, + "learning_rate": 4.216900566764706e-06, + "loss": 0.3606, + "num_tokens": 651375352.0, + "step": 1582 + }, + { + "epoch": 1.2714067278287462, + "grad_norm": 0.2592068102550442, + "learning_rate": 4.2115604783970395e-06, + "loss": 0.3634, + "num_tokens": 651832360.0, + "step": 1583 + }, + { + "epoch": 1.271788990825688, + "grad_norm": 0.2636462795464737, + "learning_rate": 4.206222365737531e-06, + "loss": 0.3432, + "num_tokens": 652235375.0, + "step": 1584 + }, + { + "epoch": 1.27217125382263, + "grad_norm": 0.2652782810774488, + "learning_rate": 4.200886236971707e-06, + "loss": 0.3377, + "num_tokens": 652663291.0, + "step": 1585 + }, + { + "epoch": 1.2725535168195719, + "grad_norm": 0.284647558387535, + "learning_rate": 4.1955521002820455e-06, + "loss": 0.3469, + "num_tokens": 653059005.0, + "step": 1586 + }, + { + "epoch": 1.2729357798165137, + "grad_norm": 0.24201335199991722, + "learning_rate": 4.190219963847979e-06, + "loss": 0.3339, + "num_tokens": 653433740.0, + "step": 1587 + }, + { + "epoch": 1.2733180428134556, + "grad_norm": 0.2389642406696781, + "learning_rate": 4.184889835845862e-06, + "loss": 0.3265, + "num_tokens": 653837513.0, + "step": 1588 + }, + { + "epoch": 1.2737003058103975, + "grad_norm": 0.3063674740395495, + "learning_rate": 4.179561724448982e-06, + "loss": 0.3386, + "num_tokens": 654194858.0, + "step": 1589 + }, + { + "epoch": 1.2740825688073394, + "grad_norm": 0.2448844279083538, + "learning_rate": 4.174235637827521e-06, + "loss": 0.3154, + "num_tokens": 654600496.0, + "step": 1590 + }, + { + "epoch": 1.2744648318042813, + "grad_norm": 0.24337436179560917, + "learning_rate": 4.168911584148564e-06, + "loss": 0.3529, + "num_tokens": 655055781.0, + "step": 1591 + }, + { + "epoch": 1.2748470948012232, + "grad_norm": 0.24747389328879174, + "learning_rate": 4.163589571576076e-06, + "loss": 0.3176, + "num_tokens": 655486620.0, + "step": 1592 + }, + { + "epoch": 1.2752293577981653, + "grad_norm": 0.27613753335126656, + "learning_rate": 4.158269608270894e-06, + "loss": 0.3348, + "num_tokens": 655865221.0, + "step": 1593 + }, + { + "epoch": 1.275611620795107, + "grad_norm": 0.25013078860053084, + "learning_rate": 4.152951702390713e-06, + "loss": 0.3363, + "num_tokens": 656328076.0, + "step": 1594 + }, + { + "epoch": 1.275993883792049, + "grad_norm": 0.26905231532084006, + "learning_rate": 4.147635862090068e-06, + "loss": 0.3323, + "num_tokens": 656717113.0, + "step": 1595 + }, + { + "epoch": 1.2763761467889907, + "grad_norm": 0.24051223036770084, + "learning_rate": 4.142322095520334e-06, + "loss": 0.344, + "num_tokens": 657136128.0, + "step": 1596 + }, + { + "epoch": 1.2767584097859328, + "grad_norm": 0.24697322697358998, + "learning_rate": 4.1370104108297025e-06, + "loss": 0.3418, + "num_tokens": 657517257.0, + "step": 1597 + }, + { + "epoch": 1.2771406727828747, + "grad_norm": 0.2608386030485975, + "learning_rate": 4.13170081616317e-06, + "loss": 0.3525, + "num_tokens": 657934131.0, + "step": 1598 + }, + { + "epoch": 1.2775229357798166, + "grad_norm": 0.2569393826707226, + "learning_rate": 4.126393319662531e-06, + "loss": 0.3309, + "num_tokens": 658288548.0, + "step": 1599 + }, + { + "epoch": 1.2779051987767585, + "grad_norm": 0.25399379302142755, + "learning_rate": 4.121087929466366e-06, + "loss": 0.3434, + "num_tokens": 658675160.0, + "step": 1600 + }, + { + "epoch": 1.2782874617737003, + "grad_norm": 0.26215250516895544, + "learning_rate": 4.11578465371002e-06, + "loss": 0.3521, + "num_tokens": 659095282.0, + "step": 1601 + }, + { + "epoch": 1.2786697247706422, + "grad_norm": 0.2592975490564625, + "learning_rate": 4.110483500525595e-06, + "loss": 0.3397, + "num_tokens": 659534103.0, + "step": 1602 + }, + { + "epoch": 1.279051987767584, + "grad_norm": 0.2648017001314538, + "learning_rate": 4.105184478041945e-06, + "loss": 0.3311, + "num_tokens": 659925000.0, + "step": 1603 + }, + { + "epoch": 1.279434250764526, + "grad_norm": 0.2877984613405508, + "learning_rate": 4.09988759438465e-06, + "loss": 0.348, + "num_tokens": 660340498.0, + "step": 1604 + }, + { + "epoch": 1.2798165137614679, + "grad_norm": 0.24806556714119524, + "learning_rate": 4.094592857676015e-06, + "loss": 0.35, + "num_tokens": 660804431.0, + "step": 1605 + }, + { + "epoch": 1.2801987767584098, + "grad_norm": 0.26673730452209415, + "learning_rate": 4.08930027603505e-06, + "loss": 0.3584, + "num_tokens": 661247118.0, + "step": 1606 + }, + { + "epoch": 1.2805810397553516, + "grad_norm": 0.2598817801332126, + "learning_rate": 4.084009857577462e-06, + "loss": 0.3504, + "num_tokens": 661672028.0, + "step": 1607 + }, + { + "epoch": 1.2809633027522935, + "grad_norm": 0.2680424098907622, + "learning_rate": 4.078721610415637e-06, + "loss": 0.3534, + "num_tokens": 662058492.0, + "step": 1608 + }, + { + "epoch": 1.2813455657492354, + "grad_norm": 0.2966087797441796, + "learning_rate": 4.07343554265864e-06, + "loss": 0.3457, + "num_tokens": 662458194.0, + "step": 1609 + }, + { + "epoch": 1.2817278287461773, + "grad_norm": 0.2623679594058613, + "learning_rate": 4.0681516624121845e-06, + "loss": 0.3449, + "num_tokens": 662860956.0, + "step": 1610 + }, + { + "epoch": 1.2821100917431192, + "grad_norm": 0.2528543503408816, + "learning_rate": 4.062869977778637e-06, + "loss": 0.3507, + "num_tokens": 663301109.0, + "step": 1611 + }, + { + "epoch": 1.2824923547400613, + "grad_norm": 0.2542946493241396, + "learning_rate": 4.057590496856993e-06, + "loss": 0.3792, + "num_tokens": 663795527.0, + "step": 1612 + }, + { + "epoch": 1.282874617737003, + "grad_norm": 0.28365718103903786, + "learning_rate": 4.05231322774287e-06, + "loss": 0.3533, + "num_tokens": 664241572.0, + "step": 1613 + }, + { + "epoch": 1.283256880733945, + "grad_norm": 0.2540843990662738, + "learning_rate": 4.047038178528494e-06, + "loss": 0.3347, + "num_tokens": 664611912.0, + "step": 1614 + }, + { + "epoch": 1.2836391437308867, + "grad_norm": 0.27683863339835385, + "learning_rate": 4.041765357302683e-06, + "loss": 0.3413, + "num_tokens": 665026613.0, + "step": 1615 + }, + { + "epoch": 1.2840214067278288, + "grad_norm": 0.26511749851422695, + "learning_rate": 4.036494772150851e-06, + "loss": 0.348, + "num_tokens": 665460212.0, + "step": 1616 + }, + { + "epoch": 1.2844036697247707, + "grad_norm": 0.22757634047917277, + "learning_rate": 4.031226431154967e-06, + "loss": 0.3523, + "num_tokens": 665906921.0, + "step": 1617 + }, + { + "epoch": 1.2847859327217126, + "grad_norm": 0.29137810688727744, + "learning_rate": 4.02596034239357e-06, + "loss": 0.3426, + "num_tokens": 666312411.0, + "step": 1618 + }, + { + "epoch": 1.2851681957186545, + "grad_norm": 0.26685115861412273, + "learning_rate": 4.0206965139417395e-06, + "loss": 0.3651, + "num_tokens": 666753338.0, + "step": 1619 + }, + { + "epoch": 1.2855504587155964, + "grad_norm": 0.28806855548882604, + "learning_rate": 4.015434953871094e-06, + "loss": 0.3592, + "num_tokens": 667163013.0, + "step": 1620 + }, + { + "epoch": 1.2859327217125383, + "grad_norm": 0.27816067891620727, + "learning_rate": 4.01017567024977e-06, + "loss": 0.3389, + "num_tokens": 667563780.0, + "step": 1621 + }, + { + "epoch": 1.2863149847094801, + "grad_norm": 0.2772825435086106, + "learning_rate": 4.0049186711424125e-06, + "loss": 0.383, + "num_tokens": 668022554.0, + "step": 1622 + }, + { + "epoch": 1.286697247706422, + "grad_norm": 0.2649231069827948, + "learning_rate": 3.999663964610168e-06, + "loss": 0.3606, + "num_tokens": 668475300.0, + "step": 1623 + }, + { + "epoch": 1.287079510703364, + "grad_norm": 0.25321837681915044, + "learning_rate": 3.99441155871066e-06, + "loss": 0.3389, + "num_tokens": 668853999.0, + "step": 1624 + }, + { + "epoch": 1.2874617737003058, + "grad_norm": 0.32634566345089056, + "learning_rate": 3.989161461497996e-06, + "loss": 0.3741, + "num_tokens": 669289053.0, + "step": 1625 + }, + { + "epoch": 1.2878440366972477, + "grad_norm": 0.3355777915972197, + "learning_rate": 3.9839136810227285e-06, + "loss": 0.3554, + "num_tokens": 669686111.0, + "step": 1626 + }, + { + "epoch": 1.2882262996941896, + "grad_norm": 0.284979952256129, + "learning_rate": 3.978668225331872e-06, + "loss": 0.3652, + "num_tokens": 670106394.0, + "step": 1627 + }, + { + "epoch": 1.2886085626911314, + "grad_norm": 0.24701234632238284, + "learning_rate": 3.973425102468864e-06, + "loss": 0.3658, + "num_tokens": 670502521.0, + "step": 1628 + }, + { + "epoch": 1.2889908256880733, + "grad_norm": 0.27781475422291285, + "learning_rate": 3.968184320473574e-06, + "loss": 0.35, + "num_tokens": 670925489.0, + "step": 1629 + }, + { + "epoch": 1.2893730886850152, + "grad_norm": 0.2630524623493376, + "learning_rate": 3.962945887382274e-06, + "loss": 0.3459, + "num_tokens": 671310814.0, + "step": 1630 + }, + { + "epoch": 1.2897553516819573, + "grad_norm": 0.25808224344515196, + "learning_rate": 3.957709811227642e-06, + "loss": 0.3575, + "num_tokens": 671732779.0, + "step": 1631 + }, + { + "epoch": 1.290137614678899, + "grad_norm": 0.32067637746651884, + "learning_rate": 3.952476100038738e-06, + "loss": 0.3786, + "num_tokens": 672198025.0, + "step": 1632 + }, + { + "epoch": 1.290519877675841, + "grad_norm": 0.28015597849764223, + "learning_rate": 3.947244761840993e-06, + "loss": 0.3467, + "num_tokens": 672559181.0, + "step": 1633 + }, + { + "epoch": 1.2909021406727827, + "grad_norm": 0.2593265417342103, + "learning_rate": 3.942015804656204e-06, + "loss": 0.3585, + "num_tokens": 672987382.0, + "step": 1634 + }, + { + "epoch": 1.2912844036697249, + "grad_norm": 0.2619726313070248, + "learning_rate": 3.936789236502513e-06, + "loss": 0.3626, + "num_tokens": 673430120.0, + "step": 1635 + }, + { + "epoch": 1.2916666666666667, + "grad_norm": 0.25399496160010143, + "learning_rate": 3.931565065394403e-06, + "loss": 0.3388, + "num_tokens": 673842400.0, + "step": 1636 + }, + { + "epoch": 1.2920489296636086, + "grad_norm": 0.25154664823982636, + "learning_rate": 3.926343299342675e-06, + "loss": 0.3546, + "num_tokens": 674244853.0, + "step": 1637 + }, + { + "epoch": 1.2924311926605505, + "grad_norm": 0.2794727586637354, + "learning_rate": 3.92112394635445e-06, + "loss": 0.3641, + "num_tokens": 674673487.0, + "step": 1638 + }, + { + "epoch": 1.2928134556574924, + "grad_norm": 0.2665600194334451, + "learning_rate": 3.915907014433142e-06, + "loss": 0.3452, + "num_tokens": 675086347.0, + "step": 1639 + }, + { + "epoch": 1.2931957186544343, + "grad_norm": 0.2856590786036397, + "learning_rate": 3.910692511578458e-06, + "loss": 0.3807, + "num_tokens": 675522963.0, + "step": 1640 + }, + { + "epoch": 1.2935779816513762, + "grad_norm": 0.28696819659440775, + "learning_rate": 3.905480445786373e-06, + "loss": 0.355, + "num_tokens": 675934024.0, + "step": 1641 + }, + { + "epoch": 1.293960244648318, + "grad_norm": 0.2708796914563745, + "learning_rate": 3.900270825049133e-06, + "loss": 0.3587, + "num_tokens": 676333135.0, + "step": 1642 + }, + { + "epoch": 1.29434250764526, + "grad_norm": 0.2636085512434039, + "learning_rate": 3.895063657355228e-06, + "loss": 0.3433, + "num_tokens": 676701566.0, + "step": 1643 + }, + { + "epoch": 1.2947247706422018, + "grad_norm": 0.27322129613486396, + "learning_rate": 3.889858950689393e-06, + "loss": 0.3574, + "num_tokens": 677116337.0, + "step": 1644 + }, + { + "epoch": 1.2951070336391437, + "grad_norm": 0.28211374401538497, + "learning_rate": 3.884656713032583e-06, + "loss": 0.3374, + "num_tokens": 677503302.0, + "step": 1645 + }, + { + "epoch": 1.2954892966360856, + "grad_norm": 0.244467040344058, + "learning_rate": 3.879456952361971e-06, + "loss": 0.359, + "num_tokens": 677910155.0, + "step": 1646 + }, + { + "epoch": 1.2958715596330275, + "grad_norm": 0.23604624092040635, + "learning_rate": 3.8742596766509314e-06, + "loss": 0.376, + "num_tokens": 678356073.0, + "step": 1647 + }, + { + "epoch": 1.2962538226299694, + "grad_norm": 0.26952222492760003, + "learning_rate": 3.869064893869023e-06, + "loss": 0.3393, + "num_tokens": 678729844.0, + "step": 1648 + }, + { + "epoch": 1.2966360856269112, + "grad_norm": 0.2760789544630471, + "learning_rate": 3.863872611981993e-06, + "loss": 0.3516, + "num_tokens": 679138055.0, + "step": 1649 + }, + { + "epoch": 1.2970183486238533, + "grad_norm": 0.2788221969576679, + "learning_rate": 3.858682838951741e-06, + "loss": 0.3797, + "num_tokens": 679566870.0, + "step": 1650 + }, + { + "epoch": 1.297400611620795, + "grad_norm": 0.2966843717167917, + "learning_rate": 3.85349558273633e-06, + "loss": 0.3548, + "num_tokens": 679974327.0, + "step": 1651 + }, + { + "epoch": 1.2977828746177371, + "grad_norm": 0.2539789814895678, + "learning_rate": 3.848310851289956e-06, + "loss": 0.3405, + "num_tokens": 680353683.0, + "step": 1652 + }, + { + "epoch": 1.2981651376146788, + "grad_norm": 0.25887665129057746, + "learning_rate": 3.8431286525629456e-06, + "loss": 0.3522, + "num_tokens": 680757167.0, + "step": 1653 + }, + { + "epoch": 1.2985474006116209, + "grad_norm": 0.29081855181509403, + "learning_rate": 3.837948994501746e-06, + "loss": 0.3363, + "num_tokens": 681145277.0, + "step": 1654 + }, + { + "epoch": 1.2989296636085628, + "grad_norm": 0.26370598035478027, + "learning_rate": 3.832771885048901e-06, + "loss": 0.3684, + "num_tokens": 681570550.0, + "step": 1655 + }, + { + "epoch": 1.2993119266055047, + "grad_norm": 0.24263092886430465, + "learning_rate": 3.827597332143056e-06, + "loss": 0.3364, + "num_tokens": 681965806.0, + "step": 1656 + }, + { + "epoch": 1.2996941896024465, + "grad_norm": 0.2632476653974427, + "learning_rate": 3.822425343718926e-06, + "loss": 0.3503, + "num_tokens": 682385252.0, + "step": 1657 + }, + { + "epoch": 1.3000764525993884, + "grad_norm": 0.2538478860440045, + "learning_rate": 3.817255927707302e-06, + "loss": 0.3461, + "num_tokens": 682790346.0, + "step": 1658 + }, + { + "epoch": 1.3004587155963303, + "grad_norm": 0.24209429624581802, + "learning_rate": 3.8120890920350207e-06, + "loss": 0.3539, + "num_tokens": 683200927.0, + "step": 1659 + }, + { + "epoch": 1.3008409785932722, + "grad_norm": 0.26058844525798086, + "learning_rate": 3.806924844624975e-06, + "loss": 0.3632, + "num_tokens": 683604091.0, + "step": 1660 + }, + { + "epoch": 1.301223241590214, + "grad_norm": 0.2699157188183783, + "learning_rate": 3.8017631933960764e-06, + "loss": 0.3569, + "num_tokens": 683992048.0, + "step": 1661 + }, + { + "epoch": 1.301605504587156, + "grad_norm": 0.24002697908250872, + "learning_rate": 3.7966041462632665e-06, + "loss": 0.354, + "num_tokens": 684417141.0, + "step": 1662 + }, + { + "epoch": 1.3019877675840978, + "grad_norm": 0.24839689242861876, + "learning_rate": 3.791447711137484e-06, + "loss": 0.3428, + "num_tokens": 684848353.0, + "step": 1663 + }, + { + "epoch": 1.3023700305810397, + "grad_norm": 0.2699450196767002, + "learning_rate": 3.7862938959256656e-06, + "loss": 0.3301, + "num_tokens": 685262085.0, + "step": 1664 + }, + { + "epoch": 1.3027522935779816, + "grad_norm": 0.2472182995261609, + "learning_rate": 3.781142708530736e-06, + "loss": 0.3371, + "num_tokens": 685664290.0, + "step": 1665 + }, + { + "epoch": 1.3031345565749235, + "grad_norm": 0.23440243081424475, + "learning_rate": 3.7759941568515835e-06, + "loss": 0.3621, + "num_tokens": 686105446.0, + "step": 1666 + }, + { + "epoch": 1.3035168195718654, + "grad_norm": 0.26462861733039406, + "learning_rate": 3.7708482487830566e-06, + "loss": 0.3558, + "num_tokens": 686503650.0, + "step": 1667 + }, + { + "epoch": 1.3038990825688073, + "grad_norm": 0.2609048004264284, + "learning_rate": 3.7657049922159507e-06, + "loss": 0.3215, + "num_tokens": 686914325.0, + "step": 1668 + }, + { + "epoch": 1.3042813455657494, + "grad_norm": 0.2451725490958671, + "learning_rate": 3.7605643950369973e-06, + "loss": 0.3444, + "num_tokens": 687308665.0, + "step": 1669 + }, + { + "epoch": 1.304663608562691, + "grad_norm": 0.2496194545521366, + "learning_rate": 3.755426465128844e-06, + "loss": 0.3478, + "num_tokens": 687699132.0, + "step": 1670 + }, + { + "epoch": 1.3050458715596331, + "grad_norm": 0.24990664239611124, + "learning_rate": 3.7502912103700573e-06, + "loss": 0.3631, + "num_tokens": 688122227.0, + "step": 1671 + }, + { + "epoch": 1.3054281345565748, + "grad_norm": 0.2809508423841265, + "learning_rate": 3.7451586386350937e-06, + "loss": 0.3668, + "num_tokens": 688566628.0, + "step": 1672 + }, + { + "epoch": 1.305810397553517, + "grad_norm": 0.25952884156917994, + "learning_rate": 3.7400287577942994e-06, + "loss": 0.36, + "num_tokens": 688985207.0, + "step": 1673 + }, + { + "epoch": 1.3061926605504588, + "grad_norm": 0.24860883677654053, + "learning_rate": 3.734901575713892e-06, + "loss": 0.3489, + "num_tokens": 689443908.0, + "step": 1674 + }, + { + "epoch": 1.3065749235474007, + "grad_norm": 0.25107903298566203, + "learning_rate": 3.7297771002559524e-06, + "loss": 0.3684, + "num_tokens": 689874191.0, + "step": 1675 + }, + { + "epoch": 1.3069571865443426, + "grad_norm": 0.24659106899573477, + "learning_rate": 3.7246553392784125e-06, + "loss": 0.3132, + "num_tokens": 690245371.0, + "step": 1676 + }, + { + "epoch": 1.3073394495412844, + "grad_norm": 0.2522391218690764, + "learning_rate": 3.7195363006350372e-06, + "loss": 0.3529, + "num_tokens": 690680775.0, + "step": 1677 + }, + { + "epoch": 1.3077217125382263, + "grad_norm": 0.24318386137949508, + "learning_rate": 3.7144199921754252e-06, + "loss": 0.3433, + "num_tokens": 691074592.0, + "step": 1678 + }, + { + "epoch": 1.3081039755351682, + "grad_norm": 0.2561114796433777, + "learning_rate": 3.7093064217449783e-06, + "loss": 0.3314, + "num_tokens": 691432256.0, + "step": 1679 + }, + { + "epoch": 1.30848623853211, + "grad_norm": 0.25192527442953083, + "learning_rate": 3.7041955971849065e-06, + "loss": 0.3571, + "num_tokens": 691854842.0, + "step": 1680 + }, + { + "epoch": 1.308868501529052, + "grad_norm": 0.25462418470163234, + "learning_rate": 3.699087526332209e-06, + "loss": 0.3596, + "num_tokens": 692259815.0, + "step": 1681 + }, + { + "epoch": 1.3092507645259939, + "grad_norm": 0.22805146689730482, + "learning_rate": 3.6939822170196616e-06, + "loss": 0.3515, + "num_tokens": 692714004.0, + "step": 1682 + }, + { + "epoch": 1.3096330275229358, + "grad_norm": 0.24649529687676333, + "learning_rate": 3.6888796770758016e-06, + "loss": 0.3421, + "num_tokens": 693097906.0, + "step": 1683 + }, + { + "epoch": 1.3100152905198776, + "grad_norm": 0.26031175005142626, + "learning_rate": 3.6837799143249244e-06, + "loss": 0.335, + "num_tokens": 693501353.0, + "step": 1684 + }, + { + "epoch": 1.3103975535168195, + "grad_norm": 0.24710625441535372, + "learning_rate": 3.678682936587068e-06, + "loss": 0.3404, + "num_tokens": 693885606.0, + "step": 1685 + }, + { + "epoch": 1.3107798165137614, + "grad_norm": 0.24884736398500612, + "learning_rate": 3.6735887516779946e-06, + "loss": 0.3629, + "num_tokens": 694280419.0, + "step": 1686 + }, + { + "epoch": 1.3111620795107033, + "grad_norm": 0.22828035151065715, + "learning_rate": 3.6684973674091885e-06, + "loss": 0.346, + "num_tokens": 694689794.0, + "step": 1687 + }, + { + "epoch": 1.3115443425076452, + "grad_norm": 0.25747512086740254, + "learning_rate": 3.6634087915878347e-06, + "loss": 0.3578, + "num_tokens": 695099478.0, + "step": 1688 + }, + { + "epoch": 1.311926605504587, + "grad_norm": 0.26365855014073813, + "learning_rate": 3.6583230320168194e-06, + "loss": 0.3561, + "num_tokens": 695494532.0, + "step": 1689 + }, + { + "epoch": 1.3123088685015292, + "grad_norm": 0.23329354373292693, + "learning_rate": 3.6532400964947e-06, + "loss": 0.3182, + "num_tokens": 695857981.0, + "step": 1690 + }, + { + "epoch": 1.3126911314984708, + "grad_norm": 0.23915121220716568, + "learning_rate": 3.648159992815714e-06, + "loss": 0.3524, + "num_tokens": 696274873.0, + "step": 1691 + }, + { + "epoch": 1.313073394495413, + "grad_norm": 0.26791199410756306, + "learning_rate": 3.6430827287697466e-06, + "loss": 0.3522, + "num_tokens": 696707671.0, + "step": 1692 + }, + { + "epoch": 1.3134556574923548, + "grad_norm": 0.23200952961797544, + "learning_rate": 3.6380083121423395e-06, + "loss": 0.3494, + "num_tokens": 697137970.0, + "step": 1693 + }, + { + "epoch": 1.3138379204892967, + "grad_norm": 0.26802039122052357, + "learning_rate": 3.6329367507146583e-06, + "loss": 0.3615, + "num_tokens": 697535123.0, + "step": 1694 + }, + { + "epoch": 1.3142201834862386, + "grad_norm": 0.27258075226736395, + "learning_rate": 3.6278680522634948e-06, + "loss": 0.3732, + "num_tokens": 697927816.0, + "step": 1695 + }, + { + "epoch": 1.3146024464831805, + "grad_norm": 0.27244744922626585, + "learning_rate": 3.6228022245612494e-06, + "loss": 0.3806, + "num_tokens": 698334602.0, + "step": 1696 + }, + { + "epoch": 1.3149847094801224, + "grad_norm": 0.25482725022212754, + "learning_rate": 3.6177392753759233e-06, + "loss": 0.367, + "num_tokens": 698753560.0, + "step": 1697 + }, + { + "epoch": 1.3153669724770642, + "grad_norm": 0.25676648138733404, + "learning_rate": 3.6126792124710995e-06, + "loss": 0.3393, + "num_tokens": 699181694.0, + "step": 1698 + }, + { + "epoch": 1.3157492354740061, + "grad_norm": 0.27411543729618837, + "learning_rate": 3.6076220436059386e-06, + "loss": 0.3868, + "num_tokens": 699586947.0, + "step": 1699 + }, + { + "epoch": 1.316131498470948, + "grad_norm": 0.23366188176398012, + "learning_rate": 3.602567776535164e-06, + "loss": 0.3387, + "num_tokens": 699997751.0, + "step": 1700 + }, + { + "epoch": 1.31651376146789, + "grad_norm": 0.2339723951439109, + "learning_rate": 3.5975164190090427e-06, + "loss": 0.3485, + "num_tokens": 700413746.0, + "step": 1701 + }, + { + "epoch": 1.3168960244648318, + "grad_norm": 0.24634552093703108, + "learning_rate": 3.592467978773392e-06, + "loss": 0.3403, + "num_tokens": 700792428.0, + "step": 1702 + }, + { + "epoch": 1.3172782874617737, + "grad_norm": 0.24579856747239542, + "learning_rate": 3.5874224635695433e-06, + "loss": 0.3476, + "num_tokens": 701203930.0, + "step": 1703 + }, + { + "epoch": 1.3176605504587156, + "grad_norm": 0.2483170996101416, + "learning_rate": 3.582379881134349e-06, + "loss": 0.3279, + "num_tokens": 701577323.0, + "step": 1704 + }, + { + "epoch": 1.3180428134556574, + "grad_norm": 0.2528532610079257, + "learning_rate": 3.5773402392001666e-06, + "loss": 0.3766, + "num_tokens": 702044073.0, + "step": 1705 + }, + { + "epoch": 1.3184250764525993, + "grad_norm": 0.25962494185583007, + "learning_rate": 3.5723035454948385e-06, + "loss": 0.3616, + "num_tokens": 702423613.0, + "step": 1706 + }, + { + "epoch": 1.3188073394495412, + "grad_norm": 0.23479186487362205, + "learning_rate": 3.5672698077416913e-06, + "loss": 0.367, + "num_tokens": 702848438.0, + "step": 1707 + }, + { + "epoch": 1.319189602446483, + "grad_norm": 0.2672396381349754, + "learning_rate": 3.5622390336595168e-06, + "loss": 0.3526, + "num_tokens": 703267488.0, + "step": 1708 + }, + { + "epoch": 1.3195718654434252, + "grad_norm": 0.25379542698726226, + "learning_rate": 3.557211230962565e-06, + "loss": 0.3378, + "num_tokens": 703670261.0, + "step": 1709 + }, + { + "epoch": 1.3199541284403669, + "grad_norm": 0.2408783238066999, + "learning_rate": 3.5521864073605197e-06, + "loss": 0.341, + "num_tokens": 704099842.0, + "step": 1710 + }, + { + "epoch": 1.320336391437309, + "grad_norm": 0.2711219733211456, + "learning_rate": 3.5471645705585125e-06, + "loss": 0.3565, + "num_tokens": 704499691.0, + "step": 1711 + }, + { + "epoch": 1.3207186544342506, + "grad_norm": 0.2358202603524401, + "learning_rate": 3.5421457282570794e-06, + "loss": 0.3559, + "num_tokens": 704920360.0, + "step": 1712 + }, + { + "epoch": 1.3211009174311927, + "grad_norm": 0.23997352472324385, + "learning_rate": 3.5371298881521775e-06, + "loss": 0.3654, + "num_tokens": 705333587.0, + "step": 1713 + }, + { + "epoch": 1.3214831804281346, + "grad_norm": 0.253045374800109, + "learning_rate": 3.5321170579351514e-06, + "loss": 0.3496, + "num_tokens": 705737351.0, + "step": 1714 + }, + { + "epoch": 1.3218654434250765, + "grad_norm": 0.23724805464031495, + "learning_rate": 3.5271072452927333e-06, + "loss": 0.3683, + "num_tokens": 706167055.0, + "step": 1715 + }, + { + "epoch": 1.3222477064220184, + "grad_norm": 0.2495027557367922, + "learning_rate": 3.5221004579070295e-06, + "loss": 0.3454, + "num_tokens": 706563884.0, + "step": 1716 + }, + { + "epoch": 1.3226299694189603, + "grad_norm": 0.2619542188917075, + "learning_rate": 3.5170967034555015e-06, + "loss": 0.3475, + "num_tokens": 706989701.0, + "step": 1717 + }, + { + "epoch": 1.3230122324159022, + "grad_norm": 0.23258872139739586, + "learning_rate": 3.512095989610972e-06, + "loss": 0.3634, + "num_tokens": 707416734.0, + "step": 1718 + }, + { + "epoch": 1.323394495412844, + "grad_norm": 0.24531778588397174, + "learning_rate": 3.507098324041587e-06, + "loss": 0.3414, + "num_tokens": 707790806.0, + "step": 1719 + }, + { + "epoch": 1.323776758409786, + "grad_norm": 0.25038908760295703, + "learning_rate": 3.5021037144108305e-06, + "loss": 0.3544, + "num_tokens": 708192289.0, + "step": 1720 + }, + { + "epoch": 1.3241590214067278, + "grad_norm": 0.2437190032053367, + "learning_rate": 3.4971121683774913e-06, + "loss": 0.3378, + "num_tokens": 708623219.0, + "step": 1721 + }, + { + "epoch": 1.3245412844036697, + "grad_norm": 0.2304499154964965, + "learning_rate": 3.492123693595666e-06, + "loss": 0.3545, + "num_tokens": 709059431.0, + "step": 1722 + }, + { + "epoch": 1.3249235474006116, + "grad_norm": 0.25332064199962345, + "learning_rate": 3.487138297714738e-06, + "loss": 0.3622, + "num_tokens": 709477602.0, + "step": 1723 + }, + { + "epoch": 1.3253058103975535, + "grad_norm": 0.26226155182675603, + "learning_rate": 3.4821559883793737e-06, + "loss": 0.3437, + "num_tokens": 709839255.0, + "step": 1724 + }, + { + "epoch": 1.3256880733944953, + "grad_norm": 0.23792977461652384, + "learning_rate": 3.4771767732295047e-06, + "loss": 0.3538, + "num_tokens": 710265426.0, + "step": 1725 + }, + { + "epoch": 1.3260703363914372, + "grad_norm": 0.22686093364377113, + "learning_rate": 3.4722006599003134e-06, + "loss": 0.3432, + "num_tokens": 710682501.0, + "step": 1726 + }, + { + "epoch": 1.3264525993883791, + "grad_norm": 0.22365803653205701, + "learning_rate": 3.467227656022236e-06, + "loss": 0.3511, + "num_tokens": 711118888.0, + "step": 1727 + }, + { + "epoch": 1.3268348623853212, + "grad_norm": 0.23940006059197358, + "learning_rate": 3.462257769220928e-06, + "loss": 0.3349, + "num_tokens": 711547038.0, + "step": 1728 + }, + { + "epoch": 1.3272171253822629, + "grad_norm": 0.24654397058057753, + "learning_rate": 3.4572910071172755e-06, + "loss": 0.3492, + "num_tokens": 711985799.0, + "step": 1729 + }, + { + "epoch": 1.327599388379205, + "grad_norm": 0.2639918645245174, + "learning_rate": 3.452327377327369e-06, + "loss": 0.3511, + "num_tokens": 712366886.0, + "step": 1730 + }, + { + "epoch": 1.3279816513761467, + "grad_norm": 0.2540100855501442, + "learning_rate": 3.4473668874624945e-06, + "loss": 0.3562, + "num_tokens": 712758658.0, + "step": 1731 + }, + { + "epoch": 1.3283639143730888, + "grad_norm": 0.25801100106214636, + "learning_rate": 3.4424095451291273e-06, + "loss": 0.3327, + "num_tokens": 713147810.0, + "step": 1732 + }, + { + "epoch": 1.3287461773700306, + "grad_norm": 0.26082401625287394, + "learning_rate": 3.4374553579289117e-06, + "loss": 0.3497, + "num_tokens": 713561032.0, + "step": 1733 + }, + { + "epoch": 1.3291284403669725, + "grad_norm": 0.23322128338134585, + "learning_rate": 3.43250433345866e-06, + "loss": 0.3447, + "num_tokens": 713990734.0, + "step": 1734 + }, + { + "epoch": 1.3295107033639144, + "grad_norm": 0.2757943606287788, + "learning_rate": 3.4275564793103226e-06, + "loss": 0.3527, + "num_tokens": 714366004.0, + "step": 1735 + }, + { + "epoch": 1.3298929663608563, + "grad_norm": 0.2683112481848239, + "learning_rate": 3.4226118030710066e-06, + "loss": 0.3745, + "num_tokens": 714799206.0, + "step": 1736 + }, + { + "epoch": 1.3302752293577982, + "grad_norm": 0.2709962268635325, + "learning_rate": 3.4176703123229294e-06, + "loss": 0.3572, + "num_tokens": 715193469.0, + "step": 1737 + }, + { + "epoch": 1.33065749235474, + "grad_norm": 0.25891456430430126, + "learning_rate": 3.412732014643432e-06, + "loss": 0.3523, + "num_tokens": 715595011.0, + "step": 1738 + }, + { + "epoch": 1.331039755351682, + "grad_norm": 0.23862958356199837, + "learning_rate": 3.4077969176049576e-06, + "loss": 0.3435, + "num_tokens": 716004820.0, + "step": 1739 + }, + { + "epoch": 1.3314220183486238, + "grad_norm": 0.23589486284793224, + "learning_rate": 3.4028650287750413e-06, + "loss": 0.3433, + "num_tokens": 716418977.0, + "step": 1740 + }, + { + "epoch": 1.3318042813455657, + "grad_norm": 0.24345351341480068, + "learning_rate": 3.3979363557163e-06, + "loss": 0.3898, + "num_tokens": 716834032.0, + "step": 1741 + }, + { + "epoch": 1.3321865443425076, + "grad_norm": 0.24354670691167477, + "learning_rate": 3.3930109059864173e-06, + "loss": 0.3519, + "num_tokens": 717273363.0, + "step": 1742 + }, + { + "epoch": 1.3325688073394495, + "grad_norm": 0.28913050334476587, + "learning_rate": 3.3880886871381358e-06, + "loss": 0.3645, + "num_tokens": 717700550.0, + "step": 1743 + }, + { + "epoch": 1.3329510703363914, + "grad_norm": 0.27124550100484784, + "learning_rate": 3.3831697067192437e-06, + "loss": 0.3911, + "num_tokens": 718131446.0, + "step": 1744 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.24208857222921995, + "learning_rate": 3.3782539722725606e-06, + "loss": 0.3615, + "num_tokens": 718574564.0, + "step": 1745 + }, + { + "epoch": 1.3337155963302751, + "grad_norm": 0.45601948254369123, + "learning_rate": 3.373341491335932e-06, + "loss": 0.2743, + "num_tokens": 718654164.0, + "step": 1746 + }, + { + "epoch": 2.000382262996942, + "grad_norm": 0.2605193468540187, + "learning_rate": 3.368432271442214e-06, + "loss": 0.3556, + "num_tokens": 719105964.0, + "step": 1747 + }, + { + "epoch": 2.0007645259938838, + "grad_norm": 0.2519441102677706, + "learning_rate": 3.3635263201192604e-06, + "loss": 0.335, + "num_tokens": 719513847.0, + "step": 1748 + }, + { + "epoch": 2.001146788990826, + "grad_norm": 0.2567707267895316, + "learning_rate": 3.358623644889916e-06, + "loss": 0.3374, + "num_tokens": 719942682.0, + "step": 1749 + }, + { + "epoch": 2.0015290519877675, + "grad_norm": 0.25680848468131723, + "learning_rate": 3.3537242532719983e-06, + "loss": 0.3374, + "num_tokens": 720359982.0, + "step": 1750 + }, + { + "epoch": 2.0019113149847096, + "grad_norm": 0.21342056302975543, + "learning_rate": 3.348828152778291e-06, + "loss": 0.3524, + "num_tokens": 720851760.0, + "step": 1751 + }, + { + "epoch": 2.0022935779816513, + "grad_norm": 0.2753737792965614, + "learning_rate": 3.343935350916533e-06, + "loss": 0.352, + "num_tokens": 721276640.0, + "step": 1752 + }, + { + "epoch": 2.0026758409785934, + "grad_norm": 0.2388606259824953, + "learning_rate": 3.3390458551894056e-06, + "loss": 0.3277, + "num_tokens": 721656157.0, + "step": 1753 + }, + { + "epoch": 2.003058103975535, + "grad_norm": 0.24593308106541187, + "learning_rate": 3.3341596730945114e-06, + "loss": 0.3705, + "num_tokens": 722069598.0, + "step": 1754 + }, + { + "epoch": 2.003440366972477, + "grad_norm": 0.23737540411386032, + "learning_rate": 3.329276812124388e-06, + "loss": 0.3506, + "num_tokens": 722475421.0, + "step": 1755 + }, + { + "epoch": 2.003822629969419, + "grad_norm": 0.244529161580806, + "learning_rate": 3.3243972797664647e-06, + "loss": 0.3394, + "num_tokens": 722908037.0, + "step": 1756 + }, + { + "epoch": 2.004204892966361, + "grad_norm": 0.25326561274385917, + "learning_rate": 3.319521083503075e-06, + "loss": 0.3684, + "num_tokens": 723324951.0, + "step": 1757 + }, + { + "epoch": 2.0045871559633026, + "grad_norm": 0.2540233851780087, + "learning_rate": 3.314648230811436e-06, + "loss": 0.364, + "num_tokens": 723705095.0, + "step": 1758 + }, + { + "epoch": 2.0049694189602447, + "grad_norm": 0.24514828134241698, + "learning_rate": 3.3097787291636348e-06, + "loss": 0.3465, + "num_tokens": 724103309.0, + "step": 1759 + }, + { + "epoch": 2.0053516819571864, + "grad_norm": 0.31711799608973806, + "learning_rate": 3.3049125860266252e-06, + "loss": 0.363, + "num_tokens": 724524692.0, + "step": 1760 + }, + { + "epoch": 2.0057339449541285, + "grad_norm": 0.24220622404194303, + "learning_rate": 3.300049808862203e-06, + "loss": 0.3341, + "num_tokens": 724926013.0, + "step": 1761 + }, + { + "epoch": 2.00611620795107, + "grad_norm": 0.24282703269159878, + "learning_rate": 3.2951904051270122e-06, + "loss": 0.3375, + "num_tokens": 725308595.0, + "step": 1762 + }, + { + "epoch": 2.0064984709480123, + "grad_norm": 0.2279259878939482, + "learning_rate": 3.2903343822725143e-06, + "loss": 0.3648, + "num_tokens": 725739755.0, + "step": 1763 + }, + { + "epoch": 2.006880733944954, + "grad_norm": 0.23653094846075276, + "learning_rate": 3.285481747744997e-06, + "loss": 0.3116, + "num_tokens": 726142186.0, + "step": 1764 + }, + { + "epoch": 2.007262996941896, + "grad_norm": 0.25262850048228397, + "learning_rate": 3.2806325089855408e-06, + "loss": 0.3451, + "num_tokens": 726527765.0, + "step": 1765 + }, + { + "epoch": 2.007645259938838, + "grad_norm": 0.2519291337921129, + "learning_rate": 3.275786673430028e-06, + "loss": 0.3683, + "num_tokens": 726979577.0, + "step": 1766 + }, + { + "epoch": 2.00802752293578, + "grad_norm": 0.26062581077908115, + "learning_rate": 3.270944248509119e-06, + "loss": 0.3386, + "num_tokens": 727392753.0, + "step": 1767 + }, + { + "epoch": 2.008409785932722, + "grad_norm": 0.26258706473535826, + "learning_rate": 3.266105241648243e-06, + "loss": 0.3541, + "num_tokens": 727815584.0, + "step": 1768 + }, + { + "epoch": 2.0087920489296636, + "grad_norm": 0.2296541171626925, + "learning_rate": 3.2612696602675943e-06, + "loss": 0.3379, + "num_tokens": 728223806.0, + "step": 1769 + }, + { + "epoch": 2.0091743119266057, + "grad_norm": 0.24104741959317136, + "learning_rate": 3.256437511782101e-06, + "loss": 0.3395, + "num_tokens": 728649003.0, + "step": 1770 + }, + { + "epoch": 2.0095565749235473, + "grad_norm": 0.27132654885297314, + "learning_rate": 3.2516088036014444e-06, + "loss": 0.351, + "num_tokens": 729122402.0, + "step": 1771 + }, + { + "epoch": 2.0099388379204894, + "grad_norm": 0.2525735619578914, + "learning_rate": 3.246783543130014e-06, + "loss": 0.3428, + "num_tokens": 729512961.0, + "step": 1772 + }, + { + "epoch": 2.010321100917431, + "grad_norm": 0.2569081413889353, + "learning_rate": 3.24196173776693e-06, + "loss": 0.3176, + "num_tokens": 729905841.0, + "step": 1773 + }, + { + "epoch": 2.010703363914373, + "grad_norm": 0.245024897720478, + "learning_rate": 3.237143394905996e-06, + "loss": 0.3196, + "num_tokens": 730293014.0, + "step": 1774 + }, + { + "epoch": 2.011085626911315, + "grad_norm": 0.2241928592542632, + "learning_rate": 3.2323285219357194e-06, + "loss": 0.3299, + "num_tokens": 730694260.0, + "step": 1775 + }, + { + "epoch": 2.011467889908257, + "grad_norm": 0.25810531840059814, + "learning_rate": 3.2275171262392835e-06, + "loss": 0.311, + "num_tokens": 731043568.0, + "step": 1776 + }, + { + "epoch": 2.0118501529051986, + "grad_norm": 0.2873222386701609, + "learning_rate": 3.2227092151945327e-06, + "loss": 0.3248, + "num_tokens": 731415945.0, + "step": 1777 + }, + { + "epoch": 2.0122324159021407, + "grad_norm": 0.2581797267971073, + "learning_rate": 3.2179047961739807e-06, + "loss": 0.3184, + "num_tokens": 731809002.0, + "step": 1778 + }, + { + "epoch": 2.0126146788990824, + "grad_norm": 0.263628999936447, + "learning_rate": 3.213103876544773e-06, + "loss": 0.352, + "num_tokens": 732230925.0, + "step": 1779 + }, + { + "epoch": 2.0129969418960245, + "grad_norm": 0.2670793384235646, + "learning_rate": 3.208306463668703e-06, + "loss": 0.3343, + "num_tokens": 732628653.0, + "step": 1780 + }, + { + "epoch": 2.013379204892966, + "grad_norm": 0.24439546657845299, + "learning_rate": 3.2035125649021733e-06, + "loss": 0.3056, + "num_tokens": 733028213.0, + "step": 1781 + }, + { + "epoch": 2.0137614678899083, + "grad_norm": 0.2569393905404549, + "learning_rate": 3.198722187596206e-06, + "loss": 0.3197, + "num_tokens": 733431075.0, + "step": 1782 + }, + { + "epoch": 2.01414373088685, + "grad_norm": 0.23536067023370272, + "learning_rate": 3.1939353390964224e-06, + "loss": 0.3231, + "num_tokens": 733833055.0, + "step": 1783 + }, + { + "epoch": 2.014525993883792, + "grad_norm": 0.24503016888061527, + "learning_rate": 3.189152026743031e-06, + "loss": 0.3251, + "num_tokens": 734262584.0, + "step": 1784 + }, + { + "epoch": 2.014908256880734, + "grad_norm": 0.25207753138399924, + "learning_rate": 3.184372257870818e-06, + "loss": 0.3197, + "num_tokens": 734648250.0, + "step": 1785 + }, + { + "epoch": 2.015290519877676, + "grad_norm": 0.2405222425834741, + "learning_rate": 3.1795960398091373e-06, + "loss": 0.3059, + "num_tokens": 735083618.0, + "step": 1786 + }, + { + "epoch": 2.015672782874618, + "grad_norm": 0.27418869787100386, + "learning_rate": 3.1748233798818997e-06, + "loss": 0.2965, + "num_tokens": 735463309.0, + "step": 1787 + }, + { + "epoch": 2.0160550458715596, + "grad_norm": 0.24639733231677355, + "learning_rate": 3.1700542854075508e-06, + "loss": 0.3252, + "num_tokens": 735865715.0, + "step": 1788 + }, + { + "epoch": 2.0164373088685017, + "grad_norm": 0.2304158868186682, + "learning_rate": 3.165288763699084e-06, + "loss": 0.3311, + "num_tokens": 736326065.0, + "step": 1789 + }, + { + "epoch": 2.0168195718654434, + "grad_norm": 0.24822470747781525, + "learning_rate": 3.1605268220639986e-06, + "loss": 0.324, + "num_tokens": 736747892.0, + "step": 1790 + }, + { + "epoch": 2.0172018348623855, + "grad_norm": 0.23398422506593472, + "learning_rate": 3.1557684678043145e-06, + "loss": 0.2899, + "num_tokens": 737166530.0, + "step": 1791 + }, + { + "epoch": 2.017584097859327, + "grad_norm": 0.2744979500517675, + "learning_rate": 3.1510137082165475e-06, + "loss": 0.3061, + "num_tokens": 737550285.0, + "step": 1792 + }, + { + "epoch": 2.0179663608562692, + "grad_norm": 0.2589243971620022, + "learning_rate": 3.146262550591701e-06, + "loss": 0.3193, + "num_tokens": 737956094.0, + "step": 1793 + }, + { + "epoch": 2.018348623853211, + "grad_norm": 0.25721306935861216, + "learning_rate": 3.1415150022152564e-06, + "loss": 0.3151, + "num_tokens": 738369559.0, + "step": 1794 + }, + { + "epoch": 2.018730886850153, + "grad_norm": 0.25607224589419947, + "learning_rate": 3.136771070367157e-06, + "loss": 0.3186, + "num_tokens": 738785166.0, + "step": 1795 + }, + { + "epoch": 2.0191131498470947, + "grad_norm": 0.2646648028661554, + "learning_rate": 3.1320307623218075e-06, + "loss": 0.3264, + "num_tokens": 739217301.0, + "step": 1796 + }, + { + "epoch": 2.0194954128440368, + "grad_norm": 0.2426999415604464, + "learning_rate": 3.1272940853480473e-06, + "loss": 0.3142, + "num_tokens": 739626966.0, + "step": 1797 + }, + { + "epoch": 2.0198776758409784, + "grad_norm": 0.27346599109973424, + "learning_rate": 3.1225610467091533e-06, + "loss": 0.3109, + "num_tokens": 740029041.0, + "step": 1798 + }, + { + "epoch": 2.0202599388379205, + "grad_norm": 0.25062726426604437, + "learning_rate": 3.117831653662822e-06, + "loss": 0.2905, + "num_tokens": 740414177.0, + "step": 1799 + }, + { + "epoch": 2.020642201834862, + "grad_norm": 0.2286203124903576, + "learning_rate": 3.1131059134611595e-06, + "loss": 0.2722, + "num_tokens": 740804304.0, + "step": 1800 + }, + { + "epoch": 2.0210244648318043, + "grad_norm": 0.23416956734703526, + "learning_rate": 3.1083838333506715e-06, + "loss": 0.2998, + "num_tokens": 741205986.0, + "step": 1801 + }, + { + "epoch": 2.021406727828746, + "grad_norm": 0.24339664061892777, + "learning_rate": 3.1036654205722503e-06, + "loss": 0.3052, + "num_tokens": 741579886.0, + "step": 1802 + }, + { + "epoch": 2.021788990825688, + "grad_norm": 0.24777188271430553, + "learning_rate": 3.098950682361166e-06, + "loss": 0.3113, + "num_tokens": 741956543.0, + "step": 1803 + }, + { + "epoch": 2.02217125382263, + "grad_norm": 0.2505532161477693, + "learning_rate": 3.094239625947051e-06, + "loss": 0.3224, + "num_tokens": 742368740.0, + "step": 1804 + }, + { + "epoch": 2.022553516819572, + "grad_norm": 0.2740097837549157, + "learning_rate": 3.089532258553895e-06, + "loss": 0.3109, + "num_tokens": 742761394.0, + "step": 1805 + }, + { + "epoch": 2.022935779816514, + "grad_norm": 0.25186197910802094, + "learning_rate": 3.0848285874000326e-06, + "loss": 0.3125, + "num_tokens": 743178753.0, + "step": 1806 + }, + { + "epoch": 2.0233180428134556, + "grad_norm": 0.25063294806680786, + "learning_rate": 3.0801286196981234e-06, + "loss": 0.3221, + "num_tokens": 743640252.0, + "step": 1807 + }, + { + "epoch": 2.0237003058103977, + "grad_norm": 0.25593016534508156, + "learning_rate": 3.0754323626551543e-06, + "loss": 0.3053, + "num_tokens": 744041823.0, + "step": 1808 + }, + { + "epoch": 2.0240825688073394, + "grad_norm": 0.22744175225847083, + "learning_rate": 3.0707398234724206e-06, + "loss": 0.2991, + "num_tokens": 744459065.0, + "step": 1809 + }, + { + "epoch": 2.0244648318042815, + "grad_norm": 0.2446511972336764, + "learning_rate": 3.066051009345517e-06, + "loss": 0.3144, + "num_tokens": 744862164.0, + "step": 1810 + }, + { + "epoch": 2.024847094801223, + "grad_norm": 0.25524397782622377, + "learning_rate": 3.0613659274643255e-06, + "loss": 0.2919, + "num_tokens": 745220691.0, + "step": 1811 + }, + { + "epoch": 2.0252293577981653, + "grad_norm": 0.24391027544377664, + "learning_rate": 3.0566845850130043e-06, + "loss": 0.3383, + "num_tokens": 745639087.0, + "step": 1812 + }, + { + "epoch": 2.025611620795107, + "grad_norm": 0.2486522240362927, + "learning_rate": 3.052006989169981e-06, + "loss": 0.2932, + "num_tokens": 746049043.0, + "step": 1813 + }, + { + "epoch": 2.025993883792049, + "grad_norm": 0.23771560424583243, + "learning_rate": 3.0473331471079307e-06, + "loss": 0.2735, + "num_tokens": 746450290.0, + "step": 1814 + }, + { + "epoch": 2.0263761467889907, + "grad_norm": 0.26724819167392405, + "learning_rate": 3.0426630659937834e-06, + "loss": 0.3156, + "num_tokens": 746877820.0, + "step": 1815 + }, + { + "epoch": 2.026758409785933, + "grad_norm": 0.2765474352632359, + "learning_rate": 3.0379967529886904e-06, + "loss": 0.3261, + "num_tokens": 747290202.0, + "step": 1816 + }, + { + "epoch": 2.0271406727828745, + "grad_norm": 0.2525481884464414, + "learning_rate": 3.0333342152480332e-06, + "loss": 0.295, + "num_tokens": 747691191.0, + "step": 1817 + }, + { + "epoch": 2.0275229357798166, + "grad_norm": 0.24171910353744017, + "learning_rate": 3.0286754599214007e-06, + "loss": 0.2984, + "num_tokens": 748090982.0, + "step": 1818 + }, + { + "epoch": 2.0279051987767582, + "grad_norm": 0.2728246330869588, + "learning_rate": 3.0240204941525818e-06, + "loss": 0.3023, + "num_tokens": 748489459.0, + "step": 1819 + }, + { + "epoch": 2.0282874617737003, + "grad_norm": 0.27101232699076977, + "learning_rate": 3.0193693250795587e-06, + "loss": 0.3246, + "num_tokens": 748930093.0, + "step": 1820 + }, + { + "epoch": 2.028669724770642, + "grad_norm": 0.2527746512773829, + "learning_rate": 3.0147219598344823e-06, + "loss": 0.3073, + "num_tokens": 749367341.0, + "step": 1821 + }, + { + "epoch": 2.029051987767584, + "grad_norm": 0.2668158023550485, + "learning_rate": 3.0100784055436818e-06, + "loss": 0.3062, + "num_tokens": 749739523.0, + "step": 1822 + }, + { + "epoch": 2.0294342507645258, + "grad_norm": 0.22226742564938898, + "learning_rate": 3.005438669327633e-06, + "loss": 0.2882, + "num_tokens": 750137755.0, + "step": 1823 + }, + { + "epoch": 2.029816513761468, + "grad_norm": 0.25713361899649634, + "learning_rate": 3.000802758300967e-06, + "loss": 0.2961, + "num_tokens": 750506620.0, + "step": 1824 + }, + { + "epoch": 2.03019877675841, + "grad_norm": 0.29298680481859035, + "learning_rate": 2.9961706795724366e-06, + "loss": 0.3216, + "num_tokens": 750907674.0, + "step": 1825 + }, + { + "epoch": 2.0305810397553516, + "grad_norm": 0.24964897621978419, + "learning_rate": 2.9915424402449334e-06, + "loss": 0.3096, + "num_tokens": 751286645.0, + "step": 1826 + }, + { + "epoch": 2.0309633027522938, + "grad_norm": 0.25105094629201735, + "learning_rate": 2.9869180474154473e-06, + "loss": 0.3299, + "num_tokens": 751690243.0, + "step": 1827 + }, + { + "epoch": 2.0313455657492354, + "grad_norm": 0.24559835593024112, + "learning_rate": 2.9822975081750776e-06, + "loss": 0.2992, + "num_tokens": 752119840.0, + "step": 1828 + }, + { + "epoch": 2.0317278287461775, + "grad_norm": 0.24776057343020483, + "learning_rate": 2.9776808296090155e-06, + "loss": 0.3201, + "num_tokens": 752552466.0, + "step": 1829 + }, + { + "epoch": 2.032110091743119, + "grad_norm": 0.2515573527523867, + "learning_rate": 2.9730680187965237e-06, + "loss": 0.3068, + "num_tokens": 752936825.0, + "step": 1830 + }, + { + "epoch": 2.0324923547400613, + "grad_norm": 0.2531370379471016, + "learning_rate": 2.9684590828109473e-06, + "loss": 0.2794, + "num_tokens": 753305996.0, + "step": 1831 + }, + { + "epoch": 2.032874617737003, + "grad_norm": 0.24464212022852835, + "learning_rate": 2.963854028719676e-06, + "loss": 0.3002, + "num_tokens": 753686005.0, + "step": 1832 + }, + { + "epoch": 2.033256880733945, + "grad_norm": 0.2248890795516489, + "learning_rate": 2.959252863584159e-06, + "loss": 0.2842, + "num_tokens": 754109910.0, + "step": 1833 + }, + { + "epoch": 2.0336391437308867, + "grad_norm": 0.25739735506399464, + "learning_rate": 2.954655594459872e-06, + "loss": 0.3363, + "num_tokens": 754565104.0, + "step": 1834 + }, + { + "epoch": 2.034021406727829, + "grad_norm": 0.2726606936521271, + "learning_rate": 2.950062228396323e-06, + "loss": 0.3092, + "num_tokens": 754937706.0, + "step": 1835 + }, + { + "epoch": 2.0344036697247705, + "grad_norm": 0.248065062932608, + "learning_rate": 2.9454727724370325e-06, + "loss": 0.2879, + "num_tokens": 755310494.0, + "step": 1836 + }, + { + "epoch": 2.0347859327217126, + "grad_norm": 0.23527074332222286, + "learning_rate": 2.9408872336195252e-06, + "loss": 0.2931, + "num_tokens": 755698745.0, + "step": 1837 + }, + { + "epoch": 2.0351681957186543, + "grad_norm": 0.2594880311173301, + "learning_rate": 2.9363056189753224e-06, + "loss": 0.3189, + "num_tokens": 756139122.0, + "step": 1838 + }, + { + "epoch": 2.0355504587155964, + "grad_norm": 0.29091929784036424, + "learning_rate": 2.931727935529921e-06, + "loss": 0.2956, + "num_tokens": 756532212.0, + "step": 1839 + }, + { + "epoch": 2.035932721712538, + "grad_norm": 0.2378701459873727, + "learning_rate": 2.9271541903027984e-06, + "loss": 0.2975, + "num_tokens": 756924676.0, + "step": 1840 + }, + { + "epoch": 2.03631498470948, + "grad_norm": 0.24182257083588726, + "learning_rate": 2.9225843903073854e-06, + "loss": 0.2907, + "num_tokens": 757303985.0, + "step": 1841 + }, + { + "epoch": 2.036697247706422, + "grad_norm": 0.24109892958833565, + "learning_rate": 2.9180185425510678e-06, + "loss": 0.3131, + "num_tokens": 757727215.0, + "step": 1842 + }, + { + "epoch": 2.037079510703364, + "grad_norm": 0.23183766436989323, + "learning_rate": 2.9134566540351695e-06, + "loss": 0.3051, + "num_tokens": 758162846.0, + "step": 1843 + }, + { + "epoch": 2.037461773700306, + "grad_norm": 0.24656029234859986, + "learning_rate": 2.9088987317549443e-06, + "loss": 0.277, + "num_tokens": 758539184.0, + "step": 1844 + }, + { + "epoch": 2.0378440366972477, + "grad_norm": 0.234823666828163, + "learning_rate": 2.904344782699562e-06, + "loss": 0.2874, + "num_tokens": 758933524.0, + "step": 1845 + }, + { + "epoch": 2.03822629969419, + "grad_norm": 0.24334942235774046, + "learning_rate": 2.899794813852102e-06, + "loss": 0.3138, + "num_tokens": 759361176.0, + "step": 1846 + }, + { + "epoch": 2.0386085626911314, + "grad_norm": 0.2313774041503865, + "learning_rate": 2.895248832189541e-06, + "loss": 0.291, + "num_tokens": 759772585.0, + "step": 1847 + }, + { + "epoch": 2.0389908256880735, + "grad_norm": 0.2591648093940398, + "learning_rate": 2.8907068446827348e-06, + "loss": 0.3053, + "num_tokens": 760109076.0, + "step": 1848 + }, + { + "epoch": 2.039373088685015, + "grad_norm": 0.26922079840987917, + "learning_rate": 2.8861688582964263e-06, + "loss": 0.3204, + "num_tokens": 760522674.0, + "step": 1849 + }, + { + "epoch": 2.0397553516819573, + "grad_norm": 0.25498507046188246, + "learning_rate": 2.8816348799892134e-06, + "loss": 0.302, + "num_tokens": 760910044.0, + "step": 1850 + }, + { + "epoch": 2.040137614678899, + "grad_norm": 0.25255352580310136, + "learning_rate": 2.8771049167135507e-06, + "loss": 0.2887, + "num_tokens": 761297622.0, + "step": 1851 + }, + { + "epoch": 2.040519877675841, + "grad_norm": 0.2525503254488794, + "learning_rate": 2.8725789754157385e-06, + "loss": 0.2867, + "num_tokens": 761690778.0, + "step": 1852 + }, + { + "epoch": 2.0409021406727827, + "grad_norm": 0.25335127324537604, + "learning_rate": 2.868057063035906e-06, + "loss": 0.2973, + "num_tokens": 762088968.0, + "step": 1853 + }, + { + "epoch": 2.041284403669725, + "grad_norm": 0.24506273018165037, + "learning_rate": 2.8635391865080074e-06, + "loss": 0.3124, + "num_tokens": 762520894.0, + "step": 1854 + }, + { + "epoch": 2.0416666666666665, + "grad_norm": 0.2638475241286148, + "learning_rate": 2.8590253527598073e-06, + "loss": 0.3248, + "num_tokens": 762932618.0, + "step": 1855 + }, + { + "epoch": 2.0420489296636086, + "grad_norm": 0.25979010390983487, + "learning_rate": 2.8545155687128706e-06, + "loss": 0.3051, + "num_tokens": 763316146.0, + "step": 1856 + }, + { + "epoch": 2.0424311926605503, + "grad_norm": 0.24717900134876009, + "learning_rate": 2.850009841282554e-06, + "loss": 0.3068, + "num_tokens": 763728595.0, + "step": 1857 + }, + { + "epoch": 2.0428134556574924, + "grad_norm": 0.23594218221231555, + "learning_rate": 2.8455081773779893e-06, + "loss": 0.3113, + "num_tokens": 764202688.0, + "step": 1858 + }, + { + "epoch": 2.043195718654434, + "grad_norm": 0.2567370431225666, + "learning_rate": 2.841010583902082e-06, + "loss": 0.3265, + "num_tokens": 764628951.0, + "step": 1859 + }, + { + "epoch": 2.043577981651376, + "grad_norm": 0.2354944113587352, + "learning_rate": 2.836517067751494e-06, + "loss": 0.3004, + "num_tokens": 765041421.0, + "step": 1860 + }, + { + "epoch": 2.043960244648318, + "grad_norm": 0.24806082804422305, + "learning_rate": 2.8320276358166365e-06, + "loss": 0.3125, + "num_tokens": 765493970.0, + "step": 1861 + }, + { + "epoch": 2.04434250764526, + "grad_norm": 0.24444157506679715, + "learning_rate": 2.8275422949816556e-06, + "loss": 0.2959, + "num_tokens": 765906137.0, + "step": 1862 + }, + { + "epoch": 2.044724770642202, + "grad_norm": 0.2510230348240986, + "learning_rate": 2.823061052124425e-06, + "loss": 0.3016, + "num_tokens": 766311280.0, + "step": 1863 + }, + { + "epoch": 2.0451070336391437, + "grad_norm": 0.23636816214188436, + "learning_rate": 2.818583914116535e-06, + "loss": 0.2784, + "num_tokens": 766712496.0, + "step": 1864 + }, + { + "epoch": 2.045489296636086, + "grad_norm": 0.2670457603858624, + "learning_rate": 2.814110887823281e-06, + "loss": 0.314, + "num_tokens": 767143731.0, + "step": 1865 + }, + { + "epoch": 2.0458715596330275, + "grad_norm": 0.25096696615664604, + "learning_rate": 2.8096419801036552e-06, + "loss": 0.305, + "num_tokens": 767566242.0, + "step": 1866 + }, + { + "epoch": 2.0462538226299696, + "grad_norm": 0.23710405984066324, + "learning_rate": 2.805177197810329e-06, + "loss": 0.3205, + "num_tokens": 767991366.0, + "step": 1867 + }, + { + "epoch": 2.0466360856269112, + "grad_norm": 0.23647908195758097, + "learning_rate": 2.800716547789656e-06, + "loss": 0.3, + "num_tokens": 768435953.0, + "step": 1868 + }, + { + "epoch": 2.0470183486238533, + "grad_norm": 0.25090574546474453, + "learning_rate": 2.796260036881645e-06, + "loss": 0.3293, + "num_tokens": 768884222.0, + "step": 1869 + }, + { + "epoch": 2.047400611620795, + "grad_norm": 0.25437833149929656, + "learning_rate": 2.7918076719199626e-06, + "loss": 0.3239, + "num_tokens": 769283220.0, + "step": 1870 + }, + { + "epoch": 2.047782874617737, + "grad_norm": 0.25256325876657243, + "learning_rate": 2.787359459731919e-06, + "loss": 0.3054, + "num_tokens": 769706797.0, + "step": 1871 + }, + { + "epoch": 2.0481651376146788, + "grad_norm": 0.23409241520891302, + "learning_rate": 2.7829154071384528e-06, + "loss": 0.2852, + "num_tokens": 770112469.0, + "step": 1872 + }, + { + "epoch": 2.048547400611621, + "grad_norm": 0.24157602590040073, + "learning_rate": 2.7784755209541283e-06, + "loss": 0.3288, + "num_tokens": 770571324.0, + "step": 1873 + }, + { + "epoch": 2.0489296636085625, + "grad_norm": 0.2410693897499373, + "learning_rate": 2.7740398079871133e-06, + "loss": 0.2974, + "num_tokens": 770961764.0, + "step": 1874 + }, + { + "epoch": 2.0493119266055047, + "grad_norm": 0.2921642399613461, + "learning_rate": 2.7696082750391886e-06, + "loss": 0.3079, + "num_tokens": 771371095.0, + "step": 1875 + }, + { + "epoch": 2.0496941896024463, + "grad_norm": 0.24826658815863353, + "learning_rate": 2.765180928905712e-06, + "loss": 0.3009, + "num_tokens": 771798673.0, + "step": 1876 + }, + { + "epoch": 2.0500764525993884, + "grad_norm": 0.2836604359914633, + "learning_rate": 2.7607577763756333e-06, + "loss": 0.3188, + "num_tokens": 772163103.0, + "step": 1877 + }, + { + "epoch": 2.05045871559633, + "grad_norm": 0.2365639298982091, + "learning_rate": 2.7563388242314615e-06, + "loss": 0.3171, + "num_tokens": 772587281.0, + "step": 1878 + }, + { + "epoch": 2.050840978593272, + "grad_norm": 0.21631205242485615, + "learning_rate": 2.7519240792492717e-06, + "loss": 0.3118, + "num_tokens": 773017673.0, + "step": 1879 + }, + { + "epoch": 2.051223241590214, + "grad_norm": 0.22689033386267388, + "learning_rate": 2.7475135481986847e-06, + "loss": 0.3031, + "num_tokens": 773421549.0, + "step": 1880 + }, + { + "epoch": 2.051605504587156, + "grad_norm": 0.22404085649956276, + "learning_rate": 2.7431072378428604e-06, + "loss": 0.3132, + "num_tokens": 773853341.0, + "step": 1881 + }, + { + "epoch": 2.051987767584098, + "grad_norm": 0.2613187639618827, + "learning_rate": 2.738705154938487e-06, + "loss": 0.2824, + "num_tokens": 774223736.0, + "step": 1882 + }, + { + "epoch": 2.0523700305810397, + "grad_norm": 0.23813393871831123, + "learning_rate": 2.7343073062357655e-06, + "loss": 0.3069, + "num_tokens": 774636709.0, + "step": 1883 + }, + { + "epoch": 2.052752293577982, + "grad_norm": 0.2528773175125097, + "learning_rate": 2.7299136984784146e-06, + "loss": 0.2937, + "num_tokens": 775055518.0, + "step": 1884 + }, + { + "epoch": 2.0531345565749235, + "grad_norm": 0.25005658199878217, + "learning_rate": 2.7255243384036383e-06, + "loss": 0.3174, + "num_tokens": 775497286.0, + "step": 1885 + }, + { + "epoch": 2.0535168195718656, + "grad_norm": 0.22476413457932462, + "learning_rate": 2.721139232742137e-06, + "loss": 0.2983, + "num_tokens": 775913214.0, + "step": 1886 + }, + { + "epoch": 2.0538990825688073, + "grad_norm": 0.23422843440615318, + "learning_rate": 2.7167583882180794e-06, + "loss": 0.2849, + "num_tokens": 776308727.0, + "step": 1887 + }, + { + "epoch": 2.0542813455657494, + "grad_norm": 0.2367788294580391, + "learning_rate": 2.712381811549104e-06, + "loss": 0.2918, + "num_tokens": 776719045.0, + "step": 1888 + }, + { + "epoch": 2.054663608562691, + "grad_norm": 0.25154860367862086, + "learning_rate": 2.708009509446307e-06, + "loss": 0.2772, + "num_tokens": 777132833.0, + "step": 1889 + }, + { + "epoch": 2.055045871559633, + "grad_norm": 0.2853307411961546, + "learning_rate": 2.703641488614222e-06, + "loss": 0.2947, + "num_tokens": 777518237.0, + "step": 1890 + }, + { + "epoch": 2.055428134556575, + "grad_norm": 0.2892952858160809, + "learning_rate": 2.6992777557508287e-06, + "loss": 0.3031, + "num_tokens": 777920029.0, + "step": 1891 + }, + { + "epoch": 2.055810397553517, + "grad_norm": 0.23930523401692808, + "learning_rate": 2.6949183175475213e-06, + "loss": 0.312, + "num_tokens": 778368151.0, + "step": 1892 + }, + { + "epoch": 2.0561926605504586, + "grad_norm": 0.2462465355906849, + "learning_rate": 2.6905631806891176e-06, + "loss": 0.3006, + "num_tokens": 778770244.0, + "step": 1893 + }, + { + "epoch": 2.0565749235474007, + "grad_norm": 0.2678351029342093, + "learning_rate": 2.6862123518538306e-06, + "loss": 0.3246, + "num_tokens": 779183007.0, + "step": 1894 + }, + { + "epoch": 2.0569571865443423, + "grad_norm": 0.2602739183230476, + "learning_rate": 2.681865837713275e-06, + "loss": 0.3279, + "num_tokens": 779615373.0, + "step": 1895 + }, + { + "epoch": 2.0573394495412844, + "grad_norm": 0.22151913278866325, + "learning_rate": 2.6775236449324448e-06, + "loss": 0.2939, + "num_tokens": 780076818.0, + "step": 1896 + }, + { + "epoch": 2.057721712538226, + "grad_norm": 0.23749909608017644, + "learning_rate": 2.6731857801697096e-06, + "loss": 0.3013, + "num_tokens": 780536956.0, + "step": 1897 + }, + { + "epoch": 2.058103975535168, + "grad_norm": 0.2809814164220392, + "learning_rate": 2.668852250076801e-06, + "loss": 0.2972, + "num_tokens": 780945303.0, + "step": 1898 + }, + { + "epoch": 2.05848623853211, + "grad_norm": 0.25984682014967614, + "learning_rate": 2.664523061298806e-06, + "loss": 0.331, + "num_tokens": 781388653.0, + "step": 1899 + }, + { + "epoch": 2.058868501529052, + "grad_norm": 0.23952194385567244, + "learning_rate": 2.6601982204741524e-06, + "loss": 0.3258, + "num_tokens": 781839006.0, + "step": 1900 + }, + { + "epoch": 2.059250764525994, + "grad_norm": 0.2556101438240176, + "learning_rate": 2.6558777342345982e-06, + "loss": 0.3302, + "num_tokens": 782271159.0, + "step": 1901 + }, + { + "epoch": 2.0596330275229358, + "grad_norm": 0.2852184673002273, + "learning_rate": 2.6515616092052332e-06, + "loss": 0.274, + "num_tokens": 782637793.0, + "step": 1902 + }, + { + "epoch": 2.060015290519878, + "grad_norm": 0.2529499833638473, + "learning_rate": 2.647249852004449e-06, + "loss": 0.304, + "num_tokens": 783078149.0, + "step": 1903 + }, + { + "epoch": 2.0603975535168195, + "grad_norm": 0.25474573584430266, + "learning_rate": 2.6429424692439467e-06, + "loss": 0.2794, + "num_tokens": 783454245.0, + "step": 1904 + }, + { + "epoch": 2.0607798165137616, + "grad_norm": 0.2538441741205928, + "learning_rate": 2.638639467528715e-06, + "loss": 0.2841, + "num_tokens": 783879903.0, + "step": 1905 + }, + { + "epoch": 2.0611620795107033, + "grad_norm": 0.23563383153650205, + "learning_rate": 2.6343408534570295e-06, + "loss": 0.3146, + "num_tokens": 784307049.0, + "step": 1906 + }, + { + "epoch": 2.0615443425076454, + "grad_norm": 0.2995604393309861, + "learning_rate": 2.6300466336204333e-06, + "loss": 0.3144, + "num_tokens": 784725197.0, + "step": 1907 + }, + { + "epoch": 2.061926605504587, + "grad_norm": 0.239721608159916, + "learning_rate": 2.625756814603734e-06, + "loss": 0.2867, + "num_tokens": 785118223.0, + "step": 1908 + }, + { + "epoch": 2.062308868501529, + "grad_norm": 0.236866412604189, + "learning_rate": 2.621471402984991e-06, + "loss": 0.2878, + "num_tokens": 785529233.0, + "step": 1909 + }, + { + "epoch": 2.062691131498471, + "grad_norm": 0.2505921338189514, + "learning_rate": 2.6171904053355012e-06, + "loss": 0.3021, + "num_tokens": 785957992.0, + "step": 1910 + }, + { + "epoch": 2.063073394495413, + "grad_norm": 0.25622741884888617, + "learning_rate": 2.6129138282197976e-06, + "loss": 0.3178, + "num_tokens": 786406312.0, + "step": 1911 + }, + { + "epoch": 2.0634556574923546, + "grad_norm": 0.23381622623250836, + "learning_rate": 2.6086416781956342e-06, + "loss": 0.2836, + "num_tokens": 786773746.0, + "step": 1912 + }, + { + "epoch": 2.0638379204892967, + "grad_norm": 0.2419129447930029, + "learning_rate": 2.6043739618139744e-06, + "loss": 0.2818, + "num_tokens": 787159103.0, + "step": 1913 + }, + { + "epoch": 2.0642201834862384, + "grad_norm": 0.2417967267051386, + "learning_rate": 2.600110685618985e-06, + "loss": 0.3093, + "num_tokens": 787601671.0, + "step": 1914 + }, + { + "epoch": 2.0646024464831805, + "grad_norm": 0.26314094824173706, + "learning_rate": 2.595851856148024e-06, + "loss": 0.3177, + "num_tokens": 788016645.0, + "step": 1915 + }, + { + "epoch": 2.064984709480122, + "grad_norm": 0.23691810789002488, + "learning_rate": 2.591597479931629e-06, + "loss": 0.2942, + "num_tokens": 788407801.0, + "step": 1916 + }, + { + "epoch": 2.0653669724770642, + "grad_norm": 0.2603964480266365, + "learning_rate": 2.58734756349351e-06, + "loss": 0.2809, + "num_tokens": 788790923.0, + "step": 1917 + }, + { + "epoch": 2.065749235474006, + "grad_norm": 0.2573798564504652, + "learning_rate": 2.5831021133505385e-06, + "loss": 0.3026, + "num_tokens": 789229175.0, + "step": 1918 + }, + { + "epoch": 2.066131498470948, + "grad_norm": 0.29045468874027264, + "learning_rate": 2.578861136012739e-06, + "loss": 0.2979, + "num_tokens": 789638781.0, + "step": 1919 + }, + { + "epoch": 2.06651376146789, + "grad_norm": 0.2758999303570352, + "learning_rate": 2.5746246379832716e-06, + "loss": 0.3151, + "num_tokens": 790056351.0, + "step": 1920 + }, + { + "epoch": 2.066896024464832, + "grad_norm": 0.25087129346208803, + "learning_rate": 2.5703926257584344e-06, + "loss": 0.2939, + "num_tokens": 790450011.0, + "step": 1921 + }, + { + "epoch": 2.067278287461774, + "grad_norm": 0.256337724302696, + "learning_rate": 2.566165105827644e-06, + "loss": 0.3372, + "num_tokens": 790901889.0, + "step": 1922 + }, + { + "epoch": 2.0676605504587156, + "grad_norm": 0.23369123777508471, + "learning_rate": 2.561942084673428e-06, + "loss": 0.3118, + "num_tokens": 791347802.0, + "step": 1923 + }, + { + "epoch": 2.0680428134556577, + "grad_norm": 0.22942665179069943, + "learning_rate": 2.5577235687714162e-06, + "loss": 0.2842, + "num_tokens": 791759807.0, + "step": 1924 + }, + { + "epoch": 2.0684250764525993, + "grad_norm": 0.2383708107043818, + "learning_rate": 2.553509564590331e-06, + "loss": 0.3055, + "num_tokens": 792217480.0, + "step": 1925 + }, + { + "epoch": 2.0688073394495414, + "grad_norm": 0.28099771231634224, + "learning_rate": 2.549300078591975e-06, + "loss": 0.3189, + "num_tokens": 792640562.0, + "step": 1926 + }, + { + "epoch": 2.069189602446483, + "grad_norm": 0.26192783135072806, + "learning_rate": 2.545095117231221e-06, + "loss": 0.2982, + "num_tokens": 793053819.0, + "step": 1927 + }, + { + "epoch": 2.069571865443425, + "grad_norm": 0.27683330662659467, + "learning_rate": 2.54089468695601e-06, + "loss": 0.2922, + "num_tokens": 793418494.0, + "step": 1928 + }, + { + "epoch": 2.069954128440367, + "grad_norm": 0.24193480713225446, + "learning_rate": 2.536698794207327e-06, + "loss": 0.2728, + "num_tokens": 793775817.0, + "step": 1929 + }, + { + "epoch": 2.070336391437309, + "grad_norm": 0.2585965594706889, + "learning_rate": 2.5325074454192035e-06, + "loss": 0.2889, + "num_tokens": 794185908.0, + "step": 1930 + }, + { + "epoch": 2.0707186544342506, + "grad_norm": 0.2578078103759536, + "learning_rate": 2.5283206470187034e-06, + "loss": 0.2951, + "num_tokens": 794552715.0, + "step": 1931 + }, + { + "epoch": 2.0711009174311927, + "grad_norm": 0.24788166986132384, + "learning_rate": 2.5241384054259114e-06, + "loss": 0.2995, + "num_tokens": 794944870.0, + "step": 1932 + }, + { + "epoch": 2.0714831804281344, + "grad_norm": 0.24359820048510666, + "learning_rate": 2.519960727053927e-06, + "loss": 0.2982, + "num_tokens": 795359555.0, + "step": 1933 + }, + { + "epoch": 2.0718654434250765, + "grad_norm": 0.2705659431054324, + "learning_rate": 2.515787618308847e-06, + "loss": 0.3081, + "num_tokens": 795798667.0, + "step": 1934 + }, + { + "epoch": 2.072247706422018, + "grad_norm": 0.26599523237631695, + "learning_rate": 2.5116190855897703e-06, + "loss": 0.3, + "num_tokens": 796214008.0, + "step": 1935 + }, + { + "epoch": 2.0726299694189603, + "grad_norm": 0.2981101636985226, + "learning_rate": 2.507455135288767e-06, + "loss": 0.3183, + "num_tokens": 796606912.0, + "step": 1936 + }, + { + "epoch": 2.073012232415902, + "grad_norm": 0.2521113794732323, + "learning_rate": 2.5032957737908946e-06, + "loss": 0.2975, + "num_tokens": 797026547.0, + "step": 1937 + }, + { + "epoch": 2.073394495412844, + "grad_norm": 0.2579350055292374, + "learning_rate": 2.4991410074741586e-06, + "loss": 0.3322, + "num_tokens": 797439229.0, + "step": 1938 + }, + { + "epoch": 2.073776758409786, + "grad_norm": 0.2595412828204378, + "learning_rate": 2.494990842709533e-06, + "loss": 0.3151, + "num_tokens": 797860490.0, + "step": 1939 + }, + { + "epoch": 2.074159021406728, + "grad_norm": 0.2526795661057503, + "learning_rate": 2.4908452858609245e-06, + "loss": 0.3105, + "num_tokens": 798292920.0, + "step": 1940 + }, + { + "epoch": 2.07454128440367, + "grad_norm": 0.25075016400492633, + "learning_rate": 2.486704343285179e-06, + "loss": 0.2994, + "num_tokens": 798733759.0, + "step": 1941 + }, + { + "epoch": 2.0749235474006116, + "grad_norm": 0.2696263996982417, + "learning_rate": 2.4825680213320684e-06, + "loss": 0.3002, + "num_tokens": 799099532.0, + "step": 1942 + }, + { + "epoch": 2.0753058103975537, + "grad_norm": 0.2390526713381615, + "learning_rate": 2.4784363263442716e-06, + "loss": 0.3059, + "num_tokens": 799495133.0, + "step": 1943 + }, + { + "epoch": 2.0756880733944953, + "grad_norm": 0.2550420721264511, + "learning_rate": 2.474309264657384e-06, + "loss": 0.2897, + "num_tokens": 799904731.0, + "step": 1944 + }, + { + "epoch": 2.0760703363914375, + "grad_norm": 0.24725962070993487, + "learning_rate": 2.4701868425998844e-06, + "loss": 0.3118, + "num_tokens": 800316021.0, + "step": 1945 + }, + { + "epoch": 2.076452599388379, + "grad_norm": 0.23779828671091766, + "learning_rate": 2.466069066493148e-06, + "loss": 0.2874, + "num_tokens": 800710701.0, + "step": 1946 + }, + { + "epoch": 2.0768348623853212, + "grad_norm": 0.23819682478321458, + "learning_rate": 2.4619559426514166e-06, + "loss": 0.2776, + "num_tokens": 801094024.0, + "step": 1947 + }, + { + "epoch": 2.077217125382263, + "grad_norm": 0.23609053747481742, + "learning_rate": 2.4578474773818037e-06, + "loss": 0.3023, + "num_tokens": 801489337.0, + "step": 1948 + }, + { + "epoch": 2.077599388379205, + "grad_norm": 0.26608075426914574, + "learning_rate": 2.453743676984278e-06, + "loss": 0.2953, + "num_tokens": 801868408.0, + "step": 1949 + }, + { + "epoch": 2.0779816513761467, + "grad_norm": 0.24970868308401248, + "learning_rate": 2.4496445477516546e-06, + "loss": 0.3031, + "num_tokens": 802281975.0, + "step": 1950 + }, + { + "epoch": 2.0783639143730888, + "grad_norm": 0.2523229839785525, + "learning_rate": 2.445550095969587e-06, + "loss": 0.2952, + "num_tokens": 802672346.0, + "step": 1951 + }, + { + "epoch": 2.0787461773700304, + "grad_norm": 0.2637241949361023, + "learning_rate": 2.4414603279165524e-06, + "loss": 0.2875, + "num_tokens": 803090187.0, + "step": 1952 + }, + { + "epoch": 2.0791284403669725, + "grad_norm": 0.2395272340386784, + "learning_rate": 2.437375249863852e-06, + "loss": 0.3254, + "num_tokens": 803531693.0, + "step": 1953 + }, + { + "epoch": 2.079510703363914, + "grad_norm": 0.23258140931262117, + "learning_rate": 2.4332948680755893e-06, + "loss": 0.2981, + "num_tokens": 803950660.0, + "step": 1954 + }, + { + "epoch": 2.0798929663608563, + "grad_norm": 0.26122239728892716, + "learning_rate": 2.429219188808671e-06, + "loss": 0.2871, + "num_tokens": 804352402.0, + "step": 1955 + }, + { + "epoch": 2.080275229357798, + "grad_norm": 0.2688096280805627, + "learning_rate": 2.42514821831279e-06, + "loss": 0.3044, + "num_tokens": 804750137.0, + "step": 1956 + }, + { + "epoch": 2.08065749235474, + "grad_norm": 0.2509567696378054, + "learning_rate": 2.42108196283042e-06, + "loss": 0.3245, + "num_tokens": 805211442.0, + "step": 1957 + }, + { + "epoch": 2.081039755351682, + "grad_norm": 0.24655872686391142, + "learning_rate": 2.417020428596806e-06, + "loss": 0.2929, + "num_tokens": 805620477.0, + "step": 1958 + }, + { + "epoch": 2.081422018348624, + "grad_norm": 0.2640749001798263, + "learning_rate": 2.4129636218399497e-06, + "loss": 0.2991, + "num_tokens": 805998964.0, + "step": 1959 + }, + { + "epoch": 2.081804281345566, + "grad_norm": 0.2422026631687079, + "learning_rate": 2.408911548780609e-06, + "loss": 0.3035, + "num_tokens": 806427836.0, + "step": 1960 + }, + { + "epoch": 2.0821865443425076, + "grad_norm": 0.27184362826920394, + "learning_rate": 2.4048642156322745e-06, + "loss": 0.2957, + "num_tokens": 806796679.0, + "step": 1961 + }, + { + "epoch": 2.0825688073394497, + "grad_norm": 0.26559039268905466, + "learning_rate": 2.40082162860118e-06, + "loss": 0.328, + "num_tokens": 807182451.0, + "step": 1962 + }, + { + "epoch": 2.0829510703363914, + "grad_norm": 0.23635795940344678, + "learning_rate": 2.396783793886272e-06, + "loss": 0.3068, + "num_tokens": 807611255.0, + "step": 1963 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.24169401109767216, + "learning_rate": 2.392750717679213e-06, + "loss": 0.2794, + "num_tokens": 807985081.0, + "step": 1964 + }, + { + "epoch": 2.083715596330275, + "grad_norm": 0.24705460951153824, + "learning_rate": 2.388722406164371e-06, + "loss": 0.325, + "num_tokens": 808423621.0, + "step": 1965 + }, + { + "epoch": 2.0840978593272173, + "grad_norm": 0.23582888552655434, + "learning_rate": 2.3846988655188037e-06, + "loss": 0.2736, + "num_tokens": 808806117.0, + "step": 1966 + }, + { + "epoch": 2.084480122324159, + "grad_norm": 0.2437760865097742, + "learning_rate": 2.3806801019122562e-06, + "loss": 0.2938, + "num_tokens": 809228492.0, + "step": 1967 + }, + { + "epoch": 2.084862385321101, + "grad_norm": 0.24323459057573368, + "learning_rate": 2.3766661215071473e-06, + "loss": 0.269, + "num_tokens": 809630593.0, + "step": 1968 + }, + { + "epoch": 2.0852446483180427, + "grad_norm": 0.250714448534589, + "learning_rate": 2.372656930458562e-06, + "loss": 0.316, + "num_tokens": 810096654.0, + "step": 1969 + }, + { + "epoch": 2.085626911314985, + "grad_norm": 0.23347996825839065, + "learning_rate": 2.3686525349142415e-06, + "loss": 0.2948, + "num_tokens": 810502796.0, + "step": 1970 + }, + { + "epoch": 2.0860091743119265, + "grad_norm": 0.24149363672236024, + "learning_rate": 2.3646529410145684e-06, + "loss": 0.3086, + "num_tokens": 810931125.0, + "step": 1971 + }, + { + "epoch": 2.0863914373088686, + "grad_norm": 0.251702201892877, + "learning_rate": 2.3606581548925696e-06, + "loss": 0.3035, + "num_tokens": 811338873.0, + "step": 1972 + }, + { + "epoch": 2.08677370030581, + "grad_norm": 0.2378565025242665, + "learning_rate": 2.356668182673896e-06, + "loss": 0.2898, + "num_tokens": 811719159.0, + "step": 1973 + }, + { + "epoch": 2.0871559633027523, + "grad_norm": 0.23432044530748078, + "learning_rate": 2.3526830304768177e-06, + "loss": 0.2961, + "num_tokens": 812148086.0, + "step": 1974 + }, + { + "epoch": 2.087538226299694, + "grad_norm": 0.2239112584765275, + "learning_rate": 2.3487027044122134e-06, + "loss": 0.2883, + "num_tokens": 812557127.0, + "step": 1975 + }, + { + "epoch": 2.087920489296636, + "grad_norm": 0.25228758430024656, + "learning_rate": 2.3447272105835604e-06, + "loss": 0.3228, + "num_tokens": 812966276.0, + "step": 1976 + }, + { + "epoch": 2.088302752293578, + "grad_norm": 0.2390794479841412, + "learning_rate": 2.340756555086929e-06, + "loss": 0.2931, + "num_tokens": 813355448.0, + "step": 1977 + }, + { + "epoch": 2.08868501529052, + "grad_norm": 0.24907755446918084, + "learning_rate": 2.336790744010967e-06, + "loss": 0.3056, + "num_tokens": 813757328.0, + "step": 1978 + }, + { + "epoch": 2.089067278287462, + "grad_norm": 0.24878801457552382, + "learning_rate": 2.332829783436898e-06, + "loss": 0.2999, + "num_tokens": 814184701.0, + "step": 1979 + }, + { + "epoch": 2.0894495412844036, + "grad_norm": 0.21905298708359253, + "learning_rate": 2.3288736794385e-06, + "loss": 0.2834, + "num_tokens": 814595983.0, + "step": 1980 + }, + { + "epoch": 2.0898318042813457, + "grad_norm": 0.22281881033402656, + "learning_rate": 2.324922438082114e-06, + "loss": 0.2837, + "num_tokens": 815009340.0, + "step": 1981 + }, + { + "epoch": 2.0902140672782874, + "grad_norm": 0.2399234834585078, + "learning_rate": 2.320976065426617e-06, + "loss": 0.2981, + "num_tokens": 815426326.0, + "step": 1982 + }, + { + "epoch": 2.0905963302752295, + "grad_norm": 0.25795713022908623, + "learning_rate": 2.3170345675234225e-06, + "loss": 0.3348, + "num_tokens": 815904942.0, + "step": 1983 + }, + { + "epoch": 2.090978593272171, + "grad_norm": 0.23209352874040443, + "learning_rate": 2.3130979504164695e-06, + "loss": 0.2794, + "num_tokens": 816308189.0, + "step": 1984 + }, + { + "epoch": 2.0913608562691133, + "grad_norm": 0.23854635856560508, + "learning_rate": 2.3091662201422136e-06, + "loss": 0.3096, + "num_tokens": 816736107.0, + "step": 1985 + }, + { + "epoch": 2.091743119266055, + "grad_norm": 0.22731179585407965, + "learning_rate": 2.3052393827296163e-06, + "loss": 0.2996, + "num_tokens": 817145386.0, + "step": 1986 + }, + { + "epoch": 2.092125382262997, + "grad_norm": 0.21329199173990762, + "learning_rate": 2.3013174442001315e-06, + "loss": 0.2807, + "num_tokens": 817587614.0, + "step": 1987 + }, + { + "epoch": 2.0925076452599387, + "grad_norm": 0.22277926937398698, + "learning_rate": 2.2974004105677114e-06, + "loss": 0.2938, + "num_tokens": 818010456.0, + "step": 1988 + }, + { + "epoch": 2.092889908256881, + "grad_norm": 0.24219572862208105, + "learning_rate": 2.2934882878387753e-06, + "loss": 0.323, + "num_tokens": 818461879.0, + "step": 1989 + }, + { + "epoch": 2.0932721712538225, + "grad_norm": 0.23487379519997126, + "learning_rate": 2.2895810820122225e-06, + "loss": 0.2864, + "num_tokens": 818843862.0, + "step": 1990 + }, + { + "epoch": 2.0936544342507646, + "grad_norm": 0.2605759269692881, + "learning_rate": 2.2856787990794054e-06, + "loss": 0.323, + "num_tokens": 819315066.0, + "step": 1991 + }, + { + "epoch": 2.0940366972477062, + "grad_norm": 0.24434020247001167, + "learning_rate": 2.28178144502413e-06, + "loss": 0.3077, + "num_tokens": 819740170.0, + "step": 1992 + }, + { + "epoch": 2.0944189602446484, + "grad_norm": 0.2442194190968147, + "learning_rate": 2.277889025822645e-06, + "loss": 0.283, + "num_tokens": 820119329.0, + "step": 1993 + }, + { + "epoch": 2.09480122324159, + "grad_norm": 0.25180199181834334, + "learning_rate": 2.274001547443631e-06, + "loss": 0.3244, + "num_tokens": 820542988.0, + "step": 1994 + }, + { + "epoch": 2.095183486238532, + "grad_norm": 0.24554385098163184, + "learning_rate": 2.2701190158481935e-06, + "loss": 0.3063, + "num_tokens": 820961038.0, + "step": 1995 + }, + { + "epoch": 2.0955657492354742, + "grad_norm": 0.2339533271538118, + "learning_rate": 2.2662414369898494e-06, + "loss": 0.2949, + "num_tokens": 821354962.0, + "step": 1996 + }, + { + "epoch": 2.095948012232416, + "grad_norm": 0.24496440765255373, + "learning_rate": 2.262368816814527e-06, + "loss": 0.298, + "num_tokens": 821750785.0, + "step": 1997 + }, + { + "epoch": 2.096330275229358, + "grad_norm": 0.2261868350447137, + "learning_rate": 2.2585011612605418e-06, + "loss": 0.3062, + "num_tokens": 822193772.0, + "step": 1998 + }, + { + "epoch": 2.0967125382262997, + "grad_norm": 0.26994358754522324, + "learning_rate": 2.2546384762586083e-06, + "loss": 0.3178, + "num_tokens": 822617499.0, + "step": 1999 + }, + { + "epoch": 2.0970948012232418, + "grad_norm": 0.23462254403417077, + "learning_rate": 2.250780767731807e-06, + "loss": 0.3044, + "num_tokens": 823056577.0, + "step": 2000 + }, + { + "epoch": 2.0974770642201834, + "grad_norm": 0.23405054098614497, + "learning_rate": 2.246928041595596e-06, + "loss": 0.3041, + "num_tokens": 823466376.0, + "step": 2001 + }, + { + "epoch": 2.0978593272171255, + "grad_norm": 0.22959013247104987, + "learning_rate": 2.2430803037577912e-06, + "loss": 0.2983, + "num_tokens": 823862267.0, + "step": 2002 + }, + { + "epoch": 2.098241590214067, + "grad_norm": 0.24849909428549352, + "learning_rate": 2.2392375601185545e-06, + "loss": 0.2977, + "num_tokens": 824273307.0, + "step": 2003 + }, + { + "epoch": 2.0986238532110093, + "grad_norm": 0.2364683042373293, + "learning_rate": 2.2353998165703987e-06, + "loss": 0.3048, + "num_tokens": 824736380.0, + "step": 2004 + }, + { + "epoch": 2.099006116207951, + "grad_norm": 0.2255693384447123, + "learning_rate": 2.231567078998159e-06, + "loss": 0.3063, + "num_tokens": 825190429.0, + "step": 2005 + }, + { + "epoch": 2.099388379204893, + "grad_norm": 0.2213904725353695, + "learning_rate": 2.227739353279006e-06, + "loss": 0.3011, + "num_tokens": 825618381.0, + "step": 2006 + }, + { + "epoch": 2.0997706422018347, + "grad_norm": 0.23089715070582656, + "learning_rate": 2.2239166452824145e-06, + "loss": 0.2795, + "num_tokens": 826039312.0, + "step": 2007 + }, + { + "epoch": 2.100152905198777, + "grad_norm": 0.22930414673560054, + "learning_rate": 2.2200989608701707e-06, + "loss": 0.2892, + "num_tokens": 826426462.0, + "step": 2008 + }, + { + "epoch": 2.1005351681957185, + "grad_norm": 0.2521482277940626, + "learning_rate": 2.216286305896356e-06, + "loss": 0.2795, + "num_tokens": 826836637.0, + "step": 2009 + }, + { + "epoch": 2.1009174311926606, + "grad_norm": 0.24752548698512802, + "learning_rate": 2.2124786862073405e-06, + "loss": 0.2868, + "num_tokens": 827200286.0, + "step": 2010 + }, + { + "epoch": 2.1012996941896023, + "grad_norm": 0.2478400220769795, + "learning_rate": 2.2086761076417735e-06, + "loss": 0.2888, + "num_tokens": 827592790.0, + "step": 2011 + }, + { + "epoch": 2.1016819571865444, + "grad_norm": 0.24880147903011235, + "learning_rate": 2.2048785760305695e-06, + "loss": 0.2827, + "num_tokens": 827986124.0, + "step": 2012 + }, + { + "epoch": 2.102064220183486, + "grad_norm": 0.25528296902774644, + "learning_rate": 2.201086097196913e-06, + "loss": 0.3096, + "num_tokens": 828391614.0, + "step": 2013 + }, + { + "epoch": 2.102446483180428, + "grad_norm": 0.2714611599882389, + "learning_rate": 2.1972986769562294e-06, + "loss": 0.2902, + "num_tokens": 828806249.0, + "step": 2014 + }, + { + "epoch": 2.1028287461773703, + "grad_norm": 0.2555390738130769, + "learning_rate": 2.1935163211161986e-06, + "loss": 0.3064, + "num_tokens": 829234914.0, + "step": 2015 + }, + { + "epoch": 2.103211009174312, + "grad_norm": 0.2657611151258409, + "learning_rate": 2.1897390354767243e-06, + "loss": 0.3078, + "num_tokens": 829646217.0, + "step": 2016 + }, + { + "epoch": 2.103593272171254, + "grad_norm": 0.26745704438052575, + "learning_rate": 2.1859668258299434e-06, + "loss": 0.3199, + "num_tokens": 830029967.0, + "step": 2017 + }, + { + "epoch": 2.1039755351681957, + "grad_norm": 0.23241422555435154, + "learning_rate": 2.1821996979602043e-06, + "loss": 0.3022, + "num_tokens": 830451610.0, + "step": 2018 + }, + { + "epoch": 2.104357798165138, + "grad_norm": 0.2579197989641137, + "learning_rate": 2.1784376576440664e-06, + "loss": 0.2975, + "num_tokens": 830864854.0, + "step": 2019 + }, + { + "epoch": 2.1047400611620795, + "grad_norm": 0.2798820665668038, + "learning_rate": 2.1746807106502844e-06, + "loss": 0.3414, + "num_tokens": 831317841.0, + "step": 2020 + }, + { + "epoch": 2.1051223241590216, + "grad_norm": 0.24866314835968373, + "learning_rate": 2.170928862739806e-06, + "loss": 0.3199, + "num_tokens": 831771435.0, + "step": 2021 + }, + { + "epoch": 2.1055045871559632, + "grad_norm": 0.26631208259875394, + "learning_rate": 2.167182119665759e-06, + "loss": 0.3087, + "num_tokens": 832172306.0, + "step": 2022 + }, + { + "epoch": 2.1058868501529053, + "grad_norm": 0.25136749651347573, + "learning_rate": 2.1634404871734392e-06, + "loss": 0.2941, + "num_tokens": 832563381.0, + "step": 2023 + }, + { + "epoch": 2.106269113149847, + "grad_norm": 0.2583765361800248, + "learning_rate": 2.159703971000313e-06, + "loss": 0.3297, + "num_tokens": 832951801.0, + "step": 2024 + }, + { + "epoch": 2.106651376146789, + "grad_norm": 0.25263068069704947, + "learning_rate": 2.1559725768759966e-06, + "loss": 0.3045, + "num_tokens": 833380937.0, + "step": 2025 + }, + { + "epoch": 2.1070336391437308, + "grad_norm": 0.26471568693762637, + "learning_rate": 2.1522463105222525e-06, + "loss": 0.2856, + "num_tokens": 833769244.0, + "step": 2026 + }, + { + "epoch": 2.107415902140673, + "grad_norm": 0.2501651951026688, + "learning_rate": 2.148525177652982e-06, + "loss": 0.286, + "num_tokens": 834170879.0, + "step": 2027 + }, + { + "epoch": 2.1077981651376145, + "grad_norm": 0.24737874487181458, + "learning_rate": 2.144809183974213e-06, + "loss": 0.2987, + "num_tokens": 834595663.0, + "step": 2028 + }, + { + "epoch": 2.1081804281345566, + "grad_norm": 0.24719525513969742, + "learning_rate": 2.1410983351840943e-06, + "loss": 0.3306, + "num_tokens": 835024855.0, + "step": 2029 + }, + { + "epoch": 2.1085626911314983, + "grad_norm": 0.26438308141806255, + "learning_rate": 2.137392636972883e-06, + "loss": 0.3128, + "num_tokens": 835427531.0, + "step": 2030 + }, + { + "epoch": 2.1089449541284404, + "grad_norm": 0.2554820156248308, + "learning_rate": 2.1336920950229413e-06, + "loss": 0.2942, + "num_tokens": 835830026.0, + "step": 2031 + }, + { + "epoch": 2.109327217125382, + "grad_norm": 0.27509783342152216, + "learning_rate": 2.129996715008724e-06, + "loss": 0.3079, + "num_tokens": 836210136.0, + "step": 2032 + }, + { + "epoch": 2.109709480122324, + "grad_norm": 0.28296166588011973, + "learning_rate": 2.1263065025967673e-06, + "loss": 0.3321, + "num_tokens": 836643927.0, + "step": 2033 + }, + { + "epoch": 2.1100917431192663, + "grad_norm": 0.25425671111071196, + "learning_rate": 2.122621463445687e-06, + "loss": 0.3083, + "num_tokens": 837087497.0, + "step": 2034 + }, + { + "epoch": 2.110474006116208, + "grad_norm": 0.23935604428036045, + "learning_rate": 2.118941603206166e-06, + "loss": 0.2867, + "num_tokens": 837484032.0, + "step": 2035 + }, + { + "epoch": 2.11085626911315, + "grad_norm": 0.251681254033373, + "learning_rate": 2.115266927520943e-06, + "loss": 0.2918, + "num_tokens": 837865584.0, + "step": 2036 + }, + { + "epoch": 2.1112385321100917, + "grad_norm": 0.255413678319928, + "learning_rate": 2.1115974420248105e-06, + "loss": 0.3223, + "num_tokens": 838294571.0, + "step": 2037 + }, + { + "epoch": 2.111620795107034, + "grad_norm": 0.26388550275817085, + "learning_rate": 2.1079331523445986e-06, + "loss": 0.3094, + "num_tokens": 838702877.0, + "step": 2038 + }, + { + "epoch": 2.1120030581039755, + "grad_norm": 0.25089956420069187, + "learning_rate": 2.104274064099174e-06, + "loss": 0.3072, + "num_tokens": 839076594.0, + "step": 2039 + }, + { + "epoch": 2.1123853211009176, + "grad_norm": 0.25726818546206537, + "learning_rate": 2.100620182899421e-06, + "loss": 0.3169, + "num_tokens": 839466420.0, + "step": 2040 + }, + { + "epoch": 2.1127675840978593, + "grad_norm": 0.25339202275411654, + "learning_rate": 2.096971514348249e-06, + "loss": 0.2918, + "num_tokens": 839858148.0, + "step": 2041 + }, + { + "epoch": 2.1131498470948014, + "grad_norm": 0.22848555482716043, + "learning_rate": 2.0933280640405645e-06, + "loss": 0.2945, + "num_tokens": 840290244.0, + "step": 2042 + }, + { + "epoch": 2.113532110091743, + "grad_norm": 0.23106470839698875, + "learning_rate": 2.089689837563278e-06, + "loss": 0.3024, + "num_tokens": 840716151.0, + "step": 2043 + }, + { + "epoch": 2.113914373088685, + "grad_norm": 0.2740976401430039, + "learning_rate": 2.0860568404952885e-06, + "loss": 0.2875, + "num_tokens": 841065107.0, + "step": 2044 + }, + { + "epoch": 2.114296636085627, + "grad_norm": 0.27010890166749135, + "learning_rate": 2.082429078407476e-06, + "loss": 0.3264, + "num_tokens": 841503140.0, + "step": 2045 + }, + { + "epoch": 2.114678899082569, + "grad_norm": 0.2735882223763793, + "learning_rate": 2.0788065568626946e-06, + "loss": 0.2883, + "num_tokens": 841886466.0, + "step": 2046 + }, + { + "epoch": 2.1150611620795106, + "grad_norm": 0.24438416388030718, + "learning_rate": 2.0751892814157564e-06, + "loss": 0.2917, + "num_tokens": 842274503.0, + "step": 2047 + }, + { + "epoch": 2.1154434250764527, + "grad_norm": 0.2375739412144376, + "learning_rate": 2.0715772576134397e-06, + "loss": 0.3072, + "num_tokens": 842666013.0, + "step": 2048 + }, + { + "epoch": 2.1158256880733943, + "grad_norm": 0.2243074091393917, + "learning_rate": 2.0679704909944584e-06, + "loss": 0.2856, + "num_tokens": 843105009.0, + "step": 2049 + }, + { + "epoch": 2.1162079510703364, + "grad_norm": 0.25234466827749763, + "learning_rate": 2.064368987089475e-06, + "loss": 0.2996, + "num_tokens": 843516863.0, + "step": 2050 + }, + { + "epoch": 2.116590214067278, + "grad_norm": 0.24812982032305772, + "learning_rate": 2.0607727514210747e-06, + "loss": 0.2915, + "num_tokens": 843903857.0, + "step": 2051 + }, + { + "epoch": 2.11697247706422, + "grad_norm": 0.2620049148259815, + "learning_rate": 2.0571817895037672e-06, + "loss": 0.2967, + "num_tokens": 844294285.0, + "step": 2052 + }, + { + "epoch": 2.117354740061162, + "grad_norm": 0.26645720141380624, + "learning_rate": 2.053596106843976e-06, + "loss": 0.3135, + "num_tokens": 844687832.0, + "step": 2053 + }, + { + "epoch": 2.117737003058104, + "grad_norm": 0.23271984877675667, + "learning_rate": 2.0500157089400288e-06, + "loss": 0.297, + "num_tokens": 845113615.0, + "step": 2054 + }, + { + "epoch": 2.118119266055046, + "grad_norm": 0.2550077534661697, + "learning_rate": 2.0464406012821507e-06, + "loss": 0.3136, + "num_tokens": 845548984.0, + "step": 2055 + }, + { + "epoch": 2.1185015290519877, + "grad_norm": 0.24746712194569306, + "learning_rate": 2.0428707893524485e-06, + "loss": 0.2959, + "num_tokens": 845928404.0, + "step": 2056 + }, + { + "epoch": 2.11888379204893, + "grad_norm": 0.25403428901855957, + "learning_rate": 2.03930627862492e-06, + "loss": 0.3017, + "num_tokens": 846318283.0, + "step": 2057 + }, + { + "epoch": 2.1192660550458715, + "grad_norm": 0.24048069695483532, + "learning_rate": 2.0357470745654213e-06, + "loss": 0.3053, + "num_tokens": 846726168.0, + "step": 2058 + }, + { + "epoch": 2.1196483180428136, + "grad_norm": 0.2309192420389093, + "learning_rate": 2.032193182631683e-06, + "loss": 0.3039, + "num_tokens": 847170171.0, + "step": 2059 + }, + { + "epoch": 2.1200305810397553, + "grad_norm": 0.25813798578078306, + "learning_rate": 2.0286446082732803e-06, + "loss": 0.2923, + "num_tokens": 847547675.0, + "step": 2060 + }, + { + "epoch": 2.1204128440366974, + "grad_norm": 0.25284963149916945, + "learning_rate": 2.025101356931639e-06, + "loss": 0.2974, + "num_tokens": 847926897.0, + "step": 2061 + }, + { + "epoch": 2.120795107033639, + "grad_norm": 0.22831198067575204, + "learning_rate": 2.0215634340400235e-06, + "loss": 0.2978, + "num_tokens": 848325116.0, + "step": 2062 + }, + { + "epoch": 2.121177370030581, + "grad_norm": 0.23665012171810665, + "learning_rate": 2.018030845023525e-06, + "loss": 0.336, + "num_tokens": 848775290.0, + "step": 2063 + }, + { + "epoch": 2.121559633027523, + "grad_norm": 0.24903501908121609, + "learning_rate": 2.0145035952990572e-06, + "loss": 0.3335, + "num_tokens": 849237622.0, + "step": 2064 + }, + { + "epoch": 2.121941896024465, + "grad_norm": 0.23389107693187228, + "learning_rate": 2.010981690275343e-06, + "loss": 0.29, + "num_tokens": 849654334.0, + "step": 2065 + }, + { + "epoch": 2.1223241590214066, + "grad_norm": 0.24003041927197125, + "learning_rate": 2.0074651353529185e-06, + "loss": 0.2865, + "num_tokens": 850040596.0, + "step": 2066 + }, + { + "epoch": 2.1227064220183487, + "grad_norm": 0.2973018611005875, + "learning_rate": 2.003953935924104e-06, + "loss": 0.3187, + "num_tokens": 850408961.0, + "step": 2067 + }, + { + "epoch": 2.1230886850152904, + "grad_norm": 0.26229869108522647, + "learning_rate": 2.0004480973730166e-06, + "loss": 0.334, + "num_tokens": 850814388.0, + "step": 2068 + }, + { + "epoch": 2.1234709480122325, + "grad_norm": 0.23326672724616832, + "learning_rate": 1.9969476250755487e-06, + "loss": 0.3073, + "num_tokens": 851235532.0, + "step": 2069 + }, + { + "epoch": 2.123853211009174, + "grad_norm": 0.2391720452457327, + "learning_rate": 1.9934525243993665e-06, + "loss": 0.2738, + "num_tokens": 851605463.0, + "step": 2070 + }, + { + "epoch": 2.1242354740061162, + "grad_norm": 0.25321180105901725, + "learning_rate": 1.989962800703897e-06, + "loss": 0.3395, + "num_tokens": 852055051.0, + "step": 2071 + }, + { + "epoch": 2.124617737003058, + "grad_norm": 0.240504103568182, + "learning_rate": 1.986478459340323e-06, + "loss": 0.316, + "num_tokens": 852510944.0, + "step": 2072 + }, + { + "epoch": 2.125, + "grad_norm": 0.24235255145885043, + "learning_rate": 1.9829995056515772e-06, + "loss": 0.3263, + "num_tokens": 852967492.0, + "step": 2073 + }, + { + "epoch": 2.1253822629969417, + "grad_norm": 0.23413229550806186, + "learning_rate": 1.9795259449723213e-06, + "loss": 0.3153, + "num_tokens": 853361089.0, + "step": 2074 + }, + { + "epoch": 2.1257645259938838, + "grad_norm": 0.22275484065719478, + "learning_rate": 1.976057782628961e-06, + "loss": 0.306, + "num_tokens": 853775532.0, + "step": 2075 + }, + { + "epoch": 2.126146788990826, + "grad_norm": 0.23233734626700345, + "learning_rate": 1.9725950239396113e-06, + "loss": 0.3066, + "num_tokens": 854197223.0, + "step": 2076 + }, + { + "epoch": 2.1265290519877675, + "grad_norm": 0.2591172338498743, + "learning_rate": 1.9691376742141087e-06, + "loss": 0.311, + "num_tokens": 854586358.0, + "step": 2077 + }, + { + "epoch": 2.1269113149847096, + "grad_norm": 0.23554163602919484, + "learning_rate": 1.9656857387539942e-06, + "loss": 0.3069, + "num_tokens": 855027030.0, + "step": 2078 + }, + { + "epoch": 2.1272935779816513, + "grad_norm": 0.25517726667959767, + "learning_rate": 1.9622392228525046e-06, + "loss": 0.2916, + "num_tokens": 855437636.0, + "step": 2079 + }, + { + "epoch": 2.1276758409785934, + "grad_norm": 0.2548350322680789, + "learning_rate": 1.958798131794568e-06, + "loss": 0.2889, + "num_tokens": 855848132.0, + "step": 2080 + }, + { + "epoch": 2.128058103975535, + "grad_norm": 0.23371903947443085, + "learning_rate": 1.9553624708567937e-06, + "loss": 0.2934, + "num_tokens": 856249866.0, + "step": 2081 + }, + { + "epoch": 2.128440366972477, + "grad_norm": 0.234377234637379, + "learning_rate": 1.951932245307464e-06, + "loss": 0.3091, + "num_tokens": 856679397.0, + "step": 2082 + }, + { + "epoch": 2.128822629969419, + "grad_norm": 0.23379784586195537, + "learning_rate": 1.9485074604065276e-06, + "loss": 0.3148, + "num_tokens": 857093788.0, + "step": 2083 + }, + { + "epoch": 2.129204892966361, + "grad_norm": 0.22282828980897662, + "learning_rate": 1.945088121405588e-06, + "loss": 0.2958, + "num_tokens": 857514761.0, + "step": 2084 + }, + { + "epoch": 2.1295871559633026, + "grad_norm": 0.25271048117768247, + "learning_rate": 1.941674233547899e-06, + "loss": 0.3083, + "num_tokens": 857921681.0, + "step": 2085 + }, + { + "epoch": 2.1299694189602447, + "grad_norm": 0.2695747617883831, + "learning_rate": 1.9382658020683572e-06, + "loss": 0.3257, + "num_tokens": 858349122.0, + "step": 2086 + }, + { + "epoch": 2.1303516819571864, + "grad_norm": 0.2595980465753796, + "learning_rate": 1.934862832193491e-06, + "loss": 0.3079, + "num_tokens": 858733580.0, + "step": 2087 + }, + { + "epoch": 2.1307339449541285, + "grad_norm": 0.23111075929869002, + "learning_rate": 1.931465329141454e-06, + "loss": 0.3226, + "num_tokens": 859177849.0, + "step": 2088 + }, + { + "epoch": 2.13111620795107, + "grad_norm": 0.23130721592097422, + "learning_rate": 1.9280732981220165e-06, + "loss": 0.2951, + "num_tokens": 859604556.0, + "step": 2089 + }, + { + "epoch": 2.1314984709480123, + "grad_norm": 0.22097454244051973, + "learning_rate": 1.924686744336559e-06, + "loss": 0.2938, + "num_tokens": 860034949.0, + "step": 2090 + }, + { + "epoch": 2.131880733944954, + "grad_norm": 0.24224798779880968, + "learning_rate": 1.921305672978062e-06, + "loss": 0.3107, + "num_tokens": 860446727.0, + "step": 2091 + }, + { + "epoch": 2.132262996941896, + "grad_norm": 0.2762927430242535, + "learning_rate": 1.9179300892311007e-06, + "loss": 0.2878, + "num_tokens": 860816308.0, + "step": 2092 + }, + { + "epoch": 2.1326452599388377, + "grad_norm": 0.26176854757147, + "learning_rate": 1.9145599982718317e-06, + "loss": 0.2878, + "num_tokens": 861175021.0, + "step": 2093 + }, + { + "epoch": 2.13302752293578, + "grad_norm": 0.2597065954605703, + "learning_rate": 1.911195405267996e-06, + "loss": 0.3115, + "num_tokens": 861571841.0, + "step": 2094 + }, + { + "epoch": 2.133409785932722, + "grad_norm": 0.2657402495227854, + "learning_rate": 1.9078363153788964e-06, + "loss": 0.3168, + "num_tokens": 861971946.0, + "step": 2095 + }, + { + "epoch": 2.1337920489296636, + "grad_norm": 0.247429988583406, + "learning_rate": 1.9044827337554012e-06, + "loss": 0.3057, + "num_tokens": 862383936.0, + "step": 2096 + }, + { + "epoch": 2.1341743119266057, + "grad_norm": 0.22179317488302783, + "learning_rate": 1.901134665539931e-06, + "loss": 0.2939, + "num_tokens": 862832280.0, + "step": 2097 + }, + { + "epoch": 2.1345565749235473, + "grad_norm": 0.24733400676777484, + "learning_rate": 1.8977921158664537e-06, + "loss": 0.3071, + "num_tokens": 863229927.0, + "step": 2098 + }, + { + "epoch": 2.1349388379204894, + "grad_norm": 0.2553919717421349, + "learning_rate": 1.8944550898604742e-06, + "loss": 0.3059, + "num_tokens": 863622204.0, + "step": 2099 + }, + { + "epoch": 2.135321100917431, + "grad_norm": 0.265051457062787, + "learning_rate": 1.8911235926390243e-06, + "loss": 0.3334, + "num_tokens": 864089603.0, + "step": 2100 + }, + { + "epoch": 2.135703363914373, + "grad_norm": 0.2439916774368985, + "learning_rate": 1.8877976293106647e-06, + "loss": 0.3069, + "num_tokens": 864493179.0, + "step": 2101 + }, + { + "epoch": 2.136085626911315, + "grad_norm": 0.22513611569333414, + "learning_rate": 1.8844772049754614e-06, + "loss": 0.3129, + "num_tokens": 864941511.0, + "step": 2102 + }, + { + "epoch": 2.136467889908257, + "grad_norm": 0.23384222393051904, + "learning_rate": 1.881162324724997e-06, + "loss": 0.2946, + "num_tokens": 865325227.0, + "step": 2103 + }, + { + "epoch": 2.1368501529051986, + "grad_norm": 0.24912012636897485, + "learning_rate": 1.877852993642344e-06, + "loss": 0.3095, + "num_tokens": 865730768.0, + "step": 2104 + }, + { + "epoch": 2.1372324159021407, + "grad_norm": 0.25216571323223674, + "learning_rate": 1.8745492168020695e-06, + "loss": 0.313, + "num_tokens": 866097542.0, + "step": 2105 + }, + { + "epoch": 2.1376146788990824, + "grad_norm": 0.2449973160899574, + "learning_rate": 1.8712509992702247e-06, + "loss": 0.2979, + "num_tokens": 866500338.0, + "step": 2106 + }, + { + "epoch": 2.1379969418960245, + "grad_norm": 0.2287193781804762, + "learning_rate": 1.8679583461043333e-06, + "loss": 0.3221, + "num_tokens": 866919334.0, + "step": 2107 + }, + { + "epoch": 2.138379204892966, + "grad_norm": 0.24464941355917846, + "learning_rate": 1.8646712623533903e-06, + "loss": 0.3048, + "num_tokens": 867299841.0, + "step": 2108 + }, + { + "epoch": 2.1387614678899083, + "grad_norm": 0.24585957074572262, + "learning_rate": 1.8613897530578437e-06, + "loss": 0.2951, + "num_tokens": 867668283.0, + "step": 2109 + }, + { + "epoch": 2.13914373088685, + "grad_norm": 0.299476256903208, + "learning_rate": 1.8581138232496038e-06, + "loss": 0.2769, + "num_tokens": 868051777.0, + "step": 2110 + }, + { + "epoch": 2.139525993883792, + "grad_norm": 0.23993846809728608, + "learning_rate": 1.8548434779520139e-06, + "loss": 0.3085, + "num_tokens": 868492022.0, + "step": 2111 + }, + { + "epoch": 2.1399082568807337, + "grad_norm": 0.2548569357821261, + "learning_rate": 1.8515787221798641e-06, + "loss": 0.3089, + "num_tokens": 868871880.0, + "step": 2112 + }, + { + "epoch": 2.140290519877676, + "grad_norm": 0.25270569893686956, + "learning_rate": 1.8483195609393667e-06, + "loss": 0.321, + "num_tokens": 869272712.0, + "step": 2113 + }, + { + "epoch": 2.140672782874618, + "grad_norm": 0.24062152292476804, + "learning_rate": 1.845065999228158e-06, + "loss": 0.3076, + "num_tokens": 869711926.0, + "step": 2114 + }, + { + "epoch": 2.1410550458715596, + "grad_norm": 0.24838880956409964, + "learning_rate": 1.8418180420352877e-06, + "loss": 0.316, + "num_tokens": 870120482.0, + "step": 2115 + }, + { + "epoch": 2.1414373088685017, + "grad_norm": 0.25354298363481975, + "learning_rate": 1.8385756943412086e-06, + "loss": 0.2952, + "num_tokens": 870511654.0, + "step": 2116 + }, + { + "epoch": 2.1418195718654434, + "grad_norm": 0.26799249071943454, + "learning_rate": 1.8353389611177793e-06, + "loss": 0.3106, + "num_tokens": 870909116.0, + "step": 2117 + }, + { + "epoch": 2.1422018348623855, + "grad_norm": 0.270490782295248, + "learning_rate": 1.8321078473282386e-06, + "loss": 0.2929, + "num_tokens": 871281108.0, + "step": 2118 + }, + { + "epoch": 2.142584097859327, + "grad_norm": 0.33455488248063114, + "learning_rate": 1.828882357927219e-06, + "loss": 0.3248, + "num_tokens": 871730990.0, + "step": 2119 + }, + { + "epoch": 2.1429663608562692, + "grad_norm": 0.27779858548988456, + "learning_rate": 1.8256624978607203e-06, + "loss": 0.3354, + "num_tokens": 872167410.0, + "step": 2120 + }, + { + "epoch": 2.143348623853211, + "grad_norm": 0.23346536858247227, + "learning_rate": 1.822448272066114e-06, + "loss": 0.3239, + "num_tokens": 872582280.0, + "step": 2121 + }, + { + "epoch": 2.143730886850153, + "grad_norm": 0.2276523124022871, + "learning_rate": 1.8192396854721323e-06, + "loss": 0.3006, + "num_tokens": 872993119.0, + "step": 2122 + }, + { + "epoch": 2.1441131498470947, + "grad_norm": 0.2302734004635181, + "learning_rate": 1.8160367429988585e-06, + "loss": 0.2876, + "num_tokens": 873376311.0, + "step": 2123 + }, + { + "epoch": 2.1444954128440368, + "grad_norm": 0.24809741208573627, + "learning_rate": 1.8128394495577228e-06, + "loss": 0.2953, + "num_tokens": 873769169.0, + "step": 2124 + }, + { + "epoch": 2.1448776758409784, + "grad_norm": 0.2711681837130615, + "learning_rate": 1.8096478100514897e-06, + "loss": 0.3136, + "num_tokens": 874206693.0, + "step": 2125 + }, + { + "epoch": 2.1452599388379205, + "grad_norm": 0.20984038159537788, + "learning_rate": 1.8064618293742597e-06, + "loss": 0.3125, + "num_tokens": 874610599.0, + "step": 2126 + }, + { + "epoch": 2.145642201834862, + "grad_norm": 0.24482871458709266, + "learning_rate": 1.8032815124114488e-06, + "loss": 0.312, + "num_tokens": 875012682.0, + "step": 2127 + }, + { + "epoch": 2.1460244648318043, + "grad_norm": 0.23517788175497842, + "learning_rate": 1.8001068640397973e-06, + "loss": 0.2966, + "num_tokens": 875456384.0, + "step": 2128 + }, + { + "epoch": 2.146406727828746, + "grad_norm": 0.23129295818561682, + "learning_rate": 1.7969378891273432e-06, + "loss": 0.2971, + "num_tokens": 875835934.0, + "step": 2129 + }, + { + "epoch": 2.146788990825688, + "grad_norm": 0.22280537626236196, + "learning_rate": 1.793774592533431e-06, + "loss": 0.3115, + "num_tokens": 876285424.0, + "step": 2130 + }, + { + "epoch": 2.1471712538226297, + "grad_norm": 0.22985160745783734, + "learning_rate": 1.7906169791086975e-06, + "loss": 0.2947, + "num_tokens": 876735916.0, + "step": 2131 + }, + { + "epoch": 2.147553516819572, + "grad_norm": 0.24841327006440816, + "learning_rate": 1.7874650536950621e-06, + "loss": 0.3208, + "num_tokens": 877157697.0, + "step": 2132 + }, + { + "epoch": 2.147935779816514, + "grad_norm": 0.23966632621868167, + "learning_rate": 1.784318821125726e-06, + "loss": 0.2975, + "num_tokens": 877540955.0, + "step": 2133 + }, + { + "epoch": 2.1483180428134556, + "grad_norm": 0.24478603037443272, + "learning_rate": 1.781178286225157e-06, + "loss": 0.3343, + "num_tokens": 877980538.0, + "step": 2134 + }, + { + "epoch": 2.1487003058103977, + "grad_norm": 0.2400158310787067, + "learning_rate": 1.7780434538090902e-06, + "loss": 0.2824, + "num_tokens": 878363720.0, + "step": 2135 + }, + { + "epoch": 2.1490825688073394, + "grad_norm": 0.23607289093743822, + "learning_rate": 1.774914328684511e-06, + "loss": 0.3041, + "num_tokens": 878777397.0, + "step": 2136 + }, + { + "epoch": 2.1494648318042815, + "grad_norm": 0.21622115872393688, + "learning_rate": 1.7717909156496585e-06, + "loss": 0.3074, + "num_tokens": 879213588.0, + "step": 2137 + }, + { + "epoch": 2.149847094801223, + "grad_norm": 0.2417339252321948, + "learning_rate": 1.7686732194940098e-06, + "loss": 0.3351, + "num_tokens": 879593329.0, + "step": 2138 + }, + { + "epoch": 2.1502293577981653, + "grad_norm": 0.23294002059377833, + "learning_rate": 1.7655612449982767e-06, + "loss": 0.3183, + "num_tokens": 880027327.0, + "step": 2139 + }, + { + "epoch": 2.150611620795107, + "grad_norm": 0.24035358315138267, + "learning_rate": 1.7624549969343962e-06, + "loss": 0.3345, + "num_tokens": 880434175.0, + "step": 2140 + }, + { + "epoch": 2.150993883792049, + "grad_norm": 0.25487144822665003, + "learning_rate": 1.7593544800655272e-06, + "loss": 0.3137, + "num_tokens": 880841206.0, + "step": 2141 + }, + { + "epoch": 2.1513761467889907, + "grad_norm": 0.23784469700114344, + "learning_rate": 1.7562596991460368e-06, + "loss": 0.3086, + "num_tokens": 881225216.0, + "step": 2142 + }, + { + "epoch": 2.151758409785933, + "grad_norm": 0.22176898973464948, + "learning_rate": 1.7531706589214997e-06, + "loss": 0.2992, + "num_tokens": 881651920.0, + "step": 2143 + }, + { + "epoch": 2.1521406727828745, + "grad_norm": 0.21977701631691313, + "learning_rate": 1.7500873641286826e-06, + "loss": 0.3215, + "num_tokens": 882104001.0, + "step": 2144 + }, + { + "epoch": 2.1525229357798166, + "grad_norm": 0.23825845544773375, + "learning_rate": 1.7470098194955502e-06, + "loss": 0.3306, + "num_tokens": 882489970.0, + "step": 2145 + }, + { + "epoch": 2.1529051987767582, + "grad_norm": 0.24218138523003677, + "learning_rate": 1.7439380297412416e-06, + "loss": 0.3252, + "num_tokens": 882930464.0, + "step": 2146 + }, + { + "epoch": 2.1532874617737003, + "grad_norm": 0.24005861282488286, + "learning_rate": 1.740871999576077e-06, + "loss": 0.3362, + "num_tokens": 883381016.0, + "step": 2147 + }, + { + "epoch": 2.153669724770642, + "grad_norm": 0.21511687215090416, + "learning_rate": 1.7378117337015421e-06, + "loss": 0.2903, + "num_tokens": 883801136.0, + "step": 2148 + }, + { + "epoch": 2.154051987767584, + "grad_norm": 0.23474547948790533, + "learning_rate": 1.7347572368102842e-06, + "loss": 0.3362, + "num_tokens": 884241537.0, + "step": 2149 + }, + { + "epoch": 2.1544342507645258, + "grad_norm": 0.2734136876791039, + "learning_rate": 1.7317085135861042e-06, + "loss": 0.3367, + "num_tokens": 884654039.0, + "step": 2150 + }, + { + "epoch": 2.154816513761468, + "grad_norm": 0.2498551895217333, + "learning_rate": 1.72866556870395e-06, + "loss": 0.3087, + "num_tokens": 885036375.0, + "step": 2151 + }, + { + "epoch": 2.15519877675841, + "grad_norm": 0.2677136733975535, + "learning_rate": 1.7256284068299106e-06, + "loss": 0.3156, + "num_tokens": 885425644.0, + "step": 2152 + }, + { + "epoch": 2.1555810397553516, + "grad_norm": 0.2392599141536694, + "learning_rate": 1.7225970326212003e-06, + "loss": 0.3192, + "num_tokens": 885826394.0, + "step": 2153 + }, + { + "epoch": 2.1559633027522938, + "grad_norm": 0.25009117343100296, + "learning_rate": 1.719571450726169e-06, + "loss": 0.3171, + "num_tokens": 886250191.0, + "step": 2154 + }, + { + "epoch": 2.1563455657492354, + "grad_norm": 0.22542024108272465, + "learning_rate": 1.7165516657842768e-06, + "loss": 0.3217, + "num_tokens": 886662843.0, + "step": 2155 + }, + { + "epoch": 2.1567278287461775, + "grad_norm": 0.22388441923626276, + "learning_rate": 1.7135376824260968e-06, + "loss": 0.2972, + "num_tokens": 887062063.0, + "step": 2156 + }, + { + "epoch": 2.157110091743119, + "grad_norm": 0.22413732083869867, + "learning_rate": 1.7105295052733061e-06, + "loss": 0.3202, + "num_tokens": 887507979.0, + "step": 2157 + }, + { + "epoch": 2.1574923547400613, + "grad_norm": 0.24925762297099385, + "learning_rate": 1.70752713893868e-06, + "loss": 0.2843, + "num_tokens": 887877390.0, + "step": 2158 + }, + { + "epoch": 2.157874617737003, + "grad_norm": 0.2400224275647967, + "learning_rate": 1.7045305880260811e-06, + "loss": 0.3023, + "num_tokens": 888283113.0, + "step": 2159 + }, + { + "epoch": 2.158256880733945, + "grad_norm": 0.23544314939670413, + "learning_rate": 1.7015398571304543e-06, + "loss": 0.3016, + "num_tokens": 888693310.0, + "step": 2160 + }, + { + "epoch": 2.1586391437308867, + "grad_norm": 0.2346447544702893, + "learning_rate": 1.698554950837824e-06, + "loss": 0.3305, + "num_tokens": 889114532.0, + "step": 2161 + }, + { + "epoch": 2.159021406727829, + "grad_norm": 0.22520061592034207, + "learning_rate": 1.695575873725276e-06, + "loss": 0.3102, + "num_tokens": 889543613.0, + "step": 2162 + }, + { + "epoch": 2.1594036697247705, + "grad_norm": 0.2374390942659956, + "learning_rate": 1.6926026303609666e-06, + "loss": 0.3145, + "num_tokens": 890000023.0, + "step": 2163 + }, + { + "epoch": 2.1597859327217126, + "grad_norm": 0.2297304773443682, + "learning_rate": 1.6896352253040993e-06, + "loss": 0.3191, + "num_tokens": 890430127.0, + "step": 2164 + }, + { + "epoch": 2.1601681957186543, + "grad_norm": 0.26326553296748756, + "learning_rate": 1.6866736631049268e-06, + "loss": 0.3118, + "num_tokens": 890814244.0, + "step": 2165 + }, + { + "epoch": 2.1605504587155964, + "grad_norm": 0.2655643448713461, + "learning_rate": 1.6837179483047444e-06, + "loss": 0.311, + "num_tokens": 891221931.0, + "step": 2166 + }, + { + "epoch": 2.160932721712538, + "grad_norm": 0.2743386388651708, + "learning_rate": 1.6807680854358794e-06, + "loss": 0.331, + "num_tokens": 891626411.0, + "step": 2167 + }, + { + "epoch": 2.16131498470948, + "grad_norm": 0.2240515591428254, + "learning_rate": 1.6778240790216862e-06, + "loss": 0.2917, + "num_tokens": 892033990.0, + "step": 2168 + }, + { + "epoch": 2.161697247706422, + "grad_norm": 0.22303228559331495, + "learning_rate": 1.674885933576536e-06, + "loss": 0.3092, + "num_tokens": 892439851.0, + "step": 2169 + }, + { + "epoch": 2.162079510703364, + "grad_norm": 0.22988815312965005, + "learning_rate": 1.6719536536058195e-06, + "loss": 0.3118, + "num_tokens": 892867279.0, + "step": 2170 + }, + { + "epoch": 2.162461773700306, + "grad_norm": 0.25808832106658236, + "learning_rate": 1.6690272436059247e-06, + "loss": 0.3207, + "num_tokens": 893246395.0, + "step": 2171 + }, + { + "epoch": 2.1628440366972477, + "grad_norm": 0.2280001549484002, + "learning_rate": 1.6661067080642466e-06, + "loss": 0.2924, + "num_tokens": 893636167.0, + "step": 2172 + }, + { + "epoch": 2.16322629969419, + "grad_norm": 0.21580364727461712, + "learning_rate": 1.6631920514591654e-06, + "loss": 0.309, + "num_tokens": 894063196.0, + "step": 2173 + }, + { + "epoch": 2.1636085626911314, + "grad_norm": 0.24763860416960673, + "learning_rate": 1.6602832782600509e-06, + "loss": 0.3075, + "num_tokens": 894481173.0, + "step": 2174 + }, + { + "epoch": 2.1639908256880735, + "grad_norm": 0.25639163356992056, + "learning_rate": 1.6573803929272487e-06, + "loss": 0.3293, + "num_tokens": 894876005.0, + "step": 2175 + }, + { + "epoch": 2.164373088685015, + "grad_norm": 0.23846091212504994, + "learning_rate": 1.654483399912078e-06, + "loss": 0.3158, + "num_tokens": 895311769.0, + "step": 2176 + }, + { + "epoch": 2.1647553516819573, + "grad_norm": 0.23754704691085898, + "learning_rate": 1.651592303656822e-06, + "loss": 0.3061, + "num_tokens": 895707932.0, + "step": 2177 + }, + { + "epoch": 2.165137614678899, + "grad_norm": 0.23976539419012896, + "learning_rate": 1.6487071085947193e-06, + "loss": 0.286, + "num_tokens": 896114483.0, + "step": 2178 + }, + { + "epoch": 2.165519877675841, + "grad_norm": 0.21447832330315947, + "learning_rate": 1.6458278191499644e-06, + "loss": 0.3075, + "num_tokens": 896545251.0, + "step": 2179 + }, + { + "epoch": 2.1659021406727827, + "grad_norm": 0.2299798965965033, + "learning_rate": 1.6429544397376911e-06, + "loss": 0.3089, + "num_tokens": 896963748.0, + "step": 2180 + }, + { + "epoch": 2.166284403669725, + "grad_norm": 0.23722486935301504, + "learning_rate": 1.640086974763973e-06, + "loss": 0.3057, + "num_tokens": 897340983.0, + "step": 2181 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.2527094707513443, + "learning_rate": 1.6372254286258156e-06, + "loss": 0.3354, + "num_tokens": 897759767.0, + "step": 2182 + }, + { + "epoch": 2.1670489296636086, + "grad_norm": 0.26662822816229126, + "learning_rate": 1.6343698057111454e-06, + "loss": 0.3097, + "num_tokens": 898152893.0, + "step": 2183 + }, + { + "epoch": 2.1674311926605503, + "grad_norm": 0.241867663457883, + "learning_rate": 1.6315201103988088e-06, + "loss": 0.3146, + "num_tokens": 898570730.0, + "step": 2184 + }, + { + "epoch": 2.1678134556574924, + "grad_norm": 0.26400921040030584, + "learning_rate": 1.6286763470585618e-06, + "loss": 0.3363, + "num_tokens": 899020923.0, + "step": 2185 + }, + { + "epoch": 2.168195718654434, + "grad_norm": 0.22934436883295536, + "learning_rate": 1.6258385200510652e-06, + "loss": 0.29, + "num_tokens": 899387561.0, + "step": 2186 + }, + { + "epoch": 2.168577981651376, + "grad_norm": 0.2185352284892116, + "learning_rate": 1.6230066337278723e-06, + "loss": 0.2851, + "num_tokens": 899780061.0, + "step": 2187 + }, + { + "epoch": 2.168960244648318, + "grad_norm": 0.22267719267504493, + "learning_rate": 1.6201806924314356e-06, + "loss": 0.3106, + "num_tokens": 900214923.0, + "step": 2188 + }, + { + "epoch": 2.16934250764526, + "grad_norm": 0.2370172982760451, + "learning_rate": 1.6173607004950822e-06, + "loss": 0.3091, + "num_tokens": 900658115.0, + "step": 2189 + }, + { + "epoch": 2.169724770642202, + "grad_norm": 0.24890328982520848, + "learning_rate": 1.6145466622430229e-06, + "loss": 0.3345, + "num_tokens": 901104304.0, + "step": 2190 + }, + { + "epoch": 2.1701070336391437, + "grad_norm": 0.23334691754876039, + "learning_rate": 1.6117385819903347e-06, + "loss": 0.3045, + "num_tokens": 901549087.0, + "step": 2191 + }, + { + "epoch": 2.170489296636086, + "grad_norm": 0.24893520678779027, + "learning_rate": 1.608936464042963e-06, + "loss": 0.322, + "num_tokens": 901958983.0, + "step": 2192 + }, + { + "epoch": 2.1708715596330275, + "grad_norm": 0.2335239583187043, + "learning_rate": 1.6061403126977066e-06, + "loss": 0.3185, + "num_tokens": 902387911.0, + "step": 2193 + }, + { + "epoch": 2.1712538226299696, + "grad_norm": 0.22523536140055822, + "learning_rate": 1.6033501322422162e-06, + "loss": 0.3305, + "num_tokens": 902844732.0, + "step": 2194 + }, + { + "epoch": 2.1716360856269112, + "grad_norm": 0.24817934425633256, + "learning_rate": 1.6005659269549882e-06, + "loss": 0.3154, + "num_tokens": 903226836.0, + "step": 2195 + }, + { + "epoch": 2.1720183486238533, + "grad_norm": 0.2189113136735565, + "learning_rate": 1.5977877011053556e-06, + "loss": 0.2978, + "num_tokens": 903650041.0, + "step": 2196 + }, + { + "epoch": 2.172400611620795, + "grad_norm": 0.23439605113917591, + "learning_rate": 1.5950154589534806e-06, + "loss": 0.3259, + "num_tokens": 904043266.0, + "step": 2197 + }, + { + "epoch": 2.172782874617737, + "grad_norm": 0.2511393392124079, + "learning_rate": 1.5922492047503521e-06, + "loss": 0.3124, + "num_tokens": 904468451.0, + "step": 2198 + }, + { + "epoch": 2.1731651376146788, + "grad_norm": 0.25283704329209317, + "learning_rate": 1.5894889427377768e-06, + "loss": 0.3215, + "num_tokens": 904865229.0, + "step": 2199 + }, + { + "epoch": 2.173547400611621, + "grad_norm": 0.268779467281901, + "learning_rate": 1.5867346771483732e-06, + "loss": 0.3538, + "num_tokens": 905272720.0, + "step": 2200 + }, + { + "epoch": 2.1739296636085625, + "grad_norm": 0.24167880279655754, + "learning_rate": 1.583986412205563e-06, + "loss": 0.3186, + "num_tokens": 905698266.0, + "step": 2201 + }, + { + "epoch": 2.1743119266055047, + "grad_norm": 0.2592728408756888, + "learning_rate": 1.5812441521235686e-06, + "loss": 0.2947, + "num_tokens": 906041053.0, + "step": 2202 + }, + { + "epoch": 2.1746941896024463, + "grad_norm": 0.2392668050830515, + "learning_rate": 1.578507901107403e-06, + "loss": 0.3384, + "num_tokens": 906488584.0, + "step": 2203 + }, + { + "epoch": 2.1750764525993884, + "grad_norm": 0.24248852863353335, + "learning_rate": 1.5757776633528654e-06, + "loss": 0.3097, + "num_tokens": 906905347.0, + "step": 2204 + }, + { + "epoch": 2.17545871559633, + "grad_norm": 0.2258979972111339, + "learning_rate": 1.5730534430465358e-06, + "loss": 0.3381, + "num_tokens": 907343695.0, + "step": 2205 + }, + { + "epoch": 2.175840978593272, + "grad_norm": 0.2505783495895386, + "learning_rate": 1.5703352443657615e-06, + "loss": 0.3423, + "num_tokens": 907806661.0, + "step": 2206 + }, + { + "epoch": 2.176223241590214, + "grad_norm": 0.234528317351922, + "learning_rate": 1.5676230714786646e-06, + "loss": 0.3204, + "num_tokens": 908249078.0, + "step": 2207 + }, + { + "epoch": 2.176605504587156, + "grad_norm": 0.240582554262787, + "learning_rate": 1.564916928544119e-06, + "loss": 0.2832, + "num_tokens": 908616632.0, + "step": 2208 + }, + { + "epoch": 2.176987767584098, + "grad_norm": 0.2532827480126124, + "learning_rate": 1.562216819711757e-06, + "loss": 0.3087, + "num_tokens": 909020945.0, + "step": 2209 + }, + { + "epoch": 2.1773700305810397, + "grad_norm": 0.227986714594487, + "learning_rate": 1.5595227491219572e-06, + "loss": 0.3314, + "num_tokens": 909421917.0, + "step": 2210 + }, + { + "epoch": 2.177752293577982, + "grad_norm": 0.22561577681881217, + "learning_rate": 1.5568347209058387e-06, + "loss": 0.3366, + "num_tokens": 909861072.0, + "step": 2211 + }, + { + "epoch": 2.1781345565749235, + "grad_norm": 0.24628669248200902, + "learning_rate": 1.5541527391852561e-06, + "loss": 0.2797, + "num_tokens": 910186108.0, + "step": 2212 + }, + { + "epoch": 2.1785168195718656, + "grad_norm": 0.22918259509551395, + "learning_rate": 1.5514768080727885e-06, + "loss": 0.2914, + "num_tokens": 910579170.0, + "step": 2213 + }, + { + "epoch": 2.1788990825688073, + "grad_norm": 0.22790157562790342, + "learning_rate": 1.5488069316717437e-06, + "loss": 0.3315, + "num_tokens": 911004000.0, + "step": 2214 + }, + { + "epoch": 2.1792813455657494, + "grad_norm": 0.24872085181802045, + "learning_rate": 1.5461431140761368e-06, + "loss": 0.295, + "num_tokens": 911371041.0, + "step": 2215 + }, + { + "epoch": 2.179663608562691, + "grad_norm": 0.24737913360578787, + "learning_rate": 1.5434853593707017e-06, + "loss": 0.3361, + "num_tokens": 911826234.0, + "step": 2216 + }, + { + "epoch": 2.180045871559633, + "grad_norm": 0.2287124782176664, + "learning_rate": 1.540833671630867e-06, + "loss": 0.2819, + "num_tokens": 912217194.0, + "step": 2217 + }, + { + "epoch": 2.180428134556575, + "grad_norm": 0.23422642745576158, + "learning_rate": 1.5381880549227622e-06, + "loss": 0.2958, + "num_tokens": 912637441.0, + "step": 2218 + }, + { + "epoch": 2.180810397553517, + "grad_norm": 0.24946965050875264, + "learning_rate": 1.535548513303207e-06, + "loss": 0.326, + "num_tokens": 913074206.0, + "step": 2219 + }, + { + "epoch": 2.1811926605504586, + "grad_norm": 0.22803680271701338, + "learning_rate": 1.5329150508197053e-06, + "loss": 0.3274, + "num_tokens": 913496226.0, + "step": 2220 + }, + { + "epoch": 2.1815749235474007, + "grad_norm": 0.23459413487022435, + "learning_rate": 1.5302876715104397e-06, + "loss": 0.3363, + "num_tokens": 913930557.0, + "step": 2221 + }, + { + "epoch": 2.1819571865443423, + "grad_norm": 0.251718183603139, + "learning_rate": 1.5276663794042618e-06, + "loss": 0.309, + "num_tokens": 914285508.0, + "step": 2222 + }, + { + "epoch": 2.1823394495412844, + "grad_norm": 0.21860858401753172, + "learning_rate": 1.525051178520695e-06, + "loss": 0.2851, + "num_tokens": 914660487.0, + "step": 2223 + }, + { + "epoch": 2.182721712538226, + "grad_norm": 0.21937232669635792, + "learning_rate": 1.5224420728699157e-06, + "loss": 0.3105, + "num_tokens": 915092199.0, + "step": 2224 + }, + { + "epoch": 2.183103975535168, + "grad_norm": 0.23911207937321602, + "learning_rate": 1.5198390664527595e-06, + "loss": 0.2977, + "num_tokens": 915499337.0, + "step": 2225 + }, + { + "epoch": 2.18348623853211, + "grad_norm": 0.23484188018834565, + "learning_rate": 1.5172421632607045e-06, + "loss": 0.3357, + "num_tokens": 915948803.0, + "step": 2226 + }, + { + "epoch": 2.183868501529052, + "grad_norm": 0.25470772185951285, + "learning_rate": 1.5146513672758733e-06, + "loss": 0.3146, + "num_tokens": 916357800.0, + "step": 2227 + }, + { + "epoch": 2.184250764525994, + "grad_norm": 0.22091256980267132, + "learning_rate": 1.5120666824710233e-06, + "loss": 0.3201, + "num_tokens": 916818247.0, + "step": 2228 + }, + { + "epoch": 2.1846330275229358, + "grad_norm": 0.25480031651744306, + "learning_rate": 1.509488112809538e-06, + "loss": 0.3432, + "num_tokens": 917225950.0, + "step": 2229 + }, + { + "epoch": 2.185015290519878, + "grad_norm": 0.24017436040468665, + "learning_rate": 1.5069156622454286e-06, + "loss": 0.3081, + "num_tokens": 917615638.0, + "step": 2230 + }, + { + "epoch": 2.1853975535168195, + "grad_norm": 0.2436175156978095, + "learning_rate": 1.5043493347233187e-06, + "loss": 0.3202, + "num_tokens": 918022831.0, + "step": 2231 + }, + { + "epoch": 2.1857798165137616, + "grad_norm": 0.23740927983326002, + "learning_rate": 1.5017891341784471e-06, + "loss": 0.3239, + "num_tokens": 918409424.0, + "step": 2232 + }, + { + "epoch": 2.1861620795107033, + "grad_norm": 0.25019661443580676, + "learning_rate": 1.4992350645366529e-06, + "loss": 0.3234, + "num_tokens": 918815634.0, + "step": 2233 + }, + { + "epoch": 2.1865443425076454, + "grad_norm": 0.23846630836269916, + "learning_rate": 1.4966871297143776e-06, + "loss": 0.3145, + "num_tokens": 919211196.0, + "step": 2234 + }, + { + "epoch": 2.186926605504587, + "grad_norm": 0.24319562313396528, + "learning_rate": 1.4941453336186532e-06, + "loss": 0.3089, + "num_tokens": 919593384.0, + "step": 2235 + }, + { + "epoch": 2.187308868501529, + "grad_norm": 0.26326880051970314, + "learning_rate": 1.4916096801471002e-06, + "loss": 0.3186, + "num_tokens": 919958439.0, + "step": 2236 + }, + { + "epoch": 2.187691131498471, + "grad_norm": 0.21928210405981835, + "learning_rate": 1.4890801731879198e-06, + "loss": 0.3201, + "num_tokens": 920380521.0, + "step": 2237 + }, + { + "epoch": 2.188073394495413, + "grad_norm": 0.2271918808386219, + "learning_rate": 1.4865568166198843e-06, + "loss": 0.3039, + "num_tokens": 920796242.0, + "step": 2238 + }, + { + "epoch": 2.1884556574923546, + "grad_norm": 0.22879151647382603, + "learning_rate": 1.484039614312342e-06, + "loss": 0.3106, + "num_tokens": 921203862.0, + "step": 2239 + }, + { + "epoch": 2.1888379204892967, + "grad_norm": 0.25356331608615573, + "learning_rate": 1.481528570125197e-06, + "loss": 0.3256, + "num_tokens": 921609826.0, + "step": 2240 + }, + { + "epoch": 2.1892201834862384, + "grad_norm": 0.25670375029848935, + "learning_rate": 1.4790236879089161e-06, + "loss": 0.3071, + "num_tokens": 922025226.0, + "step": 2241 + }, + { + "epoch": 2.1896024464831805, + "grad_norm": 0.25028564081571036, + "learning_rate": 1.4765249715045136e-06, + "loss": 0.3266, + "num_tokens": 922428898.0, + "step": 2242 + }, + { + "epoch": 2.189984709480122, + "grad_norm": 0.2457858575478593, + "learning_rate": 1.47403242474355e-06, + "loss": 0.3381, + "num_tokens": 922821850.0, + "step": 2243 + }, + { + "epoch": 2.1903669724770642, + "grad_norm": 0.24796580458470796, + "learning_rate": 1.4715460514481265e-06, + "loss": 0.2985, + "num_tokens": 923180062.0, + "step": 2244 + }, + { + "epoch": 2.190749235474006, + "grad_norm": 0.2281638392614198, + "learning_rate": 1.4690658554308763e-06, + "loss": 0.3124, + "num_tokens": 923601876.0, + "step": 2245 + }, + { + "epoch": 2.191131498470948, + "grad_norm": 0.24819064319689557, + "learning_rate": 1.4665918404949609e-06, + "loss": 0.3414, + "num_tokens": 924053085.0, + "step": 2246 + }, + { + "epoch": 2.19151376146789, + "grad_norm": 0.221779737860921, + "learning_rate": 1.4641240104340635e-06, + "loss": 0.2957, + "num_tokens": 924448570.0, + "step": 2247 + }, + { + "epoch": 2.191896024464832, + "grad_norm": 0.2353912156533869, + "learning_rate": 1.4616623690323845e-06, + "loss": 0.3153, + "num_tokens": 924867172.0, + "step": 2248 + }, + { + "epoch": 2.192278287461774, + "grad_norm": 0.2409162808634134, + "learning_rate": 1.4592069200646316e-06, + "loss": 0.3207, + "num_tokens": 925264252.0, + "step": 2249 + }, + { + "epoch": 2.1926605504587156, + "grad_norm": 0.2575582061153032, + "learning_rate": 1.4567576672960198e-06, + "loss": 0.3376, + "num_tokens": 925659568.0, + "step": 2250 + }, + { + "epoch": 2.1930428134556577, + "grad_norm": 0.23666965423824904, + "learning_rate": 1.4543146144822623e-06, + "loss": 0.321, + "num_tokens": 926093541.0, + "step": 2251 + }, + { + "epoch": 2.1934250764525993, + "grad_norm": 0.22345524625671717, + "learning_rate": 1.4518777653695632e-06, + "loss": 0.3212, + "num_tokens": 926509139.0, + "step": 2252 + }, + { + "epoch": 2.1938073394495414, + "grad_norm": 0.24472584350893706, + "learning_rate": 1.4494471236946172e-06, + "loss": 0.3191, + "num_tokens": 926919949.0, + "step": 2253 + }, + { + "epoch": 2.194189602446483, + "grad_norm": 0.2237900397108314, + "learning_rate": 1.4470226931845968e-06, + "loss": 0.312, + "num_tokens": 927347962.0, + "step": 2254 + }, + { + "epoch": 2.194571865443425, + "grad_norm": 0.24192376693367446, + "learning_rate": 1.4446044775571539e-06, + "loss": 0.3051, + "num_tokens": 927757359.0, + "step": 2255 + }, + { + "epoch": 2.194954128440367, + "grad_norm": 0.25049196542039615, + "learning_rate": 1.4421924805204075e-06, + "loss": 0.3114, + "num_tokens": 928116516.0, + "step": 2256 + }, + { + "epoch": 2.195336391437309, + "grad_norm": 0.2740250120932496, + "learning_rate": 1.4397867057729401e-06, + "loss": 0.3131, + "num_tokens": 928484709.0, + "step": 2257 + }, + { + "epoch": 2.1957186544342506, + "grad_norm": 0.26813796233385834, + "learning_rate": 1.4373871570037978e-06, + "loss": 0.3388, + "num_tokens": 928930938.0, + "step": 2258 + }, + { + "epoch": 2.1961009174311927, + "grad_norm": 0.2395394764193769, + "learning_rate": 1.434993837892475e-06, + "loss": 0.3223, + "num_tokens": 929331110.0, + "step": 2259 + }, + { + "epoch": 2.1964831804281344, + "grad_norm": 0.2390888572187172, + "learning_rate": 1.4326067521089149e-06, + "loss": 0.3314, + "num_tokens": 929762317.0, + "step": 2260 + }, + { + "epoch": 2.1968654434250765, + "grad_norm": 0.23187735590618286, + "learning_rate": 1.4302259033135035e-06, + "loss": 0.3078, + "num_tokens": 930157263.0, + "step": 2261 + }, + { + "epoch": 2.197247706422018, + "grad_norm": 0.24650218368092827, + "learning_rate": 1.4278512951570615e-06, + "loss": 0.3355, + "num_tokens": 930560738.0, + "step": 2262 + }, + { + "epoch": 2.1976299694189603, + "grad_norm": 0.24718079585326058, + "learning_rate": 1.4254829312808405e-06, + "loss": 0.3166, + "num_tokens": 930982686.0, + "step": 2263 + }, + { + "epoch": 2.198012232415902, + "grad_norm": 0.24536463572552442, + "learning_rate": 1.4231208153165178e-06, + "loss": 0.3372, + "num_tokens": 931413350.0, + "step": 2264 + }, + { + "epoch": 2.198394495412844, + "grad_norm": 0.23812044230751547, + "learning_rate": 1.4207649508861893e-06, + "loss": 0.3035, + "num_tokens": 931781562.0, + "step": 2265 + }, + { + "epoch": 2.198776758409786, + "grad_norm": 0.23306212887001213, + "learning_rate": 1.4184153416023638e-06, + "loss": 0.3156, + "num_tokens": 932169176.0, + "step": 2266 + }, + { + "epoch": 2.199159021406728, + "grad_norm": 0.22855772156103168, + "learning_rate": 1.4160719910679621e-06, + "loss": 0.3354, + "num_tokens": 932586511.0, + "step": 2267 + }, + { + "epoch": 2.19954128440367, + "grad_norm": 0.2554414905313789, + "learning_rate": 1.4137349028763031e-06, + "loss": 0.3175, + "num_tokens": 933001427.0, + "step": 2268 + }, + { + "epoch": 2.1999235474006116, + "grad_norm": 0.2357468823759153, + "learning_rate": 1.4114040806111056e-06, + "loss": 0.3245, + "num_tokens": 933411499.0, + "step": 2269 + }, + { + "epoch": 2.2003058103975537, + "grad_norm": 0.2232072191124909, + "learning_rate": 1.4090795278464791e-06, + "loss": 0.3193, + "num_tokens": 933897268.0, + "step": 2270 + }, + { + "epoch": 2.2006880733944953, + "grad_norm": 0.2421418674687781, + "learning_rate": 1.4067612481469209e-06, + "loss": 0.3503, + "num_tokens": 934310511.0, + "step": 2271 + }, + { + "epoch": 2.2010703363914375, + "grad_norm": 0.22236266053699993, + "learning_rate": 1.404449245067308e-06, + "loss": 0.3239, + "num_tokens": 934722838.0, + "step": 2272 + }, + { + "epoch": 2.201452599388379, + "grad_norm": 0.25544114978141097, + "learning_rate": 1.4021435221528907e-06, + "loss": 0.3139, + "num_tokens": 935110837.0, + "step": 2273 + }, + { + "epoch": 2.2018348623853212, + "grad_norm": 0.22882052149328255, + "learning_rate": 1.399844082939295e-06, + "loss": 0.3122, + "num_tokens": 935527589.0, + "step": 2274 + }, + { + "epoch": 2.202217125382263, + "grad_norm": 0.23133086508808312, + "learning_rate": 1.3975509309525036e-06, + "loss": 0.2945, + "num_tokens": 935903704.0, + "step": 2275 + }, + { + "epoch": 2.202599388379205, + "grad_norm": 0.2219750007159546, + "learning_rate": 1.3952640697088664e-06, + "loss": 0.3246, + "num_tokens": 936342729.0, + "step": 2276 + }, + { + "epoch": 2.2029816513761467, + "grad_norm": 0.23543136760802605, + "learning_rate": 1.3929835027150806e-06, + "loss": 0.3107, + "num_tokens": 936718536.0, + "step": 2277 + }, + { + "epoch": 2.2033639143730888, + "grad_norm": 0.24465846908175518, + "learning_rate": 1.3907092334681954e-06, + "loss": 0.3491, + "num_tokens": 937121040.0, + "step": 2278 + }, + { + "epoch": 2.2037461773700304, + "grad_norm": 0.2174748946547011, + "learning_rate": 1.3884412654556004e-06, + "loss": 0.2991, + "num_tokens": 937534524.0, + "step": 2279 + }, + { + "epoch": 2.2041284403669725, + "grad_norm": 0.22507665843183142, + "learning_rate": 1.3861796021550253e-06, + "loss": 0.3205, + "num_tokens": 937940498.0, + "step": 2280 + }, + { + "epoch": 2.204510703363914, + "grad_norm": 0.2189591135263967, + "learning_rate": 1.383924247034531e-06, + "loss": 0.2938, + "num_tokens": 938368876.0, + "step": 2281 + }, + { + "epoch": 2.2048929663608563, + "grad_norm": 0.2379367896549823, + "learning_rate": 1.3816752035525035e-06, + "loss": 0.319, + "num_tokens": 938792032.0, + "step": 2282 + }, + { + "epoch": 2.205275229357798, + "grad_norm": 0.241070821822433, + "learning_rate": 1.3794324751576551e-06, + "loss": 0.3328, + "num_tokens": 939194145.0, + "step": 2283 + }, + { + "epoch": 2.20565749235474, + "grad_norm": 0.2217473087425511, + "learning_rate": 1.3771960652890086e-06, + "loss": 0.3249, + "num_tokens": 939629399.0, + "step": 2284 + }, + { + "epoch": 2.206039755351682, + "grad_norm": 0.2262142179988771, + "learning_rate": 1.3749659773759038e-06, + "loss": 0.3056, + "num_tokens": 940017173.0, + "step": 2285 + }, + { + "epoch": 2.206422018348624, + "grad_norm": 0.23237353982198988, + "learning_rate": 1.3727422148379816e-06, + "loss": 0.3101, + "num_tokens": 940442507.0, + "step": 2286 + }, + { + "epoch": 2.206804281345566, + "grad_norm": 0.2348016416938737, + "learning_rate": 1.3705247810851857e-06, + "loss": 0.3107, + "num_tokens": 940843846.0, + "step": 2287 + }, + { + "epoch": 2.2071865443425076, + "grad_norm": 0.21990185962230782, + "learning_rate": 1.3683136795177549e-06, + "loss": 0.2973, + "num_tokens": 941267142.0, + "step": 2288 + }, + { + "epoch": 2.2075688073394497, + "grad_norm": 0.22656945532680414, + "learning_rate": 1.3661089135262188e-06, + "loss": 0.3162, + "num_tokens": 941701733.0, + "step": 2289 + }, + { + "epoch": 2.2079510703363914, + "grad_norm": 0.24345309167467757, + "learning_rate": 1.3639104864913908e-06, + "loss": 0.3243, + "num_tokens": 942090520.0, + "step": 2290 + }, + { + "epoch": 2.2083333333333335, + "grad_norm": 0.22572103977646577, + "learning_rate": 1.3617184017843628e-06, + "loss": 0.3319, + "num_tokens": 942569073.0, + "step": 2291 + }, + { + "epoch": 2.208715596330275, + "grad_norm": 0.23471783702654825, + "learning_rate": 1.3595326627665061e-06, + "loss": 0.33, + "num_tokens": 942974528.0, + "step": 2292 + }, + { + "epoch": 2.2090978593272173, + "grad_norm": 0.25122562058168113, + "learning_rate": 1.3573532727894561e-06, + "loss": 0.3267, + "num_tokens": 943408209.0, + "step": 2293 + }, + { + "epoch": 2.209480122324159, + "grad_norm": 0.2401020226565427, + "learning_rate": 1.3551802351951155e-06, + "loss": 0.3388, + "num_tokens": 943836121.0, + "step": 2294 + }, + { + "epoch": 2.209862385321101, + "grad_norm": 0.23346182496953494, + "learning_rate": 1.3530135533156449e-06, + "loss": 0.3086, + "num_tokens": 944234298.0, + "step": 2295 + }, + { + "epoch": 2.2102446483180427, + "grad_norm": 0.22608818047665813, + "learning_rate": 1.3508532304734603e-06, + "loss": 0.3215, + "num_tokens": 944657903.0, + "step": 2296 + }, + { + "epoch": 2.210626911314985, + "grad_norm": 0.24261592210672542, + "learning_rate": 1.348699269981226e-06, + "loss": 0.333, + "num_tokens": 945057836.0, + "step": 2297 + }, + { + "epoch": 2.2110091743119265, + "grad_norm": 0.2261756969915708, + "learning_rate": 1.3465516751418489e-06, + "loss": 0.3358, + "num_tokens": 945499768.0, + "step": 2298 + }, + { + "epoch": 2.2113914373088686, + "grad_norm": 0.22595008294888091, + "learning_rate": 1.3444104492484778e-06, + "loss": 0.3091, + "num_tokens": 945929291.0, + "step": 2299 + }, + { + "epoch": 2.21177370030581, + "grad_norm": 0.23402033546061082, + "learning_rate": 1.3422755955844904e-06, + "loss": 0.3184, + "num_tokens": 946324979.0, + "step": 2300 + }, + { + "epoch": 2.2121559633027523, + "grad_norm": 0.2279214969978577, + "learning_rate": 1.3401471174235004e-06, + "loss": 0.3153, + "num_tokens": 946757470.0, + "step": 2301 + }, + { + "epoch": 2.212538226299694, + "grad_norm": 0.2222163162909481, + "learning_rate": 1.3380250180293368e-06, + "loss": 0.3335, + "num_tokens": 947193223.0, + "step": 2302 + }, + { + "epoch": 2.212920489296636, + "grad_norm": 0.23056795436250863, + "learning_rate": 1.3359093006560542e-06, + "loss": 0.32, + "num_tokens": 947586945.0, + "step": 2303 + }, + { + "epoch": 2.213302752293578, + "grad_norm": 0.25343283225955054, + "learning_rate": 1.3337999685479172e-06, + "loss": 0.3377, + "num_tokens": 948019867.0, + "step": 2304 + }, + { + "epoch": 2.21368501529052, + "grad_norm": 0.23873458289519447, + "learning_rate": 1.3316970249394e-06, + "loss": 0.3496, + "num_tokens": 948467090.0, + "step": 2305 + }, + { + "epoch": 2.214067278287462, + "grad_norm": 0.22709695742963218, + "learning_rate": 1.3296004730551817e-06, + "loss": 0.3585, + "num_tokens": 948904308.0, + "step": 2306 + }, + { + "epoch": 2.2144495412844036, + "grad_norm": 0.22136998928004317, + "learning_rate": 1.3275103161101386e-06, + "loss": 0.3071, + "num_tokens": 949304602.0, + "step": 2307 + }, + { + "epoch": 2.2148318042813457, + "grad_norm": 0.26575182512318135, + "learning_rate": 1.3254265573093417e-06, + "loss": 0.3096, + "num_tokens": 949661022.0, + "step": 2308 + }, + { + "epoch": 2.2152140672782874, + "grad_norm": 0.22764252975869667, + "learning_rate": 1.3233491998480525e-06, + "loss": 0.3285, + "num_tokens": 950096474.0, + "step": 2309 + }, + { + "epoch": 2.2155963302752295, + "grad_norm": 0.21688755798732887, + "learning_rate": 1.3212782469117124e-06, + "loss": 0.34, + "num_tokens": 950545600.0, + "step": 2310 + }, + { + "epoch": 2.215978593272171, + "grad_norm": 0.22693689396465785, + "learning_rate": 1.3192137016759467e-06, + "loss": 0.3362, + "num_tokens": 951002008.0, + "step": 2311 + }, + { + "epoch": 2.2163608562691133, + "grad_norm": 0.24767412885410545, + "learning_rate": 1.3171555673065528e-06, + "loss": 0.3217, + "num_tokens": 951397021.0, + "step": 2312 + }, + { + "epoch": 2.216743119266055, + "grad_norm": 0.24195651311111752, + "learning_rate": 1.3151038469594976e-06, + "loss": 0.3355, + "num_tokens": 951779013.0, + "step": 2313 + }, + { + "epoch": 2.217125382262997, + "grad_norm": 0.21759711994940784, + "learning_rate": 1.313058543780913e-06, + "loss": 0.2914, + "num_tokens": 952204464.0, + "step": 2314 + }, + { + "epoch": 2.2175076452599387, + "grad_norm": 0.24907461933173577, + "learning_rate": 1.3110196609070905e-06, + "loss": 0.3147, + "num_tokens": 952607710.0, + "step": 2315 + }, + { + "epoch": 2.217889908256881, + "grad_norm": 0.22592717975716248, + "learning_rate": 1.3089872014644772e-06, + "loss": 0.2956, + "num_tokens": 953010225.0, + "step": 2316 + }, + { + "epoch": 2.2182721712538225, + "grad_norm": 0.24232146017803918, + "learning_rate": 1.3069611685696698e-06, + "loss": 0.3417, + "num_tokens": 953467656.0, + "step": 2317 + }, + { + "epoch": 2.2186544342507646, + "grad_norm": 0.23711273303917801, + "learning_rate": 1.3049415653294114e-06, + "loss": 0.2952, + "num_tokens": 953858252.0, + "step": 2318 + }, + { + "epoch": 2.2190366972477062, + "grad_norm": 1.1279001716458446, + "learning_rate": 1.3029283948405838e-06, + "loss": 0.3057, + "num_tokens": 954256959.0, + "step": 2319 + }, + { + "epoch": 2.2194189602446484, + "grad_norm": 0.2627908620020632, + "learning_rate": 1.3009216601902081e-06, + "loss": 0.3322, + "num_tokens": 954657390.0, + "step": 2320 + }, + { + "epoch": 2.21980122324159, + "grad_norm": 0.25476591817781247, + "learning_rate": 1.2989213644554322e-06, + "loss": 0.3311, + "num_tokens": 955057155.0, + "step": 2321 + }, + { + "epoch": 2.220183486238532, + "grad_norm": 0.23294470526809133, + "learning_rate": 1.2969275107035344e-06, + "loss": 0.336, + "num_tokens": 955463753.0, + "step": 2322 + }, + { + "epoch": 2.2205657492354742, + "grad_norm": 0.22848285033664562, + "learning_rate": 1.2949401019919122e-06, + "loss": 0.3195, + "num_tokens": 955847873.0, + "step": 2323 + }, + { + "epoch": 2.220948012232416, + "grad_norm": 0.2426497760128737, + "learning_rate": 1.2929591413680829e-06, + "loss": 0.3166, + "num_tokens": 956252133.0, + "step": 2324 + }, + { + "epoch": 2.221330275229358, + "grad_norm": 0.23044559560800626, + "learning_rate": 1.2909846318696733e-06, + "loss": 0.3183, + "num_tokens": 956682056.0, + "step": 2325 + }, + { + "epoch": 2.2217125382262997, + "grad_norm": 0.24209794611596508, + "learning_rate": 1.2890165765244187e-06, + "loss": 0.3145, + "num_tokens": 957072123.0, + "step": 2326 + }, + { + "epoch": 2.2220948012232418, + "grad_norm": 0.23379577599053974, + "learning_rate": 1.28705497835016e-06, + "loss": 0.3178, + "num_tokens": 957493114.0, + "step": 2327 + }, + { + "epoch": 2.2224770642201834, + "grad_norm": 0.25773880165038165, + "learning_rate": 1.2850998403548317e-06, + "loss": 0.3479, + "num_tokens": 957942611.0, + "step": 2328 + }, + { + "epoch": 2.2228593272171255, + "grad_norm": 0.2552289625550681, + "learning_rate": 1.283151165536469e-06, + "loss": 0.3118, + "num_tokens": 958289097.0, + "step": 2329 + }, + { + "epoch": 2.223241590214067, + "grad_norm": 0.23588348418555952, + "learning_rate": 1.28120895688319e-06, + "loss": 0.3201, + "num_tokens": 958680101.0, + "step": 2330 + }, + { + "epoch": 2.2236238532110093, + "grad_norm": 0.24121142920277158, + "learning_rate": 1.2792732173732e-06, + "loss": 0.3077, + "num_tokens": 959065958.0, + "step": 2331 + }, + { + "epoch": 2.224006116207951, + "grad_norm": 0.23547260481922436, + "learning_rate": 1.2773439499747857e-06, + "loss": 0.3222, + "num_tokens": 959474133.0, + "step": 2332 + }, + { + "epoch": 2.224388379204893, + "grad_norm": 0.2573138381206103, + "learning_rate": 1.2754211576463072e-06, + "loss": 0.3163, + "num_tokens": 959873850.0, + "step": 2333 + }, + { + "epoch": 2.2247706422018347, + "grad_norm": 0.2424576405277795, + "learning_rate": 1.273504843336198e-06, + "loss": 0.3435, + "num_tokens": 960300908.0, + "step": 2334 + }, + { + "epoch": 2.225152905198777, + "grad_norm": 0.2574693554800805, + "learning_rate": 1.2715950099829538e-06, + "loss": 0.3363, + "num_tokens": 960740105.0, + "step": 2335 + }, + { + "epoch": 2.2255351681957185, + "grad_norm": 0.24031655648209174, + "learning_rate": 1.2696916605151393e-06, + "loss": 0.331, + "num_tokens": 961142988.0, + "step": 2336 + }, + { + "epoch": 2.2259174311926606, + "grad_norm": 0.23842976302984917, + "learning_rate": 1.2677947978513692e-06, + "loss": 0.3044, + "num_tokens": 961544665.0, + "step": 2337 + }, + { + "epoch": 2.2262996941896023, + "grad_norm": 0.2666112477759676, + "learning_rate": 1.2659044249003177e-06, + "loss": 0.3416, + "num_tokens": 961977659.0, + "step": 2338 + }, + { + "epoch": 2.2266819571865444, + "grad_norm": 0.27182505991297184, + "learning_rate": 1.2640205445607024e-06, + "loss": 0.3312, + "num_tokens": 962413899.0, + "step": 2339 + }, + { + "epoch": 2.227064220183486, + "grad_norm": 0.2405277906069935, + "learning_rate": 1.262143159721288e-06, + "loss": 0.3361, + "num_tokens": 962839378.0, + "step": 2340 + }, + { + "epoch": 2.227446483180428, + "grad_norm": 0.22899951744301314, + "learning_rate": 1.2602722732608797e-06, + "loss": 0.3418, + "num_tokens": 963258151.0, + "step": 2341 + }, + { + "epoch": 2.2278287461773703, + "grad_norm": 0.2704219932385218, + "learning_rate": 1.2584078880483138e-06, + "loss": 0.3504, + "num_tokens": 963655868.0, + "step": 2342 + }, + { + "epoch": 2.228211009174312, + "grad_norm": 0.2501115418240735, + "learning_rate": 1.2565500069424627e-06, + "loss": 0.3343, + "num_tokens": 964062046.0, + "step": 2343 + }, + { + "epoch": 2.228593272171254, + "grad_norm": 0.2377390548463414, + "learning_rate": 1.2546986327922218e-06, + "loss": 0.3088, + "num_tokens": 964505395.0, + "step": 2344 + }, + { + "epoch": 2.2289755351681957, + "grad_norm": 0.2514710340773721, + "learning_rate": 1.2528537684365103e-06, + "loss": 0.3305, + "num_tokens": 964919459.0, + "step": 2345 + }, + { + "epoch": 2.229357798165138, + "grad_norm": 0.2944574806796761, + "learning_rate": 1.2510154167042645e-06, + "loss": 0.3266, + "num_tokens": 965325423.0, + "step": 2346 + }, + { + "epoch": 2.2297400611620795, + "grad_norm": 0.2255865241598775, + "learning_rate": 1.2491835804144337e-06, + "loss": 0.3307, + "num_tokens": 965728260.0, + "step": 2347 + }, + { + "epoch": 2.2301223241590216, + "grad_norm": 0.22356574088705075, + "learning_rate": 1.2473582623759777e-06, + "loss": 0.3171, + "num_tokens": 966131727.0, + "step": 2348 + }, + { + "epoch": 2.2305045871559632, + "grad_norm": 0.250177987823189, + "learning_rate": 1.2455394653878605e-06, + "loss": 0.3436, + "num_tokens": 966594257.0, + "step": 2349 + }, + { + "epoch": 2.2308868501529053, + "grad_norm": 0.21561272375601487, + "learning_rate": 1.243727192239047e-06, + "loss": 0.3201, + "num_tokens": 967022928.0, + "step": 2350 + }, + { + "epoch": 2.231269113149847, + "grad_norm": 0.3783810508075721, + "learning_rate": 1.2419214457084957e-06, + "loss": 0.3569, + "num_tokens": 967483627.0, + "step": 2351 + }, + { + "epoch": 2.231651376146789, + "grad_norm": 0.269147342832729, + "learning_rate": 1.240122228565162e-06, + "loss": 0.3622, + "num_tokens": 967928371.0, + "step": 2352 + }, + { + "epoch": 2.2320336391437308, + "grad_norm": 0.27442759091246816, + "learning_rate": 1.2383295435679845e-06, + "loss": 0.3327, + "num_tokens": 968354358.0, + "step": 2353 + }, + { + "epoch": 2.232415902140673, + "grad_norm": 0.23952720590378324, + "learning_rate": 1.2365433934658894e-06, + "loss": 0.3318, + "num_tokens": 968798125.0, + "step": 2354 + }, + { + "epoch": 2.2327981651376145, + "grad_norm": 0.2218412787475607, + "learning_rate": 1.2347637809977778e-06, + "loss": 0.3321, + "num_tokens": 969236754.0, + "step": 2355 + }, + { + "epoch": 2.2331804281345566, + "grad_norm": 0.24723712819115837, + "learning_rate": 1.2329907088925288e-06, + "loss": 0.3162, + "num_tokens": 969627410.0, + "step": 2356 + }, + { + "epoch": 2.2335626911314983, + "grad_norm": 0.23037465157581424, + "learning_rate": 1.2312241798689926e-06, + "loss": 0.347, + "num_tokens": 970024874.0, + "step": 2357 + }, + { + "epoch": 2.2339449541284404, + "grad_norm": 0.2341244909650061, + "learning_rate": 1.2294641966359847e-06, + "loss": 0.3174, + "num_tokens": 970432922.0, + "step": 2358 + }, + { + "epoch": 2.234327217125382, + "grad_norm": 0.22461168607791035, + "learning_rate": 1.2277107618922845e-06, + "loss": 0.3245, + "num_tokens": 970845536.0, + "step": 2359 + }, + { + "epoch": 2.234709480122324, + "grad_norm": 0.24555144689063618, + "learning_rate": 1.225963878326628e-06, + "loss": 0.3573, + "num_tokens": 971266305.0, + "step": 2360 + }, + { + "epoch": 2.2350917431192663, + "grad_norm": 0.2604432967209313, + "learning_rate": 1.2242235486177089e-06, + "loss": 0.322, + "num_tokens": 971639480.0, + "step": 2361 + }, + { + "epoch": 2.235474006116208, + "grad_norm": 0.257334344877142, + "learning_rate": 1.2224897754341664e-06, + "loss": 0.3327, + "num_tokens": 972048619.0, + "step": 2362 + }, + { + "epoch": 2.23585626911315, + "grad_norm": 0.2646046183525611, + "learning_rate": 1.2207625614345906e-06, + "loss": 0.3138, + "num_tokens": 972416765.0, + "step": 2363 + }, + { + "epoch": 2.2362385321100917, + "grad_norm": 0.24901280762033648, + "learning_rate": 1.2190419092675103e-06, + "loss": 0.3408, + "num_tokens": 972881739.0, + "step": 2364 + }, + { + "epoch": 2.236620795107034, + "grad_norm": 0.22882767739962545, + "learning_rate": 1.217327821571394e-06, + "loss": 0.3328, + "num_tokens": 973324590.0, + "step": 2365 + }, + { + "epoch": 2.2370030581039755, + "grad_norm": 0.24258467297047012, + "learning_rate": 1.2156203009746435e-06, + "loss": 0.3394, + "num_tokens": 973713531.0, + "step": 2366 + }, + { + "epoch": 2.2373853211009176, + "grad_norm": 0.24712962912409794, + "learning_rate": 1.2139193500955915e-06, + "loss": 0.319, + "num_tokens": 974152715.0, + "step": 2367 + }, + { + "epoch": 2.2377675840978593, + "grad_norm": 0.23290576817608108, + "learning_rate": 1.2122249715424946e-06, + "loss": 0.3337, + "num_tokens": 974560982.0, + "step": 2368 + }, + { + "epoch": 2.2381498470948014, + "grad_norm": 0.2638009822973547, + "learning_rate": 1.2105371679135347e-06, + "loss": 0.3515, + "num_tokens": 974985943.0, + "step": 2369 + }, + { + "epoch": 2.238532110091743, + "grad_norm": 0.23306091906775117, + "learning_rate": 1.208855941796807e-06, + "loss": 0.3415, + "num_tokens": 975390608.0, + "step": 2370 + }, + { + "epoch": 2.238914373088685, + "grad_norm": 0.2294760950209685, + "learning_rate": 1.207181295770325e-06, + "loss": 0.3132, + "num_tokens": 975770600.0, + "step": 2371 + }, + { + "epoch": 2.239296636085627, + "grad_norm": 0.24913543687894454, + "learning_rate": 1.2055132324020097e-06, + "loss": 0.3236, + "num_tokens": 976169200.0, + "step": 2372 + }, + { + "epoch": 2.239678899082569, + "grad_norm": 0.24559171160646293, + "learning_rate": 1.2038517542496887e-06, + "loss": 0.3314, + "num_tokens": 976583705.0, + "step": 2373 + }, + { + "epoch": 2.2400611620795106, + "grad_norm": 0.23009442171228991, + "learning_rate": 1.2021968638610923e-06, + "loss": 0.3413, + "num_tokens": 977002714.0, + "step": 2374 + }, + { + "epoch": 2.2404434250764527, + "grad_norm": 0.23805876172751517, + "learning_rate": 1.2005485637738485e-06, + "loss": 0.3511, + "num_tokens": 977458100.0, + "step": 2375 + }, + { + "epoch": 2.2408256880733943, + "grad_norm": 0.3550575328518845, + "learning_rate": 1.19890685651548e-06, + "loss": 0.3028, + "num_tokens": 977861529.0, + "step": 2376 + }, + { + "epoch": 2.2412079510703364, + "grad_norm": 0.24806454185425694, + "learning_rate": 1.1972717446033997e-06, + "loss": 0.3293, + "num_tokens": 978291280.0, + "step": 2377 + }, + { + "epoch": 2.241590214067278, + "grad_norm": 0.21584684248480998, + "learning_rate": 1.1956432305449083e-06, + "loss": 0.3226, + "num_tokens": 978741339.0, + "step": 2378 + }, + { + "epoch": 2.24197247706422, + "grad_norm": 0.23255449153082586, + "learning_rate": 1.1940213168371855e-06, + "loss": 0.3051, + "num_tokens": 979128725.0, + "step": 2379 + }, + { + "epoch": 2.2423547400611623, + "grad_norm": 0.23109287396888736, + "learning_rate": 1.1924060059672956e-06, + "loss": 0.3576, + "num_tokens": 979590770.0, + "step": 2380 + }, + { + "epoch": 2.242737003058104, + "grad_norm": 0.24246922535894574, + "learning_rate": 1.190797300412174e-06, + "loss": 0.3682, + "num_tokens": 980019713.0, + "step": 2381 + }, + { + "epoch": 2.243119266055046, + "grad_norm": 0.20591953580243422, + "learning_rate": 1.1891952026386274e-06, + "loss": 0.3161, + "num_tokens": 980427091.0, + "step": 2382 + }, + { + "epoch": 2.2435015290519877, + "grad_norm": 0.23444119629223706, + "learning_rate": 1.1875997151033323e-06, + "loss": 0.324, + "num_tokens": 980815242.0, + "step": 2383 + }, + { + "epoch": 2.24388379204893, + "grad_norm": 0.24435098789599358, + "learning_rate": 1.186010840252828e-06, + "loss": 0.3364, + "num_tokens": 981231859.0, + "step": 2384 + }, + { + "epoch": 2.2442660550458715, + "grad_norm": 0.2554346704101858, + "learning_rate": 1.184428580523514e-06, + "loss": 0.3457, + "num_tokens": 981638572.0, + "step": 2385 + }, + { + "epoch": 2.2446483180428136, + "grad_norm": 0.2184947627844497, + "learning_rate": 1.182852938341644e-06, + "loss": 0.3213, + "num_tokens": 982064729.0, + "step": 2386 + }, + { + "epoch": 2.2450305810397553, + "grad_norm": 0.2360576833754859, + "learning_rate": 1.1812839161233283e-06, + "loss": 0.3319, + "num_tokens": 982458530.0, + "step": 2387 + }, + { + "epoch": 2.2454128440366974, + "grad_norm": 0.2307922137214105, + "learning_rate": 1.1797215162745213e-06, + "loss": 0.3253, + "num_tokens": 982879093.0, + "step": 2388 + }, + { + "epoch": 2.245795107033639, + "grad_norm": 0.23752900972508847, + "learning_rate": 1.1781657411910283e-06, + "loss": 0.3269, + "num_tokens": 983289273.0, + "step": 2389 + }, + { + "epoch": 2.246177370030581, + "grad_norm": 0.24424483910199665, + "learning_rate": 1.1766165932584904e-06, + "loss": 0.3488, + "num_tokens": 983728007.0, + "step": 2390 + }, + { + "epoch": 2.246559633027523, + "grad_norm": 0.25929355296369944, + "learning_rate": 1.1750740748523895e-06, + "loss": 0.3568, + "num_tokens": 984160764.0, + "step": 2391 + }, + { + "epoch": 2.246941896024465, + "grad_norm": 0.21736834185746057, + "learning_rate": 1.173538188338042e-06, + "loss": 0.3279, + "num_tokens": 984595170.0, + "step": 2392 + }, + { + "epoch": 2.2473241590214066, + "grad_norm": 0.23243257558138927, + "learning_rate": 1.1720089360705938e-06, + "loss": 0.3063, + "num_tokens": 984973412.0, + "step": 2393 + }, + { + "epoch": 2.2477064220183487, + "grad_norm": 0.23678355239935087, + "learning_rate": 1.1704863203950187e-06, + "loss": 0.3482, + "num_tokens": 985403473.0, + "step": 2394 + }, + { + "epoch": 2.2480886850152904, + "grad_norm": 0.23438387545897227, + "learning_rate": 1.1689703436461121e-06, + "loss": 0.3246, + "num_tokens": 985763286.0, + "step": 2395 + }, + { + "epoch": 2.2484709480122325, + "grad_norm": 0.22719810461895487, + "learning_rate": 1.1674610081484913e-06, + "loss": 0.331, + "num_tokens": 986162901.0, + "step": 2396 + }, + { + "epoch": 2.248853211009174, + "grad_norm": 0.2442947173746423, + "learning_rate": 1.165958316216588e-06, + "loss": 0.3326, + "num_tokens": 986572469.0, + "step": 2397 + }, + { + "epoch": 2.2492354740061162, + "grad_norm": 0.22448501469058643, + "learning_rate": 1.1644622701546491e-06, + "loss": 0.3433, + "num_tokens": 987016079.0, + "step": 2398 + }, + { + "epoch": 2.2496177370030583, + "grad_norm": 0.22184255826468807, + "learning_rate": 1.1629728722567276e-06, + "loss": 0.3139, + "num_tokens": 987454798.0, + "step": 2399 + }, + { + "epoch": 2.25, + "grad_norm": 0.28746205382492435, + "learning_rate": 1.161490124806684e-06, + "loss": 0.3364, + "num_tokens": 987876376.0, + "step": 2400 + }, + { + "epoch": 2.2503822629969417, + "grad_norm": 0.24710237038320823, + "learning_rate": 1.160014030078181e-06, + "loss": 0.3286, + "num_tokens": 988277981.0, + "step": 2401 + }, + { + "epoch": 2.2507645259938838, + "grad_norm": 0.2267036496230456, + "learning_rate": 1.1585445903346784e-06, + "loss": 0.3232, + "num_tokens": 988665446.0, + "step": 2402 + }, + { + "epoch": 2.251146788990826, + "grad_norm": 0.22240446421589935, + "learning_rate": 1.1570818078294336e-06, + "loss": 0.3392, + "num_tokens": 989084252.0, + "step": 2403 + }, + { + "epoch": 2.2515290519877675, + "grad_norm": 0.2457982278835067, + "learning_rate": 1.1556256848054923e-06, + "loss": 0.3236, + "num_tokens": 989455382.0, + "step": 2404 + }, + { + "epoch": 2.2519113149847096, + "grad_norm": 0.21584865743447265, + "learning_rate": 1.1541762234956927e-06, + "loss": 0.3263, + "num_tokens": 989936412.0, + "step": 2405 + }, + { + "epoch": 2.2522935779816513, + "grad_norm": 0.2300170346912694, + "learning_rate": 1.1527334261226545e-06, + "loss": 0.3242, + "num_tokens": 990328515.0, + "step": 2406 + }, + { + "epoch": 2.2526758409785934, + "grad_norm": 0.23039095277682822, + "learning_rate": 1.1512972948987801e-06, + "loss": 0.3143, + "num_tokens": 990729837.0, + "step": 2407 + }, + { + "epoch": 2.253058103975535, + "grad_norm": 0.22492205125592585, + "learning_rate": 1.1498678320262497e-06, + "loss": 0.324, + "num_tokens": 991152825.0, + "step": 2408 + }, + { + "epoch": 2.253440366972477, + "grad_norm": 0.2253740070545322, + "learning_rate": 1.1484450396970186e-06, + "loss": 0.2986, + "num_tokens": 991557123.0, + "step": 2409 + }, + { + "epoch": 2.253822629969419, + "grad_norm": 0.250902049729847, + "learning_rate": 1.1470289200928129e-06, + "loss": 0.3532, + "num_tokens": 991953366.0, + "step": 2410 + }, + { + "epoch": 2.254204892966361, + "grad_norm": 0.2230592486884478, + "learning_rate": 1.1456194753851274e-06, + "loss": 0.3518, + "num_tokens": 992386224.0, + "step": 2411 + }, + { + "epoch": 2.2545871559633026, + "grad_norm": 0.2187880536890912, + "learning_rate": 1.1442167077352203e-06, + "loss": 0.3557, + "num_tokens": 992837037.0, + "step": 2412 + }, + { + "epoch": 2.2549694189602447, + "grad_norm": 0.21920680206903553, + "learning_rate": 1.1428206192941121e-06, + "loss": 0.3008, + "num_tokens": 993246358.0, + "step": 2413 + }, + { + "epoch": 2.2553516819571864, + "grad_norm": 0.2211077967432299, + "learning_rate": 1.1414312122025812e-06, + "loss": 0.3289, + "num_tokens": 993654027.0, + "step": 2414 + }, + { + "epoch": 2.2557339449541285, + "grad_norm": 0.23348801568444205, + "learning_rate": 1.1400484885911608e-06, + "loss": 0.3206, + "num_tokens": 994069700.0, + "step": 2415 + }, + { + "epoch": 2.25611620795107, + "grad_norm": 0.22417454627328656, + "learning_rate": 1.1386724505801348e-06, + "loss": 0.3357, + "num_tokens": 994516047.0, + "step": 2416 + }, + { + "epoch": 2.2564984709480123, + "grad_norm": 0.25749464587236665, + "learning_rate": 1.137303100279536e-06, + "loss": 0.3502, + "num_tokens": 994924423.0, + "step": 2417 + }, + { + "epoch": 2.2568807339449544, + "grad_norm": 0.21323452439042662, + "learning_rate": 1.1359404397891425e-06, + "loss": 0.3124, + "num_tokens": 995371367.0, + "step": 2418 + }, + { + "epoch": 2.257262996941896, + "grad_norm": 0.2626278791586827, + "learning_rate": 1.1345844711984736e-06, + "loss": 0.3444, + "num_tokens": 995744387.0, + "step": 2419 + }, + { + "epoch": 2.2576452599388377, + "grad_norm": 0.24647406811267328, + "learning_rate": 1.133235196586787e-06, + "loss": 0.3302, + "num_tokens": 996153554.0, + "step": 2420 + }, + { + "epoch": 2.25802752293578, + "grad_norm": 0.23851547349763427, + "learning_rate": 1.1318926180230768e-06, + "loss": 0.3128, + "num_tokens": 996511448.0, + "step": 2421 + }, + { + "epoch": 2.258409785932722, + "grad_norm": 0.2275516894358364, + "learning_rate": 1.130556737566068e-06, + "loss": 0.3336, + "num_tokens": 996931898.0, + "step": 2422 + }, + { + "epoch": 2.2587920489296636, + "grad_norm": 0.2513206093283021, + "learning_rate": 1.1292275572642152e-06, + "loss": 0.3414, + "num_tokens": 997322671.0, + "step": 2423 + }, + { + "epoch": 2.2591743119266057, + "grad_norm": 0.24373213840456173, + "learning_rate": 1.1279050791556998e-06, + "loss": 0.3274, + "num_tokens": 997694860.0, + "step": 2424 + }, + { + "epoch": 2.2595565749235473, + "grad_norm": 0.23072164853942825, + "learning_rate": 1.1265893052684239e-06, + "loss": 0.3349, + "num_tokens": 998133875.0, + "step": 2425 + }, + { + "epoch": 2.2599388379204894, + "grad_norm": 0.21236945439116897, + "learning_rate": 1.1252802376200108e-06, + "loss": 0.315, + "num_tokens": 998541531.0, + "step": 2426 + }, + { + "epoch": 2.260321100917431, + "grad_norm": 0.22694767316606934, + "learning_rate": 1.1239778782178005e-06, + "loss": 0.3251, + "num_tokens": 998928523.0, + "step": 2427 + }, + { + "epoch": 2.260703363914373, + "grad_norm": 0.24063553186959405, + "learning_rate": 1.1226822290588466e-06, + "loss": 0.3417, + "num_tokens": 999376294.0, + "step": 2428 + }, + { + "epoch": 2.261085626911315, + "grad_norm": 0.2220640093426646, + "learning_rate": 1.1213932921299111e-06, + "loss": 0.3601, + "num_tokens": 999846399.0, + "step": 2429 + }, + { + "epoch": 2.261467889908257, + "grad_norm": 0.2364277592625136, + "learning_rate": 1.1201110694074657e-06, + "loss": 0.3114, + "num_tokens": 1000228168.0, + "step": 2430 + }, + { + "epoch": 2.2618501529051986, + "grad_norm": 0.24034084515428292, + "learning_rate": 1.1188355628576863e-06, + "loss": 0.3277, + "num_tokens": 1000607465.0, + "step": 2431 + }, + { + "epoch": 2.2622324159021407, + "grad_norm": 0.22000277276288935, + "learning_rate": 1.1175667744364482e-06, + "loss": 0.3086, + "num_tokens": 1000994851.0, + "step": 2432 + }, + { + "epoch": 2.2626146788990824, + "grad_norm": 0.24137844973686678, + "learning_rate": 1.1163047060893274e-06, + "loss": 0.336, + "num_tokens": 1001369091.0, + "step": 2433 + }, + { + "epoch": 2.2629969418960245, + "grad_norm": 0.23021306253268806, + "learning_rate": 1.1150493597515936e-06, + "loss": 0.3443, + "num_tokens": 1001789787.0, + "step": 2434 + }, + { + "epoch": 2.263379204892966, + "grad_norm": 0.2161987935811964, + "learning_rate": 1.1138007373482098e-06, + "loss": 0.3124, + "num_tokens": 1002195521.0, + "step": 2435 + }, + { + "epoch": 2.2637614678899083, + "grad_norm": 0.22001589376340233, + "learning_rate": 1.1125588407938276e-06, + "loss": 0.3411, + "num_tokens": 1002618773.0, + "step": 2436 + }, + { + "epoch": 2.2641437308868504, + "grad_norm": 0.2279997990434341, + "learning_rate": 1.1113236719927858e-06, + "loss": 0.3416, + "num_tokens": 1003043090.0, + "step": 2437 + }, + { + "epoch": 2.264525993883792, + "grad_norm": 0.2291006277691492, + "learning_rate": 1.110095232839108e-06, + "loss": 0.3302, + "num_tokens": 1003480074.0, + "step": 2438 + }, + { + "epoch": 2.2649082568807337, + "grad_norm": 0.2545025363657803, + "learning_rate": 1.1088735252164943e-06, + "loss": 0.3315, + "num_tokens": 1003885410.0, + "step": 2439 + }, + { + "epoch": 2.265290519877676, + "grad_norm": 0.22325642009139007, + "learning_rate": 1.1076585509983285e-06, + "loss": 0.3401, + "num_tokens": 1004316971.0, + "step": 2440 + }, + { + "epoch": 2.265672782874618, + "grad_norm": 0.2619365379370404, + "learning_rate": 1.1064503120476633e-06, + "loss": 0.3521, + "num_tokens": 1004732315.0, + "step": 2441 + }, + { + "epoch": 2.2660550458715596, + "grad_norm": 0.22302334345209515, + "learning_rate": 1.1052488102172289e-06, + "loss": 0.3609, + "num_tokens": 1005173098.0, + "step": 2442 + }, + { + "epoch": 2.2664373088685017, + "grad_norm": 0.24858579207722525, + "learning_rate": 1.1040540473494204e-06, + "loss": 0.3465, + "num_tokens": 1005574375.0, + "step": 2443 + }, + { + "epoch": 2.2668195718654434, + "grad_norm": 0.22887026474639136, + "learning_rate": 1.1028660252763019e-06, + "loss": 0.3413, + "num_tokens": 1006016948.0, + "step": 2444 + }, + { + "epoch": 2.2672018348623855, + "grad_norm": 0.23490994931571146, + "learning_rate": 1.1016847458196e-06, + "loss": 0.3333, + "num_tokens": 1006453783.0, + "step": 2445 + }, + { + "epoch": 2.267584097859327, + "grad_norm": 0.22729442171332015, + "learning_rate": 1.100510210790703e-06, + "loss": 0.3305, + "num_tokens": 1006845590.0, + "step": 2446 + }, + { + "epoch": 2.2679663608562692, + "grad_norm": 0.23019563326392958, + "learning_rate": 1.099342421990656e-06, + "loss": 0.3091, + "num_tokens": 1007245712.0, + "step": 2447 + }, + { + "epoch": 2.268348623853211, + "grad_norm": 0.21654243132755285, + "learning_rate": 1.0981813812101597e-06, + "loss": 0.3156, + "num_tokens": 1007667320.0, + "step": 2448 + }, + { + "epoch": 2.268730886850153, + "grad_norm": 0.24294838728171128, + "learning_rate": 1.0970270902295682e-06, + "loss": 0.3074, + "num_tokens": 1008043766.0, + "step": 2449 + }, + { + "epoch": 2.2691131498470947, + "grad_norm": 0.23831900831720476, + "learning_rate": 1.0958795508188836e-06, + "loss": 0.3544, + "num_tokens": 1008519965.0, + "step": 2450 + }, + { + "epoch": 2.2694954128440368, + "grad_norm": 0.22209478460253104, + "learning_rate": 1.0947387647377577e-06, + "loss": 0.3528, + "num_tokens": 1008987545.0, + "step": 2451 + }, + { + "epoch": 2.2698776758409784, + "grad_norm": 0.2229096082060342, + "learning_rate": 1.0936047337354843e-06, + "loss": 0.3193, + "num_tokens": 1009416294.0, + "step": 2452 + }, + { + "epoch": 2.2702599388379205, + "grad_norm": 0.2454184509830498, + "learning_rate": 1.0924774595509998e-06, + "loss": 0.3435, + "num_tokens": 1009841014.0, + "step": 2453 + }, + { + "epoch": 2.270642201834862, + "grad_norm": 0.21828349370561898, + "learning_rate": 1.0913569439128803e-06, + "loss": 0.3336, + "num_tokens": 1010271123.0, + "step": 2454 + }, + { + "epoch": 2.2710244648318043, + "grad_norm": 0.25708198825260103, + "learning_rate": 1.0902431885393359e-06, + "loss": 0.3425, + "num_tokens": 1010702434.0, + "step": 2455 + }, + { + "epoch": 2.2714067278287464, + "grad_norm": 0.2654127500719497, + "learning_rate": 1.0891361951382135e-06, + "loss": 0.3465, + "num_tokens": 1011159442.0, + "step": 2456 + }, + { + "epoch": 2.271788990825688, + "grad_norm": 0.23541331898086507, + "learning_rate": 1.0880359654069887e-06, + "loss": 0.3259, + "num_tokens": 1011562457.0, + "step": 2457 + }, + { + "epoch": 2.2721712538226297, + "grad_norm": 0.22997285270085244, + "learning_rate": 1.0869425010327675e-06, + "loss": 0.3213, + "num_tokens": 1011990373.0, + "step": 2458 + }, + { + "epoch": 2.272553516819572, + "grad_norm": 0.23854453314317425, + "learning_rate": 1.0858558036922804e-06, + "loss": 0.3303, + "num_tokens": 1012386087.0, + "step": 2459 + }, + { + "epoch": 2.272935779816514, + "grad_norm": 0.22850580374894988, + "learning_rate": 1.0847758750518818e-06, + "loss": 0.3141, + "num_tokens": 1012760822.0, + "step": 2460 + }, + { + "epoch": 2.2733180428134556, + "grad_norm": 0.22177009226263714, + "learning_rate": 1.0837027167675467e-06, + "loss": 0.3088, + "num_tokens": 1013164595.0, + "step": 2461 + }, + { + "epoch": 2.2737003058103977, + "grad_norm": 0.2471851556767312, + "learning_rate": 1.0826363304848686e-06, + "loss": 0.3221, + "num_tokens": 1013521940.0, + "step": 2462 + }, + { + "epoch": 2.2740825688073394, + "grad_norm": 0.20770442392940838, + "learning_rate": 1.081576717839057e-06, + "loss": 0.2981, + "num_tokens": 1013927578.0, + "step": 2463 + }, + { + "epoch": 2.2744648318042815, + "grad_norm": 0.24233006861887452, + "learning_rate": 1.0805238804549334e-06, + "loss": 0.3355, + "num_tokens": 1014382863.0, + "step": 2464 + }, + { + "epoch": 2.274847094801223, + "grad_norm": 0.21310212218947608, + "learning_rate": 1.0794778199469321e-06, + "loss": 0.3017, + "num_tokens": 1014813702.0, + "step": 2465 + }, + { + "epoch": 2.2752293577981653, + "grad_norm": 0.22936044493682983, + "learning_rate": 1.0784385379190935e-06, + "loss": 0.3179, + "num_tokens": 1015192303.0, + "step": 2466 + }, + { + "epoch": 2.275611620795107, + "grad_norm": 0.23366685622894057, + "learning_rate": 1.077406035965065e-06, + "loss": 0.3205, + "num_tokens": 1015655158.0, + "step": 2467 + }, + { + "epoch": 2.275993883792049, + "grad_norm": 0.24503070845159775, + "learning_rate": 1.076380315668097e-06, + "loss": 0.3151, + "num_tokens": 1016044195.0, + "step": 2468 + }, + { + "epoch": 2.2763761467889907, + "grad_norm": 0.22621904277630284, + "learning_rate": 1.0753613786010414e-06, + "loss": 0.3269, + "num_tokens": 1016463210.0, + "step": 2469 + }, + { + "epoch": 2.276758409785933, + "grad_norm": 0.24239948071102563, + "learning_rate": 1.0743492263263481e-06, + "loss": 0.3217, + "num_tokens": 1016844339.0, + "step": 2470 + }, + { + "epoch": 2.2771406727828745, + "grad_norm": 0.24631978782728897, + "learning_rate": 1.0733438603960623e-06, + "loss": 0.3353, + "num_tokens": 1017261213.0, + "step": 2471 + }, + { + "epoch": 2.2775229357798166, + "grad_norm": 0.24916868586881602, + "learning_rate": 1.072345282351825e-06, + "loss": 0.3109, + "num_tokens": 1017615630.0, + "step": 2472 + }, + { + "epoch": 2.2779051987767582, + "grad_norm": 0.24592236476542634, + "learning_rate": 1.0713534937248669e-06, + "loss": 0.3242, + "num_tokens": 1018002242.0, + "step": 2473 + }, + { + "epoch": 2.2782874617737003, + "grad_norm": 0.24434138236353822, + "learning_rate": 1.0703684960360082e-06, + "loss": 0.3352, + "num_tokens": 1018422364.0, + "step": 2474 + }, + { + "epoch": 2.2786697247706424, + "grad_norm": 0.24275741522292824, + "learning_rate": 1.0693902907956555e-06, + "loss": 0.3236, + "num_tokens": 1018861185.0, + "step": 2475 + }, + { + "epoch": 2.279051987767584, + "grad_norm": 0.23454360157374696, + "learning_rate": 1.0684188795038004e-06, + "loss": 0.3139, + "num_tokens": 1019252082.0, + "step": 2476 + }, + { + "epoch": 2.2794342507645258, + "grad_norm": 0.24633318774513466, + "learning_rate": 1.0674542636500164e-06, + "loss": 0.3327, + "num_tokens": 1019667580.0, + "step": 2477 + }, + { + "epoch": 2.279816513761468, + "grad_norm": 0.2215715592344735, + "learning_rate": 1.066496444713457e-06, + "loss": 0.3339, + "num_tokens": 1020131513.0, + "step": 2478 + }, + { + "epoch": 2.28019877675841, + "grad_norm": 0.2369951451708204, + "learning_rate": 1.0655454241628516e-06, + "loss": 0.3422, + "num_tokens": 1020574200.0, + "step": 2479 + }, + { + "epoch": 2.2805810397553516, + "grad_norm": 0.20555307622572766, + "learning_rate": 1.0646012034565075e-06, + "loss": 0.3338, + "num_tokens": 1020999110.0, + "step": 2480 + }, + { + "epoch": 2.2809633027522938, + "grad_norm": 0.2386519993262111, + "learning_rate": 1.0636637840423036e-06, + "loss": 0.3361, + "num_tokens": 1021385574.0, + "step": 2481 + }, + { + "epoch": 2.2813455657492354, + "grad_norm": 0.2325576182407741, + "learning_rate": 1.0627331673576897e-06, + "loss": 0.3301, + "num_tokens": 1021785276.0, + "step": 2482 + }, + { + "epoch": 2.2817278287461775, + "grad_norm": 0.22632855578489075, + "learning_rate": 1.0618093548296832e-06, + "loss": 0.3282, + "num_tokens": 1022188038.0, + "step": 2483 + }, + { + "epoch": 2.282110091743119, + "grad_norm": 0.22207231420518883, + "learning_rate": 1.0608923478748704e-06, + "loss": 0.3348, + "num_tokens": 1022628191.0, + "step": 2484 + }, + { + "epoch": 2.2824923547400613, + "grad_norm": 0.21461671627297577, + "learning_rate": 1.0599821478993992e-06, + "loss": 0.3631, + "num_tokens": 1023122609.0, + "step": 2485 + }, + { + "epoch": 2.282874617737003, + "grad_norm": 0.2217188580276791, + "learning_rate": 1.0590787562989818e-06, + "loss": 0.3384, + "num_tokens": 1023568654.0, + "step": 2486 + }, + { + "epoch": 2.283256880733945, + "grad_norm": 0.2233008731424751, + "learning_rate": 1.058182174458889e-06, + "loss": 0.3168, + "num_tokens": 1023938994.0, + "step": 2487 + }, + { + "epoch": 2.2836391437308867, + "grad_norm": 0.237373223058795, + "learning_rate": 1.0572924037539496e-06, + "loss": 0.3256, + "num_tokens": 1024353695.0, + "step": 2488 + }, + { + "epoch": 2.284021406727829, + "grad_norm": 0.25126775459473333, + "learning_rate": 1.0564094455485487e-06, + "loss": 0.3318, + "num_tokens": 1024787294.0, + "step": 2489 + }, + { + "epoch": 2.2844036697247705, + "grad_norm": 0.216384087436434, + "learning_rate": 1.0555333011966248e-06, + "loss": 0.3341, + "num_tokens": 1025234003.0, + "step": 2490 + }, + { + "epoch": 2.2847859327217126, + "grad_norm": 0.24847245347434235, + "learning_rate": 1.054663972041668e-06, + "loss": 0.3277, + "num_tokens": 1025639493.0, + "step": 2491 + }, + { + "epoch": 2.2851681957186543, + "grad_norm": 0.24744861784953204, + "learning_rate": 1.0538014594167164e-06, + "loss": 0.3491, + "num_tokens": 1026080420.0, + "step": 2492 + }, + { + "epoch": 2.2855504587155964, + "grad_norm": 0.38082254245006325, + "learning_rate": 1.0529457646443592e-06, + "loss": 0.3432, + "num_tokens": 1026490095.0, + "step": 2493 + }, + { + "epoch": 2.2859327217125385, + "grad_norm": 0.23890051881340058, + "learning_rate": 1.0520968890367262e-06, + "loss": 0.3225, + "num_tokens": 1026890862.0, + "step": 2494 + }, + { + "epoch": 2.28631498470948, + "grad_norm": 0.24995117410668877, + "learning_rate": 1.051254833895495e-06, + "loss": 0.367, + "num_tokens": 1027349636.0, + "step": 2495 + }, + { + "epoch": 2.286697247706422, + "grad_norm": 0.261554029088189, + "learning_rate": 1.0504196005118822e-06, + "loss": 0.345, + "num_tokens": 1027802382.0, + "step": 2496 + }, + { + "epoch": 2.287079510703364, + "grad_norm": 0.22765795161334693, + "learning_rate": 1.049591190166644e-06, + "loss": 0.3202, + "num_tokens": 1028181081.0, + "step": 2497 + }, + { + "epoch": 2.287461773700306, + "grad_norm": 0.23888710057451729, + "learning_rate": 1.0487696041300751e-06, + "loss": 0.361, + "num_tokens": 1028616135.0, + "step": 2498 + }, + { + "epoch": 2.2878440366972477, + "grad_norm": 0.24111396839299082, + "learning_rate": 1.047954843662004e-06, + "loss": 0.3417, + "num_tokens": 1029013193.0, + "step": 2499 + }, + { + "epoch": 2.28822629969419, + "grad_norm": 0.25126342407327235, + "learning_rate": 1.0471469100117956e-06, + "loss": 0.3497, + "num_tokens": 1029433476.0, + "step": 2500 + }, + { + "epoch": 2.2886085626911314, + "grad_norm": 0.24075279450239317, + "learning_rate": 1.046345804418343e-06, + "loss": 0.3476, + "num_tokens": 1029829603.0, + "step": 2501 + }, + { + "epoch": 2.2889908256880735, + "grad_norm": 0.23261187753686255, + "learning_rate": 1.0455515281100723e-06, + "loss": 0.3352, + "num_tokens": 1030252571.0, + "step": 2502 + }, + { + "epoch": 2.289373088685015, + "grad_norm": 0.2431561018775167, + "learning_rate": 1.0447640823049351e-06, + "loss": 0.3286, + "num_tokens": 1030637896.0, + "step": 2503 + }, + { + "epoch": 2.2897553516819573, + "grad_norm": 0.24355417236149104, + "learning_rate": 1.0439834682104104e-06, + "loss": 0.3415, + "num_tokens": 1031059861.0, + "step": 2504 + }, + { + "epoch": 2.290137614678899, + "grad_norm": 0.2238937945937746, + "learning_rate": 1.0432096870235008e-06, + "loss": 0.3662, + "num_tokens": 1031525107.0, + "step": 2505 + }, + { + "epoch": 2.290519877675841, + "grad_norm": 0.23742281997755257, + "learning_rate": 1.0424427399307311e-06, + "loss": 0.3303, + "num_tokens": 1031886263.0, + "step": 2506 + }, + { + "epoch": 2.2909021406727827, + "grad_norm": 0.22219249894177853, + "learning_rate": 1.0416826281081475e-06, + "loss": 0.3428, + "num_tokens": 1032314464.0, + "step": 2507 + }, + { + "epoch": 2.291284403669725, + "grad_norm": 0.23859566227237639, + "learning_rate": 1.0409293527213138e-06, + "loss": 0.3469, + "num_tokens": 1032757202.0, + "step": 2508 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.22197096845323003, + "learning_rate": 1.0401829149253118e-06, + "loss": 0.3226, + "num_tokens": 1033169482.0, + "step": 2509 + }, + { + "epoch": 2.2920489296636086, + "grad_norm": 0.22648885510158728, + "learning_rate": 1.0394433158647366e-06, + "loss": 0.3375, + "num_tokens": 1033571935.0, + "step": 2510 + }, + { + "epoch": 2.2924311926605503, + "grad_norm": 0.23897672390103608, + "learning_rate": 1.0387105566736996e-06, + "loss": 0.349, + "num_tokens": 1034000569.0, + "step": 2511 + }, + { + "epoch": 2.2928134556574924, + "grad_norm": 0.22080956936944648, + "learning_rate": 1.0379846384758216e-06, + "loss": 0.3298, + "num_tokens": 1034413429.0, + "step": 2512 + }, + { + "epoch": 2.293195718654434, + "grad_norm": 0.230914093737077, + "learning_rate": 1.037265562384234e-06, + "loss": 0.3657, + "num_tokens": 1034850045.0, + "step": 2513 + }, + { + "epoch": 2.293577981651376, + "grad_norm": 0.23314237997544482, + "learning_rate": 1.0365533295015762e-06, + "loss": 0.3403, + "num_tokens": 1035261106.0, + "step": 2514 + }, + { + "epoch": 2.293960244648318, + "grad_norm": 0.23580618281791355, + "learning_rate": 1.0358479409199952e-06, + "loss": 0.3422, + "num_tokens": 1035660217.0, + "step": 2515 + }, + { + "epoch": 2.29434250764526, + "grad_norm": 0.22224157880256534, + "learning_rate": 1.0351493977211414e-06, + "loss": 0.3261, + "num_tokens": 1036028648.0, + "step": 2516 + }, + { + "epoch": 2.294724770642202, + "grad_norm": 0.22964886278721158, + "learning_rate": 1.0344577009761687e-06, + "loss": 0.3421, + "num_tokens": 1036443419.0, + "step": 2517 + }, + { + "epoch": 2.2951070336391437, + "grad_norm": 0.23296474146998342, + "learning_rate": 1.033772851745734e-06, + "loss": 0.3224, + "num_tokens": 1036830384.0, + "step": 2518 + }, + { + "epoch": 2.295489296636086, + "grad_norm": 0.23051644311855313, + "learning_rate": 1.0330948510799923e-06, + "loss": 0.3409, + "num_tokens": 1037237237.0, + "step": 2519 + }, + { + "epoch": 2.2958715596330275, + "grad_norm": 0.22462641573422928, + "learning_rate": 1.0324237000185984e-06, + "loss": 0.3582, + "num_tokens": 1037683155.0, + "step": 2520 + }, + { + "epoch": 2.2962538226299696, + "grad_norm": 0.2328542438143196, + "learning_rate": 1.0317593995907015e-06, + "loss": 0.3229, + "num_tokens": 1038056926.0, + "step": 2521 + }, + { + "epoch": 2.2966360856269112, + "grad_norm": 0.2409888331971136, + "learning_rate": 1.0311019508149495e-06, + "loss": 0.3364, + "num_tokens": 1038465137.0, + "step": 2522 + }, + { + "epoch": 2.2970183486238533, + "grad_norm": 0.2510625211342022, + "learning_rate": 1.0304513546994814e-06, + "loss": 0.3642, + "num_tokens": 1038893952.0, + "step": 2523 + }, + { + "epoch": 2.297400611620795, + "grad_norm": 0.24607363352809503, + "learning_rate": 1.0298076122419289e-06, + "loss": 0.3403, + "num_tokens": 1039301409.0, + "step": 2524 + }, + { + "epoch": 2.297782874617737, + "grad_norm": 0.25978387243465945, + "learning_rate": 1.0291707244294139e-06, + "loss": 0.3237, + "num_tokens": 1039680765.0, + "step": 2525 + }, + { + "epoch": 2.2981651376146788, + "grad_norm": 0.24181156677840934, + "learning_rate": 1.0285406922385473e-06, + "loss": 0.3359, + "num_tokens": 1040084249.0, + "step": 2526 + }, + { + "epoch": 2.298547400611621, + "grad_norm": 0.24329747091372597, + "learning_rate": 1.0279175166354286e-06, + "loss": 0.3224, + "num_tokens": 1040472359.0, + "step": 2527 + }, + { + "epoch": 2.2989296636085625, + "grad_norm": 0.24984109042275252, + "learning_rate": 1.0273011985756423e-06, + "loss": 0.3527, + "num_tokens": 1040897632.0, + "step": 2528 + }, + { + "epoch": 2.2993119266055047, + "grad_norm": 0.23180393663264445, + "learning_rate": 1.0266917390042572e-06, + "loss": 0.3202, + "num_tokens": 1041292888.0, + "step": 2529 + }, + { + "epoch": 2.2996941896024463, + "grad_norm": 0.22831126182005135, + "learning_rate": 1.0260891388558262e-06, + "loss": 0.3351, + "num_tokens": 1041712334.0, + "step": 2530 + }, + { + "epoch": 2.3000764525993884, + "grad_norm": 0.2298160715014218, + "learning_rate": 1.0254933990543832e-06, + "loss": 0.3306, + "num_tokens": 1042117428.0, + "step": 2531 + }, + { + "epoch": 2.30045871559633, + "grad_norm": 0.21852322629232612, + "learning_rate": 1.0249045205134426e-06, + "loss": 0.337, + "num_tokens": 1042528009.0, + "step": 2532 + }, + { + "epoch": 2.300840978593272, + "grad_norm": 0.29880010296834103, + "learning_rate": 1.0243225041359973e-06, + "loss": 0.346, + "num_tokens": 1042931173.0, + "step": 2533 + }, + { + "epoch": 2.301223241590214, + "grad_norm": 0.2524327301267761, + "learning_rate": 1.0237473508145184e-06, + "loss": 0.3396, + "num_tokens": 1043319130.0, + "step": 2534 + }, + { + "epoch": 2.301605504587156, + "grad_norm": 0.22399662240110776, + "learning_rate": 1.0231790614309528e-06, + "loss": 0.3375, + "num_tokens": 1043744223.0, + "step": 2535 + }, + { + "epoch": 2.301987767584098, + "grad_norm": 0.21996258757076784, + "learning_rate": 1.0226176368567209e-06, + "loss": 0.3273, + "num_tokens": 1044175435.0, + "step": 2536 + }, + { + "epoch": 2.3023700305810397, + "grad_norm": 0.22040390109681718, + "learning_rate": 1.0220630779527195e-06, + "loss": 0.3157, + "num_tokens": 1044589167.0, + "step": 2537 + }, + { + "epoch": 2.302752293577982, + "grad_norm": 0.2376891651149006, + "learning_rate": 1.021515385569314e-06, + "loss": 0.3214, + "num_tokens": 1044991372.0, + "step": 2538 + }, + { + "epoch": 2.3031345565749235, + "grad_norm": 0.22474595409268258, + "learning_rate": 1.0209745605463435e-06, + "loss": 0.3449, + "num_tokens": 1045432528.0, + "step": 2539 + }, + { + "epoch": 2.3035168195718656, + "grad_norm": 0.2262652063302646, + "learning_rate": 1.0204406037131151e-06, + "loss": 0.3389, + "num_tokens": 1045830732.0, + "step": 2540 + }, + { + "epoch": 2.3038990825688073, + "grad_norm": 0.22462505511446673, + "learning_rate": 1.0199135158884041e-06, + "loss": 0.3079, + "num_tokens": 1046241407.0, + "step": 2541 + }, + { + "epoch": 2.3042813455657494, + "grad_norm": 0.22350405477389867, + "learning_rate": 1.019393297880453e-06, + "loss": 0.3278, + "num_tokens": 1046635747.0, + "step": 2542 + }, + { + "epoch": 2.304663608562691, + "grad_norm": 0.2442778544216497, + "learning_rate": 1.0188799504869703e-06, + "loss": 0.3315, + "num_tokens": 1047026214.0, + "step": 2543 + }, + { + "epoch": 2.305045871559633, + "grad_norm": 0.22896654353288845, + "learning_rate": 1.0183734744951297e-06, + "loss": 0.3466, + "num_tokens": 1047449309.0, + "step": 2544 + }, + { + "epoch": 2.305428134556575, + "grad_norm": 0.244732032673048, + "learning_rate": 1.0178738706815656e-06, + "loss": 0.3532, + "num_tokens": 1047893710.0, + "step": 2545 + }, + { + "epoch": 2.305810397553517, + "grad_norm": 0.2474406684090097, + "learning_rate": 1.0173811398123782e-06, + "loss": 0.3445, + "num_tokens": 1048312289.0, + "step": 2546 + }, + { + "epoch": 2.3061926605504586, + "grad_norm": 0.24448324306281707, + "learning_rate": 1.0168952826431252e-06, + "loss": 0.3348, + "num_tokens": 1048770990.0, + "step": 2547 + }, + { + "epoch": 2.3065749235474007, + "grad_norm": 0.23737176881433894, + "learning_rate": 1.0164162999188269e-06, + "loss": 0.3522, + "num_tokens": 1049201273.0, + "step": 2548 + }, + { + "epoch": 2.3069571865443423, + "grad_norm": 0.23311706203338056, + "learning_rate": 1.0159441923739603e-06, + "loss": 0.2976, + "num_tokens": 1049572453.0, + "step": 2549 + }, + { + "epoch": 2.3073394495412844, + "grad_norm": 0.21908035996337083, + "learning_rate": 1.0154789607324605e-06, + "loss": 0.339, + "num_tokens": 1050007857.0, + "step": 2550 + }, + { + "epoch": 2.307721712538226, + "grad_norm": 0.22278710961037473, + "learning_rate": 1.0150206057077197e-06, + "loss": 0.3264, + "num_tokens": 1050401674.0, + "step": 2551 + }, + { + "epoch": 2.308103975535168, + "grad_norm": 0.22054604231911978, + "learning_rate": 1.014569128002584e-06, + "loss": 0.315, + "num_tokens": 1050759338.0, + "step": 2552 + }, + { + "epoch": 2.30848623853211, + "grad_norm": 0.22081096541220202, + "learning_rate": 1.0141245283093553e-06, + "loss": 0.3416, + "num_tokens": 1051181924.0, + "step": 2553 + }, + { + "epoch": 2.308868501529052, + "grad_norm": 0.2510745144149683, + "learning_rate": 1.0136868073097877e-06, + "loss": 0.3434, + "num_tokens": 1051586897.0, + "step": 2554 + }, + { + "epoch": 2.309250764525994, + "grad_norm": 0.21096862292524723, + "learning_rate": 1.0132559656750875e-06, + "loss": 0.3361, + "num_tokens": 1052041086.0, + "step": 2555 + }, + { + "epoch": 2.3096330275229358, + "grad_norm": 0.23509415857047336, + "learning_rate": 1.0128320040659124e-06, + "loss": 0.3259, + "num_tokens": 1052424988.0, + "step": 2556 + }, + { + "epoch": 2.310015290519878, + "grad_norm": 0.2212050536162352, + "learning_rate": 1.0124149231323704e-06, + "loss": 0.3202, + "num_tokens": 1052828435.0, + "step": 2557 + }, + { + "epoch": 2.3103975535168195, + "grad_norm": 0.23468803635063076, + "learning_rate": 1.0120047235140178e-06, + "loss": 0.3235, + "num_tokens": 1053212688.0, + "step": 2558 + }, + { + "epoch": 2.3107798165137616, + "grad_norm": 0.23189443243628696, + "learning_rate": 1.011601405839859e-06, + "loss": 0.346, + "num_tokens": 1053607501.0, + "step": 2559 + }, + { + "epoch": 2.3111620795107033, + "grad_norm": 0.21508531306556702, + "learning_rate": 1.0112049707283475e-06, + "loss": 0.3291, + "num_tokens": 1054016876.0, + "step": 2560 + }, + { + "epoch": 2.3115443425076454, + "grad_norm": 0.23227188638372645, + "learning_rate": 1.0108154187873804e-06, + "loss": 0.3429, + "num_tokens": 1054426560.0, + "step": 2561 + }, + { + "epoch": 2.311926605504587, + "grad_norm": 0.23612394657024277, + "learning_rate": 1.0104327506143014e-06, + "loss": 0.3401, + "num_tokens": 1054821614.0, + "step": 2562 + }, + { + "epoch": 2.312308868501529, + "grad_norm": 0.22920504353452664, + "learning_rate": 1.0100569667958982e-06, + "loss": 0.3008, + "num_tokens": 1055185063.0, + "step": 2563 + }, + { + "epoch": 2.312691131498471, + "grad_norm": 0.22824922786612356, + "learning_rate": 1.0096880679084025e-06, + "loss": 0.3367, + "num_tokens": 1055601955.0, + "step": 2564 + }, + { + "epoch": 2.313073394495413, + "grad_norm": 0.2526336087338982, + "learning_rate": 1.009326054517488e-06, + "loss": 0.3382, + "num_tokens": 1056034753.0, + "step": 2565 + }, + { + "epoch": 2.3134556574923546, + "grad_norm": 0.22288612631581622, + "learning_rate": 1.0089709271782696e-06, + "loss": 0.334, + "num_tokens": 1056465052.0, + "step": 2566 + }, + { + "epoch": 2.3138379204892967, + "grad_norm": 0.2294912223328932, + "learning_rate": 1.008622686435305e-06, + "loss": 0.3463, + "num_tokens": 1056862205.0, + "step": 2567 + }, + { + "epoch": 2.3142201834862384, + "grad_norm": 0.24227930904450967, + "learning_rate": 1.0082813328225893e-06, + "loss": 0.3584, + "num_tokens": 1057254898.0, + "step": 2568 + }, + { + "epoch": 2.3146024464831805, + "grad_norm": 0.23695038104521526, + "learning_rate": 1.0079468668635584e-06, + "loss": 0.3652, + "num_tokens": 1057661684.0, + "step": 2569 + }, + { + "epoch": 2.314984709480122, + "grad_norm": 0.2709236842675759, + "learning_rate": 1.007619289071087e-06, + "loss": 0.3518, + "num_tokens": 1058080642.0, + "step": 2570 + }, + { + "epoch": 2.3153669724770642, + "grad_norm": 0.22561965993818303, + "learning_rate": 1.007298599947486e-06, + "loss": 0.3259, + "num_tokens": 1058508776.0, + "step": 2571 + }, + { + "epoch": 2.315749235474006, + "grad_norm": 0.2509449265816071, + "learning_rate": 1.0069847999845041e-06, + "loss": 0.3713, + "num_tokens": 1058914029.0, + "step": 2572 + }, + { + "epoch": 2.316131498470948, + "grad_norm": 0.26544896502927867, + "learning_rate": 1.006677889663326e-06, + "loss": 0.323, + "num_tokens": 1059324833.0, + "step": 2573 + }, + { + "epoch": 2.31651376146789, + "grad_norm": 0.24484947257304102, + "learning_rate": 1.0063778694545715e-06, + "loss": 0.3327, + "num_tokens": 1059740828.0, + "step": 2574 + }, + { + "epoch": 2.316896024464832, + "grad_norm": 0.2482732124964759, + "learning_rate": 1.0060847398182944e-06, + "loss": 0.3242, + "num_tokens": 1060119510.0, + "step": 2575 + }, + { + "epoch": 2.317278287461774, + "grad_norm": 0.22607380741711375, + "learning_rate": 1.0057985012039843e-06, + "loss": 0.3324, + "num_tokens": 1060531012.0, + "step": 2576 + }, + { + "epoch": 2.3176605504587156, + "grad_norm": 0.23654994191862802, + "learning_rate": 1.0055191540505613e-06, + "loss": 0.3118, + "num_tokens": 1060904405.0, + "step": 2577 + }, + { + "epoch": 2.3180428134556577, + "grad_norm": 0.21409976253788449, + "learning_rate": 1.005246698786381e-06, + "loss": 0.3622, + "num_tokens": 1061371155.0, + "step": 2578 + }, + { + "epoch": 2.3184250764525993, + "grad_norm": 0.2309483797904588, + "learning_rate": 1.004981135829229e-06, + "loss": 0.345, + "num_tokens": 1061750695.0, + "step": 2579 + }, + { + "epoch": 2.3188073394495414, + "grad_norm": 0.2209584863444376, + "learning_rate": 1.004722465586322e-06, + "loss": 0.35, + "num_tokens": 1062175520.0, + "step": 2580 + }, + { + "epoch": 2.319189602446483, + "grad_norm": 0.23909379587847404, + "learning_rate": 1.004470688454309e-06, + "loss": 0.3392, + "num_tokens": 1062594570.0, + "step": 2581 + }, + { + "epoch": 2.319571865443425, + "grad_norm": 0.22906900138976263, + "learning_rate": 1.004225804819267e-06, + "loss": 0.3235, + "num_tokens": 1062997343.0, + "step": 2582 + }, + { + "epoch": 2.319954128440367, + "grad_norm": 0.20983776796263418, + "learning_rate": 1.003987815056704e-06, + "loss": 0.3265, + "num_tokens": 1063426924.0, + "step": 2583 + }, + { + "epoch": 2.320336391437309, + "grad_norm": 0.24436396259085938, + "learning_rate": 1.0037567195315564e-06, + "loss": 0.3423, + "num_tokens": 1063826773.0, + "step": 2584 + }, + { + "epoch": 2.3207186544342506, + "grad_norm": 0.23346624228085094, + "learning_rate": 1.0035325185981882e-06, + "loss": 0.34, + "num_tokens": 1064247442.0, + "step": 2585 + }, + { + "epoch": 2.3211009174311927, + "grad_norm": 0.2148594761325381, + "learning_rate": 1.0033152126003927e-06, + "loss": 0.3498, + "num_tokens": 1064660669.0, + "step": 2586 + }, + { + "epoch": 2.3214831804281344, + "grad_norm": 0.23159282734808287, + "learning_rate": 1.0031048018713885e-06, + "loss": 0.3354, + "num_tokens": 1065064433.0, + "step": 2587 + }, + { + "epoch": 2.3218654434250765, + "grad_norm": 0.22804428060431975, + "learning_rate": 1.0029012867338217e-06, + "loss": 0.3528, + "num_tokens": 1065494137.0, + "step": 2588 + }, + { + "epoch": 2.322247706422018, + "grad_norm": 0.22346420028074468, + "learning_rate": 1.0027046674997656e-06, + "loss": 0.3305, + "num_tokens": 1065890966.0, + "step": 2589 + }, + { + "epoch": 2.3226299694189603, + "grad_norm": 0.24306989857321962, + "learning_rate": 1.002514944470718e-06, + "loss": 0.3341, + "num_tokens": 1066316783.0, + "step": 2590 + }, + { + "epoch": 2.323012232415902, + "grad_norm": 0.22381326387345193, + "learning_rate": 1.0023321179376029e-06, + "loss": 0.3483, + "num_tokens": 1066743816.0, + "step": 2591 + }, + { + "epoch": 2.323394495412844, + "grad_norm": 0.2420401955009923, + "learning_rate": 1.0021561881807676e-06, + "loss": 0.3263, + "num_tokens": 1067117888.0, + "step": 2592 + }, + { + "epoch": 2.323776758409786, + "grad_norm": 0.23509563537049455, + "learning_rate": 1.0019871554699855e-06, + "loss": 0.3393, + "num_tokens": 1067519371.0, + "step": 2593 + }, + { + "epoch": 2.324159021406728, + "grad_norm": 0.23558611403571264, + "learning_rate": 1.001825020064453e-06, + "loss": 0.3238, + "num_tokens": 1067950301.0, + "step": 2594 + }, + { + "epoch": 2.32454128440367, + "grad_norm": 0.21200595149700782, + "learning_rate": 1.0016697822127914e-06, + "loss": 0.3397, + "num_tokens": 1068386513.0, + "step": 2595 + }, + { + "epoch": 2.3249235474006116, + "grad_norm": 0.2243911394866069, + "learning_rate": 1.001521442153043e-06, + "loss": 0.3473, + "num_tokens": 1068804684.0, + "step": 2596 + }, + { + "epoch": 2.3253058103975537, + "grad_norm": 0.24157224102672215, + "learning_rate": 1.001380000112675e-06, + "loss": 0.3275, + "num_tokens": 1069166337.0, + "step": 2597 + }, + { + "epoch": 2.3256880733944953, + "grad_norm": 0.22273570512325785, + "learning_rate": 1.0012454563085758e-06, + "loss": 0.3393, + "num_tokens": 1069592508.0, + "step": 2598 + }, + { + "epoch": 2.3260703363914375, + "grad_norm": 0.2234569969951848, + "learning_rate": 1.0011178109470566e-06, + "loss": 0.327, + "num_tokens": 1070009583.0, + "step": 2599 + }, + { + "epoch": 2.326452599388379, + "grad_norm": 0.21476705658140316, + "learning_rate": 1.00099706422385e-06, + "loss": 0.3356, + "num_tokens": 1070445970.0, + "step": 2600 + }, + { + "epoch": 2.3268348623853212, + "grad_norm": 0.23114710206420533, + "learning_rate": 1.000883216324111e-06, + "loss": 0.3208, + "num_tokens": 1070874120.0, + "step": 2601 + }, + { + "epoch": 2.327217125382263, + "grad_norm": 0.22901004329693428, + "learning_rate": 1.0007762674224153e-06, + "loss": 0.3358, + "num_tokens": 1071312881.0, + "step": 2602 + }, + { + "epoch": 2.327599388379205, + "grad_norm": 0.25549508600903154, + "learning_rate": 1.0006762176827586e-06, + "loss": 0.3363, + "num_tokens": 1071693968.0, + "step": 2603 + }, + { + "epoch": 2.3279816513761467, + "grad_norm": 0.23640053974141, + "learning_rate": 1.0005830672585594e-06, + "loss": 0.3412, + "num_tokens": 1072085740.0, + "step": 2604 + }, + { + "epoch": 2.3283639143730888, + "grad_norm": 0.2323327589925819, + "learning_rate": 1.000496816292655e-06, + "loss": 0.3189, + "num_tokens": 1072474892.0, + "step": 2605 + }, + { + "epoch": 2.3287461773700304, + "grad_norm": 0.2466367038767514, + "learning_rate": 1.000417464917304e-06, + "loss": 0.3368, + "num_tokens": 1072888114.0, + "step": 2606 + }, + { + "epoch": 2.3291284403669725, + "grad_norm": 0.22164749243531737, + "learning_rate": 1.0003450132541852e-06, + "loss": 0.3302, + "num_tokens": 1073317816.0, + "step": 2607 + }, + { + "epoch": 2.329510703363914, + "grad_norm": 0.2351053279203124, + "learning_rate": 1.0002794614143953e-06, + "loss": 0.3388, + "num_tokens": 1073693086.0, + "step": 2608 + }, + { + "epoch": 2.3298929663608563, + "grad_norm": 0.26298485086037265, + "learning_rate": 1.0002208094984539e-06, + "loss": 0.3607, + "num_tokens": 1074126288.0, + "step": 2609 + }, + { + "epoch": 2.330275229357798, + "grad_norm": 0.2490011840136058, + "learning_rate": 1.0001690575962977e-06, + "loss": 0.3431, + "num_tokens": 1074520551.0, + "step": 2610 + }, + { + "epoch": 2.33065749235474, + "grad_norm": 0.22911822715821784, + "learning_rate": 1.0001242057872839e-06, + "loss": 0.3392, + "num_tokens": 1074922093.0, + "step": 2611 + }, + { + "epoch": 2.331039755351682, + "grad_norm": 0.22918798351783345, + "learning_rate": 1.0000862541401885e-06, + "loss": 0.329, + "num_tokens": 1075331902.0, + "step": 2612 + }, + { + "epoch": 2.331422018348624, + "grad_norm": 0.22857812030033325, + "learning_rate": 1.0000552027132067e-06, + "loss": 0.3287, + "num_tokens": 1075746059.0, + "step": 2613 + }, + { + "epoch": 2.331804281345566, + "grad_norm": 0.23829378961735942, + "learning_rate": 1.0000310515539543e-06, + "loss": 0.3738, + "num_tokens": 1076161114.0, + "step": 2614 + }, + { + "epoch": 2.3321865443425076, + "grad_norm": 0.314099427435107, + "learning_rate": 1.000013800699464e-06, + "loss": 0.3384, + "num_tokens": 1076600445.0, + "step": 2615 + }, + { + "epoch": 2.3325688073394497, + "grad_norm": 0.25738694977595783, + "learning_rate": 1.000003450176189e-06, + "loss": 0.3525, + "num_tokens": 1077027632.0, + "step": 2616 + } + ], + "logging_steps": 1, + "max_steps": 2616, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5212247128866816.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}