| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 1476, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0006777363605557439, |
| "grad_norm": 215.31928086277176, |
| "learning_rate": 2e-06, |
| "loss": 10.8319, |
| "mean_token_accuracy": 0.22483410313725471, |
| "num_tokens": 152858.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0013554727211114877, |
| "grad_norm": 175.0501040640227, |
| "learning_rate": 1.9986449864498644e-06, |
| "loss": 10.2378, |
| "mean_token_accuracy": 0.24036189168691635, |
| "num_tokens": 300109.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0020332090816672314, |
| "grad_norm": 199.45107221580295, |
| "learning_rate": 1.997289972899729e-06, |
| "loss": 9.9628, |
| "mean_token_accuracy": 0.2516961731016636, |
| "num_tokens": 451536.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0027109454422229754, |
| "grad_norm": 276.70813993268683, |
| "learning_rate": 1.9959349593495935e-06, |
| "loss": 9.6748, |
| "mean_token_accuracy": 0.25994390062987804, |
| "num_tokens": 601536.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.003388681802778719, |
| "grad_norm": 327.52259849403754, |
| "learning_rate": 1.994579945799458e-06, |
| "loss": 8.4163, |
| "mean_token_accuracy": 0.28762012347579, |
| "num_tokens": 751272.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.004066418163334463, |
| "grad_norm": 161.92592352250892, |
| "learning_rate": 1.9932249322493225e-06, |
| "loss": 8.3087, |
| "mean_token_accuracy": 0.29531916975975037, |
| "num_tokens": 899935.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.004744154523890207, |
| "grad_norm": 243.14567162148077, |
| "learning_rate": 1.991869918699187e-06, |
| "loss": 7.9566, |
| "mean_token_accuracy": 0.3023368716239929, |
| "num_tokens": 1046913.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.005421890884445951, |
| "grad_norm": 176.82871333029541, |
| "learning_rate": 1.990514905149051e-06, |
| "loss": 8.108, |
| "mean_token_accuracy": 0.2935122326016426, |
| "num_tokens": 1201554.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.006099627245001694, |
| "grad_norm": 113.28849091621012, |
| "learning_rate": 1.9891598915989156e-06, |
| "loss": 6.8465, |
| "mean_token_accuracy": 0.3109285309910774, |
| "num_tokens": 1349568.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.006777363605557438, |
| "grad_norm": 254.49554351760855, |
| "learning_rate": 1.9878048780487806e-06, |
| "loss": 6.3518, |
| "mean_token_accuracy": 0.3181378021836281, |
| "num_tokens": 1497459.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.007455099966113182, |
| "grad_norm": 83.50614405450463, |
| "learning_rate": 1.986449864498645e-06, |
| "loss": 6.2805, |
| "mean_token_accuracy": 0.3154120370745659, |
| "num_tokens": 1646573.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.008132836326668925, |
| "grad_norm": 86.39368529977068, |
| "learning_rate": 1.9850948509485096e-06, |
| "loss": 6.145, |
| "mean_token_accuracy": 0.31414467468857765, |
| "num_tokens": 1792422.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.00881057268722467, |
| "grad_norm": 119.19627809594022, |
| "learning_rate": 1.9837398373983737e-06, |
| "loss": 6.0264, |
| "mean_token_accuracy": 0.3126797378063202, |
| "num_tokens": 1940761.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.009488309047780414, |
| "grad_norm": 73.76070751225097, |
| "learning_rate": 1.9823848238482382e-06, |
| "loss": 5.6976, |
| "mean_token_accuracy": 0.32139725238084793, |
| "num_tokens": 2088013.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.010166045408336157, |
| "grad_norm": 96.93314452315771, |
| "learning_rate": 1.9810298102981028e-06, |
| "loss": 5.7813, |
| "mean_token_accuracy": 0.30942872911691666, |
| "num_tokens": 2239304.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.010843781768891902, |
| "grad_norm": 166.31622316662109, |
| "learning_rate": 1.9796747967479673e-06, |
| "loss": 5.623, |
| "mean_token_accuracy": 0.3139219619333744, |
| "num_tokens": 2387468.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.011521518129447645, |
| "grad_norm": 66.0493117054842, |
| "learning_rate": 1.978319783197832e-06, |
| "loss": 5.282, |
| "mean_token_accuracy": 0.32881826534867287, |
| "num_tokens": 2535035.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.012199254490003388, |
| "grad_norm": 64.22483298490378, |
| "learning_rate": 1.9769647696476963e-06, |
| "loss": 5.0493, |
| "mean_token_accuracy": 0.3398439697921276, |
| "num_tokens": 2680182.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.012876990850559133, |
| "grad_norm": 75.86806695541563, |
| "learning_rate": 1.975609756097561e-06, |
| "loss": 5.0178, |
| "mean_token_accuracy": 0.34127549827098846, |
| "num_tokens": 2827956.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.013554727211114876, |
| "grad_norm": 77.56337630320212, |
| "learning_rate": 1.9742547425474254e-06, |
| "loss": 4.9786, |
| "mean_token_accuracy": 0.34324514865875244, |
| "num_tokens": 2976812.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.01423246357167062, |
| "grad_norm": 77.81656184616301, |
| "learning_rate": 1.97289972899729e-06, |
| "loss": 4.7973, |
| "mean_token_accuracy": 0.351806353777647, |
| "num_tokens": 3125150.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.014910199932226365, |
| "grad_norm": 80.89049380537493, |
| "learning_rate": 1.9715447154471544e-06, |
| "loss": 4.7448, |
| "mean_token_accuracy": 0.3536057360470295, |
| "num_tokens": 3272181.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.015587936292782108, |
| "grad_norm": 85.30272719046583, |
| "learning_rate": 1.970189701897019e-06, |
| "loss": 4.6602, |
| "mean_token_accuracy": 0.3579399660229683, |
| "num_tokens": 3421575.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.01626567265333785, |
| "grad_norm": 89.74092507095774, |
| "learning_rate": 1.9688346883468834e-06, |
| "loss": 4.5848, |
| "mean_token_accuracy": 0.36106717213988304, |
| "num_tokens": 3570850.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.016943409013893594, |
| "grad_norm": 92.46296984633125, |
| "learning_rate": 1.967479674796748e-06, |
| "loss": 4.4245, |
| "mean_token_accuracy": 0.37010491639375687, |
| "num_tokens": 3720110.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.01762114537444934, |
| "grad_norm": 99.31637778401333, |
| "learning_rate": 1.9661246612466125e-06, |
| "loss": 4.3483, |
| "mean_token_accuracy": 0.37695401161909103, |
| "num_tokens": 3867948.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.018298881735005084, |
| "grad_norm": 107.76203073450068, |
| "learning_rate": 1.964769647696477e-06, |
| "loss": 4.3199, |
| "mean_token_accuracy": 0.37547387182712555, |
| "num_tokens": 4016986.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.018976618095560827, |
| "grad_norm": 110.05813690039194, |
| "learning_rate": 1.9634146341463415e-06, |
| "loss": 4.1004, |
| "mean_token_accuracy": 0.39141304790973663, |
| "num_tokens": 4162483.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.01965435445611657, |
| "grad_norm": 122.62831228475048, |
| "learning_rate": 1.962059620596206e-06, |
| "loss": 4.1895, |
| "mean_token_accuracy": 0.3831036686897278, |
| "num_tokens": 4311489.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.020332090816672314, |
| "grad_norm": 124.97452075345946, |
| "learning_rate": 1.9607046070460706e-06, |
| "loss": 4.1468, |
| "mean_token_accuracy": 0.38225793465971947, |
| "num_tokens": 4464098.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.021009827177228057, |
| "grad_norm": 124.1659063102063, |
| "learning_rate": 1.9593495934959347e-06, |
| "loss": 4.006, |
| "mean_token_accuracy": 0.3971566930413246, |
| "num_tokens": 4611556.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.021687563537783804, |
| "grad_norm": 125.00815818704322, |
| "learning_rate": 1.957994579945799e-06, |
| "loss": 3.9493, |
| "mean_token_accuracy": 0.40315019339323044, |
| "num_tokens": 4760310.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.022365299898339547, |
| "grad_norm": 122.07591947126842, |
| "learning_rate": 1.9566395663956637e-06, |
| "loss": 3.7965, |
| "mean_token_accuracy": 0.4153985045850277, |
| "num_tokens": 4906715.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.02304303625889529, |
| "grad_norm": 126.74800270999756, |
| "learning_rate": 1.955284552845528e-06, |
| "loss": 3.8472, |
| "mean_token_accuracy": 0.4084292873740196, |
| "num_tokens": 5056083.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.023720772619451033, |
| "grad_norm": 125.49345325581528, |
| "learning_rate": 1.953929539295393e-06, |
| "loss": 3.7707, |
| "mean_token_accuracy": 0.4145648442208767, |
| "num_tokens": 5205556.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.024398508980006776, |
| "grad_norm": 130.54186032107705, |
| "learning_rate": 1.9525745257452573e-06, |
| "loss": 3.8357, |
| "mean_token_accuracy": 0.40865280851721764, |
| "num_tokens": 5358374.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.02507624534056252, |
| "grad_norm": 123.00299156664755, |
| "learning_rate": 1.9512195121951218e-06, |
| "loss": 3.6205, |
| "mean_token_accuracy": 0.4278510734438896, |
| "num_tokens": 5504445.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.025753981701118266, |
| "grad_norm": 124.8763734549625, |
| "learning_rate": 1.9498644986449863e-06, |
| "loss": 3.6099, |
| "mean_token_accuracy": 0.4292087107896805, |
| "num_tokens": 5655196.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.02643171806167401, |
| "grad_norm": 122.93711702855603, |
| "learning_rate": 1.948509485094851e-06, |
| "loss": 3.5454, |
| "mean_token_accuracy": 0.4364416375756264, |
| "num_tokens": 5804319.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.027109454422229753, |
| "grad_norm": 127.34644717949502, |
| "learning_rate": 1.9471544715447153e-06, |
| "loss": 3.5861, |
| "mean_token_accuracy": 0.431281503289938, |
| "num_tokens": 5956589.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.027787190782785496, |
| "grad_norm": 120.67004246260448, |
| "learning_rate": 1.94579945799458e-06, |
| "loss": 3.4181, |
| "mean_token_accuracy": 0.4462522640824318, |
| "num_tokens": 6104451.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.02846492714334124, |
| "grad_norm": 124.66691702087084, |
| "learning_rate": 1.9444444444444444e-06, |
| "loss": 3.4574, |
| "mean_token_accuracy": 0.44873106479644775, |
| "num_tokens": 6253336.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.029142663503896982, |
| "grad_norm": 127.90260387508701, |
| "learning_rate": 1.943089430894309e-06, |
| "loss": 3.4951, |
| "mean_token_accuracy": 0.44188638776540756, |
| "num_tokens": 6405404.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.02982039986445273, |
| "grad_norm": 122.09081961349227, |
| "learning_rate": 1.9417344173441734e-06, |
| "loss": 3.3221, |
| "mean_token_accuracy": 0.4585568234324455, |
| "num_tokens": 6553474.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.030498136225008472, |
| "grad_norm": 122.73792960023344, |
| "learning_rate": 1.940379403794038e-06, |
| "loss": 3.3127, |
| "mean_token_accuracy": 0.4573376663029194, |
| "num_tokens": 6700108.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.031175872585564215, |
| "grad_norm": 122.62720593333567, |
| "learning_rate": 1.9390243902439024e-06, |
| "loss": 3.2842, |
| "mean_token_accuracy": 0.45810314640402794, |
| "num_tokens": 6848463.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.03185360894611996, |
| "grad_norm": 121.2286239246027, |
| "learning_rate": 1.937669376693767e-06, |
| "loss": 3.2317, |
| "mean_token_accuracy": 0.46193112805485725, |
| "num_tokens": 6998690.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.0325313453066757, |
| "grad_norm": 123.66939663555446, |
| "learning_rate": 1.9363143631436315e-06, |
| "loss": 3.2559, |
| "mean_token_accuracy": 0.45676320046186447, |
| "num_tokens": 7149009.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.033209081667231445, |
| "grad_norm": 120.83625724001882, |
| "learning_rate": 1.934959349593496e-06, |
| "loss": 3.1631, |
| "mean_token_accuracy": 0.4640156216919422, |
| "num_tokens": 7296015.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.03388681802778719, |
| "grad_norm": 115.86241664463016, |
| "learning_rate": 1.93360433604336e-06, |
| "loss": 3.0535, |
| "mean_token_accuracy": 0.47255614027380943, |
| "num_tokens": 7440154.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03456455438834293, |
| "grad_norm": 120.77679179022189, |
| "learning_rate": 1.9322493224932246e-06, |
| "loss": 3.1193, |
| "mean_token_accuracy": 0.46199216321110725, |
| "num_tokens": 7589986.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.03524229074889868, |
| "grad_norm": 119.75325748406044, |
| "learning_rate": 1.9308943089430896e-06, |
| "loss": 3.0699, |
| "mean_token_accuracy": 0.4629558362066746, |
| "num_tokens": 7738759.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.035920027109454425, |
| "grad_norm": 122.81516564789821, |
| "learning_rate": 1.929539295392954e-06, |
| "loss": 3.0929, |
| "mean_token_accuracy": 0.4590213857591152, |
| "num_tokens": 7888442.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.03659776347001017, |
| "grad_norm": 116.60973705379047, |
| "learning_rate": 1.9281842818428186e-06, |
| "loss": 2.9478, |
| "mean_token_accuracy": 0.47187361493706703, |
| "num_tokens": 8035766.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.03727549983056591, |
| "grad_norm": 122.05879455164165, |
| "learning_rate": 1.9268292682926827e-06, |
| "loss": 3.0327, |
| "mean_token_accuracy": 0.45789875090122223, |
| "num_tokens": 8187621.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.037953236191121655, |
| "grad_norm": 121.61169807853662, |
| "learning_rate": 1.9254742547425472e-06, |
| "loss": 3.0006, |
| "mean_token_accuracy": 0.45924459397792816, |
| "num_tokens": 8336909.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.0386309725516774, |
| "grad_norm": 117.02544106479719, |
| "learning_rate": 1.9241192411924117e-06, |
| "loss": 2.8853, |
| "mean_token_accuracy": 0.46758873015642166, |
| "num_tokens": 8484120.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.03930870891223314, |
| "grad_norm": 118.50453238861088, |
| "learning_rate": 1.9227642276422763e-06, |
| "loss": 2.8773, |
| "mean_token_accuracy": 0.4679280035197735, |
| "num_tokens": 8632765.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.039986445272788884, |
| "grad_norm": 119.75315383821146, |
| "learning_rate": 1.9214092140921408e-06, |
| "loss": 2.8609, |
| "mean_token_accuracy": 0.46564289554953575, |
| "num_tokens": 8780158.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.04066418163334463, |
| "grad_norm": 121.82700024099637, |
| "learning_rate": 1.9200542005420053e-06, |
| "loss": 2.8858, |
| "mean_token_accuracy": 0.45738009735941887, |
| "num_tokens": 8931674.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.04134191799390037, |
| "grad_norm": 120.37404301611848, |
| "learning_rate": 1.91869918699187e-06, |
| "loss": 2.8305, |
| "mean_token_accuracy": 0.4612518399953842, |
| "num_tokens": 9083330.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.042019654354456114, |
| "grad_norm": 120.55110506672673, |
| "learning_rate": 1.9173441734417343e-06, |
| "loss": 2.7999, |
| "mean_token_accuracy": 0.46009667590260506, |
| "num_tokens": 9234569.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.04269739071501186, |
| "grad_norm": 120.16994369985942, |
| "learning_rate": 1.915989159891599e-06, |
| "loss": 2.7573, |
| "mean_token_accuracy": 0.4637075141072273, |
| "num_tokens": 9383301.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.04337512707556761, |
| "grad_norm": 123.56031740474211, |
| "learning_rate": 1.9146341463414634e-06, |
| "loss": 2.7766, |
| "mean_token_accuracy": 0.45710835233330727, |
| "num_tokens": 9535886.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.04405286343612335, |
| "grad_norm": 122.98514880561626, |
| "learning_rate": 1.913279132791328e-06, |
| "loss": 2.7324, |
| "mean_token_accuracy": 0.4671022370457649, |
| "num_tokens": 9687879.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.044730599796679094, |
| "grad_norm": 125.43736740720358, |
| "learning_rate": 1.9119241192411924e-06, |
| "loss": 2.7436, |
| "mean_token_accuracy": 0.47459762170910835, |
| "num_tokens": 9842571.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.04540833615723484, |
| "grad_norm": 114.11746508121755, |
| "learning_rate": 1.910569105691057e-06, |
| "loss": 2.5244, |
| "mean_token_accuracy": 0.48580894619226456, |
| "num_tokens": 9987540.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.04608607251779058, |
| "grad_norm": 121.82433617300515, |
| "learning_rate": 1.909214092140921e-06, |
| "loss": 2.6134, |
| "mean_token_accuracy": 0.4700146056711674, |
| "num_tokens": 10138378.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.04676380887834632, |
| "grad_norm": 120.89387496367533, |
| "learning_rate": 1.907859078590786e-06, |
| "loss": 2.569, |
| "mean_token_accuracy": 0.511335089802742, |
| "num_tokens": 10287979.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.047441545238902066, |
| "grad_norm": 122.26976503869227, |
| "learning_rate": 1.9065040650406503e-06, |
| "loss": 2.5595, |
| "mean_token_accuracy": 0.5703656449913979, |
| "num_tokens": 10438939.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.04811928159945781, |
| "grad_norm": 119.11292329257235, |
| "learning_rate": 1.9051490514905148e-06, |
| "loss": 2.4608, |
| "mean_token_accuracy": 0.4872521534562111, |
| "num_tokens": 10586175.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.04879701796001355, |
| "grad_norm": 120.66894079757309, |
| "learning_rate": 1.9037940379403793e-06, |
| "loss": 2.4675, |
| "mean_token_accuracy": 0.5670045763254166, |
| "num_tokens": 10735195.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.049474754320569296, |
| "grad_norm": 126.57059301982753, |
| "learning_rate": 1.9024390243902436e-06, |
| "loss": 2.5366, |
| "mean_token_accuracy": 0.6708709970116615, |
| "num_tokens": 10890747.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.05015249068112504, |
| "grad_norm": 119.11823841769932, |
| "learning_rate": 1.9010840108401084e-06, |
| "loss": 2.3787, |
| "mean_token_accuracy": 0.5503224208950996, |
| "num_tokens": 11038790.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.05083022704168079, |
| "grad_norm": 120.87828788657913, |
| "learning_rate": 1.8997289972899729e-06, |
| "loss": 2.3944, |
| "mean_token_accuracy": 0.7692296281456947, |
| "num_tokens": 11188775.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.05150796340223653, |
| "grad_norm": 118.25074374900122, |
| "learning_rate": 1.8983739837398374e-06, |
| "loss": 2.3129, |
| "mean_token_accuracy": 0.7943554669618607, |
| "num_tokens": 11336216.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.052185699762792276, |
| "grad_norm": 119.89244525861773, |
| "learning_rate": 1.897018970189702e-06, |
| "loss": 2.3178, |
| "mean_token_accuracy": 0.8039621710777283, |
| "num_tokens": 11487643.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.05286343612334802, |
| "grad_norm": 117.61570660235944, |
| "learning_rate": 1.8956639566395662e-06, |
| "loss": 2.2518, |
| "mean_token_accuracy": 0.8842473700642586, |
| "num_tokens": 11636350.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.05354117248390376, |
| "grad_norm": 119.18861910334302, |
| "learning_rate": 1.8943089430894307e-06, |
| "loss": 2.2523, |
| "mean_token_accuracy": 0.9015108346939087, |
| "num_tokens": 11786908.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.054218908844459505, |
| "grad_norm": 115.79762500453249, |
| "learning_rate": 1.8929539295392953e-06, |
| "loss": 2.1631, |
| "mean_token_accuracy": 0.8935829252004623, |
| "num_tokens": 11932736.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.05489664520501525, |
| "grad_norm": 119.09989693599702, |
| "learning_rate": 1.8915989159891598e-06, |
| "loss": 2.1722, |
| "mean_token_accuracy": 0.913075864315033, |
| "num_tokens": 12081616.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.05557438156557099, |
| "grad_norm": 118.06996069036731, |
| "learning_rate": 1.8902439024390243e-06, |
| "loss": 2.1496, |
| "mean_token_accuracy": 0.904613807797432, |
| "num_tokens": 12230175.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.056252117926126735, |
| "grad_norm": 123.85320199345325, |
| "learning_rate": 1.8888888888888888e-06, |
| "loss": 2.1906, |
| "mean_token_accuracy": 0.9205794930458069, |
| "num_tokens": 12383751.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.05692985428668248, |
| "grad_norm": 121.52517225759287, |
| "learning_rate": 1.8875338753387533e-06, |
| "loss": 2.1203, |
| "mean_token_accuracy": 0.9293738752603531, |
| "num_tokens": 12534184.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.05760759064723822, |
| "grad_norm": 120.65452449500275, |
| "learning_rate": 1.8861788617886179e-06, |
| "loss": 2.0948, |
| "mean_token_accuracy": 0.9251657500863075, |
| "num_tokens": 12687333.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.058285327007793965, |
| "grad_norm": 119.22615191510779, |
| "learning_rate": 1.8848238482384824e-06, |
| "loss": 2.0499, |
| "mean_token_accuracy": 0.9273019582033157, |
| "num_tokens": 12837011.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.058963063368349715, |
| "grad_norm": 118.57976808894816, |
| "learning_rate": 1.8834688346883467e-06, |
| "loss": 2.0156, |
| "mean_token_accuracy": 0.9246799051761627, |
| "num_tokens": 12986181.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.05964079972890546, |
| "grad_norm": 118.99893633401761, |
| "learning_rate": 1.8821138211382112e-06, |
| "loss": 1.9919, |
| "mean_token_accuracy": 0.9291153773665428, |
| "num_tokens": 13134214.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.0603185360894612, |
| "grad_norm": 121.00759692804243, |
| "learning_rate": 1.8807588075880757e-06, |
| "loss": 1.9796, |
| "mean_token_accuracy": 0.9294244274497032, |
| "num_tokens": 13286608.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.060996272450016945, |
| "grad_norm": 122.96705505458303, |
| "learning_rate": 1.8794037940379405e-06, |
| "loss": 1.9793, |
| "mean_token_accuracy": 0.9320042505860329, |
| "num_tokens": 13438982.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.06167400881057269, |
| "grad_norm": 115.45042378077322, |
| "learning_rate": 1.8780487804878048e-06, |
| "loss": 1.8714, |
| "mean_token_accuracy": 0.9282395839691162, |
| "num_tokens": 13585428.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.06235174517112843, |
| "grad_norm": 111.7836219513804, |
| "learning_rate": 1.8766937669376693e-06, |
| "loss": 1.7863, |
| "mean_token_accuracy": 0.9293386787176132, |
| "num_tokens": 13730964.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.06302948153168418, |
| "grad_norm": 118.87630401245544, |
| "learning_rate": 1.8753387533875338e-06, |
| "loss": 1.8495, |
| "mean_token_accuracy": 0.9302572533488274, |
| "num_tokens": 13879984.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.06370721789223992, |
| "grad_norm": 114.52334966185592, |
| "learning_rate": 1.8739837398373983e-06, |
| "loss": 1.775, |
| "mean_token_accuracy": 0.9275632426142693, |
| "num_tokens": 14029215.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.06438495425279567, |
| "grad_norm": 121.46158002135849, |
| "learning_rate": 1.8726287262872629e-06, |
| "loss": 1.8224, |
| "mean_token_accuracy": 0.9329173788428307, |
| "num_tokens": 14182230.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.0650626906133514, |
| "grad_norm": 112.07899271050006, |
| "learning_rate": 1.8712737127371272e-06, |
| "loss": 1.6955, |
| "mean_token_accuracy": 0.929063692688942, |
| "num_tokens": 14324540.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.06574042697390715, |
| "grad_norm": 118.60172402243633, |
| "learning_rate": 1.8699186991869917e-06, |
| "loss": 1.747, |
| "mean_token_accuracy": 0.9302254170179367, |
| "num_tokens": 14476642.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.06641816333446289, |
| "grad_norm": 118.1561917036615, |
| "learning_rate": 1.8685636856368562e-06, |
| "loss": 1.714, |
| "mean_token_accuracy": 0.930650383234024, |
| "num_tokens": 14627769.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.06709589969501864, |
| "grad_norm": 117.87828039655399, |
| "learning_rate": 1.867208672086721e-06, |
| "loss": 1.6863, |
| "mean_token_accuracy": 0.9286526739597321, |
| "num_tokens": 14779314.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.06777363605557438, |
| "grad_norm": 118.70756510736268, |
| "learning_rate": 1.8658536585365854e-06, |
| "loss": 1.6616, |
| "mean_token_accuracy": 0.9325949177145958, |
| "num_tokens": 14930361.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.06845137241613013, |
| "grad_norm": 115.0705408214617, |
| "learning_rate": 1.8644986449864498e-06, |
| "loss": 1.6001, |
| "mean_token_accuracy": 0.9325196817517281, |
| "num_tokens": 15076964.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.06912910877668586, |
| "grad_norm": 112.04482720947613, |
| "learning_rate": 1.8631436314363143e-06, |
| "loss": 1.5575, |
| "mean_token_accuracy": 0.9281066954135895, |
| "num_tokens": 15224576.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.06980684513724161, |
| "grad_norm": 114.97883851594864, |
| "learning_rate": 1.8617886178861788e-06, |
| "loss": 1.5598, |
| "mean_token_accuracy": 0.9284133464097977, |
| "num_tokens": 15374847.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.07048458149779736, |
| "grad_norm": 113.64949116247662, |
| "learning_rate": 1.8604336043360433e-06, |
| "loss": 1.5186, |
| "mean_token_accuracy": 0.9293715506792068, |
| "num_tokens": 15525505.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.0711623178583531, |
| "grad_norm": 109.63343026661512, |
| "learning_rate": 1.8590785907859076e-06, |
| "loss": 1.4655, |
| "mean_token_accuracy": 0.927336260676384, |
| "num_tokens": 15672609.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.07184005421890885, |
| "grad_norm": 114.18318380830236, |
| "learning_rate": 1.8577235772357721e-06, |
| "loss": 1.4822, |
| "mean_token_accuracy": 0.9287517815828323, |
| "num_tokens": 15822989.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.07251779057946459, |
| "grad_norm": 110.2006577907072, |
| "learning_rate": 1.8563685636856367e-06, |
| "loss": 1.4193, |
| "mean_token_accuracy": 0.9296880438923836, |
| "num_tokens": 15970564.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.07319552694002034, |
| "grad_norm": 108.26203421754022, |
| "learning_rate": 1.8550135501355014e-06, |
| "loss": 1.3794, |
| "mean_token_accuracy": 0.9290289804339409, |
| "num_tokens": 16116146.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.07387326330057607, |
| "grad_norm": 111.01200634554529, |
| "learning_rate": 1.853658536585366e-06, |
| "loss": 1.3787, |
| "mean_token_accuracy": 0.9306840002536774, |
| "num_tokens": 16264186.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.07455099966113182, |
| "grad_norm": 111.43380256907831, |
| "learning_rate": 1.8523035230352302e-06, |
| "loss": 1.3651, |
| "mean_token_accuracy": 0.9310031309723854, |
| "num_tokens": 16413040.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.07522873602168756, |
| "grad_norm": 112.26968519229395, |
| "learning_rate": 1.8509485094850947e-06, |
| "loss": 1.3468, |
| "mean_token_accuracy": 0.931459404528141, |
| "num_tokens": 16564213.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.07590647238224331, |
| "grad_norm": 109.02323402522909, |
| "learning_rate": 1.8495934959349593e-06, |
| "loss": 1.3021, |
| "mean_token_accuracy": 0.9300208985805511, |
| "num_tokens": 16713339.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.07658420874279905, |
| "grad_norm": 107.38997170228535, |
| "learning_rate": 1.8482384823848238e-06, |
| "loss": 1.2712, |
| "mean_token_accuracy": 0.9297583177685738, |
| "num_tokens": 16860845.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.0772619451033548, |
| "grad_norm": 105.96279047024431, |
| "learning_rate": 1.8468834688346883e-06, |
| "loss": 1.2345, |
| "mean_token_accuracy": 0.9307239279150963, |
| "num_tokens": 17008447.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.07793968146391055, |
| "grad_norm": 107.1270259224278, |
| "learning_rate": 1.8455284552845526e-06, |
| "loss": 1.2204, |
| "mean_token_accuracy": 0.9324908629059792, |
| "num_tokens": 17156449.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.07861741782446628, |
| "grad_norm": 107.24961194416598, |
| "learning_rate": 1.8441734417344173e-06, |
| "loss": 1.2074, |
| "mean_token_accuracy": 0.9318635389208794, |
| "num_tokens": 17305874.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.07929515418502203, |
| "grad_norm": 101.95987989009163, |
| "learning_rate": 1.8428184281842819e-06, |
| "loss": 1.1589, |
| "mean_token_accuracy": 0.9266369640827179, |
| "num_tokens": 17453331.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.07997289054557777, |
| "grad_norm": 105.82055210123028, |
| "learning_rate": 1.8414634146341464e-06, |
| "loss": 1.1611, |
| "mean_token_accuracy": 0.930683083832264, |
| "num_tokens": 17602041.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.08065062690613352, |
| "grad_norm": 99.44019225376582, |
| "learning_rate": 1.8401084010840107e-06, |
| "loss": 1.1027, |
| "mean_token_accuracy": 0.9283188283443451, |
| "num_tokens": 17748531.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.08132836326668925, |
| "grad_norm": 104.3596685159921, |
| "learning_rate": 1.8387533875338752e-06, |
| "loss": 1.1165, |
| "mean_token_accuracy": 0.9300511553883553, |
| "num_tokens": 17899989.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.082006099627245, |
| "grad_norm": 100.81191503355579, |
| "learning_rate": 1.8373983739837397e-06, |
| "loss": 1.0777, |
| "mean_token_accuracy": 0.9295774847269058, |
| "num_tokens": 18048993.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.08268383598780074, |
| "grad_norm": 93.88135822464984, |
| "learning_rate": 1.8360433604336042e-06, |
| "loss": 1.02, |
| "mean_token_accuracy": 0.9281311184167862, |
| "num_tokens": 18190797.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.08336157234835649, |
| "grad_norm": 100.17923887033267, |
| "learning_rate": 1.8346883468834688e-06, |
| "loss": 1.0306, |
| "mean_token_accuracy": 0.9315191507339478, |
| "num_tokens": 18338862.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.08403930870891223, |
| "grad_norm": 96.97512207450372, |
| "learning_rate": 1.833333333333333e-06, |
| "loss": 0.9934, |
| "mean_token_accuracy": 0.9317428171634674, |
| "num_tokens": 18486594.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.08471704506946798, |
| "grad_norm": 96.55200278181182, |
| "learning_rate": 1.8319783197831978e-06, |
| "loss": 0.9818, |
| "mean_token_accuracy": 0.9299175664782524, |
| "num_tokens": 18636214.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.08539478143002371, |
| "grad_norm": 95.99010101143081, |
| "learning_rate": 1.8306233062330623e-06, |
| "loss": 0.9599, |
| "mean_token_accuracy": 0.9313259571790695, |
| "num_tokens": 18786296.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.08607251779057946, |
| "grad_norm": 93.57149619949969, |
| "learning_rate": 1.8292682926829268e-06, |
| "loss": 0.9285, |
| "mean_token_accuracy": 0.93116744607687, |
| "num_tokens": 18934560.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.08675025415113521, |
| "grad_norm": 92.01277288501751, |
| "learning_rate": 1.8279132791327912e-06, |
| "loss": 0.9117, |
| "mean_token_accuracy": 0.9301353469491005, |
| "num_tokens": 19084031.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.08742799051169095, |
| "grad_norm": 93.17793996368636, |
| "learning_rate": 1.8265582655826557e-06, |
| "loss": 0.8974, |
| "mean_token_accuracy": 0.9323903545737267, |
| "num_tokens": 19234417.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.0881057268722467, |
| "grad_norm": 87.91463270355133, |
| "learning_rate": 1.8252032520325202e-06, |
| "loss": 0.8588, |
| "mean_token_accuracy": 0.9307873845100403, |
| "num_tokens": 19380999.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.08878346323280244, |
| "grad_norm": 88.1638527981959, |
| "learning_rate": 1.8238482384823847e-06, |
| "loss": 0.8553, |
| "mean_token_accuracy": 0.9293078556656837, |
| "num_tokens": 19528898.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.08946119959335819, |
| "grad_norm": 88.94710264026781, |
| "learning_rate": 1.8224932249322492e-06, |
| "loss": 0.835, |
| "mean_token_accuracy": 0.9326649755239487, |
| "num_tokens": 19681222.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.09013893595391392, |
| "grad_norm": 85.90974131550782, |
| "learning_rate": 1.8211382113821138e-06, |
| "loss": 0.8063, |
| "mean_token_accuracy": 0.932416245341301, |
| "num_tokens": 19830868.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.09081667231446967, |
| "grad_norm": 84.86967745400749, |
| "learning_rate": 1.8197831978319783e-06, |
| "loss": 0.784, |
| "mean_token_accuracy": 0.9324630126357079, |
| "num_tokens": 19980203.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.09149440867502541, |
| "grad_norm": 83.25372427993219, |
| "learning_rate": 1.8184281842818428e-06, |
| "loss": 0.7671, |
| "mean_token_accuracy": 0.9325378760695457, |
| "num_tokens": 20127736.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.09217214503558116, |
| "grad_norm": 82.15717791516646, |
| "learning_rate": 1.8170731707317073e-06, |
| "loss": 0.7513, |
| "mean_token_accuracy": 0.9318142458796501, |
| "num_tokens": 20278996.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.0928498813961369, |
| "grad_norm": 77.51250554472611, |
| "learning_rate": 1.8157181571815718e-06, |
| "loss": 0.7255, |
| "mean_token_accuracy": 0.9290755987167358, |
| "num_tokens": 20425353.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.09352761775669265, |
| "grad_norm": 78.21321425269326, |
| "learning_rate": 1.8143631436314361e-06, |
| "loss": 0.7069, |
| "mean_token_accuracy": 0.9344506710767746, |
| "num_tokens": 20574453.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.0942053541172484, |
| "grad_norm": 74.35012855974352, |
| "learning_rate": 1.8130081300813007e-06, |
| "loss": 0.6941, |
| "mean_token_accuracy": 0.9283354431390762, |
| "num_tokens": 20721900.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.09488309047780413, |
| "grad_norm": 73.22218164338094, |
| "learning_rate": 1.8116531165311652e-06, |
| "loss": 0.6646, |
| "mean_token_accuracy": 0.9332837462425232, |
| "num_tokens": 20868804.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.09556082683835988, |
| "grad_norm": 71.46610642433735, |
| "learning_rate": 1.81029810298103e-06, |
| "loss": 0.6465, |
| "mean_token_accuracy": 0.9334042221307755, |
| "num_tokens": 21015885.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.09623856319891562, |
| "grad_norm": 72.70207166228874, |
| "learning_rate": 1.8089430894308942e-06, |
| "loss": 0.645, |
| "mean_token_accuracy": 0.9326634481549263, |
| "num_tokens": 21168376.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.09691629955947137, |
| "grad_norm": 71.05676304804858, |
| "learning_rate": 1.8075880758807587e-06, |
| "loss": 0.6231, |
| "mean_token_accuracy": 0.9345277771353722, |
| "num_tokens": 21319503.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.0975940359200271, |
| "grad_norm": 70.89133381935596, |
| "learning_rate": 1.8062330623306233e-06, |
| "loss": 0.614, |
| "mean_token_accuracy": 0.9351460039615631, |
| "num_tokens": 21474123.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.09827177228058286, |
| "grad_norm": 64.58516129851412, |
| "learning_rate": 1.8048780487804878e-06, |
| "loss": 0.5968, |
| "mean_token_accuracy": 0.9304576441645622, |
| "num_tokens": 21624512.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.09894950864113859, |
| "grad_norm": 65.24856190459862, |
| "learning_rate": 1.8035230352303523e-06, |
| "loss": 0.5809, |
| "mean_token_accuracy": 0.9337568357586861, |
| "num_tokens": 21776372.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.09962724500169434, |
| "grad_norm": 62.568471114426266, |
| "learning_rate": 1.8021680216802166e-06, |
| "loss": 0.5723, |
| "mean_token_accuracy": 0.9315716549754143, |
| "num_tokens": 21926956.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.10030498136225008, |
| "grad_norm": 59.965025191532895, |
| "learning_rate": 1.8008130081300811e-06, |
| "loss": 0.5625, |
| "mean_token_accuracy": 0.9297163262963295, |
| "num_tokens": 22075445.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.10098271772280583, |
| "grad_norm": 59.35204164689335, |
| "learning_rate": 1.7994579945799456e-06, |
| "loss": 0.5394, |
| "mean_token_accuracy": 0.9344234243035316, |
| "num_tokens": 22224724.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.10166045408336158, |
| "grad_norm": 56.05971904812392, |
| "learning_rate": 1.7981029810298104e-06, |
| "loss": 0.5217, |
| "mean_token_accuracy": 0.932852178812027, |
| "num_tokens": 22372664.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.10233819044391732, |
| "grad_norm": 55.899238426042885, |
| "learning_rate": 1.7967479674796747e-06, |
| "loss": 0.5111, |
| "mean_token_accuracy": 0.9346337839961052, |
| "num_tokens": 22522123.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.10301592680447307, |
| "grad_norm": 53.66256786236335, |
| "learning_rate": 1.7953929539295392e-06, |
| "loss": 0.5019, |
| "mean_token_accuracy": 0.9335299357771873, |
| "num_tokens": 22672150.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.1036936631650288, |
| "grad_norm": 50.73144865079355, |
| "learning_rate": 1.7940379403794037e-06, |
| "loss": 0.484, |
| "mean_token_accuracy": 0.9336813315749168, |
| "num_tokens": 22818967.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.10437139952558455, |
| "grad_norm": 49.08098043135844, |
| "learning_rate": 1.7926829268292682e-06, |
| "loss": 0.4711, |
| "mean_token_accuracy": 0.9337323307991028, |
| "num_tokens": 22966978.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.10504913588614029, |
| "grad_norm": 46.55240197600489, |
| "learning_rate": 1.7913279132791328e-06, |
| "loss": 0.4667, |
| "mean_token_accuracy": 0.9307694062590599, |
| "num_tokens": 23113941.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.10572687224669604, |
| "grad_norm": 45.4207324250192, |
| "learning_rate": 1.789972899728997e-06, |
| "loss": 0.4495, |
| "mean_token_accuracy": 0.9333298355340958, |
| "num_tokens": 23260864.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.10640460860725177, |
| "grad_norm": 43.93823803845357, |
| "learning_rate": 1.7886178861788616e-06, |
| "loss": 0.4405, |
| "mean_token_accuracy": 0.9329454302787781, |
| "num_tokens": 23410477.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.10708234496780752, |
| "grad_norm": 43.85611117671106, |
| "learning_rate": 1.7872628726287263e-06, |
| "loss": 0.4286, |
| "mean_token_accuracy": 0.9351244196295738, |
| "num_tokens": 23562882.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.10776008132836326, |
| "grad_norm": 41.32401336746093, |
| "learning_rate": 1.7859078590785908e-06, |
| "loss": 0.4164, |
| "mean_token_accuracy": 0.9357188642024994, |
| "num_tokens": 23713602.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.10843781768891901, |
| "grad_norm": 39.5175400314298, |
| "learning_rate": 1.7845528455284554e-06, |
| "loss": 0.406, |
| "mean_token_accuracy": 0.9362591058015823, |
| "num_tokens": 23860727.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.10911555404947476, |
| "grad_norm": 38.416625804623116, |
| "learning_rate": 1.7831978319783197e-06, |
| "loss": 0.4068, |
| "mean_token_accuracy": 0.9329937174916267, |
| "num_tokens": 24012425.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.1097932904100305, |
| "grad_norm": 37.59441870071182, |
| "learning_rate": 1.7818428184281842e-06, |
| "loss": 0.3909, |
| "mean_token_accuracy": 0.9354848563671112, |
| "num_tokens": 24164863.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.11047102677058625, |
| "grad_norm": 33.91348570223861, |
| "learning_rate": 1.7804878048780487e-06, |
| "loss": 0.3914, |
| "mean_token_accuracy": 0.9322528839111328, |
| "num_tokens": 24311333.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.11114876313114198, |
| "grad_norm": 35.04551034637384, |
| "learning_rate": 1.7791327913279132e-06, |
| "loss": 0.3714, |
| "mean_token_accuracy": 0.9384682103991508, |
| "num_tokens": 24463590.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.11182649949169773, |
| "grad_norm": 33.68435544945808, |
| "learning_rate": 1.7777777777777775e-06, |
| "loss": 0.3835, |
| "mean_token_accuracy": 0.932737372815609, |
| "num_tokens": 24619168.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.11250423585225347, |
| "grad_norm": 29.82505772722471, |
| "learning_rate": 1.776422764227642e-06, |
| "loss": 0.3639, |
| "mean_token_accuracy": 0.933692567050457, |
| "num_tokens": 24765859.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.11318197221280922, |
| "grad_norm": 30.525423505810934, |
| "learning_rate": 1.7750677506775068e-06, |
| "loss": 0.3507, |
| "mean_token_accuracy": 0.9373074173927307, |
| "num_tokens": 24917083.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.11385970857336496, |
| "grad_norm": 29.204223791728122, |
| "learning_rate": 1.7737127371273713e-06, |
| "loss": 0.3529, |
| "mean_token_accuracy": 0.9359963908791542, |
| "num_tokens": 25070216.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.1145374449339207, |
| "grad_norm": 26.974547673145388, |
| "learning_rate": 1.7723577235772358e-06, |
| "loss": 0.3535, |
| "mean_token_accuracy": 0.9311786666512489, |
| "num_tokens": 25222236.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.11521518129447644, |
| "grad_norm": 25.967670455879, |
| "learning_rate": 1.7710027100271001e-06, |
| "loss": 0.3434, |
| "mean_token_accuracy": 0.9331553354859352, |
| "num_tokens": 25374346.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.1158929176550322, |
| "grad_norm": 24.357263220519712, |
| "learning_rate": 1.7696476964769647e-06, |
| "loss": 0.3335, |
| "mean_token_accuracy": 0.9339867532253265, |
| "num_tokens": 25522811.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.11657065401558793, |
| "grad_norm": 24.239286311319983, |
| "learning_rate": 1.7682926829268292e-06, |
| "loss": 0.3158, |
| "mean_token_accuracy": 0.9381037876009941, |
| "num_tokens": 25674231.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.11724839037614368, |
| "grad_norm": 23.38288846125647, |
| "learning_rate": 1.7669376693766937e-06, |
| "loss": 0.3186, |
| "mean_token_accuracy": 0.9358685091137886, |
| "num_tokens": 25825637.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.11792612673669943, |
| "grad_norm": 21.22164404336873, |
| "learning_rate": 1.765582655826558e-06, |
| "loss": 0.3261, |
| "mean_token_accuracy": 0.9328601211309433, |
| "num_tokens": 25974206.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.11860386309725517, |
| "grad_norm": 20.66910608762449, |
| "learning_rate": 1.7642276422764225e-06, |
| "loss": 0.3041, |
| "mean_token_accuracy": 0.9370677098631859, |
| "num_tokens": 26122520.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.11928159945781092, |
| "grad_norm": 19.929368999525767, |
| "learning_rate": 1.7628726287262872e-06, |
| "loss": 0.3054, |
| "mean_token_accuracy": 0.9352571219205856, |
| "num_tokens": 26270638.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.11995933581836665, |
| "grad_norm": 18.77625094705047, |
| "learning_rate": 1.7615176151761518e-06, |
| "loss": 0.3008, |
| "mean_token_accuracy": 0.9352380633354187, |
| "num_tokens": 26419854.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.1206370721789224, |
| "grad_norm": 18.135479252465245, |
| "learning_rate": 1.7601626016260163e-06, |
| "loss": 0.2888, |
| "mean_token_accuracy": 0.9373323991894722, |
| "num_tokens": 26570929.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.12131480853947814, |
| "grad_norm": 17.09097110378426, |
| "learning_rate": 1.7588075880758806e-06, |
| "loss": 0.2904, |
| "mean_token_accuracy": 0.9367010816931725, |
| "num_tokens": 26721450.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.12199254490003389, |
| "grad_norm": 16.371606132788596, |
| "learning_rate": 1.7574525745257451e-06, |
| "loss": 0.2923, |
| "mean_token_accuracy": 0.9353242516517639, |
| "num_tokens": 26872626.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.12267028126058963, |
| "grad_norm": 15.248691516705478, |
| "learning_rate": 1.7560975609756096e-06, |
| "loss": 0.2826, |
| "mean_token_accuracy": 0.9366849288344383, |
| "num_tokens": 27021185.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.12334801762114538, |
| "grad_norm": 14.665572209611303, |
| "learning_rate": 1.7547425474254742e-06, |
| "loss": 0.28, |
| "mean_token_accuracy": 0.9356379881501198, |
| "num_tokens": 27171480.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.12402575398170111, |
| "grad_norm": 13.630257963311905, |
| "learning_rate": 1.7533875338753387e-06, |
| "loss": 0.2911, |
| "mean_token_accuracy": 0.9309459328651428, |
| "num_tokens": 27322106.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.12470349034225686, |
| "grad_norm": 13.34907992144106, |
| "learning_rate": 1.7520325203252032e-06, |
| "loss": 0.2692, |
| "mean_token_accuracy": 0.9370318055152893, |
| "num_tokens": 27472644.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.1253812267028126, |
| "grad_norm": 12.514192586765324, |
| "learning_rate": 1.7506775067750677e-06, |
| "loss": 0.2869, |
| "mean_token_accuracy": 0.9315063208341599, |
| "num_tokens": 27624217.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.12605896306336836, |
| "grad_norm": 11.80365697230253, |
| "learning_rate": 1.7493224932249322e-06, |
| "loss": 0.2657, |
| "mean_token_accuracy": 0.9369841367006302, |
| "num_tokens": 27770421.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.1267366994239241, |
| "grad_norm": 11.288559694543885, |
| "learning_rate": 1.7479674796747968e-06, |
| "loss": 0.2629, |
| "mean_token_accuracy": 0.9366028532385826, |
| "num_tokens": 27919419.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.12741443578447983, |
| "grad_norm": 11.256329471358255, |
| "learning_rate": 1.746612466124661e-06, |
| "loss": 0.2608, |
| "mean_token_accuracy": 0.9365546107292175, |
| "num_tokens": 28073242.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.12809217214503557, |
| "grad_norm": 10.423222021208382, |
| "learning_rate": 1.7452574525745256e-06, |
| "loss": 0.2613, |
| "mean_token_accuracy": 0.9360240176320076, |
| "num_tokens": 28224007.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.12876990850559134, |
| "grad_norm": 10.212095148631771, |
| "learning_rate": 1.74390243902439e-06, |
| "loss": 0.2663, |
| "mean_token_accuracy": 0.9340936243534088, |
| "num_tokens": 28371178.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.12944764486614707, |
| "grad_norm": 9.412927667433392, |
| "learning_rate": 1.7425474254742546e-06, |
| "loss": 0.2542, |
| "mean_token_accuracy": 0.9363159984350204, |
| "num_tokens": 28519559.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.1301253812267028, |
| "grad_norm": 8.826276779662683, |
| "learning_rate": 1.7411924119241194e-06, |
| "loss": 0.2592, |
| "mean_token_accuracy": 0.9346078857779503, |
| "num_tokens": 28670550.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.13080311758725854, |
| "grad_norm": 8.135940057692663, |
| "learning_rate": 1.7398373983739837e-06, |
| "loss": 0.2505, |
| "mean_token_accuracy": 0.9358848333358765, |
| "num_tokens": 28816237.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.1314808539478143, |
| "grad_norm": 8.043989763018512, |
| "learning_rate": 1.7384823848238482e-06, |
| "loss": 0.2469, |
| "mean_token_accuracy": 0.9357665106654167, |
| "num_tokens": 28966298.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.13215859030837004, |
| "grad_norm": 7.767089209326023, |
| "learning_rate": 1.7371273712737127e-06, |
| "loss": 0.2392, |
| "mean_token_accuracy": 0.9387388676404953, |
| "num_tokens": 29116059.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.13283632666892578, |
| "grad_norm": 7.318706294635459, |
| "learning_rate": 1.7357723577235772e-06, |
| "loss": 0.2398, |
| "mean_token_accuracy": 0.9373445063829422, |
| "num_tokens": 29266370.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.13351406302948154, |
| "grad_norm": 6.936442211517777, |
| "learning_rate": 1.7344173441734417e-06, |
| "loss": 0.2389, |
| "mean_token_accuracy": 0.9369916915893555, |
| "num_tokens": 29418266.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.13419179939003728, |
| "grad_norm": 6.779837554902574, |
| "learning_rate": 1.733062330623306e-06, |
| "loss": 0.2323, |
| "mean_token_accuracy": 0.9391230568289757, |
| "num_tokens": 29569892.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.13486953575059302, |
| "grad_norm": 6.186085198068867, |
| "learning_rate": 1.7317073170731706e-06, |
| "loss": 0.2383, |
| "mean_token_accuracy": 0.9360450059175491, |
| "num_tokens": 29719244.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.13554727211114875, |
| "grad_norm": 6.019009969504502, |
| "learning_rate": 1.730352303523035e-06, |
| "loss": 0.2406, |
| "mean_token_accuracy": 0.935395322740078, |
| "num_tokens": 29871381.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.13622500847170452, |
| "grad_norm": 5.727962603690222, |
| "learning_rate": 1.7289972899728998e-06, |
| "loss": 0.2288, |
| "mean_token_accuracy": 0.9381909817457199, |
| "num_tokens": 30021540.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.13690274483226025, |
| "grad_norm": 5.253235694977242, |
| "learning_rate": 1.7276422764227641e-06, |
| "loss": 0.2361, |
| "mean_token_accuracy": 0.936265304684639, |
| "num_tokens": 30169377.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.137580481192816, |
| "grad_norm": 5.22232253508078, |
| "learning_rate": 1.7262872628726286e-06, |
| "loss": 0.2273, |
| "mean_token_accuracy": 0.9385531917214394, |
| "num_tokens": 30321117.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.13825821755337173, |
| "grad_norm": 4.648804691637109, |
| "learning_rate": 1.7249322493224932e-06, |
| "loss": 0.2298, |
| "mean_token_accuracy": 0.9360805526375771, |
| "num_tokens": 30467494.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.1389359539139275, |
| "grad_norm": 4.164262717598353, |
| "learning_rate": 1.7235772357723577e-06, |
| "loss": 0.2397, |
| "mean_token_accuracy": 0.9347240626811981, |
| "num_tokens": 30606325.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.13961369027448323, |
| "grad_norm": 4.536463490792273, |
| "learning_rate": 1.7222222222222222e-06, |
| "loss": 0.2195, |
| "mean_token_accuracy": 0.938719667494297, |
| "num_tokens": 30758254.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.14029142663503896, |
| "grad_norm": 4.538769291529138, |
| "learning_rate": 1.7208672086720865e-06, |
| "loss": 0.2254, |
| "mean_token_accuracy": 0.9374769926071167, |
| "num_tokens": 30905389.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.14096916299559473, |
| "grad_norm": 4.033711929104982, |
| "learning_rate": 1.719512195121951e-06, |
| "loss": 0.2157, |
| "mean_token_accuracy": 0.9390349760651588, |
| "num_tokens": 31055026.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.14164689935615046, |
| "grad_norm": 3.6409458860202712, |
| "learning_rate": 1.7181571815718158e-06, |
| "loss": 0.222, |
| "mean_token_accuracy": 0.937271773815155, |
| "num_tokens": 31200100.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.1423246357167062, |
| "grad_norm": 3.4863786559982337, |
| "learning_rate": 1.7168021680216803e-06, |
| "loss": 0.2288, |
| "mean_token_accuracy": 0.9352766647934914, |
| "num_tokens": 31346837.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.14300237207726194, |
| "grad_norm": 3.383281767656107, |
| "learning_rate": 1.7154471544715446e-06, |
| "loss": 0.2197, |
| "mean_token_accuracy": 0.9381493553519249, |
| "num_tokens": 31493121.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.1436801084378177, |
| "grad_norm": 3.2964314796805243, |
| "learning_rate": 1.7140921409214091e-06, |
| "loss": 0.2084, |
| "mean_token_accuracy": 0.9412015900015831, |
| "num_tokens": 31641799.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.14435784479837344, |
| "grad_norm": 3.4016540123991397, |
| "learning_rate": 1.7127371273712736e-06, |
| "loss": 0.2144, |
| "mean_token_accuracy": 0.938295342028141, |
| "num_tokens": 31787984.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.14503558115892917, |
| "grad_norm": 3.0325880459408734, |
| "learning_rate": 1.7113821138211381e-06, |
| "loss": 0.2087, |
| "mean_token_accuracy": 0.9392581135034561, |
| "num_tokens": 31939062.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.1457133175194849, |
| "grad_norm": 3.0832469321783913, |
| "learning_rate": 1.7100271002710027e-06, |
| "loss": 0.2163, |
| "mean_token_accuracy": 0.93733299523592, |
| "num_tokens": 32089393.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.14639105388004067, |
| "grad_norm": 3.2888382634875866, |
| "learning_rate": 1.708672086720867e-06, |
| "loss": 0.205, |
| "mean_token_accuracy": 0.9410371333360672, |
| "num_tokens": 32238112.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.1470687902405964, |
| "grad_norm": 2.7808473921102803, |
| "learning_rate": 1.7073170731707315e-06, |
| "loss": 0.2108, |
| "mean_token_accuracy": 0.9397686347365379, |
| "num_tokens": 32385200.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.14774652660115214, |
| "grad_norm": 2.5104602082156737, |
| "learning_rate": 1.7059620596205962e-06, |
| "loss": 0.2164, |
| "mean_token_accuracy": 0.9379466697573662, |
| "num_tokens": 32535204.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.1484242629617079, |
| "grad_norm": 2.735127283922612, |
| "learning_rate": 1.7046070460704607e-06, |
| "loss": 0.1965, |
| "mean_token_accuracy": 0.9441072791814804, |
| "num_tokens": 32687702.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.14910199932226365, |
| "grad_norm": 2.4352256213682324, |
| "learning_rate": 1.7032520325203253e-06, |
| "loss": 0.2168, |
| "mean_token_accuracy": 0.9381761774420738, |
| "num_tokens": 32840837.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.14977973568281938, |
| "grad_norm": 2.197431115010913, |
| "learning_rate": 1.7018970189701896e-06, |
| "loss": 0.2077, |
| "mean_token_accuracy": 0.9409635365009308, |
| "num_tokens": 32985864.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.15045747204337512, |
| "grad_norm": 2.3023481059782136, |
| "learning_rate": 1.700542005420054e-06, |
| "loss": 0.1935, |
| "mean_token_accuracy": 0.9441099762916565, |
| "num_tokens": 33136575.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.15113520840393088, |
| "grad_norm": 2.1703415601449567, |
| "learning_rate": 1.6991869918699186e-06, |
| "loss": 0.1991, |
| "mean_token_accuracy": 0.9431856349110603, |
| "num_tokens": 33285733.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.15181294476448662, |
| "grad_norm": 2.0999864868513454, |
| "learning_rate": 1.6978319783197831e-06, |
| "loss": 0.1987, |
| "mean_token_accuracy": 0.9425452724099159, |
| "num_tokens": 33434357.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.15249068112504235, |
| "grad_norm": 1.982812903952784, |
| "learning_rate": 1.6964769647696474e-06, |
| "loss": 0.2032, |
| "mean_token_accuracy": 0.9406230673193932, |
| "num_tokens": 33581539.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.1531684174855981, |
| "grad_norm": 1.988849372609138, |
| "learning_rate": 1.6951219512195122e-06, |
| "loss": 0.2023, |
| "mean_token_accuracy": 0.9410624876618385, |
| "num_tokens": 33731875.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.15384615384615385, |
| "grad_norm": 2.0620676978301242, |
| "learning_rate": 1.6937669376693767e-06, |
| "loss": 0.1904, |
| "mean_token_accuracy": 0.944817841053009, |
| "num_tokens": 33883783.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.1545238902067096, |
| "grad_norm": 1.8858609906294919, |
| "learning_rate": 1.6924119241192412e-06, |
| "loss": 0.1954, |
| "mean_token_accuracy": 0.9423842057585716, |
| "num_tokens": 34026743.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.15520162656726533, |
| "grad_norm": 1.7701044225179388, |
| "learning_rate": 1.6910569105691057e-06, |
| "loss": 0.198, |
| "mean_token_accuracy": 0.9419083893299103, |
| "num_tokens": 34174186.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.1558793629278211, |
| "grad_norm": 1.8072093712894957, |
| "learning_rate": 1.68970189701897e-06, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.9456257075071335, |
| "num_tokens": 34323042.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.15655709928837683, |
| "grad_norm": 1.7231337811365375, |
| "learning_rate": 1.6883468834688346e-06, |
| "loss": 0.1934, |
| "mean_token_accuracy": 0.9430926144123077, |
| "num_tokens": 34466803.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.15723483564893256, |
| "grad_norm": 1.7500461493474793, |
| "learning_rate": 1.686991869918699e-06, |
| "loss": 0.1955, |
| "mean_token_accuracy": 0.9432472810149193, |
| "num_tokens": 34614121.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.1579125720094883, |
| "grad_norm": 1.8031685403461577, |
| "learning_rate": 1.6856368563685636e-06, |
| "loss": 0.1995, |
| "mean_token_accuracy": 0.9413378089666367, |
| "num_tokens": 34768119.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.15859030837004406, |
| "grad_norm": 1.5907404865528574, |
| "learning_rate": 1.684281842818428e-06, |
| "loss": 0.177, |
| "mean_token_accuracy": 0.9471688345074654, |
| "num_tokens": 34916728.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.1592680447305998, |
| "grad_norm": 1.9739387942567423, |
| "learning_rate": 1.6829268292682926e-06, |
| "loss": 0.1968, |
| "mean_token_accuracy": 0.942169301211834, |
| "num_tokens": 35064413.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.15994578109115554, |
| "grad_norm": 1.8386525906268687, |
| "learning_rate": 1.6815718157181572e-06, |
| "loss": 0.203, |
| "mean_token_accuracy": 0.9395218342542648, |
| "num_tokens": 35215765.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.16062351745171127, |
| "grad_norm": 1.429285344989454, |
| "learning_rate": 1.6802168021680217e-06, |
| "loss": 0.1981, |
| "mean_token_accuracy": 0.9406212717294693, |
| "num_tokens": 35359898.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.16130125381226704, |
| "grad_norm": 1.4973311916596244, |
| "learning_rate": 1.6788617886178862e-06, |
| "loss": 0.1949, |
| "mean_token_accuracy": 0.9427109137177467, |
| "num_tokens": 35507980.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.16197899017282277, |
| "grad_norm": 1.4499758005643015, |
| "learning_rate": 1.6775067750677505e-06, |
| "loss": 0.1906, |
| "mean_token_accuracy": 0.9435393437743187, |
| "num_tokens": 35660165.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.1626567265333785, |
| "grad_norm": 1.380569873662013, |
| "learning_rate": 1.676151761517615e-06, |
| "loss": 0.1961, |
| "mean_token_accuracy": 0.9422763735055923, |
| "num_tokens": 35808219.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.16333446289393425, |
| "grad_norm": 1.4096173759739132, |
| "learning_rate": 1.6747967479674795e-06, |
| "loss": 0.191, |
| "mean_token_accuracy": 0.942513681948185, |
| "num_tokens": 35955460.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.16401219925449, |
| "grad_norm": 1.4868939861076391, |
| "learning_rate": 1.673441734417344e-06, |
| "loss": 0.1988, |
| "mean_token_accuracy": 0.9399052634835243, |
| "num_tokens": 36103261.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.16468993561504575, |
| "grad_norm": 1.4375727799110902, |
| "learning_rate": 1.6720867208672088e-06, |
| "loss": 0.1973, |
| "mean_token_accuracy": 0.9417018443346024, |
| "num_tokens": 36248189.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.16536767197560148, |
| "grad_norm": 1.314070869390369, |
| "learning_rate": 1.670731707317073e-06, |
| "loss": 0.1893, |
| "mean_token_accuracy": 0.9426815882325172, |
| "num_tokens": 36393544.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.16604540833615725, |
| "grad_norm": 1.3499549232471146, |
| "learning_rate": 1.6693766937669376e-06, |
| "loss": 0.1932, |
| "mean_token_accuracy": 0.9427401125431061, |
| "num_tokens": 36543518.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.16672314469671298, |
| "grad_norm": 1.9729404580437042, |
| "learning_rate": 1.6680216802168021e-06, |
| "loss": 0.1859, |
| "mean_token_accuracy": 0.9439921900629997, |
| "num_tokens": 36694721.0, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.16740088105726872, |
| "grad_norm": 1.3458282419895553, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 0.1958, |
| "mean_token_accuracy": 0.9415561556816101, |
| "num_tokens": 36841930.0, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.16807861741782446, |
| "grad_norm": 1.3064160320363043, |
| "learning_rate": 1.665311653116531e-06, |
| "loss": 0.182, |
| "mean_token_accuracy": 0.9455081224441528, |
| "num_tokens": 36990131.0, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.16875635377838022, |
| "grad_norm": 1.261676946128337, |
| "learning_rate": 1.6639566395663955e-06, |
| "loss": 0.1833, |
| "mean_token_accuracy": 0.944699801504612, |
| "num_tokens": 37137578.0, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.16943409013893596, |
| "grad_norm": 1.2203540224127716, |
| "learning_rate": 1.66260162601626e-06, |
| "loss": 0.194, |
| "mean_token_accuracy": 0.9413745477795601, |
| "num_tokens": 37286524.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.1701118264994917, |
| "grad_norm": 1.3724047567386704, |
| "learning_rate": 1.6612466124661245e-06, |
| "loss": 0.1891, |
| "mean_token_accuracy": 0.9431008100509644, |
| "num_tokens": 37438074.0, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.17078956286004743, |
| "grad_norm": 1.3174705549054002, |
| "learning_rate": 1.6598915989159893e-06, |
| "loss": 0.1927, |
| "mean_token_accuracy": 0.9415145292878151, |
| "num_tokens": 37589233.0, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.1714672992206032, |
| "grad_norm": 1.1838612410896918, |
| "learning_rate": 1.6585365853658536e-06, |
| "loss": 0.1837, |
| "mean_token_accuracy": 0.9449172392487526, |
| "num_tokens": 37739989.0, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.17214503558115893, |
| "grad_norm": 1.2635613106193353, |
| "learning_rate": 1.657181571815718e-06, |
| "loss": 0.1877, |
| "mean_token_accuracy": 0.9437820985913277, |
| "num_tokens": 37893549.0, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.17282277194171466, |
| "grad_norm": 1.282954852219653, |
| "learning_rate": 1.6558265582655826e-06, |
| "loss": 0.169, |
| "mean_token_accuracy": 0.9490370899438858, |
| "num_tokens": 38040724.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.17350050830227043, |
| "grad_norm": 1.2530699631714797, |
| "learning_rate": 1.6544715447154471e-06, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.9446689784526825, |
| "num_tokens": 38188254.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.17417824466282616, |
| "grad_norm": 1.100306273258646, |
| "learning_rate": 1.6531165311653114e-06, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9455656632781029, |
| "num_tokens": 38330425.0, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.1748559810233819, |
| "grad_norm": 1.5264947374361602, |
| "learning_rate": 1.651761517615176e-06, |
| "loss": 0.1861, |
| "mean_token_accuracy": 0.943930372595787, |
| "num_tokens": 38477837.0, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.17553371738393764, |
| "grad_norm": 1.1532449440630994, |
| "learning_rate": 1.6504065040650405e-06, |
| "loss": 0.1861, |
| "mean_token_accuracy": 0.9444294348359108, |
| "num_tokens": 38626257.0, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.1762114537444934, |
| "grad_norm": 1.6544229249884137, |
| "learning_rate": 1.6490514905149052e-06, |
| "loss": 0.1892, |
| "mean_token_accuracy": 0.9426649659872055, |
| "num_tokens": 38776569.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.17688919010504914, |
| "grad_norm": 1.0694243551952667, |
| "learning_rate": 1.6476964769647697e-06, |
| "loss": 0.2002, |
| "mean_token_accuracy": 0.9409672617912292, |
| "num_tokens": 38925925.0, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.17756692646560487, |
| "grad_norm": 1.3080074316972092, |
| "learning_rate": 1.646341463414634e-06, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.9425214007496834, |
| "num_tokens": 39077640.0, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.1782446628261606, |
| "grad_norm": 1.3980933975114367, |
| "learning_rate": 1.6449864498644986e-06, |
| "loss": 0.1806, |
| "mean_token_accuracy": 0.9455323368310928, |
| "num_tokens": 39224918.0, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.17892239918671637, |
| "grad_norm": 1.1857997540693603, |
| "learning_rate": 1.643631436314363e-06, |
| "loss": 0.1884, |
| "mean_token_accuracy": 0.9434381946921349, |
| "num_tokens": 39376315.0, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.1796001355472721, |
| "grad_norm": 1.134544558489507, |
| "learning_rate": 1.6422764227642276e-06, |
| "loss": 0.1791, |
| "mean_token_accuracy": 0.9452700912952423, |
| "num_tokens": 39527098.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.18027787190782785, |
| "grad_norm": 1.020597711825539, |
| "learning_rate": 1.6409214092140921e-06, |
| "loss": 0.1774, |
| "mean_token_accuracy": 0.9458664432168007, |
| "num_tokens": 39677407.0, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.1809556082683836, |
| "grad_norm": 1.0196216805968796, |
| "learning_rate": 1.6395663956639564e-06, |
| "loss": 0.1938, |
| "mean_token_accuracy": 0.9419092014431953, |
| "num_tokens": 39824965.0, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.18163334462893935, |
| "grad_norm": 1.0375652692946211, |
| "learning_rate": 1.638211382113821e-06, |
| "loss": 0.1725, |
| "mean_token_accuracy": 0.9472092837095261, |
| "num_tokens": 39973184.0, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.18231108098949508, |
| "grad_norm": 1.069543030178723, |
| "learning_rate": 1.6368563685636857e-06, |
| "loss": 0.1835, |
| "mean_token_accuracy": 0.9444401264190674, |
| "num_tokens": 40126749.0, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.18298881735005082, |
| "grad_norm": 2.6087291009006095, |
| "learning_rate": 1.6355013550135502e-06, |
| "loss": 0.1849, |
| "mean_token_accuracy": 0.9445175155997276, |
| "num_tokens": 40274296.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.18366655371060658, |
| "grad_norm": 1.145610129912151, |
| "learning_rate": 1.6341463414634145e-06, |
| "loss": 0.1763, |
| "mean_token_accuracy": 0.9467235654592514, |
| "num_tokens": 40424327.0, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.18434429007116232, |
| "grad_norm": 0.9442070840190216, |
| "learning_rate": 1.632791327913279e-06, |
| "loss": 0.1799, |
| "mean_token_accuracy": 0.9448632001876831, |
| "num_tokens": 40576373.0, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.18502202643171806, |
| "grad_norm": 1.1101817736320692, |
| "learning_rate": 1.6314363143631435e-06, |
| "loss": 0.1864, |
| "mean_token_accuracy": 0.9435540661215782, |
| "num_tokens": 40726947.0, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.1856997627922738, |
| "grad_norm": 1.0278047553144887, |
| "learning_rate": 1.630081300813008e-06, |
| "loss": 0.1857, |
| "mean_token_accuracy": 0.9442450702190399, |
| "num_tokens": 40881512.0, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.18637749915282956, |
| "grad_norm": 1.0047281423735306, |
| "learning_rate": 1.6287262872628726e-06, |
| "loss": 0.1781, |
| "mean_token_accuracy": 0.9466114342212677, |
| "num_tokens": 41032071.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.1870552355133853, |
| "grad_norm": 0.9207829057447015, |
| "learning_rate": 1.6273712737127369e-06, |
| "loss": 0.1793, |
| "mean_token_accuracy": 0.9454649612307549, |
| "num_tokens": 41178572.0, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.18773297187394103, |
| "grad_norm": 1.0459821704366787, |
| "learning_rate": 1.6260162601626016e-06, |
| "loss": 0.1714, |
| "mean_token_accuracy": 0.9479309245944023, |
| "num_tokens": 41326245.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.1884107082344968, |
| "grad_norm": 0.8895774959436806, |
| "learning_rate": 1.6246612466124661e-06, |
| "loss": 0.1895, |
| "mean_token_accuracy": 0.9422756433486938, |
| "num_tokens": 41477100.0, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.18908844459505253, |
| "grad_norm": 1.0128601516532234, |
| "learning_rate": 1.6233062330623307e-06, |
| "loss": 0.1849, |
| "mean_token_accuracy": 0.9443082809448242, |
| "num_tokens": 41630341.0, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.18976618095560827, |
| "grad_norm": 2.0424823903284413, |
| "learning_rate": 1.6219512195121952e-06, |
| "loss": 0.1851, |
| "mean_token_accuracy": 0.9452182948589325, |
| "num_tokens": 41780343.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.190443917316164, |
| "grad_norm": 0.8145971834620163, |
| "learning_rate": 1.6205962059620595e-06, |
| "loss": 0.1792, |
| "mean_token_accuracy": 0.9458454251289368, |
| "num_tokens": 41933317.0, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.19112165367671977, |
| "grad_norm": 1.078154403405584, |
| "learning_rate": 1.619241192411924e-06, |
| "loss": 0.182, |
| "mean_token_accuracy": 0.9441016316413879, |
| "num_tokens": 42079542.0, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.1917993900372755, |
| "grad_norm": 0.8513071000906206, |
| "learning_rate": 1.6178861788617885e-06, |
| "loss": 0.1886, |
| "mean_token_accuracy": 0.9428616538643837, |
| "num_tokens": 42228028.0, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.19247712639783124, |
| "grad_norm": 0.8207607308186832, |
| "learning_rate": 1.616531165311653e-06, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.9433973506093025, |
| "num_tokens": 42377327.0, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.19315486275838697, |
| "grad_norm": 0.9210360751803185, |
| "learning_rate": 1.6151761517615173e-06, |
| "loss": 0.182, |
| "mean_token_accuracy": 0.9449851289391518, |
| "num_tokens": 42529591.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.19383259911894274, |
| "grad_norm": 1.2912492016134, |
| "learning_rate": 1.613821138211382e-06, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.9434697777032852, |
| "num_tokens": 42676420.0, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.19451033547949848, |
| "grad_norm": 0.8456164080511251, |
| "learning_rate": 1.6124661246612466e-06, |
| "loss": 0.1909, |
| "mean_token_accuracy": 0.9428432658314705, |
| "num_tokens": 42827210.0, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.1951880718400542, |
| "grad_norm": 0.9723505555850847, |
| "learning_rate": 1.6111111111111111e-06, |
| "loss": 0.1817, |
| "mean_token_accuracy": 0.9461212381720543, |
| "num_tokens": 42976660.0, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.19586580820060998, |
| "grad_norm": 0.8654379574437804, |
| "learning_rate": 1.6097560975609756e-06, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9445760920643806, |
| "num_tokens": 43125512.0, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.1965435445611657, |
| "grad_norm": 0.8652948188343491, |
| "learning_rate": 1.60840108401084e-06, |
| "loss": 0.1807, |
| "mean_token_accuracy": 0.9456342980265617, |
| "num_tokens": 43277205.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.19722128092172145, |
| "grad_norm": 0.8533413578650831, |
| "learning_rate": 1.6070460704607045e-06, |
| "loss": 0.1835, |
| "mean_token_accuracy": 0.9441514536738396, |
| "num_tokens": 43423965.0, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.19789901728227718, |
| "grad_norm": 2.515713642686306, |
| "learning_rate": 1.605691056910569e-06, |
| "loss": 0.1967, |
| "mean_token_accuracy": 0.9408286511898041, |
| "num_tokens": 43575854.0, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.19857675364283295, |
| "grad_norm": 0.795648973749908, |
| "learning_rate": 1.6043360433604335e-06, |
| "loss": 0.1813, |
| "mean_token_accuracy": 0.9449711665511131, |
| "num_tokens": 43721651.0, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.19925449000338868, |
| "grad_norm": 0.8159918175759459, |
| "learning_rate": 1.602981029810298e-06, |
| "loss": 0.1799, |
| "mean_token_accuracy": 0.9449080228805542, |
| "num_tokens": 43869979.0, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.19993222636394442, |
| "grad_norm": 0.8163105160373558, |
| "learning_rate": 1.6016260162601625e-06, |
| "loss": 0.1567, |
| "mean_token_accuracy": 0.952298603951931, |
| "num_tokens": 44018298.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.20060996272450016, |
| "grad_norm": 0.8492083507814324, |
| "learning_rate": 1.600271002710027e-06, |
| "loss": 0.1937, |
| "mean_token_accuracy": 0.9416857361793518, |
| "num_tokens": 44172138.0, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.20128769908505592, |
| "grad_norm": 0.8226773724748727, |
| "learning_rate": 1.5989159891598916e-06, |
| "loss": 0.1756, |
| "mean_token_accuracy": 0.9463808164000511, |
| "num_tokens": 44322425.0, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.20196543544561166, |
| "grad_norm": 0.8167617610295194, |
| "learning_rate": 1.597560975609756e-06, |
| "loss": 0.1752, |
| "mean_token_accuracy": 0.9460206627845764, |
| "num_tokens": 44471714.0, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.2026431718061674, |
| "grad_norm": 0.8121474009282322, |
| "learning_rate": 1.5962059620596204e-06, |
| "loss": 0.1793, |
| "mean_token_accuracy": 0.945349395275116, |
| "num_tokens": 44620706.0, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.20332090816672316, |
| "grad_norm": 0.8203925606623038, |
| "learning_rate": 1.594850948509485e-06, |
| "loss": 0.1798, |
| "mean_token_accuracy": 0.9457396641373634, |
| "num_tokens": 44768001.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2039986445272789, |
| "grad_norm": 1.1162539595032475, |
| "learning_rate": 1.5934959349593495e-06, |
| "loss": 0.183, |
| "mean_token_accuracy": 0.9439538642764091, |
| "num_tokens": 44917279.0, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.20467638088783463, |
| "grad_norm": 0.9438985030858859, |
| "learning_rate": 1.592140921409214e-06, |
| "loss": 0.175, |
| "mean_token_accuracy": 0.9461761340498924, |
| "num_tokens": 45064408.0, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.20535411724839037, |
| "grad_norm": 0.7513340518725357, |
| "learning_rate": 1.5907859078590787e-06, |
| "loss": 0.1854, |
| "mean_token_accuracy": 0.943101279437542, |
| "num_tokens": 45211245.0, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.20603185360894613, |
| "grad_norm": 0.7923678369199532, |
| "learning_rate": 1.589430894308943e-06, |
| "loss": 0.1961, |
| "mean_token_accuracy": 0.9411340057849884, |
| "num_tokens": 45360319.0, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.20670958996950187, |
| "grad_norm": 0.8234845112435402, |
| "learning_rate": 1.5880758807588075e-06, |
| "loss": 0.1754, |
| "mean_token_accuracy": 0.9461505860090256, |
| "num_tokens": 45508437.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.2073873263300576, |
| "grad_norm": 0.8035375957173344, |
| "learning_rate": 1.586720867208672e-06, |
| "loss": 0.175, |
| "mean_token_accuracy": 0.9456483274698257, |
| "num_tokens": 45657849.0, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.20806506269061334, |
| "grad_norm": 0.7798304235915292, |
| "learning_rate": 1.5853658536585366e-06, |
| "loss": 0.1781, |
| "mean_token_accuracy": 0.9458611235022545, |
| "num_tokens": 45806799.0, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.2087427990511691, |
| "grad_norm": 1.027733000497745, |
| "learning_rate": 1.5840108401084009e-06, |
| "loss": 0.1792, |
| "mean_token_accuracy": 0.9450256898999214, |
| "num_tokens": 45956514.0, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.20942053541172484, |
| "grad_norm": 0.9708179518638089, |
| "learning_rate": 1.5826558265582654e-06, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9480339214205742, |
| "num_tokens": 46101153.0, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.21009827177228058, |
| "grad_norm": 1.0101673484960263, |
| "learning_rate": 1.58130081300813e-06, |
| "loss": 0.1745, |
| "mean_token_accuracy": 0.9472529590129852, |
| "num_tokens": 46250830.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.21077600813283634, |
| "grad_norm": 0.8055269982671405, |
| "learning_rate": 1.5799457994579946e-06, |
| "loss": 0.1857, |
| "mean_token_accuracy": 0.9427757039666176, |
| "num_tokens": 46399689.0, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.21145374449339208, |
| "grad_norm": 0.8131150374890923, |
| "learning_rate": 1.5785907859078592e-06, |
| "loss": 0.1828, |
| "mean_token_accuracy": 0.9429053366184235, |
| "num_tokens": 46550048.0, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.2121314808539478, |
| "grad_norm": 0.9916815324998841, |
| "learning_rate": 1.5772357723577235e-06, |
| "loss": 0.1849, |
| "mean_token_accuracy": 0.9444558545947075, |
| "num_tokens": 46700214.0, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.21280921721450355, |
| "grad_norm": 0.8218886276826911, |
| "learning_rate": 1.575880758807588e-06, |
| "loss": 0.1757, |
| "mean_token_accuracy": 0.9461935609579086, |
| "num_tokens": 46850904.0, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.2134869535750593, |
| "grad_norm": 1.5051826126447845, |
| "learning_rate": 1.5745257452574525e-06, |
| "loss": 0.1942, |
| "mean_token_accuracy": 0.9406725689768791, |
| "num_tokens": 46995645.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.21416468993561505, |
| "grad_norm": 1.1804298128417674, |
| "learning_rate": 1.573170731707317e-06, |
| "loss": 0.1697, |
| "mean_token_accuracy": 0.9470863491296768, |
| "num_tokens": 47144834.0, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.21484242629617079, |
| "grad_norm": 1.8709738855807754, |
| "learning_rate": 1.5718157181571813e-06, |
| "loss": 0.1767, |
| "mean_token_accuracy": 0.9464762061834335, |
| "num_tokens": 47295935.0, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.21552016265672652, |
| "grad_norm": 0.9948866957339356, |
| "learning_rate": 1.5704607046070459e-06, |
| "loss": 0.1926, |
| "mean_token_accuracy": 0.9422658383846283, |
| "num_tokens": 47448767.0, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.21619789901728229, |
| "grad_norm": 0.6859056336958299, |
| "learning_rate": 1.5691056910569104e-06, |
| "loss": 0.1768, |
| "mean_token_accuracy": 0.9466421827673912, |
| "num_tokens": 47597776.0, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.21687563537783802, |
| "grad_norm": 0.9534355123013837, |
| "learning_rate": 1.5677506775067751e-06, |
| "loss": 0.1741, |
| "mean_token_accuracy": 0.9466301873326302, |
| "num_tokens": 47749593.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.21755337173839376, |
| "grad_norm": 0.6601057176223992, |
| "learning_rate": 1.5663956639566396e-06, |
| "loss": 0.1706, |
| "mean_token_accuracy": 0.9476458579301834, |
| "num_tokens": 47899535.0, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.21823110809894952, |
| "grad_norm": 1.4849205670776673, |
| "learning_rate": 1.565040650406504e-06, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.9445011243224144, |
| "num_tokens": 48047352.0, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.21890884445950526, |
| "grad_norm": 0.743404066147728, |
| "learning_rate": 1.5636856368563685e-06, |
| "loss": 0.1804, |
| "mean_token_accuracy": 0.9452903419733047, |
| "num_tokens": 48197212.0, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.219586580820061, |
| "grad_norm": 0.6631613920135498, |
| "learning_rate": 1.562330623306233e-06, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9489706978201866, |
| "num_tokens": 48347691.0, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.22026431718061673, |
| "grad_norm": 0.6597316081566086, |
| "learning_rate": 1.5609756097560975e-06, |
| "loss": 0.1734, |
| "mean_token_accuracy": 0.9466495141386986, |
| "num_tokens": 48494182.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.2209420535411725, |
| "grad_norm": 0.7592815379964866, |
| "learning_rate": 1.559620596205962e-06, |
| "loss": 0.1875, |
| "mean_token_accuracy": 0.9426940456032753, |
| "num_tokens": 48644558.0, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.22161978990172823, |
| "grad_norm": 0.7647348178811313, |
| "learning_rate": 1.5582655826558263e-06, |
| "loss": 0.1738, |
| "mean_token_accuracy": 0.9461934119462967, |
| "num_tokens": 48795672.0, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.22229752626228397, |
| "grad_norm": 0.7808653399303861, |
| "learning_rate": 1.556910569105691e-06, |
| "loss": 0.1751, |
| "mean_token_accuracy": 0.9463725537061691, |
| "num_tokens": 48943974.0, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.2229752626228397, |
| "grad_norm": 0.756952990843898, |
| "learning_rate": 1.5555555555555556e-06, |
| "loss": 0.175, |
| "mean_token_accuracy": 0.9455836415290833, |
| "num_tokens": 49098021.0, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.22365299898339547, |
| "grad_norm": 0.709020202356822, |
| "learning_rate": 1.55420054200542e-06, |
| "loss": 0.1772, |
| "mean_token_accuracy": 0.9447627812623978, |
| "num_tokens": 49244636.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.2243307353439512, |
| "grad_norm": 2.5406344783012953, |
| "learning_rate": 1.5528455284552844e-06, |
| "loss": 0.1845, |
| "mean_token_accuracy": 0.9430216625332832, |
| "num_tokens": 49393214.0, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.22500847170450694, |
| "grad_norm": 1.8544287599202158, |
| "learning_rate": 1.551490514905149e-06, |
| "loss": 0.1702, |
| "mean_token_accuracy": 0.9466702789068222, |
| "num_tokens": 49543539.0, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.2256862080650627, |
| "grad_norm": 1.2263258425274202, |
| "learning_rate": 1.5501355013550134e-06, |
| "loss": 0.1733, |
| "mean_token_accuracy": 0.9458054676651955, |
| "num_tokens": 49694060.0, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.22636394442561844, |
| "grad_norm": 0.8958912135833915, |
| "learning_rate": 1.548780487804878e-06, |
| "loss": 0.1793, |
| "mean_token_accuracy": 0.9444833248853683, |
| "num_tokens": 49838822.0, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.22704168078617418, |
| "grad_norm": 0.7500636106488427, |
| "learning_rate": 1.5474254742547425e-06, |
| "loss": 0.171, |
| "mean_token_accuracy": 0.9464510008692741, |
| "num_tokens": 49993407.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.2277194171467299, |
| "grad_norm": 0.8422231147582026, |
| "learning_rate": 1.5460704607046068e-06, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9492187052965164, |
| "num_tokens": 50139231.0, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.22839715350728568, |
| "grad_norm": 0.7618479553430881, |
| "learning_rate": 1.5447154471544715e-06, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9489440247416496, |
| "num_tokens": 50285172.0, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.2290748898678414, |
| "grad_norm": 3.9635844853206668, |
| "learning_rate": 1.543360433604336e-06, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9444872289896011, |
| "num_tokens": 50434641.0, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.22975262622839715, |
| "grad_norm": 1.5173197037721815, |
| "learning_rate": 1.5420054200542006e-06, |
| "loss": 0.1751, |
| "mean_token_accuracy": 0.9459080919623375, |
| "num_tokens": 50586423.0, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.2304303625889529, |
| "grad_norm": 0.8715123124413551, |
| "learning_rate": 1.540650406504065e-06, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9437100365757942, |
| "num_tokens": 50739282.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.23110809894950865, |
| "grad_norm": 0.713851985898761, |
| "learning_rate": 1.5392953929539294e-06, |
| "loss": 0.1737, |
| "mean_token_accuracy": 0.9452726021409035, |
| "num_tokens": 50889759.0, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.2317858353100644, |
| "grad_norm": 1.7227816114940597, |
| "learning_rate": 1.537940379403794e-06, |
| "loss": 0.1782, |
| "mean_token_accuracy": 0.9447159990668297, |
| "num_tokens": 51039115.0, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.23246357167062012, |
| "grad_norm": 0.6357869321933041, |
| "learning_rate": 1.5365853658536584e-06, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9484410360455513, |
| "num_tokens": 51191408.0, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.23314130803117586, |
| "grad_norm": 0.7641072965945757, |
| "learning_rate": 1.535230352303523e-06, |
| "loss": 0.1788, |
| "mean_token_accuracy": 0.9448902904987335, |
| "num_tokens": 51342797.0, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.23381904439173162, |
| "grad_norm": 1.426367810145877, |
| "learning_rate": 1.5338753387533875e-06, |
| "loss": 0.1669, |
| "mean_token_accuracy": 0.9472499415278435, |
| "num_tokens": 51494034.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.23449678075228736, |
| "grad_norm": 0.8277340659798751, |
| "learning_rate": 1.532520325203252e-06, |
| "loss": 0.1779, |
| "mean_token_accuracy": 0.9454124942421913, |
| "num_tokens": 51645287.0, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.2351745171128431, |
| "grad_norm": 0.6225560087724911, |
| "learning_rate": 1.5311653116531165e-06, |
| "loss": 0.1782, |
| "mean_token_accuracy": 0.944624200463295, |
| "num_tokens": 51794927.0, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.23585225347339886, |
| "grad_norm": 0.6105718263558104, |
| "learning_rate": 1.529810298102981e-06, |
| "loss": 0.1676, |
| "mean_token_accuracy": 0.9470618143677711, |
| "num_tokens": 51949310.0, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.2365299898339546, |
| "grad_norm": 0.8282440344375993, |
| "learning_rate": 1.5284552845528455e-06, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.9437334463000298, |
| "num_tokens": 52098351.0, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.23720772619451033, |
| "grad_norm": 0.8045671772384535, |
| "learning_rate": 1.5271002710027099e-06, |
| "loss": 0.1684, |
| "mean_token_accuracy": 0.9471992254257202, |
| "num_tokens": 52243926.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.23788546255506607, |
| "grad_norm": 0.7650637135387357, |
| "learning_rate": 1.5257452574525744e-06, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.9483982622623444, |
| "num_tokens": 52392176.0, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.23856319891562183, |
| "grad_norm": 0.5877138058616375, |
| "learning_rate": 1.524390243902439e-06, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.9483258500695229, |
| "num_tokens": 52539091.0, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.23924093527617757, |
| "grad_norm": 0.9492412162850925, |
| "learning_rate": 1.5230352303523036e-06, |
| "loss": 0.1798, |
| "mean_token_accuracy": 0.94503004103899, |
| "num_tokens": 52685643.0, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.2399186716367333, |
| "grad_norm": 0.7914693335736157, |
| "learning_rate": 1.521680216802168e-06, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.9475009217858315, |
| "num_tokens": 52836703.0, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.24059640799728904, |
| "grad_norm": 0.6318669285353299, |
| "learning_rate": 1.5203252032520325e-06, |
| "loss": 0.1826, |
| "mean_token_accuracy": 0.9431716948747635, |
| "num_tokens": 52987407.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.2412741443578448, |
| "grad_norm": 0.6525553458237484, |
| "learning_rate": 1.518970189701897e-06, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.9459177628159523, |
| "num_tokens": 53138015.0, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.24195188071840054, |
| "grad_norm": 1.0182590701495176, |
| "learning_rate": 1.5176151761517615e-06, |
| "loss": 0.1714, |
| "mean_token_accuracy": 0.9465559273958206, |
| "num_tokens": 53286706.0, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.24262961707895628, |
| "grad_norm": 0.7419379260166533, |
| "learning_rate": 1.516260162601626e-06, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.9479071423411369, |
| "num_tokens": 53435927.0, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.24330735343951204, |
| "grad_norm": 0.6434143063693157, |
| "learning_rate": 1.5149051490514903e-06, |
| "loss": 0.1639, |
| "mean_token_accuracy": 0.9488667771220207, |
| "num_tokens": 53583285.0, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.24398508980006778, |
| "grad_norm": 0.7064489736323913, |
| "learning_rate": 1.5135501355013548e-06, |
| "loss": 0.1712, |
| "mean_token_accuracy": 0.9466379284858704, |
| "num_tokens": 53729652.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.24466282616062351, |
| "grad_norm": 0.57356802923614, |
| "learning_rate": 1.5121951219512194e-06, |
| "loss": 0.1814, |
| "mean_token_accuracy": 0.9437981992959976, |
| "num_tokens": 53879309.0, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.24534056252117925, |
| "grad_norm": 0.6642508244898545, |
| "learning_rate": 1.510840108401084e-06, |
| "loss": 0.1743, |
| "mean_token_accuracy": 0.945423923432827, |
| "num_tokens": 54026614.0, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.24601829888173501, |
| "grad_norm": 0.7069645718402103, |
| "learning_rate": 1.5094850948509486e-06, |
| "loss": 0.1613, |
| "mean_token_accuracy": 0.9490662589669228, |
| "num_tokens": 54176925.0, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.24669603524229075, |
| "grad_norm": 0.6324630784607443, |
| "learning_rate": 1.508130081300813e-06, |
| "loss": 0.1789, |
| "mean_token_accuracy": 0.9442641958594322, |
| "num_tokens": 54325352.0, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.2473737716028465, |
| "grad_norm": 0.8384176307680217, |
| "learning_rate": 1.5067750677506774e-06, |
| "loss": 0.1807, |
| "mean_token_accuracy": 0.9429530650377274, |
| "num_tokens": 54470233.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.24805150796340222, |
| "grad_norm": 0.6954590281573189, |
| "learning_rate": 1.505420054200542e-06, |
| "loss": 0.1755, |
| "mean_token_accuracy": 0.9451927840709686, |
| "num_tokens": 54618289.0, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.248729244323958, |
| "grad_norm": 0.6399857892270593, |
| "learning_rate": 1.5040650406504065e-06, |
| "loss": 0.1776, |
| "mean_token_accuracy": 0.9442391991615295, |
| "num_tokens": 54769472.0, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.24940698068451372, |
| "grad_norm": 0.5810678934700249, |
| "learning_rate": 1.5027100271002708e-06, |
| "loss": 0.1806, |
| "mean_token_accuracy": 0.9441807121038437, |
| "num_tokens": 54918059.0, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.25008471704506946, |
| "grad_norm": 0.5748648220171113, |
| "learning_rate": 1.5013550135501353e-06, |
| "loss": 0.1702, |
| "mean_token_accuracy": 0.9472423642873764, |
| "num_tokens": 55061935.0, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.2507624534056252, |
| "grad_norm": 0.6204651627049123, |
| "learning_rate": 1.5e-06, |
| "loss": 0.1779, |
| "mean_token_accuracy": 0.9443566277623177, |
| "num_tokens": 55207909.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.25144018976618093, |
| "grad_norm": 1.9313730908634967, |
| "learning_rate": 1.4986449864498646e-06, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9468053802847862, |
| "num_tokens": 55359342.0, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.2521179261267367, |
| "grad_norm": 0.7053643966816144, |
| "learning_rate": 1.497289972899729e-06, |
| "loss": 0.1694, |
| "mean_token_accuracy": 0.9480549320578575, |
| "num_tokens": 55509924.0, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.25279566248729246, |
| "grad_norm": 0.5711946424889177, |
| "learning_rate": 1.4959349593495934e-06, |
| "loss": 0.1761, |
| "mean_token_accuracy": 0.9459714740514755, |
| "num_tokens": 55655563.0, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.2534733988478482, |
| "grad_norm": 0.7057929562458208, |
| "learning_rate": 1.494579945799458e-06, |
| "loss": 0.1861, |
| "mean_token_accuracy": 0.9424010515213013, |
| "num_tokens": 55805408.0, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.25415113520840393, |
| "grad_norm": 0.6779480981631576, |
| "learning_rate": 1.4932249322493224e-06, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9481048583984375, |
| "num_tokens": 55954355.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.25482887156895967, |
| "grad_norm": 0.6446360496909933, |
| "learning_rate": 1.491869918699187e-06, |
| "loss": 0.1738, |
| "mean_token_accuracy": 0.9460531622171402, |
| "num_tokens": 56104752.0, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.2555066079295154, |
| "grad_norm": 0.5592091503855612, |
| "learning_rate": 1.4905149051490513e-06, |
| "loss": 0.1686, |
| "mean_token_accuracy": 0.9460347890853882, |
| "num_tokens": 56253865.0, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.25618434429007114, |
| "grad_norm": 0.7691679868780944, |
| "learning_rate": 1.4891598915989158e-06, |
| "loss": 0.1762, |
| "mean_token_accuracy": 0.9456475153565407, |
| "num_tokens": 56401545.0, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.25686208065062693, |
| "grad_norm": 0.5847739125186556, |
| "learning_rate": 1.4878048780487805e-06, |
| "loss": 0.1744, |
| "mean_token_accuracy": 0.9454444199800491, |
| "num_tokens": 56548503.0, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.25753981701118267, |
| "grad_norm": 0.736275038780874, |
| "learning_rate": 1.486449864498645e-06, |
| "loss": 0.1673, |
| "mean_token_accuracy": 0.9481697604060173, |
| "num_tokens": 56698713.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.2582175533717384, |
| "grad_norm": 0.5901455728571308, |
| "learning_rate": 1.4850948509485095e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9482420459389687, |
| "num_tokens": 56842679.0, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.25889528973229414, |
| "grad_norm": 0.6090115372442366, |
| "learning_rate": 1.4837398373983739e-06, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9481484293937683, |
| "num_tokens": 56990745.0, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.2595730260928499, |
| "grad_norm": 0.617661866412643, |
| "learning_rate": 1.4823848238482384e-06, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9488430246710777, |
| "num_tokens": 57139222.0, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.2602507624534056, |
| "grad_norm": 0.700865862234029, |
| "learning_rate": 1.4810298102981029e-06, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9490372464060783, |
| "num_tokens": 57285381.0, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.26092849881396135, |
| "grad_norm": 0.6651593095325409, |
| "learning_rate": 1.4796747967479674e-06, |
| "loss": 0.1764, |
| "mean_token_accuracy": 0.9455696046352386, |
| "num_tokens": 57436822.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.2616062351745171, |
| "grad_norm": 0.5245056227383542, |
| "learning_rate": 1.478319783197832e-06, |
| "loss": 0.164, |
| "mean_token_accuracy": 0.9481494948267937, |
| "num_tokens": 57580522.0, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.2622839715350729, |
| "grad_norm": 0.7261567302725815, |
| "learning_rate": 1.4769647696476962e-06, |
| "loss": 0.1735, |
| "mean_token_accuracy": 0.9464136362075806, |
| "num_tokens": 57726389.0, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.2629617078956286, |
| "grad_norm": 0.5153986387448213, |
| "learning_rate": 1.475609756097561e-06, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9486708492040634, |
| "num_tokens": 57876226.0, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.26363944425618435, |
| "grad_norm": 0.5414010058573772, |
| "learning_rate": 1.4742547425474255e-06, |
| "loss": 0.1772, |
| "mean_token_accuracy": 0.9455103054642677, |
| "num_tokens": 58027117.0, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.2643171806167401, |
| "grad_norm": 0.628858128487318, |
| "learning_rate": 1.47289972899729e-06, |
| "loss": 0.1718, |
| "mean_token_accuracy": 0.9465411677956581, |
| "num_tokens": 58173593.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.2649949169772958, |
| "grad_norm": 1.1610350160200895, |
| "learning_rate": 1.4715447154471543e-06, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.9406460449099541, |
| "num_tokens": 58316482.0, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.26567265333785156, |
| "grad_norm": 0.5892258484112674, |
| "learning_rate": 1.4701897018970188e-06, |
| "loss": 0.1781, |
| "mean_token_accuracy": 0.9436300992965698, |
| "num_tokens": 58467767.0, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.2663503896984073, |
| "grad_norm": 2.1658319104198, |
| "learning_rate": 1.4688346883468834e-06, |
| "loss": 0.1837, |
| "mean_token_accuracy": 0.9428680464625359, |
| "num_tokens": 58619811.0, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.2670281260589631, |
| "grad_norm": 0.5878236554873534, |
| "learning_rate": 1.4674796747967479e-06, |
| "loss": 0.1675, |
| "mean_token_accuracy": 0.947679303586483, |
| "num_tokens": 58770262.0, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.2677058624195188, |
| "grad_norm": 1.014211328981339, |
| "learning_rate": 1.4661246612466124e-06, |
| "loss": 0.1708, |
| "mean_token_accuracy": 0.9462408125400543, |
| "num_tokens": 58918504.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.26838359878007456, |
| "grad_norm": 0.573535814041953, |
| "learning_rate": 1.464769647696477e-06, |
| "loss": 0.1829, |
| "mean_token_accuracy": 0.9431460350751877, |
| "num_tokens": 59069941.0, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.2690613351406303, |
| "grad_norm": 0.5211265319964212, |
| "learning_rate": 1.4634146341463414e-06, |
| "loss": 0.1696, |
| "mean_token_accuracy": 0.9468951299786568, |
| "num_tokens": 59217661.0, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.26973907150118603, |
| "grad_norm": 0.47731716171111105, |
| "learning_rate": 1.462059620596206e-06, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9483945071697235, |
| "num_tokens": 59368209.0, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.27041680786174177, |
| "grad_norm": 0.6369462410545568, |
| "learning_rate": 1.4607046070460705e-06, |
| "loss": 0.177, |
| "mean_token_accuracy": 0.9451368376612663, |
| "num_tokens": 59516528.0, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.2710945442222975, |
| "grad_norm": 0.6314122894037728, |
| "learning_rate": 1.4593495934959348e-06, |
| "loss": 0.1743, |
| "mean_token_accuracy": 0.9454843327403069, |
| "num_tokens": 59670956.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.27177228058285324, |
| "grad_norm": 0.6436872262414639, |
| "learning_rate": 1.4579945799457993e-06, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9488751068711281, |
| "num_tokens": 59818251.0, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.27245001694340903, |
| "grad_norm": 0.7202028539827626, |
| "learning_rate": 1.4566395663956638e-06, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9471313506364822, |
| "num_tokens": 59963581.0, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.27312775330396477, |
| "grad_norm": 0.6133438385683547, |
| "learning_rate": 1.4552845528455283e-06, |
| "loss": 0.1754, |
| "mean_token_accuracy": 0.9452999755740166, |
| "num_tokens": 60112835.0, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.2738054896645205, |
| "grad_norm": 0.5430933799454143, |
| "learning_rate": 1.453929539295393e-06, |
| "loss": 0.1785, |
| "mean_token_accuracy": 0.9441541954874992, |
| "num_tokens": 60262081.0, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.27448322602507624, |
| "grad_norm": 0.5688106499270535, |
| "learning_rate": 1.4525745257452574e-06, |
| "loss": 0.1769, |
| "mean_token_accuracy": 0.9446316137909889, |
| "num_tokens": 60406861.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.275160962385632, |
| "grad_norm": 0.7473940935553454, |
| "learning_rate": 1.451219512195122e-06, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.9457308053970337, |
| "num_tokens": 60559494.0, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.2758386987461877, |
| "grad_norm": 0.801132251747668, |
| "learning_rate": 1.4498644986449864e-06, |
| "loss": 0.1745, |
| "mean_token_accuracy": 0.9456846341490746, |
| "num_tokens": 60710003.0, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.27651643510674345, |
| "grad_norm": 0.6208498730487398, |
| "learning_rate": 1.448509485094851e-06, |
| "loss": 0.1764, |
| "mean_token_accuracy": 0.9447103142738342, |
| "num_tokens": 60856286.0, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.27719417146729924, |
| "grad_norm": 0.6774623538519904, |
| "learning_rate": 1.4471544715447155e-06, |
| "loss": 0.1684, |
| "mean_token_accuracy": 0.9472365826368332, |
| "num_tokens": 61006327.0, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.277871907827855, |
| "grad_norm": 0.6357023280654596, |
| "learning_rate": 1.4457994579945798e-06, |
| "loss": 0.1748, |
| "mean_token_accuracy": 0.9449824169278145, |
| "num_tokens": 61155663.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.2785496441884107, |
| "grad_norm": 0.586651942204708, |
| "learning_rate": 1.4444444444444443e-06, |
| "loss": 0.1809, |
| "mean_token_accuracy": 0.9449451044201851, |
| "num_tokens": 61306989.0, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.27922738054896645, |
| "grad_norm": 0.5943615384568424, |
| "learning_rate": 1.4430894308943088e-06, |
| "loss": 0.1725, |
| "mean_token_accuracy": 0.945733904838562, |
| "num_tokens": 61454320.0, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.2799051169095222, |
| "grad_norm": 0.535959689403948, |
| "learning_rate": 1.4417344173441735e-06, |
| "loss": 0.1764, |
| "mean_token_accuracy": 0.9447762593626976, |
| "num_tokens": 61604384.0, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.2805828532700779, |
| "grad_norm": 1.1108469224632183, |
| "learning_rate": 1.4403794037940378e-06, |
| "loss": 0.1881, |
| "mean_token_accuracy": 0.9409819021821022, |
| "num_tokens": 61749668.0, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.28126058963063366, |
| "grad_norm": 1.4220143979516637, |
| "learning_rate": 1.4390243902439024e-06, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.948139026761055, |
| "num_tokens": 61896561.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.28193832599118945, |
| "grad_norm": 5.664714263912324, |
| "learning_rate": 1.4376693766937669e-06, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9476560726761818, |
| "num_tokens": 62046096.0, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.2826160623517452, |
| "grad_norm": 0.5408574541103001, |
| "learning_rate": 1.4363143631436314e-06, |
| "loss": 0.1737, |
| "mean_token_accuracy": 0.9460098519921303, |
| "num_tokens": 62196352.0, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.2832937987123009, |
| "grad_norm": 0.5171848037398799, |
| "learning_rate": 1.434959349593496e-06, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.947636604309082, |
| "num_tokens": 62342590.0, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.28397153507285666, |
| "grad_norm": 0.7681427596162449, |
| "learning_rate": 1.4336043360433602e-06, |
| "loss": 0.1765, |
| "mean_token_accuracy": 0.9449510797858238, |
| "num_tokens": 62492065.0, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.2846492714334124, |
| "grad_norm": 0.5576209119650561, |
| "learning_rate": 1.4322493224932248e-06, |
| "loss": 0.1714, |
| "mean_token_accuracy": 0.9463492035865784, |
| "num_tokens": 62642728.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.28532700779396813, |
| "grad_norm": 1.554946021967147, |
| "learning_rate": 1.4308943089430895e-06, |
| "loss": 0.1698, |
| "mean_token_accuracy": 0.9462010189890862, |
| "num_tokens": 62791436.0, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.28600474415452387, |
| "grad_norm": 0.5922041226434441, |
| "learning_rate": 1.429539295392954e-06, |
| "loss": 0.1694, |
| "mean_token_accuracy": 0.9464942514896393, |
| "num_tokens": 62936005.0, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.2866824805150796, |
| "grad_norm": 0.5037684488003914, |
| "learning_rate": 1.4281842818428185e-06, |
| "loss": 0.1739, |
| "mean_token_accuracy": 0.9458468109369278, |
| "num_tokens": 63082667.0, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.2873602168756354, |
| "grad_norm": 1.6124042876955846, |
| "learning_rate": 1.4268292682926828e-06, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9480357691645622, |
| "num_tokens": 63236677.0, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.28803795323619114, |
| "grad_norm": 0.5859070705046109, |
| "learning_rate": 1.4254742547425473e-06, |
| "loss": 0.1826, |
| "mean_token_accuracy": 0.9433834180235863, |
| "num_tokens": 63386648.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.28871568959674687, |
| "grad_norm": 0.5369683321519448, |
| "learning_rate": 1.4241192411924119e-06, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.9470108896493912, |
| "num_tokens": 63535840.0, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.2893934259573026, |
| "grad_norm": 0.564420602002199, |
| "learning_rate": 1.4227642276422764e-06, |
| "loss": 0.1705, |
| "mean_token_accuracy": 0.9462280347943306, |
| "num_tokens": 63682575.0, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.29007116231785834, |
| "grad_norm": 0.5839667069470125, |
| "learning_rate": 1.4214092140921407e-06, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.9467422887682915, |
| "num_tokens": 63831495.0, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.2907488986784141, |
| "grad_norm": 0.9956637241221521, |
| "learning_rate": 1.4200542005420052e-06, |
| "loss": 0.1682, |
| "mean_token_accuracy": 0.9462997168302536, |
| "num_tokens": 63979852.0, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.2914266350389698, |
| "grad_norm": 0.5704586511896154, |
| "learning_rate": 1.41869918699187e-06, |
| "loss": 0.1746, |
| "mean_token_accuracy": 0.9454977512359619, |
| "num_tokens": 64129698.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.2921043713995256, |
| "grad_norm": 1.871088990317711, |
| "learning_rate": 1.4173441734417345e-06, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9499670192599297, |
| "num_tokens": 64279200.0, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.29278210776008134, |
| "grad_norm": 1.5476889433417227, |
| "learning_rate": 1.415989159891599e-06, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.9459821656346321, |
| "num_tokens": 64425862.0, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.2934598441206371, |
| "grad_norm": 0.5123528384746117, |
| "learning_rate": 1.4146341463414633e-06, |
| "loss": 0.1533, |
| "mean_token_accuracy": 0.9508016780018806, |
| "num_tokens": 64571479.0, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.2941375804811928, |
| "grad_norm": 0.5428371445057838, |
| "learning_rate": 1.4132791327913278e-06, |
| "loss": 0.1758, |
| "mean_token_accuracy": 0.9448861479759216, |
| "num_tokens": 64724369.0, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.29481531684174855, |
| "grad_norm": 0.5636544542076198, |
| "learning_rate": 1.4119241192411923e-06, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9479606598615646, |
| "num_tokens": 64874118.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.2954930532023043, |
| "grad_norm": 0.6426992034644753, |
| "learning_rate": 1.4105691056910569e-06, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9485922604799271, |
| "num_tokens": 65022090.0, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.29617078956286, |
| "grad_norm": 0.5686534096575024, |
| "learning_rate": 1.4092140921409212e-06, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.947353720664978, |
| "num_tokens": 65173700.0, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.2968485259234158, |
| "grad_norm": 0.663170523983989, |
| "learning_rate": 1.4078590785907859e-06, |
| "loss": 0.1683, |
| "mean_token_accuracy": 0.9468541517853737, |
| "num_tokens": 65321565.0, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.29752626228397155, |
| "grad_norm": 0.9033226102569267, |
| "learning_rate": 1.4065040650406504e-06, |
| "loss": 0.1769, |
| "mean_token_accuracy": 0.9448807463049889, |
| "num_tokens": 65471086.0, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.2982039986445273, |
| "grad_norm": 0.6238168471167909, |
| "learning_rate": 1.405149051490515e-06, |
| "loss": 0.1799, |
| "mean_token_accuracy": 0.9433454647660255, |
| "num_tokens": 65622215.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.298881735005083, |
| "grad_norm": 0.65657877294606, |
| "learning_rate": 1.4037940379403795e-06, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9487375542521477, |
| "num_tokens": 65773020.0, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.29955947136563876, |
| "grad_norm": 0.5451737092122348, |
| "learning_rate": 1.4024390243902438e-06, |
| "loss": 0.1751, |
| "mean_token_accuracy": 0.9447740465402603, |
| "num_tokens": 65925286.0, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.3002372077261945, |
| "grad_norm": 0.9603213545168497, |
| "learning_rate": 1.4010840108401083e-06, |
| "loss": 0.1771, |
| "mean_token_accuracy": 0.944790817797184, |
| "num_tokens": 66071253.0, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.30091494408675024, |
| "grad_norm": 0.5051328972393605, |
| "learning_rate": 1.3997289972899728e-06, |
| "loss": 0.1694, |
| "mean_token_accuracy": 0.9472986534237862, |
| "num_tokens": 66221084.0, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.30159268044730597, |
| "grad_norm": 0.5285524624142262, |
| "learning_rate": 1.3983739837398373e-06, |
| "loss": 0.172, |
| "mean_token_accuracy": 0.9456663802266121, |
| "num_tokens": 66374673.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.30227041680786176, |
| "grad_norm": 0.5569235706497918, |
| "learning_rate": 1.3970189701897018e-06, |
| "loss": 0.1556, |
| "mean_token_accuracy": 0.9494876861572266, |
| "num_tokens": 66523000.0, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.3029481531684175, |
| "grad_norm": 0.6230296052835673, |
| "learning_rate": 1.3956639566395664e-06, |
| "loss": 0.1759, |
| "mean_token_accuracy": 0.9450423941016197, |
| "num_tokens": 66671260.0, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.30362588952897324, |
| "grad_norm": 0.7398089733547972, |
| "learning_rate": 1.3943089430894309e-06, |
| "loss": 0.1686, |
| "mean_token_accuracy": 0.9479307159781456, |
| "num_tokens": 66820184.0, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.304303625889529, |
| "grad_norm": 0.5061983222632233, |
| "learning_rate": 1.3929539295392954e-06, |
| "loss": 0.1655, |
| "mean_token_accuracy": 0.9474298283457756, |
| "num_tokens": 66965488.0, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.3049813622500847, |
| "grad_norm": 0.5750619870591818, |
| "learning_rate": 1.39159891598916e-06, |
| "loss": 0.1741, |
| "mean_token_accuracy": 0.9448145478963852, |
| "num_tokens": 67114457.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.30565909861064045, |
| "grad_norm": 0.6895145752098988, |
| "learning_rate": 1.3902439024390242e-06, |
| "loss": 0.1717, |
| "mean_token_accuracy": 0.9465706199407578, |
| "num_tokens": 67267546.0, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.3063368349711962, |
| "grad_norm": 0.5779351191860994, |
| "learning_rate": 1.3888888888888887e-06, |
| "loss": 0.1793, |
| "mean_token_accuracy": 0.9438166320323944, |
| "num_tokens": 67414865.0, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.307014571331752, |
| "grad_norm": 0.5393943677688567, |
| "learning_rate": 1.3875338753387533e-06, |
| "loss": 0.1743, |
| "mean_token_accuracy": 0.9451542571187019, |
| "num_tokens": 67561255.0, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.3076923076923077, |
| "grad_norm": 0.516057277111362, |
| "learning_rate": 1.3861788617886178e-06, |
| "loss": 0.1699, |
| "mean_token_accuracy": 0.9466835260391235, |
| "num_tokens": 67713911.0, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.30837004405286345, |
| "grad_norm": 0.5651895866813262, |
| "learning_rate": 1.3848238482384825e-06, |
| "loss": 0.1837, |
| "mean_token_accuracy": 0.9428005740046501, |
| "num_tokens": 67862459.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.3090477804134192, |
| "grad_norm": 1.8607347681619975, |
| "learning_rate": 1.3834688346883468e-06, |
| "loss": 0.1669, |
| "mean_token_accuracy": 0.9473821371793747, |
| "num_tokens": 68015157.0, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.3097255167739749, |
| "grad_norm": 0.6787472041403066, |
| "learning_rate": 1.3821138211382113e-06, |
| "loss": 0.171, |
| "mean_token_accuracy": 0.9469644278287888, |
| "num_tokens": 68163700.0, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.31040325313453065, |
| "grad_norm": 0.5242628755897727, |
| "learning_rate": 1.3807588075880759e-06, |
| "loss": 0.1797, |
| "mean_token_accuracy": 0.9444685280323029, |
| "num_tokens": 68313252.0, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.3110809894950864, |
| "grad_norm": 0.6132130405405823, |
| "learning_rate": 1.3794037940379404e-06, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9476732388138771, |
| "num_tokens": 68465402.0, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.3117587258556422, |
| "grad_norm": 0.6238582052168113, |
| "learning_rate": 1.3780487804878047e-06, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9477816596627235, |
| "num_tokens": 68617127.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.3124364622161979, |
| "grad_norm": 0.5029242618624353, |
| "learning_rate": 1.3766937669376692e-06, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9483131021261215, |
| "num_tokens": 68767063.0, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.31311419857675366, |
| "grad_norm": 0.5515295054346878, |
| "learning_rate": 1.3753387533875337e-06, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9469945877790451, |
| "num_tokens": 68917560.0, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.3137919349373094, |
| "grad_norm": 0.6068528780075827, |
| "learning_rate": 1.3739837398373982e-06, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9475928917527199, |
| "num_tokens": 69068425.0, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.3144696712978651, |
| "grad_norm": 0.5545208173289021, |
| "learning_rate": 1.372628726287263e-06, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9473569318652153, |
| "num_tokens": 69217619.0, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.31514740765842086, |
| "grad_norm": 0.4733218472766618, |
| "learning_rate": 1.3712737127371273e-06, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9482020065188408, |
| "num_tokens": 69368394.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.3158251440189766, |
| "grad_norm": 0.4955677510087951, |
| "learning_rate": 1.3699186991869918e-06, |
| "loss": 0.1704, |
| "mean_token_accuracy": 0.9457170516252518, |
| "num_tokens": 69514936.0, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.31650288037953234, |
| "grad_norm": 0.7772199753715142, |
| "learning_rate": 1.3685636856368563e-06, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.947662316262722, |
| "num_tokens": 69668437.0, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.31718061674008813, |
| "grad_norm": 0.5756478947859188, |
| "learning_rate": 1.3672086720867208e-06, |
| "loss": 0.171, |
| "mean_token_accuracy": 0.9456316977739334, |
| "num_tokens": 69818454.0, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.31785835310064386, |
| "grad_norm": 0.8871875668287942, |
| "learning_rate": 1.3658536585365854e-06, |
| "loss": 0.1768, |
| "mean_token_accuracy": 0.9445038959383965, |
| "num_tokens": 69970526.0, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.3185360894611996, |
| "grad_norm": 0.547639293799732, |
| "learning_rate": 1.3644986449864497e-06, |
| "loss": 0.1772, |
| "mean_token_accuracy": 0.9447048604488373, |
| "num_tokens": 70118479.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.31921382582175534, |
| "grad_norm": 0.5942792200494496, |
| "learning_rate": 1.3631436314363142e-06, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9484960064291954, |
| "num_tokens": 70264592.0, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.3198915621823111, |
| "grad_norm": 0.8867268905803408, |
| "learning_rate": 1.361788617886179e-06, |
| "loss": 0.1804, |
| "mean_token_accuracy": 0.9441796317696571, |
| "num_tokens": 70413918.0, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.3205692985428668, |
| "grad_norm": 0.5964766492892464, |
| "learning_rate": 1.3604336043360434e-06, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9500296339392662, |
| "num_tokens": 70557619.0, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.32124703490342255, |
| "grad_norm": 0.496311307208984, |
| "learning_rate": 1.3590785907859078e-06, |
| "loss": 0.1725, |
| "mean_token_accuracy": 0.9461822360754013, |
| "num_tokens": 70712848.0, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.32192477126397834, |
| "grad_norm": 1.0881675555317671, |
| "learning_rate": 1.3577235772357723e-06, |
| "loss": 0.1696, |
| "mean_token_accuracy": 0.9468296840786934, |
| "num_tokens": 70860911.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.3226025076245341, |
| "grad_norm": 0.7588836627706635, |
| "learning_rate": 1.3563685636856368e-06, |
| "loss": 0.1767, |
| "mean_token_accuracy": 0.9439148157835007, |
| "num_tokens": 71007181.0, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.3232802439850898, |
| "grad_norm": 0.7597258614999529, |
| "learning_rate": 1.3550135501355013e-06, |
| "loss": 0.1676, |
| "mean_token_accuracy": 0.94772619754076, |
| "num_tokens": 71156741.0, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.32395798034564555, |
| "grad_norm": 0.5079000156085575, |
| "learning_rate": 1.3536585365853658e-06, |
| "loss": 0.186, |
| "mean_token_accuracy": 0.9424601569771767, |
| "num_tokens": 71305246.0, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.3246357167062013, |
| "grad_norm": 0.7809688395216372, |
| "learning_rate": 1.3523035230352301e-06, |
| "loss": 0.1532, |
| "mean_token_accuracy": 0.9514161571860313, |
| "num_tokens": 71456332.0, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.325313453066757, |
| "grad_norm": 1.8673061353136242, |
| "learning_rate": 1.3509485094850947e-06, |
| "loss": 0.1882, |
| "mean_token_accuracy": 0.9417855143547058, |
| "num_tokens": 71606935.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.32599118942731276, |
| "grad_norm": 0.48000691431047476, |
| "learning_rate": 1.3495934959349594e-06, |
| "loss": 0.159, |
| "mean_token_accuracy": 0.9492316693067551, |
| "num_tokens": 71757554.0, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.3266689257878685, |
| "grad_norm": 0.5014062000611049, |
| "learning_rate": 1.348238482384824e-06, |
| "loss": 0.1785, |
| "mean_token_accuracy": 0.943628765642643, |
| "num_tokens": 71903853.0, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.3273466621484243, |
| "grad_norm": 0.6987254900902669, |
| "learning_rate": 1.3468834688346884e-06, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.9468587040901184, |
| "num_tokens": 72054020.0, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.32802439850898, |
| "grad_norm": 0.5985256675867555, |
| "learning_rate": 1.3455284552845527e-06, |
| "loss": 0.171, |
| "mean_token_accuracy": 0.9468069896101952, |
| "num_tokens": 72206495.0, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.32870213486953576, |
| "grad_norm": 1.3290797142590758, |
| "learning_rate": 1.3441734417344173e-06, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9495629146695137, |
| "num_tokens": 72353685.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.3293798712300915, |
| "grad_norm": 0.5119654251077203, |
| "learning_rate": 1.3428184281842818e-06, |
| "loss": 0.1724, |
| "mean_token_accuracy": 0.9460138157010078, |
| "num_tokens": 72504327.0, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.33005760759064723, |
| "grad_norm": 0.4880586016116781, |
| "learning_rate": 1.3414634146341463e-06, |
| "loss": 0.1726, |
| "mean_token_accuracy": 0.945694237947464, |
| "num_tokens": 72654840.0, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.33073534395120296, |
| "grad_norm": 0.5043166922170219, |
| "learning_rate": 1.3401084010840106e-06, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9485412240028381, |
| "num_tokens": 72799372.0, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.3314130803117587, |
| "grad_norm": 0.535117165226787, |
| "learning_rate": 1.3387533875338753e-06, |
| "loss": 0.1695, |
| "mean_token_accuracy": 0.9474276155233383, |
| "num_tokens": 72948960.0, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.3320908166723145, |
| "grad_norm": 0.519763234225128, |
| "learning_rate": 1.3373983739837399e-06, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.9468537047505379, |
| "num_tokens": 73100760.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.33276855303287023, |
| "grad_norm": 0.5711715197411761, |
| "learning_rate": 1.3360433604336044e-06, |
| "loss": 0.183, |
| "mean_token_accuracy": 0.9430373981595039, |
| "num_tokens": 73246508.0, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.33344628939342597, |
| "grad_norm": 1.1701182936544605, |
| "learning_rate": 1.334688346883469e-06, |
| "loss": 0.1705, |
| "mean_token_accuracy": 0.947103701531887, |
| "num_tokens": 73397884.0, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.3341240257539817, |
| "grad_norm": 0.50623069387712, |
| "learning_rate": 1.3333333333333332e-06, |
| "loss": 0.1712, |
| "mean_token_accuracy": 0.9459755271673203, |
| "num_tokens": 73544836.0, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.33480176211453744, |
| "grad_norm": 0.5201173073498514, |
| "learning_rate": 1.3319783197831977e-06, |
| "loss": 0.1698, |
| "mean_token_accuracy": 0.9458015710115433, |
| "num_tokens": 73694370.0, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.3354794984750932, |
| "grad_norm": 0.5538352573416628, |
| "learning_rate": 1.3306233062330622e-06, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9487332031130791, |
| "num_tokens": 73844473.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.3361572348356489, |
| "grad_norm": 0.6168876470878009, |
| "learning_rate": 1.3292682926829268e-06, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.9499210342764854, |
| "num_tokens": 73996402.0, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.3368349711962047, |
| "grad_norm": 0.4550808255041631, |
| "learning_rate": 1.327913279132791e-06, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9480182603001595, |
| "num_tokens": 74149961.0, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.33751270755676044, |
| "grad_norm": 0.49242815426502456, |
| "learning_rate": 1.3265582655826558e-06, |
| "loss": 0.1742, |
| "mean_token_accuracy": 0.9449413493275642, |
| "num_tokens": 74300652.0, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.3381904439173162, |
| "grad_norm": 0.6538467978588217, |
| "learning_rate": 1.3252032520325203e-06, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9482651427388191, |
| "num_tokens": 74452182.0, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.3388681802778719, |
| "grad_norm": 0.5554154903588379, |
| "learning_rate": 1.3238482384823848e-06, |
| "loss": 0.1788, |
| "mean_token_accuracy": 0.9439527839422226, |
| "num_tokens": 74602966.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.33954591663842765, |
| "grad_norm": 0.45834856596079654, |
| "learning_rate": 1.3224932249322494e-06, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9474470615386963, |
| "num_tokens": 74752679.0, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.3402236529989834, |
| "grad_norm": 0.482857087748798, |
| "learning_rate": 1.3211382113821137e-06, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9479742795228958, |
| "num_tokens": 74900663.0, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.3409013893595391, |
| "grad_norm": 0.4688544076647359, |
| "learning_rate": 1.3197831978319782e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.948046438395977, |
| "num_tokens": 75053468.0, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.34157912572009486, |
| "grad_norm": 0.8131967901070724, |
| "learning_rate": 1.3184281842818427e-06, |
| "loss": 0.1675, |
| "mean_token_accuracy": 0.9472577646374702, |
| "num_tokens": 75206169.0, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.34225686208065065, |
| "grad_norm": 0.45932339378365955, |
| "learning_rate": 1.3170731707317072e-06, |
| "loss": 0.1706, |
| "mean_token_accuracy": 0.9472535997629166, |
| "num_tokens": 75353645.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.3429345984412064, |
| "grad_norm": 0.49131482848831964, |
| "learning_rate": 1.315718157181572e-06, |
| "loss": 0.1811, |
| "mean_token_accuracy": 0.9429889023303986, |
| "num_tokens": 75506144.0, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.3436123348017621, |
| "grad_norm": 0.7735749197588243, |
| "learning_rate": 1.3143631436314363e-06, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.948386162519455, |
| "num_tokens": 75653020.0, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.34429007116231786, |
| "grad_norm": 0.5290018312058388, |
| "learning_rate": 1.3130081300813008e-06, |
| "loss": 0.179, |
| "mean_token_accuracy": 0.9440815150737762, |
| "num_tokens": 75806317.0, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.3449678075228736, |
| "grad_norm": 0.8792037383181599, |
| "learning_rate": 1.3116531165311653e-06, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.9479077309370041, |
| "num_tokens": 75954710.0, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.34564554388342933, |
| "grad_norm": 1.2379023033712115, |
| "learning_rate": 1.3102981029810298e-06, |
| "loss": 0.163, |
| "mean_token_accuracy": 0.948448158800602, |
| "num_tokens": 76102081.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.34632328024398507, |
| "grad_norm": 0.7047671011914056, |
| "learning_rate": 1.3089430894308941e-06, |
| "loss": 0.1706, |
| "mean_token_accuracy": 0.9455461129546165, |
| "num_tokens": 76253124.0, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.34700101660454086, |
| "grad_norm": 0.5215665430937292, |
| "learning_rate": 1.3075880758807587e-06, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.9454864710569382, |
| "num_tokens": 76401400.0, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.3476787529650966, |
| "grad_norm": 0.6254105625236192, |
| "learning_rate": 1.3062330623306232e-06, |
| "loss": 0.1669, |
| "mean_token_accuracy": 0.9473327398300171, |
| "num_tokens": 76557019.0, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.34835648932565233, |
| "grad_norm": 0.5188097653856689, |
| "learning_rate": 1.3048780487804877e-06, |
| "loss": 0.1769, |
| "mean_token_accuracy": 0.9443547651171684, |
| "num_tokens": 76706741.0, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.34903422568620807, |
| "grad_norm": 0.4675221964626805, |
| "learning_rate": 1.3035230352303524e-06, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.9501334503293037, |
| "num_tokens": 76854200.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.3497119620467638, |
| "grad_norm": 0.49070216913278825, |
| "learning_rate": 1.3021680216802167e-06, |
| "loss": 0.1735, |
| "mean_token_accuracy": 0.94585020840168, |
| "num_tokens": 77003380.0, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.35038969840731954, |
| "grad_norm": 0.6566218821573573, |
| "learning_rate": 1.3008130081300813e-06, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9479044005274773, |
| "num_tokens": 77153163.0, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.3510674347678753, |
| "grad_norm": 0.7789379307919548, |
| "learning_rate": 1.2994579945799458e-06, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9485658556222916, |
| "num_tokens": 77303522.0, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.35174517112843107, |
| "grad_norm": 1.6255929271886427, |
| "learning_rate": 1.2981029810298103e-06, |
| "loss": 0.1752, |
| "mean_token_accuracy": 0.9446841180324554, |
| "num_tokens": 77454740.0, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.3524229074889868, |
| "grad_norm": 0.7616210624348961, |
| "learning_rate": 1.2967479674796746e-06, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9477170780301094, |
| "num_tokens": 77601927.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.35310064384954254, |
| "grad_norm": 0.5266794391641887, |
| "learning_rate": 1.2953929539295391e-06, |
| "loss": 0.1741, |
| "mean_token_accuracy": 0.944876454770565, |
| "num_tokens": 77751294.0, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.3537783802100983, |
| "grad_norm": 0.5950589959591587, |
| "learning_rate": 1.2940379403794036e-06, |
| "loss": 0.1579, |
| "mean_token_accuracy": 0.9505382627248764, |
| "num_tokens": 77902036.0, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.354456116570654, |
| "grad_norm": 0.5787471802269085, |
| "learning_rate": 1.2926829268292684e-06, |
| "loss": 0.1776, |
| "mean_token_accuracy": 0.9443177506327629, |
| "num_tokens": 78050230.0, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.35513385293120975, |
| "grad_norm": 0.5137344606578952, |
| "learning_rate": 1.2913279132791329e-06, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9494662508368492, |
| "num_tokens": 78193724.0, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.3558115892917655, |
| "grad_norm": 0.5758157648566467, |
| "learning_rate": 1.2899728997289972e-06, |
| "loss": 0.1673, |
| "mean_token_accuracy": 0.9469494745135307, |
| "num_tokens": 78344604.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.3564893256523212, |
| "grad_norm": 0.4823293650140685, |
| "learning_rate": 1.2886178861788617e-06, |
| "loss": 0.1733, |
| "mean_token_accuracy": 0.9464617371559143, |
| "num_tokens": 78493386.0, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.357167062012877, |
| "grad_norm": 1.3093734334897686, |
| "learning_rate": 1.2872628726287262e-06, |
| "loss": 0.1736, |
| "mean_token_accuracy": 0.9449149370193481, |
| "num_tokens": 78643722.0, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.35784479837343275, |
| "grad_norm": 0.9479862338053053, |
| "learning_rate": 1.2859078590785908e-06, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.949364185333252, |
| "num_tokens": 78794768.0, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.3585225347339885, |
| "grad_norm": 0.4910511709067298, |
| "learning_rate": 1.2845528455284553e-06, |
| "loss": 0.1739, |
| "mean_token_accuracy": 0.9452231824398041, |
| "num_tokens": 78945067.0, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.3592002710945442, |
| "grad_norm": 0.6780819218651569, |
| "learning_rate": 1.2831978319783196e-06, |
| "loss": 0.1764, |
| "mean_token_accuracy": 0.9451046735048294, |
| "num_tokens": 79093659.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.35987800745509996, |
| "grad_norm": 0.6028299913157514, |
| "learning_rate": 1.281842818428184e-06, |
| "loss": 0.1692, |
| "mean_token_accuracy": 0.9468301609158516, |
| "num_tokens": 79245168.0, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.3605557438156557, |
| "grad_norm": 0.47202114885701624, |
| "learning_rate": 1.2804878048780488e-06, |
| "loss": 0.1703, |
| "mean_token_accuracy": 0.946605496108532, |
| "num_tokens": 79392112.0, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.36123348017621143, |
| "grad_norm": 0.44640985325082555, |
| "learning_rate": 1.2791327913279134e-06, |
| "loss": 0.1553, |
| "mean_token_accuracy": 0.9503597840666771, |
| "num_tokens": 79540450.0, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.3619112165367672, |
| "grad_norm": 0.4932596703182334, |
| "learning_rate": 1.2777777777777777e-06, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.9503260552883148, |
| "num_tokens": 79688916.0, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.36258895289732296, |
| "grad_norm": 0.5831851813680463, |
| "learning_rate": 1.2764227642276422e-06, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9486764073371887, |
| "num_tokens": 79838845.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.3632666892578787, |
| "grad_norm": 0.61181600071, |
| "learning_rate": 1.2750677506775067e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9481100291013718, |
| "num_tokens": 79988221.0, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.36394442561843443, |
| "grad_norm": 0.5506545480633555, |
| "learning_rate": 1.2737127371273712e-06, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9479293003678322, |
| "num_tokens": 80137590.0, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.36462216197899017, |
| "grad_norm": 0.43887901796596895, |
| "learning_rate": 1.2723577235772357e-06, |
| "loss": 0.1639, |
| "mean_token_accuracy": 0.9483424499630928, |
| "num_tokens": 80287101.0, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.3652998983395459, |
| "grad_norm": 0.4955011682450888, |
| "learning_rate": 1.2710027100271e-06, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9483226388692856, |
| "num_tokens": 80436322.0, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.36597763470010164, |
| "grad_norm": 0.5239537864203889, |
| "learning_rate": 1.2696476964769648e-06, |
| "loss": 0.1715, |
| "mean_token_accuracy": 0.9458038881421089, |
| "num_tokens": 80581684.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.36665537106065743, |
| "grad_norm": 0.5346227062984173, |
| "learning_rate": 1.2682926829268293e-06, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.948853574693203, |
| "num_tokens": 80734232.0, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.36733310742121317, |
| "grad_norm": 0.43618818869719533, |
| "learning_rate": 1.2669376693766938e-06, |
| "loss": 0.1687, |
| "mean_token_accuracy": 0.9466055184602737, |
| "num_tokens": 80885624.0, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.3680108437817689, |
| "grad_norm": 0.5126139849918319, |
| "learning_rate": 1.2655826558265581e-06, |
| "loss": 0.1725, |
| "mean_token_accuracy": 0.9452295154333115, |
| "num_tokens": 81032518.0, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.36868858014232464, |
| "grad_norm": 0.5156027227638869, |
| "learning_rate": 1.2642276422764226e-06, |
| "loss": 0.1691, |
| "mean_token_accuracy": 0.9462386891245842, |
| "num_tokens": 81183730.0, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.3693663165028804, |
| "grad_norm": 0.4953647785801021, |
| "learning_rate": 1.2628726287262872e-06, |
| "loss": 0.1779, |
| "mean_token_accuracy": 0.9446133226156235, |
| "num_tokens": 81332553.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.3700440528634361, |
| "grad_norm": 0.6205616037290729, |
| "learning_rate": 1.2615176151761517e-06, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9468551427125931, |
| "num_tokens": 81481317.0, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.37072178922399185, |
| "grad_norm": 0.5144257052051068, |
| "learning_rate": 1.2601626016260162e-06, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.9458277970552444, |
| "num_tokens": 81629307.0, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.3713995255845476, |
| "grad_norm": 0.49628657698403855, |
| "learning_rate": 1.2588075880758805e-06, |
| "loss": 0.1731, |
| "mean_token_accuracy": 0.9464028999209404, |
| "num_tokens": 81780418.0, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.3720772619451034, |
| "grad_norm": 0.5360410530877545, |
| "learning_rate": 1.2574525745257452e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9468945264816284, |
| "num_tokens": 81931133.0, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.3727549983056591, |
| "grad_norm": 0.5154118695419517, |
| "learning_rate": 1.2560975609756098e-06, |
| "loss": 0.1802, |
| "mean_token_accuracy": 0.9430196806788445, |
| "num_tokens": 82077166.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.37343273466621485, |
| "grad_norm": 0.5054293298675031, |
| "learning_rate": 1.2547425474254743e-06, |
| "loss": 0.1737, |
| "mean_token_accuracy": 0.9454409256577492, |
| "num_tokens": 82227375.0, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.3741104710267706, |
| "grad_norm": 0.8108065600612613, |
| "learning_rate": 1.2533875338753388e-06, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9477621614933014, |
| "num_tokens": 82372453.0, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.3747882073873263, |
| "grad_norm": 0.4999817635617397, |
| "learning_rate": 1.2520325203252031e-06, |
| "loss": 0.1708, |
| "mean_token_accuracy": 0.9459622874855995, |
| "num_tokens": 82518157.0, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.37546594374788206, |
| "grad_norm": 0.6126446568489133, |
| "learning_rate": 1.2506775067750676e-06, |
| "loss": 0.1717, |
| "mean_token_accuracy": 0.9467468857765198, |
| "num_tokens": 82669176.0, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.3761436801084378, |
| "grad_norm": 0.4092562831332532, |
| "learning_rate": 1.2493224932249322e-06, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9466510340571404, |
| "num_tokens": 82815386.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.3768214164689936, |
| "grad_norm": 0.5544136499451818, |
| "learning_rate": 1.2479674796747967e-06, |
| "loss": 0.1436, |
| "mean_token_accuracy": 0.9543255716562271, |
| "num_tokens": 82963781.0, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.3774991528295493, |
| "grad_norm": 0.6577759537029808, |
| "learning_rate": 1.2466124661246612e-06, |
| "loss": 0.171, |
| "mean_token_accuracy": 0.9457321241497993, |
| "num_tokens": 83114702.0, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.37817688919010506, |
| "grad_norm": 0.6669400570725771, |
| "learning_rate": 1.2452574525745257e-06, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9488318488001823, |
| "num_tokens": 83262766.0, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.3788546255506608, |
| "grad_norm": 2.110564198277507, |
| "learning_rate": 1.2439024390243902e-06, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9489534422755241, |
| "num_tokens": 83414310.0, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.37953236191121653, |
| "grad_norm": 0.5862482748277763, |
| "learning_rate": 1.2425474254742547e-06, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.9466524496674538, |
| "num_tokens": 83565463.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.38021009827177227, |
| "grad_norm": 0.44976063765840474, |
| "learning_rate": 1.2411924119241193e-06, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.948176383972168, |
| "num_tokens": 83716123.0, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.380887834632328, |
| "grad_norm": 0.4871383532266955, |
| "learning_rate": 1.2398373983739836e-06, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.9474801197648048, |
| "num_tokens": 83867770.0, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.3815655709928838, |
| "grad_norm": 0.4875394052722114, |
| "learning_rate": 1.238482384823848e-06, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9471545964479446, |
| "num_tokens": 84013176.0, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.38224330735343953, |
| "grad_norm": 0.4812873126942103, |
| "learning_rate": 1.2371273712737126e-06, |
| "loss": 0.1784, |
| "mean_token_accuracy": 0.9448759853839874, |
| "num_tokens": 84159306.0, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.38292104371399527, |
| "grad_norm": 0.48087624001771767, |
| "learning_rate": 1.2357723577235773e-06, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9471018239855766, |
| "num_tokens": 84306663.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.383598780074551, |
| "grad_norm": 0.49836516202253933, |
| "learning_rate": 1.2344173441734419e-06, |
| "loss": 0.1711, |
| "mean_token_accuracy": 0.9457863420248032, |
| "num_tokens": 84457537.0, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.38427651643510674, |
| "grad_norm": 0.8028630495125755, |
| "learning_rate": 1.2330623306233062e-06, |
| "loss": 0.1715, |
| "mean_token_accuracy": 0.945386491715908, |
| "num_tokens": 84604736.0, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.3849542527956625, |
| "grad_norm": 0.517032288094208, |
| "learning_rate": 1.2317073170731707e-06, |
| "loss": 0.1705, |
| "mean_token_accuracy": 0.9462244659662247, |
| "num_tokens": 84750138.0, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.3856319891562182, |
| "grad_norm": 0.524515202262513, |
| "learning_rate": 1.2303523035230352e-06, |
| "loss": 0.1734, |
| "mean_token_accuracy": 0.9456737115979195, |
| "num_tokens": 84901947.0, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.38630972551677395, |
| "grad_norm": 0.5016321881100445, |
| "learning_rate": 1.2289972899728997e-06, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9482433423399925, |
| "num_tokens": 85050141.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.38698746187732974, |
| "grad_norm": 0.701544567243051, |
| "learning_rate": 1.227642276422764e-06, |
| "loss": 0.168, |
| "mean_token_accuracy": 0.9463346377015114, |
| "num_tokens": 85197933.0, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.3876651982378855, |
| "grad_norm": 0.9260298597170125, |
| "learning_rate": 1.2262872628726286e-06, |
| "loss": 0.1746, |
| "mean_token_accuracy": 0.9452116563916206, |
| "num_tokens": 85344354.0, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.3883429345984412, |
| "grad_norm": 0.4665785964639984, |
| "learning_rate": 1.224932249322493e-06, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.946522019803524, |
| "num_tokens": 85493157.0, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.38902067095899695, |
| "grad_norm": 0.47890919766848145, |
| "learning_rate": 1.2235772357723578e-06, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9488857388496399, |
| "num_tokens": 85640984.0, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.3896984073195527, |
| "grad_norm": 0.5237978003925238, |
| "learning_rate": 1.2222222222222223e-06, |
| "loss": 0.175, |
| "mean_token_accuracy": 0.9449859485030174, |
| "num_tokens": 85787937.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.3903761436801084, |
| "grad_norm": 0.5044327985275826, |
| "learning_rate": 1.2208672086720866e-06, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.947325699031353, |
| "num_tokens": 85934883.0, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.39105388004066416, |
| "grad_norm": 1.212018009924748, |
| "learning_rate": 1.2195121951219512e-06, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9467579498887062, |
| "num_tokens": 86079914.0, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.39173161640121995, |
| "grad_norm": 0.5117113977489923, |
| "learning_rate": 1.2181571815718157e-06, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9482778683304787, |
| "num_tokens": 86228740.0, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.3924093527617757, |
| "grad_norm": 0.6494915285028472, |
| "learning_rate": 1.2168021680216802e-06, |
| "loss": 0.157, |
| "mean_token_accuracy": 0.9500811025500298, |
| "num_tokens": 86377532.0, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.3930870891223314, |
| "grad_norm": 0.5973179488736141, |
| "learning_rate": 1.2154471544715445e-06, |
| "loss": 0.1575, |
| "mean_token_accuracy": 0.9502118080854416, |
| "num_tokens": 86526682.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.39376482548288716, |
| "grad_norm": 0.5808424568038079, |
| "learning_rate": 1.214092140921409e-06, |
| "loss": 0.1539, |
| "mean_token_accuracy": 0.9514239802956581, |
| "num_tokens": 86676044.0, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.3944425618434429, |
| "grad_norm": 0.4953674203579119, |
| "learning_rate": 1.2127371273712735e-06, |
| "loss": 0.1684, |
| "mean_token_accuracy": 0.9472720921039581, |
| "num_tokens": 86825381.0, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.39512029820399863, |
| "grad_norm": 0.4322370643094621, |
| "learning_rate": 1.2113821138211383e-06, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9499673321843147, |
| "num_tokens": 86970825.0, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.39579803456455437, |
| "grad_norm": 0.5358177146908822, |
| "learning_rate": 1.2100271002710028e-06, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9472456872463226, |
| "num_tokens": 87118763.0, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.3964757709251101, |
| "grad_norm": 0.5193297152368992, |
| "learning_rate": 1.208672086720867e-06, |
| "loss": 0.1752, |
| "mean_token_accuracy": 0.9457727372646332, |
| "num_tokens": 87267978.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.3971535072856659, |
| "grad_norm": 0.5008798591237139, |
| "learning_rate": 1.2073170731707316e-06, |
| "loss": 0.1539, |
| "mean_token_accuracy": 0.950521744787693, |
| "num_tokens": 87413903.0, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.39783124364622163, |
| "grad_norm": 0.5845390262447139, |
| "learning_rate": 1.2059620596205961e-06, |
| "loss": 0.1729, |
| "mean_token_accuracy": 0.9458159878849983, |
| "num_tokens": 87567058.0, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.39850898000677737, |
| "grad_norm": 0.44423455283346175, |
| "learning_rate": 1.2046070460704607e-06, |
| "loss": 0.1713, |
| "mean_token_accuracy": 0.945778027176857, |
| "num_tokens": 87719494.0, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.3991867163673331, |
| "grad_norm": 0.47221544465230036, |
| "learning_rate": 1.2032520325203252e-06, |
| "loss": 0.1695, |
| "mean_token_accuracy": 0.9476176202297211, |
| "num_tokens": 87870107.0, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.39986445272788884, |
| "grad_norm": 0.49885913591469067, |
| "learning_rate": 1.2018970189701895e-06, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.9467541128396988, |
| "num_tokens": 88016868.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.4005421890884446, |
| "grad_norm": 0.7890592585591757, |
| "learning_rate": 1.2005420054200542e-06, |
| "loss": 0.1682, |
| "mean_token_accuracy": 0.946752056479454, |
| "num_tokens": 88164592.0, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.4012199254490003, |
| "grad_norm": 0.7369518492575368, |
| "learning_rate": 1.1991869918699187e-06, |
| "loss": 0.157, |
| "mean_token_accuracy": 0.949407272040844, |
| "num_tokens": 88312487.0, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.4018976618095561, |
| "grad_norm": 0.4484504676130025, |
| "learning_rate": 1.1978319783197833e-06, |
| "loss": 0.1631, |
| "mean_token_accuracy": 0.9482650607824326, |
| "num_tokens": 88460693.0, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.40257539817011184, |
| "grad_norm": 0.8485155855645619, |
| "learning_rate": 1.1964769647696476e-06, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9483138248324394, |
| "num_tokens": 88609733.0, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.4032531345306676, |
| "grad_norm": 0.4480372776691949, |
| "learning_rate": 1.195121951219512e-06, |
| "loss": 0.1612, |
| "mean_token_accuracy": 0.9484245628118515, |
| "num_tokens": 88761086.0, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.4039308708912233, |
| "grad_norm": 0.4329784742071409, |
| "learning_rate": 1.1937669376693766e-06, |
| "loss": 0.1717, |
| "mean_token_accuracy": 0.9458352103829384, |
| "num_tokens": 88906999.0, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.40460860725177905, |
| "grad_norm": 0.5096564027934126, |
| "learning_rate": 1.1924119241192411e-06, |
| "loss": 0.1722, |
| "mean_token_accuracy": 0.9450919255614281, |
| "num_tokens": 89058412.0, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.4052863436123348, |
| "grad_norm": 0.45356808602832843, |
| "learning_rate": 1.1910569105691056e-06, |
| "loss": 0.1752, |
| "mean_token_accuracy": 0.9447236880660057, |
| "num_tokens": 89205873.0, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.4059640799728905, |
| "grad_norm": 0.4789037831314169, |
| "learning_rate": 1.18970189701897e-06, |
| "loss": 0.1726, |
| "mean_token_accuracy": 0.94582499563694, |
| "num_tokens": 89352986.0, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.4066418163334463, |
| "grad_norm": 0.4903512536874018, |
| "learning_rate": 1.1883468834688347e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9469872713088989, |
| "num_tokens": 89502021.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.40731955269400205, |
| "grad_norm": 0.5036605840464546, |
| "learning_rate": 1.1869918699186992e-06, |
| "loss": 0.1761, |
| "mean_token_accuracy": 0.9450404793024063, |
| "num_tokens": 89654563.0, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.4079972890545578, |
| "grad_norm": 0.5970722709739015, |
| "learning_rate": 1.1856368563685637e-06, |
| "loss": 0.1655, |
| "mean_token_accuracy": 0.9475810378789902, |
| "num_tokens": 89807709.0, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.4086750254151135, |
| "grad_norm": 0.4818755636606668, |
| "learning_rate": 1.184281842818428e-06, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9479880854487419, |
| "num_tokens": 89957680.0, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.40935276177566926, |
| "grad_norm": 0.49513417717067487, |
| "learning_rate": 1.1829268292682926e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9484023600816727, |
| "num_tokens": 90104372.0, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.410030498136225, |
| "grad_norm": 0.4705947908265269, |
| "learning_rate": 1.181571815718157e-06, |
| "loss": 0.175, |
| "mean_token_accuracy": 0.9449072778224945, |
| "num_tokens": 90254315.0, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.41070823449678073, |
| "grad_norm": 0.6062525288818309, |
| "learning_rate": 1.1802168021680216e-06, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9490978792309761, |
| "num_tokens": 90406409.0, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.41138597085733647, |
| "grad_norm": 0.4887642188601793, |
| "learning_rate": 1.1788617886178861e-06, |
| "loss": 0.1556, |
| "mean_token_accuracy": 0.9504926428198814, |
| "num_tokens": 90555846.0, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.41206370721789226, |
| "grad_norm": 0.6695846625041596, |
| "learning_rate": 1.1775067750677506e-06, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9490882977843285, |
| "num_tokens": 90703080.0, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.412741443578448, |
| "grad_norm": 0.626157768978341, |
| "learning_rate": 1.1761517615176152e-06, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9481958895921707, |
| "num_tokens": 90849340.0, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.41341917993900373, |
| "grad_norm": 0.49981917199068726, |
| "learning_rate": 1.1747967479674797e-06, |
| "loss": 0.1686, |
| "mean_token_accuracy": 0.9468593597412109, |
| "num_tokens": 90998272.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.41409691629955947, |
| "grad_norm": 0.4809733175759335, |
| "learning_rate": 1.1734417344173442e-06, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9470062106847763, |
| "num_tokens": 91147727.0, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.4147746526601152, |
| "grad_norm": 0.5546230985106029, |
| "learning_rate": 1.1720867208672087e-06, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9482028186321259, |
| "num_tokens": 91298332.0, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.41545238902067094, |
| "grad_norm": 1.369314542898872, |
| "learning_rate": 1.170731707317073e-06, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9486950933933258, |
| "num_tokens": 91448033.0, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.4161301253812267, |
| "grad_norm": 0.47642657411701406, |
| "learning_rate": 1.1693766937669375e-06, |
| "loss": 0.1778, |
| "mean_token_accuracy": 0.9440385848283768, |
| "num_tokens": 91594992.0, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.41680786174178247, |
| "grad_norm": 0.47669213610459643, |
| "learning_rate": 1.168021680216802e-06, |
| "loss": 0.1697, |
| "mean_token_accuracy": 0.9455291479825974, |
| "num_tokens": 91737805.0, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.4174855981023382, |
| "grad_norm": 0.5797787298320769, |
| "learning_rate": 1.1666666666666668e-06, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.947659395635128, |
| "num_tokens": 91887143.0, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.41816333446289394, |
| "grad_norm": 0.5345821378971174, |
| "learning_rate": 1.165311653116531e-06, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.9479231685400009, |
| "num_tokens": 92033481.0, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.4188410708234497, |
| "grad_norm": 0.4900914190375703, |
| "learning_rate": 1.1639566395663956e-06, |
| "loss": 0.1694, |
| "mean_token_accuracy": 0.9459358528256416, |
| "num_tokens": 92186016.0, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.4195188071840054, |
| "grad_norm": 0.5466231230181472, |
| "learning_rate": 1.1626016260162601e-06, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9476613327860832, |
| "num_tokens": 92337069.0, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.42019654354456115, |
| "grad_norm": 0.48693997965582553, |
| "learning_rate": 1.1612466124661247e-06, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9474040865898132, |
| "num_tokens": 92485244.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.4208742799051169, |
| "grad_norm": 0.43074697827839376, |
| "learning_rate": 1.1598915989159892e-06, |
| "loss": 0.1725, |
| "mean_token_accuracy": 0.9458062797784805, |
| "num_tokens": 92637765.0, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.4215520162656727, |
| "grad_norm": 0.5096419473326914, |
| "learning_rate": 1.1585365853658535e-06, |
| "loss": 0.1702, |
| "mean_token_accuracy": 0.9466564804315567, |
| "num_tokens": 92781503.0, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.4222297526262284, |
| "grad_norm": 0.5259582342847179, |
| "learning_rate": 1.157181571815718e-06, |
| "loss": 0.1718, |
| "mean_token_accuracy": 0.9453662484884262, |
| "num_tokens": 92933443.0, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.42290748898678415, |
| "grad_norm": 0.47284404987635503, |
| "learning_rate": 1.1558265582655825e-06, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9497467800974846, |
| "num_tokens": 93083711.0, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.4235852253473399, |
| "grad_norm": 0.4602475722385104, |
| "learning_rate": 1.1544715447154473e-06, |
| "loss": 0.181, |
| "mean_token_accuracy": 0.9428821057081223, |
| "num_tokens": 93233232.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.4242629617078956, |
| "grad_norm": 0.5936717548555265, |
| "learning_rate": 1.1531165311653116e-06, |
| "loss": 0.1843, |
| "mean_token_accuracy": 0.943098396062851, |
| "num_tokens": 93382753.0, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.42494069806845136, |
| "grad_norm": 0.5047803415992738, |
| "learning_rate": 1.151761517615176e-06, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9477358534932137, |
| "num_tokens": 93530158.0, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.4256184344290071, |
| "grad_norm": 0.40107744619021507, |
| "learning_rate": 1.1504065040650406e-06, |
| "loss": 0.171, |
| "mean_token_accuracy": 0.9463073089718819, |
| "num_tokens": 93680192.0, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.42629617078956283, |
| "grad_norm": 1.5652127560562183, |
| "learning_rate": 1.1490514905149051e-06, |
| "loss": 0.176, |
| "mean_token_accuracy": 0.9450653791427612, |
| "num_tokens": 93826231.0, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.4269739071501186, |
| "grad_norm": 0.5639073605989375, |
| "learning_rate": 1.1476964769647696e-06, |
| "loss": 0.1549, |
| "mean_token_accuracy": 0.9503551051020622, |
| "num_tokens": 93978792.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.42765164351067436, |
| "grad_norm": 0.598966036130003, |
| "learning_rate": 1.146341463414634e-06, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9473350793123245, |
| "num_tokens": 94127961.0, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.4283293798712301, |
| "grad_norm": 0.44224050193023334, |
| "learning_rate": 1.1449864498644985e-06, |
| "loss": 0.1707, |
| "mean_token_accuracy": 0.9464718252420425, |
| "num_tokens": 94275820.0, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.42900711623178583, |
| "grad_norm": 0.43676189629999784, |
| "learning_rate": 1.1436314363143632e-06, |
| "loss": 0.1686, |
| "mean_token_accuracy": 0.9459518492221832, |
| "num_tokens": 94427764.0, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.42968485259234157, |
| "grad_norm": 0.4853901597305064, |
| "learning_rate": 1.1422764227642277e-06, |
| "loss": 0.1683, |
| "mean_token_accuracy": 0.9467138350009918, |
| "num_tokens": 94578347.0, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.4303625889528973, |
| "grad_norm": 0.8511262690132398, |
| "learning_rate": 1.1409214092140922e-06, |
| "loss": 0.1727, |
| "mean_token_accuracy": 0.9447790756821632, |
| "num_tokens": 94724699.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.43104032531345304, |
| "grad_norm": 0.5749925214374794, |
| "learning_rate": 1.1395663956639565e-06, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9488679245114326, |
| "num_tokens": 94871151.0, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.43171806167400884, |
| "grad_norm": 0.4064827921884073, |
| "learning_rate": 1.138211382113821e-06, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9467856511473656, |
| "num_tokens": 95016678.0, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.43239579803456457, |
| "grad_norm": 0.4499177007745121, |
| "learning_rate": 1.1368563685636856e-06, |
| "loss": 0.1686, |
| "mean_token_accuracy": 0.9464468285441399, |
| "num_tokens": 95169906.0, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.4330735343951203, |
| "grad_norm": 0.43655958974808085, |
| "learning_rate": 1.1355013550135501e-06, |
| "loss": 0.1707, |
| "mean_token_accuracy": 0.9456497430801392, |
| "num_tokens": 95314234.0, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.43375127075567604, |
| "grad_norm": 0.4486256023161831, |
| "learning_rate": 1.1341463414634144e-06, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9474812969565392, |
| "num_tokens": 95462991.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.4344290071162318, |
| "grad_norm": 0.697692666673496, |
| "learning_rate": 1.132791327913279e-06, |
| "loss": 0.1787, |
| "mean_token_accuracy": 0.9435675144195557, |
| "num_tokens": 95612865.0, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.4351067434767875, |
| "grad_norm": 0.47686709968057667, |
| "learning_rate": 1.1314363143631437e-06, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9478402659296989, |
| "num_tokens": 95762009.0, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.43578447983734325, |
| "grad_norm": 0.5222106123204958, |
| "learning_rate": 1.1300813008130082e-06, |
| "loss": 0.1752, |
| "mean_token_accuracy": 0.9447394981980324, |
| "num_tokens": 95913932.0, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.43646221619789904, |
| "grad_norm": 0.4408805629947386, |
| "learning_rate": 1.1287262872628727e-06, |
| "loss": 0.1525, |
| "mean_token_accuracy": 0.9512403458356857, |
| "num_tokens": 96062886.0, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.4371399525584548, |
| "grad_norm": 0.5189327331310518, |
| "learning_rate": 1.127371273712737e-06, |
| "loss": 0.1677, |
| "mean_token_accuracy": 0.9463458731770515, |
| "num_tokens": 96209877.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.4378176889190105, |
| "grad_norm": 0.56504508409712, |
| "learning_rate": 1.1260162601626015e-06, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.948399268090725, |
| "num_tokens": 96353812.0, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.43849542527956625, |
| "grad_norm": 0.6696376390441588, |
| "learning_rate": 1.124661246612466e-06, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.948715977370739, |
| "num_tokens": 96503467.0, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.439173161640122, |
| "grad_norm": 8.77885251280563, |
| "learning_rate": 1.1233062330623306e-06, |
| "loss": 0.17, |
| "mean_token_accuracy": 0.9463560730218887, |
| "num_tokens": 96653851.0, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.4398508980006777, |
| "grad_norm": 0.4390179148821473, |
| "learning_rate": 1.121951219512195e-06, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9486561864614487, |
| "num_tokens": 96806318.0, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.44052863436123346, |
| "grad_norm": 0.4538980859024063, |
| "learning_rate": 1.1205962059620594e-06, |
| "loss": 0.1682, |
| "mean_token_accuracy": 0.9472699463367462, |
| "num_tokens": 96957317.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.4412063707217892, |
| "grad_norm": 0.5309930041301483, |
| "learning_rate": 1.1192411924119241e-06, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.9494251385331154, |
| "num_tokens": 97106739.0, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.441884107082345, |
| "grad_norm": 1.1385250470817276, |
| "learning_rate": 1.1178861788617887e-06, |
| "loss": 0.1579, |
| "mean_token_accuracy": 0.9499758258461952, |
| "num_tokens": 97252810.0, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.4425618434429007, |
| "grad_norm": 0.46732463165861593, |
| "learning_rate": 1.1165311653116532e-06, |
| "loss": 0.1701, |
| "mean_token_accuracy": 0.9466474801301956, |
| "num_tokens": 97401105.0, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.44323957980345646, |
| "grad_norm": 0.4512173042208031, |
| "learning_rate": 1.1151761517615175e-06, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9485284760594368, |
| "num_tokens": 97547875.0, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.4439173161640122, |
| "grad_norm": 0.5145343354995293, |
| "learning_rate": 1.113821138211382e-06, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9473702013492584, |
| "num_tokens": 97697488.0, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.44459505252456794, |
| "grad_norm": 0.41260423688312087, |
| "learning_rate": 1.1124661246612465e-06, |
| "loss": 0.1738, |
| "mean_token_accuracy": 0.9457203596830368, |
| "num_tokens": 97846993.0, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.44527278888512367, |
| "grad_norm": 0.5102664687253794, |
| "learning_rate": 1.111111111111111e-06, |
| "loss": 0.18, |
| "mean_token_accuracy": 0.9433509930968285, |
| "num_tokens": 97996404.0, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.4459505252456794, |
| "grad_norm": 0.506990574421379, |
| "learning_rate": 1.1097560975609756e-06, |
| "loss": 0.1883, |
| "mean_token_accuracy": 0.9421600103378296, |
| "num_tokens": 98147309.0, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.4466282616062352, |
| "grad_norm": 0.5881766043352495, |
| "learning_rate": 1.10840108401084e-06, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9479449465870857, |
| "num_tokens": 98297012.0, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.44730599796679094, |
| "grad_norm": 0.5533879121735672, |
| "learning_rate": 1.1070460704607046e-06, |
| "loss": 0.1542, |
| "mean_token_accuracy": 0.9507003426551819, |
| "num_tokens": 98445075.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.44798373432734667, |
| "grad_norm": 0.5201112017138473, |
| "learning_rate": 1.1056910569105691e-06, |
| "loss": 0.1689, |
| "mean_token_accuracy": 0.9461495503783226, |
| "num_tokens": 98594676.0, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.4486614706879024, |
| "grad_norm": 0.44444711040968005, |
| "learning_rate": 1.1043360433604336e-06, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9470791891217232, |
| "num_tokens": 98744305.0, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.44933920704845814, |
| "grad_norm": 0.41457573882593707, |
| "learning_rate": 1.102981029810298e-06, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9500356465578079, |
| "num_tokens": 98894145.0, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.4500169434090139, |
| "grad_norm": 1.2907711098136603, |
| "learning_rate": 1.1016260162601625e-06, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9473171010613441, |
| "num_tokens": 99041859.0, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.4506946797695696, |
| "grad_norm": 0.46720116943273754, |
| "learning_rate": 1.100271002710027e-06, |
| "loss": 0.1718, |
| "mean_token_accuracy": 0.9454536214470863, |
| "num_tokens": 99193325.0, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.4513724161301254, |
| "grad_norm": 0.5735650882180938, |
| "learning_rate": 1.0989159891598915e-06, |
| "loss": 0.1781, |
| "mean_token_accuracy": 0.944037027657032, |
| "num_tokens": 99344245.0, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.45205015249068115, |
| "grad_norm": 0.42171149150991766, |
| "learning_rate": 1.0975609756097562e-06, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.9449406191706657, |
| "num_tokens": 99493777.0, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.4527278888512369, |
| "grad_norm": 2.622371719944133, |
| "learning_rate": 1.0962059620596205e-06, |
| "loss": 0.1673, |
| "mean_token_accuracy": 0.9471249580383301, |
| "num_tokens": 99639666.0, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.4534056252117926, |
| "grad_norm": 0.4308619134194267, |
| "learning_rate": 1.094850948509485e-06, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9482819065451622, |
| "num_tokens": 99791278.0, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.45408336157234835, |
| "grad_norm": 0.44887095282109707, |
| "learning_rate": 1.0934959349593496e-06, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9470083937048912, |
| "num_tokens": 99941775.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.4547610979329041, |
| "grad_norm": 0.6272156675456803, |
| "learning_rate": 1.092140921409214e-06, |
| "loss": 0.164, |
| "mean_token_accuracy": 0.9480069652199745, |
| "num_tokens": 100091771.0, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.4554388342934598, |
| "grad_norm": 0.5381560149540484, |
| "learning_rate": 1.0907859078590786e-06, |
| "loss": 0.1584, |
| "mean_token_accuracy": 0.9485157206654549, |
| "num_tokens": 100239960.0, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.45611657065401556, |
| "grad_norm": 0.5687429876872409, |
| "learning_rate": 1.089430894308943e-06, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.947501078248024, |
| "num_tokens": 100387617.0, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.45679430701457135, |
| "grad_norm": 0.4784796211552843, |
| "learning_rate": 1.0880758807588074e-06, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.9460517168045044, |
| "num_tokens": 100535043.0, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.4574720433751271, |
| "grad_norm": 0.5820901146665393, |
| "learning_rate": 1.086720867208672e-06, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.946740947663784, |
| "num_tokens": 100685344.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.4581497797356828, |
| "grad_norm": 0.49825430441063895, |
| "learning_rate": 1.0853658536585367e-06, |
| "loss": 0.1749, |
| "mean_token_accuracy": 0.9447141662240028, |
| "num_tokens": 100834993.0, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.45882751609623856, |
| "grad_norm": 0.4517472676766605, |
| "learning_rate": 1.084010840108401e-06, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9495163634419441, |
| "num_tokens": 100980820.0, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.4595052524567943, |
| "grad_norm": 4.281733626493249, |
| "learning_rate": 1.0826558265582655e-06, |
| "loss": 0.1508, |
| "mean_token_accuracy": 0.9506775587797165, |
| "num_tokens": 101125284.0, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.46018298881735004, |
| "grad_norm": 0.3922966399191834, |
| "learning_rate": 1.08130081300813e-06, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9481890052556992, |
| "num_tokens": 101272992.0, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.4608607251779058, |
| "grad_norm": 0.4595732751799743, |
| "learning_rate": 1.0799457994579946e-06, |
| "loss": 0.1731, |
| "mean_token_accuracy": 0.9453582316637039, |
| "num_tokens": 101421582.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.46153846153846156, |
| "grad_norm": 0.5865357648842143, |
| "learning_rate": 1.078590785907859e-06, |
| "loss": 0.1575, |
| "mean_token_accuracy": 0.9493934810161591, |
| "num_tokens": 101569696.0, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.4622161978990173, |
| "grad_norm": 0.4907597552813253, |
| "learning_rate": 1.0772357723577234e-06, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9481813237071037, |
| "num_tokens": 101718242.0, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.46289393425957304, |
| "grad_norm": 0.5738496429651249, |
| "learning_rate": 1.075880758807588e-06, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.948225773870945, |
| "num_tokens": 101868574.0, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.4635716706201288, |
| "grad_norm": 0.5747780283701054, |
| "learning_rate": 1.0745257452574526e-06, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9486450627446175, |
| "num_tokens": 102016272.0, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.4642494069806845, |
| "grad_norm": 0.5059577085300414, |
| "learning_rate": 1.0731707317073172e-06, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.949810229241848, |
| "num_tokens": 102165015.0, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.46492714334124025, |
| "grad_norm": 0.5835022953962732, |
| "learning_rate": 1.0718157181571815e-06, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9493243172764778, |
| "num_tokens": 102311940.0, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.465604879701796, |
| "grad_norm": 0.7781826625681301, |
| "learning_rate": 1.070460704607046e-06, |
| "loss": 0.1595, |
| "mean_token_accuracy": 0.9490118324756622, |
| "num_tokens": 102465237.0, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.4662826160623517, |
| "grad_norm": 0.47757570477744504, |
| "learning_rate": 1.0691056910569105e-06, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9480254128575325, |
| "num_tokens": 102610588.0, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.4669603524229075, |
| "grad_norm": 0.4806833949582325, |
| "learning_rate": 1.067750677506775e-06, |
| "loss": 0.1699, |
| "mean_token_accuracy": 0.945494756102562, |
| "num_tokens": 102763615.0, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.46763808878346325, |
| "grad_norm": 0.4122436220594733, |
| "learning_rate": 1.0663956639566396e-06, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9464166983962059, |
| "num_tokens": 102910842.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.468315825144019, |
| "grad_norm": 0.4394930622319954, |
| "learning_rate": 1.0650406504065039e-06, |
| "loss": 0.1743, |
| "mean_token_accuracy": 0.9447741508483887, |
| "num_tokens": 103058541.0, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.4689935615045747, |
| "grad_norm": 0.43543158560650574, |
| "learning_rate": 1.0636856368563684e-06, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9491621479392052, |
| "num_tokens": 103212920.0, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.46967129786513045, |
| "grad_norm": 0.6038558938086582, |
| "learning_rate": 1.0623306233062331e-06, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9469928592443466, |
| "num_tokens": 103366294.0, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.4703490342256862, |
| "grad_norm": 0.5169428891483752, |
| "learning_rate": 1.0609756097560976e-06, |
| "loss": 0.1676, |
| "mean_token_accuracy": 0.9460704624652863, |
| "num_tokens": 103516086.0, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.4710267705862419, |
| "grad_norm": 0.4204143777115915, |
| "learning_rate": 1.0596205962059621e-06, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9489069432020187, |
| "num_tokens": 103663772.0, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.4717045069467977, |
| "grad_norm": 0.53680032839607, |
| "learning_rate": 1.0582655826558265e-06, |
| "loss": 0.1556, |
| "mean_token_accuracy": 0.9499735161662102, |
| "num_tokens": 103809403.0, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.47238224330735346, |
| "grad_norm": 0.41928447429862764, |
| "learning_rate": 1.056910569105691e-06, |
| "loss": 0.1547, |
| "mean_token_accuracy": 0.9501360580325127, |
| "num_tokens": 103958959.0, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.4730599796679092, |
| "grad_norm": 0.5315795339532013, |
| "learning_rate": 1.0555555555555555e-06, |
| "loss": 0.1707, |
| "mean_token_accuracy": 0.9456048160791397, |
| "num_tokens": 104105483.0, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.47373771602846493, |
| "grad_norm": 0.5954558763645185, |
| "learning_rate": 1.05420054200542e-06, |
| "loss": 0.1713, |
| "mean_token_accuracy": 0.9454812332987785, |
| "num_tokens": 104253981.0, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.47441545238902066, |
| "grad_norm": 0.548448631040149, |
| "learning_rate": 1.0528455284552843e-06, |
| "loss": 0.1687, |
| "mean_token_accuracy": 0.9472458362579346, |
| "num_tokens": 104400658.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.4750931887495764, |
| "grad_norm": 0.7249286327431037, |
| "learning_rate": 1.051490514905149e-06, |
| "loss": 0.1639, |
| "mean_token_accuracy": 0.9484637081623077, |
| "num_tokens": 104546254.0, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.47577092511013214, |
| "grad_norm": 0.47880559600558203, |
| "learning_rate": 1.0501355013550136e-06, |
| "loss": 0.1655, |
| "mean_token_accuracy": 0.9477341920137405, |
| "num_tokens": 104697706.0, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.47644866147068793, |
| "grad_norm": 0.532284214873716, |
| "learning_rate": 1.048780487804878e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9469640702009201, |
| "num_tokens": 104850387.0, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.47712639783124366, |
| "grad_norm": 0.8313245354091897, |
| "learning_rate": 1.0474254742547426e-06, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.9462714120745659, |
| "num_tokens": 104998229.0, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.4778041341917994, |
| "grad_norm": 0.4642162638190659, |
| "learning_rate": 1.046070460704607e-06, |
| "loss": 0.1595, |
| "mean_token_accuracy": 0.9481769949197769, |
| "num_tokens": 105142197.0, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.47848187055235514, |
| "grad_norm": 0.5041207986876783, |
| "learning_rate": 1.0447154471544714e-06, |
| "loss": 0.1543, |
| "mean_token_accuracy": 0.9502139016985893, |
| "num_tokens": 105292849.0, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.4791596069129109, |
| "grad_norm": 0.48408064640935866, |
| "learning_rate": 1.043360433604336e-06, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9469591826200485, |
| "num_tokens": 105442224.0, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.4798373432734666, |
| "grad_norm": 0.49214635864367573, |
| "learning_rate": 1.0420054200542005e-06, |
| "loss": 0.169, |
| "mean_token_accuracy": 0.9472545087337494, |
| "num_tokens": 105588113.0, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.48051507963402235, |
| "grad_norm": 0.47558616757666466, |
| "learning_rate": 1.040650406504065e-06, |
| "loss": 0.1679, |
| "mean_token_accuracy": 0.9467742890119553, |
| "num_tokens": 105735195.0, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.4811928159945781, |
| "grad_norm": 0.4988404932381961, |
| "learning_rate": 1.0392953929539295e-06, |
| "loss": 0.1756, |
| "mean_token_accuracy": 0.9449669793248177, |
| "num_tokens": 105884493.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.4818705523551339, |
| "grad_norm": 0.49847447518114224, |
| "learning_rate": 1.037940379403794e-06, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9476760029792786, |
| "num_tokens": 106033097.0, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.4825482887156896, |
| "grad_norm": 0.4738878804902704, |
| "learning_rate": 1.0365853658536586e-06, |
| "loss": 0.1541, |
| "mean_token_accuracy": 0.9495199620723724, |
| "num_tokens": 106180475.0, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.48322602507624535, |
| "grad_norm": 0.4549647737418487, |
| "learning_rate": 1.035230352303523e-06, |
| "loss": 0.171, |
| "mean_token_accuracy": 0.9463451281189919, |
| "num_tokens": 106324243.0, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.4839037614368011, |
| "grad_norm": 0.5457516421527091, |
| "learning_rate": 1.0338753387533874e-06, |
| "loss": 0.1574, |
| "mean_token_accuracy": 0.9500289857387543, |
| "num_tokens": 106476157.0, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.4845814977973568, |
| "grad_norm": 0.5482377070075686, |
| "learning_rate": 1.032520325203252e-06, |
| "loss": 0.1731, |
| "mean_token_accuracy": 0.9448134079575539, |
| "num_tokens": 106628472.0, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.48525923415791256, |
| "grad_norm": 0.5467205293980786, |
| "learning_rate": 1.0311653116531164e-06, |
| "loss": 0.1699, |
| "mean_token_accuracy": 0.9459547698497772, |
| "num_tokens": 106774603.0, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.4859369705184683, |
| "grad_norm": 0.625319792565366, |
| "learning_rate": 1.029810298102981e-06, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9472379386425018, |
| "num_tokens": 106925585.0, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.4866147068790241, |
| "grad_norm": 0.46862713118482585, |
| "learning_rate": 1.0284552845528457e-06, |
| "loss": 0.1569, |
| "mean_token_accuracy": 0.9494001865386963, |
| "num_tokens": 107073596.0, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.4872924432395798, |
| "grad_norm": 0.4729214433145921, |
| "learning_rate": 1.02710027100271e-06, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.949380025267601, |
| "num_tokens": 107223256.0, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.48797017960013556, |
| "grad_norm": 0.4821712382621107, |
| "learning_rate": 1.0257452574525745e-06, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9479318857192993, |
| "num_tokens": 107373846.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.4886479159606913, |
| "grad_norm": 0.42839181331142523, |
| "learning_rate": 1.024390243902439e-06, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.9478693678975105, |
| "num_tokens": 107523614.0, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.48932565232124703, |
| "grad_norm": 1.4382103336661227, |
| "learning_rate": 1.0230352303523035e-06, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9502756372094154, |
| "num_tokens": 107672510.0, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.49000338868180277, |
| "grad_norm": 0.4478990985518304, |
| "learning_rate": 1.0216802168021679e-06, |
| "loss": 0.1545, |
| "mean_token_accuracy": 0.950567439198494, |
| "num_tokens": 107821486.0, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.4906811250423585, |
| "grad_norm": 0.49546135852173817, |
| "learning_rate": 1.0203252032520324e-06, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9489807188510895, |
| "num_tokens": 107971346.0, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.4913588614029143, |
| "grad_norm": 0.5268621271334399, |
| "learning_rate": 1.0189701897018969e-06, |
| "loss": 0.1686, |
| "mean_token_accuracy": 0.9466768205165863, |
| "num_tokens": 108117296.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.49203659776347003, |
| "grad_norm": 0.5302331386916604, |
| "learning_rate": 1.0176151761517614e-06, |
| "loss": 0.1715, |
| "mean_token_accuracy": 0.9459145441651344, |
| "num_tokens": 108269363.0, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.49271433412402577, |
| "grad_norm": 0.6950355569365599, |
| "learning_rate": 1.0162601626016261e-06, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9490808993577957, |
| "num_tokens": 108418896.0, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.4933920704845815, |
| "grad_norm": 0.6291340372645929, |
| "learning_rate": 1.0149051490514905e-06, |
| "loss": 0.1553, |
| "mean_token_accuracy": 0.9506272599101067, |
| "num_tokens": 108568146.0, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.49406980684513724, |
| "grad_norm": 0.4823368965206394, |
| "learning_rate": 1.013550135501355e-06, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.9474935382604599, |
| "num_tokens": 108717754.0, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.494747543205693, |
| "grad_norm": 1.1298261109243466, |
| "learning_rate": 1.0121951219512195e-06, |
| "loss": 0.1767, |
| "mean_token_accuracy": 0.9445156082510948, |
| "num_tokens": 108865813.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.4954252795662487, |
| "grad_norm": 0.5542227313632503, |
| "learning_rate": 1.010840108401084e-06, |
| "loss": 0.17, |
| "mean_token_accuracy": 0.9461698085069656, |
| "num_tokens": 109017876.0, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.49610301592680445, |
| "grad_norm": 0.6317319587465365, |
| "learning_rate": 1.0094850948509485e-06, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9471932277083397, |
| "num_tokens": 109165966.0, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.49678075228736024, |
| "grad_norm": 0.5709037252451544, |
| "learning_rate": 1.0081300813008128e-06, |
| "loss": 0.1696, |
| "mean_token_accuracy": 0.9459073320031166, |
| "num_tokens": 109319510.0, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.497458488647916, |
| "grad_norm": 0.4745034952273246, |
| "learning_rate": 1.0067750677506774e-06, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.945509634912014, |
| "num_tokens": 109471374.0, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.4981362250084717, |
| "grad_norm": 0.5997054541531742, |
| "learning_rate": 1.005420054200542e-06, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9486510381102562, |
| "num_tokens": 109619134.0, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.49881396136902745, |
| "grad_norm": 0.44582034839420726, |
| "learning_rate": 1.0040650406504066e-06, |
| "loss": 0.1676, |
| "mean_token_accuracy": 0.9460132345557213, |
| "num_tokens": 109771208.0, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.4994916977295832, |
| "grad_norm": 0.46505806035131303, |
| "learning_rate": 1.002710027100271e-06, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9484423398971558, |
| "num_tokens": 109923524.0, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.5001694340901389, |
| "grad_norm": 0.534117786954477, |
| "learning_rate": 1.0013550135501354e-06, |
| "loss": 0.1743, |
| "mean_token_accuracy": 0.9447778537869453, |
| "num_tokens": 110074329.0, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.5008471704506947, |
| "grad_norm": 0.4969310333618879, |
| "learning_rate": 1e-06, |
| "loss": 0.1762, |
| "mean_token_accuracy": 0.9441555514931679, |
| "num_tokens": 110223675.0, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.5015249068112504, |
| "grad_norm": 0.43004524784944254, |
| "learning_rate": 9.986449864498645e-07, |
| "loss": 0.1717, |
| "mean_token_accuracy": 0.9462130591273308, |
| "num_tokens": 110371658.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.5022026431718062, |
| "grad_norm": 0.5280386035661061, |
| "learning_rate": 9.97289972899729e-07, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9489178732037544, |
| "num_tokens": 110522893.0, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.5028803795323619, |
| "grad_norm": 0.487926791997932, |
| "learning_rate": 9.959349593495935e-07, |
| "loss": 0.1505, |
| "mean_token_accuracy": 0.9517373815178871, |
| "num_tokens": 110667680.0, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.5035581158929177, |
| "grad_norm": 0.4887678950646701, |
| "learning_rate": 9.945799457994578e-07, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9478684067726135, |
| "num_tokens": 110816521.0, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.5042358522534734, |
| "grad_norm": 1.0072774482193543, |
| "learning_rate": 9.932249322493226e-07, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.9473919719457626, |
| "num_tokens": 110965132.0, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.5049135886140291, |
| "grad_norm": 0.504093174335246, |
| "learning_rate": 9.918699186991869e-07, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.9465838596224785, |
| "num_tokens": 111116073.0, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.5055913249745849, |
| "grad_norm": 0.49005593530631875, |
| "learning_rate": 9.905149051490514e-07, |
| "loss": 0.1784, |
| "mean_token_accuracy": 0.9438984841108322, |
| "num_tokens": 111263209.0, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.5062690613351406, |
| "grad_norm": 0.4547236242174316, |
| "learning_rate": 9.89159891598916e-07, |
| "loss": 0.1575, |
| "mean_token_accuracy": 0.9499284625053406, |
| "num_tokens": 111411766.0, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.5069467976956964, |
| "grad_norm": 0.6009114812592581, |
| "learning_rate": 9.878048780487804e-07, |
| "loss": 0.1689, |
| "mean_token_accuracy": 0.9469569846987724, |
| "num_tokens": 111557025.0, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.5076245340562521, |
| "grad_norm": 0.42135854041604787, |
| "learning_rate": 9.86449864498645e-07, |
| "loss": 0.1561, |
| "mean_token_accuracy": 0.9499689266085625, |
| "num_tokens": 111707742.0, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.5083022704168079, |
| "grad_norm": 0.4338767955990597, |
| "learning_rate": 9.850948509485095e-07, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9490743651986122, |
| "num_tokens": 111858573.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.5089800067773637, |
| "grad_norm": 0.48802928227265013, |
| "learning_rate": 9.83739837398374e-07, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9479610547423363, |
| "num_tokens": 112006134.0, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.5096577431379193, |
| "grad_norm": 0.5253666701851781, |
| "learning_rate": 9.823848238482385e-07, |
| "loss": 0.153, |
| "mean_token_accuracy": 0.9508332163095474, |
| "num_tokens": 112156014.0, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.5103354794984751, |
| "grad_norm": 0.6510987326952798, |
| "learning_rate": 9.81029810298103e-07, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9488585442304611, |
| "num_tokens": 112307072.0, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.5110132158590308, |
| "grad_norm": 0.42952608876024273, |
| "learning_rate": 9.796747967479673e-07, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9463882446289062, |
| "num_tokens": 112457138.0, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.5116909522195866, |
| "grad_norm": 0.44738605619561317, |
| "learning_rate": 9.783197831978318e-07, |
| "loss": 0.1564, |
| "mean_token_accuracy": 0.9506853669881821, |
| "num_tokens": 112606569.0, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.5123686885801423, |
| "grad_norm": 0.5123927765966358, |
| "learning_rate": 9.769647696476966e-07, |
| "loss": 0.163, |
| "mean_token_accuracy": 0.948229692876339, |
| "num_tokens": 112755571.0, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.5130464249406981, |
| "grad_norm": 0.45086208832924035, |
| "learning_rate": 9.756097560975609e-07, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.9456148147583008, |
| "num_tokens": 112904410.0, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.5137241613012539, |
| "grad_norm": 0.630544474001028, |
| "learning_rate": 9.742547425474254e-07, |
| "loss": 0.1679, |
| "mean_token_accuracy": 0.946696400642395, |
| "num_tokens": 113054622.0, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.5144018976618095, |
| "grad_norm": 0.47191984856805846, |
| "learning_rate": 9.7289972899729e-07, |
| "loss": 0.1715, |
| "mean_token_accuracy": 0.9447854086756706, |
| "num_tokens": 113204101.0, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.5150796340223653, |
| "grad_norm": 0.7466705426242991, |
| "learning_rate": 9.715447154471544e-07, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9477136209607124, |
| "num_tokens": 113353757.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.515757370382921, |
| "grad_norm": 0.4424803125673622, |
| "learning_rate": 9.70189701897019e-07, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9492787793278694, |
| "num_tokens": 113501991.0, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.5164351067434768, |
| "grad_norm": 0.5154548055150858, |
| "learning_rate": 9.688346883468835e-07, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9481583312153816, |
| "num_tokens": 113656775.0, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.5171128431040325, |
| "grad_norm": 0.5138120980499192, |
| "learning_rate": 9.67479674796748e-07, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.949316069483757, |
| "num_tokens": 113804739.0, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.5177905794645883, |
| "grad_norm": 0.597476859195241, |
| "learning_rate": 9.661246612466123e-07, |
| "loss": 0.1679, |
| "mean_token_accuracy": 0.9463634565472603, |
| "num_tokens": 113956619.0, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.518468315825144, |
| "grad_norm": 0.3915756552113162, |
| "learning_rate": 9.64769647696477e-07, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9484176561236382, |
| "num_tokens": 114110011.0, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.5191460521856998, |
| "grad_norm": 0.5299306971347804, |
| "learning_rate": 9.634146341463414e-07, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.9498151689767838, |
| "num_tokens": 114256287.0, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.5198237885462555, |
| "grad_norm": 0.45604200441024956, |
| "learning_rate": 9.620596205962059e-07, |
| "loss": 0.1577, |
| "mean_token_accuracy": 0.949748583137989, |
| "num_tokens": 114405384.0, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.5205015249068112, |
| "grad_norm": 0.5501039593422863, |
| "learning_rate": 9.607046070460704e-07, |
| "loss": 0.1673, |
| "mean_token_accuracy": 0.947017602622509, |
| "num_tokens": 114558364.0, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.521179261267367, |
| "grad_norm": 0.425048989890096, |
| "learning_rate": 9.59349593495935e-07, |
| "loss": 0.1582, |
| "mean_token_accuracy": 0.9495890736579895, |
| "num_tokens": 114706272.0, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.5218569976279227, |
| "grad_norm": 0.4962916899714615, |
| "learning_rate": 9.579945799457994e-07, |
| "loss": 0.1563, |
| "mean_token_accuracy": 0.9498101100325584, |
| "num_tokens": 114851238.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.5225347339884785, |
| "grad_norm": 1.0919307561207405, |
| "learning_rate": 9.56639566395664e-07, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9484520703554153, |
| "num_tokens": 115002037.0, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.5232124703490342, |
| "grad_norm": 0.47581107966778774, |
| "learning_rate": 9.552845528455285e-07, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9501020833849907, |
| "num_tokens": 115150459.0, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.52389020670959, |
| "grad_norm": 0.431562811821882, |
| "learning_rate": 9.53929539295393e-07, |
| "loss": 0.1759, |
| "mean_token_accuracy": 0.9442851468920708, |
| "num_tokens": 115294986.0, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.5245679430701458, |
| "grad_norm": 0.4078301321714335, |
| "learning_rate": 9.525745257452574e-07, |
| "loss": 0.1677, |
| "mean_token_accuracy": 0.9461727514863014, |
| "num_tokens": 115443301.0, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.5252456794307014, |
| "grad_norm": 0.5082886880179815, |
| "learning_rate": 9.512195121951218e-07, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.947071261703968, |
| "num_tokens": 115591069.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.5259234157912572, |
| "grad_norm": 0.4666195259750703, |
| "learning_rate": 9.498644986449864e-07, |
| "loss": 0.175, |
| "mean_token_accuracy": 0.9454357475042343, |
| "num_tokens": 115743380.0, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.5266011521518129, |
| "grad_norm": 0.4919611574214216, |
| "learning_rate": 9.48509485094851e-07, |
| "loss": 0.1703, |
| "mean_token_accuracy": 0.9463658332824707, |
| "num_tokens": 115889361.0, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.5272788885123687, |
| "grad_norm": 1.7430472687651062, |
| "learning_rate": 9.471544715447154e-07, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.9499025717377663, |
| "num_tokens": 116038571.0, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.5279566248729244, |
| "grad_norm": 0.5108511243101571, |
| "learning_rate": 9.457994579945799e-07, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9482218623161316, |
| "num_tokens": 116186871.0, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.5286343612334802, |
| "grad_norm": 1.2775624635609895, |
| "learning_rate": 9.444444444444444e-07, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9478557929396629, |
| "num_tokens": 116333923.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.529312097594036, |
| "grad_norm": 0.7218332579746664, |
| "learning_rate": 9.430894308943089e-07, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9476354792714119, |
| "num_tokens": 116484514.0, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.5299898339545916, |
| "grad_norm": 0.47382289757475327, |
| "learning_rate": 9.417344173441733e-07, |
| "loss": 0.1699, |
| "mean_token_accuracy": 0.9467662200331688, |
| "num_tokens": 116633843.0, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.5306675703151474, |
| "grad_norm": 0.7320079657643676, |
| "learning_rate": 9.403794037940379e-07, |
| "loss": 0.1567, |
| "mean_token_accuracy": 0.9489927589893341, |
| "num_tokens": 116784397.0, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.5313453066757031, |
| "grad_norm": 0.4630461591729857, |
| "learning_rate": 9.390243902439024e-07, |
| "loss": 0.157, |
| "mean_token_accuracy": 0.949583537876606, |
| "num_tokens": 116937107.0, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.5320230430362589, |
| "grad_norm": 0.47230814693707296, |
| "learning_rate": 9.376693766937669e-07, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9474940076470375, |
| "num_tokens": 117086319.0, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.5327007793968146, |
| "grad_norm": 0.4415373162153277, |
| "learning_rate": 9.363143631436314e-07, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9468573704361916, |
| "num_tokens": 117234587.0, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.5333785157573704, |
| "grad_norm": 0.42824798111784507, |
| "learning_rate": 9.349593495934958e-07, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.947014681994915, |
| "num_tokens": 117383847.0, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.5340562521179262, |
| "grad_norm": 0.44986339654153784, |
| "learning_rate": 9.336043360433605e-07, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9481543377041817, |
| "num_tokens": 117534768.0, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.5347339884784819, |
| "grad_norm": 0.6592995054568456, |
| "learning_rate": 9.322493224932249e-07, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9465798437595367, |
| "num_tokens": 117686438.0, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.5354117248390377, |
| "grad_norm": 0.543648944178695, |
| "learning_rate": 9.308943089430894e-07, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.9476120993494987, |
| "num_tokens": 117840225.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.5360894611995933, |
| "grad_norm": 0.5019318009561816, |
| "learning_rate": 9.295392953929538e-07, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9476380497217178, |
| "num_tokens": 117986407.0, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.5367671975601491, |
| "grad_norm": 0.4633489551285423, |
| "learning_rate": 9.281842818428183e-07, |
| "loss": 0.1737, |
| "mean_token_accuracy": 0.9449344500899315, |
| "num_tokens": 118135942.0, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.5374449339207048, |
| "grad_norm": 0.4902355737543805, |
| "learning_rate": 9.26829268292683e-07, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9482391402125359, |
| "num_tokens": 118287895.0, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.5381226702812606, |
| "grad_norm": 0.9371752420840535, |
| "learning_rate": 9.254742547425474e-07, |
| "loss": 0.1724, |
| "mean_token_accuracy": 0.9446223452687263, |
| "num_tokens": 118432630.0, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.5388004066418164, |
| "grad_norm": 0.4481821620387517, |
| "learning_rate": 9.241192411924119e-07, |
| "loss": 0.1557, |
| "mean_token_accuracy": 0.9504266083240509, |
| "num_tokens": 118580254.0, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.5394781430023721, |
| "grad_norm": 0.4433316238279581, |
| "learning_rate": 9.227642276422763e-07, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9479897543787956, |
| "num_tokens": 118727619.0, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.5401558793629279, |
| "grad_norm": 0.46876476388476396, |
| "learning_rate": 9.214092140921409e-07, |
| "loss": 0.1706, |
| "mean_token_accuracy": 0.9457740485668182, |
| "num_tokens": 118874120.0, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.5408336157234835, |
| "grad_norm": 0.3922350527541487, |
| "learning_rate": 9.200542005420053e-07, |
| "loss": 0.1585, |
| "mean_token_accuracy": 0.9497921913862228, |
| "num_tokens": 119023831.0, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.5415113520840393, |
| "grad_norm": 0.46069590927505677, |
| "learning_rate": 9.186991869918699e-07, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.9491604790091515, |
| "num_tokens": 119172402.0, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.542189088444595, |
| "grad_norm": 1.5163101510803785, |
| "learning_rate": 9.173441734417344e-07, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.9501603320240974, |
| "num_tokens": 119321631.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5428668248051508, |
| "grad_norm": 0.47039412722321516, |
| "learning_rate": 9.159891598915989e-07, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.9467765465378761, |
| "num_tokens": 119473130.0, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.5435445611657065, |
| "grad_norm": 0.500634772226534, |
| "learning_rate": 9.146341463414634e-07, |
| "loss": 0.1735, |
| "mean_token_accuracy": 0.9450068846344948, |
| "num_tokens": 119624002.0, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.5442222975262623, |
| "grad_norm": 0.546366210975246, |
| "learning_rate": 9.132791327913278e-07, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9482316449284554, |
| "num_tokens": 119770921.0, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.5449000338868181, |
| "grad_norm": 0.444178937700613, |
| "learning_rate": 9.119241192411924e-07, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9471771121025085, |
| "num_tokens": 119921288.0, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.5455777702473738, |
| "grad_norm": 0.7038113748485348, |
| "learning_rate": 9.105691056910569e-07, |
| "loss": 0.1684, |
| "mean_token_accuracy": 0.9470604583621025, |
| "num_tokens": 120063297.0, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.5462555066079295, |
| "grad_norm": 0.4750752606409635, |
| "learning_rate": 9.092140921409214e-07, |
| "loss": 0.1574, |
| "mean_token_accuracy": 0.9491050541400909, |
| "num_tokens": 120214728.0, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.5469332429684852, |
| "grad_norm": 0.46993115257183415, |
| "learning_rate": 9.078590785907859e-07, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9494856968522072, |
| "num_tokens": 120359269.0, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.547610979329041, |
| "grad_norm": 2.162229906542572, |
| "learning_rate": 9.065040650406503e-07, |
| "loss": 0.1518, |
| "mean_token_accuracy": 0.9509051218628883, |
| "num_tokens": 120511262.0, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.5482887156895967, |
| "grad_norm": 0.5212946679760524, |
| "learning_rate": 9.05149051490515e-07, |
| "loss": 0.1691, |
| "mean_token_accuracy": 0.9471752345561981, |
| "num_tokens": 120664787.0, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.5489664520501525, |
| "grad_norm": 0.6422543732085594, |
| "learning_rate": 9.037940379403794e-07, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9477086663246155, |
| "num_tokens": 120816729.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.5496441884107083, |
| "grad_norm": 0.44358840772713953, |
| "learning_rate": 9.024390243902439e-07, |
| "loss": 0.1687, |
| "mean_token_accuracy": 0.9464196562767029, |
| "num_tokens": 120965272.0, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.550321924771264, |
| "grad_norm": 0.4511515056931182, |
| "learning_rate": 9.010840108401083e-07, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9473424032330513, |
| "num_tokens": 121111754.0, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.5509996611318198, |
| "grad_norm": 0.38817856013106755, |
| "learning_rate": 8.997289972899728e-07, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9496878236532211, |
| "num_tokens": 121261973.0, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.5516773974923754, |
| "grad_norm": 0.4061986477027123, |
| "learning_rate": 8.983739837398373e-07, |
| "loss": 0.164, |
| "mean_token_accuracy": 0.9469241499900818, |
| "num_tokens": 121410011.0, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.5523551338529312, |
| "grad_norm": 1.015324252743301, |
| "learning_rate": 8.970189701897019e-07, |
| "loss": 0.1566, |
| "mean_token_accuracy": 0.9484916105866432, |
| "num_tokens": 121550744.0, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.5530328702134869, |
| "grad_norm": 0.44049449649746186, |
| "learning_rate": 8.956639566395664e-07, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.9494698345661163, |
| "num_tokens": 121700363.0, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.5537106065740427, |
| "grad_norm": 0.6340088199439193, |
| "learning_rate": 8.943089430894308e-07, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.9485754668712616, |
| "num_tokens": 121848454.0, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.5543883429345985, |
| "grad_norm": 0.48586459859651804, |
| "learning_rate": 8.929539295392954e-07, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9491472393274307, |
| "num_tokens": 121998493.0, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.5550660792951542, |
| "grad_norm": 1.4709351530109251, |
| "learning_rate": 8.915989159891598e-07, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9486441239714622, |
| "num_tokens": 122145352.0, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.55574381565571, |
| "grad_norm": 0.4324305757569167, |
| "learning_rate": 8.902439024390244e-07, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9465356022119522, |
| "num_tokens": 122297716.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.5564215520162656, |
| "grad_norm": 0.6959982735475431, |
| "learning_rate": 8.888888888888888e-07, |
| "loss": 0.1711, |
| "mean_token_accuracy": 0.9456816613674164, |
| "num_tokens": 122450816.0, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.5570992883768214, |
| "grad_norm": 0.7003745818253505, |
| "learning_rate": 8.875338753387534e-07, |
| "loss": 0.1726, |
| "mean_token_accuracy": 0.9444479048252106, |
| "num_tokens": 122600489.0, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.5577770247373771, |
| "grad_norm": 0.9742519698585274, |
| "learning_rate": 8.861788617886179e-07, |
| "loss": 0.1683, |
| "mean_token_accuracy": 0.9471156671643257, |
| "num_tokens": 122750619.0, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.5584547610979329, |
| "grad_norm": 0.5382653005082321, |
| "learning_rate": 8.848238482384823e-07, |
| "loss": 0.1711, |
| "mean_token_accuracy": 0.9445811286568642, |
| "num_tokens": 122899835.0, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.5591324974584887, |
| "grad_norm": 0.5746386344906712, |
| "learning_rate": 8.834688346883468e-07, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9500567018985748, |
| "num_tokens": 123048046.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.5598102338190444, |
| "grad_norm": 0.49272665751581907, |
| "learning_rate": 8.821138211382113e-07, |
| "loss": 0.1595, |
| "mean_token_accuracy": 0.9488162100315094, |
| "num_tokens": 123196978.0, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.5604879701796002, |
| "grad_norm": 0.49751566301965017, |
| "learning_rate": 8.807588075880759e-07, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9485758990049362, |
| "num_tokens": 123346407.0, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.5611657065401559, |
| "grad_norm": 0.5289804379300486, |
| "learning_rate": 8.794037940379403e-07, |
| "loss": 0.1714, |
| "mean_token_accuracy": 0.9460392519831657, |
| "num_tokens": 123500062.0, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.5618434429007116, |
| "grad_norm": 0.45654638803491393, |
| "learning_rate": 8.780487804878048e-07, |
| "loss": 0.1575, |
| "mean_token_accuracy": 0.9497273415327072, |
| "num_tokens": 123649294.0, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.5625211792612673, |
| "grad_norm": 0.44613375128695365, |
| "learning_rate": 8.766937669376693e-07, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.948423445224762, |
| "num_tokens": 123800787.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.5631989156218231, |
| "grad_norm": 1.0831604948180953, |
| "learning_rate": 8.753387533875339e-07, |
| "loss": 0.1655, |
| "mean_token_accuracy": 0.9475297853350639, |
| "num_tokens": 123952990.0, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.5638766519823789, |
| "grad_norm": 0.9942947603234078, |
| "learning_rate": 8.739837398373984e-07, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9475270807743073, |
| "num_tokens": 124099716.0, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.5645543883429346, |
| "grad_norm": 0.517614231215208, |
| "learning_rate": 8.726287262872628e-07, |
| "loss": 0.1563, |
| "mean_token_accuracy": 0.9498177841305733, |
| "num_tokens": 124252344.0, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.5652321247034904, |
| "grad_norm": 0.5317023085654892, |
| "learning_rate": 8.712737127371273e-07, |
| "loss": 0.1612, |
| "mean_token_accuracy": 0.9490551054477692, |
| "num_tokens": 124401448.0, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.5659098610640461, |
| "grad_norm": 0.5185688681175784, |
| "learning_rate": 8.699186991869918e-07, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9469140246510506, |
| "num_tokens": 124546582.0, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.5665875974246019, |
| "grad_norm": 0.41557332220052745, |
| "learning_rate": 8.685636856368563e-07, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.9459367915987968, |
| "num_tokens": 124695136.0, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.5672653337851575, |
| "grad_norm": 0.6431221859186249, |
| "learning_rate": 8.672086720867209e-07, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9466153234243393, |
| "num_tokens": 124848280.0, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.5679430701457133, |
| "grad_norm": 0.4597423272540076, |
| "learning_rate": 8.658536585365853e-07, |
| "loss": 0.1735, |
| "mean_token_accuracy": 0.9452999532222748, |
| "num_tokens": 124995930.0, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.5686208065062691, |
| "grad_norm": 0.40694430784656294, |
| "learning_rate": 8.644986449864499e-07, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.9465653151273727, |
| "num_tokens": 125141313.0, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.5692985428668248, |
| "grad_norm": 0.6509343776039687, |
| "learning_rate": 8.631436314363143e-07, |
| "loss": 0.1576, |
| "mean_token_accuracy": 0.9495387375354767, |
| "num_tokens": 125289634.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.5699762792273806, |
| "grad_norm": 0.4304540404614368, |
| "learning_rate": 8.617886178861788e-07, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9472305327653885, |
| "num_tokens": 125440651.0, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.5706540155879363, |
| "grad_norm": 0.5442253821672075, |
| "learning_rate": 8.604336043360433e-07, |
| "loss": 0.1579, |
| "mean_token_accuracy": 0.9492665678262711, |
| "num_tokens": 125587031.0, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.5713317519484921, |
| "grad_norm": 1.0424392574624841, |
| "learning_rate": 8.590785907859079e-07, |
| "loss": 0.1564, |
| "mean_token_accuracy": 0.9497368782758713, |
| "num_tokens": 125734286.0, |
| "step": 843 |
| }, |
| { |
| "epoch": 0.5720094883090477, |
| "grad_norm": 0.6985477539533492, |
| "learning_rate": 8.577235772357723e-07, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9490614905953407, |
| "num_tokens": 125879250.0, |
| "step": 844 |
| }, |
| { |
| "epoch": 0.5726872246696035, |
| "grad_norm": 0.3914382877262425, |
| "learning_rate": 8.563685636856368e-07, |
| "loss": 0.1561, |
| "mean_token_accuracy": 0.9510177820920944, |
| "num_tokens": 126030729.0, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.5733649610301592, |
| "grad_norm": 0.5134766215553107, |
| "learning_rate": 8.550135501355013e-07, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9482594132423401, |
| "num_tokens": 126180883.0, |
| "step": 846 |
| }, |
| { |
| "epoch": 0.574042697390715, |
| "grad_norm": 0.7396491074742783, |
| "learning_rate": 8.536585365853657e-07, |
| "loss": 0.1505, |
| "mean_token_accuracy": 0.9510187357664108, |
| "num_tokens": 126336047.0, |
| "step": 847 |
| }, |
| { |
| "epoch": 0.5747204337512708, |
| "grad_norm": 0.4231370791240071, |
| "learning_rate": 8.523035230352304e-07, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9463280215859413, |
| "num_tokens": 126483799.0, |
| "step": 848 |
| }, |
| { |
| "epoch": 0.5753981701118265, |
| "grad_norm": 4.232746679945196, |
| "learning_rate": 8.509485094850948e-07, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.948805071413517, |
| "num_tokens": 126634319.0, |
| "step": 849 |
| }, |
| { |
| "epoch": 0.5760759064723823, |
| "grad_norm": 0.4184667470094849, |
| "learning_rate": 8.495934959349593e-07, |
| "loss": 0.1682, |
| "mean_token_accuracy": 0.9463841244578362, |
| "num_tokens": 126785223.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.576753642832938, |
| "grad_norm": 0.4315358083805072, |
| "learning_rate": 8.482384823848237e-07, |
| "loss": 0.1662, |
| "mean_token_accuracy": 0.9478829950094223, |
| "num_tokens": 126932745.0, |
| "step": 851 |
| }, |
| { |
| "epoch": 0.5774313791934937, |
| "grad_norm": 0.8449257861756138, |
| "learning_rate": 8.468834688346883e-07, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9479701891541481, |
| "num_tokens": 127083705.0, |
| "step": 852 |
| }, |
| { |
| "epoch": 0.5781091155540494, |
| "grad_norm": 0.40974519696433026, |
| "learning_rate": 8.455284552845529e-07, |
| "loss": 0.17, |
| "mean_token_accuracy": 0.9462849348783493, |
| "num_tokens": 127235425.0, |
| "step": 853 |
| }, |
| { |
| "epoch": 0.5787868519146052, |
| "grad_norm": 0.4195578859819862, |
| "learning_rate": 8.441734417344173e-07, |
| "loss": 0.1579, |
| "mean_token_accuracy": 0.949033334851265, |
| "num_tokens": 127388201.0, |
| "step": 854 |
| }, |
| { |
| "epoch": 0.579464588275161, |
| "grad_norm": 0.49000810389007354, |
| "learning_rate": 8.428184281842818e-07, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9488765150308609, |
| "num_tokens": 127534594.0, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.5801423246357167, |
| "grad_norm": 1.0802510638369283, |
| "learning_rate": 8.414634146341463e-07, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9483487829566002, |
| "num_tokens": 127683017.0, |
| "step": 856 |
| }, |
| { |
| "epoch": 0.5808200609962725, |
| "grad_norm": 0.6578855599046194, |
| "learning_rate": 8.401084010840108e-07, |
| "loss": 0.1512, |
| "mean_token_accuracy": 0.9515197053551674, |
| "num_tokens": 127832419.0, |
| "step": 857 |
| }, |
| { |
| "epoch": 0.5814977973568282, |
| "grad_norm": 0.4359659612480235, |
| "learning_rate": 8.387533875338753e-07, |
| "loss": 0.1639, |
| "mean_token_accuracy": 0.9471932053565979, |
| "num_tokens": 127982085.0, |
| "step": 858 |
| }, |
| { |
| "epoch": 0.582175533717384, |
| "grad_norm": 1.819776022271404, |
| "learning_rate": 8.373983739837398e-07, |
| "loss": 0.1702, |
| "mean_token_accuracy": 0.9458101093769073, |
| "num_tokens": 128128922.0, |
| "step": 859 |
| }, |
| { |
| "epoch": 0.5828532700779396, |
| "grad_norm": 0.4258732751942105, |
| "learning_rate": 8.360433604336044e-07, |
| "loss": 0.1711, |
| "mean_token_accuracy": 0.9463748782873154, |
| "num_tokens": 128277048.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.5835310064384954, |
| "grad_norm": 0.4183924777076949, |
| "learning_rate": 8.346883468834688e-07, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9493186771869659, |
| "num_tokens": 128423094.0, |
| "step": 861 |
| }, |
| { |
| "epoch": 0.5842087427990512, |
| "grad_norm": 0.4758718297804327, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9486329630017281, |
| "num_tokens": 128574980.0, |
| "step": 862 |
| }, |
| { |
| "epoch": 0.5848864791596069, |
| "grad_norm": 0.4797506718095942, |
| "learning_rate": 8.319783197831977e-07, |
| "loss": 0.16, |
| "mean_token_accuracy": 0.9494662657380104, |
| "num_tokens": 128725510.0, |
| "step": 863 |
| }, |
| { |
| "epoch": 0.5855642155201627, |
| "grad_norm": 0.6113872026540533, |
| "learning_rate": 8.306233062330623e-07, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9471870213747025, |
| "num_tokens": 128876751.0, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.5862419518807184, |
| "grad_norm": 0.9150236072650709, |
| "learning_rate": 8.292682926829268e-07, |
| "loss": 0.1697, |
| "mean_token_accuracy": 0.9455385357141495, |
| "num_tokens": 129030714.0, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.5869196882412742, |
| "grad_norm": 0.37306588743257885, |
| "learning_rate": 8.279132791327913e-07, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9470183029770851, |
| "num_tokens": 129180043.0, |
| "step": 866 |
| }, |
| { |
| "epoch": 0.5875974246018298, |
| "grad_norm": 0.46435908405163356, |
| "learning_rate": 8.265582655826557e-07, |
| "loss": 0.176, |
| "mean_token_accuracy": 0.9439493343234062, |
| "num_tokens": 129332537.0, |
| "step": 867 |
| }, |
| { |
| "epoch": 0.5882751609623856, |
| "grad_norm": 0.465667821102405, |
| "learning_rate": 8.252032520325202e-07, |
| "loss": 0.1732, |
| "mean_token_accuracy": 0.9455002173781395, |
| "num_tokens": 129482278.0, |
| "step": 868 |
| }, |
| { |
| "epoch": 0.5889528973229414, |
| "grad_norm": 0.4219842630697802, |
| "learning_rate": 8.238482384823849e-07, |
| "loss": 0.1683, |
| "mean_token_accuracy": 0.9466238841414452, |
| "num_tokens": 129629550.0, |
| "step": 869 |
| }, |
| { |
| "epoch": 0.5896306336834971, |
| "grad_norm": 0.4745502832196809, |
| "learning_rate": 8.224932249322493e-07, |
| "loss": 0.1679, |
| "mean_token_accuracy": 0.94708351790905, |
| "num_tokens": 129776202.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.5903083700440529, |
| "grad_norm": 0.42594833229248413, |
| "learning_rate": 8.211382113821138e-07, |
| "loss": 0.1555, |
| "mean_token_accuracy": 0.9506156072020531, |
| "num_tokens": 129923307.0, |
| "step": 871 |
| }, |
| { |
| "epoch": 0.5909861064046086, |
| "grad_norm": 0.3876767907960687, |
| "learning_rate": 8.197831978319782e-07, |
| "loss": 0.169, |
| "mean_token_accuracy": 0.9464268982410431, |
| "num_tokens": 130071563.0, |
| "step": 872 |
| }, |
| { |
| "epoch": 0.5916638427651644, |
| "grad_norm": 0.4452387059716283, |
| "learning_rate": 8.184281842818428e-07, |
| "loss": 0.1543, |
| "mean_token_accuracy": 0.9506272077560425, |
| "num_tokens": 130218754.0, |
| "step": 873 |
| }, |
| { |
| "epoch": 0.59234157912572, |
| "grad_norm": 0.39458214992094115, |
| "learning_rate": 8.170731707317072e-07, |
| "loss": 0.1486, |
| "mean_token_accuracy": 0.9521933421492577, |
| "num_tokens": 130372245.0, |
| "step": 874 |
| }, |
| { |
| "epoch": 0.5930193154862758, |
| "grad_norm": 0.4060057020635777, |
| "learning_rate": 8.157181571815718e-07, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9476899206638336, |
| "num_tokens": 130522066.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.5936970518468316, |
| "grad_norm": 0.3740875728540713, |
| "learning_rate": 8.143631436314363e-07, |
| "loss": 0.1567, |
| "mean_token_accuracy": 0.949752576649189, |
| "num_tokens": 130670822.0, |
| "step": 876 |
| }, |
| { |
| "epoch": 0.5943747882073873, |
| "grad_norm": 0.449478208310373, |
| "learning_rate": 8.130081300813008e-07, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9482812359929085, |
| "num_tokens": 130823135.0, |
| "step": 877 |
| }, |
| { |
| "epoch": 0.5950525245679431, |
| "grad_norm": 0.470574147535195, |
| "learning_rate": 8.116531165311653e-07, |
| "loss": 0.1693, |
| "mean_token_accuracy": 0.9464894384145737, |
| "num_tokens": 130976061.0, |
| "step": 878 |
| }, |
| { |
| "epoch": 0.5957302609284988, |
| "grad_norm": 0.43273814562418705, |
| "learning_rate": 8.102981029810297e-07, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.946991890668869, |
| "num_tokens": 131130287.0, |
| "step": 879 |
| }, |
| { |
| "epoch": 0.5964079972890546, |
| "grad_norm": 0.5133374187671007, |
| "learning_rate": 8.089430894308943e-07, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9469396248459816, |
| "num_tokens": 131279600.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.5970857336496103, |
| "grad_norm": 0.5527391693964944, |
| "learning_rate": 8.075880758807587e-07, |
| "loss": 0.1639, |
| "mean_token_accuracy": 0.9478160068392754, |
| "num_tokens": 131429003.0, |
| "step": 881 |
| }, |
| { |
| "epoch": 0.597763470010166, |
| "grad_norm": 0.5869559180286896, |
| "learning_rate": 8.062330623306233e-07, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9487715065479279, |
| "num_tokens": 131581100.0, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.5984412063707217, |
| "grad_norm": 0.5139028440956621, |
| "learning_rate": 8.048780487804878e-07, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9471326619386673, |
| "num_tokens": 131730402.0, |
| "step": 883 |
| }, |
| { |
| "epoch": 0.5991189427312775, |
| "grad_norm": 0.4782042342078006, |
| "learning_rate": 8.035230352303522e-07, |
| "loss": 0.1687, |
| "mean_token_accuracy": 0.945782758295536, |
| "num_tokens": 131883450.0, |
| "step": 884 |
| }, |
| { |
| "epoch": 0.5997966790918333, |
| "grad_norm": 0.5420589297463521, |
| "learning_rate": 8.021680216802168e-07, |
| "loss": 0.1546, |
| "mean_token_accuracy": 0.9501096978783607, |
| "num_tokens": 132035968.0, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.600474415452389, |
| "grad_norm": 0.5659045444746059, |
| "learning_rate": 8.008130081300813e-07, |
| "loss": 0.1591, |
| "mean_token_accuracy": 0.9486903175711632, |
| "num_tokens": 132185887.0, |
| "step": 886 |
| }, |
| { |
| "epoch": 0.6011521518129448, |
| "grad_norm": 0.5846227300914831, |
| "learning_rate": 7.994579945799458e-07, |
| "loss": 0.1734, |
| "mean_token_accuracy": 0.945100449025631, |
| "num_tokens": 132333916.0, |
| "step": 887 |
| }, |
| { |
| "epoch": 0.6018298881735005, |
| "grad_norm": 0.43980703857602094, |
| "learning_rate": 7.981029810298102e-07, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.946213386952877, |
| "num_tokens": 132484111.0, |
| "step": 888 |
| }, |
| { |
| "epoch": 0.6025076245340563, |
| "grad_norm": 0.38862752445000626, |
| "learning_rate": 7.967479674796747e-07, |
| "loss": 0.171, |
| "mean_token_accuracy": 0.9452003985643387, |
| "num_tokens": 132638890.0, |
| "step": 889 |
| }, |
| { |
| "epoch": 0.6031853608946119, |
| "grad_norm": 0.5018014213821751, |
| "learning_rate": 7.953929539295394e-07, |
| "loss": 0.1591, |
| "mean_token_accuracy": 0.9488074406981468, |
| "num_tokens": 132787007.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.6038630972551677, |
| "grad_norm": 0.6087439529267555, |
| "learning_rate": 7.940379403794038e-07, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9478660523891449, |
| "num_tokens": 132936200.0, |
| "step": 891 |
| }, |
| { |
| "epoch": 0.6045408336157235, |
| "grad_norm": 0.49604043868352027, |
| "learning_rate": 7.926829268292683e-07, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9487058073282242, |
| "num_tokens": 133088672.0, |
| "step": 892 |
| }, |
| { |
| "epoch": 0.6052185699762792, |
| "grad_norm": 0.7340801375571371, |
| "learning_rate": 7.913279132791327e-07, |
| "loss": 0.1695, |
| "mean_token_accuracy": 0.9449058994650841, |
| "num_tokens": 133239608.0, |
| "step": 893 |
| }, |
| { |
| "epoch": 0.605896306336835, |
| "grad_norm": 0.39721191202967127, |
| "learning_rate": 7.899728997289973e-07, |
| "loss": 0.164, |
| "mean_token_accuracy": 0.9476265981793404, |
| "num_tokens": 133387750.0, |
| "step": 894 |
| }, |
| { |
| "epoch": 0.6065740426973907, |
| "grad_norm": 0.4376674787899762, |
| "learning_rate": 7.886178861788617e-07, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.947635717689991, |
| "num_tokens": 133535888.0, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.6072517790579465, |
| "grad_norm": 0.42423057779823, |
| "learning_rate": 7.872628726287263e-07, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.9462937116622925, |
| "num_tokens": 133681883.0, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.6079295154185022, |
| "grad_norm": 0.5936497940257488, |
| "learning_rate": 7.859078590785907e-07, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.946974903345108, |
| "num_tokens": 133833687.0, |
| "step": 897 |
| }, |
| { |
| "epoch": 0.608607251779058, |
| "grad_norm": 0.5219859690946337, |
| "learning_rate": 7.845528455284552e-07, |
| "loss": 0.1736, |
| "mean_token_accuracy": 0.9455397203564644, |
| "num_tokens": 133986910.0, |
| "step": 898 |
| }, |
| { |
| "epoch": 0.6092849881396137, |
| "grad_norm": 0.4774722891619204, |
| "learning_rate": 7.831978319783198e-07, |
| "loss": 0.1698, |
| "mean_token_accuracy": 0.9463390782475471, |
| "num_tokens": 134132997.0, |
| "step": 899 |
| }, |
| { |
| "epoch": 0.6099627245001694, |
| "grad_norm": 0.4850631210301614, |
| "learning_rate": 7.818428184281842e-07, |
| "loss": 0.154, |
| "mean_token_accuracy": 0.9509309977293015, |
| "num_tokens": 134279331.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6106404608607252, |
| "grad_norm": 2.0643418841530607, |
| "learning_rate": 7.804878048780488e-07, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9475974664092064, |
| "num_tokens": 134430596.0, |
| "step": 901 |
| }, |
| { |
| "epoch": 0.6113181972212809, |
| "grad_norm": 0.4523057522995995, |
| "learning_rate": 7.791327913279132e-07, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9492429792881012, |
| "num_tokens": 134582264.0, |
| "step": 902 |
| }, |
| { |
| "epoch": 0.6119959335818367, |
| "grad_norm": 0.772571598377388, |
| "learning_rate": 7.777777777777778e-07, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9482496008276939, |
| "num_tokens": 134729586.0, |
| "step": 903 |
| }, |
| { |
| "epoch": 0.6126736699423924, |
| "grad_norm": 0.9569695012936315, |
| "learning_rate": 7.764227642276422e-07, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9479860961437225, |
| "num_tokens": 134876147.0, |
| "step": 904 |
| }, |
| { |
| "epoch": 0.6133514063029482, |
| "grad_norm": 3.4385418189852968, |
| "learning_rate": 7.750677506775067e-07, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9483773708343506, |
| "num_tokens": 135027603.0, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.614029142663504, |
| "grad_norm": 0.4428624807588454, |
| "learning_rate": 7.737127371273712e-07, |
| "loss": 0.1696, |
| "mean_token_accuracy": 0.9465060532093048, |
| "num_tokens": 135175544.0, |
| "step": 906 |
| }, |
| { |
| "epoch": 0.6147068790240596, |
| "grad_norm": 0.42033595784536043, |
| "learning_rate": 7.723577235772358e-07, |
| "loss": 0.1613, |
| "mean_token_accuracy": 0.9488836824893951, |
| "num_tokens": 135325813.0, |
| "step": 907 |
| }, |
| { |
| "epoch": 0.6153846153846154, |
| "grad_norm": 0.4431305406629688, |
| "learning_rate": 7.710027100271003e-07, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9485266506671906, |
| "num_tokens": 135475422.0, |
| "step": 908 |
| }, |
| { |
| "epoch": 0.6160623517451711, |
| "grad_norm": 0.46267044020984877, |
| "learning_rate": 7.696476964769647e-07, |
| "loss": 0.1711, |
| "mean_token_accuracy": 0.946107029914856, |
| "num_tokens": 135622188.0, |
| "step": 909 |
| }, |
| { |
| "epoch": 0.6167400881057269, |
| "grad_norm": 0.7193235514746738, |
| "learning_rate": 7.682926829268292e-07, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9476191028952599, |
| "num_tokens": 135771377.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.6174178244662826, |
| "grad_norm": 0.38960900983203894, |
| "learning_rate": 7.669376693766937e-07, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.9476170986890793, |
| "num_tokens": 135920418.0, |
| "step": 911 |
| }, |
| { |
| "epoch": 0.6180955608268384, |
| "grad_norm": 0.7216784295260695, |
| "learning_rate": 7.655826558265583e-07, |
| "loss": 0.1718, |
| "mean_token_accuracy": 0.9461710155010223, |
| "num_tokens": 136068122.0, |
| "step": 912 |
| }, |
| { |
| "epoch": 0.6187732971873942, |
| "grad_norm": 0.4510715884297431, |
| "learning_rate": 7.642276422764228e-07, |
| "loss": 0.1721, |
| "mean_token_accuracy": 0.9446103274822235, |
| "num_tokens": 136217084.0, |
| "step": 913 |
| }, |
| { |
| "epoch": 0.6194510335479498, |
| "grad_norm": 0.5186949964426198, |
| "learning_rate": 7.628726287262872e-07, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.9492061957716942, |
| "num_tokens": 136362608.0, |
| "step": 914 |
| }, |
| { |
| "epoch": 0.6201287699085056, |
| "grad_norm": 0.4111352524011199, |
| "learning_rate": 7.615176151761518e-07, |
| "loss": 0.1579, |
| "mean_token_accuracy": 0.9499277547001839, |
| "num_tokens": 136515985.0, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.6208065062690613, |
| "grad_norm": 0.5139528769121537, |
| "learning_rate": 7.601626016260162e-07, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9490419402718544, |
| "num_tokens": 136666846.0, |
| "step": 916 |
| }, |
| { |
| "epoch": 0.6214842426296171, |
| "grad_norm": 0.6677050444967101, |
| "learning_rate": 7.588075880758807e-07, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9486491605639458, |
| "num_tokens": 136814955.0, |
| "step": 917 |
| }, |
| { |
| "epoch": 0.6221619789901728, |
| "grad_norm": 0.5997639548038106, |
| "learning_rate": 7.574525745257452e-07, |
| "loss": 0.1558, |
| "mean_token_accuracy": 0.9490111693739891, |
| "num_tokens": 136962819.0, |
| "step": 918 |
| }, |
| { |
| "epoch": 0.6228397153507286, |
| "grad_norm": 0.3898808395054342, |
| "learning_rate": 7.560975609756097e-07, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9494509026408195, |
| "num_tokens": 137107360.0, |
| "step": 919 |
| }, |
| { |
| "epoch": 0.6235174517112844, |
| "grad_norm": 0.5185812674164358, |
| "learning_rate": 7.547425474254743e-07, |
| "loss": 0.1582, |
| "mean_token_accuracy": 0.9492890685796738, |
| "num_tokens": 137254114.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.62419518807184, |
| "grad_norm": 0.4597891887499145, |
| "learning_rate": 7.533875338753387e-07, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9472557231783867, |
| "num_tokens": 137407322.0, |
| "step": 921 |
| }, |
| { |
| "epoch": 0.6248729244323958, |
| "grad_norm": 0.43904471022087876, |
| "learning_rate": 7.520325203252032e-07, |
| "loss": 0.1713, |
| "mean_token_accuracy": 0.9465183466672897, |
| "num_tokens": 137552436.0, |
| "step": 922 |
| }, |
| { |
| "epoch": 0.6255506607929515, |
| "grad_norm": 0.5280919024943563, |
| "learning_rate": 7.506775067750677e-07, |
| "loss": 0.175, |
| "mean_token_accuracy": 0.9440080150961876, |
| "num_tokens": 137698616.0, |
| "step": 923 |
| }, |
| { |
| "epoch": 0.6262283971535073, |
| "grad_norm": 0.5085519075809602, |
| "learning_rate": 7.493224932249323e-07, |
| "loss": 0.1669, |
| "mean_token_accuracy": 0.9469470083713531, |
| "num_tokens": 137852417.0, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.626906133514063, |
| "grad_norm": 0.9381387752413748, |
| "learning_rate": 7.479674796747967e-07, |
| "loss": 0.1551, |
| "mean_token_accuracy": 0.9501704648137093, |
| "num_tokens": 138002264.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.6275838698746188, |
| "grad_norm": 0.4065937650418371, |
| "learning_rate": 7.466124661246612e-07, |
| "loss": 0.1689, |
| "mean_token_accuracy": 0.9459371268749237, |
| "num_tokens": 138150177.0, |
| "step": 926 |
| }, |
| { |
| "epoch": 0.6282616062351745, |
| "grad_norm": 0.6068927611317052, |
| "learning_rate": 7.452574525745256e-07, |
| "loss": 0.164, |
| "mean_token_accuracy": 0.9475051537156105, |
| "num_tokens": 138299546.0, |
| "step": 927 |
| }, |
| { |
| "epoch": 0.6289393425957303, |
| "grad_norm": 0.48424123680078385, |
| "learning_rate": 7.439024390243903e-07, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9467552751302719, |
| "num_tokens": 138451328.0, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.629617078956286, |
| "grad_norm": 0.5033477602408023, |
| "learning_rate": 7.425474254742548e-07, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9471101090312004, |
| "num_tokens": 138597072.0, |
| "step": 929 |
| }, |
| { |
| "epoch": 0.6302948153168417, |
| "grad_norm": 0.4618401305691291, |
| "learning_rate": 7.411924119241192e-07, |
| "loss": 0.1742, |
| "mean_token_accuracy": 0.9449163228273392, |
| "num_tokens": 138746702.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.6309725516773975, |
| "grad_norm": 0.5859131515689481, |
| "learning_rate": 7.398373983739837e-07, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.9446366801857948, |
| "num_tokens": 138898702.0, |
| "step": 931 |
| }, |
| { |
| "epoch": 0.6316502880379532, |
| "grad_norm": 0.4646050979072507, |
| "learning_rate": 7.384823848238481e-07, |
| "loss": 0.1547, |
| "mean_token_accuracy": 0.9500670656561852, |
| "num_tokens": 139050482.0, |
| "step": 932 |
| }, |
| { |
| "epoch": 0.632328024398509, |
| "grad_norm": 0.4938179522850119, |
| "learning_rate": 7.371273712737127e-07, |
| "loss": 0.1655, |
| "mean_token_accuracy": 0.9475302547216415, |
| "num_tokens": 139202406.0, |
| "step": 933 |
| }, |
| { |
| "epoch": 0.6330057607590647, |
| "grad_norm": 0.4587172954884775, |
| "learning_rate": 7.357723577235772e-07, |
| "loss": 0.1567, |
| "mean_token_accuracy": 0.9493889287114143, |
| "num_tokens": 139349833.0, |
| "step": 934 |
| }, |
| { |
| "epoch": 0.6336834971196205, |
| "grad_norm": 0.6670031930804362, |
| "learning_rate": 7.344173441734417e-07, |
| "loss": 0.1596, |
| "mean_token_accuracy": 0.9493798017501831, |
| "num_tokens": 139500833.0, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.6343612334801763, |
| "grad_norm": 0.4389808125315095, |
| "learning_rate": 7.330623306233062e-07, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.9476785436272621, |
| "num_tokens": 139648168.0, |
| "step": 936 |
| }, |
| { |
| "epoch": 0.6350389698407319, |
| "grad_norm": 0.37509769597300396, |
| "learning_rate": 7.317073170731707e-07, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.9489102885127068, |
| "num_tokens": 139794445.0, |
| "step": 937 |
| }, |
| { |
| "epoch": 0.6357167062012877, |
| "grad_norm": 0.53982506710144, |
| "learning_rate": 7.303523035230352e-07, |
| "loss": 0.1505, |
| "mean_token_accuracy": 0.9502668455243111, |
| "num_tokens": 139942868.0, |
| "step": 938 |
| }, |
| { |
| "epoch": 0.6363944425618434, |
| "grad_norm": 0.4635865192147379, |
| "learning_rate": 7.289972899728997e-07, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9481147155165672, |
| "num_tokens": 140093883.0, |
| "step": 939 |
| }, |
| { |
| "epoch": 0.6370721789223992, |
| "grad_norm": 0.41885841355844794, |
| "learning_rate": 7.276422764227642e-07, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9470746591687202, |
| "num_tokens": 140240725.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.6377499152829549, |
| "grad_norm": 0.47462858953487475, |
| "learning_rate": 7.262872628726287e-07, |
| "loss": 0.172, |
| "mean_token_accuracy": 0.9460104629397392, |
| "num_tokens": 140388967.0, |
| "step": 941 |
| }, |
| { |
| "epoch": 0.6384276516435107, |
| "grad_norm": 1.1343146325099123, |
| "learning_rate": 7.249322493224932e-07, |
| "loss": 0.1735, |
| "mean_token_accuracy": 0.9451970756053925, |
| "num_tokens": 140541231.0, |
| "step": 942 |
| }, |
| { |
| "epoch": 0.6391053880040665, |
| "grad_norm": 0.6104215798107635, |
| "learning_rate": 7.235772357723577e-07, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9492672756314278, |
| "num_tokens": 140692424.0, |
| "step": 943 |
| }, |
| { |
| "epoch": 0.6397831243646221, |
| "grad_norm": 0.421253631362635, |
| "learning_rate": 7.222222222222221e-07, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9471209272742271, |
| "num_tokens": 140841186.0, |
| "step": 944 |
| }, |
| { |
| "epoch": 0.6404608607251779, |
| "grad_norm": 0.38741060193677335, |
| "learning_rate": 7.208672086720868e-07, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9487796202301979, |
| "num_tokens": 140991562.0, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.6411385970857336, |
| "grad_norm": 0.41674417095219957, |
| "learning_rate": 7.195121951219512e-07, |
| "loss": 0.1742, |
| "mean_token_accuracy": 0.9446077048778534, |
| "num_tokens": 141139839.0, |
| "step": 946 |
| }, |
| { |
| "epoch": 0.6418163334462894, |
| "grad_norm": 0.4176168006364939, |
| "learning_rate": 7.181571815718157e-07, |
| "loss": 0.1524, |
| "mean_token_accuracy": 0.9500337019562721, |
| "num_tokens": 141288450.0, |
| "step": 947 |
| }, |
| { |
| "epoch": 0.6424940698068451, |
| "grad_norm": 0.47308408138568686, |
| "learning_rate": 7.168021680216801e-07, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9485318809747696, |
| "num_tokens": 141439179.0, |
| "step": 948 |
| }, |
| { |
| "epoch": 0.6431718061674009, |
| "grad_norm": 0.39253531614222453, |
| "learning_rate": 7.154471544715447e-07, |
| "loss": 0.1556, |
| "mean_token_accuracy": 0.9496328011155128, |
| "num_tokens": 141591861.0, |
| "step": 949 |
| }, |
| { |
| "epoch": 0.6438495425279567, |
| "grad_norm": 0.4843422610365838, |
| "learning_rate": 7.140921409214093e-07, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9474169239401817, |
| "num_tokens": 141741485.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.6445272788885124, |
| "grad_norm": 0.5257492858880942, |
| "learning_rate": 7.127371273712737e-07, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9472004100680351, |
| "num_tokens": 141890794.0, |
| "step": 951 |
| }, |
| { |
| "epoch": 0.6452050152490681, |
| "grad_norm": 0.5382929426731695, |
| "learning_rate": 7.113821138211382e-07, |
| "loss": 0.1562, |
| "mean_token_accuracy": 0.9496374428272247, |
| "num_tokens": 142041227.0, |
| "step": 952 |
| }, |
| { |
| "epoch": 0.6458827516096238, |
| "grad_norm": 0.4590965631424657, |
| "learning_rate": 7.100271002710026e-07, |
| "loss": 0.1557, |
| "mean_token_accuracy": 0.949472963809967, |
| "num_tokens": 142189286.0, |
| "step": 953 |
| }, |
| { |
| "epoch": 0.6465604879701796, |
| "grad_norm": 0.9315953451855179, |
| "learning_rate": 7.086720867208672e-07, |
| "loss": 0.1675, |
| "mean_token_accuracy": 0.9469330534338951, |
| "num_tokens": 142335353.0, |
| "step": 954 |
| }, |
| { |
| "epoch": 0.6472382243307353, |
| "grad_norm": 0.49292357092040273, |
| "learning_rate": 7.073170731707316e-07, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9490496292710304, |
| "num_tokens": 142487994.0, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.6479159606912911, |
| "grad_norm": 0.6003853214589224, |
| "learning_rate": 7.059620596205962e-07, |
| "loss": 0.1739, |
| "mean_token_accuracy": 0.94477578997612, |
| "num_tokens": 142641052.0, |
| "step": 956 |
| }, |
| { |
| "epoch": 0.6485936970518469, |
| "grad_norm": 0.43738049928150774, |
| "learning_rate": 7.046070460704606e-07, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.948681466281414, |
| "num_tokens": 142785857.0, |
| "step": 957 |
| }, |
| { |
| "epoch": 0.6492714334124026, |
| "grad_norm": 0.5945071762572889, |
| "learning_rate": 7.032520325203252e-07, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.947319395840168, |
| "num_tokens": 142934275.0, |
| "step": 958 |
| }, |
| { |
| "epoch": 0.6499491697729584, |
| "grad_norm": 0.39907508200233105, |
| "learning_rate": 7.018970189701897e-07, |
| "loss": 0.1717, |
| "mean_token_accuracy": 0.944616362452507, |
| "num_tokens": 143085778.0, |
| "step": 959 |
| }, |
| { |
| "epoch": 0.650626906133514, |
| "grad_norm": 0.43112091341931374, |
| "learning_rate": 7.005420054200541e-07, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9482121244072914, |
| "num_tokens": 143237406.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.6513046424940698, |
| "grad_norm": 0.37422520350770055, |
| "learning_rate": 6.991869918699187e-07, |
| "loss": 0.1534, |
| "mean_token_accuracy": 0.9510421678423882, |
| "num_tokens": 143388807.0, |
| "step": 961 |
| }, |
| { |
| "epoch": 0.6519823788546255, |
| "grad_norm": 0.5373532478650356, |
| "learning_rate": 6.978319783197832e-07, |
| "loss": 0.1584, |
| "mean_token_accuracy": 0.9494762420654297, |
| "num_tokens": 143538958.0, |
| "step": 962 |
| }, |
| { |
| "epoch": 0.6526601152151813, |
| "grad_norm": 0.5160251596916773, |
| "learning_rate": 6.964769647696477e-07, |
| "loss": 0.1713, |
| "mean_token_accuracy": 0.9450407698750496, |
| "num_tokens": 143689416.0, |
| "step": 963 |
| }, |
| { |
| "epoch": 0.653337851575737, |
| "grad_norm": 0.43329708914509557, |
| "learning_rate": 6.951219512195121e-07, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9481780678033829, |
| "num_tokens": 143839600.0, |
| "step": 964 |
| }, |
| { |
| "epoch": 0.6540155879362928, |
| "grad_norm": 0.7329749772251899, |
| "learning_rate": 6.937669376693766e-07, |
| "loss": 0.1697, |
| "mean_token_accuracy": 0.9457468092441559, |
| "num_tokens": 143990179.0, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.6546933242968486, |
| "grad_norm": 0.7453207951547401, |
| "learning_rate": 6.924119241192413e-07, |
| "loss": 0.159, |
| "mean_token_accuracy": 0.9484611824154854, |
| "num_tokens": 144142458.0, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.6553710606574042, |
| "grad_norm": 0.42955950036507906, |
| "learning_rate": 6.910569105691057e-07, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9487689658999443, |
| "num_tokens": 144293182.0, |
| "step": 967 |
| }, |
| { |
| "epoch": 0.65604879701796, |
| "grad_norm": 0.46246879113498524, |
| "learning_rate": 6.897018970189702e-07, |
| "loss": 0.1673, |
| "mean_token_accuracy": 0.9465522468090057, |
| "num_tokens": 144441849.0, |
| "step": 968 |
| }, |
| { |
| "epoch": 0.6567265333785157, |
| "grad_norm": 0.4728480565761358, |
| "learning_rate": 6.883468834688346e-07, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9483184814453125, |
| "num_tokens": 144594086.0, |
| "step": 969 |
| }, |
| { |
| "epoch": 0.6574042697390715, |
| "grad_norm": 0.5700572199360658, |
| "learning_rate": 6.869918699186991e-07, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.948683463037014, |
| "num_tokens": 144743175.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.6580820060996272, |
| "grad_norm": 0.4478620978369501, |
| "learning_rate": 6.856368563685636e-07, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9477500319480896, |
| "num_tokens": 144890797.0, |
| "step": 971 |
| }, |
| { |
| "epoch": 0.658759742460183, |
| "grad_norm": 0.4471516180312603, |
| "learning_rate": 6.842818428184282e-07, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9481164589524269, |
| "num_tokens": 145039973.0, |
| "step": 972 |
| }, |
| { |
| "epoch": 0.6594374788207388, |
| "grad_norm": 0.43607758337967034, |
| "learning_rate": 6.829268292682927e-07, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9476160705089569, |
| "num_tokens": 145186940.0, |
| "step": 973 |
| }, |
| { |
| "epoch": 0.6601152151812945, |
| "grad_norm": 0.6017629461734204, |
| "learning_rate": 6.815718157181571e-07, |
| "loss": 0.1717, |
| "mean_token_accuracy": 0.945083349943161, |
| "num_tokens": 145337724.0, |
| "step": 974 |
| }, |
| { |
| "epoch": 0.6607929515418502, |
| "grad_norm": 0.5675225086716628, |
| "learning_rate": 6.802168021680217e-07, |
| "loss": 0.1585, |
| "mean_token_accuracy": 0.9487828463315964, |
| "num_tokens": 145488495.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.6614706879024059, |
| "grad_norm": 0.7115168554273109, |
| "learning_rate": 6.788617886178861e-07, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9486016631126404, |
| "num_tokens": 145639262.0, |
| "step": 976 |
| }, |
| { |
| "epoch": 0.6621484242629617, |
| "grad_norm": 0.41618104203918604, |
| "learning_rate": 6.775067750677507e-07, |
| "loss": 0.1564, |
| "mean_token_accuracy": 0.9503029733896255, |
| "num_tokens": 145786917.0, |
| "step": 977 |
| }, |
| { |
| "epoch": 0.6628261606235174, |
| "grad_norm": 0.539733540745076, |
| "learning_rate": 6.761517615176151e-07, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9473312497138977, |
| "num_tokens": 145937276.0, |
| "step": 978 |
| }, |
| { |
| "epoch": 0.6635038969840732, |
| "grad_norm": 0.4949885821070727, |
| "learning_rate": 6.747967479674797e-07, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.948186106979847, |
| "num_tokens": 146086934.0, |
| "step": 979 |
| }, |
| { |
| "epoch": 0.664181633344629, |
| "grad_norm": 0.43336406807299116, |
| "learning_rate": 6.734417344173442e-07, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.946333147585392, |
| "num_tokens": 146233812.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.6648593697051847, |
| "grad_norm": 0.49588130939765285, |
| "learning_rate": 6.720867208672086e-07, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.9457742869853973, |
| "num_tokens": 146380514.0, |
| "step": 981 |
| }, |
| { |
| "epoch": 0.6655371060657405, |
| "grad_norm": 0.4596389273240423, |
| "learning_rate": 6.707317073170731e-07, |
| "loss": 0.1562, |
| "mean_token_accuracy": 0.9501648396253586, |
| "num_tokens": 146527322.0, |
| "step": 982 |
| }, |
| { |
| "epoch": 0.6662148424262961, |
| "grad_norm": 0.6372506840384964, |
| "learning_rate": 6.693766937669377e-07, |
| "loss": 0.1697, |
| "mean_token_accuracy": 0.946533665060997, |
| "num_tokens": 146675600.0, |
| "step": 983 |
| }, |
| { |
| "epoch": 0.6668925787868519, |
| "grad_norm": 0.3954764119023803, |
| "learning_rate": 6.680216802168022e-07, |
| "loss": 0.1573, |
| "mean_token_accuracy": 0.9492572247982025, |
| "num_tokens": 146826945.0, |
| "step": 984 |
| }, |
| { |
| "epoch": 0.6675703151474076, |
| "grad_norm": 0.5041113695619949, |
| "learning_rate": 6.666666666666666e-07, |
| "loss": 0.1682, |
| "mean_token_accuracy": 0.946345716714859, |
| "num_tokens": 146977048.0, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.6682480515079634, |
| "grad_norm": 0.42733097214955723, |
| "learning_rate": 6.653116531165311e-07, |
| "loss": 0.173, |
| "mean_token_accuracy": 0.9447240754961967, |
| "num_tokens": 147126835.0, |
| "step": 986 |
| }, |
| { |
| "epoch": 0.6689257878685192, |
| "grad_norm": 0.43474026258546133, |
| "learning_rate": 6.639566395663955e-07, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9477177634835243, |
| "num_tokens": 147279051.0, |
| "step": 987 |
| }, |
| { |
| "epoch": 0.6696035242290749, |
| "grad_norm": 0.3979093996109721, |
| "learning_rate": 6.626016260162602e-07, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9484685435891151, |
| "num_tokens": 147429614.0, |
| "step": 988 |
| }, |
| { |
| "epoch": 0.6702812605896307, |
| "grad_norm": 0.6075977010738575, |
| "learning_rate": 6.612466124661247e-07, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9487278908491135, |
| "num_tokens": 147578177.0, |
| "step": 989 |
| }, |
| { |
| "epoch": 0.6709589969501863, |
| "grad_norm": 0.6626901297393074, |
| "learning_rate": 6.598915989159891e-07, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9505260586738586, |
| "num_tokens": 147728883.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.6716367333107421, |
| "grad_norm": 0.44245900135479793, |
| "learning_rate": 6.585365853658536e-07, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9478070139884949, |
| "num_tokens": 147880083.0, |
| "step": 991 |
| }, |
| { |
| "epoch": 0.6723144696712978, |
| "grad_norm": 0.44930521995867917, |
| "learning_rate": 6.571815718157181e-07, |
| "loss": 0.1515, |
| "mean_token_accuracy": 0.9511049762368202, |
| "num_tokens": 148029594.0, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.6729922060318536, |
| "grad_norm": 0.47838561589504064, |
| "learning_rate": 6.558265582655827e-07, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9485039636492729, |
| "num_tokens": 148176040.0, |
| "step": 993 |
| }, |
| { |
| "epoch": 0.6736699423924094, |
| "grad_norm": 0.45346399169896656, |
| "learning_rate": 6.544715447154471e-07, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9471649006009102, |
| "num_tokens": 148328243.0, |
| "step": 994 |
| }, |
| { |
| "epoch": 0.6743476787529651, |
| "grad_norm": 0.40105875616680847, |
| "learning_rate": 6.531165311653116e-07, |
| "loss": 0.1714, |
| "mean_token_accuracy": 0.9450387135148048, |
| "num_tokens": 148478098.0, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.6750254151135209, |
| "grad_norm": 0.4691349395187444, |
| "learning_rate": 6.517615176151762e-07, |
| "loss": 0.1733, |
| "mean_token_accuracy": 0.9437250271439552, |
| "num_tokens": 148626030.0, |
| "step": 996 |
| }, |
| { |
| "epoch": 0.6757031514740766, |
| "grad_norm": 0.4670006204550999, |
| "learning_rate": 6.504065040650406e-07, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9475633949041367, |
| "num_tokens": 148773461.0, |
| "step": 997 |
| }, |
| { |
| "epoch": 0.6763808878346323, |
| "grad_norm": 0.4226966853445537, |
| "learning_rate": 6.490514905149051e-07, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.947656437754631, |
| "num_tokens": 148921060.0, |
| "step": 998 |
| }, |
| { |
| "epoch": 0.677058624195188, |
| "grad_norm": 0.37248207883522355, |
| "learning_rate": 6.476964769647696e-07, |
| "loss": 0.1574, |
| "mean_token_accuracy": 0.9487950205802917, |
| "num_tokens": 149070708.0, |
| "step": 999 |
| }, |
| { |
| "epoch": 0.6777363605557438, |
| "grad_norm": 0.8744351429100297, |
| "learning_rate": 6.463414634146342e-07, |
| "loss": 0.1551, |
| "mean_token_accuracy": 0.9497079774737358, |
| "num_tokens": 149221242.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6784140969162996, |
| "grad_norm": 0.5697798300327783, |
| "learning_rate": 6.449864498644986e-07, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9479104578495026, |
| "num_tokens": 149373988.0, |
| "step": 1001 |
| }, |
| { |
| "epoch": 0.6790918332768553, |
| "grad_norm": 1.2588346663017063, |
| "learning_rate": 6.436314363143631e-07, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9477178603410721, |
| "num_tokens": 149525452.0, |
| "step": 1002 |
| }, |
| { |
| "epoch": 0.6797695696374111, |
| "grad_norm": 0.5467253562349623, |
| "learning_rate": 6.422764227642276e-07, |
| "loss": 0.145, |
| "mean_token_accuracy": 0.952636294066906, |
| "num_tokens": 149677061.0, |
| "step": 1003 |
| }, |
| { |
| "epoch": 0.6804473059979668, |
| "grad_norm": 0.42502847488356554, |
| "learning_rate": 6.40921409214092e-07, |
| "loss": 0.1736, |
| "mean_token_accuracy": 0.9454517439007759, |
| "num_tokens": 149827751.0, |
| "step": 1004 |
| }, |
| { |
| "epoch": 0.6811250423585226, |
| "grad_norm": 0.524162218685377, |
| "learning_rate": 6.395663956639567e-07, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9461118578910828, |
| "num_tokens": 149974862.0, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.6818027787190782, |
| "grad_norm": 0.4066395805542919, |
| "learning_rate": 6.382113821138211e-07, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9463948607444763, |
| "num_tokens": 150124235.0, |
| "step": 1006 |
| }, |
| { |
| "epoch": 0.682480515079634, |
| "grad_norm": 0.39312368414912413, |
| "learning_rate": 6.368563685636856e-07, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9479557573795319, |
| "num_tokens": 150269048.0, |
| "step": 1007 |
| }, |
| { |
| "epoch": 0.6831582514401897, |
| "grad_norm": 0.4971750767713653, |
| "learning_rate": 6.3550135501355e-07, |
| "loss": 0.1692, |
| "mean_token_accuracy": 0.9455204978585243, |
| "num_tokens": 150414464.0, |
| "step": 1008 |
| }, |
| { |
| "epoch": 0.6838359878007455, |
| "grad_norm": 0.523589931416947, |
| "learning_rate": 6.341463414634146e-07, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9485000520944595, |
| "num_tokens": 150559944.0, |
| "step": 1009 |
| }, |
| { |
| "epoch": 0.6845137241613013, |
| "grad_norm": 0.42982723923499433, |
| "learning_rate": 6.327913279132791e-07, |
| "loss": 0.1639, |
| "mean_token_accuracy": 0.947691835463047, |
| "num_tokens": 150709472.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.685191460521857, |
| "grad_norm": 0.4800134554668242, |
| "learning_rate": 6.314363143631436e-07, |
| "loss": 0.1673, |
| "mean_token_accuracy": 0.9475793689489365, |
| "num_tokens": 150857897.0, |
| "step": 1011 |
| }, |
| { |
| "epoch": 0.6858691968824128, |
| "grad_norm": 0.4965764094852088, |
| "learning_rate": 6.300813008130081e-07, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.9453277885913849, |
| "num_tokens": 151009033.0, |
| "step": 1012 |
| }, |
| { |
| "epoch": 0.6865469332429684, |
| "grad_norm": 0.9911853882172641, |
| "learning_rate": 6.287262872628726e-07, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9489800333976746, |
| "num_tokens": 151158813.0, |
| "step": 1013 |
| }, |
| { |
| "epoch": 0.6872246696035242, |
| "grad_norm": 0.4913798882103245, |
| "learning_rate": 6.273712737127371e-07, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9487544074654579, |
| "num_tokens": 151306636.0, |
| "step": 1014 |
| }, |
| { |
| "epoch": 0.6879024059640799, |
| "grad_norm": 0.42742009951783383, |
| "learning_rate": 6.260162601626016e-07, |
| "loss": 0.1585, |
| "mean_token_accuracy": 0.9494217410683632, |
| "num_tokens": 151457560.0, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.6885801423246357, |
| "grad_norm": 0.434739576969294, |
| "learning_rate": 6.246612466124661e-07, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9475802183151245, |
| "num_tokens": 151604728.0, |
| "step": 1016 |
| }, |
| { |
| "epoch": 0.6892578786851915, |
| "grad_norm": 0.46381836616803435, |
| "learning_rate": 6.233062330623306e-07, |
| "loss": 0.173, |
| "mean_token_accuracy": 0.9461763128638268, |
| "num_tokens": 151754272.0, |
| "step": 1017 |
| }, |
| { |
| "epoch": 0.6899356150457472, |
| "grad_norm": 0.4280497647125328, |
| "learning_rate": 6.219512195121951e-07, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.9453966617584229, |
| "num_tokens": 151902351.0, |
| "step": 1018 |
| }, |
| { |
| "epoch": 0.690613351406303, |
| "grad_norm": 0.6707571359671862, |
| "learning_rate": 6.205962059620596e-07, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9466883912682533, |
| "num_tokens": 152052072.0, |
| "step": 1019 |
| }, |
| { |
| "epoch": 0.6912910877668587, |
| "grad_norm": 0.4956643167635072, |
| "learning_rate": 6.19241192411924e-07, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9491880983114243, |
| "num_tokens": 152204352.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.6919688241274145, |
| "grad_norm": 0.3824262140069849, |
| "learning_rate": 6.178861788617887e-07, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9476993232965469, |
| "num_tokens": 152354692.0, |
| "step": 1021 |
| }, |
| { |
| "epoch": 0.6926465604879701, |
| "grad_norm": 0.47128833676415405, |
| "learning_rate": 6.165311653116531e-07, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.9471416920423508, |
| "num_tokens": 152505270.0, |
| "step": 1022 |
| }, |
| { |
| "epoch": 0.6933242968485259, |
| "grad_norm": 0.435155481364805, |
| "learning_rate": 6.151761517615176e-07, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.948992021381855, |
| "num_tokens": 152656768.0, |
| "step": 1023 |
| }, |
| { |
| "epoch": 0.6940020332090817, |
| "grad_norm": 0.48425829900282286, |
| "learning_rate": 6.13821138211382e-07, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.9452779963612556, |
| "num_tokens": 152798483.0, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.6946797695696374, |
| "grad_norm": 0.4958286938733244, |
| "learning_rate": 6.124661246612465e-07, |
| "loss": 0.1775, |
| "mean_token_accuracy": 0.9441835582256317, |
| "num_tokens": 152945144.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.6953575059301932, |
| "grad_norm": 0.46234975698526665, |
| "learning_rate": 6.111111111111112e-07, |
| "loss": 0.1738, |
| "mean_token_accuracy": 0.9445149600505829, |
| "num_tokens": 153097031.0, |
| "step": 1026 |
| }, |
| { |
| "epoch": 0.6960352422907489, |
| "grad_norm": 1.057014143853164, |
| "learning_rate": 6.097560975609756e-07, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9487510249018669, |
| "num_tokens": 153245418.0, |
| "step": 1027 |
| }, |
| { |
| "epoch": 0.6967129786513047, |
| "grad_norm": 0.4154254841226032, |
| "learning_rate": 6.084010840108401e-07, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9500058144330978, |
| "num_tokens": 153393588.0, |
| "step": 1028 |
| }, |
| { |
| "epoch": 0.6973907150118603, |
| "grad_norm": 0.494308254553982, |
| "learning_rate": 6.070460704607045e-07, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9491009786725044, |
| "num_tokens": 153544720.0, |
| "step": 1029 |
| }, |
| { |
| "epoch": 0.6980684513724161, |
| "grad_norm": 0.6031422994604925, |
| "learning_rate": 6.056910569105691e-07, |
| "loss": 0.1771, |
| "mean_token_accuracy": 0.9448559284210205, |
| "num_tokens": 153693950.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.6987461877329719, |
| "grad_norm": 0.41629799805370477, |
| "learning_rate": 6.043360433604336e-07, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9477561861276627, |
| "num_tokens": 153840032.0, |
| "step": 1031 |
| }, |
| { |
| "epoch": 0.6994239240935276, |
| "grad_norm": 0.6530039841126879, |
| "learning_rate": 6.029810298102981e-07, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9468976333737373, |
| "num_tokens": 153987990.0, |
| "step": 1032 |
| }, |
| { |
| "epoch": 0.7001016604540834, |
| "grad_norm": 0.4861004204726015, |
| "learning_rate": 6.016260162601626e-07, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9479655027389526, |
| "num_tokens": 154141733.0, |
| "step": 1033 |
| }, |
| { |
| "epoch": 0.7007793968146391, |
| "grad_norm": 0.414327671266201, |
| "learning_rate": 6.002710027100271e-07, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9477419853210449, |
| "num_tokens": 154294962.0, |
| "step": 1034 |
| }, |
| { |
| "epoch": 0.7014571331751949, |
| "grad_norm": 0.583808333818519, |
| "learning_rate": 5.989159891598916e-07, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9477186799049377, |
| "num_tokens": 154444151.0, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.7021348695357505, |
| "grad_norm": 0.4599533579612761, |
| "learning_rate": 5.97560975609756e-07, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9488044008612633, |
| "num_tokens": 154596863.0, |
| "step": 1036 |
| }, |
| { |
| "epoch": 0.7028126058963063, |
| "grad_norm": 0.7288151095568319, |
| "learning_rate": 5.962059620596206e-07, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9469533413648605, |
| "num_tokens": 154743904.0, |
| "step": 1037 |
| }, |
| { |
| "epoch": 0.7034903422568621, |
| "grad_norm": 0.40988753181716203, |
| "learning_rate": 5.94850948509485e-07, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9466472268104553, |
| "num_tokens": 154891324.0, |
| "step": 1038 |
| }, |
| { |
| "epoch": 0.7041680786174178, |
| "grad_norm": 0.4404920504893847, |
| "learning_rate": 5.934959349593496e-07, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9487411081790924, |
| "num_tokens": 155042210.0, |
| "step": 1039 |
| }, |
| { |
| "epoch": 0.7048458149779736, |
| "grad_norm": 0.5766288185187523, |
| "learning_rate": 5.92140921409214e-07, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9482933580875397, |
| "num_tokens": 155189432.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.7055235513385293, |
| "grad_norm": 0.412030134764044, |
| "learning_rate": 5.907859078590785e-07, |
| "loss": 0.1702, |
| "mean_token_accuracy": 0.9456660151481628, |
| "num_tokens": 155341723.0, |
| "step": 1041 |
| }, |
| { |
| "epoch": 0.7062012876990851, |
| "grad_norm": 0.44728187011697396, |
| "learning_rate": 5.894308943089431e-07, |
| "loss": 0.1711, |
| "mean_token_accuracy": 0.9455131962895393, |
| "num_tokens": 155489554.0, |
| "step": 1042 |
| }, |
| { |
| "epoch": 0.7068790240596408, |
| "grad_norm": 0.5074896720012357, |
| "learning_rate": 5.880758807588076e-07, |
| "loss": 0.1558, |
| "mean_token_accuracy": 0.9492698237299919, |
| "num_tokens": 155635822.0, |
| "step": 1043 |
| }, |
| { |
| "epoch": 0.7075567604201966, |
| "grad_norm": 0.4326441243027523, |
| "learning_rate": 5.867208672086721e-07, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.9493276998400688, |
| "num_tokens": 155780829.0, |
| "step": 1044 |
| }, |
| { |
| "epoch": 0.7082344967807523, |
| "grad_norm": 0.40335399154054274, |
| "learning_rate": 5.853658536585365e-07, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.9495497569441795, |
| "num_tokens": 155930299.0, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.708912233141308, |
| "grad_norm": 0.446254509446425, |
| "learning_rate": 5.84010840108401e-07, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9480220675468445, |
| "num_tokens": 156079491.0, |
| "step": 1046 |
| }, |
| { |
| "epoch": 0.7095899695018638, |
| "grad_norm": 0.4839320678771503, |
| "learning_rate": 5.826558265582655e-07, |
| "loss": 0.1577, |
| "mean_token_accuracy": 0.9495791122317314, |
| "num_tokens": 156231042.0, |
| "step": 1047 |
| }, |
| { |
| "epoch": 0.7102677058624195, |
| "grad_norm": 0.39007295050050605, |
| "learning_rate": 5.813008130081301e-07, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9492901787161827, |
| "num_tokens": 156381147.0, |
| "step": 1048 |
| }, |
| { |
| "epoch": 0.7109454422229753, |
| "grad_norm": 0.4476484023763023, |
| "learning_rate": 5.799457994579946e-07, |
| "loss": 0.1677, |
| "mean_token_accuracy": 0.9462874010205269, |
| "num_tokens": 156527359.0, |
| "step": 1049 |
| }, |
| { |
| "epoch": 0.711623178583531, |
| "grad_norm": 0.36657558143799696, |
| "learning_rate": 5.78590785907859e-07, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9479619190096855, |
| "num_tokens": 156675494.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.7123009149440868, |
| "grad_norm": 0.3907366683635014, |
| "learning_rate": 5.772357723577236e-07, |
| "loss": 0.168, |
| "mean_token_accuracy": 0.9459411054849625, |
| "num_tokens": 156824525.0, |
| "step": 1051 |
| }, |
| { |
| "epoch": 0.7129786513046424, |
| "grad_norm": 0.5010217700696592, |
| "learning_rate": 5.75880758807588e-07, |
| "loss": 0.168, |
| "mean_token_accuracy": 0.9472585767507553, |
| "num_tokens": 156972539.0, |
| "step": 1052 |
| }, |
| { |
| "epoch": 0.7136563876651982, |
| "grad_norm": 0.4552239071847725, |
| "learning_rate": 5.745257452574526e-07, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9496041089296341, |
| "num_tokens": 157126178.0, |
| "step": 1053 |
| }, |
| { |
| "epoch": 0.714334124025754, |
| "grad_norm": 0.5803431503580072, |
| "learning_rate": 5.73170731707317e-07, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9461987987160683, |
| "num_tokens": 157271554.0, |
| "step": 1054 |
| }, |
| { |
| "epoch": 0.7150118603863097, |
| "grad_norm": 0.41125032613778734, |
| "learning_rate": 5.718157181571816e-07, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.948636420071125, |
| "num_tokens": 157420011.0, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.7156895967468655, |
| "grad_norm": 0.49381781540478925, |
| "learning_rate": 5.704607046070461e-07, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9482694193720818, |
| "num_tokens": 157572976.0, |
| "step": 1056 |
| }, |
| { |
| "epoch": 0.7163673331074212, |
| "grad_norm": 0.4063972540938144, |
| "learning_rate": 5.691056910569105e-07, |
| "loss": 0.1565, |
| "mean_token_accuracy": 0.9495553001761436, |
| "num_tokens": 157726207.0, |
| "step": 1057 |
| }, |
| { |
| "epoch": 0.717045069467977, |
| "grad_norm": 0.47874259116197987, |
| "learning_rate": 5.677506775067751e-07, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9484956189990044, |
| "num_tokens": 157876946.0, |
| "step": 1058 |
| }, |
| { |
| "epoch": 0.7177228058285327, |
| "grad_norm": 0.48396877389491794, |
| "learning_rate": 5.663956639566395e-07, |
| "loss": 0.1574, |
| "mean_token_accuracy": 0.9495202451944351, |
| "num_tokens": 158026932.0, |
| "step": 1059 |
| }, |
| { |
| "epoch": 0.7184005421890884, |
| "grad_norm": 0.391289028399714, |
| "learning_rate": 5.650406504065041e-07, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9484245777130127, |
| "num_tokens": 158174397.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.7190782785496442, |
| "grad_norm": 0.4424697534472181, |
| "learning_rate": 5.636856368563685e-07, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.9496257454156876, |
| "num_tokens": 158326039.0, |
| "step": 1061 |
| }, |
| { |
| "epoch": 0.7197560149101999, |
| "grad_norm": 0.5089271154160393, |
| "learning_rate": 5.62330623306233e-07, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9480353966355324, |
| "num_tokens": 158469566.0, |
| "step": 1062 |
| }, |
| { |
| "epoch": 0.7204337512707557, |
| "grad_norm": 0.4985784920100805, |
| "learning_rate": 5.609756097560975e-07, |
| "loss": 0.1613, |
| "mean_token_accuracy": 0.9490131065249443, |
| "num_tokens": 158619431.0, |
| "step": 1063 |
| }, |
| { |
| "epoch": 0.7211114876313114, |
| "grad_norm": 0.44218019553298804, |
| "learning_rate": 5.596205962059621e-07, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9480287060141563, |
| "num_tokens": 158771302.0, |
| "step": 1064 |
| }, |
| { |
| "epoch": 0.7217892239918672, |
| "grad_norm": 0.6204884275566048, |
| "learning_rate": 5.582655826558266e-07, |
| "loss": 0.1507, |
| "mean_token_accuracy": 0.9509203135967255, |
| "num_tokens": 158916649.0, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.7224669603524229, |
| "grad_norm": 0.609368393923901, |
| "learning_rate": 5.56910569105691e-07, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9473355263471603, |
| "num_tokens": 159066742.0, |
| "step": 1066 |
| }, |
| { |
| "epoch": 0.7231446967129787, |
| "grad_norm": 0.49336244834060305, |
| "learning_rate": 5.555555555555555e-07, |
| "loss": 0.1519, |
| "mean_token_accuracy": 0.950840562582016, |
| "num_tokens": 159216948.0, |
| "step": 1067 |
| }, |
| { |
| "epoch": 0.7238224330735344, |
| "grad_norm": 0.4084779841187189, |
| "learning_rate": 5.5420054200542e-07, |
| "loss": 0.1574, |
| "mean_token_accuracy": 0.949260413646698, |
| "num_tokens": 159367626.0, |
| "step": 1068 |
| }, |
| { |
| "epoch": 0.7245001694340901, |
| "grad_norm": 0.4875070606551302, |
| "learning_rate": 5.528455284552846e-07, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.9500613510608673, |
| "num_tokens": 159518997.0, |
| "step": 1069 |
| }, |
| { |
| "epoch": 0.7251779057946459, |
| "grad_norm": 0.621033279364186, |
| "learning_rate": 5.51490514905149e-07, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.947950966656208, |
| "num_tokens": 159667046.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.7258556421552016, |
| "grad_norm": 0.5091231347925989, |
| "learning_rate": 5.501355013550135e-07, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9486712962388992, |
| "num_tokens": 159818866.0, |
| "step": 1071 |
| }, |
| { |
| "epoch": 0.7265333785157574, |
| "grad_norm": 0.42381328311317457, |
| "learning_rate": 5.487804878048781e-07, |
| "loss": 0.1577, |
| "mean_token_accuracy": 0.9498942792415619, |
| "num_tokens": 159969008.0, |
| "step": 1072 |
| }, |
| { |
| "epoch": 0.7272111148763131, |
| "grad_norm": 0.4825256924296279, |
| "learning_rate": 5.474254742547425e-07, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9494803622364998, |
| "num_tokens": 160110210.0, |
| "step": 1073 |
| }, |
| { |
| "epoch": 0.7278888512368689, |
| "grad_norm": 0.4265890277238117, |
| "learning_rate": 5.46070460704607e-07, |
| "loss": 0.1579, |
| "mean_token_accuracy": 0.9486462771892548, |
| "num_tokens": 160257767.0, |
| "step": 1074 |
| }, |
| { |
| "epoch": 0.7285665875974247, |
| "grad_norm": 0.47929271348046776, |
| "learning_rate": 5.447154471544715e-07, |
| "loss": 0.1517, |
| "mean_token_accuracy": 0.9511135593056679, |
| "num_tokens": 160403786.0, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.7292443239579803, |
| "grad_norm": 0.45652629692790414, |
| "learning_rate": 5.43360433604336e-07, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.9493985250592232, |
| "num_tokens": 160555117.0, |
| "step": 1076 |
| }, |
| { |
| "epoch": 0.7299220603185361, |
| "grad_norm": 1.9342607817523356, |
| "learning_rate": 5.420054200542005e-07, |
| "loss": 0.1549, |
| "mean_token_accuracy": 0.9498350769281387, |
| "num_tokens": 160702467.0, |
| "step": 1077 |
| }, |
| { |
| "epoch": 0.7305997966790918, |
| "grad_norm": 0.4618242595459399, |
| "learning_rate": 5.40650406504065e-07, |
| "loss": 0.1714, |
| "mean_token_accuracy": 0.9453785419464111, |
| "num_tokens": 160852318.0, |
| "step": 1078 |
| }, |
| { |
| "epoch": 0.7312775330396476, |
| "grad_norm": 0.9002974906877117, |
| "learning_rate": 5.392953929539295e-07, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.9489387944340706, |
| "num_tokens": 161005141.0, |
| "step": 1079 |
| }, |
| { |
| "epoch": 0.7319552694002033, |
| "grad_norm": 0.4539144950148489, |
| "learning_rate": 5.37940379403794e-07, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9478288516402245, |
| "num_tokens": 161155555.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.7326330057607591, |
| "grad_norm": 0.717476527687285, |
| "learning_rate": 5.365853658536586e-07, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9471670761704445, |
| "num_tokens": 161307804.0, |
| "step": 1081 |
| }, |
| { |
| "epoch": 0.7333107421213149, |
| "grad_norm": 0.5286492849743774, |
| "learning_rate": 5.35230352303523e-07, |
| "loss": 0.16, |
| "mean_token_accuracy": 0.9486302956938744, |
| "num_tokens": 161453779.0, |
| "step": 1082 |
| }, |
| { |
| "epoch": 0.7339884784818705, |
| "grad_norm": 0.44520556756003754, |
| "learning_rate": 5.338753387533875e-07, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.947560265660286, |
| "num_tokens": 161602603.0, |
| "step": 1083 |
| }, |
| { |
| "epoch": 0.7346662148424263, |
| "grad_norm": 0.44339790141119606, |
| "learning_rate": 5.325203252032519e-07, |
| "loss": 0.1712, |
| "mean_token_accuracy": 0.9448318034410477, |
| "num_tokens": 161752988.0, |
| "step": 1084 |
| }, |
| { |
| "epoch": 0.735343951202982, |
| "grad_norm": 0.6641488603681329, |
| "learning_rate": 5.311653116531166e-07, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9483387470245361, |
| "num_tokens": 161902180.0, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.7360216875635378, |
| "grad_norm": 0.4147540319296875, |
| "learning_rate": 5.298102981029811e-07, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.9463905096054077, |
| "num_tokens": 162052450.0, |
| "step": 1086 |
| }, |
| { |
| "epoch": 0.7366994239240935, |
| "grad_norm": 0.4571766444054861, |
| "learning_rate": 5.284552845528455e-07, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.9472680240869522, |
| "num_tokens": 162204506.0, |
| "step": 1087 |
| }, |
| { |
| "epoch": 0.7373771602846493, |
| "grad_norm": 0.5567352989309443, |
| "learning_rate": 5.2710027100271e-07, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9466003924608231, |
| "num_tokens": 162356075.0, |
| "step": 1088 |
| }, |
| { |
| "epoch": 0.738054896645205, |
| "grad_norm": 0.4095996116976388, |
| "learning_rate": 5.257452574525745e-07, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9489855617284775, |
| "num_tokens": 162505952.0, |
| "step": 1089 |
| }, |
| { |
| "epoch": 0.7387326330057608, |
| "grad_norm": 0.42251631468174644, |
| "learning_rate": 5.24390243902439e-07, |
| "loss": 0.172, |
| "mean_token_accuracy": 0.9460385143756866, |
| "num_tokens": 162655204.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.7394103693663165, |
| "grad_norm": 0.5142282362855055, |
| "learning_rate": 5.230352303523035e-07, |
| "loss": 0.1471, |
| "mean_token_accuracy": 0.9530572295188904, |
| "num_tokens": 162806577.0, |
| "step": 1091 |
| }, |
| { |
| "epoch": 0.7400881057268722, |
| "grad_norm": 0.47517812677105936, |
| "learning_rate": 5.21680216802168e-07, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9492161795496941, |
| "num_tokens": 162953245.0, |
| "step": 1092 |
| }, |
| { |
| "epoch": 0.740765842087428, |
| "grad_norm": 0.47577509346551555, |
| "learning_rate": 5.203252032520325e-07, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.947672463953495, |
| "num_tokens": 163103640.0, |
| "step": 1093 |
| }, |
| { |
| "epoch": 0.7414435784479837, |
| "grad_norm": 0.4931157254060953, |
| "learning_rate": 5.18970189701897e-07, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9485186189413071, |
| "num_tokens": 163251527.0, |
| "step": 1094 |
| }, |
| { |
| "epoch": 0.7421213148085395, |
| "grad_norm": 0.46813396306553173, |
| "learning_rate": 5.176151761517615e-07, |
| "loss": 0.1525, |
| "mean_token_accuracy": 0.9509926736354828, |
| "num_tokens": 163401281.0, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.7427990511690952, |
| "grad_norm": 0.44488095188988325, |
| "learning_rate": 5.16260162601626e-07, |
| "loss": 0.1684, |
| "mean_token_accuracy": 0.9467815607786179, |
| "num_tokens": 163548949.0, |
| "step": 1096 |
| }, |
| { |
| "epoch": 0.743476787529651, |
| "grad_norm": 0.37545263145077906, |
| "learning_rate": 5.149051490514905e-07, |
| "loss": 0.1532, |
| "mean_token_accuracy": 0.9503208845853806, |
| "num_tokens": 163696654.0, |
| "step": 1097 |
| }, |
| { |
| "epoch": 0.7441545238902068, |
| "grad_norm": 0.451614998294825, |
| "learning_rate": 5.13550135501355e-07, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9479142129421234, |
| "num_tokens": 163844051.0, |
| "step": 1098 |
| }, |
| { |
| "epoch": 0.7448322602507624, |
| "grad_norm": 0.3906206502589162, |
| "learning_rate": 5.121951219512195e-07, |
| "loss": 0.1531, |
| "mean_token_accuracy": 0.9506673514842987, |
| "num_tokens": 163997319.0, |
| "step": 1099 |
| }, |
| { |
| "epoch": 0.7455099966113182, |
| "grad_norm": 0.39475390707276214, |
| "learning_rate": 5.108401084010839e-07, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9487632811069489, |
| "num_tokens": 164149806.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.7461877329718739, |
| "grad_norm": 0.45195993303656146, |
| "learning_rate": 5.094850948509484e-07, |
| "loss": 0.1701, |
| "mean_token_accuracy": 0.9452178552746773, |
| "num_tokens": 164300498.0, |
| "step": 1101 |
| }, |
| { |
| "epoch": 0.7468654693324297, |
| "grad_norm": 0.9283345187428643, |
| "learning_rate": 5.081300813008131e-07, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9482429027557373, |
| "num_tokens": 164453163.0, |
| "step": 1102 |
| }, |
| { |
| "epoch": 0.7475432056929854, |
| "grad_norm": 0.4160790574433991, |
| "learning_rate": 5.067750677506775e-07, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9479247257113457, |
| "num_tokens": 164603381.0, |
| "step": 1103 |
| }, |
| { |
| "epoch": 0.7482209420535412, |
| "grad_norm": 0.5972610753011809, |
| "learning_rate": 5.05420054200542e-07, |
| "loss": 0.1595, |
| "mean_token_accuracy": 0.9486266598105431, |
| "num_tokens": 164753482.0, |
| "step": 1104 |
| }, |
| { |
| "epoch": 0.748898678414097, |
| "grad_norm": 0.50765690623194, |
| "learning_rate": 5.040650406504064e-07, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9477174207568169, |
| "num_tokens": 164901803.0, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.7495764147746526, |
| "grad_norm": 0.6244395867041137, |
| "learning_rate": 5.02710027100271e-07, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.945775531232357, |
| "num_tokens": 165051724.0, |
| "step": 1106 |
| }, |
| { |
| "epoch": 0.7502541511352084, |
| "grad_norm": 0.4230661986575455, |
| "learning_rate": 5.013550135501355e-07, |
| "loss": 0.159, |
| "mean_token_accuracy": 0.9486871957778931, |
| "num_tokens": 165201307.0, |
| "step": 1107 |
| }, |
| { |
| "epoch": 0.7509318874957641, |
| "grad_norm": 0.4138709790482835, |
| "learning_rate": 5e-07, |
| "loss": 0.1759, |
| "mean_token_accuracy": 0.944426141679287, |
| "num_tokens": 165348364.0, |
| "step": 1108 |
| }, |
| { |
| "epoch": 0.7516096238563199, |
| "grad_norm": 0.6258168441746877, |
| "learning_rate": 4.986449864498645e-07, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9489279463887215, |
| "num_tokens": 165498683.0, |
| "step": 1109 |
| }, |
| { |
| "epoch": 0.7522873602168756, |
| "grad_norm": 0.44625048459307576, |
| "learning_rate": 4.972899728997289e-07, |
| "loss": 0.163, |
| "mean_token_accuracy": 0.9476623311638832, |
| "num_tokens": 165649766.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.7529650965774314, |
| "grad_norm": 0.4055254667064464, |
| "learning_rate": 4.959349593495934e-07, |
| "loss": 0.1573, |
| "mean_token_accuracy": 0.9490974396467209, |
| "num_tokens": 165799500.0, |
| "step": 1111 |
| }, |
| { |
| "epoch": 0.7536428329379872, |
| "grad_norm": 0.5897618277261697, |
| "learning_rate": 4.94579945799458e-07, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9483003690838814, |
| "num_tokens": 165943803.0, |
| "step": 1112 |
| }, |
| { |
| "epoch": 0.7543205692985429, |
| "grad_norm": 0.4931572511555608, |
| "learning_rate": 4.932249322493225e-07, |
| "loss": 0.1565, |
| "mean_token_accuracy": 0.9494629874825478, |
| "num_tokens": 166093921.0, |
| "step": 1113 |
| }, |
| { |
| "epoch": 0.7549983056590986, |
| "grad_norm": 0.44369073250658586, |
| "learning_rate": 4.91869918699187e-07, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9481725171208382, |
| "num_tokens": 166240636.0, |
| "step": 1114 |
| }, |
| { |
| "epoch": 0.7556760420196543, |
| "grad_norm": 0.3982843037368229, |
| "learning_rate": 4.905149051490515e-07, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9479377120733261, |
| "num_tokens": 166388546.0, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.7563537783802101, |
| "grad_norm": 0.48976846833393406, |
| "learning_rate": 4.891598915989159e-07, |
| "loss": 0.1518, |
| "mean_token_accuracy": 0.9502234533429146, |
| "num_tokens": 166538021.0, |
| "step": 1116 |
| }, |
| { |
| "epoch": 0.7570315147407658, |
| "grad_norm": 0.4933497477703194, |
| "learning_rate": 4.878048780487804e-07, |
| "loss": 0.1777, |
| "mean_token_accuracy": 0.944856159389019, |
| "num_tokens": 166688000.0, |
| "step": 1117 |
| }, |
| { |
| "epoch": 0.7577092511013216, |
| "grad_norm": 0.859665555701841, |
| "learning_rate": 4.86449864498645e-07, |
| "loss": 0.1561, |
| "mean_token_accuracy": 0.949802003800869, |
| "num_tokens": 166839080.0, |
| "step": 1118 |
| }, |
| { |
| "epoch": 0.7583869874618774, |
| "grad_norm": 0.48352933866150305, |
| "learning_rate": 4.850948509485095e-07, |
| "loss": 0.1779, |
| "mean_token_accuracy": 0.9442877918481827, |
| "num_tokens": 166991404.0, |
| "step": 1119 |
| }, |
| { |
| "epoch": 0.7590647238224331, |
| "grad_norm": 0.5251387210642391, |
| "learning_rate": 4.83739837398374e-07, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9475614503026009, |
| "num_tokens": 167136411.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.7597424601829889, |
| "grad_norm": 0.43023629072445974, |
| "learning_rate": 4.823848238482385e-07, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9484768286347389, |
| "num_tokens": 167285781.0, |
| "step": 1121 |
| }, |
| { |
| "epoch": 0.7604201965435445, |
| "grad_norm": 0.3962967805842696, |
| "learning_rate": 4.810298102981029e-07, |
| "loss": 0.1687, |
| "mean_token_accuracy": 0.9462398812174797, |
| "num_tokens": 167435299.0, |
| "step": 1122 |
| }, |
| { |
| "epoch": 0.7610979329041003, |
| "grad_norm": 0.5046873268117023, |
| "learning_rate": 4.796747967479675e-07, |
| "loss": 0.1693, |
| "mean_token_accuracy": 0.9459140375256538, |
| "num_tokens": 167580175.0, |
| "step": 1123 |
| }, |
| { |
| "epoch": 0.761775669264656, |
| "grad_norm": 0.44880163261180067, |
| "learning_rate": 4.78319783197832e-07, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9481295719742775, |
| "num_tokens": 167731384.0, |
| "step": 1124 |
| }, |
| { |
| "epoch": 0.7624534056252118, |
| "grad_norm": 0.476587035527155, |
| "learning_rate": 4.769647696476965e-07, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9479389265179634, |
| "num_tokens": 167878163.0, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.7631311419857676, |
| "grad_norm": 0.4058674637764776, |
| "learning_rate": 4.756097560975609e-07, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9488006085157394, |
| "num_tokens": 168024384.0, |
| "step": 1126 |
| }, |
| { |
| "epoch": 0.7638088783463233, |
| "grad_norm": 0.4528273730595995, |
| "learning_rate": 4.742547425474255e-07, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9465280771255493, |
| "num_tokens": 168174116.0, |
| "step": 1127 |
| }, |
| { |
| "epoch": 0.7644866147068791, |
| "grad_norm": 0.43105944053847295, |
| "learning_rate": 4.7289972899728995e-07, |
| "loss": 0.1631, |
| "mean_token_accuracy": 0.9475222527980804, |
| "num_tokens": 168322628.0, |
| "step": 1128 |
| }, |
| { |
| "epoch": 0.7651643510674347, |
| "grad_norm": 0.6121472847360002, |
| "learning_rate": 4.7154471544715447e-07, |
| "loss": 0.1803, |
| "mean_token_accuracy": 0.9420292302966118, |
| "num_tokens": 168472390.0, |
| "step": 1129 |
| }, |
| { |
| "epoch": 0.7658420874279905, |
| "grad_norm": 0.4119363405573088, |
| "learning_rate": 4.7018970189701893e-07, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9473756477236748, |
| "num_tokens": 168625150.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.7665198237885462, |
| "grad_norm": 0.6418528742178927, |
| "learning_rate": 4.6883468834688345e-07, |
| "loss": 0.1539, |
| "mean_token_accuracy": 0.9500811100006104, |
| "num_tokens": 168772427.0, |
| "step": 1131 |
| }, |
| { |
| "epoch": 0.767197560149102, |
| "grad_norm": 0.6202551640812881, |
| "learning_rate": 4.674796747967479e-07, |
| "loss": 0.1669, |
| "mean_token_accuracy": 0.9462482109665871, |
| "num_tokens": 168917877.0, |
| "step": 1132 |
| }, |
| { |
| "epoch": 0.7678752965096577, |
| "grad_norm": 0.4660477467600856, |
| "learning_rate": 4.6612466124661244e-07, |
| "loss": 0.1721, |
| "mean_token_accuracy": 0.9449072405695915, |
| "num_tokens": 169065824.0, |
| "step": 1133 |
| }, |
| { |
| "epoch": 0.7685530328702135, |
| "grad_norm": 0.4556578992177878, |
| "learning_rate": 4.647696476964769e-07, |
| "loss": 0.163, |
| "mean_token_accuracy": 0.9484978765249252, |
| "num_tokens": 169216911.0, |
| "step": 1134 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.40157746071753253, |
| "learning_rate": 4.634146341463415e-07, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.9495006650686264, |
| "num_tokens": 169364869.0, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.769908505591325, |
| "grad_norm": 0.7454765323943013, |
| "learning_rate": 4.6205962059620595e-07, |
| "loss": 0.1732, |
| "mean_token_accuracy": 0.9446614757180214, |
| "num_tokens": 169513768.0, |
| "step": 1136 |
| }, |
| { |
| "epoch": 0.7705862419518807, |
| "grad_norm": 0.45367573148015927, |
| "learning_rate": 4.6070460704607046e-07, |
| "loss": 0.1472, |
| "mean_token_accuracy": 0.9525748193264008, |
| "num_tokens": 169662682.0, |
| "step": 1137 |
| }, |
| { |
| "epoch": 0.7712639783124364, |
| "grad_norm": 0.46148330013702105, |
| "learning_rate": 4.5934959349593493e-07, |
| "loss": 0.1691, |
| "mean_token_accuracy": 0.9462109357118607, |
| "num_tokens": 169812200.0, |
| "step": 1138 |
| }, |
| { |
| "epoch": 0.7719417146729922, |
| "grad_norm": 0.49333201810275873, |
| "learning_rate": 4.5799457994579945e-07, |
| "loss": 0.1408, |
| "mean_token_accuracy": 0.9537433162331581, |
| "num_tokens": 169963321.0, |
| "step": 1139 |
| }, |
| { |
| "epoch": 0.7726194510335479, |
| "grad_norm": 0.47938082073994764, |
| "learning_rate": 4.566395663956639e-07, |
| "loss": 0.1692, |
| "mean_token_accuracy": 0.9459724575281143, |
| "num_tokens": 170111961.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.7732971873941037, |
| "grad_norm": 0.4136374831731232, |
| "learning_rate": 4.5528455284552844e-07, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9475755989551544, |
| "num_tokens": 170260622.0, |
| "step": 1141 |
| }, |
| { |
| "epoch": 0.7739749237546595, |
| "grad_norm": 0.4048615679339811, |
| "learning_rate": 4.5392953929539296e-07, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.947485126554966, |
| "num_tokens": 170407835.0, |
| "step": 1142 |
| }, |
| { |
| "epoch": 0.7746526601152152, |
| "grad_norm": 1.00471158613149, |
| "learning_rate": 4.525745257452575e-07, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9490302726626396, |
| "num_tokens": 170562121.0, |
| "step": 1143 |
| }, |
| { |
| "epoch": 0.775330396475771, |
| "grad_norm": 0.4249072808237762, |
| "learning_rate": 4.5121951219512194e-07, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9463138654828072, |
| "num_tokens": 170708352.0, |
| "step": 1144 |
| }, |
| { |
| "epoch": 0.7760081328363266, |
| "grad_norm": 0.6615241980486258, |
| "learning_rate": 4.498644986449864e-07, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9470024704933167, |
| "num_tokens": 170859060.0, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.7766858691968824, |
| "grad_norm": 0.4079743575969547, |
| "learning_rate": 4.4850948509485093e-07, |
| "loss": 0.1683, |
| "mean_token_accuracy": 0.9460372775793076, |
| "num_tokens": 171006613.0, |
| "step": 1146 |
| }, |
| { |
| "epoch": 0.7773636055574381, |
| "grad_norm": 0.47673565387001, |
| "learning_rate": 4.471544715447154e-07, |
| "loss": 0.1612, |
| "mean_token_accuracy": 0.9476036727428436, |
| "num_tokens": 171152018.0, |
| "step": 1147 |
| }, |
| { |
| "epoch": 0.7780413419179939, |
| "grad_norm": 0.48209638608165467, |
| "learning_rate": 4.457994579945799e-07, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.9482689946889877, |
| "num_tokens": 171298623.0, |
| "step": 1148 |
| }, |
| { |
| "epoch": 0.7787190782785497, |
| "grad_norm": 0.4632142775384579, |
| "learning_rate": 4.444444444444444e-07, |
| "loss": 0.1692, |
| "mean_token_accuracy": 0.9462221264839172, |
| "num_tokens": 171449245.0, |
| "step": 1149 |
| }, |
| { |
| "epoch": 0.7793968146391054, |
| "grad_norm": 0.6531764939466912, |
| "learning_rate": 4.4308943089430896e-07, |
| "loss": 0.1563, |
| "mean_token_accuracy": 0.9493897035717964, |
| "num_tokens": 171600165.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.7800745509996612, |
| "grad_norm": 0.45547498077652343, |
| "learning_rate": 4.417344173441734e-07, |
| "loss": 0.1575, |
| "mean_token_accuracy": 0.9489210993051529, |
| "num_tokens": 171751080.0, |
| "step": 1151 |
| }, |
| { |
| "epoch": 0.7807522873602168, |
| "grad_norm": 0.484511345927537, |
| "learning_rate": 4.4037940379403794e-07, |
| "loss": 0.1706, |
| "mean_token_accuracy": 0.9453775733709335, |
| "num_tokens": 171901129.0, |
| "step": 1152 |
| }, |
| { |
| "epoch": 0.7814300237207726, |
| "grad_norm": 0.41706546335957123, |
| "learning_rate": 4.390243902439024e-07, |
| "loss": 0.1575, |
| "mean_token_accuracy": 0.9489161521196365, |
| "num_tokens": 172050783.0, |
| "step": 1153 |
| }, |
| { |
| "epoch": 0.7821077600813283, |
| "grad_norm": 0.4308068569401677, |
| "learning_rate": 4.3766937669376693e-07, |
| "loss": 0.1712, |
| "mean_token_accuracy": 0.9455830752849579, |
| "num_tokens": 172198177.0, |
| "step": 1154 |
| }, |
| { |
| "epoch": 0.7827854964418841, |
| "grad_norm": 0.604842624030898, |
| "learning_rate": 4.363143631436314e-07, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9486095905303955, |
| "num_tokens": 172350203.0, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.7834632328024399, |
| "grad_norm": 0.3932324798137123, |
| "learning_rate": 4.349593495934959e-07, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9465008825063705, |
| "num_tokens": 172500435.0, |
| "step": 1156 |
| }, |
| { |
| "epoch": 0.7841409691629956, |
| "grad_norm": 0.4495398266781922, |
| "learning_rate": 4.3360433604336043e-07, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.9467159286141396, |
| "num_tokens": 172650483.0, |
| "step": 1157 |
| }, |
| { |
| "epoch": 0.7848187055235514, |
| "grad_norm": 0.6357691155168428, |
| "learning_rate": 4.3224932249322495e-07, |
| "loss": 0.1612, |
| "mean_token_accuracy": 0.9480964988470078, |
| "num_tokens": 172795246.0, |
| "step": 1158 |
| }, |
| { |
| "epoch": 0.785496441884107, |
| "grad_norm": 0.4374942015220935, |
| "learning_rate": 4.308943089430894e-07, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9480918347835541, |
| "num_tokens": 172942985.0, |
| "step": 1159 |
| }, |
| { |
| "epoch": 0.7861741782446628, |
| "grad_norm": 0.47143774102574604, |
| "learning_rate": 4.2953929539295394e-07, |
| "loss": 0.1546, |
| "mean_token_accuracy": 0.9506010115146637, |
| "num_tokens": 173092290.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.7868519146052185, |
| "grad_norm": 0.6559537232580721, |
| "learning_rate": 4.281842818428184e-07, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9472885355353355, |
| "num_tokens": 173242462.0, |
| "step": 1161 |
| }, |
| { |
| "epoch": 0.7875296509657743, |
| "grad_norm": 0.42550464871987237, |
| "learning_rate": 4.268292682926829e-07, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9479214176535606, |
| "num_tokens": 173391533.0, |
| "step": 1162 |
| }, |
| { |
| "epoch": 0.7882073873263301, |
| "grad_norm": 0.4178771851406128, |
| "learning_rate": 4.254742547425474e-07, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9468031749129295, |
| "num_tokens": 173541939.0, |
| "step": 1163 |
| }, |
| { |
| "epoch": 0.7888851236868858, |
| "grad_norm": 3.0851874844884506, |
| "learning_rate": 4.2411924119241186e-07, |
| "loss": 0.1735, |
| "mean_token_accuracy": 0.9444401487708092, |
| "num_tokens": 173691382.0, |
| "step": 1164 |
| }, |
| { |
| "epoch": 0.7895628600474416, |
| "grad_norm": 0.541872071246644, |
| "learning_rate": 4.2276422764227643e-07, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9471112638711929, |
| "num_tokens": 173839231.0, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.7902405964079973, |
| "grad_norm": 0.4587060057310284, |
| "learning_rate": 4.214092140921409e-07, |
| "loss": 0.1792, |
| "mean_token_accuracy": 0.9437511041760445, |
| "num_tokens": 173989261.0, |
| "step": 1166 |
| }, |
| { |
| "epoch": 0.7909183327685531, |
| "grad_norm": 0.4225958028880353, |
| "learning_rate": 4.200542005420054e-07, |
| "loss": 0.1727, |
| "mean_token_accuracy": 0.9445930123329163, |
| "num_tokens": 174138486.0, |
| "step": 1167 |
| }, |
| { |
| "epoch": 0.7915960691291087, |
| "grad_norm": 0.4800932551942435, |
| "learning_rate": 4.186991869918699e-07, |
| "loss": 0.1591, |
| "mean_token_accuracy": 0.9493310451507568, |
| "num_tokens": 174291947.0, |
| "step": 1168 |
| }, |
| { |
| "epoch": 0.7922738054896645, |
| "grad_norm": 0.5128346029772238, |
| "learning_rate": 4.173441734417344e-07, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9485156983137131, |
| "num_tokens": 174440595.0, |
| "step": 1169 |
| }, |
| { |
| "epoch": 0.7929515418502202, |
| "grad_norm": 0.40254641736714897, |
| "learning_rate": 4.1598915989159887e-07, |
| "loss": 0.1544, |
| "mean_token_accuracy": 0.9498544856905937, |
| "num_tokens": 174590649.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.793629278210776, |
| "grad_norm": 0.4033616840760745, |
| "learning_rate": 4.146341463414634e-07, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.948909617960453, |
| "num_tokens": 174742023.0, |
| "step": 1171 |
| }, |
| { |
| "epoch": 0.7943070145713318, |
| "grad_norm": 0.845072571792667, |
| "learning_rate": 4.1327913279132786e-07, |
| "loss": 0.1494, |
| "mean_token_accuracy": 0.9514944478869438, |
| "num_tokens": 174890895.0, |
| "step": 1172 |
| }, |
| { |
| "epoch": 0.7949847509318875, |
| "grad_norm": 0.40710652669557085, |
| "learning_rate": 4.1192411924119243e-07, |
| "loss": 0.1658, |
| "mean_token_accuracy": 0.9479440152645111, |
| "num_tokens": 175039721.0, |
| "step": 1173 |
| }, |
| { |
| "epoch": 0.7956624872924433, |
| "grad_norm": 0.5258012808838943, |
| "learning_rate": 4.105691056910569e-07, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9484201744198799, |
| "num_tokens": 175192953.0, |
| "step": 1174 |
| }, |
| { |
| "epoch": 0.796340223652999, |
| "grad_norm": 0.44523121927967174, |
| "learning_rate": 4.092140921409214e-07, |
| "loss": 0.1562, |
| "mean_token_accuracy": 0.9496227726340294, |
| "num_tokens": 175345855.0, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.7970179600135547, |
| "grad_norm": 0.5030347427449846, |
| "learning_rate": 4.078590785907859e-07, |
| "loss": 0.1491, |
| "mean_token_accuracy": 0.9523748084902763, |
| "num_tokens": 175494658.0, |
| "step": 1176 |
| }, |
| { |
| "epoch": 0.7976956963741104, |
| "grad_norm": 0.3941068989311148, |
| "learning_rate": 4.065040650406504e-07, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9482178464531898, |
| "num_tokens": 175644012.0, |
| "step": 1177 |
| }, |
| { |
| "epoch": 0.7983734327346662, |
| "grad_norm": 0.46684232352389565, |
| "learning_rate": 4.0514905149051487e-07, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9468219578266144, |
| "num_tokens": 175792869.0, |
| "step": 1178 |
| }, |
| { |
| "epoch": 0.799051169095222, |
| "grad_norm": 1.1655751155677825, |
| "learning_rate": 4.0379403794037934e-07, |
| "loss": 0.1669, |
| "mean_token_accuracy": 0.9474190697073936, |
| "num_tokens": 175942224.0, |
| "step": 1179 |
| }, |
| { |
| "epoch": 0.7997289054557777, |
| "grad_norm": 0.5365055431684573, |
| "learning_rate": 4.024390243902439e-07, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.94867292791605, |
| "num_tokens": 176092741.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.8004066418163335, |
| "grad_norm": 0.3826394929998182, |
| "learning_rate": 4.010840108401084e-07, |
| "loss": 0.1575, |
| "mean_token_accuracy": 0.9491298869252205, |
| "num_tokens": 176246685.0, |
| "step": 1181 |
| }, |
| { |
| "epoch": 0.8010843781768892, |
| "grad_norm": 1.0506281171730445, |
| "learning_rate": 3.997289972899729e-07, |
| "loss": 0.173, |
| "mean_token_accuracy": 0.9438380673527718, |
| "num_tokens": 176394970.0, |
| "step": 1182 |
| }, |
| { |
| "epoch": 0.801762114537445, |
| "grad_norm": 0.7703779432047844, |
| "learning_rate": 3.9837398373983736e-07, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.9469498619437218, |
| "num_tokens": 176544055.0, |
| "step": 1183 |
| }, |
| { |
| "epoch": 0.8024398508980006, |
| "grad_norm": 0.5262594689854897, |
| "learning_rate": 3.970189701897019e-07, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.948112279176712, |
| "num_tokens": 176688198.0, |
| "step": 1184 |
| }, |
| { |
| "epoch": 0.8031175872585564, |
| "grad_norm": 0.3950576329095008, |
| "learning_rate": 3.9566395663956635e-07, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9483528733253479, |
| "num_tokens": 176837103.0, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.8037953236191122, |
| "grad_norm": 0.605347918350154, |
| "learning_rate": 3.9430894308943087e-07, |
| "loss": 0.1697, |
| "mean_token_accuracy": 0.9456880316138268, |
| "num_tokens": 176981904.0, |
| "step": 1186 |
| }, |
| { |
| "epoch": 0.8044730599796679, |
| "grad_norm": 0.41717484581072717, |
| "learning_rate": 3.9295392953929534e-07, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9493725001811981, |
| "num_tokens": 177129587.0, |
| "step": 1187 |
| }, |
| { |
| "epoch": 0.8051507963402237, |
| "grad_norm": 0.46778880805281214, |
| "learning_rate": 3.915989159891599e-07, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9488052427768707, |
| "num_tokens": 177282050.0, |
| "step": 1188 |
| }, |
| { |
| "epoch": 0.8058285327007794, |
| "grad_norm": 0.7785220244255328, |
| "learning_rate": 3.902439024390244e-07, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9476650431752205, |
| "num_tokens": 177429549.0, |
| "step": 1189 |
| }, |
| { |
| "epoch": 0.8065062690613352, |
| "grad_norm": 0.42360438924275595, |
| "learning_rate": 3.888888888888889e-07, |
| "loss": 0.1532, |
| "mean_token_accuracy": 0.9501416981220245, |
| "num_tokens": 177580668.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.8071840054218908, |
| "grad_norm": 0.6166756515285686, |
| "learning_rate": 3.8753387533875336e-07, |
| "loss": 0.1564, |
| "mean_token_accuracy": 0.9494942203164101, |
| "num_tokens": 177733477.0, |
| "step": 1191 |
| }, |
| { |
| "epoch": 0.8078617417824466, |
| "grad_norm": 0.40998304298923804, |
| "learning_rate": 3.861788617886179e-07, |
| "loss": 0.1691, |
| "mean_token_accuracy": 0.9466608390212059, |
| "num_tokens": 177880290.0, |
| "step": 1192 |
| }, |
| { |
| "epoch": 0.8085394781430024, |
| "grad_norm": 0.42628251502407266, |
| "learning_rate": 3.8482384823848235e-07, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9487505033612251, |
| "num_tokens": 178034375.0, |
| "step": 1193 |
| }, |
| { |
| "epoch": 0.8092172145035581, |
| "grad_norm": 0.7174704285276794, |
| "learning_rate": 3.8346883468834687e-07, |
| "loss": 0.1732, |
| "mean_token_accuracy": 0.9452232122421265, |
| "num_tokens": 178186543.0, |
| "step": 1194 |
| }, |
| { |
| "epoch": 0.8098949508641139, |
| "grad_norm": 0.6932593178487101, |
| "learning_rate": 3.821138211382114e-07, |
| "loss": 0.1655, |
| "mean_token_accuracy": 0.9474772363901138, |
| "num_tokens": 178337693.0, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.8105726872246696, |
| "grad_norm": 0.45951252039517, |
| "learning_rate": 3.807588075880759e-07, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9495590180158615, |
| "num_tokens": 178489898.0, |
| "step": 1196 |
| }, |
| { |
| "epoch": 0.8112504235852254, |
| "grad_norm": 0.4309023019212715, |
| "learning_rate": 3.794037940379404e-07, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.9492277428507805, |
| "num_tokens": 178636952.0, |
| "step": 1197 |
| }, |
| { |
| "epoch": 0.811928159945781, |
| "grad_norm": 0.4792060714652919, |
| "learning_rate": 3.7804878048780484e-07, |
| "loss": 0.1735, |
| "mean_token_accuracy": 0.945383831858635, |
| "num_tokens": 178788213.0, |
| "step": 1198 |
| }, |
| { |
| "epoch": 0.8126058963063368, |
| "grad_norm": 0.5703820152757423, |
| "learning_rate": 3.7669376693766936e-07, |
| "loss": 0.1669, |
| "mean_token_accuracy": 0.9468568116426468, |
| "num_tokens": 178938091.0, |
| "step": 1199 |
| }, |
| { |
| "epoch": 0.8132836326668926, |
| "grad_norm": 1.0307293977795946, |
| "learning_rate": 3.7533875338753383e-07, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9486866071820259, |
| "num_tokens": 179083665.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8139613690274483, |
| "grad_norm": 0.5343963015335058, |
| "learning_rate": 3.7398373983739835e-07, |
| "loss": 0.1501, |
| "mean_token_accuracy": 0.9515577852725983, |
| "num_tokens": 179234580.0, |
| "step": 1201 |
| }, |
| { |
| "epoch": 0.8146391053880041, |
| "grad_norm": 0.47269386542652564, |
| "learning_rate": 3.726287262872628e-07, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.9467170462012291, |
| "num_tokens": 179384638.0, |
| "step": 1202 |
| }, |
| { |
| "epoch": 0.8153168417485598, |
| "grad_norm": 0.622932001873934, |
| "learning_rate": 3.712737127371274e-07, |
| "loss": 0.1541, |
| "mean_token_accuracy": 0.94975396245718, |
| "num_tokens": 179532313.0, |
| "step": 1203 |
| }, |
| { |
| "epoch": 0.8159945781091156, |
| "grad_norm": 1.30977819206103, |
| "learning_rate": 3.6991869918699185e-07, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9485208690166473, |
| "num_tokens": 179678201.0, |
| "step": 1204 |
| }, |
| { |
| "epoch": 0.8166723144696713, |
| "grad_norm": 0.49551797663980024, |
| "learning_rate": 3.6856368563685637e-07, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9494869783520699, |
| "num_tokens": 179825896.0, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.817350050830227, |
| "grad_norm": 0.39597804024706906, |
| "learning_rate": 3.6720867208672084e-07, |
| "loss": 0.1529, |
| "mean_token_accuracy": 0.9507526159286499, |
| "num_tokens": 179975719.0, |
| "step": 1206 |
| }, |
| { |
| "epoch": 0.8180277871907828, |
| "grad_norm": 0.8735766662793337, |
| "learning_rate": 3.6585365853658536e-07, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.9466816782951355, |
| "num_tokens": 180123444.0, |
| "step": 1207 |
| }, |
| { |
| "epoch": 0.8187055235513385, |
| "grad_norm": 0.7259156579762289, |
| "learning_rate": 3.644986449864498e-07, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9495271146297455, |
| "num_tokens": 180273836.0, |
| "step": 1208 |
| }, |
| { |
| "epoch": 0.8193832599118943, |
| "grad_norm": 0.42037044850466443, |
| "learning_rate": 3.6314363143631434e-07, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9484260380268097, |
| "num_tokens": 180419061.0, |
| "step": 1209 |
| }, |
| { |
| "epoch": 0.82006099627245, |
| "grad_norm": 1.2533075540817267, |
| "learning_rate": 3.6178861788617886e-07, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.9464852139353752, |
| "num_tokens": 180570458.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.8207387326330058, |
| "grad_norm": 0.5627775260496694, |
| "learning_rate": 3.604336043360434e-07, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9483892321586609, |
| "num_tokens": 180719063.0, |
| "step": 1211 |
| }, |
| { |
| "epoch": 0.8214164689935615, |
| "grad_norm": 0.5266555398373794, |
| "learning_rate": 3.5907859078590785e-07, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9465538933873177, |
| "num_tokens": 180869555.0, |
| "step": 1212 |
| }, |
| { |
| "epoch": 0.8220942053541173, |
| "grad_norm": 0.6971738026269859, |
| "learning_rate": 3.5772357723577237e-07, |
| "loss": 0.1758, |
| "mean_token_accuracy": 0.945346049964428, |
| "num_tokens": 181021477.0, |
| "step": 1213 |
| }, |
| { |
| "epoch": 0.8227719417146729, |
| "grad_norm": 0.5589925515652632, |
| "learning_rate": 3.5636856368563684e-07, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9486629068851471, |
| "num_tokens": 181171918.0, |
| "step": 1214 |
| }, |
| { |
| "epoch": 0.8234496780752287, |
| "grad_norm": 0.9941559869041463, |
| "learning_rate": 3.550135501355013e-07, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.948122650384903, |
| "num_tokens": 181318018.0, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.8241274144357845, |
| "grad_norm": 0.4344537392230086, |
| "learning_rate": 3.536585365853658e-07, |
| "loss": 0.1577, |
| "mean_token_accuracy": 0.9494742602109909, |
| "num_tokens": 181467329.0, |
| "step": 1216 |
| }, |
| { |
| "epoch": 0.8248051507963402, |
| "grad_norm": 0.38916370294651115, |
| "learning_rate": 3.523035230352303e-07, |
| "loss": 0.152, |
| "mean_token_accuracy": 0.9506790786981583, |
| "num_tokens": 181613107.0, |
| "step": 1217 |
| }, |
| { |
| "epoch": 0.825482887156896, |
| "grad_norm": 0.46380417163685994, |
| "learning_rate": 3.5094850948509486e-07, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.9499632716178894, |
| "num_tokens": 181761115.0, |
| "step": 1218 |
| }, |
| { |
| "epoch": 0.8261606235174517, |
| "grad_norm": 0.3606566170578932, |
| "learning_rate": 3.4959349593495933e-07, |
| "loss": 0.154, |
| "mean_token_accuracy": 0.9500089287757874, |
| "num_tokens": 181911928.0, |
| "step": 1219 |
| }, |
| { |
| "epoch": 0.8268383598780075, |
| "grad_norm": 0.5330045233966407, |
| "learning_rate": 3.4823848238482385e-07, |
| "loss": 0.1565, |
| "mean_token_accuracy": 0.9503036737442017, |
| "num_tokens": 182057199.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.8275160962385631, |
| "grad_norm": 0.383571091647365, |
| "learning_rate": 3.468834688346883e-07, |
| "loss": 0.1527, |
| "mean_token_accuracy": 0.9508719816803932, |
| "num_tokens": 182208138.0, |
| "step": 1221 |
| }, |
| { |
| "epoch": 0.8281938325991189, |
| "grad_norm": 0.46665590559776177, |
| "learning_rate": 3.4552845528455284e-07, |
| "loss": 0.1695, |
| "mean_token_accuracy": 0.9455024227499962, |
| "num_tokens": 182360961.0, |
| "step": 1222 |
| }, |
| { |
| "epoch": 0.8288715689596747, |
| "grad_norm": 0.8179154615736234, |
| "learning_rate": 3.441734417344173e-07, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9481576010584831, |
| "num_tokens": 182510718.0, |
| "step": 1223 |
| }, |
| { |
| "epoch": 0.8295493053202304, |
| "grad_norm": 0.4497187886461492, |
| "learning_rate": 3.428184281842818e-07, |
| "loss": 0.1732, |
| "mean_token_accuracy": 0.9444357454776764, |
| "num_tokens": 182658094.0, |
| "step": 1224 |
| }, |
| { |
| "epoch": 0.8302270416807862, |
| "grad_norm": 0.5485640710688537, |
| "learning_rate": 3.4146341463414634e-07, |
| "loss": 0.158, |
| "mean_token_accuracy": 0.9486449137330055, |
| "num_tokens": 182809300.0, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.8309047780413419, |
| "grad_norm": 0.6068099348141154, |
| "learning_rate": 3.4010840108401086e-07, |
| "loss": 0.1504, |
| "mean_token_accuracy": 0.9518176093697548, |
| "num_tokens": 182960527.0, |
| "step": 1226 |
| }, |
| { |
| "epoch": 0.8315825144018977, |
| "grad_norm": 0.5377496770350054, |
| "learning_rate": 3.3875338753387533e-07, |
| "loss": 0.1656, |
| "mean_token_accuracy": 0.94700937718153, |
| "num_tokens": 183109254.0, |
| "step": 1227 |
| }, |
| { |
| "epoch": 0.8322602507624534, |
| "grad_norm": 0.41807441815831087, |
| "learning_rate": 3.3739837398373985e-07, |
| "loss": 0.1718, |
| "mean_token_accuracy": 0.9449028000235558, |
| "num_tokens": 183261181.0, |
| "step": 1228 |
| }, |
| { |
| "epoch": 0.8329379871230091, |
| "grad_norm": 0.4427331296597803, |
| "learning_rate": 3.360433604336043e-07, |
| "loss": 0.1545, |
| "mean_token_accuracy": 0.9506580010056496, |
| "num_tokens": 183407209.0, |
| "step": 1229 |
| }, |
| { |
| "epoch": 0.8336157234835649, |
| "grad_norm": 0.9700740822615448, |
| "learning_rate": 3.3468834688346883e-07, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.9479135498404503, |
| "num_tokens": 183553470.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.8342934598441206, |
| "grad_norm": 1.0011705970363838, |
| "learning_rate": 3.333333333333333e-07, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9486960023641586, |
| "num_tokens": 183705918.0, |
| "step": 1231 |
| }, |
| { |
| "epoch": 0.8349711962046764, |
| "grad_norm": 1.2599432294982404, |
| "learning_rate": 3.3197831978319777e-07, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.9466327428817749, |
| "num_tokens": 183856899.0, |
| "step": 1232 |
| }, |
| { |
| "epoch": 0.8356489325652321, |
| "grad_norm": 0.44463240271531623, |
| "learning_rate": 3.3062330623306234e-07, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9478116035461426, |
| "num_tokens": 184006826.0, |
| "step": 1233 |
| }, |
| { |
| "epoch": 0.8363266689257879, |
| "grad_norm": 0.3853389407085814, |
| "learning_rate": 3.292682926829268e-07, |
| "loss": 0.1569, |
| "mean_token_accuracy": 0.949841283261776, |
| "num_tokens": 184160764.0, |
| "step": 1234 |
| }, |
| { |
| "epoch": 0.8370044052863436, |
| "grad_norm": 0.45767149703589827, |
| "learning_rate": 3.279132791327913e-07, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9479725807905197, |
| "num_tokens": 184303969.0, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.8376821416468994, |
| "grad_norm": 1.9773177245102982, |
| "learning_rate": 3.265582655826558e-07, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9471398890018463, |
| "num_tokens": 184450526.0, |
| "step": 1236 |
| }, |
| { |
| "epoch": 0.8383598780074552, |
| "grad_norm": 0.4918123062692637, |
| "learning_rate": 3.252032520325203e-07, |
| "loss": 0.1714, |
| "mean_token_accuracy": 0.944765530526638, |
| "num_tokens": 184602809.0, |
| "step": 1237 |
| }, |
| { |
| "epoch": 0.8390376143680108, |
| "grad_norm": 0.3901587167469587, |
| "learning_rate": 3.238482384823848e-07, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9484593421220779, |
| "num_tokens": 184754853.0, |
| "step": 1238 |
| }, |
| { |
| "epoch": 0.8397153507285666, |
| "grad_norm": 1.287255346321022, |
| "learning_rate": 3.224932249322493e-07, |
| "loss": 0.1601, |
| "mean_token_accuracy": 0.9491341561079025, |
| "num_tokens": 184904472.0, |
| "step": 1239 |
| }, |
| { |
| "epoch": 0.8403930870891223, |
| "grad_norm": 0.4942027850122778, |
| "learning_rate": 3.211382113821138e-07, |
| "loss": 0.1686, |
| "mean_token_accuracy": 0.9465633928775787, |
| "num_tokens": 185051283.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.8410708234496781, |
| "grad_norm": 0.4421386762305037, |
| "learning_rate": 3.1978319783197834e-07, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9489489793777466, |
| "num_tokens": 185198011.0, |
| "step": 1241 |
| }, |
| { |
| "epoch": 0.8417485598102338, |
| "grad_norm": 0.5126437317253629, |
| "learning_rate": 3.184281842818428e-07, |
| "loss": 0.1704, |
| "mean_token_accuracy": 0.9451634883880615, |
| "num_tokens": 185348283.0, |
| "step": 1242 |
| }, |
| { |
| "epoch": 0.8424262961707896, |
| "grad_norm": 0.4640809390185053, |
| "learning_rate": 3.170731707317073e-07, |
| "loss": 0.1662, |
| "mean_token_accuracy": 0.9478862881660461, |
| "num_tokens": 185503045.0, |
| "step": 1243 |
| }, |
| { |
| "epoch": 0.8431040325313454, |
| "grad_norm": 0.5888631565883243, |
| "learning_rate": 3.157181571815718e-07, |
| "loss": 0.163, |
| "mean_token_accuracy": 0.9481711536645889, |
| "num_tokens": 185650437.0, |
| "step": 1244 |
| }, |
| { |
| "epoch": 0.843781768891901, |
| "grad_norm": 0.6289341440241704, |
| "learning_rate": 3.143631436314363e-07, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.947802871465683, |
| "num_tokens": 185803124.0, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.8444595052524568, |
| "grad_norm": 0.587262406475338, |
| "learning_rate": 3.130081300813008e-07, |
| "loss": 0.156, |
| "mean_token_accuracy": 0.9502067714929581, |
| "num_tokens": 185949400.0, |
| "step": 1246 |
| }, |
| { |
| "epoch": 0.8451372416130125, |
| "grad_norm": 0.4966697299947886, |
| "learning_rate": 3.116531165311653e-07, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.9491457492113113, |
| "num_tokens": 186101066.0, |
| "step": 1247 |
| }, |
| { |
| "epoch": 0.8458149779735683, |
| "grad_norm": 0.4232902254992847, |
| "learning_rate": 3.102981029810298e-07, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9468086063861847, |
| "num_tokens": 186252511.0, |
| "step": 1248 |
| }, |
| { |
| "epoch": 0.846492714334124, |
| "grad_norm": 0.3604518735048993, |
| "learning_rate": 3.0894308943089434e-07, |
| "loss": 0.157, |
| "mean_token_accuracy": 0.9492731615900993, |
| "num_tokens": 186404888.0, |
| "step": 1249 |
| }, |
| { |
| "epoch": 0.8471704506946798, |
| "grad_norm": 0.40100425292989167, |
| "learning_rate": 3.075880758807588e-07, |
| "loss": 0.1546, |
| "mean_token_accuracy": 0.9506191238760948, |
| "num_tokens": 186552971.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.8478481870552355, |
| "grad_norm": 0.37038645677460275, |
| "learning_rate": 3.0623306233062327e-07, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9471093341708183, |
| "num_tokens": 186701736.0, |
| "step": 1251 |
| }, |
| { |
| "epoch": 0.8485259234157913, |
| "grad_norm": 0.5248022281276231, |
| "learning_rate": 3.048780487804878e-07, |
| "loss": 0.1675, |
| "mean_token_accuracy": 0.9471896588802338, |
| "num_tokens": 186849104.0, |
| "step": 1252 |
| }, |
| { |
| "epoch": 0.849203659776347, |
| "grad_norm": 0.8811872116727292, |
| "learning_rate": 3.0352303523035226e-07, |
| "loss": 0.1596, |
| "mean_token_accuracy": 0.9485187977552414, |
| "num_tokens": 186998111.0, |
| "step": 1253 |
| }, |
| { |
| "epoch": 0.8498813961369027, |
| "grad_norm": 0.5996673555873404, |
| "learning_rate": 3.021680216802168e-07, |
| "loss": 0.1613, |
| "mean_token_accuracy": 0.9474183395504951, |
| "num_tokens": 187142004.0, |
| "step": 1254 |
| }, |
| { |
| "epoch": 0.8505591324974585, |
| "grad_norm": 0.42858683016562826, |
| "learning_rate": 3.008130081300813e-07, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9466542750597, |
| "num_tokens": 187292467.0, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.8512368688580142, |
| "grad_norm": 0.39053228229918563, |
| "learning_rate": 2.994579945799458e-07, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9496741071343422, |
| "num_tokens": 187438951.0, |
| "step": 1256 |
| }, |
| { |
| "epoch": 0.85191460521857, |
| "grad_norm": 0.3980012222167013, |
| "learning_rate": 2.981029810298103e-07, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.948141522705555, |
| "num_tokens": 187586449.0, |
| "step": 1257 |
| }, |
| { |
| "epoch": 0.8525923415791257, |
| "grad_norm": 0.48893145810215405, |
| "learning_rate": 2.967479674796748e-07, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9474975019693375, |
| "num_tokens": 187733743.0, |
| "step": 1258 |
| }, |
| { |
| "epoch": 0.8532700779396815, |
| "grad_norm": 0.4166335922064512, |
| "learning_rate": 2.9539295392953927e-07, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9472394436597824, |
| "num_tokens": 187880144.0, |
| "step": 1259 |
| }, |
| { |
| "epoch": 0.8539478143002373, |
| "grad_norm": 0.5566663810368183, |
| "learning_rate": 2.940379403794038e-07, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9473065361380577, |
| "num_tokens": 188031774.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.8546255506607929, |
| "grad_norm": 0.44170510987096306, |
| "learning_rate": 2.9268292682926825e-07, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9481263235211372, |
| "num_tokens": 188180795.0, |
| "step": 1261 |
| }, |
| { |
| "epoch": 0.8553032870213487, |
| "grad_norm": 0.456152885141962, |
| "learning_rate": 2.913279132791328e-07, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9467766508460045, |
| "num_tokens": 188327661.0, |
| "step": 1262 |
| }, |
| { |
| "epoch": 0.8559810233819044, |
| "grad_norm": 0.48010268631689385, |
| "learning_rate": 2.899728997289973e-07, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.949536144733429, |
| "num_tokens": 188478001.0, |
| "step": 1263 |
| }, |
| { |
| "epoch": 0.8566587597424602, |
| "grad_norm": 0.4825523038347356, |
| "learning_rate": 2.886178861788618e-07, |
| "loss": 0.1777, |
| "mean_token_accuracy": 0.9437452927231789, |
| "num_tokens": 188626348.0, |
| "step": 1264 |
| }, |
| { |
| "epoch": 0.8573364961030159, |
| "grad_norm": 0.5861677090234955, |
| "learning_rate": 2.872628726287263e-07, |
| "loss": 0.1747, |
| "mean_token_accuracy": 0.9444176256656647, |
| "num_tokens": 188776297.0, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.8580142324635717, |
| "grad_norm": 0.39427539151202734, |
| "learning_rate": 2.859078590785908e-07, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.9467851668596268, |
| "num_tokens": 188923403.0, |
| "step": 1266 |
| }, |
| { |
| "epoch": 0.8586919688241275, |
| "grad_norm": 0.40564905252247896, |
| "learning_rate": 2.8455284552845527e-07, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9476913884282112, |
| "num_tokens": 189076475.0, |
| "step": 1267 |
| }, |
| { |
| "epoch": 0.8593697051846831, |
| "grad_norm": 0.4186772728839254, |
| "learning_rate": 2.8319783197831973e-07, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9472432062029839, |
| "num_tokens": 189230576.0, |
| "step": 1268 |
| }, |
| { |
| "epoch": 0.8600474415452389, |
| "grad_norm": 0.4390604343607371, |
| "learning_rate": 2.8184281842818425e-07, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9469037428498268, |
| "num_tokens": 189381092.0, |
| "step": 1269 |
| }, |
| { |
| "epoch": 0.8607251779057946, |
| "grad_norm": 0.40477830196201936, |
| "learning_rate": 2.8048780487804877e-07, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.9463135749101639, |
| "num_tokens": 189529988.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.8614029142663504, |
| "grad_norm": 0.4461257655942589, |
| "learning_rate": 2.791327913279133e-07, |
| "loss": 0.1569, |
| "mean_token_accuracy": 0.9497584477066994, |
| "num_tokens": 189679438.0, |
| "step": 1271 |
| }, |
| { |
| "epoch": 0.8620806506269061, |
| "grad_norm": 0.627751042354744, |
| "learning_rate": 2.7777777777777776e-07, |
| "loss": 0.1758, |
| "mean_token_accuracy": 0.9447125047445297, |
| "num_tokens": 189827211.0, |
| "step": 1272 |
| }, |
| { |
| "epoch": 0.8627583869874619, |
| "grad_norm": 0.5029254430147256, |
| "learning_rate": 2.764227642276423e-07, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.9464574307203293, |
| "num_tokens": 189975220.0, |
| "step": 1273 |
| }, |
| { |
| "epoch": 0.8634361233480177, |
| "grad_norm": 0.9232385351058474, |
| "learning_rate": 2.7506775067750675e-07, |
| "loss": 0.1551, |
| "mean_token_accuracy": 0.9502605646848679, |
| "num_tokens": 190127470.0, |
| "step": 1274 |
| }, |
| { |
| "epoch": 0.8641138597085734, |
| "grad_norm": 0.44449553850505596, |
| "learning_rate": 2.7371273712737127e-07, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9485994949936867, |
| "num_tokens": 190280037.0, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.8647915960691291, |
| "grad_norm": 2.4731755358390917, |
| "learning_rate": 2.7235772357723573e-07, |
| "loss": 0.158, |
| "mean_token_accuracy": 0.9491154477000237, |
| "num_tokens": 190429590.0, |
| "step": 1276 |
| }, |
| { |
| "epoch": 0.8654693324296848, |
| "grad_norm": 0.43476345107395914, |
| "learning_rate": 2.7100271002710025e-07, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9486151933670044, |
| "num_tokens": 190580748.0, |
| "step": 1277 |
| }, |
| { |
| "epoch": 0.8661470687902406, |
| "grad_norm": 0.4611579965695597, |
| "learning_rate": 2.6964769647696477e-07, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.948381170630455, |
| "num_tokens": 190732409.0, |
| "step": 1278 |
| }, |
| { |
| "epoch": 0.8668248051507963, |
| "grad_norm": 0.3853333697956996, |
| "learning_rate": 2.682926829268293e-07, |
| "loss": 0.1569, |
| "mean_token_accuracy": 0.9489934965968132, |
| "num_tokens": 190883434.0, |
| "step": 1279 |
| }, |
| { |
| "epoch": 0.8675025415113521, |
| "grad_norm": 0.5793024017953724, |
| "learning_rate": 2.6693766937669376e-07, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9469688385725021, |
| "num_tokens": 191031276.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.8681802778719079, |
| "grad_norm": 0.4653940866114761, |
| "learning_rate": 2.655826558265583e-07, |
| "loss": 0.174, |
| "mean_token_accuracy": 0.944032609462738, |
| "num_tokens": 191180550.0, |
| "step": 1281 |
| }, |
| { |
| "epoch": 0.8688580142324636, |
| "grad_norm": 0.5295147327059975, |
| "learning_rate": 2.6422764227642274e-07, |
| "loss": 0.1539, |
| "mean_token_accuracy": 0.950570173561573, |
| "num_tokens": 191328672.0, |
| "step": 1282 |
| }, |
| { |
| "epoch": 0.8695357505930194, |
| "grad_norm": 0.4839416614450446, |
| "learning_rate": 2.6287262872628726e-07, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9487968757748604, |
| "num_tokens": 191476358.0, |
| "step": 1283 |
| }, |
| { |
| "epoch": 0.870213486953575, |
| "grad_norm": 0.5506681952992758, |
| "learning_rate": 2.6151761517615173e-07, |
| "loss": 0.1591, |
| "mean_token_accuracy": 0.9497069045901299, |
| "num_tokens": 191616680.0, |
| "step": 1284 |
| }, |
| { |
| "epoch": 0.8708912233141308, |
| "grad_norm": 0.5124878675054456, |
| "learning_rate": 2.6016260162601625e-07, |
| "loss": 0.1675, |
| "mean_token_accuracy": 0.9464941918849945, |
| "num_tokens": 191762437.0, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.8715689596746865, |
| "grad_norm": 0.6914476460367316, |
| "learning_rate": 2.5880758807588077e-07, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.948076955974102, |
| "num_tokens": 191912068.0, |
| "step": 1286 |
| }, |
| { |
| "epoch": 0.8722466960352423, |
| "grad_norm": 0.513420792494206, |
| "learning_rate": 2.5745257452574524e-07, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9484427720308304, |
| "num_tokens": 192062536.0, |
| "step": 1287 |
| }, |
| { |
| "epoch": 0.8729244323957981, |
| "grad_norm": 0.44814166594814653, |
| "learning_rate": 2.5609756097560976e-07, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9484685808420181, |
| "num_tokens": 192211249.0, |
| "step": 1288 |
| }, |
| { |
| "epoch": 0.8736021687563538, |
| "grad_norm": 0.471365632422765, |
| "learning_rate": 2.547425474254742e-07, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9492049291729927, |
| "num_tokens": 192353299.0, |
| "step": 1289 |
| }, |
| { |
| "epoch": 0.8742799051169096, |
| "grad_norm": 0.46964537723716343, |
| "learning_rate": 2.5338753387533874e-07, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9494825899600983, |
| "num_tokens": 192502090.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.8749576414774652, |
| "grad_norm": 0.778237123761203, |
| "learning_rate": 2.520325203252032e-07, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9470194801688194, |
| "num_tokens": 192653755.0, |
| "step": 1291 |
| }, |
| { |
| "epoch": 0.875635377838021, |
| "grad_norm": 1.4437998607866556, |
| "learning_rate": 2.5067750677506773e-07, |
| "loss": 0.1706, |
| "mean_token_accuracy": 0.9462285861372948, |
| "num_tokens": 192802868.0, |
| "step": 1292 |
| }, |
| { |
| "epoch": 0.8763131141985767, |
| "grad_norm": 0.39360519635219227, |
| "learning_rate": 2.4932249322493225e-07, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9470438733696938, |
| "num_tokens": 192949636.0, |
| "step": 1293 |
| }, |
| { |
| "epoch": 0.8769908505591325, |
| "grad_norm": 0.4870040325471479, |
| "learning_rate": 2.479674796747967e-07, |
| "loss": 0.1493, |
| "mean_token_accuracy": 0.9510429948568344, |
| "num_tokens": 193097651.0, |
| "step": 1294 |
| }, |
| { |
| "epoch": 0.8776685869196882, |
| "grad_norm": 0.5810436594415671, |
| "learning_rate": 2.4661246612466123e-07, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9486610442399979, |
| "num_tokens": 193244672.0, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.878346323280244, |
| "grad_norm": 0.4662810869506531, |
| "learning_rate": 2.4525745257452575e-07, |
| "loss": 0.1519, |
| "mean_token_accuracy": 0.9504307880997658, |
| "num_tokens": 193395885.0, |
| "step": 1296 |
| }, |
| { |
| "epoch": 0.8790240596407998, |
| "grad_norm": 0.367052086711362, |
| "learning_rate": 2.439024390243902e-07, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.947607047855854, |
| "num_tokens": 193544593.0, |
| "step": 1297 |
| }, |
| { |
| "epoch": 0.8797017960013555, |
| "grad_norm": 0.4134829891271465, |
| "learning_rate": 2.4254742547425474e-07, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9463229477405548, |
| "num_tokens": 193694382.0, |
| "step": 1298 |
| }, |
| { |
| "epoch": 0.8803795323619112, |
| "grad_norm": 0.5142223217380762, |
| "learning_rate": 2.4119241192411926e-07, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9481227025389671, |
| "num_tokens": 193844950.0, |
| "step": 1299 |
| }, |
| { |
| "epoch": 0.8810572687224669, |
| "grad_norm": 0.5725821844973095, |
| "learning_rate": 2.3983739837398373e-07, |
| "loss": 0.1761, |
| "mean_token_accuracy": 0.944548599421978, |
| "num_tokens": 193991353.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.8817350050830227, |
| "grad_norm": 0.5356025903004101, |
| "learning_rate": 2.3848238482384825e-07, |
| "loss": 0.172, |
| "mean_token_accuracy": 0.9450171962380409, |
| "num_tokens": 194136677.0, |
| "step": 1301 |
| }, |
| { |
| "epoch": 0.8824127414435784, |
| "grad_norm": 0.48877085346696164, |
| "learning_rate": 2.3712737127371274e-07, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9471217319369316, |
| "num_tokens": 194286050.0, |
| "step": 1302 |
| }, |
| { |
| "epoch": 0.8830904778041342, |
| "grad_norm": 0.376632408570397, |
| "learning_rate": 2.3577235772357723e-07, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9473475515842438, |
| "num_tokens": 194436091.0, |
| "step": 1303 |
| }, |
| { |
| "epoch": 0.88376821416469, |
| "grad_norm": 0.45486507931417874, |
| "learning_rate": 2.3441734417344173e-07, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.9485864788293839, |
| "num_tokens": 194586531.0, |
| "step": 1304 |
| }, |
| { |
| "epoch": 0.8844459505252457, |
| "grad_norm": 0.44784500489937396, |
| "learning_rate": 2.3306233062330622e-07, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9483805522322655, |
| "num_tokens": 194737102.0, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.8851236868858015, |
| "grad_norm": 0.7993291555150185, |
| "learning_rate": 2.3170731707317074e-07, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9464853033423424, |
| "num_tokens": 194889119.0, |
| "step": 1306 |
| }, |
| { |
| "epoch": 0.8858014232463571, |
| "grad_norm": 0.4379267853744643, |
| "learning_rate": 2.3035230352303523e-07, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9470188841223717, |
| "num_tokens": 195036049.0, |
| "step": 1307 |
| }, |
| { |
| "epoch": 0.8864791596069129, |
| "grad_norm": 0.40040270745053264, |
| "learning_rate": 2.2899728997289973e-07, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.947534941136837, |
| "num_tokens": 195184127.0, |
| "step": 1308 |
| }, |
| { |
| "epoch": 0.8871568959674686, |
| "grad_norm": 0.36793694793042225, |
| "learning_rate": 2.2764227642276422e-07, |
| "loss": 0.1548, |
| "mean_token_accuracy": 0.9490689262747765, |
| "num_tokens": 195334385.0, |
| "step": 1309 |
| }, |
| { |
| "epoch": 0.8878346323280244, |
| "grad_norm": 0.36498410723940267, |
| "learning_rate": 2.2628726287262874e-07, |
| "loss": 0.1559, |
| "mean_token_accuracy": 0.949463352560997, |
| "num_tokens": 195482527.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.8885123686885802, |
| "grad_norm": 0.9601269538119754, |
| "learning_rate": 2.249322493224932e-07, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.949680283665657, |
| "num_tokens": 195633680.0, |
| "step": 1311 |
| }, |
| { |
| "epoch": 0.8891901050491359, |
| "grad_norm": 0.3973604680490306, |
| "learning_rate": 2.235772357723577e-07, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.9494841918349266, |
| "num_tokens": 195784698.0, |
| "step": 1312 |
| }, |
| { |
| "epoch": 0.8898678414096917, |
| "grad_norm": 0.4244837770458849, |
| "learning_rate": 2.222222222222222e-07, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9473633095622063, |
| "num_tokens": 195930878.0, |
| "step": 1313 |
| }, |
| { |
| "epoch": 0.8905455777702473, |
| "grad_norm": 0.6840023007911671, |
| "learning_rate": 2.208672086720867e-07, |
| "loss": 0.1745, |
| "mean_token_accuracy": 0.94522675126791, |
| "num_tokens": 196081665.0, |
| "step": 1314 |
| }, |
| { |
| "epoch": 0.8912233141308031, |
| "grad_norm": 0.46451158595442055, |
| "learning_rate": 2.195121951219512e-07, |
| "loss": 0.1676, |
| "mean_token_accuracy": 0.9465636387467384, |
| "num_tokens": 196233517.0, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.8919010504913588, |
| "grad_norm": 0.40412358259357356, |
| "learning_rate": 2.181571815718157e-07, |
| "loss": 0.1698, |
| "mean_token_accuracy": 0.9449945688247681, |
| "num_tokens": 196383614.0, |
| "step": 1316 |
| }, |
| { |
| "epoch": 0.8925787868519146, |
| "grad_norm": 0.5062261543786651, |
| "learning_rate": 2.1680216802168022e-07, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9466878697276115, |
| "num_tokens": 196532369.0, |
| "step": 1317 |
| }, |
| { |
| "epoch": 0.8932565232124704, |
| "grad_norm": 0.6978443716648501, |
| "learning_rate": 2.154471544715447e-07, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9484983906149864, |
| "num_tokens": 196676914.0, |
| "step": 1318 |
| }, |
| { |
| "epoch": 0.8939342595730261, |
| "grad_norm": 0.6617405961208453, |
| "learning_rate": 2.140921409214092e-07, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9493363499641418, |
| "num_tokens": 196822084.0, |
| "step": 1319 |
| }, |
| { |
| "epoch": 0.8946119959335819, |
| "grad_norm": 0.5560405235740569, |
| "learning_rate": 2.127371273712737e-07, |
| "loss": 0.1722, |
| "mean_token_accuracy": 0.9459304660558701, |
| "num_tokens": 196970332.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.8952897322941376, |
| "grad_norm": 0.40050381837066196, |
| "learning_rate": 2.1138211382113822e-07, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9466271102428436, |
| "num_tokens": 197119174.0, |
| "step": 1321 |
| }, |
| { |
| "epoch": 0.8959674686546933, |
| "grad_norm": 0.6321057538339169, |
| "learning_rate": 2.100271002710027e-07, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9481549188494682, |
| "num_tokens": 197268637.0, |
| "step": 1322 |
| }, |
| { |
| "epoch": 0.896645205015249, |
| "grad_norm": 0.45622751792515764, |
| "learning_rate": 2.086720867208672e-07, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9462346211075783, |
| "num_tokens": 197415386.0, |
| "step": 1323 |
| }, |
| { |
| "epoch": 0.8973229413758048, |
| "grad_norm": 0.3687854548382046, |
| "learning_rate": 2.073170731707317e-07, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9470949769020081, |
| "num_tokens": 197558590.0, |
| "step": 1324 |
| }, |
| { |
| "epoch": 0.8980006777363606, |
| "grad_norm": 0.3640691148557062, |
| "learning_rate": 2.0596205962059622e-07, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9491933286190033, |
| "num_tokens": 197708876.0, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.8986784140969163, |
| "grad_norm": 0.43915756263748035, |
| "learning_rate": 2.046070460704607e-07, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9470717161893845, |
| "num_tokens": 197857476.0, |
| "step": 1326 |
| }, |
| { |
| "epoch": 0.8993561504574721, |
| "grad_norm": 0.3884628942763983, |
| "learning_rate": 2.032520325203252e-07, |
| "loss": 0.1452, |
| "mean_token_accuracy": 0.9524314031004906, |
| "num_tokens": 198004317.0, |
| "step": 1327 |
| }, |
| { |
| "epoch": 0.9000338868180278, |
| "grad_norm": 0.559885511493827, |
| "learning_rate": 2.0189701897018967e-07, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9473034217953682, |
| "num_tokens": 198157023.0, |
| "step": 1328 |
| }, |
| { |
| "epoch": 0.9007116231785836, |
| "grad_norm": 1.286366814416818, |
| "learning_rate": 2.005420054200542e-07, |
| "loss": 0.1465, |
| "mean_token_accuracy": 0.9523212388157845, |
| "num_tokens": 198301609.0, |
| "step": 1329 |
| }, |
| { |
| "epoch": 0.9013893595391392, |
| "grad_norm": 0.4470361145569467, |
| "learning_rate": 1.9918699186991868e-07, |
| "loss": 0.172, |
| "mean_token_accuracy": 0.9452895820140839, |
| "num_tokens": 198450025.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.902067095899695, |
| "grad_norm": 0.5056161190236214, |
| "learning_rate": 1.9783197831978317e-07, |
| "loss": 0.1559, |
| "mean_token_accuracy": 0.9498795047402382, |
| "num_tokens": 198595265.0, |
| "step": 1331 |
| }, |
| { |
| "epoch": 0.9027448322602508, |
| "grad_norm": 0.4705626378527346, |
| "learning_rate": 1.9647696476964767e-07, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9484021738171577, |
| "num_tokens": 198743782.0, |
| "step": 1332 |
| }, |
| { |
| "epoch": 0.9034225686208065, |
| "grad_norm": 0.7139510920836756, |
| "learning_rate": 1.951219512195122e-07, |
| "loss": 0.1548, |
| "mean_token_accuracy": 0.9500160440802574, |
| "num_tokens": 198892693.0, |
| "step": 1333 |
| }, |
| { |
| "epoch": 0.9041003049813623, |
| "grad_norm": 0.4346157528632144, |
| "learning_rate": 1.9376693766937668e-07, |
| "loss": 0.174, |
| "mean_token_accuracy": 0.9452803283929825, |
| "num_tokens": 199047544.0, |
| "step": 1334 |
| }, |
| { |
| "epoch": 0.904778041341918, |
| "grad_norm": 0.46040849660604394, |
| "learning_rate": 1.9241192411924117e-07, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9492721632122993, |
| "num_tokens": 199197511.0, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.9054557777024738, |
| "grad_norm": 0.5077735426247331, |
| "learning_rate": 1.910569105691057e-07, |
| "loss": 0.1583, |
| "mean_token_accuracy": 0.9483174160122871, |
| "num_tokens": 199350131.0, |
| "step": 1336 |
| }, |
| { |
| "epoch": 0.9061335140630294, |
| "grad_norm": 0.5822238393967466, |
| "learning_rate": 1.897018970189702e-07, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9482871666550636, |
| "num_tokens": 199498888.0, |
| "step": 1337 |
| }, |
| { |
| "epoch": 0.9068112504235852, |
| "grad_norm": 0.9963446979708105, |
| "learning_rate": 1.8834688346883468e-07, |
| "loss": 0.1676, |
| "mean_token_accuracy": 0.9468878507614136, |
| "num_tokens": 199650041.0, |
| "step": 1338 |
| }, |
| { |
| "epoch": 0.9074889867841409, |
| "grad_norm": 0.4094642484081531, |
| "learning_rate": 1.8699186991869917e-07, |
| "loss": 0.1583, |
| "mean_token_accuracy": 0.9493989273905754, |
| "num_tokens": 199799651.0, |
| "step": 1339 |
| }, |
| { |
| "epoch": 0.9081667231446967, |
| "grad_norm": 0.4044904055612835, |
| "learning_rate": 1.856368563685637e-07, |
| "loss": 0.173, |
| "mean_token_accuracy": 0.9452235251665115, |
| "num_tokens": 199949211.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.9088444595052525, |
| "grad_norm": 0.43063638556652495, |
| "learning_rate": 1.8428184281842819e-07, |
| "loss": 0.1547, |
| "mean_token_accuracy": 0.9505686908960342, |
| "num_tokens": 200095988.0, |
| "step": 1341 |
| }, |
| { |
| "epoch": 0.9095221958658082, |
| "grad_norm": 0.4829193370579189, |
| "learning_rate": 1.8292682926829268e-07, |
| "loss": 0.1544, |
| "mean_token_accuracy": 0.9502275586128235, |
| "num_tokens": 200250528.0, |
| "step": 1342 |
| }, |
| { |
| "epoch": 0.910199932226364, |
| "grad_norm": 0.4802569071665241, |
| "learning_rate": 1.8157181571815717e-07, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9475988522171974, |
| "num_tokens": 200403099.0, |
| "step": 1343 |
| }, |
| { |
| "epoch": 0.9108776685869197, |
| "grad_norm": 0.8898908368334608, |
| "learning_rate": 1.802168021680217e-07, |
| "loss": 0.1566, |
| "mean_token_accuracy": 0.9496802464127541, |
| "num_tokens": 200553689.0, |
| "step": 1344 |
| }, |
| { |
| "epoch": 0.9115554049474754, |
| "grad_norm": 0.9086836656117131, |
| "learning_rate": 1.7886178861788619e-07, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.947704590857029, |
| "num_tokens": 200701501.0, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.9122331413080311, |
| "grad_norm": 0.5054363586713749, |
| "learning_rate": 1.7750677506775065e-07, |
| "loss": 0.1569, |
| "mean_token_accuracy": 0.9495393559336662, |
| "num_tokens": 200850046.0, |
| "step": 1346 |
| }, |
| { |
| "epoch": 0.9129108776685869, |
| "grad_norm": 0.48926660077240064, |
| "learning_rate": 1.7615176151761515e-07, |
| "loss": 0.1552, |
| "mean_token_accuracy": 0.9491599351167679, |
| "num_tokens": 201000638.0, |
| "step": 1347 |
| }, |
| { |
| "epoch": 0.9135886140291427, |
| "grad_norm": 0.48134549461282383, |
| "learning_rate": 1.7479674796747966e-07, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9479363709688187, |
| "num_tokens": 201149168.0, |
| "step": 1348 |
| }, |
| { |
| "epoch": 0.9142663503896984, |
| "grad_norm": 0.8390624871312155, |
| "learning_rate": 1.7344173441734416e-07, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9488432630896568, |
| "num_tokens": 201294436.0, |
| "step": 1349 |
| }, |
| { |
| "epoch": 0.9149440867502542, |
| "grad_norm": 0.5602856755239709, |
| "learning_rate": 1.7208672086720865e-07, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9480102583765984, |
| "num_tokens": 201441840.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.9156218231108099, |
| "grad_norm": 0.4292552573013406, |
| "learning_rate": 1.7073170731707317e-07, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9480935409665108, |
| "num_tokens": 201588791.0, |
| "step": 1351 |
| }, |
| { |
| "epoch": 0.9162995594713657, |
| "grad_norm": 0.425749476838467, |
| "learning_rate": 1.6937669376693766e-07, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9492569714784622, |
| "num_tokens": 201737553.0, |
| "step": 1352 |
| }, |
| { |
| "epoch": 0.9169772958319213, |
| "grad_norm": 0.7023335491453301, |
| "learning_rate": 1.6802168021680216e-07, |
| "loss": 0.1761, |
| "mean_token_accuracy": 0.9445386901497841, |
| "num_tokens": 201885478.0, |
| "step": 1353 |
| }, |
| { |
| "epoch": 0.9176550321924771, |
| "grad_norm": 0.4780401071887224, |
| "learning_rate": 1.6666666666666665e-07, |
| "loss": 0.1712, |
| "mean_token_accuracy": 0.9456515088677406, |
| "num_tokens": 202035458.0, |
| "step": 1354 |
| }, |
| { |
| "epoch": 0.9183327685530329, |
| "grad_norm": 0.46240498917097805, |
| "learning_rate": 1.6531165311653117e-07, |
| "loss": 0.1556, |
| "mean_token_accuracy": 0.9500665068626404, |
| "num_tokens": 202184449.0, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.9190105049135886, |
| "grad_norm": 0.809794434334873, |
| "learning_rate": 1.6395663956639566e-07, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9476128816604614, |
| "num_tokens": 202337184.0, |
| "step": 1356 |
| }, |
| { |
| "epoch": 0.9196882412741444, |
| "grad_norm": 0.7156898794358771, |
| "learning_rate": 1.6260162601626016e-07, |
| "loss": 0.155, |
| "mean_token_accuracy": 0.9504127278923988, |
| "num_tokens": 202490847.0, |
| "step": 1357 |
| }, |
| { |
| "epoch": 0.9203659776347001, |
| "grad_norm": 0.43154704578495007, |
| "learning_rate": 1.6124661246612465e-07, |
| "loss": 0.1481, |
| "mean_token_accuracy": 0.9516681507229805, |
| "num_tokens": 202642145.0, |
| "step": 1358 |
| }, |
| { |
| "epoch": 0.9210437139952559, |
| "grad_norm": 0.4127900350922141, |
| "learning_rate": 1.5989159891598917e-07, |
| "loss": 0.1583, |
| "mean_token_accuracy": 0.9486039876937866, |
| "num_tokens": 202793032.0, |
| "step": 1359 |
| }, |
| { |
| "epoch": 0.9217214503558115, |
| "grad_norm": 0.5934644429599674, |
| "learning_rate": 1.5853658536585366e-07, |
| "loss": 0.164, |
| "mean_token_accuracy": 0.9477408677339554, |
| "num_tokens": 202941213.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.9223991867163673, |
| "grad_norm": 0.5187626748589975, |
| "learning_rate": 1.5718157181571816e-07, |
| "loss": 0.165, |
| "mean_token_accuracy": 0.9467665180563927, |
| "num_tokens": 203089723.0, |
| "step": 1361 |
| }, |
| { |
| "epoch": 0.9230769230769231, |
| "grad_norm": 0.4700994490692931, |
| "learning_rate": 1.5582655826558265e-07, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9481227323412895, |
| "num_tokens": 203239273.0, |
| "step": 1362 |
| }, |
| { |
| "epoch": 0.9237546594374788, |
| "grad_norm": 0.4619249541167841, |
| "learning_rate": 1.5447154471544717e-07, |
| "loss": 0.1693, |
| "mean_token_accuracy": 0.945893757045269, |
| "num_tokens": 203389451.0, |
| "step": 1363 |
| }, |
| { |
| "epoch": 0.9244323957980346, |
| "grad_norm": 0.5623005777167278, |
| "learning_rate": 1.5311653116531164e-07, |
| "loss": 0.1538, |
| "mean_token_accuracy": 0.9502845928072929, |
| "num_tokens": 203536972.0, |
| "step": 1364 |
| }, |
| { |
| "epoch": 0.9251101321585903, |
| "grad_norm": 0.3768508973610597, |
| "learning_rate": 1.5176151761517613e-07, |
| "loss": 0.1472, |
| "mean_token_accuracy": 0.952195554971695, |
| "num_tokens": 203688790.0, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.9257878685191461, |
| "grad_norm": 0.42153147866492585, |
| "learning_rate": 1.5040650406504065e-07, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9478074163198471, |
| "num_tokens": 203832292.0, |
| "step": 1366 |
| }, |
| { |
| "epoch": 0.9264656048797018, |
| "grad_norm": 0.4192583902969117, |
| "learning_rate": 1.4905149051490514e-07, |
| "loss": 0.1646, |
| "mean_token_accuracy": 0.946540355682373, |
| "num_tokens": 203983349.0, |
| "step": 1367 |
| }, |
| { |
| "epoch": 0.9271433412402575, |
| "grad_norm": 0.4001297016006573, |
| "learning_rate": 1.4769647696476963e-07, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9498896673321724, |
| "num_tokens": 204132144.0, |
| "step": 1368 |
| }, |
| { |
| "epoch": 0.9278210776008133, |
| "grad_norm": 0.4395702296591971, |
| "learning_rate": 1.4634146341463413e-07, |
| "loss": 0.1611, |
| "mean_token_accuracy": 0.9474482089281082, |
| "num_tokens": 204285881.0, |
| "step": 1369 |
| }, |
| { |
| "epoch": 0.928498813961369, |
| "grad_norm": 0.44906172664238575, |
| "learning_rate": 1.4498644986449865e-07, |
| "loss": 0.1549, |
| "mean_token_accuracy": 0.950348399579525, |
| "num_tokens": 204434052.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.9291765503219248, |
| "grad_norm": 0.4713204739330186, |
| "learning_rate": 1.4363143631436314e-07, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9476472660899162, |
| "num_tokens": 204582273.0, |
| "step": 1371 |
| }, |
| { |
| "epoch": 0.9298542866824805, |
| "grad_norm": 0.4670922599752331, |
| "learning_rate": 1.4227642276422763e-07, |
| "loss": 0.1733, |
| "mean_token_accuracy": 0.944943904876709, |
| "num_tokens": 204734486.0, |
| "step": 1372 |
| }, |
| { |
| "epoch": 0.9305320230430363, |
| "grad_norm": 0.5480059042541625, |
| "learning_rate": 1.4092140921409213e-07, |
| "loss": 0.1578, |
| "mean_token_accuracy": 0.9495535492897034, |
| "num_tokens": 204882430.0, |
| "step": 1373 |
| }, |
| { |
| "epoch": 0.931209759403592, |
| "grad_norm": 0.40076036586833685, |
| "learning_rate": 1.3956639566395665e-07, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9486064985394478, |
| "num_tokens": 205026873.0, |
| "step": 1374 |
| }, |
| { |
| "epoch": 0.9318874957641478, |
| "grad_norm": 0.47472929118812235, |
| "learning_rate": 1.3821138211382114e-07, |
| "loss": 0.1615, |
| "mean_token_accuracy": 0.9485347419977188, |
| "num_tokens": 205174583.0, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.9325652321247034, |
| "grad_norm": 0.5345554264982604, |
| "learning_rate": 1.3685636856368563e-07, |
| "loss": 0.1555, |
| "mean_token_accuracy": 0.9495607689023018, |
| "num_tokens": 205322381.0, |
| "step": 1376 |
| }, |
| { |
| "epoch": 0.9332429684852592, |
| "grad_norm": 0.5403128710953481, |
| "learning_rate": 1.3550135501355013e-07, |
| "loss": 0.1545, |
| "mean_token_accuracy": 0.9499097019433975, |
| "num_tokens": 205470320.0, |
| "step": 1377 |
| }, |
| { |
| "epoch": 0.933920704845815, |
| "grad_norm": 0.4233625714626548, |
| "learning_rate": 1.3414634146341465e-07, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9478924572467804, |
| "num_tokens": 205618922.0, |
| "step": 1378 |
| }, |
| { |
| "epoch": 0.9345984412063707, |
| "grad_norm": 0.3872504699509748, |
| "learning_rate": 1.3279132791327914e-07, |
| "loss": 0.1569, |
| "mean_token_accuracy": 0.9503503814339638, |
| "num_tokens": 205769081.0, |
| "step": 1379 |
| }, |
| { |
| "epoch": 0.9352761775669265, |
| "grad_norm": 0.5570599491065501, |
| "learning_rate": 1.3143631436314363e-07, |
| "loss": 0.1567, |
| "mean_token_accuracy": 0.9494674950838089, |
| "num_tokens": 205920667.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.9359539139274822, |
| "grad_norm": 1.5008149195955116, |
| "learning_rate": 1.3008130081300813e-07, |
| "loss": 0.1523, |
| "mean_token_accuracy": 0.9510042071342468, |
| "num_tokens": 206070639.0, |
| "step": 1381 |
| }, |
| { |
| "epoch": 0.936631650288038, |
| "grad_norm": 0.3892147335254059, |
| "learning_rate": 1.2872628726287262e-07, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9483537450432777, |
| "num_tokens": 206217381.0, |
| "step": 1382 |
| }, |
| { |
| "epoch": 0.9373093866485936, |
| "grad_norm": 0.5219739171434112, |
| "learning_rate": 1.273712737127371e-07, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9483264535665512, |
| "num_tokens": 206370560.0, |
| "step": 1383 |
| }, |
| { |
| "epoch": 0.9379871230091494, |
| "grad_norm": 0.41583308606483665, |
| "learning_rate": 1.260162601626016e-07, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9483330249786377, |
| "num_tokens": 206518013.0, |
| "step": 1384 |
| }, |
| { |
| "epoch": 0.9386648593697052, |
| "grad_norm": 0.4161834454505529, |
| "learning_rate": 1.2466124661246612e-07, |
| "loss": 0.164, |
| "mean_token_accuracy": 0.9477608054876328, |
| "num_tokens": 206667634.0, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.9393425957302609, |
| "grad_norm": 1.122778950661849, |
| "learning_rate": 1.2330623306233062e-07, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.9485376551747322, |
| "num_tokens": 206815383.0, |
| "step": 1386 |
| }, |
| { |
| "epoch": 0.9400203320908167, |
| "grad_norm": 1.0992782123231566, |
| "learning_rate": 1.219512195121951e-07, |
| "loss": 0.1708, |
| "mean_token_accuracy": 0.945359356701374, |
| "num_tokens": 206963461.0, |
| "step": 1387 |
| }, |
| { |
| "epoch": 0.9406980684513724, |
| "grad_norm": 0.5225790434291085, |
| "learning_rate": 1.2059620596205963e-07, |
| "loss": 0.1518, |
| "mean_token_accuracy": 0.9507160186767578, |
| "num_tokens": 207113679.0, |
| "step": 1388 |
| }, |
| { |
| "epoch": 0.9413758048119282, |
| "grad_norm": 0.4292828147249612, |
| "learning_rate": 1.1924119241192412e-07, |
| "loss": 0.15, |
| "mean_token_accuracy": 0.9513011053204536, |
| "num_tokens": 207260255.0, |
| "step": 1389 |
| }, |
| { |
| "epoch": 0.9420535411724839, |
| "grad_norm": 0.4518478115324446, |
| "learning_rate": 1.1788617886178862e-07, |
| "loss": 0.1592, |
| "mean_token_accuracy": 0.9490228891372681, |
| "num_tokens": 207408464.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.9427312775330396, |
| "grad_norm": 0.3926518572267984, |
| "learning_rate": 1.1653116531165311e-07, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.9475235939025879, |
| "num_tokens": 207555440.0, |
| "step": 1391 |
| }, |
| { |
| "epoch": 0.9434090138935954, |
| "grad_norm": 0.5210429994208757, |
| "learning_rate": 1.1517615176151762e-07, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.9485410302877426, |
| "num_tokens": 207701598.0, |
| "step": 1392 |
| }, |
| { |
| "epoch": 0.9440867502541511, |
| "grad_norm": 0.5924782618800354, |
| "learning_rate": 1.1382113821138211e-07, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.94605902582407, |
| "num_tokens": 207850093.0, |
| "step": 1393 |
| }, |
| { |
| "epoch": 0.9447644866147069, |
| "grad_norm": 0.39958163207143554, |
| "learning_rate": 1.124661246612466e-07, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9468234106898308, |
| "num_tokens": 208002086.0, |
| "step": 1394 |
| }, |
| { |
| "epoch": 0.9454422229752626, |
| "grad_norm": 0.5160991026450638, |
| "learning_rate": 1.111111111111111e-07, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.947230651974678, |
| "num_tokens": 208154266.0, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.9461199593358184, |
| "grad_norm": 0.8555265857717597, |
| "learning_rate": 1.097560975609756e-07, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.947859637439251, |
| "num_tokens": 208298242.0, |
| "step": 1396 |
| }, |
| { |
| "epoch": 0.9467976956963741, |
| "grad_norm": 0.6694579789663303, |
| "learning_rate": 1.0840108401084011e-07, |
| "loss": 0.1522, |
| "mean_token_accuracy": 0.9503393620252609, |
| "num_tokens": 208446923.0, |
| "step": 1397 |
| }, |
| { |
| "epoch": 0.9474754320569299, |
| "grad_norm": 0.7662756104885292, |
| "learning_rate": 1.070460704607046e-07, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9478938356041908, |
| "num_tokens": 208595226.0, |
| "step": 1398 |
| }, |
| { |
| "epoch": 0.9481531684174856, |
| "grad_norm": 0.6299424424389714, |
| "learning_rate": 1.0569105691056911e-07, |
| "loss": 0.1695, |
| "mean_token_accuracy": 0.9457604214549065, |
| "num_tokens": 208747043.0, |
| "step": 1399 |
| }, |
| { |
| "epoch": 0.9488309047780413, |
| "grad_norm": 0.6996370626099346, |
| "learning_rate": 1.043360433604336e-07, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.946971595287323, |
| "num_tokens": 208894774.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.9495086411385971, |
| "grad_norm": 0.6276628265777034, |
| "learning_rate": 1.0298102981029811e-07, |
| "loss": 0.158, |
| "mean_token_accuracy": 0.9496228843927383, |
| "num_tokens": 209041360.0, |
| "step": 1401 |
| }, |
| { |
| "epoch": 0.9501863774991528, |
| "grad_norm": 0.4483022817684672, |
| "learning_rate": 1.016260162601626e-07, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9469497203826904, |
| "num_tokens": 209189529.0, |
| "step": 1402 |
| }, |
| { |
| "epoch": 0.9508641138597086, |
| "grad_norm": 0.45151447318384536, |
| "learning_rate": 1.002710027100271e-07, |
| "loss": 0.1751, |
| "mean_token_accuracy": 0.9444667249917984, |
| "num_tokens": 209336530.0, |
| "step": 1403 |
| }, |
| { |
| "epoch": 0.9515418502202643, |
| "grad_norm": 0.4861060434456673, |
| "learning_rate": 9.891598915989159e-08, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9496123939752579, |
| "num_tokens": 209485343.0, |
| "step": 1404 |
| }, |
| { |
| "epoch": 0.9522195865808201, |
| "grad_norm": 0.4596168635267935, |
| "learning_rate": 9.75609756097561e-08, |
| "loss": 0.1745, |
| "mean_token_accuracy": 0.9443343281745911, |
| "num_tokens": 209633536.0, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.9528973229413759, |
| "grad_norm": 0.5019914406626255, |
| "learning_rate": 9.620596205962059e-08, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9496943727135658, |
| "num_tokens": 209782949.0, |
| "step": 1406 |
| }, |
| { |
| "epoch": 0.9535750593019315, |
| "grad_norm": 0.41667165269758527, |
| "learning_rate": 9.48509485094851e-08, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.9488124921917915, |
| "num_tokens": 209934473.0, |
| "step": 1407 |
| }, |
| { |
| "epoch": 0.9542527956624873, |
| "grad_norm": 0.4249710160121378, |
| "learning_rate": 9.349593495934959e-08, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.9486833810806274, |
| "num_tokens": 210084504.0, |
| "step": 1408 |
| }, |
| { |
| "epoch": 0.954930532023043, |
| "grad_norm": 0.42678826928356306, |
| "learning_rate": 9.214092140921409e-08, |
| "loss": 0.1603, |
| "mean_token_accuracy": 0.9479594007134438, |
| "num_tokens": 210233550.0, |
| "step": 1409 |
| }, |
| { |
| "epoch": 0.9556082683835988, |
| "grad_norm": 0.44287513677740237, |
| "learning_rate": 9.078590785907859e-08, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.9466651305556297, |
| "num_tokens": 210377294.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.9562860047441545, |
| "grad_norm": 0.5190623192817143, |
| "learning_rate": 8.943089430894309e-08, |
| "loss": 0.1687, |
| "mean_token_accuracy": 0.9465017914772034, |
| "num_tokens": 210528792.0, |
| "step": 1411 |
| }, |
| { |
| "epoch": 0.9569637411047103, |
| "grad_norm": 0.45243101031406463, |
| "learning_rate": 8.807588075880757e-08, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9479076936841011, |
| "num_tokens": 210673424.0, |
| "step": 1412 |
| }, |
| { |
| "epoch": 0.9576414774652661, |
| "grad_norm": 0.5749862594910748, |
| "learning_rate": 8.672086720867208e-08, |
| "loss": 0.1575, |
| "mean_token_accuracy": 0.949820950627327, |
| "num_tokens": 210823368.0, |
| "step": 1413 |
| }, |
| { |
| "epoch": 0.9583192138258217, |
| "grad_norm": 0.4382410240273075, |
| "learning_rate": 8.536585365853659e-08, |
| "loss": 0.1586, |
| "mean_token_accuracy": 0.9487280175089836, |
| "num_tokens": 210975510.0, |
| "step": 1414 |
| }, |
| { |
| "epoch": 0.9589969501863775, |
| "grad_norm": 0.4427876374521735, |
| "learning_rate": 8.401084010840108e-08, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9473810121417046, |
| "num_tokens": 211127158.0, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.9596746865469332, |
| "grad_norm": 0.4804162197215828, |
| "learning_rate": 8.265582655826558e-08, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9485824853181839, |
| "num_tokens": 211275571.0, |
| "step": 1416 |
| }, |
| { |
| "epoch": 0.960352422907489, |
| "grad_norm": 0.5330105982182631, |
| "learning_rate": 8.130081300813008e-08, |
| "loss": 0.1613, |
| "mean_token_accuracy": 0.9479814395308495, |
| "num_tokens": 211424065.0, |
| "step": 1417 |
| }, |
| { |
| "epoch": 0.9610301592680447, |
| "grad_norm": 0.4616732204432621, |
| "learning_rate": 7.994579945799458e-08, |
| "loss": 0.1576, |
| "mean_token_accuracy": 0.9496120512485504, |
| "num_tokens": 211573949.0, |
| "step": 1418 |
| }, |
| { |
| "epoch": 0.9617078956286005, |
| "grad_norm": 0.4145030612625962, |
| "learning_rate": 7.859078590785908e-08, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9490218088030815, |
| "num_tokens": 211724572.0, |
| "step": 1419 |
| }, |
| { |
| "epoch": 0.9623856319891562, |
| "grad_norm": 0.7476389113371616, |
| "learning_rate": 7.723577235772358e-08, |
| "loss": 0.1689, |
| "mean_token_accuracy": 0.9456061944365501, |
| "num_tokens": 211874433.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.963063368349712, |
| "grad_norm": 0.4511632542604127, |
| "learning_rate": 7.588075880758806e-08, |
| "loss": 0.1677, |
| "mean_token_accuracy": 0.9462789595127106, |
| "num_tokens": 212024712.0, |
| "step": 1421 |
| }, |
| { |
| "epoch": 0.9637411047102677, |
| "grad_norm": 0.42374716464969436, |
| "learning_rate": 7.452574525745257e-08, |
| "loss": 0.1484, |
| "mean_token_accuracy": 0.9522485435009003, |
| "num_tokens": 212170030.0, |
| "step": 1422 |
| }, |
| { |
| "epoch": 0.9644188410708234, |
| "grad_norm": 0.44484121070820676, |
| "learning_rate": 7.317073170731706e-08, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9475429728627205, |
| "num_tokens": 212318731.0, |
| "step": 1423 |
| }, |
| { |
| "epoch": 0.9650965774313792, |
| "grad_norm": 0.40622925709319363, |
| "learning_rate": 7.181571815718157e-08, |
| "loss": 0.1804, |
| "mean_token_accuracy": 0.9423197209835052, |
| "num_tokens": 212467624.0, |
| "step": 1424 |
| }, |
| { |
| "epoch": 0.9657743137919349, |
| "grad_norm": 0.5531226991163463, |
| "learning_rate": 7.046070460704606e-08, |
| "loss": 0.1574, |
| "mean_token_accuracy": 0.9494439512491226, |
| "num_tokens": 212617827.0, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.9664520501524907, |
| "grad_norm": 0.7000362874893901, |
| "learning_rate": 6.910569105691057e-08, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9473242685198784, |
| "num_tokens": 212765526.0, |
| "step": 1426 |
| }, |
| { |
| "epoch": 0.9671297865130464, |
| "grad_norm": 0.6038971992664033, |
| "learning_rate": 6.775067750677506e-08, |
| "loss": 0.1696, |
| "mean_token_accuracy": 0.9467999115586281, |
| "num_tokens": 212916763.0, |
| "step": 1427 |
| }, |
| { |
| "epoch": 0.9678075228736022, |
| "grad_norm": 0.3759979601319053, |
| "learning_rate": 6.639566395663957e-08, |
| "loss": 0.1564, |
| "mean_token_accuracy": 0.9499575644731522, |
| "num_tokens": 213066865.0, |
| "step": 1428 |
| }, |
| { |
| "epoch": 0.968485259234158, |
| "grad_norm": 0.4273989848141862, |
| "learning_rate": 6.504065040650406e-08, |
| "loss": 0.1498, |
| "mean_token_accuracy": 0.9508862793445587, |
| "num_tokens": 213218281.0, |
| "step": 1429 |
| }, |
| { |
| "epoch": 0.9691629955947136, |
| "grad_norm": 0.3797911718383248, |
| "learning_rate": 6.368563685636856e-08, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9494618847966194, |
| "num_tokens": 213368122.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.9698407319552694, |
| "grad_norm": 1.0814754748509061, |
| "learning_rate": 6.233062330623306e-08, |
| "loss": 0.1591, |
| "mean_token_accuracy": 0.9488984197378159, |
| "num_tokens": 213516074.0, |
| "step": 1431 |
| }, |
| { |
| "epoch": 0.9705184683158251, |
| "grad_norm": 0.45371909333167076, |
| "learning_rate": 6.097560975609756e-08, |
| "loss": 0.1526, |
| "mean_token_accuracy": 0.9507407993078232, |
| "num_tokens": 213661704.0, |
| "step": 1432 |
| }, |
| { |
| "epoch": 0.9711962046763809, |
| "grad_norm": 0.723580284990279, |
| "learning_rate": 5.962059620596206e-08, |
| "loss": 0.1524, |
| "mean_token_accuracy": 0.9498827084898949, |
| "num_tokens": 213810239.0, |
| "step": 1433 |
| }, |
| { |
| "epoch": 0.9718739410369366, |
| "grad_norm": 0.38236820047757286, |
| "learning_rate": 5.8265582655826555e-08, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9473016634583473, |
| "num_tokens": 213960363.0, |
| "step": 1434 |
| }, |
| { |
| "epoch": 0.9725516773974924, |
| "grad_norm": 0.4881616063054681, |
| "learning_rate": 5.6910569105691055e-08, |
| "loss": 0.1548, |
| "mean_token_accuracy": 0.9498519450426102, |
| "num_tokens": 214106047.0, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.9732294137580482, |
| "grad_norm": 0.4358638712210395, |
| "learning_rate": 5.555555555555555e-08, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9484395757317543, |
| "num_tokens": 214253705.0, |
| "step": 1436 |
| }, |
| { |
| "epoch": 0.9739071501186038, |
| "grad_norm": 0.4715153035751415, |
| "learning_rate": 5.4200542005420054e-08, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9482447728514671, |
| "num_tokens": 214404671.0, |
| "step": 1437 |
| }, |
| { |
| "epoch": 0.9745848864791596, |
| "grad_norm": 0.46315128761956975, |
| "learning_rate": 5.2845528455284554e-08, |
| "loss": 0.159, |
| "mean_token_accuracy": 0.9491038843989372, |
| "num_tokens": 214556711.0, |
| "step": 1438 |
| }, |
| { |
| "epoch": 0.9752626228397153, |
| "grad_norm": 0.39816199018947296, |
| "learning_rate": 5.1490514905149054e-08, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.9482710883021355, |
| "num_tokens": 214706564.0, |
| "step": 1439 |
| }, |
| { |
| "epoch": 0.9759403592002711, |
| "grad_norm": 0.5013092964160424, |
| "learning_rate": 5.013550135501355e-08, |
| "loss": 0.1566, |
| "mean_token_accuracy": 0.9488778188824654, |
| "num_tokens": 214852299.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.9766180955608268, |
| "grad_norm": 0.7003737733221979, |
| "learning_rate": 4.878048780487805e-08, |
| "loss": 0.158, |
| "mean_token_accuracy": 0.9493415355682373, |
| "num_tokens": 214999570.0, |
| "step": 1441 |
| }, |
| { |
| "epoch": 0.9772958319213826, |
| "grad_norm": 0.6975526090391319, |
| "learning_rate": 4.742547425474255e-08, |
| "loss": 0.168, |
| "mean_token_accuracy": 0.945052981376648, |
| "num_tokens": 215147006.0, |
| "step": 1442 |
| }, |
| { |
| "epoch": 0.9779735682819384, |
| "grad_norm": 0.4318385766579875, |
| "learning_rate": 4.6070460704607046e-08, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9462201297283173, |
| "num_tokens": 215296933.0, |
| "step": 1443 |
| }, |
| { |
| "epoch": 0.9786513046424941, |
| "grad_norm": 0.4686644205335301, |
| "learning_rate": 4.4715447154471546e-08, |
| "loss": 0.1552, |
| "mean_token_accuracy": 0.9505117833614349, |
| "num_tokens": 215444571.0, |
| "step": 1444 |
| }, |
| { |
| "epoch": 0.9793290410030498, |
| "grad_norm": 0.5516950584174143, |
| "learning_rate": 4.336043360433604e-08, |
| "loss": 0.1595, |
| "mean_token_accuracy": 0.9482467696070671, |
| "num_tokens": 215590422.0, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.9800067773636055, |
| "grad_norm": 0.40927146742899334, |
| "learning_rate": 4.200542005420054e-08, |
| "loss": 0.1526, |
| "mean_token_accuracy": 0.9507933631539345, |
| "num_tokens": 215740373.0, |
| "step": 1446 |
| }, |
| { |
| "epoch": 0.9806845137241613, |
| "grad_norm": 1.3874657143664766, |
| "learning_rate": 4.065040650406504e-08, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9482349902391434, |
| "num_tokens": 215890354.0, |
| "step": 1447 |
| }, |
| { |
| "epoch": 0.981362250084717, |
| "grad_norm": 0.39288518372515746, |
| "learning_rate": 3.929539295392954e-08, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9491913393139839, |
| "num_tokens": 216042293.0, |
| "step": 1448 |
| }, |
| { |
| "epoch": 0.9820399864452728, |
| "grad_norm": 0.4142757769437737, |
| "learning_rate": 3.794037940379403e-08, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9484345316886902, |
| "num_tokens": 216192721.0, |
| "step": 1449 |
| }, |
| { |
| "epoch": 0.9827177228058286, |
| "grad_norm": 0.513825824397314, |
| "learning_rate": 3.658536585365853e-08, |
| "loss": 0.1449, |
| "mean_token_accuracy": 0.9520828351378441, |
| "num_tokens": 216342654.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.9833954591663843, |
| "grad_norm": 0.4753898397147626, |
| "learning_rate": 3.523035230352303e-08, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9478369429707527, |
| "num_tokens": 216491843.0, |
| "step": 1451 |
| }, |
| { |
| "epoch": 0.9840731955269401, |
| "grad_norm": 0.42824168874566454, |
| "learning_rate": 3.387533875338753e-08, |
| "loss": 0.1673, |
| "mean_token_accuracy": 0.9474748447537422, |
| "num_tokens": 216637358.0, |
| "step": 1452 |
| }, |
| { |
| "epoch": 0.9847509318874957, |
| "grad_norm": 0.4235882744447348, |
| "learning_rate": 3.252032520325203e-08, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9481077715754509, |
| "num_tokens": 216780615.0, |
| "step": 1453 |
| }, |
| { |
| "epoch": 0.9854286682480515, |
| "grad_norm": 0.42767560637753677, |
| "learning_rate": 3.116531165311653e-08, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.9475177302956581, |
| "num_tokens": 216933270.0, |
| "step": 1454 |
| }, |
| { |
| "epoch": 0.9861064046086072, |
| "grad_norm": 0.4693239469006344, |
| "learning_rate": 2.981029810298103e-08, |
| "loss": 0.1524, |
| "mean_token_accuracy": 0.9508577063679695, |
| "num_tokens": 217083629.0, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.986784140969163, |
| "grad_norm": 0.383108569963258, |
| "learning_rate": 2.8455284552845527e-08, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9483126401901245, |
| "num_tokens": 217234233.0, |
| "step": 1456 |
| }, |
| { |
| "epoch": 0.9874618773297187, |
| "grad_norm": 1.029164567918242, |
| "learning_rate": 2.7100271002710027e-08, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.9489754140377045, |
| "num_tokens": 217383089.0, |
| "step": 1457 |
| }, |
| { |
| "epoch": 0.9881396136902745, |
| "grad_norm": 0.4222264677475407, |
| "learning_rate": 2.5745257452574527e-08, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9470364972949028, |
| "num_tokens": 217529730.0, |
| "step": 1458 |
| }, |
| { |
| "epoch": 0.9888173500508303, |
| "grad_norm": 0.42748278256522887, |
| "learning_rate": 2.4390243902439023e-08, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.9459366276860237, |
| "num_tokens": 217676151.0, |
| "step": 1459 |
| }, |
| { |
| "epoch": 0.989495086411386, |
| "grad_norm": 0.7811521591588235, |
| "learning_rate": 2.3035230352303523e-08, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.9484688863158226, |
| "num_tokens": 217821534.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.9901728227719417, |
| "grad_norm": 0.4365643706529922, |
| "learning_rate": 2.168021680216802e-08, |
| "loss": 0.1526, |
| "mean_token_accuracy": 0.951580099761486, |
| "num_tokens": 217968730.0, |
| "step": 1461 |
| }, |
| { |
| "epoch": 0.9908505591324974, |
| "grad_norm": 0.41791200977441884, |
| "learning_rate": 2.032520325203252e-08, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9464643821120262, |
| "num_tokens": 218119818.0, |
| "step": 1462 |
| }, |
| { |
| "epoch": 0.9915282954930532, |
| "grad_norm": 0.39231804146546523, |
| "learning_rate": 1.8970189701897016e-08, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9486127719283104, |
| "num_tokens": 218269352.0, |
| "step": 1463 |
| }, |
| { |
| "epoch": 0.9922060318536089, |
| "grad_norm": 0.7878637845281535, |
| "learning_rate": 1.7615176151761516e-08, |
| "loss": 0.1546, |
| "mean_token_accuracy": 0.9497353583574295, |
| "num_tokens": 218416325.0, |
| "step": 1464 |
| }, |
| { |
| "epoch": 0.9928837682141647, |
| "grad_norm": 0.4620398358770985, |
| "learning_rate": 1.6260162601626016e-08, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9468031600117683, |
| "num_tokens": 218561324.0, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.9935615045747205, |
| "grad_norm": 0.5178289109201032, |
| "learning_rate": 1.4905149051490515e-08, |
| "loss": 0.1596, |
| "mean_token_accuracy": 0.9484723061323166, |
| "num_tokens": 218709700.0, |
| "step": 1466 |
| }, |
| { |
| "epoch": 0.9942392409352762, |
| "grad_norm": 0.40435189291017193, |
| "learning_rate": 1.3550135501355014e-08, |
| "loss": 0.157, |
| "mean_token_accuracy": 0.9486423879861832, |
| "num_tokens": 218852741.0, |
| "step": 1467 |
| }, |
| { |
| "epoch": 0.994916977295832, |
| "grad_norm": 0.5105890387057939, |
| "learning_rate": 1.2195121951219512e-08, |
| "loss": 0.1665, |
| "mean_token_accuracy": 0.9470389634370804, |
| "num_tokens": 218999345.0, |
| "step": 1468 |
| }, |
| { |
| "epoch": 0.9955947136563876, |
| "grad_norm": 0.5170350599741357, |
| "learning_rate": 1.084010840108401e-08, |
| "loss": 0.1533, |
| "mean_token_accuracy": 0.9501299187541008, |
| "num_tokens": 219148132.0, |
| "step": 1469 |
| }, |
| { |
| "epoch": 0.9962724500169434, |
| "grad_norm": 0.459774640406125, |
| "learning_rate": 9.485094850948508e-09, |
| "loss": 0.1541, |
| "mean_token_accuracy": 0.949880562722683, |
| "num_tokens": 219295719.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.9969501863774991, |
| "grad_norm": 0.3986996249321926, |
| "learning_rate": 8.130081300813008e-09, |
| "loss": 0.1693, |
| "mean_token_accuracy": 0.9457027688622475, |
| "num_tokens": 219443499.0, |
| "step": 1471 |
| }, |
| { |
| "epoch": 0.9976279227380549, |
| "grad_norm": 0.4286006846978446, |
| "learning_rate": 6.775067750677507e-09, |
| "loss": 0.1535, |
| "mean_token_accuracy": 0.9497988075017929, |
| "num_tokens": 219591042.0, |
| "step": 1472 |
| }, |
| { |
| "epoch": 0.9983056590986107, |
| "grad_norm": 0.5770627540228075, |
| "learning_rate": 5.420054200542005e-09, |
| "loss": 0.1596, |
| "mean_token_accuracy": 0.9486493095755577, |
| "num_tokens": 219741118.0, |
| "step": 1473 |
| }, |
| { |
| "epoch": 0.9989833954591664, |
| "grad_norm": 0.5074089071295785, |
| "learning_rate": 4.065040650406504e-09, |
| "loss": 0.1576, |
| "mean_token_accuracy": 0.9491940215229988, |
| "num_tokens": 219889941.0, |
| "step": 1474 |
| }, |
| { |
| "epoch": 0.9996611318197222, |
| "grad_norm": 0.3843164178142288, |
| "learning_rate": 2.7100271002710025e-09, |
| "loss": 0.1552, |
| "mean_token_accuracy": 0.9503316506743431, |
| "num_tokens": 220044144.0, |
| "step": 1475 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.0722423111346728, |
| "learning_rate": 1.3550135501355012e-09, |
| "loss": 0.1607, |
| "mean_token_accuracy": 0.947552278637886, |
| "num_tokens": 220100184.0, |
| "step": 1476 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 1476, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.436682675001754e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|