{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.991020917892456, "epoch": 0.002177463255307567, "grad_norm": 1.7277425527572632, "learning_rate": 0.0, "loss": 2.1366, "mean_token_accuracy": 0.628267303109169, "num_tokens": 3878.0, "step": 1 }, { "entropy": 0.957662433385849, "epoch": 0.004354926510615134, "grad_norm": 2.072942018508911, "learning_rate": 1.4285714285714285e-05, "loss": 2.1489, "mean_token_accuracy": 0.6321403831243515, "num_tokens": 7754.0, "step": 2 }, { "entropy": 0.9688078463077545, "epoch": 0.0065323897659227, "grad_norm": 1.712500810623169, "learning_rate": 2.857142857142857e-05, "loss": 2.1006, "mean_token_accuracy": 0.6395954489707947, "num_tokens": 11724.0, "step": 3 }, { "entropy": 0.9446051567792892, "epoch": 0.008709853021230268, "grad_norm": 1.6249885559082031, "learning_rate": 4.2857142857142856e-05, "loss": 1.8636, "mean_token_accuracy": 0.6592330932617188, "num_tokens": 15998.0, "step": 4 }, { "entropy": 0.9482486844062805, "epoch": 0.010887316276537834, "grad_norm": 1.2645702362060547, "learning_rate": 5.714285714285714e-05, "loss": 1.6381, "mean_token_accuracy": 0.6715894490480423, "num_tokens": 20034.0, "step": 5 }, { "entropy": 0.8820638656616211, "epoch": 0.0130647795318454, "grad_norm": 0.9111854434013367, "learning_rate": 7.142857142857143e-05, "loss": 1.256, "mean_token_accuracy": 0.7338996976613998, "num_tokens": 24592.0, "step": 6 }, { "entropy": 0.866950273513794, "epoch": 0.015242242787152967, "grad_norm": 0.6964920163154602, "learning_rate": 8.571428571428571e-05, "loss": 1.0385, "mean_token_accuracy": 0.7606792002916336, "num_tokens": 29247.0, "step": 7 }, { "entropy": 0.9245865046977997, "epoch": 0.017419706042460535, "grad_norm": 0.6615565419197083, "learning_rate": 0.0001, "loss": 0.9594, "mean_token_accuracy": 0.7808533608913422, "num_tokens": 33561.0, "step": 8 }, { "entropy": 0.8866463452577591, "epoch": 0.0195971692977681, "grad_norm": 0.5024364590644836, "learning_rate": 0.00011428571428571428, "loss": 0.8709, "mean_token_accuracy": 0.7967555373907089, "num_tokens": 37956.0, "step": 9 }, { "entropy": 0.8838604241609573, "epoch": 0.021774632553075667, "grad_norm": 0.637697696685791, "learning_rate": 0.00012857142857142858, "loss": 0.8448, "mean_token_accuracy": 0.7953355461359024, "num_tokens": 41607.0, "step": 10 }, { "entropy": 0.8180341869592667, "epoch": 0.023952095808383235, "grad_norm": 0.5411834120750427, "learning_rate": 0.00014285714285714287, "loss": 0.7641, "mean_token_accuracy": 0.8057558983564377, "num_tokens": 45872.0, "step": 11 }, { "entropy": 0.6423389464616776, "epoch": 0.0261295590636908, "grad_norm": 0.5807392597198486, "learning_rate": 0.00015714285714285716, "loss": 0.6364, "mean_token_accuracy": 0.8353168815374374, "num_tokens": 50197.0, "step": 12 }, { "entropy": 0.7770279943943024, "epoch": 0.028307022318998367, "grad_norm": 0.602966845035553, "learning_rate": 0.00017142857142857143, "loss": 0.911, "mean_token_accuracy": 0.8115980476140976, "num_tokens": 55436.0, "step": 13 }, { "entropy": 0.6030550897121429, "epoch": 0.030484485574305935, "grad_norm": 0.471264511346817, "learning_rate": 0.00018571428571428572, "loss": 0.6506, "mean_token_accuracy": 0.8220222592353821, "num_tokens": 59509.0, "step": 14 }, { "entropy": 0.5797188133001328, "epoch": 0.0326619488296135, "grad_norm": 0.3981204628944397, "learning_rate": 0.0002, "loss": 0.6439, "mean_token_accuracy": 0.8296175897121429, "num_tokens": 63811.0, "step": 15 }, { "entropy": 0.5227785632014275, "epoch": 0.03483941208492107, "grad_norm": 0.3803451955318451, "learning_rate": 0.00019999776724666853, "loss": 0.5614, "mean_token_accuracy": 0.8536529093980789, "num_tokens": 67933.0, "step": 16 }, { "entropy": 0.5339454486966133, "epoch": 0.037016875340228635, "grad_norm": 0.4023122489452362, "learning_rate": 0.00019999106909745614, "loss": 0.5768, "mean_token_accuracy": 0.8468181490898132, "num_tokens": 71929.0, "step": 17 }, { "entropy": 0.5080433636903763, "epoch": 0.0391943385955362, "grad_norm": 0.359109103679657, "learning_rate": 0.0001999799058847031, "loss": 0.5158, "mean_token_accuracy": 0.8626691251993179, "num_tokens": 76116.0, "step": 18 }, { "entropy": 0.49260225892066956, "epoch": 0.04137180185084377, "grad_norm": 0.34172919392585754, "learning_rate": 0.00019996427816229171, "loss": 0.5121, "mean_token_accuracy": 0.8724553287029266, "num_tokens": 80000.0, "step": 19 }, { "entropy": 0.5065008923411369, "epoch": 0.043549265106151334, "grad_norm": 0.4033750295639038, "learning_rate": 0.00019994418670561857, "loss": 0.5636, "mean_token_accuracy": 0.8592322468757629, "num_tokens": 83682.0, "step": 20 }, { "entropy": 0.54892348498106, "epoch": 0.0457267283614589, "grad_norm": 0.41379520297050476, "learning_rate": 0.00019991963251155627, "loss": 0.5693, "mean_token_accuracy": 0.8495212495326996, "num_tokens": 87684.0, "step": 21 }, { "entropy": 0.4928950071334839, "epoch": 0.04790419161676647, "grad_norm": 0.3717893362045288, "learning_rate": 0.00019989061679840392, "loss": 0.523, "mean_token_accuracy": 0.8606368601322174, "num_tokens": 91550.0, "step": 22 }, { "entropy": 0.5253347381949425, "epoch": 0.050081654872074034, "grad_norm": 0.3741125166416168, "learning_rate": 0.0001998571410058266, "loss": 0.5433, "mean_token_accuracy": 0.8630485236644745, "num_tokens": 95625.0, "step": 23 }, { "entropy": 0.5028375387191772, "epoch": 0.0522591181273816, "grad_norm": 1.8555870056152344, "learning_rate": 0.00019981920679478407, "loss": 0.5296, "mean_token_accuracy": 0.8609876334667206, "num_tokens": 99517.0, "step": 24 }, { "entropy": 0.5414893701672554, "epoch": 0.05443658138268917, "grad_norm": 0.44715237617492676, "learning_rate": 0.00019977681604744824, "loss": 0.5782, "mean_token_accuracy": 0.8441034108400345, "num_tokens": 103204.0, "step": 25 }, { "entropy": 0.48021427541971207, "epoch": 0.056614044637996734, "grad_norm": 0.31098225712776184, "learning_rate": 0.0001997299708671098, "loss": 0.4932, "mean_token_accuracy": 0.8744789958000183, "num_tokens": 107327.0, "step": 26 }, { "entropy": 0.46857017278671265, "epoch": 0.0587915078933043, "grad_norm": 0.3036307692527771, "learning_rate": 0.00019967867357807391, "loss": 0.4791, "mean_token_accuracy": 0.8786111921072006, "num_tokens": 111453.0, "step": 27 }, { "entropy": 0.49031493067741394, "epoch": 0.06096897114861187, "grad_norm": 0.3337958753108978, "learning_rate": 0.00019962292672554493, "loss": 0.5018, "mean_token_accuracy": 0.8619510382413864, "num_tokens": 115266.0, "step": 28 }, { "entropy": 0.4807809889316559, "epoch": 0.06314643440391943, "grad_norm": 0.35365355014801025, "learning_rate": 0.00019956273307549988, "loss": 0.4877, "mean_token_accuracy": 0.8618622571229935, "num_tokens": 118928.0, "step": 29 }, { "entropy": 0.40949247032403946, "epoch": 0.065323897659227, "grad_norm": 0.3226538300514221, "learning_rate": 0.00019949809561455156, "loss": 0.4133, "mean_token_accuracy": 0.8882981538772583, "num_tokens": 122893.0, "step": 30 }, { "entropy": 0.49030745029449463, "epoch": 0.06750136091453457, "grad_norm": 0.33420825004577637, "learning_rate": 0.0001994290175498001, "loss": 0.503, "mean_token_accuracy": 0.8634953200817108, "num_tokens": 127132.0, "step": 31 }, { "entropy": 0.49527500569820404, "epoch": 0.06967882416984214, "grad_norm": 0.4112333655357361, "learning_rate": 0.00019935550230867392, "loss": 0.5067, "mean_token_accuracy": 0.8607686161994934, "num_tokens": 131100.0, "step": 32 }, { "entropy": 0.5203969404101372, "epoch": 0.0718562874251497, "grad_norm": 1.3927068710327148, "learning_rate": 0.00019927755353875965, "loss": 0.5942, "mean_token_accuracy": 0.8566101640462875, "num_tokens": 135503.0, "step": 33 }, { "entropy": 0.5023058727383614, "epoch": 0.07403375068045727, "grad_norm": 0.4079550802707672, "learning_rate": 0.00019919517510762124, "loss": 0.4961, "mean_token_accuracy": 0.8630523085594177, "num_tokens": 139771.0, "step": 34 }, { "entropy": 0.4864235520362854, "epoch": 0.07621121393576484, "grad_norm": 0.39264485239982605, "learning_rate": 0.0001991083711026079, "loss": 0.5009, "mean_token_accuracy": 0.871365949511528, "num_tokens": 143980.0, "step": 35 }, { "entropy": 0.525127612054348, "epoch": 0.0783886771910724, "grad_norm": 0.4305553734302521, "learning_rate": 0.00019901714583065152, "loss": 0.4872, "mean_token_accuracy": 0.8670255392789841, "num_tokens": 148059.0, "step": 36 }, { "entropy": 0.5707878470420837, "epoch": 0.08056614044637997, "grad_norm": 0.44129130244255066, "learning_rate": 0.00019892150381805267, "loss": 0.581, "mean_token_accuracy": 0.844414696097374, "num_tokens": 152230.0, "step": 37 }, { "entropy": 0.5001106485724449, "epoch": 0.08274360370168754, "grad_norm": 0.5918931365013123, "learning_rate": 0.00019882144981025633, "loss": 0.4751, "mean_token_accuracy": 0.8649907559156418, "num_tokens": 156252.0, "step": 38 }, { "entropy": 0.5312293991446495, "epoch": 0.0849210669569951, "grad_norm": 0.4835371971130371, "learning_rate": 0.00019871698877161627, "loss": 0.5091, "mean_token_accuracy": 0.871647521853447, "num_tokens": 160171.0, "step": 39 }, { "entropy": 0.4701843932271004, "epoch": 0.08709853021230267, "grad_norm": 0.48571643233299255, "learning_rate": 0.0001986081258851487, "loss": 0.4495, "mean_token_accuracy": 0.8771228045225143, "num_tokens": 163975.0, "step": 40 }, { "entropy": 0.4694196283817291, "epoch": 0.08927599346761024, "grad_norm": 0.4216046929359436, "learning_rate": 0.00019849486655227532, "loss": 0.4158, "mean_token_accuracy": 0.8735549598932266, "num_tokens": 168421.0, "step": 41 }, { "entropy": 0.4697120413184166, "epoch": 0.0914534567229178, "grad_norm": 0.3664827346801758, "learning_rate": 0.000198377216392555, "loss": 0.4231, "mean_token_accuracy": 0.8784957528114319, "num_tokens": 172395.0, "step": 42 }, { "entropy": 0.44586674869060516, "epoch": 0.09363091997822537, "grad_norm": 0.39455050230026245, "learning_rate": 0.00019825518124340529, "loss": 0.4166, "mean_token_accuracy": 0.8799059689044952, "num_tokens": 175967.0, "step": 43 }, { "entropy": 0.4293370470404625, "epoch": 0.09580838323353294, "grad_norm": 0.39706796407699585, "learning_rate": 0.00019812876715981248, "loss": 0.4522, "mean_token_accuracy": 0.8723510503768921, "num_tokens": 180153.0, "step": 44 }, { "entropy": 0.41641899943351746, "epoch": 0.0979858464888405, "grad_norm": 0.41735970973968506, "learning_rate": 0.00019799798041403137, "loss": 0.4436, "mean_token_accuracy": 0.8725763112306595, "num_tokens": 184161.0, "step": 45 }, { "entropy": 0.40215710550546646, "epoch": 0.10016330974414807, "grad_norm": 0.44639289379119873, "learning_rate": 0.00019786282749527406, "loss": 0.4289, "mean_token_accuracy": 0.8803199082612991, "num_tokens": 187869.0, "step": 46 }, { "entropy": 0.48222628980875015, "epoch": 0.10234077299945564, "grad_norm": 0.4197250306606293, "learning_rate": 0.00019772331510938782, "loss": 0.4861, "mean_token_accuracy": 0.8618861585855484, "num_tokens": 192020.0, "step": 47 }, { "entropy": 0.49629800766706467, "epoch": 0.1045182362547632, "grad_norm": 0.5031387209892273, "learning_rate": 0.00019757945017852258, "loss": 0.4775, "mean_token_accuracy": 0.8681423515081406, "num_tokens": 195514.0, "step": 48 }, { "entropy": 0.3977178856730461, "epoch": 0.10669569951007077, "grad_norm": 0.4578983783721924, "learning_rate": 0.0001974312398407873, "loss": 0.3673, "mean_token_accuracy": 0.8914825022220612, "num_tokens": 199234.0, "step": 49 }, { "entropy": 0.3965229466557503, "epoch": 0.10887316276537834, "grad_norm": 0.37602174282073975, "learning_rate": 0.0001972786914498958, "loss": 0.3953, "mean_token_accuracy": 0.8783656060695648, "num_tokens": 203760.0, "step": 50 }, { "entropy": 0.42161373794078827, "epoch": 0.1110506260206859, "grad_norm": 0.3125810921192169, "learning_rate": 0.00019712181257480212, "loss": 0.3754, "mean_token_accuracy": 0.8832796663045883, "num_tokens": 207439.0, "step": 51 }, { "entropy": 0.4191659912467003, "epoch": 0.11322808927599347, "grad_norm": 0.32242998480796814, "learning_rate": 0.00019696061099932471, "loss": 0.3861, "mean_token_accuracy": 0.8820012956857681, "num_tokens": 211708.0, "step": 52 }, { "entropy": 0.464703693985939, "epoch": 0.11540555253130104, "grad_norm": 0.4021685719490051, "learning_rate": 0.00019679509472176032, "loss": 0.4384, "mean_token_accuracy": 0.8743875622749329, "num_tokens": 215763.0, "step": 53 }, { "entropy": 0.4165603965520859, "epoch": 0.1175830157866086, "grad_norm": 0.3444255590438843, "learning_rate": 0.00019662527195448722, "loss": 0.3991, "mean_token_accuracy": 0.88118776679039, "num_tokens": 220090.0, "step": 54 }, { "entropy": 0.4068721905350685, "epoch": 0.11976047904191617, "grad_norm": 0.3705560564994812, "learning_rate": 0.00019645115112355754, "loss": 0.3707, "mean_token_accuracy": 0.882274329662323, "num_tokens": 223672.0, "step": 55 }, { "entropy": 0.3627975210547447, "epoch": 0.12193794229722374, "grad_norm": 0.37365177273750305, "learning_rate": 0.00019627274086827948, "loss": 0.36, "mean_token_accuracy": 0.8874702304601669, "num_tokens": 227497.0, "step": 56 }, { "entropy": 0.40359440445899963, "epoch": 0.1241154055525313, "grad_norm": 0.33996060490608215, "learning_rate": 0.00019609005004078838, "loss": 0.4253, "mean_token_accuracy": 0.8732311725616455, "num_tokens": 231293.0, "step": 57 }, { "entropy": 0.36641839146614075, "epoch": 0.12629286880783885, "grad_norm": 0.2762836515903473, "learning_rate": 0.00019590308770560763, "loss": 0.3485, "mean_token_accuracy": 0.8926344960927963, "num_tokens": 236001.0, "step": 58 }, { "entropy": 0.4077141284942627, "epoch": 0.12847033206314643, "grad_norm": 0.2915239930152893, "learning_rate": 0.00019571186313919895, "loss": 0.3942, "mean_token_accuracy": 0.8783977180719376, "num_tokens": 240264.0, "step": 59 }, { "entropy": 0.4022030830383301, "epoch": 0.130647795318454, "grad_norm": 0.3684654235839844, "learning_rate": 0.00019551638582950213, "loss": 0.412, "mean_token_accuracy": 0.8735997825860977, "num_tokens": 243854.0, "step": 60 }, { "entropy": 0.41812095791101456, "epoch": 0.13282525857376157, "grad_norm": 0.3383813500404358, "learning_rate": 0.00019531666547546403, "loss": 0.4302, "mean_token_accuracy": 0.8795482665300369, "num_tokens": 247268.0, "step": 61 }, { "entropy": 0.38665496557950974, "epoch": 0.13500272182906914, "grad_norm": 0.31561279296875, "learning_rate": 0.0001951127119865578, "loss": 0.3844, "mean_token_accuracy": 0.8816228210926056, "num_tokens": 251256.0, "step": 62 }, { "entropy": 0.4358583614230156, "epoch": 0.1371801850843767, "grad_norm": 0.3552601933479309, "learning_rate": 0.00019490453548229075, "loss": 0.4193, "mean_token_accuracy": 0.8728261440992355, "num_tokens": 255350.0, "step": 63 }, { "entropy": 0.40031400322914124, "epoch": 0.13935764833968428, "grad_norm": 0.30350831151008606, "learning_rate": 0.00019469214629170246, "loss": 0.4005, "mean_token_accuracy": 0.8818740844726562, "num_tokens": 259391.0, "step": 64 }, { "entropy": 0.3782212808728218, "epoch": 0.14153511159499182, "grad_norm": 0.2870739996433258, "learning_rate": 0.00019447555495285247, "loss": 0.3396, "mean_token_accuracy": 0.8948279619216919, "num_tokens": 263599.0, "step": 65 }, { "entropy": 0.41549866646528244, "epoch": 0.1437125748502994, "grad_norm": 0.2995204031467438, "learning_rate": 0.00019425477221229694, "loss": 0.394, "mean_token_accuracy": 0.8853535056114197, "num_tokens": 267514.0, "step": 66 }, { "entropy": 0.40607404708862305, "epoch": 0.14589003810560697, "grad_norm": 0.3016026020050049, "learning_rate": 0.00019402980902455592, "loss": 0.4006, "mean_token_accuracy": 0.8783000707626343, "num_tokens": 271156.0, "step": 67 }, { "entropy": 0.3719393089413643, "epoch": 0.14806750136091454, "grad_norm": 0.26128438115119934, "learning_rate": 0.00019380067655156956, "loss": 0.3537, "mean_token_accuracy": 0.8965920209884644, "num_tokens": 275317.0, "step": 68 }, { "entropy": 0.42157839983701706, "epoch": 0.1502449646162221, "grad_norm": 0.3250483572483063, "learning_rate": 0.00019356738616214435, "loss": 0.4115, "mean_token_accuracy": 0.8846541047096252, "num_tokens": 279424.0, "step": 69 }, { "entropy": 0.4183052033185959, "epoch": 0.15242242787152968, "grad_norm": 0.315361887216568, "learning_rate": 0.00019332994943138906, "loss": 0.4148, "mean_token_accuracy": 0.8700041323900223, "num_tokens": 283564.0, "step": 70 }, { "entropy": 0.40483858436346054, "epoch": 0.15459989112683722, "grad_norm": 0.31096142530441284, "learning_rate": 0.00019308837814014038, "loss": 0.3835, "mean_token_accuracy": 0.8849562704563141, "num_tokens": 287357.0, "step": 71 }, { "entropy": 0.39035435765981674, "epoch": 0.1567773543821448, "grad_norm": 0.3067997097969055, "learning_rate": 0.0001928426842743784, "loss": 0.3846, "mean_token_accuracy": 0.8829791098833084, "num_tokens": 291390.0, "step": 72 }, { "entropy": 0.3541962653398514, "epoch": 0.15895481763745237, "grad_norm": 0.27743661403656006, "learning_rate": 0.000192592880024632, "loss": 0.3279, "mean_token_accuracy": 0.8986150324344635, "num_tokens": 295446.0, "step": 73 }, { "entropy": 0.4067593812942505, "epoch": 0.16113228089275994, "grad_norm": 0.2917785346508026, "learning_rate": 0.00019233897778537387, "loss": 0.4056, "mean_token_accuracy": 0.8775222897529602, "num_tokens": 299884.0, "step": 74 }, { "entropy": 0.3865869492292404, "epoch": 0.1633097441480675, "grad_norm": 0.3175944685935974, "learning_rate": 0.00019208099015440553, "loss": 0.3947, "mean_token_accuracy": 0.8831316977739334, "num_tokens": 303679.0, "step": 75 }, { "entropy": 0.42061641067266464, "epoch": 0.16548720740337508, "grad_norm": 0.29020923376083374, "learning_rate": 0.00019181892993223241, "loss": 0.424, "mean_token_accuracy": 0.8717161864042282, "num_tokens": 308028.0, "step": 76 }, { "entropy": 0.3790237084031105, "epoch": 0.16766467065868262, "grad_norm": 0.28459441661834717, "learning_rate": 0.00019155281012142857, "loss": 0.3669, "mean_token_accuracy": 0.8902580589056015, "num_tokens": 312280.0, "step": 77 }, { "entropy": 0.4007532522082329, "epoch": 0.1698421339139902, "grad_norm": 0.2907998263835907, "learning_rate": 0.00019128264392599166, "loss": 0.421, "mean_token_accuracy": 0.8734158575534821, "num_tokens": 316050.0, "step": 78 }, { "entropy": 0.38431502133607864, "epoch": 0.17201959716929777, "grad_norm": 0.2705579102039337, "learning_rate": 0.00019100844475068777, "loss": 0.3687, "mean_token_accuracy": 0.8934948295354843, "num_tokens": 319866.0, "step": 79 }, { "entropy": 0.4128147065639496, "epoch": 0.17419706042460534, "grad_norm": 0.3151399493217468, "learning_rate": 0.0001907302262003863, "loss": 0.3829, "mean_token_accuracy": 0.8834633827209473, "num_tokens": 323982.0, "step": 80 }, { "entropy": 0.4086031913757324, "epoch": 0.1763745236799129, "grad_norm": 0.3054238557815552, "learning_rate": 0.00019044800207938483, "loss": 0.3987, "mean_token_accuracy": 0.8847066015005112, "num_tokens": 327984.0, "step": 81 }, { "entropy": 0.3883258253335953, "epoch": 0.17855198693522048, "grad_norm": 0.29092952609062195, "learning_rate": 0.00019016178639072448, "loss": 0.3799, "mean_token_accuracy": 0.8958835899829865, "num_tokens": 331502.0, "step": 82 }, { "entropy": 0.41453375667333603, "epoch": 0.18072945019052802, "grad_norm": 0.279079407453537, "learning_rate": 0.0001898715933354948, "loss": 0.4303, "mean_token_accuracy": 0.879971370100975, "num_tokens": 335369.0, "step": 83 }, { "entropy": 0.395871065557003, "epoch": 0.1829069134458356, "grad_norm": 0.2992061972618103, "learning_rate": 0.0001895774373121294, "loss": 0.3933, "mean_token_accuracy": 0.8855740427970886, "num_tokens": 339407.0, "step": 84 }, { "entropy": 0.352156363427639, "epoch": 0.18508437670114317, "grad_norm": 0.29319193959236145, "learning_rate": 0.00018927933291569142, "loss": 0.3458, "mean_token_accuracy": 0.8971658796072006, "num_tokens": 343524.0, "step": 85 }, { "entropy": 0.3487248420715332, "epoch": 0.18726183995645074, "grad_norm": 0.2763819694519043, "learning_rate": 0.00018897729493714936, "loss": 0.3259, "mean_token_accuracy": 0.8960808515548706, "num_tokens": 347925.0, "step": 86 }, { "entropy": 0.4102029874920845, "epoch": 0.1894393032117583, "grad_norm": 0.2646510601043701, "learning_rate": 0.00018867133836264333, "loss": 0.3945, "mean_token_accuracy": 0.8839164674282074, "num_tokens": 352250.0, "step": 87 }, { "entropy": 0.3762153908610344, "epoch": 0.19161676646706588, "grad_norm": 0.3275756239891052, "learning_rate": 0.00018836147837274128, "loss": 0.3588, "mean_token_accuracy": 0.893315777182579, "num_tokens": 356538.0, "step": 88 }, { "entropy": 0.3680166006088257, "epoch": 0.19379422972237342, "grad_norm": 0.3026663362979889, "learning_rate": 0.00018804773034168605, "loss": 0.346, "mean_token_accuracy": 0.8997195810079575, "num_tokens": 360352.0, "step": 89 }, { "entropy": 0.3681929111480713, "epoch": 0.195971692977681, "grad_norm": 0.27409690618515015, "learning_rate": 0.00018773010983663235, "loss": 0.3619, "mean_token_accuracy": 0.8918221592903137, "num_tokens": 364359.0, "step": 90 }, { "entropy": 0.41026338934898376, "epoch": 0.19814915623298857, "grad_norm": 0.27450209856033325, "learning_rate": 0.00018740863261687438, "loss": 0.3772, "mean_token_accuracy": 0.885251596570015, "num_tokens": 368184.0, "step": 91 }, { "entropy": 0.41991668939590454, "epoch": 0.20032661948829614, "grad_norm": 0.3204193413257599, "learning_rate": 0.000187083314633064, "loss": 0.4387, "mean_token_accuracy": 0.877353847026825, "num_tokens": 372188.0, "step": 92 }, { "entropy": 0.3829573169350624, "epoch": 0.2025040827436037, "grad_norm": 0.2948894500732422, "learning_rate": 0.00018675417202641928, "loss": 0.3713, "mean_token_accuracy": 0.8871684223413467, "num_tokens": 376175.0, "step": 93 }, { "entropy": 0.37284964323043823, "epoch": 0.20468154599891128, "grad_norm": 0.3094096779823303, "learning_rate": 0.00018642122112792352, "loss": 0.3704, "mean_token_accuracy": 0.8872140049934387, "num_tokens": 380212.0, "step": 94 }, { "entropy": 0.3658677488565445, "epoch": 0.20685900925421882, "grad_norm": 0.2979802191257477, "learning_rate": 0.00018608447845751521, "loss": 0.3491, "mean_token_accuracy": 0.8897504657506943, "num_tokens": 384295.0, "step": 95 }, { "entropy": 0.36876438558101654, "epoch": 0.2090364725095264, "grad_norm": 0.2677754759788513, "learning_rate": 0.00018574396072326807, "loss": 0.3441, "mean_token_accuracy": 0.894922137260437, "num_tokens": 388732.0, "step": 96 }, { "entropy": 0.3612924814224243, "epoch": 0.21121393576483397, "grad_norm": 0.2736094892024994, "learning_rate": 0.0001853996848205622, "loss": 0.3723, "mean_token_accuracy": 0.8909705579280853, "num_tokens": 392764.0, "step": 97 }, { "entropy": 0.3905804604291916, "epoch": 0.21339139902014154, "grad_norm": 0.2624414265155792, "learning_rate": 0.0001850516678312458, "loss": 0.3891, "mean_token_accuracy": 0.8835895210504532, "num_tokens": 397014.0, "step": 98 }, { "entropy": 0.3591335415840149, "epoch": 0.2155688622754491, "grad_norm": 0.27455052733421326, "learning_rate": 0.0001846999270227876, "loss": 0.3285, "mean_token_accuracy": 0.9014366716146469, "num_tokens": 400931.0, "step": 99 }, { "entropy": 0.3889941945672035, "epoch": 0.21774632553075668, "grad_norm": 0.3075306713581085, "learning_rate": 0.00018434447984742012, "loss": 0.3748, "mean_token_accuracy": 0.8902212232351303, "num_tokens": 404953.0, "step": 100 }, { "entropy": 0.40706127136945724, "epoch": 0.21992378878606422, "grad_norm": 0.291089802980423, "learning_rate": 0.00018398534394127366, "loss": 0.3842, "mean_token_accuracy": 0.8786927759647369, "num_tokens": 408846.0, "step": 101 }, { "entropy": 0.3662910833954811, "epoch": 0.2221012520413718, "grad_norm": 0.2830312252044678, "learning_rate": 0.00018362253712350131, "loss": 0.3651, "mean_token_accuracy": 0.8856998383998871, "num_tokens": 413058.0, "step": 102 }, { "entropy": 0.3981722518801689, "epoch": 0.22427871529667937, "grad_norm": 0.26717105507850647, "learning_rate": 0.00018325607739539497, "loss": 0.4013, "mean_token_accuracy": 0.881842851638794, "num_tokens": 417404.0, "step": 103 }, { "entropy": 0.38402143120765686, "epoch": 0.22645617855198694, "grad_norm": 0.26284581422805786, "learning_rate": 0.00018288598293949185, "loss": 0.3933, "mean_token_accuracy": 0.8858134895563126, "num_tokens": 421886.0, "step": 104 }, { "entropy": 0.35189586132764816, "epoch": 0.2286336418072945, "grad_norm": 0.2981458604335785, "learning_rate": 0.00018251227211867264, "loss": 0.3779, "mean_token_accuracy": 0.8904144316911697, "num_tokens": 426069.0, "step": 105 }, { "entropy": 0.3991141989827156, "epoch": 0.23081110506260208, "grad_norm": 0.30855289101600647, "learning_rate": 0.0001821349634752502, "loss": 0.4118, "mean_token_accuracy": 0.875004380941391, "num_tokens": 430019.0, "step": 106 }, { "entropy": 0.3846806064248085, "epoch": 0.23298856831790962, "grad_norm": 0.25153040885925293, "learning_rate": 0.00018175407573004974, "loss": 0.3944, "mean_token_accuracy": 0.8794781714677811, "num_tokens": 434787.0, "step": 107 }, { "entropy": 0.38610684871673584, "epoch": 0.2351660315732172, "grad_norm": 0.25855541229248047, "learning_rate": 0.00018136962778147965, "loss": 0.3625, "mean_token_accuracy": 0.895257756114006, "num_tokens": 438762.0, "step": 108 }, { "entropy": 0.38023480772972107, "epoch": 0.23734349482852476, "grad_norm": 0.26064959168434143, "learning_rate": 0.00018098163870459419, "loss": 0.3508, "mean_token_accuracy": 0.8982452154159546, "num_tokens": 442358.0, "step": 109 }, { "entropy": 0.38109494745731354, "epoch": 0.23952095808383234, "grad_norm": 0.2560478746891022, "learning_rate": 0.00018059012775014673, "loss": 0.3316, "mean_token_accuracy": 0.8920884728431702, "num_tokens": 446375.0, "step": 110 }, { "entropy": 0.40175357460975647, "epoch": 0.2416984213391399, "grad_norm": 0.2690741717815399, "learning_rate": 0.00018019511434363479, "loss": 0.3694, "mean_token_accuracy": 0.8843608647584915, "num_tokens": 450240.0, "step": 111 }, { "entropy": 0.4437231123447418, "epoch": 0.24387588459444748, "grad_norm": 0.3393898606300354, "learning_rate": 0.00017979661808433615, "loss": 0.4375, "mean_token_accuracy": 0.8717398643493652, "num_tokens": 454162.0, "step": 112 }, { "entropy": 0.39301927387714386, "epoch": 0.24605334784975502, "grad_norm": 0.26305022835731506, "learning_rate": 0.00017939465874433633, "loss": 0.3915, "mean_token_accuracy": 0.8859032839536667, "num_tokens": 458075.0, "step": 113 }, { "entropy": 0.37585896253585815, "epoch": 0.2482308111050626, "grad_norm": 0.2808936536312103, "learning_rate": 0.0001789892562675477, "loss": 0.3808, "mean_token_accuracy": 0.8814007937908173, "num_tokens": 462440.0, "step": 114 }, { "entropy": 0.35389212518930435, "epoch": 0.25040827436037016, "grad_norm": 0.2638992667198181, "learning_rate": 0.0001785804307687199, "loss": 0.3669, "mean_token_accuracy": 0.8885058760643005, "num_tokens": 466896.0, "step": 115 }, { "entropy": 0.32084520161151886, "epoch": 0.2525857376156777, "grad_norm": 0.2875458896160126, "learning_rate": 0.00017816820253244156, "loss": 0.3393, "mean_token_accuracy": 0.8992051929235458, "num_tokens": 470737.0, "step": 116 }, { "entropy": 0.37875620275735855, "epoch": 0.2547632008709853, "grad_norm": 0.3010421693325043, "learning_rate": 0.0001777525920121343, "loss": 0.3771, "mean_token_accuracy": 0.8866951763629913, "num_tokens": 474704.0, "step": 117 }, { "entropy": 0.3695053979754448, "epoch": 0.25694066412629285, "grad_norm": 0.28365740180015564, "learning_rate": 0.0001773336198290375, "loss": 0.3606, "mean_token_accuracy": 0.8899102210998535, "num_tokens": 478684.0, "step": 118 }, { "entropy": 0.37022798508405685, "epoch": 0.25911812738160045, "grad_norm": 0.2810768187046051, "learning_rate": 0.00017691130677118533, "loss": 0.371, "mean_token_accuracy": 0.8898769170045853, "num_tokens": 482795.0, "step": 119 }, { "entropy": 0.3846744894981384, "epoch": 0.261295590636908, "grad_norm": 0.2767440974712372, "learning_rate": 0.00017648567379237524, "loss": 0.3858, "mean_token_accuracy": 0.8894098848104477, "num_tokens": 486910.0, "step": 120 }, { "entropy": 0.36647915840148926, "epoch": 0.2634730538922156, "grad_norm": 0.29192766547203064, "learning_rate": 0.00017605674201112844, "loss": 0.3532, "mean_token_accuracy": 0.8931601047515869, "num_tokens": 490909.0, "step": 121 }, { "entropy": 0.3607020005583763, "epoch": 0.26565051714752314, "grad_norm": 0.27455756068229675, "learning_rate": 0.00017562453270964184, "loss": 0.3376, "mean_token_accuracy": 0.8977847099304199, "num_tokens": 494900.0, "step": 122 }, { "entropy": 0.39875783771276474, "epoch": 0.2678279804028307, "grad_norm": 0.29144948720932007, "learning_rate": 0.0001751890673327323, "loss": 0.3625, "mean_token_accuracy": 0.8899316191673279, "num_tokens": 498621.0, "step": 123 }, { "entropy": 0.388169527053833, "epoch": 0.2700054436581383, "grad_norm": 0.28327831625938416, "learning_rate": 0.00017475036748677253, "loss": 0.368, "mean_token_accuracy": 0.8881956189870834, "num_tokens": 502604.0, "step": 124 }, { "entropy": 0.42279627174139023, "epoch": 0.2721829069134458, "grad_norm": 0.2637234330177307, "learning_rate": 0.00017430845493861903, "loss": 0.4163, "mean_token_accuracy": 0.8793482929468155, "num_tokens": 506851.0, "step": 125 }, { "entropy": 0.3659377843141556, "epoch": 0.2743603701687534, "grad_norm": 0.2649920582771301, "learning_rate": 0.00017386335161453204, "loss": 0.3592, "mean_token_accuracy": 0.8870955407619476, "num_tokens": 511029.0, "step": 126 }, { "entropy": 0.3424355015158653, "epoch": 0.27653783342406096, "grad_norm": 0.24584396183490753, "learning_rate": 0.00017341507959908788, "loss": 0.3212, "mean_token_accuracy": 0.8989846706390381, "num_tokens": 514975.0, "step": 127 }, { "entropy": 0.38080035150051117, "epoch": 0.27871529667936856, "grad_norm": 0.2918618321418762, "learning_rate": 0.00017296366113408283, "loss": 0.3836, "mean_token_accuracy": 0.8840546309947968, "num_tokens": 518603.0, "step": 128 }, { "entropy": 0.37054024636745453, "epoch": 0.2808927599346761, "grad_norm": 0.2792854309082031, "learning_rate": 0.00017250911861742984, "loss": 0.383, "mean_token_accuracy": 0.8847608417272568, "num_tokens": 522974.0, "step": 129 }, { "entropy": 0.4149508401751518, "epoch": 0.28307022318998365, "grad_norm": 0.2900242805480957, "learning_rate": 0.00017205147460204708, "loss": 0.4176, "mean_token_accuracy": 0.8743131309747696, "num_tokens": 527053.0, "step": 130 }, { "entropy": 0.3568470776081085, "epoch": 0.28524768644529125, "grad_norm": 0.2806275188922882, "learning_rate": 0.00017159075179473904, "loss": 0.3506, "mean_token_accuracy": 0.8944987952709198, "num_tokens": 531165.0, "step": 131 }, { "entropy": 0.3553621917963028, "epoch": 0.2874251497005988, "grad_norm": 0.25992849469184875, "learning_rate": 0.00017112697305506972, "loss": 0.3473, "mean_token_accuracy": 0.8974603414535522, "num_tokens": 535268.0, "step": 132 }, { "entropy": 0.350556381046772, "epoch": 0.2896026129559064, "grad_norm": 0.255686491727829, "learning_rate": 0.00017066016139422868, "loss": 0.3428, "mean_token_accuracy": 0.8938136249780655, "num_tokens": 539608.0, "step": 133 }, { "entropy": 0.3975898027420044, "epoch": 0.29178007621121393, "grad_norm": 0.2862681746482849, "learning_rate": 0.00017019033997388893, "loss": 0.3852, "mean_token_accuracy": 0.8919837325811386, "num_tokens": 543509.0, "step": 134 }, { "entropy": 0.3602987751364708, "epoch": 0.2939575394665215, "grad_norm": 0.2506209909915924, "learning_rate": 0.00016971753210505815, "loss": 0.3512, "mean_token_accuracy": 0.8999500423669815, "num_tokens": 548201.0, "step": 135 }, { "entropy": 0.36172477155923843, "epoch": 0.2961350027218291, "grad_norm": 0.24992506206035614, "learning_rate": 0.00016924176124692171, "loss": 0.3296, "mean_token_accuracy": 0.9002155065536499, "num_tokens": 552588.0, "step": 136 }, { "entropy": 0.39114704728126526, "epoch": 0.2983124659771366, "grad_norm": 0.26535582542419434, "learning_rate": 0.00016876305100567898, "loss": 0.3606, "mean_token_accuracy": 0.8913624733686447, "num_tokens": 556684.0, "step": 137 }, { "entropy": 0.3595954030752182, "epoch": 0.3004899292324442, "grad_norm": 0.2526366114616394, "learning_rate": 0.0001682814251333718, "loss": 0.3524, "mean_token_accuracy": 0.8964285999536514, "num_tokens": 560872.0, "step": 138 }, { "entropy": 0.3456057384610176, "epoch": 0.30266739248775176, "grad_norm": 0.2838667631149292, "learning_rate": 0.0001677969075267062, "loss": 0.3598, "mean_token_accuracy": 0.8893538117408752, "num_tokens": 565414.0, "step": 139 }, { "entropy": 0.3304522782564163, "epoch": 0.30484485574305936, "grad_norm": 0.2537218928337097, "learning_rate": 0.00016730952222586672, "loss": 0.3252, "mean_token_accuracy": 0.9008310884237289, "num_tokens": 569961.0, "step": 140 }, { "entropy": 0.37971338629722595, "epoch": 0.3070223189983669, "grad_norm": 0.2846769392490387, "learning_rate": 0.00016681929341332333, "loss": 0.3812, "mean_token_accuracy": 0.8877308219671249, "num_tokens": 573882.0, "step": 141 }, { "entropy": 0.32383736968040466, "epoch": 0.30919978225367445, "grad_norm": 0.30265504121780396, "learning_rate": 0.00016632624541263193, "loss": 0.3259, "mean_token_accuracy": 0.8970090597867966, "num_tokens": 577860.0, "step": 142 }, { "entropy": 0.4320111721754074, "epoch": 0.31137724550898205, "grad_norm": 0.2903831899166107, "learning_rate": 0.0001658304026872274, "loss": 0.4118, "mean_token_accuracy": 0.8787370920181274, "num_tokens": 581333.0, "step": 143 }, { "entropy": 0.372535839676857, "epoch": 0.3135547087642896, "grad_norm": 0.26929277181625366, "learning_rate": 0.00016533178983920964, "loss": 0.3555, "mean_token_accuracy": 0.8883365392684937, "num_tokens": 585459.0, "step": 144 }, { "entropy": 0.38039466738700867, "epoch": 0.3157321720195972, "grad_norm": 0.2679445743560791, "learning_rate": 0.00016483043160812295, "loss": 0.3633, "mean_token_accuracy": 0.8902519345283508, "num_tokens": 589257.0, "step": 145 }, { "entropy": 0.42324574291706085, "epoch": 0.31790963527490473, "grad_norm": 0.2745194137096405, "learning_rate": 0.0001643263528697288, "loss": 0.4154, "mean_token_accuracy": 0.878746971487999, "num_tokens": 593457.0, "step": 146 }, { "entropy": 0.46310587227344513, "epoch": 0.3200870985302123, "grad_norm": 0.2937363088130951, "learning_rate": 0.0001638195786347712, "loss": 0.4564, "mean_token_accuracy": 0.8730504065752029, "num_tokens": 596979.0, "step": 147 }, { "entropy": 0.3750259429216385, "epoch": 0.3222645617855199, "grad_norm": 0.24124816060066223, "learning_rate": 0.00016331013404773597, "loss": 0.3568, "mean_token_accuracy": 0.8933057188987732, "num_tokens": 601388.0, "step": 148 }, { "entropy": 0.37991973757743835, "epoch": 0.3244420250408274, "grad_norm": 0.27898603677749634, "learning_rate": 0.00016279804438560304, "loss": 0.3518, "mean_token_accuracy": 0.8888091742992401, "num_tokens": 605267.0, "step": 149 }, { "entropy": 0.38875921070575714, "epoch": 0.326619488296135, "grad_norm": 0.2823559641838074, "learning_rate": 0.00016228333505659246, "loss": 0.376, "mean_token_accuracy": 0.8856324106454849, "num_tokens": 609434.0, "step": 150 }, { "entropy": 0.3876258060336113, "epoch": 0.32879695155144256, "grad_norm": 0.2898506224155426, "learning_rate": 0.00016176603159890346, "loss": 0.376, "mean_token_accuracy": 0.8831023424863815, "num_tokens": 613396.0, "step": 151 }, { "entropy": 0.3707014173269272, "epoch": 0.33097441480675016, "grad_norm": 0.2642916142940521, "learning_rate": 0.00016124615967944762, "loss": 0.3752, "mean_token_accuracy": 0.8911104500293732, "num_tokens": 617399.0, "step": 152 }, { "entropy": 0.3736526593565941, "epoch": 0.3331518780620577, "grad_norm": 0.3004290461540222, "learning_rate": 0.00016072374509257516, "loss": 0.3808, "mean_token_accuracy": 0.8887975662946701, "num_tokens": 621104.0, "step": 153 }, { "entropy": 0.35118088871240616, "epoch": 0.33532934131736525, "grad_norm": 0.26038020849227905, "learning_rate": 0.0001601988137587952, "loss": 0.3382, "mean_token_accuracy": 0.8998311161994934, "num_tokens": 625151.0, "step": 154 }, { "entropy": 0.38535889238119125, "epoch": 0.33750680457267285, "grad_norm": 0.2737407088279724, "learning_rate": 0.00015967139172348954, "loss": 0.3913, "mean_token_accuracy": 0.8854628801345825, "num_tokens": 628964.0, "step": 155 }, { "entropy": 0.38133371621370316, "epoch": 0.3396842678279804, "grad_norm": 0.27977254986763, "learning_rate": 0.00015914150515562055, "loss": 0.3794, "mean_token_accuracy": 0.8869093209505081, "num_tokens": 632846.0, "step": 156 }, { "entropy": 0.37492088973522186, "epoch": 0.341861731083288, "grad_norm": 0.2831854224205017, "learning_rate": 0.00015860918034643276, "loss": 0.355, "mean_token_accuracy": 0.8947048038244247, "num_tokens": 636601.0, "step": 157 }, { "entropy": 0.4035057872533798, "epoch": 0.34403919433859553, "grad_norm": 0.37472277879714966, "learning_rate": 0.00015807444370814815, "loss": 0.3954, "mean_token_accuracy": 0.8825927823781967, "num_tokens": 640518.0, "step": 158 }, { "entropy": 0.34154055267572403, "epoch": 0.3462166575939031, "grad_norm": 0.27869144082069397, "learning_rate": 0.00015753732177265582, "loss": 0.3376, "mean_token_accuracy": 0.8913106769323349, "num_tokens": 644858.0, "step": 159 }, { "entropy": 0.41696153581142426, "epoch": 0.3483941208492107, "grad_norm": 0.291029155254364, "learning_rate": 0.00015699784119019554, "loss": 0.3964, "mean_token_accuracy": 0.8756668865680695, "num_tokens": 648735.0, "step": 160 }, { "entropy": 0.3924735262989998, "epoch": 0.3505715841045182, "grad_norm": 0.28552576899528503, "learning_rate": 0.00015645602872803554, "loss": 0.3852, "mean_token_accuracy": 0.8868783414363861, "num_tokens": 652408.0, "step": 161 }, { "entropy": 0.34768833965063095, "epoch": 0.3527490473598258, "grad_norm": 0.2506498098373413, "learning_rate": 0.00015591191126914424, "loss": 0.3351, "mean_token_accuracy": 0.8980260044336319, "num_tokens": 656844.0, "step": 162 }, { "entropy": 0.3891329765319824, "epoch": 0.35492651061513336, "grad_norm": 0.30480027198791504, "learning_rate": 0.0001553655158108565, "loss": 0.4034, "mean_token_accuracy": 0.8790914118289948, "num_tokens": 661184.0, "step": 163 }, { "entropy": 0.4067026600241661, "epoch": 0.35710397387044096, "grad_norm": 0.27617979049682617, "learning_rate": 0.00015481686946353413, "loss": 0.4081, "mean_token_accuracy": 0.8769482225179672, "num_tokens": 665163.0, "step": 164 }, { "entropy": 0.4310021921992302, "epoch": 0.3592814371257485, "grad_norm": 0.2954219877719879, "learning_rate": 0.00015426599944922062, "loss": 0.4193, "mean_token_accuracy": 0.8807303011417389, "num_tokens": 669177.0, "step": 165 }, { "entropy": 0.37181543558835983, "epoch": 0.36145890038105605, "grad_norm": 0.2674584984779358, "learning_rate": 0.0001537129331002907, "loss": 0.3423, "mean_token_accuracy": 0.8933178037405014, "num_tokens": 672660.0, "step": 166 }, { "entropy": 0.36294087767601013, "epoch": 0.36363636363636365, "grad_norm": 0.2539677321910858, "learning_rate": 0.00015315769785809394, "loss": 0.3419, "mean_token_accuracy": 0.8953043073415756, "num_tokens": 676937.0, "step": 167 }, { "entropy": 0.36527111381292343, "epoch": 0.3658138268916712, "grad_norm": 0.279691219329834, "learning_rate": 0.0001526003212715934, "loss": 0.3689, "mean_token_accuracy": 0.8915591537952423, "num_tokens": 680798.0, "step": 168 }, { "entropy": 0.32713668793439865, "epoch": 0.3679912901469788, "grad_norm": 0.2610296308994293, "learning_rate": 0.00015204083099599862, "loss": 0.3398, "mean_token_accuracy": 0.8963142186403275, "num_tokens": 685386.0, "step": 169 }, { "entropy": 0.35941240191459656, "epoch": 0.37016875340228633, "grad_norm": 0.26744726300239563, "learning_rate": 0.00015147925479139357, "loss": 0.3543, "mean_token_accuracy": 0.8914755284786224, "num_tokens": 689455.0, "step": 170 }, { "entropy": 0.3640653118491173, "epoch": 0.3723462166575939, "grad_norm": 0.2773352861404419, "learning_rate": 0.00015091562052135912, "loss": 0.3822, "mean_token_accuracy": 0.8882244229316711, "num_tokens": 693956.0, "step": 171 }, { "entropy": 0.37736089527606964, "epoch": 0.3745236799129015, "grad_norm": 0.2925175130367279, "learning_rate": 0.00015034995615159074, "loss": 0.3628, "mean_token_accuracy": 0.889089897274971, "num_tokens": 697863.0, "step": 172 }, { "entropy": 0.37925824522972107, "epoch": 0.376701143168209, "grad_norm": 0.2618020474910736, "learning_rate": 0.00014978228974851077, "loss": 0.3624, "mean_token_accuracy": 0.8942320197820663, "num_tokens": 701537.0, "step": 173 }, { "entropy": 0.34706228971481323, "epoch": 0.3788786064235166, "grad_norm": 0.2923741340637207, "learning_rate": 0.000149212649477876, "loss": 0.3541, "mean_token_accuracy": 0.8954867422580719, "num_tokens": 705253.0, "step": 174 }, { "entropy": 0.3569258749485016, "epoch": 0.38105606967882416, "grad_norm": 0.2816322147846222, "learning_rate": 0.00014864106360337992, "loss": 0.357, "mean_token_accuracy": 0.8935216814279556, "num_tokens": 709276.0, "step": 175 }, { "entropy": 0.35546237230300903, "epoch": 0.38323353293413176, "grad_norm": 0.2701316773891449, "learning_rate": 0.00014806756048525073, "loss": 0.3423, "mean_token_accuracy": 0.9047370553016663, "num_tokens": 713489.0, "step": 176 }, { "entropy": 0.38647014647722244, "epoch": 0.3854109961894393, "grad_norm": 0.2974873185157776, "learning_rate": 0.00014749216857884388, "loss": 0.3698, "mean_token_accuracy": 0.8884487450122833, "num_tokens": 717582.0, "step": 177 }, { "entropy": 0.41117021441459656, "epoch": 0.38758845944474685, "grad_norm": 0.46910688281059265, "learning_rate": 0.0001469149164332304, "loss": 0.3913, "mean_token_accuracy": 0.8818454891443253, "num_tokens": 721522.0, "step": 178 }, { "entropy": 0.3503909111022949, "epoch": 0.38976592270005445, "grad_norm": 0.24447594583034515, "learning_rate": 0.00014633583268978037, "loss": 0.3159, "mean_token_accuracy": 0.9022247046232224, "num_tokens": 725345.0, "step": 179 }, { "entropy": 0.34674597531557083, "epoch": 0.391943385955362, "grad_norm": 0.25831112265586853, "learning_rate": 0.00014575494608074166, "loss": 0.3403, "mean_token_accuracy": 0.8952628076076508, "num_tokens": 729377.0, "step": 180 }, { "entropy": 0.32907338812947273, "epoch": 0.3941208492106696, "grad_norm": 0.25881391763687134, "learning_rate": 0.0001451722854278146, "loss": 0.3039, "mean_token_accuracy": 0.9026439040899277, "num_tokens": 733265.0, "step": 181 }, { "entropy": 0.35795633494853973, "epoch": 0.39629831246597713, "grad_norm": 0.28063708543777466, "learning_rate": 0.00014458787964072165, "loss": 0.3381, "mean_token_accuracy": 0.8983410447835922, "num_tokens": 737131.0, "step": 182 }, { "entropy": 0.33193762600421906, "epoch": 0.39847577572128473, "grad_norm": 0.29431116580963135, "learning_rate": 0.00014400175771577326, "loss": 0.3225, "mean_token_accuracy": 0.9057250618934631, "num_tokens": 740821.0, "step": 183 }, { "entropy": 0.31135137379169464, "epoch": 0.4006532389765923, "grad_norm": 0.29750552773475647, "learning_rate": 0.00014341394873442897, "loss": 0.3264, "mean_token_accuracy": 0.8973560929298401, "num_tokens": 744896.0, "step": 184 }, { "entropy": 0.3354290798306465, "epoch": 0.4028307022318998, "grad_norm": 0.27261385321617126, "learning_rate": 0.0001428244818618546, "loss": 0.3427, "mean_token_accuracy": 0.8985736221075058, "num_tokens": 748839.0, "step": 185 }, { "entropy": 0.3166900649666786, "epoch": 0.4050081654872074, "grad_norm": 0.27092301845550537, "learning_rate": 0.0001422333863454751, "loss": 0.3087, "mean_token_accuracy": 0.9003172963857651, "num_tokens": 752819.0, "step": 186 }, { "entropy": 0.3550329655408859, "epoch": 0.40718562874251496, "grad_norm": 0.27660685777664185, "learning_rate": 0.0001416406915135235, "loss": 0.3544, "mean_token_accuracy": 0.8941550552845001, "num_tokens": 756769.0, "step": 187 }, { "entropy": 0.3845446854829788, "epoch": 0.40936309199782256, "grad_norm": 0.3029703199863434, "learning_rate": 0.00014104642677358547, "loss": 0.3864, "mean_token_accuracy": 0.8840687274932861, "num_tokens": 760466.0, "step": 188 }, { "entropy": 0.3692278042435646, "epoch": 0.4115405552531301, "grad_norm": 0.2795009911060333, "learning_rate": 0.00014045062161114065, "loss": 0.3618, "mean_token_accuracy": 0.8954125195741653, "num_tokens": 764627.0, "step": 189 }, { "entropy": 0.34045620262622833, "epoch": 0.41371801850843765, "grad_norm": 0.2698828876018524, "learning_rate": 0.00013985330558809918, "loss": 0.3225, "mean_token_accuracy": 0.8965429812669754, "num_tokens": 768901.0, "step": 190 }, { "entropy": 0.3410160765051842, "epoch": 0.41589548176374525, "grad_norm": 0.25038790702819824, "learning_rate": 0.00013925450834133542, "loss": 0.3253, "mean_token_accuracy": 0.9037521332502365, "num_tokens": 773052.0, "step": 191 }, { "entropy": 0.36402270942926407, "epoch": 0.4180729450190528, "grad_norm": 0.2695653736591339, "learning_rate": 0.00013865425958121697, "loss": 0.3614, "mean_token_accuracy": 0.8942222446203232, "num_tokens": 776826.0, "step": 192 }, { "entropy": 0.31327100098133087, "epoch": 0.4202504082743604, "grad_norm": 0.2406344711780548, "learning_rate": 0.00013805258909013095, "loss": 0.2927, "mean_token_accuracy": 0.9095935225486755, "num_tokens": 781250.0, "step": 193 }, { "entropy": 0.37202536314725876, "epoch": 0.42242787152966793, "grad_norm": 0.30606889724731445, "learning_rate": 0.00013744952672100613, "loss": 0.3924, "mean_token_accuracy": 0.8838685899972916, "num_tokens": 785238.0, "step": 194 }, { "entropy": 0.3558414503931999, "epoch": 0.42460533478497553, "grad_norm": 0.24589793384075165, "learning_rate": 0.00013684510239583166, "loss": 0.344, "mean_token_accuracy": 0.896059587597847, "num_tokens": 789796.0, "step": 195 }, { "entropy": 0.37479735910892487, "epoch": 0.4267827980402831, "grad_norm": 0.25714266300201416, "learning_rate": 0.0001362393461041726, "loss": 0.3708, "mean_token_accuracy": 0.8902730643749237, "num_tokens": 794040.0, "step": 196 }, { "entropy": 0.356051467359066, "epoch": 0.4289602612955906, "grad_norm": 0.27870944142341614, "learning_rate": 0.00013563228790168178, "loss": 0.3551, "mean_token_accuracy": 0.8951977044343948, "num_tokens": 798230.0, "step": 197 }, { "entropy": 0.3533203676342964, "epoch": 0.4311377245508982, "grad_norm": 0.2748214602470398, "learning_rate": 0.00013502395790860862, "loss": 0.3345, "mean_token_accuracy": 0.8976791948080063, "num_tokens": 802137.0, "step": 198 }, { "entropy": 0.404046893119812, "epoch": 0.43331518780620576, "grad_norm": 0.2737223505973816, "learning_rate": 0.00013441438630830464, "loss": 0.4053, "mean_token_accuracy": 0.8848972916603088, "num_tokens": 806240.0, "step": 199 }, { "entropy": 0.3257349133491516, "epoch": 0.43549265106151336, "grad_norm": 0.28284040093421936, "learning_rate": 0.0001338036033457259, "loss": 0.3047, "mean_token_accuracy": 0.9047138094902039, "num_tokens": 809920.0, "step": 200 }, { "entropy": 0.3515155389904976, "epoch": 0.4376701143168209, "grad_norm": 0.2601410746574402, "learning_rate": 0.00013319163932593226, "loss": 0.3389, "mean_token_accuracy": 0.8959746956825256, "num_tokens": 813888.0, "step": 201 }, { "entropy": 0.35355835407972336, "epoch": 0.43984757757212845, "grad_norm": 0.28591784834861755, "learning_rate": 0.0001325785246125838, "loss": 0.3629, "mean_token_accuracy": 0.8906663358211517, "num_tokens": 817940.0, "step": 202 }, { "entropy": 0.36141665279865265, "epoch": 0.44202504082743604, "grad_norm": 0.27857449650764465, "learning_rate": 0.00013196428962643426, "loss": 0.3418, "mean_token_accuracy": 0.8927578181028366, "num_tokens": 822014.0, "step": 203 }, { "entropy": 0.4061436876654625, "epoch": 0.4442025040827436, "grad_norm": 0.2518883943557739, "learning_rate": 0.0001313489648438217, "loss": 0.4024, "mean_token_accuracy": 0.8816352039575577, "num_tokens": 826422.0, "step": 204 }, { "entropy": 0.3674250468611717, "epoch": 0.4463799673380512, "grad_norm": 0.2753954231739044, "learning_rate": 0.00013073258079515632, "loss": 0.3508, "mean_token_accuracy": 0.8967752158641815, "num_tokens": 830085.0, "step": 205 }, { "entropy": 0.35362084209918976, "epoch": 0.44855743059335873, "grad_norm": 0.2868417203426361, "learning_rate": 0.00013011516806340557, "loss": 0.3743, "mean_token_accuracy": 0.8918885141611099, "num_tokens": 834548.0, "step": 206 }, { "entropy": 0.39741218090057373, "epoch": 0.45073489384866633, "grad_norm": 0.2914039194583893, "learning_rate": 0.0001294967572825769, "loss": 0.3976, "mean_token_accuracy": 0.8822353929281235, "num_tokens": 838029.0, "step": 207 }, { "entropy": 0.31900452077388763, "epoch": 0.4529123571039739, "grad_norm": 0.24336911737918854, "learning_rate": 0.0001288773791361977, "loss": 0.3179, "mean_token_accuracy": 0.9089991301298141, "num_tokens": 842500.0, "step": 208 }, { "entropy": 0.3548683598637581, "epoch": 0.4550898203592814, "grad_norm": 0.24573664367198944, "learning_rate": 0.0001282570643557928, "loss": 0.3332, "mean_token_accuracy": 0.8994109332561493, "num_tokens": 846504.0, "step": 209 }, { "entropy": 0.4130469933152199, "epoch": 0.457267283614589, "grad_norm": 0.22916413843631744, "learning_rate": 0.00012763584371935986, "loss": 0.3935, "mean_token_accuracy": 0.8888524770736694, "num_tokens": 850825.0, "step": 210 }, { "entropy": 0.39430346339941025, "epoch": 0.45944474686989656, "grad_norm": 0.24899472296237946, "learning_rate": 0.00012701374804984205, "loss": 0.3623, "mean_token_accuracy": 0.8868012726306915, "num_tokens": 854995.0, "step": 211 }, { "entropy": 0.3773266300559044, "epoch": 0.46162221012520416, "grad_norm": 0.282216340303421, "learning_rate": 0.00012639080821359898, "loss": 0.3786, "mean_token_accuracy": 0.8827318847179413, "num_tokens": 858988.0, "step": 212 }, { "entropy": 0.3632218912243843, "epoch": 0.4637996733805117, "grad_norm": 0.2573084235191345, "learning_rate": 0.00012576705511887492, "loss": 0.3624, "mean_token_accuracy": 0.8912414461374283, "num_tokens": 863081.0, "step": 213 }, { "entropy": 0.35169900953769684, "epoch": 0.46597713663581924, "grad_norm": 0.2548096477985382, "learning_rate": 0.00012514251971426545, "loss": 0.3325, "mean_token_accuracy": 0.9051143527030945, "num_tokens": 867052.0, "step": 214 }, { "entropy": 0.36711084097623825, "epoch": 0.46815459989112684, "grad_norm": 0.2645510733127594, "learning_rate": 0.00012451723298718175, "loss": 0.3774, "mean_token_accuracy": 0.8909319043159485, "num_tokens": 871119.0, "step": 215 }, { "entropy": 0.35685280710458755, "epoch": 0.4703320631464344, "grad_norm": 0.3010730445384979, "learning_rate": 0.0001238912259623133, "loss": 0.3435, "mean_token_accuracy": 0.8955214470624924, "num_tokens": 874529.0, "step": 216 }, { "entropy": 0.3657463937997818, "epoch": 0.472509526401742, "grad_norm": 0.2753501534461975, "learning_rate": 0.0001232645297000883, "loss": 0.356, "mean_token_accuracy": 0.8999243825674057, "num_tokens": 878518.0, "step": 217 }, { "entropy": 0.3516548126935959, "epoch": 0.47468698965704953, "grad_norm": 0.2859194576740265, "learning_rate": 0.00012263717529513267, "loss": 0.3561, "mean_token_accuracy": 0.8952623754739761, "num_tokens": 882202.0, "step": 218 }, { "entropy": 0.3554818853735924, "epoch": 0.47686445291235713, "grad_norm": 0.2630636394023895, "learning_rate": 0.00012200919387472723, "loss": 0.3454, "mean_token_accuracy": 0.8877929896116257, "num_tokens": 886781.0, "step": 219 }, { "entropy": 0.35459691286087036, "epoch": 0.47904191616766467, "grad_norm": 0.28057464957237244, "learning_rate": 0.0001213806165972633, "loss": 0.3597, "mean_token_accuracy": 0.8925827890634537, "num_tokens": 890846.0, "step": 220 }, { "entropy": 0.3253984898328781, "epoch": 0.4812193794229722, "grad_norm": 0.2502402067184448, "learning_rate": 0.00012075147465069667, "loss": 0.3183, "mean_token_accuracy": 0.9015309363603592, "num_tokens": 895392.0, "step": 221 }, { "entropy": 0.3588094562292099, "epoch": 0.4833968426782798, "grad_norm": 0.24630582332611084, "learning_rate": 0.0001201217992510002, "loss": 0.3361, "mean_token_accuracy": 0.9005966037511826, "num_tokens": 899490.0, "step": 222 }, { "entropy": 0.3819248303771019, "epoch": 0.48557430593358736, "grad_norm": 0.24468845129013062, "learning_rate": 0.00011949162164061486, "loss": 0.3661, "mean_token_accuracy": 0.8975157290697098, "num_tokens": 903478.0, "step": 223 }, { "entropy": 0.4134289547801018, "epoch": 0.48775176918889496, "grad_norm": 0.27261775732040405, "learning_rate": 0.0001188609730868998, "loss": 0.4087, "mean_token_accuracy": 0.8844785243272781, "num_tokens": 907286.0, "step": 224 }, { "entropy": 0.3919166326522827, "epoch": 0.4899292324442025, "grad_norm": 0.2661035358905792, "learning_rate": 0.00011822988488058071, "loss": 0.3575, "mean_token_accuracy": 0.8900353014469147, "num_tokens": 911300.0, "step": 225 }, { "entropy": 0.34307558089494705, "epoch": 0.49210669569951004, "grad_norm": 0.2561405301094055, "learning_rate": 0.00011759838833419754, "loss": 0.3052, "mean_token_accuracy": 0.90419901907444, "num_tokens": 915659.0, "step": 226 }, { "entropy": 0.35558557510375977, "epoch": 0.49428415895481764, "grad_norm": 0.24936646223068237, "learning_rate": 0.00011696651478055067, "loss": 0.3531, "mean_token_accuracy": 0.8979819416999817, "num_tokens": 919483.0, "step": 227 }, { "entropy": 0.35391464084386826, "epoch": 0.4964616222101252, "grad_norm": 0.2600042521953583, "learning_rate": 0.00011633429557114635, "loss": 0.3565, "mean_token_accuracy": 0.889078825712204, "num_tokens": 923394.0, "step": 228 }, { "entropy": 0.37007713317871094, "epoch": 0.4986390854654328, "grad_norm": 0.25796735286712646, "learning_rate": 0.00011570176207464114, "loss": 0.3369, "mean_token_accuracy": 0.8971839994192123, "num_tokens": 927293.0, "step": 229 }, { "entropy": 0.38342171162366867, "epoch": 0.5008165487207403, "grad_norm": 0.27563533186912537, "learning_rate": 0.00011506894567528556, "loss": 0.3546, "mean_token_accuracy": 0.8875249475240707, "num_tokens": 931453.0, "step": 230 }, { "entropy": 0.3373766243457794, "epoch": 0.5029940119760479, "grad_norm": 0.24225658178329468, "learning_rate": 0.00011443587777136679, "loss": 0.3411, "mean_token_accuracy": 0.9000124335289001, "num_tokens": 936010.0, "step": 231 }, { "entropy": 0.33466411381959915, "epoch": 0.5051714752313554, "grad_norm": 0.2858439087867737, "learning_rate": 0.0001138025897736509, "loss": 0.3343, "mean_token_accuracy": 0.8957197666168213, "num_tokens": 939926.0, "step": 232 }, { "entropy": 0.3573242276906967, "epoch": 0.5073489384866631, "grad_norm": 0.30942314863204956, "learning_rate": 0.00011316911310382416, "loss": 0.3597, "mean_token_accuracy": 0.8864942044019699, "num_tokens": 944087.0, "step": 233 }, { "entropy": 0.3710939437150955, "epoch": 0.5095264017419706, "grad_norm": 0.2737363278865814, "learning_rate": 0.00011253547919293439, "loss": 0.3577, "mean_token_accuracy": 0.8874527662992477, "num_tokens": 948518.0, "step": 234 }, { "entropy": 0.33612143993377686, "epoch": 0.5117038649972782, "grad_norm": 0.24085883796215057, "learning_rate": 0.00011190171947983091, "loss": 0.3161, "mean_token_accuracy": 0.902932345867157, "num_tokens": 952833.0, "step": 235 }, { "entropy": 0.353444904088974, "epoch": 0.5138813282525857, "grad_norm": 0.28172338008880615, "learning_rate": 0.00011126786540960512, "loss": 0.3562, "mean_token_accuracy": 0.8990496397018433, "num_tokens": 956824.0, "step": 236 }, { "entropy": 0.33875197917222977, "epoch": 0.5160587915078934, "grad_norm": 0.2717280387878418, "learning_rate": 0.00011063394843203004, "loss": 0.3117, "mean_token_accuracy": 0.9031887650489807, "num_tokens": 960613.0, "step": 237 }, { "entropy": 0.3543147072196007, "epoch": 0.5182362547632009, "grad_norm": 0.2418098896741867, "learning_rate": 0.00011000000000000002, "loss": 0.3577, "mean_token_accuracy": 0.8868001103401184, "num_tokens": 965072.0, "step": 238 }, { "entropy": 0.3672889471054077, "epoch": 0.5204137180185084, "grad_norm": 0.27860227227211, "learning_rate": 0.00010936605156797, "loss": 0.3616, "mean_token_accuracy": 0.8912352472543716, "num_tokens": 969185.0, "step": 239 }, { "entropy": 0.3546944558620453, "epoch": 0.522591181273816, "grad_norm": 0.27250248193740845, "learning_rate": 0.0001087321345903949, "loss": 0.34, "mean_token_accuracy": 0.8949205875396729, "num_tokens": 972955.0, "step": 240 }, { "entropy": 0.4006873667240143, "epoch": 0.5247686445291235, "grad_norm": 0.28049609065055847, "learning_rate": 0.00010809828052016913, "loss": 0.3895, "mean_token_accuracy": 0.878919780254364, "num_tokens": 976759.0, "step": 241 }, { "entropy": 0.34407609701156616, "epoch": 0.5269461077844312, "grad_norm": 0.22804318368434906, "learning_rate": 0.00010746452080706563, "loss": 0.3046, "mean_token_accuracy": 0.9041478931903839, "num_tokens": 981169.0, "step": 242 }, { "entropy": 0.34020114690065384, "epoch": 0.5291235710397387, "grad_norm": 0.25987792015075684, "learning_rate": 0.00010683088689617582, "loss": 0.3175, "mean_token_accuracy": 0.9022326022386551, "num_tokens": 984838.0, "step": 243 }, { "entropy": 0.35350754112005234, "epoch": 0.5313010342950463, "grad_norm": 0.2573815584182739, "learning_rate": 0.00010619741022634912, "loss": 0.3525, "mean_token_accuracy": 0.8904687911272049, "num_tokens": 988767.0, "step": 244 }, { "entropy": 0.319248978048563, "epoch": 0.5334784975503538, "grad_norm": 0.21112677454948425, "learning_rate": 0.00010556412222863321, "loss": 0.3022, "mean_token_accuracy": 0.9129808992147446, "num_tokens": 993209.0, "step": 245 }, { "entropy": 0.3874542936682701, "epoch": 0.5356559608056614, "grad_norm": 0.2539237439632416, "learning_rate": 0.00010493105432471443, "loss": 0.3908, "mean_token_accuracy": 0.8874447643756866, "num_tokens": 997348.0, "step": 246 }, { "entropy": 0.3753085806965828, "epoch": 0.537833424060969, "grad_norm": 0.242266446352005, "learning_rate": 0.00010429823792535891, "loss": 0.3721, "mean_token_accuracy": 0.8896859586238861, "num_tokens": 1001182.0, "step": 247 }, { "entropy": 0.326670840382576, "epoch": 0.5400108873162766, "grad_norm": 0.24620375037193298, "learning_rate": 0.00010366570442885373, "loss": 0.3195, "mean_token_accuracy": 0.9036577641963959, "num_tokens": 1005310.0, "step": 248 }, { "entropy": 0.36552029848098755, "epoch": 0.5421883505715841, "grad_norm": 0.24721576273441315, "learning_rate": 0.00010303348521944938, "loss": 0.3665, "mean_token_accuracy": 0.892762616276741, "num_tokens": 1009657.0, "step": 249 }, { "entropy": 0.34408629685640335, "epoch": 0.5443658138268916, "grad_norm": 0.23724570870399475, "learning_rate": 0.0001024016116658025, "loss": 0.3347, "mean_token_accuracy": 0.9008950591087341, "num_tokens": 1014240.0, "step": 250 }, { "entropy": 0.33717598021030426, "epoch": 0.5465432770821992, "grad_norm": 0.25629547238349915, "learning_rate": 0.0001017701151194193, "loss": 0.3434, "mean_token_accuracy": 0.9011830985546112, "num_tokens": 1018254.0, "step": 251 }, { "entropy": 0.36749306321144104, "epoch": 0.5487207403375068, "grad_norm": 0.2619577944278717, "learning_rate": 0.00010113902691310024, "loss": 0.3551, "mean_token_accuracy": 0.8974686414003372, "num_tokens": 1022155.0, "step": 252 }, { "entropy": 0.4006720781326294, "epoch": 0.5508982035928144, "grad_norm": 0.2916308343410492, "learning_rate": 0.00010050837835938516, "loss": 0.3901, "mean_token_accuracy": 0.884143054485321, "num_tokens": 1026011.0, "step": 253 }, { "entropy": 0.3434867560863495, "epoch": 0.5530756668481219, "grad_norm": 0.24261599779129028, "learning_rate": 9.98782007489998e-05, "loss": 0.3447, "mean_token_accuracy": 0.8931203186511993, "num_tokens": 1029811.0, "step": 254 }, { "entropy": 0.33298294991254807, "epoch": 0.5552531301034295, "grad_norm": 0.24710261821746826, "learning_rate": 9.924852534930333e-05, "loss": 0.3163, "mean_token_accuracy": 0.8988287448883057, "num_tokens": 1033838.0, "step": 255 }, { "entropy": 0.36351051926612854, "epoch": 0.5574305933587371, "grad_norm": 0.22865501046180725, "learning_rate": 9.861938340273671e-05, "loss": 0.3537, "mean_token_accuracy": 0.8958317637443542, "num_tokens": 1038890.0, "step": 256 }, { "entropy": 0.34496162831783295, "epoch": 0.5596080566140447, "grad_norm": 0.27052974700927734, "learning_rate": 9.79908061252728e-05, "loss": 0.3422, "mean_token_accuracy": 0.8985669314861298, "num_tokens": 1042344.0, "step": 257 }, { "entropy": 0.3629255071282387, "epoch": 0.5617855198693522, "grad_norm": 0.27112752199172974, "learning_rate": 9.736282470486739e-05, "loss": 0.36, "mean_token_accuracy": 0.8962416350841522, "num_tokens": 1046638.0, "step": 258 }, { "entropy": 0.3592648208141327, "epoch": 0.5639629831246598, "grad_norm": 0.23911136388778687, "learning_rate": 9.673547029991173e-05, "loss": 0.3398, "mean_token_accuracy": 0.8957805782556534, "num_tokens": 1050963.0, "step": 259 }, { "entropy": 0.41059066355228424, "epoch": 0.5661404463799673, "grad_norm": 0.2601061463356018, "learning_rate": 9.61087740376867e-05, "loss": 0.413, "mean_token_accuracy": 0.875213697552681, "num_tokens": 1055077.0, "step": 260 }, { "entropy": 0.33156271278858185, "epoch": 0.568317909635275, "grad_norm": 0.2332238405942917, "learning_rate": 9.548276701281821e-05, "loss": 0.3202, "mean_token_accuracy": 0.9033721536397934, "num_tokens": 1059270.0, "step": 261 }, { "entropy": 0.38206638395786285, "epoch": 0.5704953728905825, "grad_norm": 0.2890869677066803, "learning_rate": 9.485748028573455e-05, "loss": 0.3721, "mean_token_accuracy": 0.8858179748058319, "num_tokens": 1063429.0, "step": 262 }, { "entropy": 0.3339729979634285, "epoch": 0.57267283614589, "grad_norm": 0.23651231825351715, "learning_rate": 9.423294488112509e-05, "loss": 0.3376, "mean_token_accuracy": 0.9060862809419632, "num_tokens": 1067575.0, "step": 263 }, { "entropy": 0.36243191361427307, "epoch": 0.5748502994011976, "grad_norm": 0.2469407469034195, "learning_rate": 9.360919178640104e-05, "loss": 0.3313, "mean_token_accuracy": 0.9048342257738113, "num_tokens": 1071393.0, "step": 264 }, { "entropy": 0.3420562148094177, "epoch": 0.5770277626565051, "grad_norm": 0.24036115407943726, "learning_rate": 9.298625195015796e-05, "loss": 0.3464, "mean_token_accuracy": 0.900355190038681, "num_tokens": 1076079.0, "step": 265 }, { "entropy": 0.39919717609882355, "epoch": 0.5792052259118128, "grad_norm": 0.2509303390979767, "learning_rate": 9.236415628064017e-05, "loss": 0.3731, "mean_token_accuracy": 0.8862645626068115, "num_tokens": 1079989.0, "step": 266 }, { "entropy": 0.3894932344555855, "epoch": 0.5813826891671203, "grad_norm": 0.25672271847724915, "learning_rate": 9.174293564420724e-05, "loss": 0.3749, "mean_token_accuracy": 0.8905623853206635, "num_tokens": 1083957.0, "step": 267 }, { "entropy": 0.37751832604408264, "epoch": 0.5835601524224279, "grad_norm": 0.2643100321292877, "learning_rate": 9.112262086380234e-05, "loss": 0.371, "mean_token_accuracy": 0.8892365545034409, "num_tokens": 1087639.0, "step": 268 }, { "entropy": 0.35557425767183304, "epoch": 0.5857376156777354, "grad_norm": 0.2569376230239868, "learning_rate": 9.050324271742312e-05, "loss": 0.3369, "mean_token_accuracy": 0.8985206633806229, "num_tokens": 1091448.0, "step": 269 }, { "entropy": 0.3663223683834076, "epoch": 0.587915078933043, "grad_norm": 0.28307580947875977, "learning_rate": 8.988483193659447e-05, "loss": 0.3379, "mean_token_accuracy": 0.8939681947231293, "num_tokens": 1095282.0, "step": 270 }, { "entropy": 0.35191214829683304, "epoch": 0.5900925421883506, "grad_norm": 0.241379514336586, "learning_rate": 8.926741920484374e-05, "loss": 0.3447, "mean_token_accuracy": 0.8967802226543427, "num_tokens": 1099519.0, "step": 271 }, { "entropy": 0.33553165942430496, "epoch": 0.5922700054436582, "grad_norm": 0.26522010564804077, "learning_rate": 8.865103515617834e-05, "loss": 0.3126, "mean_token_accuracy": 0.9028987288475037, "num_tokens": 1103293.0, "step": 272 }, { "entropy": 0.321424663066864, "epoch": 0.5944474686989657, "grad_norm": 0.23075014352798462, "learning_rate": 8.803571037356575e-05, "loss": 0.3204, "mean_token_accuracy": 0.9045960456132889, "num_tokens": 1107725.0, "step": 273 }, { "entropy": 0.3491132855415344, "epoch": 0.5966249319542732, "grad_norm": 0.26291459798812866, "learning_rate": 8.742147538741623e-05, "loss": 0.3178, "mean_token_accuracy": 0.9050692319869995, "num_tokens": 1111448.0, "step": 274 }, { "entropy": 0.3245581164956093, "epoch": 0.5988023952095808, "grad_norm": 0.2527916729450226, "learning_rate": 8.680836067406775e-05, "loss": 0.3164, "mean_token_accuracy": 0.9089783430099487, "num_tokens": 1115353.0, "step": 275 }, { "entropy": 0.3509984761476517, "epoch": 0.6009798584648884, "grad_norm": 0.2409028708934784, "learning_rate": 8.619639665427411e-05, "loss": 0.3205, "mean_token_accuracy": 0.901856929063797, "num_tokens": 1119037.0, "step": 276 }, { "entropy": 0.41339434683322906, "epoch": 0.603157321720196, "grad_norm": 0.2666266858577728, "learning_rate": 8.558561369169535e-05, "loss": 0.4118, "mean_token_accuracy": 0.8851277679204941, "num_tokens": 1122815.0, "step": 277 }, { "entropy": 0.355926550924778, "epoch": 0.6053347849755035, "grad_norm": 0.2666811943054199, "learning_rate": 8.497604209139139e-05, "loss": 0.3598, "mean_token_accuracy": 0.8959801942110062, "num_tokens": 1126942.0, "step": 278 }, { "entropy": 0.33385297656059265, "epoch": 0.6075122482308111, "grad_norm": 0.26262858510017395, "learning_rate": 8.436771209831825e-05, "loss": 0.356, "mean_token_accuracy": 0.8975410759449005, "num_tokens": 1130948.0, "step": 279 }, { "entropy": 0.3468668982386589, "epoch": 0.6096897114861187, "grad_norm": 0.2627294659614563, "learning_rate": 8.376065389582739e-05, "loss": 0.3453, "mean_token_accuracy": 0.8972453325986862, "num_tokens": 1135319.0, "step": 280 }, { "entropy": 0.34889067709445953, "epoch": 0.6118671747414263, "grad_norm": 0.2477421760559082, "learning_rate": 8.315489760416839e-05, "loss": 0.3221, "mean_token_accuracy": 0.9074793308973312, "num_tokens": 1138864.0, "step": 281 }, { "entropy": 0.3540688380599022, "epoch": 0.6140446379967338, "grad_norm": 0.2644377052783966, "learning_rate": 8.255047327899392e-05, "loss": 0.3697, "mean_token_accuracy": 0.8973688334226608, "num_tokens": 1142749.0, "step": 282 }, { "entropy": 0.3112604096531868, "epoch": 0.6162221012520414, "grad_norm": 0.24608677625656128, "learning_rate": 8.19474109098691e-05, "loss": 0.3115, "mean_token_accuracy": 0.9069450497627258, "num_tokens": 1146891.0, "step": 283 }, { "entropy": 0.3105768784880638, "epoch": 0.6183995645073489, "grad_norm": 0.2628800868988037, "learning_rate": 8.134574041878306e-05, "loss": 0.3144, "mean_token_accuracy": 0.9045025259256363, "num_tokens": 1151024.0, "step": 284 }, { "entropy": 0.3061619848012924, "epoch": 0.6205770277626566, "grad_norm": 0.2500765919685364, "learning_rate": 8.074549165866463e-05, "loss": 0.2996, "mean_token_accuracy": 0.9090612530708313, "num_tokens": 1155564.0, "step": 285 }, { "entropy": 0.34363674372434616, "epoch": 0.6227544910179641, "grad_norm": 0.2619493305683136, "learning_rate": 8.014669441190081e-05, "loss": 0.3196, "mean_token_accuracy": 0.8998923152685165, "num_tokens": 1159454.0, "step": 286 }, { "entropy": 0.3449995443224907, "epoch": 0.6249319542732716, "grad_norm": 0.2670820355415344, "learning_rate": 7.954937838885937e-05, "loss": 0.3517, "mean_token_accuracy": 0.8967305719852448, "num_tokens": 1163267.0, "step": 287 }, { "entropy": 0.3603576719760895, "epoch": 0.6271094175285792, "grad_norm": 0.24100132286548615, "learning_rate": 7.895357322641452e-05, "loss": 0.3508, "mean_token_accuracy": 0.8935562521219254, "num_tokens": 1167581.0, "step": 288 }, { "entropy": 0.3160111829638481, "epoch": 0.6292868807838867, "grad_norm": 0.2645825445652008, "learning_rate": 7.835930848647653e-05, "loss": 0.3045, "mean_token_accuracy": 0.9113835692405701, "num_tokens": 1171514.0, "step": 289 }, { "entropy": 0.33360420912504196, "epoch": 0.6314643440391944, "grad_norm": 0.22924089431762695, "learning_rate": 7.776661365452491e-05, "loss": 0.3087, "mean_token_accuracy": 0.9061863869428635, "num_tokens": 1175361.0, "step": 290 }, { "entropy": 0.3485657498240471, "epoch": 0.6336418072945019, "grad_norm": 0.24018257856369019, "learning_rate": 7.717551813814543e-05, "loss": 0.3087, "mean_token_accuracy": 0.903602659702301, "num_tokens": 1179132.0, "step": 291 }, { "entropy": 0.342680849134922, "epoch": 0.6358192705498095, "grad_norm": 0.22566929459571838, "learning_rate": 7.658605126557105e-05, "loss": 0.3183, "mean_token_accuracy": 0.9066330194473267, "num_tokens": 1183571.0, "step": 292 }, { "entropy": 0.3731561452150345, "epoch": 0.637996733805117, "grad_norm": 0.2820538580417633, "learning_rate": 7.599824228422677e-05, "loss": 0.371, "mean_token_accuracy": 0.8894180357456207, "num_tokens": 1187179.0, "step": 293 }, { "entropy": 0.32869182527065277, "epoch": 0.6401741970604246, "grad_norm": 0.2502634823322296, "learning_rate": 7.541212035927839e-05, "loss": 0.2968, "mean_token_accuracy": 0.9134543687105179, "num_tokens": 1191246.0, "step": 294 }, { "entropy": 0.37562160193920135, "epoch": 0.6423516603157322, "grad_norm": 0.2863782048225403, "learning_rate": 7.482771457218542e-05, "loss": 0.3717, "mean_token_accuracy": 0.8882504254579544, "num_tokens": 1195149.0, "step": 295 }, { "entropy": 0.33977876603603363, "epoch": 0.6445291235710398, "grad_norm": 0.24794067442417145, "learning_rate": 7.424505391925833e-05, "loss": 0.3122, "mean_token_accuracy": 0.9125866144895554, "num_tokens": 1198886.0, "step": 296 }, { "entropy": 0.3772798329591751, "epoch": 0.6467065868263473, "grad_norm": 0.23983165621757507, "learning_rate": 7.366416731021964e-05, "loss": 0.362, "mean_token_accuracy": 0.8952146172523499, "num_tokens": 1202933.0, "step": 297 }, { "entropy": 0.3076706826686859, "epoch": 0.6488840500816548, "grad_norm": 0.2429223656654358, "learning_rate": 7.30850835667696e-05, "loss": 0.3008, "mean_token_accuracy": 0.909699097275734, "num_tokens": 1206978.0, "step": 298 }, { "entropy": 0.3360467702150345, "epoch": 0.6510615133369625, "grad_norm": 0.25572511553764343, "learning_rate": 7.250783142115615e-05, "loss": 0.341, "mean_token_accuracy": 0.9028728753328323, "num_tokens": 1210951.0, "step": 299 }, { "entropy": 0.305056668817997, "epoch": 0.65323897659227, "grad_norm": 0.24135318398475647, "learning_rate": 7.193243951474933e-05, "loss": 0.3122, "mean_token_accuracy": 0.908637598156929, "num_tokens": 1215517.0, "step": 300 }, { "entropy": 0.337252639234066, "epoch": 0.6554164398475776, "grad_norm": 0.27407306432724, "learning_rate": 7.135893639662012e-05, "loss": 0.3226, "mean_token_accuracy": 0.9033920913934708, "num_tokens": 1219456.0, "step": 301 }, { "entropy": 0.3444167599081993, "epoch": 0.6575939031028851, "grad_norm": 0.2554808557033539, "learning_rate": 7.078735052212402e-05, "loss": 0.3405, "mean_token_accuracy": 0.8994651138782501, "num_tokens": 1223440.0, "step": 302 }, { "entropy": 0.3203364834189415, "epoch": 0.6597713663581927, "grad_norm": 0.2498241364955902, "learning_rate": 7.021771025148922e-05, "loss": 0.2994, "mean_token_accuracy": 0.9104214161634445, "num_tokens": 1227205.0, "step": 303 }, { "entropy": 0.3659024015069008, "epoch": 0.6619488296135003, "grad_norm": 0.24576182663440704, "learning_rate": 6.965004384840928e-05, "loss": 0.3434, "mean_token_accuracy": 0.8974325805902481, "num_tokens": 1231062.0, "step": 304 }, { "entropy": 0.35433361679315567, "epoch": 0.6641262928688079, "grad_norm": 0.2348756641149521, "learning_rate": 6.90843794786409e-05, "loss": 0.3326, "mean_token_accuracy": 0.8999007195234299, "num_tokens": 1235210.0, "step": 305 }, { "entropy": 0.3522880747914314, "epoch": 0.6663037561241154, "grad_norm": 0.24180057644844055, "learning_rate": 6.852074520860648e-05, "loss": 0.3286, "mean_token_accuracy": 0.9014742374420166, "num_tokens": 1238954.0, "step": 306 }, { "entropy": 0.35235612094402313, "epoch": 0.668481219379423, "grad_norm": 0.24760101735591888, "learning_rate": 6.795916900400138e-05, "loss": 0.3262, "mean_token_accuracy": 0.9001569449901581, "num_tokens": 1242691.0, "step": 307 }, { "entropy": 0.35372819751501083, "epoch": 0.6706586826347305, "grad_norm": 0.2558618485927582, "learning_rate": 6.739967872840662e-05, "loss": 0.3389, "mean_token_accuracy": 0.9027666747570038, "num_tokens": 1246355.0, "step": 308 }, { "entropy": 0.32674338668584824, "epoch": 0.6728361458900382, "grad_norm": 0.2397354543209076, "learning_rate": 6.684230214190608e-05, "loss": 0.3026, "mean_token_accuracy": 0.9039190113544464, "num_tokens": 1251017.0, "step": 309 }, { "entropy": 0.3184630870819092, "epoch": 0.6750136091453457, "grad_norm": 0.2725917100906372, "learning_rate": 6.628706689970932e-05, "loss": 0.3305, "mean_token_accuracy": 0.8989760279655457, "num_tokens": 1255024.0, "step": 310 }, { "entropy": 0.35561081022024155, "epoch": 0.6771910724006532, "grad_norm": 0.24204087257385254, "learning_rate": 6.573400055077938e-05, "loss": 0.3393, "mean_token_accuracy": 0.8942540436983109, "num_tokens": 1259033.0, "step": 311 }, { "entropy": 0.3308749422430992, "epoch": 0.6793685356559608, "grad_norm": 0.23772156238555908, "learning_rate": 6.518313053646586e-05, "loss": 0.3264, "mean_token_accuracy": 0.9023979008197784, "num_tokens": 1263455.0, "step": 312 }, { "entropy": 0.3347730040550232, "epoch": 0.6815459989112683, "grad_norm": 0.2505793571472168, "learning_rate": 6.463448418914348e-05, "loss": 0.3392, "mean_token_accuracy": 0.9027709066867828, "num_tokens": 1267335.0, "step": 313 }, { "entropy": 0.3568695932626724, "epoch": 0.683723462166576, "grad_norm": 0.24569235742092133, "learning_rate": 6.408808873085577e-05, "loss": 0.3399, "mean_token_accuracy": 0.8940989226102829, "num_tokens": 1271810.0, "step": 314 }, { "entropy": 0.33151426911354065, "epoch": 0.6859009254218835, "grad_norm": 0.28417110443115234, "learning_rate": 6.354397127196448e-05, "loss": 0.3196, "mean_token_accuracy": 0.9016236513853073, "num_tokens": 1275575.0, "step": 315 }, { "entropy": 0.31233637779951096, "epoch": 0.6880783886771911, "grad_norm": 0.23522846400737762, "learning_rate": 6.300215880980446e-05, "loss": 0.2954, "mean_token_accuracy": 0.9116706401109695, "num_tokens": 1280034.0, "step": 316 }, { "entropy": 0.35172613710165024, "epoch": 0.6902558519324986, "grad_norm": 0.25289177894592285, "learning_rate": 6.246267822734421e-05, "loss": 0.3253, "mean_token_accuracy": 0.8971187770366669, "num_tokens": 1283664.0, "step": 317 }, { "entropy": 0.3484005257487297, "epoch": 0.6924333151878062, "grad_norm": 0.2565121054649353, "learning_rate": 6.192555629185189e-05, "loss": 0.3408, "mean_token_accuracy": 0.8945488780736923, "num_tokens": 1287685.0, "step": 318 }, { "entropy": 0.3195461556315422, "epoch": 0.6946107784431138, "grad_norm": 0.24285030364990234, "learning_rate": 6.139081965356725e-05, "loss": 0.3188, "mean_token_accuracy": 0.9035038352012634, "num_tokens": 1291337.0, "step": 319 }, { "entropy": 0.33748240023851395, "epoch": 0.6967882416984214, "grad_norm": 0.24630972743034363, "learning_rate": 6.085849484437944e-05, "loss": 0.3411, "mean_token_accuracy": 0.9040576815605164, "num_tokens": 1295196.0, "step": 320 }, { "entropy": 0.3222072795033455, "epoch": 0.6989657049537289, "grad_norm": 0.23582881689071655, "learning_rate": 6.0328608276510476e-05, "loss": 0.3193, "mean_token_accuracy": 0.900396928191185, "num_tokens": 1299276.0, "step": 321 }, { "entropy": 0.34793104976415634, "epoch": 0.7011431682090364, "grad_norm": 0.28013235330581665, "learning_rate": 5.980118624120483e-05, "loss": 0.3234, "mean_token_accuracy": 0.8983870148658752, "num_tokens": 1302970.0, "step": 322 }, { "entropy": 0.3033921644091606, "epoch": 0.7033206314643441, "grad_norm": 0.23157738149166107, "learning_rate": 5.9276254907424846e-05, "loss": 0.2927, "mean_token_accuracy": 0.9108779579401016, "num_tokens": 1307008.0, "step": 323 }, { "entropy": 0.36583440005779266, "epoch": 0.7054980947196516, "grad_norm": 0.2319372296333313, "learning_rate": 5.875384032055239e-05, "loss": 0.371, "mean_token_accuracy": 0.897381991147995, "num_tokens": 1311263.0, "step": 324 }, { "entropy": 0.3215944245457649, "epoch": 0.7076755579749592, "grad_norm": 0.23082365095615387, "learning_rate": 5.823396840109657e-05, "loss": 0.3094, "mean_token_accuracy": 0.903637707233429, "num_tokens": 1315823.0, "step": 325 }, { "entropy": 0.34219200164079666, "epoch": 0.7098530212302667, "grad_norm": 0.23884856700897217, "learning_rate": 5.771666494340756e-05, "loss": 0.3289, "mean_token_accuracy": 0.9032928794622421, "num_tokens": 1319955.0, "step": 326 }, { "entropy": 0.2886577844619751, "epoch": 0.7120304844855743, "grad_norm": 0.22707660496234894, "learning_rate": 5.7201955614396964e-05, "loss": 0.2839, "mean_token_accuracy": 0.9143697619438171, "num_tokens": 1324096.0, "step": 327 }, { "entropy": 0.3365718871355057, "epoch": 0.7142079477408819, "grad_norm": 0.21789753437042236, "learning_rate": 5.668986595226404e-05, "loss": 0.3316, "mean_token_accuracy": 0.9025033861398697, "num_tokens": 1328868.0, "step": 328 }, { "entropy": 0.313778854906559, "epoch": 0.7163854109961895, "grad_norm": 0.24393050372600555, "learning_rate": 5.618042136522881e-05, "loss": 0.3212, "mean_token_accuracy": 0.9037179052829742, "num_tokens": 1333087.0, "step": 329 }, { "entropy": 0.3029978275299072, "epoch": 0.718562874251497, "grad_norm": 0.24070705473423004, "learning_rate": 5.567364713027121e-05, "loss": 0.306, "mean_token_accuracy": 0.9108355790376663, "num_tokens": 1337351.0, "step": 330 }, { "entropy": 0.36839231103658676, "epoch": 0.7207403375068046, "grad_norm": 0.25364482402801514, "learning_rate": 5.5169568391877035e-05, "loss": 0.3493, "mean_token_accuracy": 0.89275161921978, "num_tokens": 1341499.0, "step": 331 }, { "entropy": 0.37619777768850327, "epoch": 0.7229178007621121, "grad_norm": 0.24351854622364044, "learning_rate": 5.46682101607904e-05, "loss": 0.3816, "mean_token_accuracy": 0.8932196348905563, "num_tokens": 1345295.0, "step": 332 }, { "entropy": 0.30273835361003876, "epoch": 0.7250952640174197, "grad_norm": 0.2297053039073944, "learning_rate": 5.416959731277264e-05, "loss": 0.2852, "mean_token_accuracy": 0.9142936319112778, "num_tokens": 1349605.0, "step": 333 }, { "entropy": 0.3630438446998596, "epoch": 0.7272727272727273, "grad_norm": 0.2559914290904999, "learning_rate": 5.3673754587368094e-05, "loss": 0.3791, "mean_token_accuracy": 0.8942387253046036, "num_tokens": 1353706.0, "step": 334 }, { "entropy": 0.32199136167764664, "epoch": 0.7294501905280348, "grad_norm": 0.25669071078300476, "learning_rate": 5.318070658667671e-05, "loss": 0.3123, "mean_token_accuracy": 0.9080253690481186, "num_tokens": 1357558.0, "step": 335 }, { "entropy": 0.35776887834072113, "epoch": 0.7316276537833424, "grad_norm": 0.2596750855445862, "learning_rate": 5.269047777413333e-05, "loss": 0.3436, "mean_token_accuracy": 0.8997514098882675, "num_tokens": 1361340.0, "step": 336 }, { "entropy": 0.34765905141830444, "epoch": 0.7338051170386499, "grad_norm": 0.21836940944194794, "learning_rate": 5.22030924732938e-05, "loss": 0.3277, "mean_token_accuracy": 0.9053044319152832, "num_tokens": 1365153.0, "step": 337 }, { "entropy": 0.34295450896024704, "epoch": 0.7359825802939576, "grad_norm": 0.2738622725009918, "learning_rate": 5.171857486662823e-05, "loss": 0.3336, "mean_token_accuracy": 0.8998141139745712, "num_tokens": 1368896.0, "step": 338 }, { "entropy": 0.32134611159563065, "epoch": 0.7381600435492651, "grad_norm": 0.22107118368148804, "learning_rate": 5.1236948994321055e-05, "loss": 0.2999, "mean_token_accuracy": 0.908054381608963, "num_tokens": 1373609.0, "step": 339 }, { "entropy": 0.3105727434158325, "epoch": 0.7403375068045727, "grad_norm": 0.23407259583473206, "learning_rate": 5.075823875307828e-05, "loss": 0.2947, "mean_token_accuracy": 0.9088436663150787, "num_tokens": 1377893.0, "step": 340 }, { "entropy": 0.3235616162419319, "epoch": 0.7425149700598802, "grad_norm": 0.2505863606929779, "learning_rate": 5.0282467894941864e-05, "loss": 0.3338, "mean_token_accuracy": 0.9098049253225327, "num_tokens": 1381665.0, "step": 341 }, { "entropy": 0.30407993495464325, "epoch": 0.7446924333151878, "grad_norm": 0.23674152791500092, "learning_rate": 4.980966002611108e-05, "loss": 0.2939, "mean_token_accuracy": 0.9113668948411942, "num_tokens": 1386000.0, "step": 342 }, { "entropy": 0.29837000370025635, "epoch": 0.7468698965704954, "grad_norm": 0.24069277942180634, "learning_rate": 4.933983860577136e-05, "loss": 0.2801, "mean_token_accuracy": 0.9147733300924301, "num_tokens": 1389768.0, "step": 343 }, { "entropy": 0.5488722026348114, "epoch": 0.749047359825803, "grad_norm": 0.23018239438533783, "learning_rate": 4.887302694493029e-05, "loss": 0.6326, "mean_token_accuracy": 0.8503530323505402, "num_tokens": 1394588.0, "step": 344 }, { "entropy": 0.3708427771925926, "epoch": 0.7512248230811105, "grad_norm": 0.28215181827545166, "learning_rate": 4.840924820526096e-05, "loss": 0.3952, "mean_token_accuracy": 0.8861146718263626, "num_tokens": 1398304.0, "step": 345 }, { "entropy": 0.34193163365125656, "epoch": 0.753402286336418, "grad_norm": 0.2342662215232849, "learning_rate": 4.794852539795291e-05, "loss": 0.3495, "mean_token_accuracy": 0.903597891330719, "num_tokens": 1402505.0, "step": 346 }, { "entropy": 0.3103507123887539, "epoch": 0.7555797495917257, "grad_norm": 0.23902368545532227, "learning_rate": 4.749088138257017e-05, "loss": 0.3078, "mean_token_accuracy": 0.9087391942739487, "num_tokens": 1406703.0, "step": 347 }, { "entropy": 0.3236440420150757, "epoch": 0.7577572128470332, "grad_norm": 0.22265306115150452, "learning_rate": 4.703633886591719e-05, "loss": 0.3387, "mean_token_accuracy": 0.9036975800991058, "num_tokens": 1410765.0, "step": 348 }, { "entropy": 0.31991977244615555, "epoch": 0.7599346761023408, "grad_norm": 0.2397955358028412, "learning_rate": 4.6584920400912156e-05, "loss": 0.3056, "mean_token_accuracy": 0.9113240092992783, "num_tokens": 1414804.0, "step": 349 }, { "entropy": 0.28877923637628555, "epoch": 0.7621121393576483, "grad_norm": 0.2253178060054779, "learning_rate": 4.6136648385467977e-05, "loss": 0.2649, "mean_token_accuracy": 0.9233576655387878, "num_tokens": 1419025.0, "step": 350 }, { "entropy": 0.35838521271944046, "epoch": 0.7642896026129559, "grad_norm": 0.2513080835342407, "learning_rate": 4.5691545061381026e-05, "loss": 0.3413, "mean_token_accuracy": 0.8982634395360947, "num_tokens": 1423031.0, "step": 351 }, { "entropy": 0.371716171503067, "epoch": 0.7664670658682635, "grad_norm": 0.23435106873512268, "learning_rate": 4.5249632513227504e-05, "loss": 0.3457, "mean_token_accuracy": 0.9014202654361725, "num_tokens": 1427232.0, "step": 352 }, { "entropy": 0.32540784031152725, "epoch": 0.7686445291235711, "grad_norm": 0.2637276351451874, "learning_rate": 4.481093266726772e-05, "loss": 0.2913, "mean_token_accuracy": 0.9063924849033356, "num_tokens": 1431135.0, "step": 353 }, { "entropy": 0.35406405478715897, "epoch": 0.7708219923788786, "grad_norm": 0.24304324388504028, "learning_rate": 4.43754672903582e-05, "loss": 0.3232, "mean_token_accuracy": 0.9024296700954437, "num_tokens": 1435499.0, "step": 354 }, { "entropy": 0.32546380907297134, "epoch": 0.7729994556341862, "grad_norm": 0.22986435890197754, "learning_rate": 4.394325798887158e-05, "loss": 0.31, "mean_token_accuracy": 0.9013588130474091, "num_tokens": 1439833.0, "step": 355 }, { "entropy": 0.38513386994600296, "epoch": 0.7751769188894937, "grad_norm": 0.27596256136894226, "learning_rate": 4.351432620762478e-05, "loss": 0.346, "mean_token_accuracy": 0.8986889123916626, "num_tokens": 1443460.0, "step": 356 }, { "entropy": 0.3382200300693512, "epoch": 0.7773543821448013, "grad_norm": 0.24578897655010223, "learning_rate": 4.30886932288147e-05, "loss": 0.3229, "mean_token_accuracy": 0.9034900367259979, "num_tokens": 1447099.0, "step": 357 }, { "entropy": 0.3409022316336632, "epoch": 0.7795318454001089, "grad_norm": 0.2280901074409485, "learning_rate": 4.266638017096252e-05, "loss": 0.3411, "mean_token_accuracy": 0.9012559950351715, "num_tokens": 1451312.0, "step": 358 }, { "entropy": 0.32152481377124786, "epoch": 0.7817093086554164, "grad_norm": 0.24760432541370392, "learning_rate": 4.224740798786573e-05, "loss": 0.3204, "mean_token_accuracy": 0.9076259434223175, "num_tokens": 1455523.0, "step": 359 }, { "entropy": 0.31170132011175156, "epoch": 0.783886771910724, "grad_norm": 0.2510303258895874, "learning_rate": 4.183179746755844e-05, "loss": 0.3126, "mean_token_accuracy": 0.9090617448091507, "num_tokens": 1459544.0, "step": 360 }, { "entropy": 0.3523375913500786, "epoch": 0.7860642351660315, "grad_norm": 0.26667118072509766, "learning_rate": 4.141956923128013e-05, "loss": 0.3492, "mean_token_accuracy": 0.8998522162437439, "num_tokens": 1463315.0, "step": 361 }, { "entropy": 0.3598644956946373, "epoch": 0.7882416984213392, "grad_norm": 0.2440025806427002, "learning_rate": 4.1010743732452294e-05, "loss": 0.3544, "mean_token_accuracy": 0.8947449177503586, "num_tokens": 1467647.0, "step": 362 }, { "entropy": 0.395267553627491, "epoch": 0.7904191616766467, "grad_norm": 0.24411144852638245, "learning_rate": 4.0605341255663696e-05, "loss": 0.4317, "mean_token_accuracy": 0.8864284604787827, "num_tokens": 1471972.0, "step": 363 }, { "entropy": 0.33659572899341583, "epoch": 0.7925966249319543, "grad_norm": 0.26458773016929626, "learning_rate": 4.02033819156639e-05, "loss": 0.3298, "mean_token_accuracy": 0.9003510624170303, "num_tokens": 1475826.0, "step": 364 }, { "entropy": 0.29316914454102516, "epoch": 0.7947740881872618, "grad_norm": 0.25398463010787964, "learning_rate": 3.980488565636522e-05, "loss": 0.2772, "mean_token_accuracy": 0.9137367159128189, "num_tokens": 1480107.0, "step": 365 }, { "entropy": 0.3080258443951607, "epoch": 0.7969515514425695, "grad_norm": 0.26426613330841064, "learning_rate": 3.9409872249853286e-05, "loss": 0.3046, "mean_token_accuracy": 0.9098687618970871, "num_tokens": 1484069.0, "step": 366 }, { "entropy": 0.34426791220903397, "epoch": 0.799129014697877, "grad_norm": 0.2809188663959503, "learning_rate": 3.9018361295405856e-05, "loss": 0.3592, "mean_token_accuracy": 0.9000663906335831, "num_tokens": 1487840.0, "step": 367 }, { "entropy": 0.33940157294273376, "epoch": 0.8013064779531845, "grad_norm": 0.2272171825170517, "learning_rate": 3.8630372218520384e-05, "loss": 0.3417, "mean_token_accuracy": 0.9024456739425659, "num_tokens": 1491938.0, "step": 368 }, { "entropy": 0.33219510316848755, "epoch": 0.8034839412084921, "grad_norm": 0.2192796915769577, "learning_rate": 3.824592426995029e-05, "loss": 0.3221, "mean_token_accuracy": 0.9031501561403275, "num_tokens": 1496386.0, "step": 369 }, { "entropy": 0.3439122289419174, "epoch": 0.8056614044637996, "grad_norm": 0.229109987616539, "learning_rate": 3.786503652474982e-05, "loss": 0.3427, "mean_token_accuracy": 0.9062491357326508, "num_tokens": 1500938.0, "step": 370 }, { "entropy": 0.3725889101624489, "epoch": 0.8078388677191073, "grad_norm": 0.2585630714893341, "learning_rate": 3.7487727881327405e-05, "loss": 0.3704, "mean_token_accuracy": 0.8960603177547455, "num_tokens": 1504742.0, "step": 371 }, { "entropy": 0.3037722408771515, "epoch": 0.8100163309744148, "grad_norm": 0.23759490251541138, "learning_rate": 3.711401706050821e-05, "loss": 0.2939, "mean_token_accuracy": 0.9124279767274857, "num_tokens": 1508512.0, "step": 372 }, { "entropy": 0.3051731139421463, "epoch": 0.8121937942297224, "grad_norm": 0.22473642230033875, "learning_rate": 3.674392260460509e-05, "loss": 0.3036, "mean_token_accuracy": 0.9092454463243484, "num_tokens": 1513083.0, "step": 373 }, { "entropy": 0.3145020753145218, "epoch": 0.8143712574850299, "grad_norm": 0.2272917479276657, "learning_rate": 3.6377462876498694e-05, "loss": 0.2858, "mean_token_accuracy": 0.9174733906984329, "num_tokens": 1516960.0, "step": 374 }, { "entropy": 0.33495523035526276, "epoch": 0.8165487207403375, "grad_norm": 0.24096311628818512, "learning_rate": 3.601465605872636e-05, "loss": 0.3004, "mean_token_accuracy": 0.9126247465610504, "num_tokens": 1520583.0, "step": 375 }, { "entropy": 0.3524938374757767, "epoch": 0.8187261839956451, "grad_norm": 0.23482073843479156, "learning_rate": 3.565552015257989e-05, "loss": 0.3596, "mean_token_accuracy": 0.894221231341362, "num_tokens": 1525126.0, "step": 376 }, { "entropy": 0.3637235388159752, "epoch": 0.8209036472509527, "grad_norm": 0.2486315220594406, "learning_rate": 3.530007297721239e-05, "loss": 0.3518, "mean_token_accuracy": 0.8981701731681824, "num_tokens": 1528846.0, "step": 377 }, { "entropy": 0.327960979193449, "epoch": 0.8230811105062602, "grad_norm": 0.21721476316452026, "learning_rate": 3.494833216875421e-05, "loss": 0.2854, "mean_token_accuracy": 0.915936678647995, "num_tokens": 1532720.0, "step": 378 }, { "entropy": 0.3281715139746666, "epoch": 0.8252585737615677, "grad_norm": 0.27801278233528137, "learning_rate": 3.4600315179437807e-05, "loss": 0.3094, "mean_token_accuracy": 0.9122365713119507, "num_tokens": 1536770.0, "step": 379 }, { "entropy": 0.319459468126297, "epoch": 0.8274360370168753, "grad_norm": 0.24818798899650574, "learning_rate": 3.425603927673195e-05, "loss": 0.2909, "mean_token_accuracy": 0.9143448621034622, "num_tokens": 1540543.0, "step": 380 }, { "entropy": 0.29846663028001785, "epoch": 0.829613500272183, "grad_norm": 0.2553517520427704, "learning_rate": 3.3915521542484794e-05, "loss": 0.2984, "mean_token_accuracy": 0.9117088168859482, "num_tokens": 1544682.0, "step": 381 }, { "entropy": 0.3208995833992958, "epoch": 0.8317909635274905, "grad_norm": 0.23631241917610168, "learning_rate": 3.357877887207648e-05, "loss": 0.3218, "mean_token_accuracy": 0.9085069596767426, "num_tokens": 1548933.0, "step": 382 }, { "entropy": 0.3497694879770279, "epoch": 0.833968426782798, "grad_norm": 0.26314374804496765, "learning_rate": 3.3245827973580754e-05, "loss": 0.3651, "mean_token_accuracy": 0.8973031789064407, "num_tokens": 1553109.0, "step": 383 }, { "entropy": 0.36065296083688736, "epoch": 0.8361458900381056, "grad_norm": 0.2554258704185486, "learning_rate": 3.2916685366936016e-05, "loss": 0.3572, "mean_token_accuracy": 0.8984216153621674, "num_tokens": 1557199.0, "step": 384 }, { "entropy": 0.3203965201973915, "epoch": 0.8383233532934131, "grad_norm": 0.2560184597969055, "learning_rate": 3.259136738312565e-05, "loss": 0.3107, "mean_token_accuracy": 0.9113545119762421, "num_tokens": 1560942.0, "step": 385 }, { "entropy": 0.3545750603079796, "epoch": 0.8405008165487208, "grad_norm": 0.23520711064338684, "learning_rate": 3.226989016336767e-05, "loss": 0.3295, "mean_token_accuracy": 0.8977851718664169, "num_tokens": 1565528.0, "step": 386 }, { "entropy": 0.27805931866168976, "epoch": 0.8426782798040283, "grad_norm": 0.22847194969654083, "learning_rate": 3.1952269658313963e-05, "loss": 0.2647, "mean_token_accuracy": 0.9223105758428574, "num_tokens": 1569618.0, "step": 387 }, { "entropy": 0.36420372873544693, "epoch": 0.8448557430593359, "grad_norm": 0.2458695024251938, "learning_rate": 3.163852162725872e-05, "loss": 0.349, "mean_token_accuracy": 0.8980138152837753, "num_tokens": 1573505.0, "step": 388 }, { "entropy": 0.3188191279768944, "epoch": 0.8470332063146434, "grad_norm": 0.245536670088768, "learning_rate": 3.1328661637356714e-05, "loss": 0.3177, "mean_token_accuracy": 0.907622441649437, "num_tokens": 1577568.0, "step": 389 }, { "entropy": 0.3238792344927788, "epoch": 0.8492106695699511, "grad_norm": 0.24584944546222687, "learning_rate": 3.102270506285067e-05, "loss": 0.3085, "mean_token_accuracy": 0.9090628027915955, "num_tokens": 1581202.0, "step": 390 }, { "entropy": 0.34554795920848846, "epoch": 0.8513881328252586, "grad_norm": 0.24180692434310913, "learning_rate": 3.072066708430862e-05, "loss": 0.3203, "mean_token_accuracy": 0.9024082869291306, "num_tokens": 1585340.0, "step": 391 }, { "entropy": 0.31679805368185043, "epoch": 0.8535655960805661, "grad_norm": 0.23670694231987, "learning_rate": 3.042256268787063e-05, "loss": 0.2891, "mean_token_accuracy": 0.9171215295791626, "num_tokens": 1589570.0, "step": 392 }, { "entropy": 0.316896952688694, "epoch": 0.8557430593358737, "grad_norm": 0.26047396659851074, "learning_rate": 3.0128406664505215e-05, "loss": 0.3237, "mean_token_accuracy": 0.9058733284473419, "num_tokens": 1593421.0, "step": 393 }, { "entropy": 0.3199731484055519, "epoch": 0.8579205225911812, "grad_norm": 0.2323935478925705, "learning_rate": 2.9838213609275546e-05, "loss": 0.3018, "mean_token_accuracy": 0.9120573252439499, "num_tokens": 1597598.0, "step": 394 }, { "entropy": 0.29843273013830185, "epoch": 0.8600979858464889, "grad_norm": 0.2387438267469406, "learning_rate": 2.9551997920615187e-05, "loss": 0.2862, "mean_token_accuracy": 0.9175356030464172, "num_tokens": 1601591.0, "step": 395 }, { "entropy": 0.31333109736442566, "epoch": 0.8622754491017964, "grad_norm": 0.23580299317836761, "learning_rate": 2.926977379961374e-05, "loss": 0.3098, "mean_token_accuracy": 0.911782830953598, "num_tokens": 1606156.0, "step": 396 }, { "entropy": 0.32873860746622086, "epoch": 0.864452912357104, "grad_norm": 0.23804928362369537, "learning_rate": 2.899155524931224e-05, "loss": 0.3171, "mean_token_accuracy": 0.9060818552970886, "num_tokens": 1610215.0, "step": 397 }, { "entropy": 0.331471748650074, "epoch": 0.8666303756124115, "grad_norm": 0.22940973937511444, "learning_rate": 2.8717356074008345e-05, "loss": 0.3201, "mean_token_accuracy": 0.905473530292511, "num_tokens": 1614427.0, "step": 398 }, { "entropy": 0.33943046629428864, "epoch": 0.8688078388677191, "grad_norm": 0.24828903377056122, "learning_rate": 2.844718987857145e-05, "loss": 0.3408, "mean_token_accuracy": 0.8990557938814163, "num_tokens": 1618891.0, "step": 399 }, { "entropy": 0.33763300627470016, "epoch": 0.8709853021230267, "grad_norm": 0.25826534628868103, "learning_rate": 2.818107006776761e-05, "loss": 0.3195, "mean_token_accuracy": 0.9027258008718491, "num_tokens": 1622659.0, "step": 400 }, { "entropy": 0.29499682784080505, "epoch": 0.8731627653783343, "grad_norm": 0.22961440682411194, "learning_rate": 2.7919009845594502e-05, "loss": 0.2923, "mean_token_accuracy": 0.9152926355600357, "num_tokens": 1626858.0, "step": 401 }, { "entropy": 0.3353520557284355, "epoch": 0.8753402286336418, "grad_norm": 0.25194504857063293, "learning_rate": 2.7661022214626153e-05, "loss": 0.3207, "mean_token_accuracy": 0.9085413068532944, "num_tokens": 1630448.0, "step": 402 }, { "entropy": 0.29210612177848816, "epoch": 0.8775176918889493, "grad_norm": 0.2511427402496338, "learning_rate": 2.7407119975368006e-05, "loss": 0.2815, "mean_token_accuracy": 0.9171009808778763, "num_tokens": 1634411.0, "step": 403 }, { "entropy": 0.35340818017721176, "epoch": 0.8796951551442569, "grad_norm": 0.24676676094532013, "learning_rate": 2.7157315725621612e-05, "loss": 0.3692, "mean_token_accuracy": 0.905316099524498, "num_tokens": 1638404.0, "step": 404 }, { "entropy": 0.3412262871861458, "epoch": 0.8818726183995645, "grad_norm": 0.27478235960006714, "learning_rate": 2.6911621859859658e-05, "loss": 0.3472, "mean_token_accuracy": 0.90118607878685, "num_tokens": 1642162.0, "step": 405 }, { "entropy": 0.33481264114379883, "epoch": 0.8840500816548721, "grad_norm": 0.2933956980705261, "learning_rate": 2.6670050568610972e-05, "loss": 0.3248, "mean_token_accuracy": 0.9072499722242355, "num_tokens": 1646171.0, "step": 406 }, { "entropy": 0.3591442406177521, "epoch": 0.8862275449101796, "grad_norm": 0.21709908545017242, "learning_rate": 2.6432613837855658e-05, "loss": 0.3407, "mean_token_accuracy": 0.9071426689624786, "num_tokens": 1650504.0, "step": 407 }, { "entropy": 0.32970624417066574, "epoch": 0.8884050081654872, "grad_norm": 0.23687736690044403, "learning_rate": 2.6199323448430458e-05, "loss": 0.3135, "mean_token_accuracy": 0.903979942202568, "num_tokens": 1654507.0, "step": 408 }, { "entropy": 0.3415728807449341, "epoch": 0.8905824714207947, "grad_norm": 0.2553468644618988, "learning_rate": 2.597019097544409e-05, "loss": 0.3039, "mean_token_accuracy": 0.9025170505046844, "num_tokens": 1658421.0, "step": 409 }, { "entropy": 0.29549212008714676, "epoch": 0.8927599346761024, "grad_norm": 0.21464505791664124, "learning_rate": 2.574522778770308e-05, "loss": 0.2634, "mean_token_accuracy": 0.9200884401798248, "num_tokens": 1662809.0, "step": 410 }, { "entropy": 0.3326757438480854, "epoch": 0.8949373979314099, "grad_norm": 0.23331218957901, "learning_rate": 2.5524445047147567e-05, "loss": 0.319, "mean_token_accuracy": 0.900556892156601, "num_tokens": 1667221.0, "step": 411 }, { "entropy": 0.31935854256153107, "epoch": 0.8971148611867175, "grad_norm": 0.23457299172878265, "learning_rate": 2.5307853708297523e-05, "loss": 0.3045, "mean_token_accuracy": 0.9045213311910629, "num_tokens": 1671381.0, "step": 412 }, { "entropy": 0.3340509235858917, "epoch": 0.899292324442025, "grad_norm": 0.23886168003082275, "learning_rate": 2.5095464517709277e-05, "loss": 0.3264, "mean_token_accuracy": 0.899304986000061, "num_tokens": 1675656.0, "step": 413 }, { "entropy": 0.3181797042489052, "epoch": 0.9014697876973327, "grad_norm": 0.24742458760738373, "learning_rate": 2.4887288013442218e-05, "loss": 0.2988, "mean_token_accuracy": 0.9066351801156998, "num_tokens": 1679259.0, "step": 414 }, { "entropy": 0.3163676857948303, "epoch": 0.9036472509526402, "grad_norm": 0.25340980291366577, "learning_rate": 2.468333452453597e-05, "loss": 0.2979, "mean_token_accuracy": 0.9118978530168533, "num_tokens": 1683245.0, "step": 415 }, { "entropy": 0.30397678166627884, "epoch": 0.9058247142079477, "grad_norm": 0.2358277142047882, "learning_rate": 2.4483614170497916e-05, "loss": 0.2955, "mean_token_accuracy": 0.9145314395427704, "num_tokens": 1687531.0, "step": 416 }, { "entropy": 0.34245041757822037, "epoch": 0.9080021774632553, "grad_norm": 0.23215466737747192, "learning_rate": 2.4288136860801048e-05, "loss": 0.326, "mean_token_accuracy": 0.9006476998329163, "num_tokens": 1692172.0, "step": 417 }, { "entropy": 0.3470025435090065, "epoch": 0.9101796407185628, "grad_norm": 0.26786699891090393, "learning_rate": 2.409691229439239e-05, "loss": 0.3668, "mean_token_accuracy": 0.8918263465166092, "num_tokens": 1696141.0, "step": 418 }, { "entropy": 0.30502913892269135, "epoch": 0.9123571039738705, "grad_norm": 0.23780497908592224, "learning_rate": 2.3909949959211657e-05, "loss": 0.2906, "mean_token_accuracy": 0.9070711433887482, "num_tokens": 1700408.0, "step": 419 }, { "entropy": 0.3096166178584099, "epoch": 0.914534567229178, "grad_norm": 0.21969804167747498, "learning_rate": 2.372725913172055e-05, "loss": 0.32, "mean_token_accuracy": 0.9115228056907654, "num_tokens": 1704797.0, "step": 420 }, { "entropy": 0.30217302590608597, "epoch": 0.9167120304844856, "grad_norm": 0.23517285287380219, "learning_rate": 2.3548848876442465e-05, "loss": 0.2789, "mean_token_accuracy": 0.9120800346136093, "num_tokens": 1708762.0, "step": 421 }, { "entropy": 0.27675122022628784, "epoch": 0.9188894937397931, "grad_norm": 0.2593907415866852, "learning_rate": 2.337472804551281e-05, "loss": 0.2552, "mean_token_accuracy": 0.9166678935289383, "num_tokens": 1712763.0, "step": 422 }, { "entropy": 0.31945841014385223, "epoch": 0.9210669569951007, "grad_norm": 0.22665663063526154, "learning_rate": 2.320490527823968e-05, "loss": 0.322, "mean_token_accuracy": 0.9008611887693405, "num_tokens": 1717586.0, "step": 423 }, { "entropy": 0.28783877938985825, "epoch": 0.9232444202504083, "grad_norm": 0.2106105536222458, "learning_rate": 2.303938900067531e-05, "loss": 0.2571, "mean_token_accuracy": 0.9197226613759995, "num_tokens": 1722046.0, "step": 424 }, { "entropy": 0.31481262296438217, "epoch": 0.9254218835057159, "grad_norm": 0.24338746070861816, "learning_rate": 2.2878187425197893e-05, "loss": 0.3072, "mean_token_accuracy": 0.9047886729240417, "num_tokens": 1726207.0, "step": 425 }, { "entropy": 0.35147786885499954, "epoch": 0.9275993467610234, "grad_norm": 0.2515200078487396, "learning_rate": 2.272130855010421e-05, "loss": 0.3496, "mean_token_accuracy": 0.8965179175138474, "num_tokens": 1730155.0, "step": 426 }, { "entropy": 0.36182061582803726, "epoch": 0.929776810016331, "grad_norm": 0.2628372609615326, "learning_rate": 2.2568760159212745e-05, "loss": 0.3187, "mean_token_accuracy": 0.9001797884702682, "num_tokens": 1733927.0, "step": 427 }, { "entropy": 0.32963769882917404, "epoch": 0.9319542732716385, "grad_norm": 0.26346680521965027, "learning_rate": 2.2420549821477435e-05, "loss": 0.311, "mean_token_accuracy": 0.9040227830410004, "num_tokens": 1737774.0, "step": 428 }, { "entropy": 0.37061919271945953, "epoch": 0.9341317365269461, "grad_norm": 0.2579784691333771, "learning_rate": 2.227668489061219e-05, "loss": 0.3676, "mean_token_accuracy": 0.8960554301738739, "num_tokens": 1741942.0, "step": 429 }, { "entropy": 0.3078198730945587, "epoch": 0.9363091997822537, "grad_norm": 0.24415822327136993, "learning_rate": 2.2137172504725956e-05, "loss": 0.2881, "mean_token_accuracy": 0.912653386592865, "num_tokens": 1745914.0, "step": 430 }, { "entropy": 0.3296479806303978, "epoch": 0.9384866630375612, "grad_norm": 0.25575825572013855, "learning_rate": 2.2002019585968637e-05, "loss": 0.3096, "mean_token_accuracy": 0.9089950323104858, "num_tokens": 1749929.0, "step": 431 }, { "entropy": 0.3241398259997368, "epoch": 0.9406641262928688, "grad_norm": 0.2516978085041046, "learning_rate": 2.187123284018753e-05, "loss": 0.3186, "mean_token_accuracy": 0.9034547358751297, "num_tokens": 1753992.0, "step": 432 }, { "entropy": 0.3980755880475044, "epoch": 0.9428415895481764, "grad_norm": 0.24856629967689514, "learning_rate": 2.174481875659472e-05, "loss": 0.3749, "mean_token_accuracy": 0.8908516466617584, "num_tokens": 1758062.0, "step": 433 }, { "entropy": 0.3143734037876129, "epoch": 0.945019052803484, "grad_norm": 0.25844618678092957, "learning_rate": 2.1622783607444988e-05, "loss": 0.2784, "mean_token_accuracy": 0.922119140625, "num_tokens": 1761689.0, "step": 434 }, { "entropy": 0.3378266841173172, "epoch": 0.9471965160587915, "grad_norm": 0.24213889241218567, "learning_rate": 2.150513344772469e-05, "loss": 0.3155, "mean_token_accuracy": 0.9061428606510162, "num_tokens": 1766010.0, "step": 435 }, { "entropy": 0.35330820083618164, "epoch": 0.9493739793140991, "grad_norm": 0.2620498836040497, "learning_rate": 2.1391874114851294e-05, "loss": 0.3583, "mean_token_accuracy": 0.9004585295915604, "num_tokens": 1769801.0, "step": 436 }, { "entropy": 0.2881145551800728, "epoch": 0.9515514425694066, "grad_norm": 0.24421681463718414, "learning_rate": 2.128301122838377e-05, "loss": 0.3026, "mean_token_accuracy": 0.9104648381471634, "num_tokens": 1774342.0, "step": 437 }, { "entropy": 0.3526333123445511, "epoch": 0.9537289058247143, "grad_norm": 0.2302054911851883, "learning_rate": 2.117855018974369e-05, "loss": 0.3199, "mean_token_accuracy": 0.9067949205636978, "num_tokens": 1778412.0, "step": 438 }, { "entropy": 0.32418397441506386, "epoch": 0.9559063690800218, "grad_norm": 0.21741004288196564, "learning_rate": 2.107849618194735e-05, "loss": 0.3114, "mean_token_accuracy": 0.9031261652708054, "num_tokens": 1782995.0, "step": 439 }, { "entropy": 0.30877869576215744, "epoch": 0.9580838323353293, "grad_norm": 0.23063865303993225, "learning_rate": 2.0982854169348503e-05, "loss": 0.2949, "mean_token_accuracy": 0.9094719737768173, "num_tokens": 1787537.0, "step": 440 }, { "entropy": 0.3279525935649872, "epoch": 0.9602612955906369, "grad_norm": 0.2691234350204468, "learning_rate": 2.0891628897392087e-05, "loss": 0.345, "mean_token_accuracy": 0.8982786238193512, "num_tokens": 1791355.0, "step": 441 }, { "entropy": 0.3470368981361389, "epoch": 0.9624387588459444, "grad_norm": 0.26819464564323425, "learning_rate": 2.0804824892378765e-05, "loss": 0.3414, "mean_token_accuracy": 0.9030001610517502, "num_tokens": 1795467.0, "step": 442 }, { "entropy": 0.3493390902876854, "epoch": 0.9646162221012521, "grad_norm": 0.23444399237632751, "learning_rate": 2.0722446461240352e-05, "loss": 0.3442, "mean_token_accuracy": 0.8999157398939133, "num_tokens": 1800109.0, "step": 443 }, { "entropy": 0.3092813342809677, "epoch": 0.9667936853565596, "grad_norm": 0.23800377547740936, "learning_rate": 2.0644497691326106e-05, "loss": 0.2999, "mean_token_accuracy": 0.9111448973417282, "num_tokens": 1804018.0, "step": 444 }, { "entropy": 0.29666490107774734, "epoch": 0.9689711486118672, "grad_norm": 0.22874487936496735, "learning_rate": 2.0570982450199913e-05, "loss": 0.2858, "mean_token_accuracy": 0.9175421446561813, "num_tokens": 1808059.0, "step": 445 }, { "entropy": 0.3779358044266701, "epoch": 0.9711486118671747, "grad_norm": 0.2360084503889084, "learning_rate": 2.0501904385448447e-05, "loss": 0.3668, "mean_token_accuracy": 0.9037110358476639, "num_tokens": 1812165.0, "step": 446 }, { "entropy": 0.3430086299777031, "epoch": 0.9733260751224823, "grad_norm": 0.2596234679222107, "learning_rate": 2.043726692450014e-05, "loss": 0.3233, "mean_token_accuracy": 0.9003089815378189, "num_tokens": 1815708.0, "step": 447 }, { "entropy": 0.3329969719052315, "epoch": 0.9755035383777899, "grad_norm": 0.25411558151245117, "learning_rate": 2.037707327445511e-05, "loss": 0.3299, "mean_token_accuracy": 0.9008579254150391, "num_tokens": 1819635.0, "step": 448 }, { "entropy": 0.3378527835011482, "epoch": 0.9776810016330975, "grad_norm": 0.2512282431125641, "learning_rate": 2.0321326421926097e-05, "loss": 0.3325, "mean_token_accuracy": 0.9022142142057419, "num_tokens": 1823694.0, "step": 449 }, { "entropy": 0.34048717468976974, "epoch": 0.979858464888405, "grad_norm": 0.24113033711910248, "learning_rate": 2.0270029132890223e-05, "loss": 0.344, "mean_token_accuracy": 0.9008767306804657, "num_tokens": 1827735.0, "step": 450 }, { "entropy": 0.3050212487578392, "epoch": 0.9820359281437125, "grad_norm": 0.21851961314678192, "learning_rate": 2.0223183952551785e-05, "loss": 0.2795, "mean_token_accuracy": 0.917202040553093, "num_tokens": 1831884.0, "step": 451 }, { "entropy": 0.3319382965564728, "epoch": 0.9842133913990201, "grad_norm": 0.24525989592075348, "learning_rate": 2.018079320521593e-05, "loss": 0.3079, "mean_token_accuracy": 0.9144886583089828, "num_tokens": 1835507.0, "step": 452 }, { "entropy": 0.34535887837409973, "epoch": 0.9863908546543277, "grad_norm": 0.2506140172481537, "learning_rate": 2.0142858994173404e-05, "loss": 0.3436, "mean_token_accuracy": 0.9002240151166916, "num_tokens": 1839606.0, "step": 453 }, { "entropy": 0.3276618719100952, "epoch": 0.9885683179096353, "grad_norm": 0.2481948435306549, "learning_rate": 2.0109383201596102e-05, "loss": 0.3105, "mean_token_accuracy": 0.9108982384204865, "num_tokens": 1843500.0, "step": 454 }, { "entropy": 0.3270680084824562, "epoch": 0.9907457811649428, "grad_norm": 0.2625768780708313, "learning_rate": 2.0080367488443743e-05, "loss": 0.328, "mean_token_accuracy": 0.9026461988687515, "num_tokens": 1847739.0, "step": 455 }, { "entropy": 0.34614715725183487, "epoch": 0.9929232444202504, "grad_norm": 0.2605029046535492, "learning_rate": 2.0055813294381443e-05, "loss": 0.3467, "mean_token_accuracy": 0.9046141803264618, "num_tokens": 1851928.0, "step": 456 }, { "entropy": 0.284773550927639, "epoch": 0.995100707675558, "grad_norm": 0.22357277572155, "learning_rate": 2.00357218377083e-05, "loss": 0.2689, "mean_token_accuracy": 0.9219858795404434, "num_tokens": 1856283.0, "step": 457 }, { "entropy": 0.356322281062603, "epoch": 0.9972781709308656, "grad_norm": 0.23450958728790283, "learning_rate": 2.0020094115296876e-05, "loss": 0.3562, "mean_token_accuracy": 0.9017274230718613, "num_tokens": 1861007.0, "step": 458 }, { "entropy": 0.2814597636461258, "epoch": 0.9994556341861731, "grad_norm": 0.2359769642353058, "learning_rate": 2.0008930902543854e-05, "loss": 0.2653, "mean_token_accuracy": 0.9179674088954926, "num_tokens": 1865010.0, "step": 459 }, { "entropy": 0.42821022868156433, "epoch": 1.0, "grad_norm": 0.8799027800559998, "learning_rate": 2.0002232753331453e-05, "loss": 0.4353, "mean_token_accuracy": 0.8921568393707275, "num_tokens": 1865318.0, "step": 460 } ], "logging_steps": 1, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0112748518481592e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }