| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 460, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.991020917892456, | |
| "epoch": 0.002177463255307567, | |
| "grad_norm": 1.7277425527572632, | |
| "learning_rate": 0.0, | |
| "loss": 2.1366, | |
| "mean_token_accuracy": 0.628267303109169, | |
| "num_tokens": 3878.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 0.957662433385849, | |
| "epoch": 0.004354926510615134, | |
| "grad_norm": 2.072942018508911, | |
| "learning_rate": 1.4285714285714285e-05, | |
| "loss": 2.1489, | |
| "mean_token_accuracy": 0.6321403831243515, | |
| "num_tokens": 7754.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 0.9688078463077545, | |
| "epoch": 0.0065323897659227, | |
| "grad_norm": 1.712500810623169, | |
| "learning_rate": 2.857142857142857e-05, | |
| "loss": 2.1006, | |
| "mean_token_accuracy": 0.6395954489707947, | |
| "num_tokens": 11724.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 0.9446051567792892, | |
| "epoch": 0.008709853021230268, | |
| "grad_norm": 1.6249885559082031, | |
| "learning_rate": 4.2857142857142856e-05, | |
| "loss": 1.8636, | |
| "mean_token_accuracy": 0.6592330932617188, | |
| "num_tokens": 15998.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 0.9482486844062805, | |
| "epoch": 0.010887316276537834, | |
| "grad_norm": 1.2645702362060547, | |
| "learning_rate": 5.714285714285714e-05, | |
| "loss": 1.6381, | |
| "mean_token_accuracy": 0.6715894490480423, | |
| "num_tokens": 20034.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 0.8820638656616211, | |
| "epoch": 0.0130647795318454, | |
| "grad_norm": 0.9111854434013367, | |
| "learning_rate": 7.142857142857143e-05, | |
| "loss": 1.256, | |
| "mean_token_accuracy": 0.7338996976613998, | |
| "num_tokens": 24592.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 0.866950273513794, | |
| "epoch": 0.015242242787152967, | |
| "grad_norm": 0.6964920163154602, | |
| "learning_rate": 8.571428571428571e-05, | |
| "loss": 1.0385, | |
| "mean_token_accuracy": 0.7606792002916336, | |
| "num_tokens": 29247.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 0.9245865046977997, | |
| "epoch": 0.017419706042460535, | |
| "grad_norm": 0.6615565419197083, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9594, | |
| "mean_token_accuracy": 0.7808533608913422, | |
| "num_tokens": 33561.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 0.8866463452577591, | |
| "epoch": 0.0195971692977681, | |
| "grad_norm": 0.5024364590644836, | |
| "learning_rate": 0.00011428571428571428, | |
| "loss": 0.8709, | |
| "mean_token_accuracy": 0.7967555373907089, | |
| "num_tokens": 37956.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.8838604241609573, | |
| "epoch": 0.021774632553075667, | |
| "grad_norm": 0.637697696685791, | |
| "learning_rate": 0.00012857142857142858, | |
| "loss": 0.8448, | |
| "mean_token_accuracy": 0.7953355461359024, | |
| "num_tokens": 41607.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.8180341869592667, | |
| "epoch": 0.023952095808383235, | |
| "grad_norm": 0.5411834120750427, | |
| "learning_rate": 0.00014285714285714287, | |
| "loss": 0.7641, | |
| "mean_token_accuracy": 0.8057558983564377, | |
| "num_tokens": 45872.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.6423389464616776, | |
| "epoch": 0.0261295590636908, | |
| "grad_norm": 0.5807392597198486, | |
| "learning_rate": 0.00015714285714285716, | |
| "loss": 0.6364, | |
| "mean_token_accuracy": 0.8353168815374374, | |
| "num_tokens": 50197.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.7770279943943024, | |
| "epoch": 0.028307022318998367, | |
| "grad_norm": 0.602966845035553, | |
| "learning_rate": 0.00017142857142857143, | |
| "loss": 0.911, | |
| "mean_token_accuracy": 0.8115980476140976, | |
| "num_tokens": 55436.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.6030550897121429, | |
| "epoch": 0.030484485574305935, | |
| "grad_norm": 0.471264511346817, | |
| "learning_rate": 0.00018571428571428572, | |
| "loss": 0.6506, | |
| "mean_token_accuracy": 0.8220222592353821, | |
| "num_tokens": 59509.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.5797188133001328, | |
| "epoch": 0.0326619488296135, | |
| "grad_norm": 0.3981204628944397, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6439, | |
| "mean_token_accuracy": 0.8296175897121429, | |
| "num_tokens": 63811.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.5227785632014275, | |
| "epoch": 0.03483941208492107, | |
| "grad_norm": 0.3803451955318451, | |
| "learning_rate": 0.00019999776724666853, | |
| "loss": 0.5614, | |
| "mean_token_accuracy": 0.8536529093980789, | |
| "num_tokens": 67933.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.5339454486966133, | |
| "epoch": 0.037016875340228635, | |
| "grad_norm": 0.4023122489452362, | |
| "learning_rate": 0.00019999106909745614, | |
| "loss": 0.5768, | |
| "mean_token_accuracy": 0.8468181490898132, | |
| "num_tokens": 71929.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.5080433636903763, | |
| "epoch": 0.0391943385955362, | |
| "grad_norm": 0.359109103679657, | |
| "learning_rate": 0.0001999799058847031, | |
| "loss": 0.5158, | |
| "mean_token_accuracy": 0.8626691251993179, | |
| "num_tokens": 76116.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.49260225892066956, | |
| "epoch": 0.04137180185084377, | |
| "grad_norm": 0.34172919392585754, | |
| "learning_rate": 0.00019996427816229171, | |
| "loss": 0.5121, | |
| "mean_token_accuracy": 0.8724553287029266, | |
| "num_tokens": 80000.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.5065008923411369, | |
| "epoch": 0.043549265106151334, | |
| "grad_norm": 0.4033750295639038, | |
| "learning_rate": 0.00019994418670561857, | |
| "loss": 0.5636, | |
| "mean_token_accuracy": 0.8592322468757629, | |
| "num_tokens": 83682.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.54892348498106, | |
| "epoch": 0.0457267283614589, | |
| "grad_norm": 0.41379520297050476, | |
| "learning_rate": 0.00019991963251155627, | |
| "loss": 0.5693, | |
| "mean_token_accuracy": 0.8495212495326996, | |
| "num_tokens": 87684.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.4928950071334839, | |
| "epoch": 0.04790419161676647, | |
| "grad_norm": 0.3717893362045288, | |
| "learning_rate": 0.00019989061679840392, | |
| "loss": 0.523, | |
| "mean_token_accuracy": 0.8606368601322174, | |
| "num_tokens": 91550.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.5253347381949425, | |
| "epoch": 0.050081654872074034, | |
| "grad_norm": 0.3741125166416168, | |
| "learning_rate": 0.0001998571410058266, | |
| "loss": 0.5433, | |
| "mean_token_accuracy": 0.8630485236644745, | |
| "num_tokens": 95625.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.5028375387191772, | |
| "epoch": 0.0522591181273816, | |
| "grad_norm": 1.8555870056152344, | |
| "learning_rate": 0.00019981920679478407, | |
| "loss": 0.5296, | |
| "mean_token_accuracy": 0.8609876334667206, | |
| "num_tokens": 99517.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.5414893701672554, | |
| "epoch": 0.05443658138268917, | |
| "grad_norm": 0.44715237617492676, | |
| "learning_rate": 0.00019977681604744824, | |
| "loss": 0.5782, | |
| "mean_token_accuracy": 0.8441034108400345, | |
| "num_tokens": 103204.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.48021427541971207, | |
| "epoch": 0.056614044637996734, | |
| "grad_norm": 0.31098225712776184, | |
| "learning_rate": 0.0001997299708671098, | |
| "loss": 0.4932, | |
| "mean_token_accuracy": 0.8744789958000183, | |
| "num_tokens": 107327.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.46857017278671265, | |
| "epoch": 0.0587915078933043, | |
| "grad_norm": 0.3036307692527771, | |
| "learning_rate": 0.00019967867357807391, | |
| "loss": 0.4791, | |
| "mean_token_accuracy": 0.8786111921072006, | |
| "num_tokens": 111453.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.49031493067741394, | |
| "epoch": 0.06096897114861187, | |
| "grad_norm": 0.3337958753108978, | |
| "learning_rate": 0.00019962292672554493, | |
| "loss": 0.5018, | |
| "mean_token_accuracy": 0.8619510382413864, | |
| "num_tokens": 115266.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.4807809889316559, | |
| "epoch": 0.06314643440391943, | |
| "grad_norm": 0.35365355014801025, | |
| "learning_rate": 0.00019956273307549988, | |
| "loss": 0.4877, | |
| "mean_token_accuracy": 0.8618622571229935, | |
| "num_tokens": 118928.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.40949247032403946, | |
| "epoch": 0.065323897659227, | |
| "grad_norm": 0.3226538300514221, | |
| "learning_rate": 0.00019949809561455156, | |
| "loss": 0.4133, | |
| "mean_token_accuracy": 0.8882981538772583, | |
| "num_tokens": 122893.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.49030745029449463, | |
| "epoch": 0.06750136091453457, | |
| "grad_norm": 0.33420825004577637, | |
| "learning_rate": 0.0001994290175498001, | |
| "loss": 0.503, | |
| "mean_token_accuracy": 0.8634953200817108, | |
| "num_tokens": 127132.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.49527500569820404, | |
| "epoch": 0.06967882416984214, | |
| "grad_norm": 0.4112333655357361, | |
| "learning_rate": 0.00019935550230867392, | |
| "loss": 0.5067, | |
| "mean_token_accuracy": 0.8607686161994934, | |
| "num_tokens": 131100.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5203969404101372, | |
| "epoch": 0.0718562874251497, | |
| "grad_norm": 1.3927068710327148, | |
| "learning_rate": 0.00019927755353875965, | |
| "loss": 0.5942, | |
| "mean_token_accuracy": 0.8566101640462875, | |
| "num_tokens": 135503.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5023058727383614, | |
| "epoch": 0.07403375068045727, | |
| "grad_norm": 0.4079550802707672, | |
| "learning_rate": 0.00019919517510762124, | |
| "loss": 0.4961, | |
| "mean_token_accuracy": 0.8630523085594177, | |
| "num_tokens": 139771.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.4864235520362854, | |
| "epoch": 0.07621121393576484, | |
| "grad_norm": 0.39264485239982605, | |
| "learning_rate": 0.0001991083711026079, | |
| "loss": 0.5009, | |
| "mean_token_accuracy": 0.871365949511528, | |
| "num_tokens": 143980.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.525127612054348, | |
| "epoch": 0.0783886771910724, | |
| "grad_norm": 0.4305553734302521, | |
| "learning_rate": 0.00019901714583065152, | |
| "loss": 0.4872, | |
| "mean_token_accuracy": 0.8670255392789841, | |
| "num_tokens": 148059.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5707878470420837, | |
| "epoch": 0.08056614044637997, | |
| "grad_norm": 0.44129130244255066, | |
| "learning_rate": 0.00019892150381805267, | |
| "loss": 0.581, | |
| "mean_token_accuracy": 0.844414696097374, | |
| "num_tokens": 152230.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5001106485724449, | |
| "epoch": 0.08274360370168754, | |
| "grad_norm": 0.5918931365013123, | |
| "learning_rate": 0.00019882144981025633, | |
| "loss": 0.4751, | |
| "mean_token_accuracy": 0.8649907559156418, | |
| "num_tokens": 156252.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5312293991446495, | |
| "epoch": 0.0849210669569951, | |
| "grad_norm": 0.4835371971130371, | |
| "learning_rate": 0.00019871698877161627, | |
| "loss": 0.5091, | |
| "mean_token_accuracy": 0.871647521853447, | |
| "num_tokens": 160171.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.4701843932271004, | |
| "epoch": 0.08709853021230267, | |
| "grad_norm": 0.48571643233299255, | |
| "learning_rate": 0.0001986081258851487, | |
| "loss": 0.4495, | |
| "mean_token_accuracy": 0.8771228045225143, | |
| "num_tokens": 163975.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.4694196283817291, | |
| "epoch": 0.08927599346761024, | |
| "grad_norm": 0.4216046929359436, | |
| "learning_rate": 0.00019849486655227532, | |
| "loss": 0.4158, | |
| "mean_token_accuracy": 0.8735549598932266, | |
| "num_tokens": 168421.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.4697120413184166, | |
| "epoch": 0.0914534567229178, | |
| "grad_norm": 0.3664827346801758, | |
| "learning_rate": 0.000198377216392555, | |
| "loss": 0.4231, | |
| "mean_token_accuracy": 0.8784957528114319, | |
| "num_tokens": 172395.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.44586674869060516, | |
| "epoch": 0.09363091997822537, | |
| "grad_norm": 0.39455050230026245, | |
| "learning_rate": 0.00019825518124340529, | |
| "loss": 0.4166, | |
| "mean_token_accuracy": 0.8799059689044952, | |
| "num_tokens": 175967.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.4293370470404625, | |
| "epoch": 0.09580838323353294, | |
| "grad_norm": 0.39706796407699585, | |
| "learning_rate": 0.00019812876715981248, | |
| "loss": 0.4522, | |
| "mean_token_accuracy": 0.8723510503768921, | |
| "num_tokens": 180153.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.41641899943351746, | |
| "epoch": 0.0979858464888405, | |
| "grad_norm": 0.41735970973968506, | |
| "learning_rate": 0.00019799798041403137, | |
| "loss": 0.4436, | |
| "mean_token_accuracy": 0.8725763112306595, | |
| "num_tokens": 184161.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.40215710550546646, | |
| "epoch": 0.10016330974414807, | |
| "grad_norm": 0.44639289379119873, | |
| "learning_rate": 0.00019786282749527406, | |
| "loss": 0.4289, | |
| "mean_token_accuracy": 0.8803199082612991, | |
| "num_tokens": 187869.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.48222628980875015, | |
| "epoch": 0.10234077299945564, | |
| "grad_norm": 0.4197250306606293, | |
| "learning_rate": 0.00019772331510938782, | |
| "loss": 0.4861, | |
| "mean_token_accuracy": 0.8618861585855484, | |
| "num_tokens": 192020.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.49629800766706467, | |
| "epoch": 0.1045182362547632, | |
| "grad_norm": 0.5031387209892273, | |
| "learning_rate": 0.00019757945017852258, | |
| "loss": 0.4775, | |
| "mean_token_accuracy": 0.8681423515081406, | |
| "num_tokens": 195514.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.3977178856730461, | |
| "epoch": 0.10669569951007077, | |
| "grad_norm": 0.4578983783721924, | |
| "learning_rate": 0.0001974312398407873, | |
| "loss": 0.3673, | |
| "mean_token_accuracy": 0.8914825022220612, | |
| "num_tokens": 199234.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.3965229466557503, | |
| "epoch": 0.10887316276537834, | |
| "grad_norm": 0.37602174282073975, | |
| "learning_rate": 0.0001972786914498958, | |
| "loss": 0.3953, | |
| "mean_token_accuracy": 0.8783656060695648, | |
| "num_tokens": 203760.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.42161373794078827, | |
| "epoch": 0.1110506260206859, | |
| "grad_norm": 0.3125810921192169, | |
| "learning_rate": 0.00019712181257480212, | |
| "loss": 0.3754, | |
| "mean_token_accuracy": 0.8832796663045883, | |
| "num_tokens": 207439.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.4191659912467003, | |
| "epoch": 0.11322808927599347, | |
| "grad_norm": 0.32242998480796814, | |
| "learning_rate": 0.00019696061099932471, | |
| "loss": 0.3861, | |
| "mean_token_accuracy": 0.8820012956857681, | |
| "num_tokens": 211708.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.464703693985939, | |
| "epoch": 0.11540555253130104, | |
| "grad_norm": 0.4021685719490051, | |
| "learning_rate": 0.00019679509472176032, | |
| "loss": 0.4384, | |
| "mean_token_accuracy": 0.8743875622749329, | |
| "num_tokens": 215763.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.4165603965520859, | |
| "epoch": 0.1175830157866086, | |
| "grad_norm": 0.3444255590438843, | |
| "learning_rate": 0.00019662527195448722, | |
| "loss": 0.3991, | |
| "mean_token_accuracy": 0.88118776679039, | |
| "num_tokens": 220090.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.4068721905350685, | |
| "epoch": 0.11976047904191617, | |
| "grad_norm": 0.3705560564994812, | |
| "learning_rate": 0.00019645115112355754, | |
| "loss": 0.3707, | |
| "mean_token_accuracy": 0.882274329662323, | |
| "num_tokens": 223672.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.3627975210547447, | |
| "epoch": 0.12193794229722374, | |
| "grad_norm": 0.37365177273750305, | |
| "learning_rate": 0.00019627274086827948, | |
| "loss": 0.36, | |
| "mean_token_accuracy": 0.8874702304601669, | |
| "num_tokens": 227497.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.40359440445899963, | |
| "epoch": 0.1241154055525313, | |
| "grad_norm": 0.33996060490608215, | |
| "learning_rate": 0.00019609005004078838, | |
| "loss": 0.4253, | |
| "mean_token_accuracy": 0.8732311725616455, | |
| "num_tokens": 231293.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.36641839146614075, | |
| "epoch": 0.12629286880783885, | |
| "grad_norm": 0.2762836515903473, | |
| "learning_rate": 0.00019590308770560763, | |
| "loss": 0.3485, | |
| "mean_token_accuracy": 0.8926344960927963, | |
| "num_tokens": 236001.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.4077141284942627, | |
| "epoch": 0.12847033206314643, | |
| "grad_norm": 0.2915239930152893, | |
| "learning_rate": 0.00019571186313919895, | |
| "loss": 0.3942, | |
| "mean_token_accuracy": 0.8783977180719376, | |
| "num_tokens": 240264.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.4022030830383301, | |
| "epoch": 0.130647795318454, | |
| "grad_norm": 0.3684654235839844, | |
| "learning_rate": 0.00019551638582950213, | |
| "loss": 0.412, | |
| "mean_token_accuracy": 0.8735997825860977, | |
| "num_tokens": 243854.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.41812095791101456, | |
| "epoch": 0.13282525857376157, | |
| "grad_norm": 0.3383813500404358, | |
| "learning_rate": 0.00019531666547546403, | |
| "loss": 0.4302, | |
| "mean_token_accuracy": 0.8795482665300369, | |
| "num_tokens": 247268.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.38665496557950974, | |
| "epoch": 0.13500272182906914, | |
| "grad_norm": 0.31561279296875, | |
| "learning_rate": 0.0001951127119865578, | |
| "loss": 0.3844, | |
| "mean_token_accuracy": 0.8816228210926056, | |
| "num_tokens": 251256.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.4358583614230156, | |
| "epoch": 0.1371801850843767, | |
| "grad_norm": 0.3552601933479309, | |
| "learning_rate": 0.00019490453548229075, | |
| "loss": 0.4193, | |
| "mean_token_accuracy": 0.8728261440992355, | |
| "num_tokens": 255350.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.40031400322914124, | |
| "epoch": 0.13935764833968428, | |
| "grad_norm": 0.30350831151008606, | |
| "learning_rate": 0.00019469214629170246, | |
| "loss": 0.4005, | |
| "mean_token_accuracy": 0.8818740844726562, | |
| "num_tokens": 259391.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.3782212808728218, | |
| "epoch": 0.14153511159499182, | |
| "grad_norm": 0.2870739996433258, | |
| "learning_rate": 0.00019447555495285247, | |
| "loss": 0.3396, | |
| "mean_token_accuracy": 0.8948279619216919, | |
| "num_tokens": 263599.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.41549866646528244, | |
| "epoch": 0.1437125748502994, | |
| "grad_norm": 0.2995204031467438, | |
| "learning_rate": 0.00019425477221229694, | |
| "loss": 0.394, | |
| "mean_token_accuracy": 0.8853535056114197, | |
| "num_tokens": 267514.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.40607404708862305, | |
| "epoch": 0.14589003810560697, | |
| "grad_norm": 0.3016026020050049, | |
| "learning_rate": 0.00019402980902455592, | |
| "loss": 0.4006, | |
| "mean_token_accuracy": 0.8783000707626343, | |
| "num_tokens": 271156.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.3719393089413643, | |
| "epoch": 0.14806750136091454, | |
| "grad_norm": 0.26128438115119934, | |
| "learning_rate": 0.00019380067655156956, | |
| "loss": 0.3537, | |
| "mean_token_accuracy": 0.8965920209884644, | |
| "num_tokens": 275317.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.42157839983701706, | |
| "epoch": 0.1502449646162221, | |
| "grad_norm": 0.3250483572483063, | |
| "learning_rate": 0.00019356738616214435, | |
| "loss": 0.4115, | |
| "mean_token_accuracy": 0.8846541047096252, | |
| "num_tokens": 279424.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.4183052033185959, | |
| "epoch": 0.15242242787152968, | |
| "grad_norm": 0.315361887216568, | |
| "learning_rate": 0.00019332994943138906, | |
| "loss": 0.4148, | |
| "mean_token_accuracy": 0.8700041323900223, | |
| "num_tokens": 283564.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.40483858436346054, | |
| "epoch": 0.15459989112683722, | |
| "grad_norm": 0.31096142530441284, | |
| "learning_rate": 0.00019308837814014038, | |
| "loss": 0.3835, | |
| "mean_token_accuracy": 0.8849562704563141, | |
| "num_tokens": 287357.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.39035435765981674, | |
| "epoch": 0.1567773543821448, | |
| "grad_norm": 0.3067997097969055, | |
| "learning_rate": 0.0001928426842743784, | |
| "loss": 0.3846, | |
| "mean_token_accuracy": 0.8829791098833084, | |
| "num_tokens": 291390.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.3541962653398514, | |
| "epoch": 0.15895481763745237, | |
| "grad_norm": 0.27743661403656006, | |
| "learning_rate": 0.000192592880024632, | |
| "loss": 0.3279, | |
| "mean_token_accuracy": 0.8986150324344635, | |
| "num_tokens": 295446.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.4067593812942505, | |
| "epoch": 0.16113228089275994, | |
| "grad_norm": 0.2917785346508026, | |
| "learning_rate": 0.00019233897778537387, | |
| "loss": 0.4056, | |
| "mean_token_accuracy": 0.8775222897529602, | |
| "num_tokens": 299884.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.3865869492292404, | |
| "epoch": 0.1633097441480675, | |
| "grad_norm": 0.3175944685935974, | |
| "learning_rate": 0.00019208099015440553, | |
| "loss": 0.3947, | |
| "mean_token_accuracy": 0.8831316977739334, | |
| "num_tokens": 303679.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.42061641067266464, | |
| "epoch": 0.16548720740337508, | |
| "grad_norm": 0.29020923376083374, | |
| "learning_rate": 0.00019181892993223241, | |
| "loss": 0.424, | |
| "mean_token_accuracy": 0.8717161864042282, | |
| "num_tokens": 308028.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.3790237084031105, | |
| "epoch": 0.16766467065868262, | |
| "grad_norm": 0.28459441661834717, | |
| "learning_rate": 0.00019155281012142857, | |
| "loss": 0.3669, | |
| "mean_token_accuracy": 0.8902580589056015, | |
| "num_tokens": 312280.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.4007532522082329, | |
| "epoch": 0.1698421339139902, | |
| "grad_norm": 0.2907998263835907, | |
| "learning_rate": 0.00019128264392599166, | |
| "loss": 0.421, | |
| "mean_token_accuracy": 0.8734158575534821, | |
| "num_tokens": 316050.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.38431502133607864, | |
| "epoch": 0.17201959716929777, | |
| "grad_norm": 0.2705579102039337, | |
| "learning_rate": 0.00019100844475068777, | |
| "loss": 0.3687, | |
| "mean_token_accuracy": 0.8934948295354843, | |
| "num_tokens": 319866.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.4128147065639496, | |
| "epoch": 0.17419706042460534, | |
| "grad_norm": 0.3151399493217468, | |
| "learning_rate": 0.0001907302262003863, | |
| "loss": 0.3829, | |
| "mean_token_accuracy": 0.8834633827209473, | |
| "num_tokens": 323982.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.4086031913757324, | |
| "epoch": 0.1763745236799129, | |
| "grad_norm": 0.3054238557815552, | |
| "learning_rate": 0.00019044800207938483, | |
| "loss": 0.3987, | |
| "mean_token_accuracy": 0.8847066015005112, | |
| "num_tokens": 327984.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.3883258253335953, | |
| "epoch": 0.17855198693522048, | |
| "grad_norm": 0.29092952609062195, | |
| "learning_rate": 0.00019016178639072448, | |
| "loss": 0.3799, | |
| "mean_token_accuracy": 0.8958835899829865, | |
| "num_tokens": 331502.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.41453375667333603, | |
| "epoch": 0.18072945019052802, | |
| "grad_norm": 0.279079407453537, | |
| "learning_rate": 0.0001898715933354948, | |
| "loss": 0.4303, | |
| "mean_token_accuracy": 0.879971370100975, | |
| "num_tokens": 335369.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.395871065557003, | |
| "epoch": 0.1829069134458356, | |
| "grad_norm": 0.2992061972618103, | |
| "learning_rate": 0.0001895774373121294, | |
| "loss": 0.3933, | |
| "mean_token_accuracy": 0.8855740427970886, | |
| "num_tokens": 339407.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.352156363427639, | |
| "epoch": 0.18508437670114317, | |
| "grad_norm": 0.29319193959236145, | |
| "learning_rate": 0.00018927933291569142, | |
| "loss": 0.3458, | |
| "mean_token_accuracy": 0.8971658796072006, | |
| "num_tokens": 343524.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.3487248420715332, | |
| "epoch": 0.18726183995645074, | |
| "grad_norm": 0.2763819694519043, | |
| "learning_rate": 0.00018897729493714936, | |
| "loss": 0.3259, | |
| "mean_token_accuracy": 0.8960808515548706, | |
| "num_tokens": 347925.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.4102029874920845, | |
| "epoch": 0.1894393032117583, | |
| "grad_norm": 0.2646510601043701, | |
| "learning_rate": 0.00018867133836264333, | |
| "loss": 0.3945, | |
| "mean_token_accuracy": 0.8839164674282074, | |
| "num_tokens": 352250.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.3762153908610344, | |
| "epoch": 0.19161676646706588, | |
| "grad_norm": 0.3275756239891052, | |
| "learning_rate": 0.00018836147837274128, | |
| "loss": 0.3588, | |
| "mean_token_accuracy": 0.893315777182579, | |
| "num_tokens": 356538.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.3680166006088257, | |
| "epoch": 0.19379422972237342, | |
| "grad_norm": 0.3026663362979889, | |
| "learning_rate": 0.00018804773034168605, | |
| "loss": 0.346, | |
| "mean_token_accuracy": 0.8997195810079575, | |
| "num_tokens": 360352.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.3681929111480713, | |
| "epoch": 0.195971692977681, | |
| "grad_norm": 0.27409690618515015, | |
| "learning_rate": 0.00018773010983663235, | |
| "loss": 0.3619, | |
| "mean_token_accuracy": 0.8918221592903137, | |
| "num_tokens": 364359.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.41026338934898376, | |
| "epoch": 0.19814915623298857, | |
| "grad_norm": 0.27450209856033325, | |
| "learning_rate": 0.00018740863261687438, | |
| "loss": 0.3772, | |
| "mean_token_accuracy": 0.885251596570015, | |
| "num_tokens": 368184.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.41991668939590454, | |
| "epoch": 0.20032661948829614, | |
| "grad_norm": 0.3204193413257599, | |
| "learning_rate": 0.000187083314633064, | |
| "loss": 0.4387, | |
| "mean_token_accuracy": 0.877353847026825, | |
| "num_tokens": 372188.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.3829573169350624, | |
| "epoch": 0.2025040827436037, | |
| "grad_norm": 0.2948894500732422, | |
| "learning_rate": 0.00018675417202641928, | |
| "loss": 0.3713, | |
| "mean_token_accuracy": 0.8871684223413467, | |
| "num_tokens": 376175.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.37284964323043823, | |
| "epoch": 0.20468154599891128, | |
| "grad_norm": 0.3094096779823303, | |
| "learning_rate": 0.00018642122112792352, | |
| "loss": 0.3704, | |
| "mean_token_accuracy": 0.8872140049934387, | |
| "num_tokens": 380212.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.3658677488565445, | |
| "epoch": 0.20685900925421882, | |
| "grad_norm": 0.2979802191257477, | |
| "learning_rate": 0.00018608447845751521, | |
| "loss": 0.3491, | |
| "mean_token_accuracy": 0.8897504657506943, | |
| "num_tokens": 384295.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.36876438558101654, | |
| "epoch": 0.2090364725095264, | |
| "grad_norm": 0.2677754759788513, | |
| "learning_rate": 0.00018574396072326807, | |
| "loss": 0.3441, | |
| "mean_token_accuracy": 0.894922137260437, | |
| "num_tokens": 388732.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.3612924814224243, | |
| "epoch": 0.21121393576483397, | |
| "grad_norm": 0.2736094892024994, | |
| "learning_rate": 0.0001853996848205622, | |
| "loss": 0.3723, | |
| "mean_token_accuracy": 0.8909705579280853, | |
| "num_tokens": 392764.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.3905804604291916, | |
| "epoch": 0.21339139902014154, | |
| "grad_norm": 0.2624414265155792, | |
| "learning_rate": 0.0001850516678312458, | |
| "loss": 0.3891, | |
| "mean_token_accuracy": 0.8835895210504532, | |
| "num_tokens": 397014.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.3591335415840149, | |
| "epoch": 0.2155688622754491, | |
| "grad_norm": 0.27455052733421326, | |
| "learning_rate": 0.0001846999270227876, | |
| "loss": 0.3285, | |
| "mean_token_accuracy": 0.9014366716146469, | |
| "num_tokens": 400931.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.3889941945672035, | |
| "epoch": 0.21774632553075668, | |
| "grad_norm": 0.3075306713581085, | |
| "learning_rate": 0.00018434447984742012, | |
| "loss": 0.3748, | |
| "mean_token_accuracy": 0.8902212232351303, | |
| "num_tokens": 404953.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.40706127136945724, | |
| "epoch": 0.21992378878606422, | |
| "grad_norm": 0.291089802980423, | |
| "learning_rate": 0.00018398534394127366, | |
| "loss": 0.3842, | |
| "mean_token_accuracy": 0.8786927759647369, | |
| "num_tokens": 408846.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.3662910833954811, | |
| "epoch": 0.2221012520413718, | |
| "grad_norm": 0.2830312252044678, | |
| "learning_rate": 0.00018362253712350131, | |
| "loss": 0.3651, | |
| "mean_token_accuracy": 0.8856998383998871, | |
| "num_tokens": 413058.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.3981722518801689, | |
| "epoch": 0.22427871529667937, | |
| "grad_norm": 0.26717105507850647, | |
| "learning_rate": 0.00018325607739539497, | |
| "loss": 0.4013, | |
| "mean_token_accuracy": 0.881842851638794, | |
| "num_tokens": 417404.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.38402143120765686, | |
| "epoch": 0.22645617855198694, | |
| "grad_norm": 0.26284581422805786, | |
| "learning_rate": 0.00018288598293949185, | |
| "loss": 0.3933, | |
| "mean_token_accuracy": 0.8858134895563126, | |
| "num_tokens": 421886.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.35189586132764816, | |
| "epoch": 0.2286336418072945, | |
| "grad_norm": 0.2981458604335785, | |
| "learning_rate": 0.00018251227211867264, | |
| "loss": 0.3779, | |
| "mean_token_accuracy": 0.8904144316911697, | |
| "num_tokens": 426069.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.3991141989827156, | |
| "epoch": 0.23081110506260208, | |
| "grad_norm": 0.30855289101600647, | |
| "learning_rate": 0.0001821349634752502, | |
| "loss": 0.4118, | |
| "mean_token_accuracy": 0.875004380941391, | |
| "num_tokens": 430019.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.3846806064248085, | |
| "epoch": 0.23298856831790962, | |
| "grad_norm": 0.25153040885925293, | |
| "learning_rate": 0.00018175407573004974, | |
| "loss": 0.3944, | |
| "mean_token_accuracy": 0.8794781714677811, | |
| "num_tokens": 434787.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.38610684871673584, | |
| "epoch": 0.2351660315732172, | |
| "grad_norm": 0.25855541229248047, | |
| "learning_rate": 0.00018136962778147965, | |
| "loss": 0.3625, | |
| "mean_token_accuracy": 0.895257756114006, | |
| "num_tokens": 438762.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.38023480772972107, | |
| "epoch": 0.23734349482852476, | |
| "grad_norm": 0.26064959168434143, | |
| "learning_rate": 0.00018098163870459419, | |
| "loss": 0.3508, | |
| "mean_token_accuracy": 0.8982452154159546, | |
| "num_tokens": 442358.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.38109494745731354, | |
| "epoch": 0.23952095808383234, | |
| "grad_norm": 0.2560478746891022, | |
| "learning_rate": 0.00018059012775014673, | |
| "loss": 0.3316, | |
| "mean_token_accuracy": 0.8920884728431702, | |
| "num_tokens": 446375.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.40175357460975647, | |
| "epoch": 0.2416984213391399, | |
| "grad_norm": 0.2690741717815399, | |
| "learning_rate": 0.00018019511434363479, | |
| "loss": 0.3694, | |
| "mean_token_accuracy": 0.8843608647584915, | |
| "num_tokens": 450240.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.4437231123447418, | |
| "epoch": 0.24387588459444748, | |
| "grad_norm": 0.3393898606300354, | |
| "learning_rate": 0.00017979661808433615, | |
| "loss": 0.4375, | |
| "mean_token_accuracy": 0.8717398643493652, | |
| "num_tokens": 454162.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.39301927387714386, | |
| "epoch": 0.24605334784975502, | |
| "grad_norm": 0.26305022835731506, | |
| "learning_rate": 0.00017939465874433633, | |
| "loss": 0.3915, | |
| "mean_token_accuracy": 0.8859032839536667, | |
| "num_tokens": 458075.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.37585896253585815, | |
| "epoch": 0.2482308111050626, | |
| "grad_norm": 0.2808936536312103, | |
| "learning_rate": 0.0001789892562675477, | |
| "loss": 0.3808, | |
| "mean_token_accuracy": 0.8814007937908173, | |
| "num_tokens": 462440.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.35389212518930435, | |
| "epoch": 0.25040827436037016, | |
| "grad_norm": 0.2638992667198181, | |
| "learning_rate": 0.0001785804307687199, | |
| "loss": 0.3669, | |
| "mean_token_accuracy": 0.8885058760643005, | |
| "num_tokens": 466896.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.32084520161151886, | |
| "epoch": 0.2525857376156777, | |
| "grad_norm": 0.2875458896160126, | |
| "learning_rate": 0.00017816820253244156, | |
| "loss": 0.3393, | |
| "mean_token_accuracy": 0.8992051929235458, | |
| "num_tokens": 470737.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.37875620275735855, | |
| "epoch": 0.2547632008709853, | |
| "grad_norm": 0.3010421693325043, | |
| "learning_rate": 0.0001777525920121343, | |
| "loss": 0.3771, | |
| "mean_token_accuracy": 0.8866951763629913, | |
| "num_tokens": 474704.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.3695053979754448, | |
| "epoch": 0.25694066412629285, | |
| "grad_norm": 0.28365740180015564, | |
| "learning_rate": 0.0001773336198290375, | |
| "loss": 0.3606, | |
| "mean_token_accuracy": 0.8899102210998535, | |
| "num_tokens": 478684.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.37022798508405685, | |
| "epoch": 0.25911812738160045, | |
| "grad_norm": 0.2810768187046051, | |
| "learning_rate": 0.00017691130677118533, | |
| "loss": 0.371, | |
| "mean_token_accuracy": 0.8898769170045853, | |
| "num_tokens": 482795.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.3846744894981384, | |
| "epoch": 0.261295590636908, | |
| "grad_norm": 0.2767440974712372, | |
| "learning_rate": 0.00017648567379237524, | |
| "loss": 0.3858, | |
| "mean_token_accuracy": 0.8894098848104477, | |
| "num_tokens": 486910.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.36647915840148926, | |
| "epoch": 0.2634730538922156, | |
| "grad_norm": 0.29192766547203064, | |
| "learning_rate": 0.00017605674201112844, | |
| "loss": 0.3532, | |
| "mean_token_accuracy": 0.8931601047515869, | |
| "num_tokens": 490909.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.3607020005583763, | |
| "epoch": 0.26565051714752314, | |
| "grad_norm": 0.27455756068229675, | |
| "learning_rate": 0.00017562453270964184, | |
| "loss": 0.3376, | |
| "mean_token_accuracy": 0.8977847099304199, | |
| "num_tokens": 494900.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.39875783771276474, | |
| "epoch": 0.2678279804028307, | |
| "grad_norm": 0.29144948720932007, | |
| "learning_rate": 0.0001751890673327323, | |
| "loss": 0.3625, | |
| "mean_token_accuracy": 0.8899316191673279, | |
| "num_tokens": 498621.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.388169527053833, | |
| "epoch": 0.2700054436581383, | |
| "grad_norm": 0.28327831625938416, | |
| "learning_rate": 0.00017475036748677253, | |
| "loss": 0.368, | |
| "mean_token_accuracy": 0.8881956189870834, | |
| "num_tokens": 502604.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.42279627174139023, | |
| "epoch": 0.2721829069134458, | |
| "grad_norm": 0.2637234330177307, | |
| "learning_rate": 0.00017430845493861903, | |
| "loss": 0.4163, | |
| "mean_token_accuracy": 0.8793482929468155, | |
| "num_tokens": 506851.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.3659377843141556, | |
| "epoch": 0.2743603701687534, | |
| "grad_norm": 0.2649920582771301, | |
| "learning_rate": 0.00017386335161453204, | |
| "loss": 0.3592, | |
| "mean_token_accuracy": 0.8870955407619476, | |
| "num_tokens": 511029.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.3424355015158653, | |
| "epoch": 0.27653783342406096, | |
| "grad_norm": 0.24584396183490753, | |
| "learning_rate": 0.00017341507959908788, | |
| "loss": 0.3212, | |
| "mean_token_accuracy": 0.8989846706390381, | |
| "num_tokens": 514975.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.38080035150051117, | |
| "epoch": 0.27871529667936856, | |
| "grad_norm": 0.2918618321418762, | |
| "learning_rate": 0.00017296366113408283, | |
| "loss": 0.3836, | |
| "mean_token_accuracy": 0.8840546309947968, | |
| "num_tokens": 518603.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.37054024636745453, | |
| "epoch": 0.2808927599346761, | |
| "grad_norm": 0.2792854309082031, | |
| "learning_rate": 0.00017250911861742984, | |
| "loss": 0.383, | |
| "mean_token_accuracy": 0.8847608417272568, | |
| "num_tokens": 522974.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.4149508401751518, | |
| "epoch": 0.28307022318998365, | |
| "grad_norm": 0.2900242805480957, | |
| "learning_rate": 0.00017205147460204708, | |
| "loss": 0.4176, | |
| "mean_token_accuracy": 0.8743131309747696, | |
| "num_tokens": 527053.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.3568470776081085, | |
| "epoch": 0.28524768644529125, | |
| "grad_norm": 0.2806275188922882, | |
| "learning_rate": 0.00017159075179473904, | |
| "loss": 0.3506, | |
| "mean_token_accuracy": 0.8944987952709198, | |
| "num_tokens": 531165.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.3553621917963028, | |
| "epoch": 0.2874251497005988, | |
| "grad_norm": 0.25992849469184875, | |
| "learning_rate": 0.00017112697305506972, | |
| "loss": 0.3473, | |
| "mean_token_accuracy": 0.8974603414535522, | |
| "num_tokens": 535268.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.350556381046772, | |
| "epoch": 0.2896026129559064, | |
| "grad_norm": 0.255686491727829, | |
| "learning_rate": 0.00017066016139422868, | |
| "loss": 0.3428, | |
| "mean_token_accuracy": 0.8938136249780655, | |
| "num_tokens": 539608.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.3975898027420044, | |
| "epoch": 0.29178007621121393, | |
| "grad_norm": 0.2862681746482849, | |
| "learning_rate": 0.00017019033997388893, | |
| "loss": 0.3852, | |
| "mean_token_accuracy": 0.8919837325811386, | |
| "num_tokens": 543509.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.3602987751364708, | |
| "epoch": 0.2939575394665215, | |
| "grad_norm": 0.2506209909915924, | |
| "learning_rate": 0.00016971753210505815, | |
| "loss": 0.3512, | |
| "mean_token_accuracy": 0.8999500423669815, | |
| "num_tokens": 548201.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.36172477155923843, | |
| "epoch": 0.2961350027218291, | |
| "grad_norm": 0.24992506206035614, | |
| "learning_rate": 0.00016924176124692171, | |
| "loss": 0.3296, | |
| "mean_token_accuracy": 0.9002155065536499, | |
| "num_tokens": 552588.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.39114704728126526, | |
| "epoch": 0.2983124659771366, | |
| "grad_norm": 0.26535582542419434, | |
| "learning_rate": 0.00016876305100567898, | |
| "loss": 0.3606, | |
| "mean_token_accuracy": 0.8913624733686447, | |
| "num_tokens": 556684.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.3595954030752182, | |
| "epoch": 0.3004899292324442, | |
| "grad_norm": 0.2526366114616394, | |
| "learning_rate": 0.0001682814251333718, | |
| "loss": 0.3524, | |
| "mean_token_accuracy": 0.8964285999536514, | |
| "num_tokens": 560872.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.3456057384610176, | |
| "epoch": 0.30266739248775176, | |
| "grad_norm": 0.2838667631149292, | |
| "learning_rate": 0.0001677969075267062, | |
| "loss": 0.3598, | |
| "mean_token_accuracy": 0.8893538117408752, | |
| "num_tokens": 565414.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.3304522782564163, | |
| "epoch": 0.30484485574305936, | |
| "grad_norm": 0.2537218928337097, | |
| "learning_rate": 0.00016730952222586672, | |
| "loss": 0.3252, | |
| "mean_token_accuracy": 0.9008310884237289, | |
| "num_tokens": 569961.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.37971338629722595, | |
| "epoch": 0.3070223189983669, | |
| "grad_norm": 0.2846769392490387, | |
| "learning_rate": 0.00016681929341332333, | |
| "loss": 0.3812, | |
| "mean_token_accuracy": 0.8877308219671249, | |
| "num_tokens": 573882.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.32383736968040466, | |
| "epoch": 0.30919978225367445, | |
| "grad_norm": 0.30265504121780396, | |
| "learning_rate": 0.00016632624541263193, | |
| "loss": 0.3259, | |
| "mean_token_accuracy": 0.8970090597867966, | |
| "num_tokens": 577860.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.4320111721754074, | |
| "epoch": 0.31137724550898205, | |
| "grad_norm": 0.2903831899166107, | |
| "learning_rate": 0.0001658304026872274, | |
| "loss": 0.4118, | |
| "mean_token_accuracy": 0.8787370920181274, | |
| "num_tokens": 581333.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.372535839676857, | |
| "epoch": 0.3135547087642896, | |
| "grad_norm": 0.26929277181625366, | |
| "learning_rate": 0.00016533178983920964, | |
| "loss": 0.3555, | |
| "mean_token_accuracy": 0.8883365392684937, | |
| "num_tokens": 585459.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.38039466738700867, | |
| "epoch": 0.3157321720195972, | |
| "grad_norm": 0.2679445743560791, | |
| "learning_rate": 0.00016483043160812295, | |
| "loss": 0.3633, | |
| "mean_token_accuracy": 0.8902519345283508, | |
| "num_tokens": 589257.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.42324574291706085, | |
| "epoch": 0.31790963527490473, | |
| "grad_norm": 0.2745194137096405, | |
| "learning_rate": 0.0001643263528697288, | |
| "loss": 0.4154, | |
| "mean_token_accuracy": 0.878746971487999, | |
| "num_tokens": 593457.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.46310587227344513, | |
| "epoch": 0.3200870985302123, | |
| "grad_norm": 0.2937363088130951, | |
| "learning_rate": 0.0001638195786347712, | |
| "loss": 0.4564, | |
| "mean_token_accuracy": 0.8730504065752029, | |
| "num_tokens": 596979.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.3750259429216385, | |
| "epoch": 0.3222645617855199, | |
| "grad_norm": 0.24124816060066223, | |
| "learning_rate": 0.00016331013404773597, | |
| "loss": 0.3568, | |
| "mean_token_accuracy": 0.8933057188987732, | |
| "num_tokens": 601388.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.37991973757743835, | |
| "epoch": 0.3244420250408274, | |
| "grad_norm": 0.27898603677749634, | |
| "learning_rate": 0.00016279804438560304, | |
| "loss": 0.3518, | |
| "mean_token_accuracy": 0.8888091742992401, | |
| "num_tokens": 605267.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.38875921070575714, | |
| "epoch": 0.326619488296135, | |
| "grad_norm": 0.2823559641838074, | |
| "learning_rate": 0.00016228333505659246, | |
| "loss": 0.376, | |
| "mean_token_accuracy": 0.8856324106454849, | |
| "num_tokens": 609434.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.3876258060336113, | |
| "epoch": 0.32879695155144256, | |
| "grad_norm": 0.2898506224155426, | |
| "learning_rate": 0.00016176603159890346, | |
| "loss": 0.376, | |
| "mean_token_accuracy": 0.8831023424863815, | |
| "num_tokens": 613396.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.3707014173269272, | |
| "epoch": 0.33097441480675016, | |
| "grad_norm": 0.2642916142940521, | |
| "learning_rate": 0.00016124615967944762, | |
| "loss": 0.3752, | |
| "mean_token_accuracy": 0.8911104500293732, | |
| "num_tokens": 617399.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.3736526593565941, | |
| "epoch": 0.3331518780620577, | |
| "grad_norm": 0.3004290461540222, | |
| "learning_rate": 0.00016072374509257516, | |
| "loss": 0.3808, | |
| "mean_token_accuracy": 0.8887975662946701, | |
| "num_tokens": 621104.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.35118088871240616, | |
| "epoch": 0.33532934131736525, | |
| "grad_norm": 0.26038020849227905, | |
| "learning_rate": 0.0001601988137587952, | |
| "loss": 0.3382, | |
| "mean_token_accuracy": 0.8998311161994934, | |
| "num_tokens": 625151.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.38535889238119125, | |
| "epoch": 0.33750680457267285, | |
| "grad_norm": 0.2737407088279724, | |
| "learning_rate": 0.00015967139172348954, | |
| "loss": 0.3913, | |
| "mean_token_accuracy": 0.8854628801345825, | |
| "num_tokens": 628964.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.38133371621370316, | |
| "epoch": 0.3396842678279804, | |
| "grad_norm": 0.27977254986763, | |
| "learning_rate": 0.00015914150515562055, | |
| "loss": 0.3794, | |
| "mean_token_accuracy": 0.8869093209505081, | |
| "num_tokens": 632846.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.37492088973522186, | |
| "epoch": 0.341861731083288, | |
| "grad_norm": 0.2831854224205017, | |
| "learning_rate": 0.00015860918034643276, | |
| "loss": 0.355, | |
| "mean_token_accuracy": 0.8947048038244247, | |
| "num_tokens": 636601.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.4035057872533798, | |
| "epoch": 0.34403919433859553, | |
| "grad_norm": 0.37472277879714966, | |
| "learning_rate": 0.00015807444370814815, | |
| "loss": 0.3954, | |
| "mean_token_accuracy": 0.8825927823781967, | |
| "num_tokens": 640518.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.34154055267572403, | |
| "epoch": 0.3462166575939031, | |
| "grad_norm": 0.27869144082069397, | |
| "learning_rate": 0.00015753732177265582, | |
| "loss": 0.3376, | |
| "mean_token_accuracy": 0.8913106769323349, | |
| "num_tokens": 644858.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.41696153581142426, | |
| "epoch": 0.3483941208492107, | |
| "grad_norm": 0.291029155254364, | |
| "learning_rate": 0.00015699784119019554, | |
| "loss": 0.3964, | |
| "mean_token_accuracy": 0.8756668865680695, | |
| "num_tokens": 648735.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.3924735262989998, | |
| "epoch": 0.3505715841045182, | |
| "grad_norm": 0.28552576899528503, | |
| "learning_rate": 0.00015645602872803554, | |
| "loss": 0.3852, | |
| "mean_token_accuracy": 0.8868783414363861, | |
| "num_tokens": 652408.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.34768833965063095, | |
| "epoch": 0.3527490473598258, | |
| "grad_norm": 0.2506498098373413, | |
| "learning_rate": 0.00015591191126914424, | |
| "loss": 0.3351, | |
| "mean_token_accuracy": 0.8980260044336319, | |
| "num_tokens": 656844.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.3891329765319824, | |
| "epoch": 0.35492651061513336, | |
| "grad_norm": 0.30480027198791504, | |
| "learning_rate": 0.0001553655158108565, | |
| "loss": 0.4034, | |
| "mean_token_accuracy": 0.8790914118289948, | |
| "num_tokens": 661184.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.4067026600241661, | |
| "epoch": 0.35710397387044096, | |
| "grad_norm": 0.27617979049682617, | |
| "learning_rate": 0.00015481686946353413, | |
| "loss": 0.4081, | |
| "mean_token_accuracy": 0.8769482225179672, | |
| "num_tokens": 665163.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.4310021921992302, | |
| "epoch": 0.3592814371257485, | |
| "grad_norm": 0.2954219877719879, | |
| "learning_rate": 0.00015426599944922062, | |
| "loss": 0.4193, | |
| "mean_token_accuracy": 0.8807303011417389, | |
| "num_tokens": 669177.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.37181543558835983, | |
| "epoch": 0.36145890038105605, | |
| "grad_norm": 0.2674584984779358, | |
| "learning_rate": 0.0001537129331002907, | |
| "loss": 0.3423, | |
| "mean_token_accuracy": 0.8933178037405014, | |
| "num_tokens": 672660.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.36294087767601013, | |
| "epoch": 0.36363636363636365, | |
| "grad_norm": 0.2539677321910858, | |
| "learning_rate": 0.00015315769785809394, | |
| "loss": 0.3419, | |
| "mean_token_accuracy": 0.8953043073415756, | |
| "num_tokens": 676937.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.36527111381292343, | |
| "epoch": 0.3658138268916712, | |
| "grad_norm": 0.279691219329834, | |
| "learning_rate": 0.0001526003212715934, | |
| "loss": 0.3689, | |
| "mean_token_accuracy": 0.8915591537952423, | |
| "num_tokens": 680798.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.32713668793439865, | |
| "epoch": 0.3679912901469788, | |
| "grad_norm": 0.2610296308994293, | |
| "learning_rate": 0.00015204083099599862, | |
| "loss": 0.3398, | |
| "mean_token_accuracy": 0.8963142186403275, | |
| "num_tokens": 685386.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.35941240191459656, | |
| "epoch": 0.37016875340228633, | |
| "grad_norm": 0.26744726300239563, | |
| "learning_rate": 0.00015147925479139357, | |
| "loss": 0.3543, | |
| "mean_token_accuracy": 0.8914755284786224, | |
| "num_tokens": 689455.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.3640653118491173, | |
| "epoch": 0.3723462166575939, | |
| "grad_norm": 0.2773352861404419, | |
| "learning_rate": 0.00015091562052135912, | |
| "loss": 0.3822, | |
| "mean_token_accuracy": 0.8882244229316711, | |
| "num_tokens": 693956.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.37736089527606964, | |
| "epoch": 0.3745236799129015, | |
| "grad_norm": 0.2925175130367279, | |
| "learning_rate": 0.00015034995615159074, | |
| "loss": 0.3628, | |
| "mean_token_accuracy": 0.889089897274971, | |
| "num_tokens": 697863.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.37925824522972107, | |
| "epoch": 0.376701143168209, | |
| "grad_norm": 0.2618020474910736, | |
| "learning_rate": 0.00014978228974851077, | |
| "loss": 0.3624, | |
| "mean_token_accuracy": 0.8942320197820663, | |
| "num_tokens": 701537.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.34706228971481323, | |
| "epoch": 0.3788786064235166, | |
| "grad_norm": 0.2923741340637207, | |
| "learning_rate": 0.000149212649477876, | |
| "loss": 0.3541, | |
| "mean_token_accuracy": 0.8954867422580719, | |
| "num_tokens": 705253.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.3569258749485016, | |
| "epoch": 0.38105606967882416, | |
| "grad_norm": 0.2816322147846222, | |
| "learning_rate": 0.00014864106360337992, | |
| "loss": 0.357, | |
| "mean_token_accuracy": 0.8935216814279556, | |
| "num_tokens": 709276.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.35546237230300903, | |
| "epoch": 0.38323353293413176, | |
| "grad_norm": 0.2701316773891449, | |
| "learning_rate": 0.00014806756048525073, | |
| "loss": 0.3423, | |
| "mean_token_accuracy": 0.9047370553016663, | |
| "num_tokens": 713489.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.38647014647722244, | |
| "epoch": 0.3854109961894393, | |
| "grad_norm": 0.2974873185157776, | |
| "learning_rate": 0.00014749216857884388, | |
| "loss": 0.3698, | |
| "mean_token_accuracy": 0.8884487450122833, | |
| "num_tokens": 717582.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.41117021441459656, | |
| "epoch": 0.38758845944474685, | |
| "grad_norm": 0.46910688281059265, | |
| "learning_rate": 0.0001469149164332304, | |
| "loss": 0.3913, | |
| "mean_token_accuracy": 0.8818454891443253, | |
| "num_tokens": 721522.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.3503909111022949, | |
| "epoch": 0.38976592270005445, | |
| "grad_norm": 0.24447594583034515, | |
| "learning_rate": 0.00014633583268978037, | |
| "loss": 0.3159, | |
| "mean_token_accuracy": 0.9022247046232224, | |
| "num_tokens": 725345.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.34674597531557083, | |
| "epoch": 0.391943385955362, | |
| "grad_norm": 0.25831112265586853, | |
| "learning_rate": 0.00014575494608074166, | |
| "loss": 0.3403, | |
| "mean_token_accuracy": 0.8952628076076508, | |
| "num_tokens": 729377.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.32907338812947273, | |
| "epoch": 0.3941208492106696, | |
| "grad_norm": 0.25881391763687134, | |
| "learning_rate": 0.0001451722854278146, | |
| "loss": 0.3039, | |
| "mean_token_accuracy": 0.9026439040899277, | |
| "num_tokens": 733265.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.35795633494853973, | |
| "epoch": 0.39629831246597713, | |
| "grad_norm": 0.28063708543777466, | |
| "learning_rate": 0.00014458787964072165, | |
| "loss": 0.3381, | |
| "mean_token_accuracy": 0.8983410447835922, | |
| "num_tokens": 737131.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.33193762600421906, | |
| "epoch": 0.39847577572128473, | |
| "grad_norm": 0.29431116580963135, | |
| "learning_rate": 0.00014400175771577326, | |
| "loss": 0.3225, | |
| "mean_token_accuracy": 0.9057250618934631, | |
| "num_tokens": 740821.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.31135137379169464, | |
| "epoch": 0.4006532389765923, | |
| "grad_norm": 0.29750552773475647, | |
| "learning_rate": 0.00014341394873442897, | |
| "loss": 0.3264, | |
| "mean_token_accuracy": 0.8973560929298401, | |
| "num_tokens": 744896.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.3354290798306465, | |
| "epoch": 0.4028307022318998, | |
| "grad_norm": 0.27261385321617126, | |
| "learning_rate": 0.0001428244818618546, | |
| "loss": 0.3427, | |
| "mean_token_accuracy": 0.8985736221075058, | |
| "num_tokens": 748839.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.3166900649666786, | |
| "epoch": 0.4050081654872074, | |
| "grad_norm": 0.27092301845550537, | |
| "learning_rate": 0.0001422333863454751, | |
| "loss": 0.3087, | |
| "mean_token_accuracy": 0.9003172963857651, | |
| "num_tokens": 752819.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.3550329655408859, | |
| "epoch": 0.40718562874251496, | |
| "grad_norm": 0.27660685777664185, | |
| "learning_rate": 0.0001416406915135235, | |
| "loss": 0.3544, | |
| "mean_token_accuracy": 0.8941550552845001, | |
| "num_tokens": 756769.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.3845446854829788, | |
| "epoch": 0.40936309199782256, | |
| "grad_norm": 0.3029703199863434, | |
| "learning_rate": 0.00014104642677358547, | |
| "loss": 0.3864, | |
| "mean_token_accuracy": 0.8840687274932861, | |
| "num_tokens": 760466.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.3692278042435646, | |
| "epoch": 0.4115405552531301, | |
| "grad_norm": 0.2795009911060333, | |
| "learning_rate": 0.00014045062161114065, | |
| "loss": 0.3618, | |
| "mean_token_accuracy": 0.8954125195741653, | |
| "num_tokens": 764627.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.34045620262622833, | |
| "epoch": 0.41371801850843765, | |
| "grad_norm": 0.2698828876018524, | |
| "learning_rate": 0.00013985330558809918, | |
| "loss": 0.3225, | |
| "mean_token_accuracy": 0.8965429812669754, | |
| "num_tokens": 768901.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.3410160765051842, | |
| "epoch": 0.41589548176374525, | |
| "grad_norm": 0.25038790702819824, | |
| "learning_rate": 0.00013925450834133542, | |
| "loss": 0.3253, | |
| "mean_token_accuracy": 0.9037521332502365, | |
| "num_tokens": 773052.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.36402270942926407, | |
| "epoch": 0.4180729450190528, | |
| "grad_norm": 0.2695653736591339, | |
| "learning_rate": 0.00013865425958121697, | |
| "loss": 0.3614, | |
| "mean_token_accuracy": 0.8942222446203232, | |
| "num_tokens": 776826.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.31327100098133087, | |
| "epoch": 0.4202504082743604, | |
| "grad_norm": 0.2406344711780548, | |
| "learning_rate": 0.00013805258909013095, | |
| "loss": 0.2927, | |
| "mean_token_accuracy": 0.9095935225486755, | |
| "num_tokens": 781250.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.37202536314725876, | |
| "epoch": 0.42242787152966793, | |
| "grad_norm": 0.30606889724731445, | |
| "learning_rate": 0.00013744952672100613, | |
| "loss": 0.3924, | |
| "mean_token_accuracy": 0.8838685899972916, | |
| "num_tokens": 785238.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.3558414503931999, | |
| "epoch": 0.42460533478497553, | |
| "grad_norm": 0.24589793384075165, | |
| "learning_rate": 0.00013684510239583166, | |
| "loss": 0.344, | |
| "mean_token_accuracy": 0.896059587597847, | |
| "num_tokens": 789796.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.37479735910892487, | |
| "epoch": 0.4267827980402831, | |
| "grad_norm": 0.25714266300201416, | |
| "learning_rate": 0.0001362393461041726, | |
| "loss": 0.3708, | |
| "mean_token_accuracy": 0.8902730643749237, | |
| "num_tokens": 794040.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.356051467359066, | |
| "epoch": 0.4289602612955906, | |
| "grad_norm": 0.27870944142341614, | |
| "learning_rate": 0.00013563228790168178, | |
| "loss": 0.3551, | |
| "mean_token_accuracy": 0.8951977044343948, | |
| "num_tokens": 798230.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.3533203676342964, | |
| "epoch": 0.4311377245508982, | |
| "grad_norm": 0.2748214602470398, | |
| "learning_rate": 0.00013502395790860862, | |
| "loss": 0.3345, | |
| "mean_token_accuracy": 0.8976791948080063, | |
| "num_tokens": 802137.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.404046893119812, | |
| "epoch": 0.43331518780620576, | |
| "grad_norm": 0.2737223505973816, | |
| "learning_rate": 0.00013441438630830464, | |
| "loss": 0.4053, | |
| "mean_token_accuracy": 0.8848972916603088, | |
| "num_tokens": 806240.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.3257349133491516, | |
| "epoch": 0.43549265106151336, | |
| "grad_norm": 0.28284040093421936, | |
| "learning_rate": 0.0001338036033457259, | |
| "loss": 0.3047, | |
| "mean_token_accuracy": 0.9047138094902039, | |
| "num_tokens": 809920.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.3515155389904976, | |
| "epoch": 0.4376701143168209, | |
| "grad_norm": 0.2601410746574402, | |
| "learning_rate": 0.00013319163932593226, | |
| "loss": 0.3389, | |
| "mean_token_accuracy": 0.8959746956825256, | |
| "num_tokens": 813888.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.35355835407972336, | |
| "epoch": 0.43984757757212845, | |
| "grad_norm": 0.28591784834861755, | |
| "learning_rate": 0.0001325785246125838, | |
| "loss": 0.3629, | |
| "mean_token_accuracy": 0.8906663358211517, | |
| "num_tokens": 817940.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.36141665279865265, | |
| "epoch": 0.44202504082743604, | |
| "grad_norm": 0.27857449650764465, | |
| "learning_rate": 0.00013196428962643426, | |
| "loss": 0.3418, | |
| "mean_token_accuracy": 0.8927578181028366, | |
| "num_tokens": 822014.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.4061436876654625, | |
| "epoch": 0.4442025040827436, | |
| "grad_norm": 0.2518883943557739, | |
| "learning_rate": 0.0001313489648438217, | |
| "loss": 0.4024, | |
| "mean_token_accuracy": 0.8816352039575577, | |
| "num_tokens": 826422.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.3674250468611717, | |
| "epoch": 0.4463799673380512, | |
| "grad_norm": 0.2753954231739044, | |
| "learning_rate": 0.00013073258079515632, | |
| "loss": 0.3508, | |
| "mean_token_accuracy": 0.8967752158641815, | |
| "num_tokens": 830085.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.35362084209918976, | |
| "epoch": 0.44855743059335873, | |
| "grad_norm": 0.2868417203426361, | |
| "learning_rate": 0.00013011516806340557, | |
| "loss": 0.3743, | |
| "mean_token_accuracy": 0.8918885141611099, | |
| "num_tokens": 834548.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.39741218090057373, | |
| "epoch": 0.45073489384866633, | |
| "grad_norm": 0.2914039194583893, | |
| "learning_rate": 0.0001294967572825769, | |
| "loss": 0.3976, | |
| "mean_token_accuracy": 0.8822353929281235, | |
| "num_tokens": 838029.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.31900452077388763, | |
| "epoch": 0.4529123571039739, | |
| "grad_norm": 0.24336911737918854, | |
| "learning_rate": 0.0001288773791361977, | |
| "loss": 0.3179, | |
| "mean_token_accuracy": 0.9089991301298141, | |
| "num_tokens": 842500.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.3548683598637581, | |
| "epoch": 0.4550898203592814, | |
| "grad_norm": 0.24573664367198944, | |
| "learning_rate": 0.0001282570643557928, | |
| "loss": 0.3332, | |
| "mean_token_accuracy": 0.8994109332561493, | |
| "num_tokens": 846504.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.4130469933152199, | |
| "epoch": 0.457267283614589, | |
| "grad_norm": 0.22916413843631744, | |
| "learning_rate": 0.00012763584371935986, | |
| "loss": 0.3935, | |
| "mean_token_accuracy": 0.8888524770736694, | |
| "num_tokens": 850825.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.39430346339941025, | |
| "epoch": 0.45944474686989656, | |
| "grad_norm": 0.24899472296237946, | |
| "learning_rate": 0.00012701374804984205, | |
| "loss": 0.3623, | |
| "mean_token_accuracy": 0.8868012726306915, | |
| "num_tokens": 854995.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.3773266300559044, | |
| "epoch": 0.46162221012520416, | |
| "grad_norm": 0.282216340303421, | |
| "learning_rate": 0.00012639080821359898, | |
| "loss": 0.3786, | |
| "mean_token_accuracy": 0.8827318847179413, | |
| "num_tokens": 858988.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.3632218912243843, | |
| "epoch": 0.4637996733805117, | |
| "grad_norm": 0.2573084235191345, | |
| "learning_rate": 0.00012576705511887492, | |
| "loss": 0.3624, | |
| "mean_token_accuracy": 0.8912414461374283, | |
| "num_tokens": 863081.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.35169900953769684, | |
| "epoch": 0.46597713663581924, | |
| "grad_norm": 0.2548096477985382, | |
| "learning_rate": 0.00012514251971426545, | |
| "loss": 0.3325, | |
| "mean_token_accuracy": 0.9051143527030945, | |
| "num_tokens": 867052.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.36711084097623825, | |
| "epoch": 0.46815459989112684, | |
| "grad_norm": 0.2645510733127594, | |
| "learning_rate": 0.00012451723298718175, | |
| "loss": 0.3774, | |
| "mean_token_accuracy": 0.8909319043159485, | |
| "num_tokens": 871119.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.35685280710458755, | |
| "epoch": 0.4703320631464344, | |
| "grad_norm": 0.3010730445384979, | |
| "learning_rate": 0.0001238912259623133, | |
| "loss": 0.3435, | |
| "mean_token_accuracy": 0.8955214470624924, | |
| "num_tokens": 874529.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.3657463937997818, | |
| "epoch": 0.472509526401742, | |
| "grad_norm": 0.2753501534461975, | |
| "learning_rate": 0.0001232645297000883, | |
| "loss": 0.356, | |
| "mean_token_accuracy": 0.8999243825674057, | |
| "num_tokens": 878518.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.3516548126935959, | |
| "epoch": 0.47468698965704953, | |
| "grad_norm": 0.2859194576740265, | |
| "learning_rate": 0.00012263717529513267, | |
| "loss": 0.3561, | |
| "mean_token_accuracy": 0.8952623754739761, | |
| "num_tokens": 882202.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.3554818853735924, | |
| "epoch": 0.47686445291235713, | |
| "grad_norm": 0.2630636394023895, | |
| "learning_rate": 0.00012200919387472723, | |
| "loss": 0.3454, | |
| "mean_token_accuracy": 0.8877929896116257, | |
| "num_tokens": 886781.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.35459691286087036, | |
| "epoch": 0.47904191616766467, | |
| "grad_norm": 0.28057464957237244, | |
| "learning_rate": 0.0001213806165972633, | |
| "loss": 0.3597, | |
| "mean_token_accuracy": 0.8925827890634537, | |
| "num_tokens": 890846.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.3253984898328781, | |
| "epoch": 0.4812193794229722, | |
| "grad_norm": 0.2502402067184448, | |
| "learning_rate": 0.00012075147465069667, | |
| "loss": 0.3183, | |
| "mean_token_accuracy": 0.9015309363603592, | |
| "num_tokens": 895392.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.3588094562292099, | |
| "epoch": 0.4833968426782798, | |
| "grad_norm": 0.24630582332611084, | |
| "learning_rate": 0.0001201217992510002, | |
| "loss": 0.3361, | |
| "mean_token_accuracy": 0.9005966037511826, | |
| "num_tokens": 899490.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.3819248303771019, | |
| "epoch": 0.48557430593358736, | |
| "grad_norm": 0.24468845129013062, | |
| "learning_rate": 0.00011949162164061486, | |
| "loss": 0.3661, | |
| "mean_token_accuracy": 0.8975157290697098, | |
| "num_tokens": 903478.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.4134289547801018, | |
| "epoch": 0.48775176918889496, | |
| "grad_norm": 0.27261775732040405, | |
| "learning_rate": 0.0001188609730868998, | |
| "loss": 0.4087, | |
| "mean_token_accuracy": 0.8844785243272781, | |
| "num_tokens": 907286.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.3919166326522827, | |
| "epoch": 0.4899292324442025, | |
| "grad_norm": 0.2661035358905792, | |
| "learning_rate": 0.00011822988488058071, | |
| "loss": 0.3575, | |
| "mean_token_accuracy": 0.8900353014469147, | |
| "num_tokens": 911300.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.34307558089494705, | |
| "epoch": 0.49210669569951004, | |
| "grad_norm": 0.2561405301094055, | |
| "learning_rate": 0.00011759838833419754, | |
| "loss": 0.3052, | |
| "mean_token_accuracy": 0.90419901907444, | |
| "num_tokens": 915659.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.35558557510375977, | |
| "epoch": 0.49428415895481764, | |
| "grad_norm": 0.24936646223068237, | |
| "learning_rate": 0.00011696651478055067, | |
| "loss": 0.3531, | |
| "mean_token_accuracy": 0.8979819416999817, | |
| "num_tokens": 919483.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.35391464084386826, | |
| "epoch": 0.4964616222101252, | |
| "grad_norm": 0.2600042521953583, | |
| "learning_rate": 0.00011633429557114635, | |
| "loss": 0.3565, | |
| "mean_token_accuracy": 0.889078825712204, | |
| "num_tokens": 923394.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.37007713317871094, | |
| "epoch": 0.4986390854654328, | |
| "grad_norm": 0.25796735286712646, | |
| "learning_rate": 0.00011570176207464114, | |
| "loss": 0.3369, | |
| "mean_token_accuracy": 0.8971839994192123, | |
| "num_tokens": 927293.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.38342171162366867, | |
| "epoch": 0.5008165487207403, | |
| "grad_norm": 0.27563533186912537, | |
| "learning_rate": 0.00011506894567528556, | |
| "loss": 0.3546, | |
| "mean_token_accuracy": 0.8875249475240707, | |
| "num_tokens": 931453.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.3373766243457794, | |
| "epoch": 0.5029940119760479, | |
| "grad_norm": 0.24225658178329468, | |
| "learning_rate": 0.00011443587777136679, | |
| "loss": 0.3411, | |
| "mean_token_accuracy": 0.9000124335289001, | |
| "num_tokens": 936010.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.33466411381959915, | |
| "epoch": 0.5051714752313554, | |
| "grad_norm": 0.2858439087867737, | |
| "learning_rate": 0.0001138025897736509, | |
| "loss": 0.3343, | |
| "mean_token_accuracy": 0.8957197666168213, | |
| "num_tokens": 939926.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.3573242276906967, | |
| "epoch": 0.5073489384866631, | |
| "grad_norm": 0.30942314863204956, | |
| "learning_rate": 0.00011316911310382416, | |
| "loss": 0.3597, | |
| "mean_token_accuracy": 0.8864942044019699, | |
| "num_tokens": 944087.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.3710939437150955, | |
| "epoch": 0.5095264017419706, | |
| "grad_norm": 0.2737363278865814, | |
| "learning_rate": 0.00011253547919293439, | |
| "loss": 0.3577, | |
| "mean_token_accuracy": 0.8874527662992477, | |
| "num_tokens": 948518.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.33612143993377686, | |
| "epoch": 0.5117038649972782, | |
| "grad_norm": 0.24085883796215057, | |
| "learning_rate": 0.00011190171947983091, | |
| "loss": 0.3161, | |
| "mean_token_accuracy": 0.902932345867157, | |
| "num_tokens": 952833.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.353444904088974, | |
| "epoch": 0.5138813282525857, | |
| "grad_norm": 0.28172338008880615, | |
| "learning_rate": 0.00011126786540960512, | |
| "loss": 0.3562, | |
| "mean_token_accuracy": 0.8990496397018433, | |
| "num_tokens": 956824.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.33875197917222977, | |
| "epoch": 0.5160587915078934, | |
| "grad_norm": 0.2717280387878418, | |
| "learning_rate": 0.00011063394843203004, | |
| "loss": 0.3117, | |
| "mean_token_accuracy": 0.9031887650489807, | |
| "num_tokens": 960613.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.3543147072196007, | |
| "epoch": 0.5182362547632009, | |
| "grad_norm": 0.2418098896741867, | |
| "learning_rate": 0.00011000000000000002, | |
| "loss": 0.3577, | |
| "mean_token_accuracy": 0.8868001103401184, | |
| "num_tokens": 965072.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.3672889471054077, | |
| "epoch": 0.5204137180185084, | |
| "grad_norm": 0.27860227227211, | |
| "learning_rate": 0.00010936605156797, | |
| "loss": 0.3616, | |
| "mean_token_accuracy": 0.8912352472543716, | |
| "num_tokens": 969185.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.3546944558620453, | |
| "epoch": 0.522591181273816, | |
| "grad_norm": 0.27250248193740845, | |
| "learning_rate": 0.0001087321345903949, | |
| "loss": 0.34, | |
| "mean_token_accuracy": 0.8949205875396729, | |
| "num_tokens": 972955.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.4006873667240143, | |
| "epoch": 0.5247686445291235, | |
| "grad_norm": 0.28049609065055847, | |
| "learning_rate": 0.00010809828052016913, | |
| "loss": 0.3895, | |
| "mean_token_accuracy": 0.878919780254364, | |
| "num_tokens": 976759.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.34407609701156616, | |
| "epoch": 0.5269461077844312, | |
| "grad_norm": 0.22804318368434906, | |
| "learning_rate": 0.00010746452080706563, | |
| "loss": 0.3046, | |
| "mean_token_accuracy": 0.9041478931903839, | |
| "num_tokens": 981169.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.34020114690065384, | |
| "epoch": 0.5291235710397387, | |
| "grad_norm": 0.25987792015075684, | |
| "learning_rate": 0.00010683088689617582, | |
| "loss": 0.3175, | |
| "mean_token_accuracy": 0.9022326022386551, | |
| "num_tokens": 984838.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.35350754112005234, | |
| "epoch": 0.5313010342950463, | |
| "grad_norm": 0.2573815584182739, | |
| "learning_rate": 0.00010619741022634912, | |
| "loss": 0.3525, | |
| "mean_token_accuracy": 0.8904687911272049, | |
| "num_tokens": 988767.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.319248978048563, | |
| "epoch": 0.5334784975503538, | |
| "grad_norm": 0.21112677454948425, | |
| "learning_rate": 0.00010556412222863321, | |
| "loss": 0.3022, | |
| "mean_token_accuracy": 0.9129808992147446, | |
| "num_tokens": 993209.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.3874542936682701, | |
| "epoch": 0.5356559608056614, | |
| "grad_norm": 0.2539237439632416, | |
| "learning_rate": 0.00010493105432471443, | |
| "loss": 0.3908, | |
| "mean_token_accuracy": 0.8874447643756866, | |
| "num_tokens": 997348.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.3753085806965828, | |
| "epoch": 0.537833424060969, | |
| "grad_norm": 0.242266446352005, | |
| "learning_rate": 0.00010429823792535891, | |
| "loss": 0.3721, | |
| "mean_token_accuracy": 0.8896859586238861, | |
| "num_tokens": 1001182.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.326670840382576, | |
| "epoch": 0.5400108873162766, | |
| "grad_norm": 0.24620375037193298, | |
| "learning_rate": 0.00010366570442885373, | |
| "loss": 0.3195, | |
| "mean_token_accuracy": 0.9036577641963959, | |
| "num_tokens": 1005310.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.36552029848098755, | |
| "epoch": 0.5421883505715841, | |
| "grad_norm": 0.24721576273441315, | |
| "learning_rate": 0.00010303348521944938, | |
| "loss": 0.3665, | |
| "mean_token_accuracy": 0.892762616276741, | |
| "num_tokens": 1009657.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.34408629685640335, | |
| "epoch": 0.5443658138268916, | |
| "grad_norm": 0.23724570870399475, | |
| "learning_rate": 0.0001024016116658025, | |
| "loss": 0.3347, | |
| "mean_token_accuracy": 0.9008950591087341, | |
| "num_tokens": 1014240.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.33717598021030426, | |
| "epoch": 0.5465432770821992, | |
| "grad_norm": 0.25629547238349915, | |
| "learning_rate": 0.0001017701151194193, | |
| "loss": 0.3434, | |
| "mean_token_accuracy": 0.9011830985546112, | |
| "num_tokens": 1018254.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.36749306321144104, | |
| "epoch": 0.5487207403375068, | |
| "grad_norm": 0.2619577944278717, | |
| "learning_rate": 0.00010113902691310024, | |
| "loss": 0.3551, | |
| "mean_token_accuracy": 0.8974686414003372, | |
| "num_tokens": 1022155.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.4006720781326294, | |
| "epoch": 0.5508982035928144, | |
| "grad_norm": 0.2916308343410492, | |
| "learning_rate": 0.00010050837835938516, | |
| "loss": 0.3901, | |
| "mean_token_accuracy": 0.884143054485321, | |
| "num_tokens": 1026011.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.3434867560863495, | |
| "epoch": 0.5530756668481219, | |
| "grad_norm": 0.24261599779129028, | |
| "learning_rate": 9.98782007489998e-05, | |
| "loss": 0.3447, | |
| "mean_token_accuracy": 0.8931203186511993, | |
| "num_tokens": 1029811.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.33298294991254807, | |
| "epoch": 0.5552531301034295, | |
| "grad_norm": 0.24710261821746826, | |
| "learning_rate": 9.924852534930333e-05, | |
| "loss": 0.3163, | |
| "mean_token_accuracy": 0.8988287448883057, | |
| "num_tokens": 1033838.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.36351051926612854, | |
| "epoch": 0.5574305933587371, | |
| "grad_norm": 0.22865501046180725, | |
| "learning_rate": 9.861938340273671e-05, | |
| "loss": 0.3537, | |
| "mean_token_accuracy": 0.8958317637443542, | |
| "num_tokens": 1038890.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.34496162831783295, | |
| "epoch": 0.5596080566140447, | |
| "grad_norm": 0.27052974700927734, | |
| "learning_rate": 9.79908061252728e-05, | |
| "loss": 0.3422, | |
| "mean_token_accuracy": 0.8985669314861298, | |
| "num_tokens": 1042344.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.3629255071282387, | |
| "epoch": 0.5617855198693522, | |
| "grad_norm": 0.27112752199172974, | |
| "learning_rate": 9.736282470486739e-05, | |
| "loss": 0.36, | |
| "mean_token_accuracy": 0.8962416350841522, | |
| "num_tokens": 1046638.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.3592648208141327, | |
| "epoch": 0.5639629831246598, | |
| "grad_norm": 0.23911136388778687, | |
| "learning_rate": 9.673547029991173e-05, | |
| "loss": 0.3398, | |
| "mean_token_accuracy": 0.8957805782556534, | |
| "num_tokens": 1050963.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.41059066355228424, | |
| "epoch": 0.5661404463799673, | |
| "grad_norm": 0.2601061463356018, | |
| "learning_rate": 9.61087740376867e-05, | |
| "loss": 0.413, | |
| "mean_token_accuracy": 0.875213697552681, | |
| "num_tokens": 1055077.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.33156271278858185, | |
| "epoch": 0.568317909635275, | |
| "grad_norm": 0.2332238405942917, | |
| "learning_rate": 9.548276701281821e-05, | |
| "loss": 0.3202, | |
| "mean_token_accuracy": 0.9033721536397934, | |
| "num_tokens": 1059270.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.38206638395786285, | |
| "epoch": 0.5704953728905825, | |
| "grad_norm": 0.2890869677066803, | |
| "learning_rate": 9.485748028573455e-05, | |
| "loss": 0.3721, | |
| "mean_token_accuracy": 0.8858179748058319, | |
| "num_tokens": 1063429.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.3339729979634285, | |
| "epoch": 0.57267283614589, | |
| "grad_norm": 0.23651231825351715, | |
| "learning_rate": 9.423294488112509e-05, | |
| "loss": 0.3376, | |
| "mean_token_accuracy": 0.9060862809419632, | |
| "num_tokens": 1067575.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.36243191361427307, | |
| "epoch": 0.5748502994011976, | |
| "grad_norm": 0.2469407469034195, | |
| "learning_rate": 9.360919178640104e-05, | |
| "loss": 0.3313, | |
| "mean_token_accuracy": 0.9048342257738113, | |
| "num_tokens": 1071393.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.3420562148094177, | |
| "epoch": 0.5770277626565051, | |
| "grad_norm": 0.24036115407943726, | |
| "learning_rate": 9.298625195015796e-05, | |
| "loss": 0.3464, | |
| "mean_token_accuracy": 0.900355190038681, | |
| "num_tokens": 1076079.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.39919717609882355, | |
| "epoch": 0.5792052259118128, | |
| "grad_norm": 0.2509303390979767, | |
| "learning_rate": 9.236415628064017e-05, | |
| "loss": 0.3731, | |
| "mean_token_accuracy": 0.8862645626068115, | |
| "num_tokens": 1079989.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.3894932344555855, | |
| "epoch": 0.5813826891671203, | |
| "grad_norm": 0.25672271847724915, | |
| "learning_rate": 9.174293564420724e-05, | |
| "loss": 0.3749, | |
| "mean_token_accuracy": 0.8905623853206635, | |
| "num_tokens": 1083957.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.37751832604408264, | |
| "epoch": 0.5835601524224279, | |
| "grad_norm": 0.2643100321292877, | |
| "learning_rate": 9.112262086380234e-05, | |
| "loss": 0.371, | |
| "mean_token_accuracy": 0.8892365545034409, | |
| "num_tokens": 1087639.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.35557425767183304, | |
| "epoch": 0.5857376156777354, | |
| "grad_norm": 0.2569376230239868, | |
| "learning_rate": 9.050324271742312e-05, | |
| "loss": 0.3369, | |
| "mean_token_accuracy": 0.8985206633806229, | |
| "num_tokens": 1091448.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.3663223683834076, | |
| "epoch": 0.587915078933043, | |
| "grad_norm": 0.28307580947875977, | |
| "learning_rate": 8.988483193659447e-05, | |
| "loss": 0.3379, | |
| "mean_token_accuracy": 0.8939681947231293, | |
| "num_tokens": 1095282.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.35191214829683304, | |
| "epoch": 0.5900925421883506, | |
| "grad_norm": 0.241379514336586, | |
| "learning_rate": 8.926741920484374e-05, | |
| "loss": 0.3447, | |
| "mean_token_accuracy": 0.8967802226543427, | |
| "num_tokens": 1099519.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.33553165942430496, | |
| "epoch": 0.5922700054436582, | |
| "grad_norm": 0.26522010564804077, | |
| "learning_rate": 8.865103515617834e-05, | |
| "loss": 0.3126, | |
| "mean_token_accuracy": 0.9028987288475037, | |
| "num_tokens": 1103293.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.321424663066864, | |
| "epoch": 0.5944474686989657, | |
| "grad_norm": 0.23075014352798462, | |
| "learning_rate": 8.803571037356575e-05, | |
| "loss": 0.3204, | |
| "mean_token_accuracy": 0.9045960456132889, | |
| "num_tokens": 1107725.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.3491132855415344, | |
| "epoch": 0.5966249319542732, | |
| "grad_norm": 0.26291459798812866, | |
| "learning_rate": 8.742147538741623e-05, | |
| "loss": 0.3178, | |
| "mean_token_accuracy": 0.9050692319869995, | |
| "num_tokens": 1111448.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.3245581164956093, | |
| "epoch": 0.5988023952095808, | |
| "grad_norm": 0.2527916729450226, | |
| "learning_rate": 8.680836067406775e-05, | |
| "loss": 0.3164, | |
| "mean_token_accuracy": 0.9089783430099487, | |
| "num_tokens": 1115353.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.3509984761476517, | |
| "epoch": 0.6009798584648884, | |
| "grad_norm": 0.2409028708934784, | |
| "learning_rate": 8.619639665427411e-05, | |
| "loss": 0.3205, | |
| "mean_token_accuracy": 0.901856929063797, | |
| "num_tokens": 1119037.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.41339434683322906, | |
| "epoch": 0.603157321720196, | |
| "grad_norm": 0.2666266858577728, | |
| "learning_rate": 8.558561369169535e-05, | |
| "loss": 0.4118, | |
| "mean_token_accuracy": 0.8851277679204941, | |
| "num_tokens": 1122815.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.355926550924778, | |
| "epoch": 0.6053347849755035, | |
| "grad_norm": 0.2666811943054199, | |
| "learning_rate": 8.497604209139139e-05, | |
| "loss": 0.3598, | |
| "mean_token_accuracy": 0.8959801942110062, | |
| "num_tokens": 1126942.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.33385297656059265, | |
| "epoch": 0.6075122482308111, | |
| "grad_norm": 0.26262858510017395, | |
| "learning_rate": 8.436771209831825e-05, | |
| "loss": 0.356, | |
| "mean_token_accuracy": 0.8975410759449005, | |
| "num_tokens": 1130948.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.3468668982386589, | |
| "epoch": 0.6096897114861187, | |
| "grad_norm": 0.2627294659614563, | |
| "learning_rate": 8.376065389582739e-05, | |
| "loss": 0.3453, | |
| "mean_token_accuracy": 0.8972453325986862, | |
| "num_tokens": 1135319.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.34889067709445953, | |
| "epoch": 0.6118671747414263, | |
| "grad_norm": 0.2477421760559082, | |
| "learning_rate": 8.315489760416839e-05, | |
| "loss": 0.3221, | |
| "mean_token_accuracy": 0.9074793308973312, | |
| "num_tokens": 1138864.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.3540688380599022, | |
| "epoch": 0.6140446379967338, | |
| "grad_norm": 0.2644377052783966, | |
| "learning_rate": 8.255047327899392e-05, | |
| "loss": 0.3697, | |
| "mean_token_accuracy": 0.8973688334226608, | |
| "num_tokens": 1142749.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.3112604096531868, | |
| "epoch": 0.6162221012520414, | |
| "grad_norm": 0.24608677625656128, | |
| "learning_rate": 8.19474109098691e-05, | |
| "loss": 0.3115, | |
| "mean_token_accuracy": 0.9069450497627258, | |
| "num_tokens": 1146891.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.3105768784880638, | |
| "epoch": 0.6183995645073489, | |
| "grad_norm": 0.2628800868988037, | |
| "learning_rate": 8.134574041878306e-05, | |
| "loss": 0.3144, | |
| "mean_token_accuracy": 0.9045025259256363, | |
| "num_tokens": 1151024.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.3061619848012924, | |
| "epoch": 0.6205770277626566, | |
| "grad_norm": 0.2500765919685364, | |
| "learning_rate": 8.074549165866463e-05, | |
| "loss": 0.2996, | |
| "mean_token_accuracy": 0.9090612530708313, | |
| "num_tokens": 1155564.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.34363674372434616, | |
| "epoch": 0.6227544910179641, | |
| "grad_norm": 0.2619493305683136, | |
| "learning_rate": 8.014669441190081e-05, | |
| "loss": 0.3196, | |
| "mean_token_accuracy": 0.8998923152685165, | |
| "num_tokens": 1159454.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.3449995443224907, | |
| "epoch": 0.6249319542732716, | |
| "grad_norm": 0.2670820355415344, | |
| "learning_rate": 7.954937838885937e-05, | |
| "loss": 0.3517, | |
| "mean_token_accuracy": 0.8967305719852448, | |
| "num_tokens": 1163267.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.3603576719760895, | |
| "epoch": 0.6271094175285792, | |
| "grad_norm": 0.24100132286548615, | |
| "learning_rate": 7.895357322641452e-05, | |
| "loss": 0.3508, | |
| "mean_token_accuracy": 0.8935562521219254, | |
| "num_tokens": 1167581.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.3160111829638481, | |
| "epoch": 0.6292868807838867, | |
| "grad_norm": 0.2645825445652008, | |
| "learning_rate": 7.835930848647653e-05, | |
| "loss": 0.3045, | |
| "mean_token_accuracy": 0.9113835692405701, | |
| "num_tokens": 1171514.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.33360420912504196, | |
| "epoch": 0.6314643440391944, | |
| "grad_norm": 0.22924089431762695, | |
| "learning_rate": 7.776661365452491e-05, | |
| "loss": 0.3087, | |
| "mean_token_accuracy": 0.9061863869428635, | |
| "num_tokens": 1175361.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.3485657498240471, | |
| "epoch": 0.6336418072945019, | |
| "grad_norm": 0.24018257856369019, | |
| "learning_rate": 7.717551813814543e-05, | |
| "loss": 0.3087, | |
| "mean_token_accuracy": 0.903602659702301, | |
| "num_tokens": 1179132.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.342680849134922, | |
| "epoch": 0.6358192705498095, | |
| "grad_norm": 0.22566929459571838, | |
| "learning_rate": 7.658605126557105e-05, | |
| "loss": 0.3183, | |
| "mean_token_accuracy": 0.9066330194473267, | |
| "num_tokens": 1183571.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.3731561452150345, | |
| "epoch": 0.637996733805117, | |
| "grad_norm": 0.2820538580417633, | |
| "learning_rate": 7.599824228422677e-05, | |
| "loss": 0.371, | |
| "mean_token_accuracy": 0.8894180357456207, | |
| "num_tokens": 1187179.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.32869182527065277, | |
| "epoch": 0.6401741970604246, | |
| "grad_norm": 0.2502634823322296, | |
| "learning_rate": 7.541212035927839e-05, | |
| "loss": 0.2968, | |
| "mean_token_accuracy": 0.9134543687105179, | |
| "num_tokens": 1191246.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.37562160193920135, | |
| "epoch": 0.6423516603157322, | |
| "grad_norm": 0.2863782048225403, | |
| "learning_rate": 7.482771457218542e-05, | |
| "loss": 0.3717, | |
| "mean_token_accuracy": 0.8882504254579544, | |
| "num_tokens": 1195149.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.33977876603603363, | |
| "epoch": 0.6445291235710398, | |
| "grad_norm": 0.24794067442417145, | |
| "learning_rate": 7.424505391925833e-05, | |
| "loss": 0.3122, | |
| "mean_token_accuracy": 0.9125866144895554, | |
| "num_tokens": 1198886.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.3772798329591751, | |
| "epoch": 0.6467065868263473, | |
| "grad_norm": 0.23983165621757507, | |
| "learning_rate": 7.366416731021964e-05, | |
| "loss": 0.362, | |
| "mean_token_accuracy": 0.8952146172523499, | |
| "num_tokens": 1202933.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.3076706826686859, | |
| "epoch": 0.6488840500816548, | |
| "grad_norm": 0.2429223656654358, | |
| "learning_rate": 7.30850835667696e-05, | |
| "loss": 0.3008, | |
| "mean_token_accuracy": 0.909699097275734, | |
| "num_tokens": 1206978.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.3360467702150345, | |
| "epoch": 0.6510615133369625, | |
| "grad_norm": 0.25572511553764343, | |
| "learning_rate": 7.250783142115615e-05, | |
| "loss": 0.341, | |
| "mean_token_accuracy": 0.9028728753328323, | |
| "num_tokens": 1210951.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.305056668817997, | |
| "epoch": 0.65323897659227, | |
| "grad_norm": 0.24135318398475647, | |
| "learning_rate": 7.193243951474933e-05, | |
| "loss": 0.3122, | |
| "mean_token_accuracy": 0.908637598156929, | |
| "num_tokens": 1215517.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.337252639234066, | |
| "epoch": 0.6554164398475776, | |
| "grad_norm": 0.27407306432724, | |
| "learning_rate": 7.135893639662012e-05, | |
| "loss": 0.3226, | |
| "mean_token_accuracy": 0.9033920913934708, | |
| "num_tokens": 1219456.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.3444167599081993, | |
| "epoch": 0.6575939031028851, | |
| "grad_norm": 0.2554808557033539, | |
| "learning_rate": 7.078735052212402e-05, | |
| "loss": 0.3405, | |
| "mean_token_accuracy": 0.8994651138782501, | |
| "num_tokens": 1223440.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.3203364834189415, | |
| "epoch": 0.6597713663581927, | |
| "grad_norm": 0.2498241364955902, | |
| "learning_rate": 7.021771025148922e-05, | |
| "loss": 0.2994, | |
| "mean_token_accuracy": 0.9104214161634445, | |
| "num_tokens": 1227205.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.3659024015069008, | |
| "epoch": 0.6619488296135003, | |
| "grad_norm": 0.24576182663440704, | |
| "learning_rate": 6.965004384840928e-05, | |
| "loss": 0.3434, | |
| "mean_token_accuracy": 0.8974325805902481, | |
| "num_tokens": 1231062.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.35433361679315567, | |
| "epoch": 0.6641262928688079, | |
| "grad_norm": 0.2348756641149521, | |
| "learning_rate": 6.90843794786409e-05, | |
| "loss": 0.3326, | |
| "mean_token_accuracy": 0.8999007195234299, | |
| "num_tokens": 1235210.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.3522880747914314, | |
| "epoch": 0.6663037561241154, | |
| "grad_norm": 0.24180057644844055, | |
| "learning_rate": 6.852074520860648e-05, | |
| "loss": 0.3286, | |
| "mean_token_accuracy": 0.9014742374420166, | |
| "num_tokens": 1238954.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.35235612094402313, | |
| "epoch": 0.668481219379423, | |
| "grad_norm": 0.24760101735591888, | |
| "learning_rate": 6.795916900400138e-05, | |
| "loss": 0.3262, | |
| "mean_token_accuracy": 0.9001569449901581, | |
| "num_tokens": 1242691.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.35372819751501083, | |
| "epoch": 0.6706586826347305, | |
| "grad_norm": 0.2558618485927582, | |
| "learning_rate": 6.739967872840662e-05, | |
| "loss": 0.3389, | |
| "mean_token_accuracy": 0.9027666747570038, | |
| "num_tokens": 1246355.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.32674338668584824, | |
| "epoch": 0.6728361458900382, | |
| "grad_norm": 0.2397354543209076, | |
| "learning_rate": 6.684230214190608e-05, | |
| "loss": 0.3026, | |
| "mean_token_accuracy": 0.9039190113544464, | |
| "num_tokens": 1251017.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.3184630870819092, | |
| "epoch": 0.6750136091453457, | |
| "grad_norm": 0.2725917100906372, | |
| "learning_rate": 6.628706689970932e-05, | |
| "loss": 0.3305, | |
| "mean_token_accuracy": 0.8989760279655457, | |
| "num_tokens": 1255024.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.35561081022024155, | |
| "epoch": 0.6771910724006532, | |
| "grad_norm": 0.24204087257385254, | |
| "learning_rate": 6.573400055077938e-05, | |
| "loss": 0.3393, | |
| "mean_token_accuracy": 0.8942540436983109, | |
| "num_tokens": 1259033.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.3308749422430992, | |
| "epoch": 0.6793685356559608, | |
| "grad_norm": 0.23772156238555908, | |
| "learning_rate": 6.518313053646586e-05, | |
| "loss": 0.3264, | |
| "mean_token_accuracy": 0.9023979008197784, | |
| "num_tokens": 1263455.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.3347730040550232, | |
| "epoch": 0.6815459989112683, | |
| "grad_norm": 0.2505793571472168, | |
| "learning_rate": 6.463448418914348e-05, | |
| "loss": 0.3392, | |
| "mean_token_accuracy": 0.9027709066867828, | |
| "num_tokens": 1267335.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.3568695932626724, | |
| "epoch": 0.683723462166576, | |
| "grad_norm": 0.24569235742092133, | |
| "learning_rate": 6.408808873085577e-05, | |
| "loss": 0.3399, | |
| "mean_token_accuracy": 0.8940989226102829, | |
| "num_tokens": 1271810.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.33151426911354065, | |
| "epoch": 0.6859009254218835, | |
| "grad_norm": 0.28417110443115234, | |
| "learning_rate": 6.354397127196448e-05, | |
| "loss": 0.3196, | |
| "mean_token_accuracy": 0.9016236513853073, | |
| "num_tokens": 1275575.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.31233637779951096, | |
| "epoch": 0.6880783886771911, | |
| "grad_norm": 0.23522846400737762, | |
| "learning_rate": 6.300215880980446e-05, | |
| "loss": 0.2954, | |
| "mean_token_accuracy": 0.9116706401109695, | |
| "num_tokens": 1280034.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.35172613710165024, | |
| "epoch": 0.6902558519324986, | |
| "grad_norm": 0.25289177894592285, | |
| "learning_rate": 6.246267822734421e-05, | |
| "loss": 0.3253, | |
| "mean_token_accuracy": 0.8971187770366669, | |
| "num_tokens": 1283664.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.3484005257487297, | |
| "epoch": 0.6924333151878062, | |
| "grad_norm": 0.2565121054649353, | |
| "learning_rate": 6.192555629185189e-05, | |
| "loss": 0.3408, | |
| "mean_token_accuracy": 0.8945488780736923, | |
| "num_tokens": 1287685.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.3195461556315422, | |
| "epoch": 0.6946107784431138, | |
| "grad_norm": 0.24285030364990234, | |
| "learning_rate": 6.139081965356725e-05, | |
| "loss": 0.3188, | |
| "mean_token_accuracy": 0.9035038352012634, | |
| "num_tokens": 1291337.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.33748240023851395, | |
| "epoch": 0.6967882416984214, | |
| "grad_norm": 0.24630972743034363, | |
| "learning_rate": 6.085849484437944e-05, | |
| "loss": 0.3411, | |
| "mean_token_accuracy": 0.9040576815605164, | |
| "num_tokens": 1295196.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.3222072795033455, | |
| "epoch": 0.6989657049537289, | |
| "grad_norm": 0.23582881689071655, | |
| "learning_rate": 6.0328608276510476e-05, | |
| "loss": 0.3193, | |
| "mean_token_accuracy": 0.900396928191185, | |
| "num_tokens": 1299276.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.34793104976415634, | |
| "epoch": 0.7011431682090364, | |
| "grad_norm": 0.28013235330581665, | |
| "learning_rate": 5.980118624120483e-05, | |
| "loss": 0.3234, | |
| "mean_token_accuracy": 0.8983870148658752, | |
| "num_tokens": 1302970.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.3033921644091606, | |
| "epoch": 0.7033206314643441, | |
| "grad_norm": 0.23157738149166107, | |
| "learning_rate": 5.9276254907424846e-05, | |
| "loss": 0.2927, | |
| "mean_token_accuracy": 0.9108779579401016, | |
| "num_tokens": 1307008.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.36583440005779266, | |
| "epoch": 0.7054980947196516, | |
| "grad_norm": 0.2319372296333313, | |
| "learning_rate": 5.875384032055239e-05, | |
| "loss": 0.371, | |
| "mean_token_accuracy": 0.897381991147995, | |
| "num_tokens": 1311263.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.3215944245457649, | |
| "epoch": 0.7076755579749592, | |
| "grad_norm": 0.23082365095615387, | |
| "learning_rate": 5.823396840109657e-05, | |
| "loss": 0.3094, | |
| "mean_token_accuracy": 0.903637707233429, | |
| "num_tokens": 1315823.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.34219200164079666, | |
| "epoch": 0.7098530212302667, | |
| "grad_norm": 0.23884856700897217, | |
| "learning_rate": 5.771666494340756e-05, | |
| "loss": 0.3289, | |
| "mean_token_accuracy": 0.9032928794622421, | |
| "num_tokens": 1319955.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.2886577844619751, | |
| "epoch": 0.7120304844855743, | |
| "grad_norm": 0.22707660496234894, | |
| "learning_rate": 5.7201955614396964e-05, | |
| "loss": 0.2839, | |
| "mean_token_accuracy": 0.9143697619438171, | |
| "num_tokens": 1324096.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.3365718871355057, | |
| "epoch": 0.7142079477408819, | |
| "grad_norm": 0.21789753437042236, | |
| "learning_rate": 5.668986595226404e-05, | |
| "loss": 0.3316, | |
| "mean_token_accuracy": 0.9025033861398697, | |
| "num_tokens": 1328868.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.313778854906559, | |
| "epoch": 0.7163854109961895, | |
| "grad_norm": 0.24393050372600555, | |
| "learning_rate": 5.618042136522881e-05, | |
| "loss": 0.3212, | |
| "mean_token_accuracy": 0.9037179052829742, | |
| "num_tokens": 1333087.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.3029978275299072, | |
| "epoch": 0.718562874251497, | |
| "grad_norm": 0.24070705473423004, | |
| "learning_rate": 5.567364713027121e-05, | |
| "loss": 0.306, | |
| "mean_token_accuracy": 0.9108355790376663, | |
| "num_tokens": 1337351.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.36839231103658676, | |
| "epoch": 0.7207403375068046, | |
| "grad_norm": 0.25364482402801514, | |
| "learning_rate": 5.5169568391877035e-05, | |
| "loss": 0.3493, | |
| "mean_token_accuracy": 0.89275161921978, | |
| "num_tokens": 1341499.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.37619777768850327, | |
| "epoch": 0.7229178007621121, | |
| "grad_norm": 0.24351854622364044, | |
| "learning_rate": 5.46682101607904e-05, | |
| "loss": 0.3816, | |
| "mean_token_accuracy": 0.8932196348905563, | |
| "num_tokens": 1345295.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.30273835361003876, | |
| "epoch": 0.7250952640174197, | |
| "grad_norm": 0.2297053039073944, | |
| "learning_rate": 5.416959731277264e-05, | |
| "loss": 0.2852, | |
| "mean_token_accuracy": 0.9142936319112778, | |
| "num_tokens": 1349605.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.3630438446998596, | |
| "epoch": 0.7272727272727273, | |
| "grad_norm": 0.2559914290904999, | |
| "learning_rate": 5.3673754587368094e-05, | |
| "loss": 0.3791, | |
| "mean_token_accuracy": 0.8942387253046036, | |
| "num_tokens": 1353706.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.32199136167764664, | |
| "epoch": 0.7294501905280348, | |
| "grad_norm": 0.25669071078300476, | |
| "learning_rate": 5.318070658667671e-05, | |
| "loss": 0.3123, | |
| "mean_token_accuracy": 0.9080253690481186, | |
| "num_tokens": 1357558.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.35776887834072113, | |
| "epoch": 0.7316276537833424, | |
| "grad_norm": 0.2596750855445862, | |
| "learning_rate": 5.269047777413333e-05, | |
| "loss": 0.3436, | |
| "mean_token_accuracy": 0.8997514098882675, | |
| "num_tokens": 1361340.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.34765905141830444, | |
| "epoch": 0.7338051170386499, | |
| "grad_norm": 0.21836940944194794, | |
| "learning_rate": 5.22030924732938e-05, | |
| "loss": 0.3277, | |
| "mean_token_accuracy": 0.9053044319152832, | |
| "num_tokens": 1365153.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.34295450896024704, | |
| "epoch": 0.7359825802939576, | |
| "grad_norm": 0.2738622725009918, | |
| "learning_rate": 5.171857486662823e-05, | |
| "loss": 0.3336, | |
| "mean_token_accuracy": 0.8998141139745712, | |
| "num_tokens": 1368896.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.32134611159563065, | |
| "epoch": 0.7381600435492651, | |
| "grad_norm": 0.22107118368148804, | |
| "learning_rate": 5.1236948994321055e-05, | |
| "loss": 0.2999, | |
| "mean_token_accuracy": 0.908054381608963, | |
| "num_tokens": 1373609.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.3105727434158325, | |
| "epoch": 0.7403375068045727, | |
| "grad_norm": 0.23407259583473206, | |
| "learning_rate": 5.075823875307828e-05, | |
| "loss": 0.2947, | |
| "mean_token_accuracy": 0.9088436663150787, | |
| "num_tokens": 1377893.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.3235616162419319, | |
| "epoch": 0.7425149700598802, | |
| "grad_norm": 0.2505863606929779, | |
| "learning_rate": 5.0282467894941864e-05, | |
| "loss": 0.3338, | |
| "mean_token_accuracy": 0.9098049253225327, | |
| "num_tokens": 1381665.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.30407993495464325, | |
| "epoch": 0.7446924333151878, | |
| "grad_norm": 0.23674152791500092, | |
| "learning_rate": 4.980966002611108e-05, | |
| "loss": 0.2939, | |
| "mean_token_accuracy": 0.9113668948411942, | |
| "num_tokens": 1386000.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.29837000370025635, | |
| "epoch": 0.7468698965704954, | |
| "grad_norm": 0.24069277942180634, | |
| "learning_rate": 4.933983860577136e-05, | |
| "loss": 0.2801, | |
| "mean_token_accuracy": 0.9147733300924301, | |
| "num_tokens": 1389768.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5488722026348114, | |
| "epoch": 0.749047359825803, | |
| "grad_norm": 0.23018239438533783, | |
| "learning_rate": 4.887302694493029e-05, | |
| "loss": 0.6326, | |
| "mean_token_accuracy": 0.8503530323505402, | |
| "num_tokens": 1394588.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.3708427771925926, | |
| "epoch": 0.7512248230811105, | |
| "grad_norm": 0.28215181827545166, | |
| "learning_rate": 4.840924820526096e-05, | |
| "loss": 0.3952, | |
| "mean_token_accuracy": 0.8861146718263626, | |
| "num_tokens": 1398304.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.34193163365125656, | |
| "epoch": 0.753402286336418, | |
| "grad_norm": 0.2342662215232849, | |
| "learning_rate": 4.794852539795291e-05, | |
| "loss": 0.3495, | |
| "mean_token_accuracy": 0.903597891330719, | |
| "num_tokens": 1402505.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.3103507123887539, | |
| "epoch": 0.7555797495917257, | |
| "grad_norm": 0.23902368545532227, | |
| "learning_rate": 4.749088138257017e-05, | |
| "loss": 0.3078, | |
| "mean_token_accuracy": 0.9087391942739487, | |
| "num_tokens": 1406703.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.3236440420150757, | |
| "epoch": 0.7577572128470332, | |
| "grad_norm": 0.22265306115150452, | |
| "learning_rate": 4.703633886591719e-05, | |
| "loss": 0.3387, | |
| "mean_token_accuracy": 0.9036975800991058, | |
| "num_tokens": 1410765.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.31991977244615555, | |
| "epoch": 0.7599346761023408, | |
| "grad_norm": 0.2397955358028412, | |
| "learning_rate": 4.6584920400912156e-05, | |
| "loss": 0.3056, | |
| "mean_token_accuracy": 0.9113240092992783, | |
| "num_tokens": 1414804.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.28877923637628555, | |
| "epoch": 0.7621121393576483, | |
| "grad_norm": 0.2253178060054779, | |
| "learning_rate": 4.6136648385467977e-05, | |
| "loss": 0.2649, | |
| "mean_token_accuracy": 0.9233576655387878, | |
| "num_tokens": 1419025.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.35838521271944046, | |
| "epoch": 0.7642896026129559, | |
| "grad_norm": 0.2513080835342407, | |
| "learning_rate": 4.5691545061381026e-05, | |
| "loss": 0.3413, | |
| "mean_token_accuracy": 0.8982634395360947, | |
| "num_tokens": 1423031.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.371716171503067, | |
| "epoch": 0.7664670658682635, | |
| "grad_norm": 0.23435106873512268, | |
| "learning_rate": 4.5249632513227504e-05, | |
| "loss": 0.3457, | |
| "mean_token_accuracy": 0.9014202654361725, | |
| "num_tokens": 1427232.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.32540784031152725, | |
| "epoch": 0.7686445291235711, | |
| "grad_norm": 0.2637276351451874, | |
| "learning_rate": 4.481093266726772e-05, | |
| "loss": 0.2913, | |
| "mean_token_accuracy": 0.9063924849033356, | |
| "num_tokens": 1431135.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.35406405478715897, | |
| "epoch": 0.7708219923788786, | |
| "grad_norm": 0.24304324388504028, | |
| "learning_rate": 4.43754672903582e-05, | |
| "loss": 0.3232, | |
| "mean_token_accuracy": 0.9024296700954437, | |
| "num_tokens": 1435499.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.32546380907297134, | |
| "epoch": 0.7729994556341862, | |
| "grad_norm": 0.22986435890197754, | |
| "learning_rate": 4.394325798887158e-05, | |
| "loss": 0.31, | |
| "mean_token_accuracy": 0.9013588130474091, | |
| "num_tokens": 1439833.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.38513386994600296, | |
| "epoch": 0.7751769188894937, | |
| "grad_norm": 0.27596256136894226, | |
| "learning_rate": 4.351432620762478e-05, | |
| "loss": 0.346, | |
| "mean_token_accuracy": 0.8986889123916626, | |
| "num_tokens": 1443460.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.3382200300693512, | |
| "epoch": 0.7773543821448013, | |
| "grad_norm": 0.24578897655010223, | |
| "learning_rate": 4.30886932288147e-05, | |
| "loss": 0.3229, | |
| "mean_token_accuracy": 0.9034900367259979, | |
| "num_tokens": 1447099.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.3409022316336632, | |
| "epoch": 0.7795318454001089, | |
| "grad_norm": 0.2280901074409485, | |
| "learning_rate": 4.266638017096252e-05, | |
| "loss": 0.3411, | |
| "mean_token_accuracy": 0.9012559950351715, | |
| "num_tokens": 1451312.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.32152481377124786, | |
| "epoch": 0.7817093086554164, | |
| "grad_norm": 0.24760432541370392, | |
| "learning_rate": 4.224740798786573e-05, | |
| "loss": 0.3204, | |
| "mean_token_accuracy": 0.9076259434223175, | |
| "num_tokens": 1455523.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.31170132011175156, | |
| "epoch": 0.783886771910724, | |
| "grad_norm": 0.2510303258895874, | |
| "learning_rate": 4.183179746755844e-05, | |
| "loss": 0.3126, | |
| "mean_token_accuracy": 0.9090617448091507, | |
| "num_tokens": 1459544.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.3523375913500786, | |
| "epoch": 0.7860642351660315, | |
| "grad_norm": 0.26667118072509766, | |
| "learning_rate": 4.141956923128013e-05, | |
| "loss": 0.3492, | |
| "mean_token_accuracy": 0.8998522162437439, | |
| "num_tokens": 1463315.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.3598644956946373, | |
| "epoch": 0.7882416984213392, | |
| "grad_norm": 0.2440025806427002, | |
| "learning_rate": 4.1010743732452294e-05, | |
| "loss": 0.3544, | |
| "mean_token_accuracy": 0.8947449177503586, | |
| "num_tokens": 1467647.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.395267553627491, | |
| "epoch": 0.7904191616766467, | |
| "grad_norm": 0.24411144852638245, | |
| "learning_rate": 4.0605341255663696e-05, | |
| "loss": 0.4317, | |
| "mean_token_accuracy": 0.8864284604787827, | |
| "num_tokens": 1471972.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.33659572899341583, | |
| "epoch": 0.7925966249319543, | |
| "grad_norm": 0.26458773016929626, | |
| "learning_rate": 4.02033819156639e-05, | |
| "loss": 0.3298, | |
| "mean_token_accuracy": 0.9003510624170303, | |
| "num_tokens": 1475826.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.29316914454102516, | |
| "epoch": 0.7947740881872618, | |
| "grad_norm": 0.25398463010787964, | |
| "learning_rate": 3.980488565636522e-05, | |
| "loss": 0.2772, | |
| "mean_token_accuracy": 0.9137367159128189, | |
| "num_tokens": 1480107.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.3080258443951607, | |
| "epoch": 0.7969515514425695, | |
| "grad_norm": 0.26426613330841064, | |
| "learning_rate": 3.9409872249853286e-05, | |
| "loss": 0.3046, | |
| "mean_token_accuracy": 0.9098687618970871, | |
| "num_tokens": 1484069.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.34426791220903397, | |
| "epoch": 0.799129014697877, | |
| "grad_norm": 0.2809188663959503, | |
| "learning_rate": 3.9018361295405856e-05, | |
| "loss": 0.3592, | |
| "mean_token_accuracy": 0.9000663906335831, | |
| "num_tokens": 1487840.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.33940157294273376, | |
| "epoch": 0.8013064779531845, | |
| "grad_norm": 0.2272171825170517, | |
| "learning_rate": 3.8630372218520384e-05, | |
| "loss": 0.3417, | |
| "mean_token_accuracy": 0.9024456739425659, | |
| "num_tokens": 1491938.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.33219510316848755, | |
| "epoch": 0.8034839412084921, | |
| "grad_norm": 0.2192796915769577, | |
| "learning_rate": 3.824592426995029e-05, | |
| "loss": 0.3221, | |
| "mean_token_accuracy": 0.9031501561403275, | |
| "num_tokens": 1496386.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.3439122289419174, | |
| "epoch": 0.8056614044637996, | |
| "grad_norm": 0.229109987616539, | |
| "learning_rate": 3.786503652474982e-05, | |
| "loss": 0.3427, | |
| "mean_token_accuracy": 0.9062491357326508, | |
| "num_tokens": 1500938.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.3725889101624489, | |
| "epoch": 0.8078388677191073, | |
| "grad_norm": 0.2585630714893341, | |
| "learning_rate": 3.7487727881327405e-05, | |
| "loss": 0.3704, | |
| "mean_token_accuracy": 0.8960603177547455, | |
| "num_tokens": 1504742.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.3037722408771515, | |
| "epoch": 0.8100163309744148, | |
| "grad_norm": 0.23759490251541138, | |
| "learning_rate": 3.711401706050821e-05, | |
| "loss": 0.2939, | |
| "mean_token_accuracy": 0.9124279767274857, | |
| "num_tokens": 1508512.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.3051731139421463, | |
| "epoch": 0.8121937942297224, | |
| "grad_norm": 0.22473642230033875, | |
| "learning_rate": 3.674392260460509e-05, | |
| "loss": 0.3036, | |
| "mean_token_accuracy": 0.9092454463243484, | |
| "num_tokens": 1513083.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.3145020753145218, | |
| "epoch": 0.8143712574850299, | |
| "grad_norm": 0.2272917479276657, | |
| "learning_rate": 3.6377462876498694e-05, | |
| "loss": 0.2858, | |
| "mean_token_accuracy": 0.9174733906984329, | |
| "num_tokens": 1516960.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.33495523035526276, | |
| "epoch": 0.8165487207403375, | |
| "grad_norm": 0.24096311628818512, | |
| "learning_rate": 3.601465605872636e-05, | |
| "loss": 0.3004, | |
| "mean_token_accuracy": 0.9126247465610504, | |
| "num_tokens": 1520583.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.3524938374757767, | |
| "epoch": 0.8187261839956451, | |
| "grad_norm": 0.23482073843479156, | |
| "learning_rate": 3.565552015257989e-05, | |
| "loss": 0.3596, | |
| "mean_token_accuracy": 0.894221231341362, | |
| "num_tokens": 1525126.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.3637235388159752, | |
| "epoch": 0.8209036472509527, | |
| "grad_norm": 0.2486315220594406, | |
| "learning_rate": 3.530007297721239e-05, | |
| "loss": 0.3518, | |
| "mean_token_accuracy": 0.8981701731681824, | |
| "num_tokens": 1528846.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.327960979193449, | |
| "epoch": 0.8230811105062602, | |
| "grad_norm": 0.21721476316452026, | |
| "learning_rate": 3.494833216875421e-05, | |
| "loss": 0.2854, | |
| "mean_token_accuracy": 0.915936678647995, | |
| "num_tokens": 1532720.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.3281715139746666, | |
| "epoch": 0.8252585737615677, | |
| "grad_norm": 0.27801278233528137, | |
| "learning_rate": 3.4600315179437807e-05, | |
| "loss": 0.3094, | |
| "mean_token_accuracy": 0.9122365713119507, | |
| "num_tokens": 1536770.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.319459468126297, | |
| "epoch": 0.8274360370168753, | |
| "grad_norm": 0.24818798899650574, | |
| "learning_rate": 3.425603927673195e-05, | |
| "loss": 0.2909, | |
| "mean_token_accuracy": 0.9143448621034622, | |
| "num_tokens": 1540543.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.29846663028001785, | |
| "epoch": 0.829613500272183, | |
| "grad_norm": 0.2553517520427704, | |
| "learning_rate": 3.3915521542484794e-05, | |
| "loss": 0.2984, | |
| "mean_token_accuracy": 0.9117088168859482, | |
| "num_tokens": 1544682.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.3208995833992958, | |
| "epoch": 0.8317909635274905, | |
| "grad_norm": 0.23631241917610168, | |
| "learning_rate": 3.357877887207648e-05, | |
| "loss": 0.3218, | |
| "mean_token_accuracy": 0.9085069596767426, | |
| "num_tokens": 1548933.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.3497694879770279, | |
| "epoch": 0.833968426782798, | |
| "grad_norm": 0.26314374804496765, | |
| "learning_rate": 3.3245827973580754e-05, | |
| "loss": 0.3651, | |
| "mean_token_accuracy": 0.8973031789064407, | |
| "num_tokens": 1553109.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.36065296083688736, | |
| "epoch": 0.8361458900381056, | |
| "grad_norm": 0.2554258704185486, | |
| "learning_rate": 3.2916685366936016e-05, | |
| "loss": 0.3572, | |
| "mean_token_accuracy": 0.8984216153621674, | |
| "num_tokens": 1557199.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.3203965201973915, | |
| "epoch": 0.8383233532934131, | |
| "grad_norm": 0.2560184597969055, | |
| "learning_rate": 3.259136738312565e-05, | |
| "loss": 0.3107, | |
| "mean_token_accuracy": 0.9113545119762421, | |
| "num_tokens": 1560942.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.3545750603079796, | |
| "epoch": 0.8405008165487208, | |
| "grad_norm": 0.23520711064338684, | |
| "learning_rate": 3.226989016336767e-05, | |
| "loss": 0.3295, | |
| "mean_token_accuracy": 0.8977851718664169, | |
| "num_tokens": 1565528.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.27805931866168976, | |
| "epoch": 0.8426782798040283, | |
| "grad_norm": 0.22847194969654083, | |
| "learning_rate": 3.1952269658313963e-05, | |
| "loss": 0.2647, | |
| "mean_token_accuracy": 0.9223105758428574, | |
| "num_tokens": 1569618.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.36420372873544693, | |
| "epoch": 0.8448557430593359, | |
| "grad_norm": 0.2458695024251938, | |
| "learning_rate": 3.163852162725872e-05, | |
| "loss": 0.349, | |
| "mean_token_accuracy": 0.8980138152837753, | |
| "num_tokens": 1573505.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.3188191279768944, | |
| "epoch": 0.8470332063146434, | |
| "grad_norm": 0.245536670088768, | |
| "learning_rate": 3.1328661637356714e-05, | |
| "loss": 0.3177, | |
| "mean_token_accuracy": 0.907622441649437, | |
| "num_tokens": 1577568.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.3238792344927788, | |
| "epoch": 0.8492106695699511, | |
| "grad_norm": 0.24584944546222687, | |
| "learning_rate": 3.102270506285067e-05, | |
| "loss": 0.3085, | |
| "mean_token_accuracy": 0.9090628027915955, | |
| "num_tokens": 1581202.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.34554795920848846, | |
| "epoch": 0.8513881328252586, | |
| "grad_norm": 0.24180692434310913, | |
| "learning_rate": 3.072066708430862e-05, | |
| "loss": 0.3203, | |
| "mean_token_accuracy": 0.9024082869291306, | |
| "num_tokens": 1585340.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.31679805368185043, | |
| "epoch": 0.8535655960805661, | |
| "grad_norm": 0.23670694231987, | |
| "learning_rate": 3.042256268787063e-05, | |
| "loss": 0.2891, | |
| "mean_token_accuracy": 0.9171215295791626, | |
| "num_tokens": 1589570.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.316896952688694, | |
| "epoch": 0.8557430593358737, | |
| "grad_norm": 0.26047396659851074, | |
| "learning_rate": 3.0128406664505215e-05, | |
| "loss": 0.3237, | |
| "mean_token_accuracy": 0.9058733284473419, | |
| "num_tokens": 1593421.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.3199731484055519, | |
| "epoch": 0.8579205225911812, | |
| "grad_norm": 0.2323935478925705, | |
| "learning_rate": 2.9838213609275546e-05, | |
| "loss": 0.3018, | |
| "mean_token_accuracy": 0.9120573252439499, | |
| "num_tokens": 1597598.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.29843273013830185, | |
| "epoch": 0.8600979858464889, | |
| "grad_norm": 0.2387438267469406, | |
| "learning_rate": 2.9551997920615187e-05, | |
| "loss": 0.2862, | |
| "mean_token_accuracy": 0.9175356030464172, | |
| "num_tokens": 1601591.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.31333109736442566, | |
| "epoch": 0.8622754491017964, | |
| "grad_norm": 0.23580299317836761, | |
| "learning_rate": 2.926977379961374e-05, | |
| "loss": 0.3098, | |
| "mean_token_accuracy": 0.911782830953598, | |
| "num_tokens": 1606156.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.32873860746622086, | |
| "epoch": 0.864452912357104, | |
| "grad_norm": 0.23804928362369537, | |
| "learning_rate": 2.899155524931224e-05, | |
| "loss": 0.3171, | |
| "mean_token_accuracy": 0.9060818552970886, | |
| "num_tokens": 1610215.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.331471748650074, | |
| "epoch": 0.8666303756124115, | |
| "grad_norm": 0.22940973937511444, | |
| "learning_rate": 2.8717356074008345e-05, | |
| "loss": 0.3201, | |
| "mean_token_accuracy": 0.905473530292511, | |
| "num_tokens": 1614427.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.33943046629428864, | |
| "epoch": 0.8688078388677191, | |
| "grad_norm": 0.24828903377056122, | |
| "learning_rate": 2.844718987857145e-05, | |
| "loss": 0.3408, | |
| "mean_token_accuracy": 0.8990557938814163, | |
| "num_tokens": 1618891.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.33763300627470016, | |
| "epoch": 0.8709853021230267, | |
| "grad_norm": 0.25826534628868103, | |
| "learning_rate": 2.818107006776761e-05, | |
| "loss": 0.3195, | |
| "mean_token_accuracy": 0.9027258008718491, | |
| "num_tokens": 1622659.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.29499682784080505, | |
| "epoch": 0.8731627653783343, | |
| "grad_norm": 0.22961440682411194, | |
| "learning_rate": 2.7919009845594502e-05, | |
| "loss": 0.2923, | |
| "mean_token_accuracy": 0.9152926355600357, | |
| "num_tokens": 1626858.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.3353520557284355, | |
| "epoch": 0.8753402286336418, | |
| "grad_norm": 0.25194504857063293, | |
| "learning_rate": 2.7661022214626153e-05, | |
| "loss": 0.3207, | |
| "mean_token_accuracy": 0.9085413068532944, | |
| "num_tokens": 1630448.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.29210612177848816, | |
| "epoch": 0.8775176918889493, | |
| "grad_norm": 0.2511427402496338, | |
| "learning_rate": 2.7407119975368006e-05, | |
| "loss": 0.2815, | |
| "mean_token_accuracy": 0.9171009808778763, | |
| "num_tokens": 1634411.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.35340818017721176, | |
| "epoch": 0.8796951551442569, | |
| "grad_norm": 0.24676676094532013, | |
| "learning_rate": 2.7157315725621612e-05, | |
| "loss": 0.3692, | |
| "mean_token_accuracy": 0.905316099524498, | |
| "num_tokens": 1638404.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.3412262871861458, | |
| "epoch": 0.8818726183995645, | |
| "grad_norm": 0.27478235960006714, | |
| "learning_rate": 2.6911621859859658e-05, | |
| "loss": 0.3472, | |
| "mean_token_accuracy": 0.90118607878685, | |
| "num_tokens": 1642162.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.33481264114379883, | |
| "epoch": 0.8840500816548721, | |
| "grad_norm": 0.2933956980705261, | |
| "learning_rate": 2.6670050568610972e-05, | |
| "loss": 0.3248, | |
| "mean_token_accuracy": 0.9072499722242355, | |
| "num_tokens": 1646171.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.3591442406177521, | |
| "epoch": 0.8862275449101796, | |
| "grad_norm": 0.21709908545017242, | |
| "learning_rate": 2.6432613837855658e-05, | |
| "loss": 0.3407, | |
| "mean_token_accuracy": 0.9071426689624786, | |
| "num_tokens": 1650504.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.32970624417066574, | |
| "epoch": 0.8884050081654872, | |
| "grad_norm": 0.23687736690044403, | |
| "learning_rate": 2.6199323448430458e-05, | |
| "loss": 0.3135, | |
| "mean_token_accuracy": 0.903979942202568, | |
| "num_tokens": 1654507.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.3415728807449341, | |
| "epoch": 0.8905824714207947, | |
| "grad_norm": 0.2553468644618988, | |
| "learning_rate": 2.597019097544409e-05, | |
| "loss": 0.3039, | |
| "mean_token_accuracy": 0.9025170505046844, | |
| "num_tokens": 1658421.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.29549212008714676, | |
| "epoch": 0.8927599346761024, | |
| "grad_norm": 0.21464505791664124, | |
| "learning_rate": 2.574522778770308e-05, | |
| "loss": 0.2634, | |
| "mean_token_accuracy": 0.9200884401798248, | |
| "num_tokens": 1662809.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.3326757438480854, | |
| "epoch": 0.8949373979314099, | |
| "grad_norm": 0.23331218957901, | |
| "learning_rate": 2.5524445047147567e-05, | |
| "loss": 0.319, | |
| "mean_token_accuracy": 0.900556892156601, | |
| "num_tokens": 1667221.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.31935854256153107, | |
| "epoch": 0.8971148611867175, | |
| "grad_norm": 0.23457299172878265, | |
| "learning_rate": 2.5307853708297523e-05, | |
| "loss": 0.3045, | |
| "mean_token_accuracy": 0.9045213311910629, | |
| "num_tokens": 1671381.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.3340509235858917, | |
| "epoch": 0.899292324442025, | |
| "grad_norm": 0.23886168003082275, | |
| "learning_rate": 2.5095464517709277e-05, | |
| "loss": 0.3264, | |
| "mean_token_accuracy": 0.899304986000061, | |
| "num_tokens": 1675656.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.3181797042489052, | |
| "epoch": 0.9014697876973327, | |
| "grad_norm": 0.24742458760738373, | |
| "learning_rate": 2.4887288013442218e-05, | |
| "loss": 0.2988, | |
| "mean_token_accuracy": 0.9066351801156998, | |
| "num_tokens": 1679259.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.3163676857948303, | |
| "epoch": 0.9036472509526402, | |
| "grad_norm": 0.25340980291366577, | |
| "learning_rate": 2.468333452453597e-05, | |
| "loss": 0.2979, | |
| "mean_token_accuracy": 0.9118978530168533, | |
| "num_tokens": 1683245.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.30397678166627884, | |
| "epoch": 0.9058247142079477, | |
| "grad_norm": 0.2358277142047882, | |
| "learning_rate": 2.4483614170497916e-05, | |
| "loss": 0.2955, | |
| "mean_token_accuracy": 0.9145314395427704, | |
| "num_tokens": 1687531.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.34245041757822037, | |
| "epoch": 0.9080021774632553, | |
| "grad_norm": 0.23215466737747192, | |
| "learning_rate": 2.4288136860801048e-05, | |
| "loss": 0.326, | |
| "mean_token_accuracy": 0.9006476998329163, | |
| "num_tokens": 1692172.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.3470025435090065, | |
| "epoch": 0.9101796407185628, | |
| "grad_norm": 0.26786699891090393, | |
| "learning_rate": 2.409691229439239e-05, | |
| "loss": 0.3668, | |
| "mean_token_accuracy": 0.8918263465166092, | |
| "num_tokens": 1696141.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.30502913892269135, | |
| "epoch": 0.9123571039738705, | |
| "grad_norm": 0.23780497908592224, | |
| "learning_rate": 2.3909949959211657e-05, | |
| "loss": 0.2906, | |
| "mean_token_accuracy": 0.9070711433887482, | |
| "num_tokens": 1700408.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.3096166178584099, | |
| "epoch": 0.914534567229178, | |
| "grad_norm": 0.21969804167747498, | |
| "learning_rate": 2.372725913172055e-05, | |
| "loss": 0.32, | |
| "mean_token_accuracy": 0.9115228056907654, | |
| "num_tokens": 1704797.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.30217302590608597, | |
| "epoch": 0.9167120304844856, | |
| "grad_norm": 0.23517285287380219, | |
| "learning_rate": 2.3548848876442465e-05, | |
| "loss": 0.2789, | |
| "mean_token_accuracy": 0.9120800346136093, | |
| "num_tokens": 1708762.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.27675122022628784, | |
| "epoch": 0.9188894937397931, | |
| "grad_norm": 0.2593907415866852, | |
| "learning_rate": 2.337472804551281e-05, | |
| "loss": 0.2552, | |
| "mean_token_accuracy": 0.9166678935289383, | |
| "num_tokens": 1712763.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.31945841014385223, | |
| "epoch": 0.9210669569951007, | |
| "grad_norm": 0.22665663063526154, | |
| "learning_rate": 2.320490527823968e-05, | |
| "loss": 0.322, | |
| "mean_token_accuracy": 0.9008611887693405, | |
| "num_tokens": 1717586.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.28783877938985825, | |
| "epoch": 0.9232444202504083, | |
| "grad_norm": 0.2106105536222458, | |
| "learning_rate": 2.303938900067531e-05, | |
| "loss": 0.2571, | |
| "mean_token_accuracy": 0.9197226613759995, | |
| "num_tokens": 1722046.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.31481262296438217, | |
| "epoch": 0.9254218835057159, | |
| "grad_norm": 0.24338746070861816, | |
| "learning_rate": 2.2878187425197893e-05, | |
| "loss": 0.3072, | |
| "mean_token_accuracy": 0.9047886729240417, | |
| "num_tokens": 1726207.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.35147786885499954, | |
| "epoch": 0.9275993467610234, | |
| "grad_norm": 0.2515200078487396, | |
| "learning_rate": 2.272130855010421e-05, | |
| "loss": 0.3496, | |
| "mean_token_accuracy": 0.8965179175138474, | |
| "num_tokens": 1730155.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.36182061582803726, | |
| "epoch": 0.929776810016331, | |
| "grad_norm": 0.2628372609615326, | |
| "learning_rate": 2.2568760159212745e-05, | |
| "loss": 0.3187, | |
| "mean_token_accuracy": 0.9001797884702682, | |
| "num_tokens": 1733927.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.32963769882917404, | |
| "epoch": 0.9319542732716385, | |
| "grad_norm": 0.26346680521965027, | |
| "learning_rate": 2.2420549821477435e-05, | |
| "loss": 0.311, | |
| "mean_token_accuracy": 0.9040227830410004, | |
| "num_tokens": 1737774.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.37061919271945953, | |
| "epoch": 0.9341317365269461, | |
| "grad_norm": 0.2579784691333771, | |
| "learning_rate": 2.227668489061219e-05, | |
| "loss": 0.3676, | |
| "mean_token_accuracy": 0.8960554301738739, | |
| "num_tokens": 1741942.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.3078198730945587, | |
| "epoch": 0.9363091997822537, | |
| "grad_norm": 0.24415822327136993, | |
| "learning_rate": 2.2137172504725956e-05, | |
| "loss": 0.2881, | |
| "mean_token_accuracy": 0.912653386592865, | |
| "num_tokens": 1745914.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.3296479806303978, | |
| "epoch": 0.9384866630375612, | |
| "grad_norm": 0.25575825572013855, | |
| "learning_rate": 2.2002019585968637e-05, | |
| "loss": 0.3096, | |
| "mean_token_accuracy": 0.9089950323104858, | |
| "num_tokens": 1749929.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.3241398259997368, | |
| "epoch": 0.9406641262928688, | |
| "grad_norm": 0.2516978085041046, | |
| "learning_rate": 2.187123284018753e-05, | |
| "loss": 0.3186, | |
| "mean_token_accuracy": 0.9034547358751297, | |
| "num_tokens": 1753992.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.3980755880475044, | |
| "epoch": 0.9428415895481764, | |
| "grad_norm": 0.24856629967689514, | |
| "learning_rate": 2.174481875659472e-05, | |
| "loss": 0.3749, | |
| "mean_token_accuracy": 0.8908516466617584, | |
| "num_tokens": 1758062.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.3143734037876129, | |
| "epoch": 0.945019052803484, | |
| "grad_norm": 0.25844618678092957, | |
| "learning_rate": 2.1622783607444988e-05, | |
| "loss": 0.2784, | |
| "mean_token_accuracy": 0.922119140625, | |
| "num_tokens": 1761689.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.3378266841173172, | |
| "epoch": 0.9471965160587915, | |
| "grad_norm": 0.24213889241218567, | |
| "learning_rate": 2.150513344772469e-05, | |
| "loss": 0.3155, | |
| "mean_token_accuracy": 0.9061428606510162, | |
| "num_tokens": 1766010.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.35330820083618164, | |
| "epoch": 0.9493739793140991, | |
| "grad_norm": 0.2620498836040497, | |
| "learning_rate": 2.1391874114851294e-05, | |
| "loss": 0.3583, | |
| "mean_token_accuracy": 0.9004585295915604, | |
| "num_tokens": 1769801.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.2881145551800728, | |
| "epoch": 0.9515514425694066, | |
| "grad_norm": 0.24421681463718414, | |
| "learning_rate": 2.128301122838377e-05, | |
| "loss": 0.3026, | |
| "mean_token_accuracy": 0.9104648381471634, | |
| "num_tokens": 1774342.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.3526333123445511, | |
| "epoch": 0.9537289058247143, | |
| "grad_norm": 0.2302054911851883, | |
| "learning_rate": 2.117855018974369e-05, | |
| "loss": 0.3199, | |
| "mean_token_accuracy": 0.9067949205636978, | |
| "num_tokens": 1778412.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.32418397441506386, | |
| "epoch": 0.9559063690800218, | |
| "grad_norm": 0.21741004288196564, | |
| "learning_rate": 2.107849618194735e-05, | |
| "loss": 0.3114, | |
| "mean_token_accuracy": 0.9031261652708054, | |
| "num_tokens": 1782995.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.30877869576215744, | |
| "epoch": 0.9580838323353293, | |
| "grad_norm": 0.23063865303993225, | |
| "learning_rate": 2.0982854169348503e-05, | |
| "loss": 0.2949, | |
| "mean_token_accuracy": 0.9094719737768173, | |
| "num_tokens": 1787537.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.3279525935649872, | |
| "epoch": 0.9602612955906369, | |
| "grad_norm": 0.2691234350204468, | |
| "learning_rate": 2.0891628897392087e-05, | |
| "loss": 0.345, | |
| "mean_token_accuracy": 0.8982786238193512, | |
| "num_tokens": 1791355.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.3470368981361389, | |
| "epoch": 0.9624387588459444, | |
| "grad_norm": 0.26819464564323425, | |
| "learning_rate": 2.0804824892378765e-05, | |
| "loss": 0.3414, | |
| "mean_token_accuracy": 0.9030001610517502, | |
| "num_tokens": 1795467.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.3493390902876854, | |
| "epoch": 0.9646162221012521, | |
| "grad_norm": 0.23444399237632751, | |
| "learning_rate": 2.0722446461240352e-05, | |
| "loss": 0.3442, | |
| "mean_token_accuracy": 0.8999157398939133, | |
| "num_tokens": 1800109.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.3092813342809677, | |
| "epoch": 0.9667936853565596, | |
| "grad_norm": 0.23800377547740936, | |
| "learning_rate": 2.0644497691326106e-05, | |
| "loss": 0.2999, | |
| "mean_token_accuracy": 0.9111448973417282, | |
| "num_tokens": 1804018.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.29666490107774734, | |
| "epoch": 0.9689711486118672, | |
| "grad_norm": 0.22874487936496735, | |
| "learning_rate": 2.0570982450199913e-05, | |
| "loss": 0.2858, | |
| "mean_token_accuracy": 0.9175421446561813, | |
| "num_tokens": 1808059.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.3779358044266701, | |
| "epoch": 0.9711486118671747, | |
| "grad_norm": 0.2360084503889084, | |
| "learning_rate": 2.0501904385448447e-05, | |
| "loss": 0.3668, | |
| "mean_token_accuracy": 0.9037110358476639, | |
| "num_tokens": 1812165.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.3430086299777031, | |
| "epoch": 0.9733260751224823, | |
| "grad_norm": 0.2596234679222107, | |
| "learning_rate": 2.043726692450014e-05, | |
| "loss": 0.3233, | |
| "mean_token_accuracy": 0.9003089815378189, | |
| "num_tokens": 1815708.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.3329969719052315, | |
| "epoch": 0.9755035383777899, | |
| "grad_norm": 0.25411558151245117, | |
| "learning_rate": 2.037707327445511e-05, | |
| "loss": 0.3299, | |
| "mean_token_accuracy": 0.9008579254150391, | |
| "num_tokens": 1819635.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.3378527835011482, | |
| "epoch": 0.9776810016330975, | |
| "grad_norm": 0.2512282431125641, | |
| "learning_rate": 2.0321326421926097e-05, | |
| "loss": 0.3325, | |
| "mean_token_accuracy": 0.9022142142057419, | |
| "num_tokens": 1823694.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.34048717468976974, | |
| "epoch": 0.979858464888405, | |
| "grad_norm": 0.24113033711910248, | |
| "learning_rate": 2.0270029132890223e-05, | |
| "loss": 0.344, | |
| "mean_token_accuracy": 0.9008767306804657, | |
| "num_tokens": 1827735.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.3050212487578392, | |
| "epoch": 0.9820359281437125, | |
| "grad_norm": 0.21851961314678192, | |
| "learning_rate": 2.0223183952551785e-05, | |
| "loss": 0.2795, | |
| "mean_token_accuracy": 0.917202040553093, | |
| "num_tokens": 1831884.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.3319382965564728, | |
| "epoch": 0.9842133913990201, | |
| "grad_norm": 0.24525989592075348, | |
| "learning_rate": 2.018079320521593e-05, | |
| "loss": 0.3079, | |
| "mean_token_accuracy": 0.9144886583089828, | |
| "num_tokens": 1835507.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.34535887837409973, | |
| "epoch": 0.9863908546543277, | |
| "grad_norm": 0.2506140172481537, | |
| "learning_rate": 2.0142858994173404e-05, | |
| "loss": 0.3436, | |
| "mean_token_accuracy": 0.9002240151166916, | |
| "num_tokens": 1839606.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.3276618719100952, | |
| "epoch": 0.9885683179096353, | |
| "grad_norm": 0.2481948435306549, | |
| "learning_rate": 2.0109383201596102e-05, | |
| "loss": 0.3105, | |
| "mean_token_accuracy": 0.9108982384204865, | |
| "num_tokens": 1843500.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.3270680084824562, | |
| "epoch": 0.9907457811649428, | |
| "grad_norm": 0.2625768780708313, | |
| "learning_rate": 2.0080367488443743e-05, | |
| "loss": 0.328, | |
| "mean_token_accuracy": 0.9026461988687515, | |
| "num_tokens": 1847739.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.34614715725183487, | |
| "epoch": 0.9929232444202504, | |
| "grad_norm": 0.2605029046535492, | |
| "learning_rate": 2.0055813294381443e-05, | |
| "loss": 0.3467, | |
| "mean_token_accuracy": 0.9046141803264618, | |
| "num_tokens": 1851928.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.284773550927639, | |
| "epoch": 0.995100707675558, | |
| "grad_norm": 0.22357277572155, | |
| "learning_rate": 2.00357218377083e-05, | |
| "loss": 0.2689, | |
| "mean_token_accuracy": 0.9219858795404434, | |
| "num_tokens": 1856283.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.356322281062603, | |
| "epoch": 0.9972781709308656, | |
| "grad_norm": 0.23450958728790283, | |
| "learning_rate": 2.0020094115296876e-05, | |
| "loss": 0.3562, | |
| "mean_token_accuracy": 0.9017274230718613, | |
| "num_tokens": 1861007.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.2814597636461258, | |
| "epoch": 0.9994556341861731, | |
| "grad_norm": 0.2359769642353058, | |
| "learning_rate": 2.0008930902543854e-05, | |
| "loss": 0.2653, | |
| "mean_token_accuracy": 0.9179674088954926, | |
| "num_tokens": 1865010.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.42821022868156433, | |
| "epoch": 1.0, | |
| "grad_norm": 0.8799027800559998, | |
| "learning_rate": 2.0002232753331453e-05, | |
| "loss": 0.4353, | |
| "mean_token_accuracy": 0.8921568393707275, | |
| "num_tokens": 1865318.0, | |
| "step": 460 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 460, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0112748518481592e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |