{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.12656632065773, "epoch": 0.003738317757009346, "grad_norm": 0.4271441102027893, "learning_rate": 0.0002, "loss": 2.4663805961608887, "mean_token_accuracy": 0.543229952454567, "num_tokens": 16235.0, "step": 1 }, { "entropy": 1.2336603701114655, "epoch": 0.007476635514018692, "grad_norm": 0.38558802008628845, "learning_rate": 0.0002, "loss": 2.1421403884887695, "mean_token_accuracy": 0.5718609094619751, "num_tokens": 32508.0, "step": 2 }, { "entropy": 1.3997833728790283, "epoch": 0.011214953271028037, "grad_norm": 0.2918585538864136, "learning_rate": 0.0002, "loss": 1.7201573848724365, "mean_token_accuracy": 0.5951470136642456, "num_tokens": 48740.0, "step": 3 }, { "entropy": 1.3798432350158691, "epoch": 0.014953271028037384, "grad_norm": 0.22533445060253143, "learning_rate": 0.0002, "loss": 1.409985899925232, "mean_token_accuracy": 0.6346195936203003, "num_tokens": 65174.0, "step": 4 }, { "entropy": 1.3409797251224518, "epoch": 0.018691588785046728, "grad_norm": 0.3003067374229431, "learning_rate": 0.0002, "loss": 1.28883695602417, "mean_token_accuracy": 0.6407334357500076, "num_tokens": 81213.0, "step": 5 }, { "entropy": 1.2644231617450714, "epoch": 0.022429906542056073, "grad_norm": 0.1622222661972046, "learning_rate": 0.0002, "loss": 1.1853853464126587, "mean_token_accuracy": 0.6605143547058105, "num_tokens": 97766.0, "step": 6 }, { "entropy": 1.208539754152298, "epoch": 0.026168224299065422, "grad_norm": 0.10511886328458786, "learning_rate": 0.0002, "loss": 1.1084699630737305, "mean_token_accuracy": 0.6641467809677124, "num_tokens": 114186.0, "step": 7 }, { "entropy": 1.1391400694847107, "epoch": 0.029906542056074768, "grad_norm": 0.10200454294681549, "learning_rate": 0.0002, "loss": 1.0516071319580078, "mean_token_accuracy": 0.6707163900136948, "num_tokens": 130305.0, "step": 8 }, { "entropy": 1.0563430190086365, "epoch": 0.03364485981308411, "grad_norm": 0.1273493468761444, "learning_rate": 0.0002, "loss": 0.992067813873291, "mean_token_accuracy": 0.6933889836072922, "num_tokens": 146652.0, "step": 9 }, { "entropy": 0.9964777082204819, "epoch": 0.037383177570093455, "grad_norm": 0.1289750188589096, "learning_rate": 0.0002, "loss": 0.9485647082328796, "mean_token_accuracy": 0.6941430121660233, "num_tokens": 162967.0, "step": 10 }, { "entropy": 0.9659603089094162, "epoch": 0.041121495327102804, "grad_norm": 0.10667150467634201, "learning_rate": 0.0002, "loss": 0.8798340559005737, "mean_token_accuracy": 0.7052389085292816, "num_tokens": 179255.0, "step": 11 }, { "entropy": 0.9392479658126831, "epoch": 0.044859813084112146, "grad_norm": 0.11929332464933395, "learning_rate": 0.0002, "loss": 0.8541638851165771, "mean_token_accuracy": 0.7038426250219345, "num_tokens": 195430.0, "step": 12 }, { "entropy": 0.863442063331604, "epoch": 0.048598130841121495, "grad_norm": 1.4121192693710327, "learning_rate": 0.0002, "loss": 0.8078625202178955, "mean_token_accuracy": 0.7139769345521927, "num_tokens": 211424.0, "step": 13 }, { "entropy": 0.8306129276752472, "epoch": 0.052336448598130844, "grad_norm": 0.10941090434789658, "learning_rate": 0.0002, "loss": 0.7781446576118469, "mean_token_accuracy": 0.7239344716072083, "num_tokens": 227810.0, "step": 14 }, { "entropy": 0.7757371664047241, "epoch": 0.056074766355140186, "grad_norm": 0.10486897826194763, "learning_rate": 0.0002, "loss": 0.7468726634979248, "mean_token_accuracy": 0.7250657230615616, "num_tokens": 243991.0, "step": 15 }, { "entropy": 0.7809797525405884, "epoch": 0.059813084112149535, "grad_norm": 0.8654316663742065, "learning_rate": 0.0002, "loss": 0.7594712376594543, "mean_token_accuracy": 0.7155007421970367, "num_tokens": 260281.0, "step": 16 }, { "entropy": 0.7353586554527283, "epoch": 0.06355140186915888, "grad_norm": 0.0876963660120964, "learning_rate": 0.0002, "loss": 0.7153522372245789, "mean_token_accuracy": 0.7296042591333389, "num_tokens": 276669.0, "step": 17 }, { "entropy": 0.6980300098657608, "epoch": 0.06728971962616823, "grad_norm": 0.07835765182971954, "learning_rate": 0.0002, "loss": 0.6894713640213013, "mean_token_accuracy": 0.7386218756437302, "num_tokens": 292849.0, "step": 18 }, { "entropy": 0.6726928502321243, "epoch": 0.07102803738317758, "grad_norm": 0.08941305428743362, "learning_rate": 0.0002, "loss": 0.6766728162765503, "mean_token_accuracy": 0.7433070838451385, "num_tokens": 309145.0, "step": 19 }, { "entropy": 0.6663309931755066, "epoch": 0.07476635514018691, "grad_norm": 0.08141425251960754, "learning_rate": 0.0002, "loss": 0.6594260334968567, "mean_token_accuracy": 0.7467465251684189, "num_tokens": 325653.0, "step": 20 }, { "entropy": 0.6754828691482544, "epoch": 0.07850467289719626, "grad_norm": 0.08411722630262375, "learning_rate": 0.0002, "loss": 0.661962628364563, "mean_token_accuracy": 0.7418759763240814, "num_tokens": 341884.0, "step": 21 }, { "entropy": 0.6487417817115784, "epoch": 0.08224299065420561, "grad_norm": 0.08564816415309906, "learning_rate": 0.0002, "loss": 0.6224545240402222, "mean_token_accuracy": 0.7568920999765396, "num_tokens": 358367.0, "step": 22 }, { "entropy": 0.6594859212636948, "epoch": 0.08598130841121496, "grad_norm": 0.08242395520210266, "learning_rate": 0.0002, "loss": 0.6353108882904053, "mean_token_accuracy": 0.748349204659462, "num_tokens": 374461.0, "step": 23 }, { "entropy": 0.6361121833324432, "epoch": 0.08971962616822429, "grad_norm": 0.06784524023532867, "learning_rate": 0.0002, "loss": 0.6177537441253662, "mean_token_accuracy": 0.7591407150030136, "num_tokens": 390663.0, "step": 24 }, { "entropy": 0.633724257349968, "epoch": 0.09345794392523364, "grad_norm": 0.06730605661869049, "learning_rate": 0.0002, "loss": 0.6257245540618896, "mean_token_accuracy": 0.7586156576871872, "num_tokens": 407000.0, "step": 25 }, { "entropy": 0.6177336722612381, "epoch": 0.09719626168224299, "grad_norm": 0.07131887227296829, "learning_rate": 0.0002, "loss": 0.6150547862052917, "mean_token_accuracy": 0.7589291036128998, "num_tokens": 423358.0, "step": 26 }, { "entropy": 0.6160949915647507, "epoch": 0.10093457943925234, "grad_norm": 0.06616901606321335, "learning_rate": 0.0002, "loss": 0.6125935316085815, "mean_token_accuracy": 0.7595443874597549, "num_tokens": 439799.0, "step": 27 }, { "entropy": 0.6129452586174011, "epoch": 0.10467289719626169, "grad_norm": 0.05841955915093422, "learning_rate": 0.0002, "loss": 0.608031153678894, "mean_token_accuracy": 0.7601521760225296, "num_tokens": 456163.0, "step": 28 }, { "entropy": 0.5918006747961044, "epoch": 0.10841121495327102, "grad_norm": 0.06275882571935654, "learning_rate": 0.0002, "loss": 0.5862717628479004, "mean_token_accuracy": 0.7687633484601974, "num_tokens": 472127.0, "step": 29 }, { "entropy": 0.6155861914157867, "epoch": 0.11214953271028037, "grad_norm": 0.06225947290658951, "learning_rate": 0.0002, "loss": 0.6084246635437012, "mean_token_accuracy": 0.7598295211791992, "num_tokens": 488332.0, "step": 30 }, { "entropy": 0.6035004556179047, "epoch": 0.11588785046728972, "grad_norm": 0.06444618105888367, "learning_rate": 0.0002, "loss": 0.5935206413269043, "mean_token_accuracy": 0.7651257067918777, "num_tokens": 504710.0, "step": 31 }, { "entropy": 0.6106846928596497, "epoch": 0.11962616822429907, "grad_norm": 0.0602172389626503, "learning_rate": 0.0002, "loss": 0.5996757745742798, "mean_token_accuracy": 0.760893777012825, "num_tokens": 521082.0, "step": 32 }, { "entropy": 0.5945021361112595, "epoch": 0.1233644859813084, "grad_norm": 0.06356704980134964, "learning_rate": 0.0002, "loss": 0.5945574045181274, "mean_token_accuracy": 0.765913113951683, "num_tokens": 537475.0, "step": 33 }, { "entropy": 0.5772300958633423, "epoch": 0.12710280373831775, "grad_norm": 0.06089172512292862, "learning_rate": 0.0002, "loss": 0.5904273986816406, "mean_token_accuracy": 0.76410873234272, "num_tokens": 553508.0, "step": 34 }, { "entropy": 0.550044596195221, "epoch": 0.1308411214953271, "grad_norm": 0.06109277158975601, "learning_rate": 0.0002, "loss": 0.5613641142845154, "mean_token_accuracy": 0.7737480998039246, "num_tokens": 569417.0, "step": 35 }, { "entropy": 0.5723532140254974, "epoch": 0.13457943925233645, "grad_norm": 0.05618736520409584, "learning_rate": 0.0002, "loss": 0.5777797698974609, "mean_token_accuracy": 0.7723707407712936, "num_tokens": 585786.0, "step": 36 }, { "entropy": 0.580461397767067, "epoch": 0.1383177570093458, "grad_norm": 0.05472671613097191, "learning_rate": 0.0002, "loss": 0.5808417201042175, "mean_token_accuracy": 0.7668861597776413, "num_tokens": 602132.0, "step": 37 }, { "entropy": 0.5738302320241928, "epoch": 0.14205607476635515, "grad_norm": 0.06117068976163864, "learning_rate": 0.0002, "loss": 0.57148277759552, "mean_token_accuracy": 0.774108350276947, "num_tokens": 618157.0, "step": 38 }, { "entropy": 0.5823365598917007, "epoch": 0.14579439252336449, "grad_norm": 0.05150913447141647, "learning_rate": 0.0002, "loss": 0.5758525729179382, "mean_token_accuracy": 0.7670020014047623, "num_tokens": 634401.0, "step": 39 }, { "entropy": 0.5688591003417969, "epoch": 0.14953271028037382, "grad_norm": 0.054129600524902344, "learning_rate": 0.0002, "loss": 0.5642731189727783, "mean_token_accuracy": 0.7723482251167297, "num_tokens": 650471.0, "step": 40 }, { "entropy": 0.5930688679218292, "epoch": 0.15327102803738318, "grad_norm": 0.04651381075382233, "learning_rate": 0.0002, "loss": 0.5882899761199951, "mean_token_accuracy": 0.7660222053527832, "num_tokens": 667141.0, "step": 41 }, { "entropy": 0.5680070519447327, "epoch": 0.15700934579439252, "grad_norm": 0.04372819885611534, "learning_rate": 0.0002, "loss": 0.5683910846710205, "mean_token_accuracy": 0.7714007496833801, "num_tokens": 683716.0, "step": 42 }, { "entropy": 0.577846348285675, "epoch": 0.16074766355140188, "grad_norm": 0.050794582813978195, "learning_rate": 0.0002, "loss": 0.5828132033348083, "mean_token_accuracy": 0.7683440744876862, "num_tokens": 700166.0, "step": 43 }, { "entropy": 0.5514896064996719, "epoch": 0.16448598130841122, "grad_norm": 0.05992089584469795, "learning_rate": 0.0002, "loss": 0.563271701335907, "mean_token_accuracy": 0.7739104330539703, "num_tokens": 716342.0, "step": 44 }, { "entropy": 0.575609490275383, "epoch": 0.16822429906542055, "grad_norm": 0.05013341084122658, "learning_rate": 0.0002, "loss": 0.5849894285202026, "mean_token_accuracy": 0.7635113149881363, "num_tokens": 732893.0, "step": 45 }, { "entropy": 0.5762993842363358, "epoch": 0.17196261682242991, "grad_norm": 0.048744700849056244, "learning_rate": 0.0002, "loss": 0.574410080909729, "mean_token_accuracy": 0.7676838040351868, "num_tokens": 749295.0, "step": 46 }, { "entropy": 0.5723859369754791, "epoch": 0.17570093457943925, "grad_norm": 0.05009591579437256, "learning_rate": 0.0002, "loss": 0.5668792128562927, "mean_token_accuracy": 0.7715302407741547, "num_tokens": 765549.0, "step": 47 }, { "entropy": 0.5764475762844086, "epoch": 0.17943925233644858, "grad_norm": 0.04878581687808037, "learning_rate": 0.0002, "loss": 0.5665475130081177, "mean_token_accuracy": 0.7720314264297485, "num_tokens": 781843.0, "step": 48 }, { "entropy": 0.5845135897397995, "epoch": 0.18317757009345795, "grad_norm": 0.04589271917939186, "learning_rate": 0.0002, "loss": 0.5771698951721191, "mean_token_accuracy": 0.7694474011659622, "num_tokens": 798405.0, "step": 49 }, { "entropy": 0.569475919008255, "epoch": 0.18691588785046728, "grad_norm": 0.04119531437754631, "learning_rate": 0.0002, "loss": 0.5674958229064941, "mean_token_accuracy": 0.7736699432134628, "num_tokens": 814777.0, "step": 50 }, { "entropy": 0.5692360401153564, "epoch": 0.19065420560747665, "grad_norm": 0.0399826280772686, "learning_rate": 0.0002, "loss": 0.5702151656150818, "mean_token_accuracy": 0.7684639543294907, "num_tokens": 831134.0, "step": 51 }, { "entropy": 0.5498989522457123, "epoch": 0.19439252336448598, "grad_norm": 0.05800061300396919, "learning_rate": 0.0002, "loss": 0.5609486699104309, "mean_token_accuracy": 0.7740016728639603, "num_tokens": 847344.0, "step": 52 }, { "entropy": 0.5662340968847275, "epoch": 0.19813084112149532, "grad_norm": 0.047494642436504364, "learning_rate": 0.0002, "loss": 0.5766743421554565, "mean_token_accuracy": 0.7678139507770538, "num_tokens": 863618.0, "step": 53 }, { "entropy": 0.5752062201499939, "epoch": 0.20186915887850468, "grad_norm": 0.05196239426732063, "learning_rate": 0.0002, "loss": 0.5726749300956726, "mean_token_accuracy": 0.7699306309223175, "num_tokens": 879844.0, "step": 54 }, { "entropy": 0.5600160509347916, "epoch": 0.205607476635514, "grad_norm": 0.04689890146255493, "learning_rate": 0.0002, "loss": 0.5549072623252869, "mean_token_accuracy": 0.7740037143230438, "num_tokens": 896085.0, "step": 55 }, { "entropy": 0.5731441378593445, "epoch": 0.20934579439252338, "grad_norm": 0.04465720057487488, "learning_rate": 0.0002, "loss": 0.5675906538963318, "mean_token_accuracy": 0.7729700356721878, "num_tokens": 912450.0, "step": 56 }, { "entropy": 0.5865043848752975, "epoch": 0.2130841121495327, "grad_norm": 0.03869406878948212, "learning_rate": 0.0002, "loss": 0.5814957022666931, "mean_token_accuracy": 0.7672637850046158, "num_tokens": 928895.0, "step": 57 }, { "entropy": 0.5643806457519531, "epoch": 0.21682242990654205, "grad_norm": 0.03822167217731476, "learning_rate": 0.0002, "loss": 0.5589677691459656, "mean_token_accuracy": 0.7748006731271744, "num_tokens": 945239.0, "step": 58 }, { "entropy": 0.5692119598388672, "epoch": 0.2205607476635514, "grad_norm": 0.042791273444890976, "learning_rate": 0.0002, "loss": 0.5735809206962585, "mean_token_accuracy": 0.7694528251886368, "num_tokens": 961363.0, "step": 59 }, { "entropy": 0.5709938555955887, "epoch": 0.22429906542056074, "grad_norm": 0.04215843975543976, "learning_rate": 0.0002, "loss": 0.5765149593353271, "mean_token_accuracy": 0.7663712352514267, "num_tokens": 977455.0, "step": 60 }, { "entropy": 0.5535417348146439, "epoch": 0.22803738317757008, "grad_norm": 0.046243466436862946, "learning_rate": 0.0002, "loss": 0.5625483989715576, "mean_token_accuracy": 0.7734335362911224, "num_tokens": 993620.0, "step": 61 }, { "entropy": 0.5543283224105835, "epoch": 0.23177570093457944, "grad_norm": 0.0379357784986496, "learning_rate": 0.0002, "loss": 0.5572807788848877, "mean_token_accuracy": 0.7759047746658325, "num_tokens": 1009834.0, "step": 62 }, { "entropy": 0.5534257739782333, "epoch": 0.23551401869158878, "grad_norm": 0.03617486730217934, "learning_rate": 0.0002, "loss": 0.5538501739501953, "mean_token_accuracy": 0.7762316316366196, "num_tokens": 1025981.0, "step": 63 }, { "entropy": 0.5688228756189346, "epoch": 0.23925233644859814, "grad_norm": 0.03479798510670662, "learning_rate": 0.0002, "loss": 0.5626167058944702, "mean_token_accuracy": 0.7745891660451889, "num_tokens": 1042596.0, "step": 64 }, { "entropy": 0.5704841166734695, "epoch": 0.24299065420560748, "grad_norm": 0.04157167300581932, "learning_rate": 0.0002, "loss": 0.568891704082489, "mean_token_accuracy": 0.7680116444826126, "num_tokens": 1058884.0, "step": 65 }, { "entropy": 0.5743043571710587, "epoch": 0.2467289719626168, "grad_norm": 0.03632580116391182, "learning_rate": 0.0002, "loss": 0.5711199045181274, "mean_token_accuracy": 0.769555926322937, "num_tokens": 1075319.0, "step": 66 }, { "entropy": 0.559576690196991, "epoch": 0.2504672897196262, "grad_norm": 0.038374125957489014, "learning_rate": 0.0002, "loss": 0.5629530549049377, "mean_token_accuracy": 0.771178126335144, "num_tokens": 1091451.0, "step": 67 }, { "entropy": 0.5548212379217148, "epoch": 0.2542056074766355, "grad_norm": 0.03802485764026642, "learning_rate": 0.0002, "loss": 0.5578069686889648, "mean_token_accuracy": 0.7767467051744461, "num_tokens": 1107549.0, "step": 68 }, { "entropy": 0.5731668472290039, "epoch": 0.25794392523364484, "grad_norm": 0.03902502730488777, "learning_rate": 0.0002, "loss": 0.5750908255577087, "mean_token_accuracy": 0.7706117182970047, "num_tokens": 1123904.0, "step": 69 }, { "entropy": 0.5669015496969223, "epoch": 0.2616822429906542, "grad_norm": 0.03905792534351349, "learning_rate": 0.0002, "loss": 0.5693663358688354, "mean_token_accuracy": 0.7708643227815628, "num_tokens": 1139931.0, "step": 70 }, { "entropy": 0.5661756098270416, "epoch": 0.26542056074766357, "grad_norm": 0.04826045408844948, "learning_rate": 0.0002, "loss": 0.5717936754226685, "mean_token_accuracy": 0.7682332992553711, "num_tokens": 1156090.0, "step": 71 }, { "entropy": 0.568753570318222, "epoch": 0.2691588785046729, "grad_norm": 0.03873279318213463, "learning_rate": 0.0002, "loss": 0.5717485547065735, "mean_token_accuracy": 0.7686503529548645, "num_tokens": 1172312.0, "step": 72 }, { "entropy": 0.5719727724790573, "epoch": 0.27289719626168224, "grad_norm": 0.039684589952230453, "learning_rate": 0.0002, "loss": 0.565541684627533, "mean_token_accuracy": 0.769890546798706, "num_tokens": 1188846.0, "step": 73 }, { "entropy": 0.5802080780267715, "epoch": 0.2766355140186916, "grad_norm": 0.03692556545138359, "learning_rate": 0.0002, "loss": 0.5813108682632446, "mean_token_accuracy": 0.7652633637189865, "num_tokens": 1205115.0, "step": 74 }, { "entropy": 0.5709390044212341, "epoch": 0.2803738317757009, "grad_norm": 0.03715148940682411, "learning_rate": 0.0002, "loss": 0.5739152431488037, "mean_token_accuracy": 0.7695163637399673, "num_tokens": 1221457.0, "step": 75 }, { "entropy": 0.5634023249149323, "epoch": 0.2841121495327103, "grad_norm": 0.035052694380283356, "learning_rate": 0.0002, "loss": 0.5634779334068298, "mean_token_accuracy": 0.7735425382852554, "num_tokens": 1237852.0, "step": 76 }, { "entropy": 0.5770431756973267, "epoch": 0.28785046728971964, "grad_norm": 0.04037750884890556, "learning_rate": 0.0002, "loss": 0.5792219042778015, "mean_token_accuracy": 0.7656148821115494, "num_tokens": 1253991.0, "step": 77 }, { "entropy": 0.5483120232820511, "epoch": 0.29158878504672897, "grad_norm": 0.04199967905879021, "learning_rate": 0.0002, "loss": 0.5473575592041016, "mean_token_accuracy": 0.7797968685626984, "num_tokens": 1270154.0, "step": 78 }, { "entropy": 0.5623519718647003, "epoch": 0.2953271028037383, "grad_norm": 0.04001434147357941, "learning_rate": 0.0002, "loss": 0.5669924020767212, "mean_token_accuracy": 0.7740958780050278, "num_tokens": 1286373.0, "step": 79 }, { "entropy": 0.5505794137716293, "epoch": 0.29906542056074764, "grad_norm": 0.039846453815698624, "learning_rate": 0.0002, "loss": 0.5637381076812744, "mean_token_accuracy": 0.7710813283920288, "num_tokens": 1302910.0, "step": 80 }, { "entropy": 0.573449894785881, "epoch": 0.30280373831775703, "grad_norm": 0.03970034047961235, "learning_rate": 0.0002, "loss": 0.5817972421646118, "mean_token_accuracy": 0.767284482717514, "num_tokens": 1319105.0, "step": 81 }, { "entropy": 0.5815064907073975, "epoch": 0.30654205607476637, "grad_norm": 0.036917295306921005, "learning_rate": 0.0002, "loss": 0.5764390826225281, "mean_token_accuracy": 0.7660059034824371, "num_tokens": 1335418.0, "step": 82 }, { "entropy": 0.5537111163139343, "epoch": 0.3102803738317757, "grad_norm": 0.038016658276319504, "learning_rate": 0.0002, "loss": 0.544030487537384, "mean_token_accuracy": 0.780098170042038, "num_tokens": 1351471.0, "step": 83 }, { "entropy": 0.5532083511352539, "epoch": 0.31401869158878504, "grad_norm": 0.03766188770532608, "learning_rate": 0.0002, "loss": 0.543038010597229, "mean_token_accuracy": 0.7815051227807999, "num_tokens": 1367729.0, "step": 84 }, { "entropy": 0.569915771484375, "epoch": 0.3177570093457944, "grad_norm": 0.03935057669878006, "learning_rate": 0.0002, "loss": 0.5673943758010864, "mean_token_accuracy": 0.7705481499433517, "num_tokens": 1384218.0, "step": 85 }, { "entropy": 0.5557460188865662, "epoch": 0.32149532710280376, "grad_norm": 0.0382615365087986, "learning_rate": 0.0002, "loss": 0.5650104284286499, "mean_token_accuracy": 0.7701956182718277, "num_tokens": 1400496.0, "step": 86 }, { "entropy": 0.5529367923736572, "epoch": 0.3252336448598131, "grad_norm": 0.03607897832989693, "learning_rate": 0.0002, "loss": 0.5612208843231201, "mean_token_accuracy": 0.773573562502861, "num_tokens": 1416728.0, "step": 87 }, { "entropy": 0.5617222934961319, "epoch": 0.32897196261682243, "grad_norm": 0.0373239666223526, "learning_rate": 0.0002, "loss": 0.5661642551422119, "mean_token_accuracy": 0.7711510807275772, "num_tokens": 1433091.0, "step": 88 }, { "entropy": 0.55742546916008, "epoch": 0.33271028037383177, "grad_norm": 0.03938078507781029, "learning_rate": 0.0002, "loss": 0.5600550770759583, "mean_token_accuracy": 0.7730235010385513, "num_tokens": 1449246.0, "step": 89 }, { "entropy": 0.5685389190912247, "epoch": 0.3364485981308411, "grad_norm": 0.040714140981435776, "learning_rate": 0.0002, "loss": 0.5676398873329163, "mean_token_accuracy": 0.7700921297073364, "num_tokens": 1465805.0, "step": 90 }, { "entropy": 0.5774114727973938, "epoch": 0.3401869158878505, "grad_norm": 0.03398137167096138, "learning_rate": 0.0002, "loss": 0.5775306224822998, "mean_token_accuracy": 0.7659128755331039, "num_tokens": 1482298.0, "step": 91 }, { "entropy": 0.5467455387115479, "epoch": 0.34392523364485983, "grad_norm": 0.032925065606832504, "learning_rate": 0.0002, "loss": 0.5481046438217163, "mean_token_accuracy": 0.7773325145244598, "num_tokens": 1498536.0, "step": 92 }, { "entropy": 0.5445878356695175, "epoch": 0.34766355140186916, "grad_norm": 0.03473861888051033, "learning_rate": 0.0002, "loss": 0.5424526929855347, "mean_token_accuracy": 0.7816839218139648, "num_tokens": 1514823.0, "step": 93 }, { "entropy": 0.5637122839689255, "epoch": 0.3514018691588785, "grad_norm": 0.03804982081055641, "learning_rate": 0.0002, "loss": 0.5646781325340271, "mean_token_accuracy": 0.7692969292402267, "num_tokens": 1531148.0, "step": 94 }, { "entropy": 0.5571535974740982, "epoch": 0.35514018691588783, "grad_norm": 0.03457267954945564, "learning_rate": 0.0002, "loss": 0.5619444251060486, "mean_token_accuracy": 0.7773198187351227, "num_tokens": 1547476.0, "step": 95 }, { "entropy": 0.5707617700099945, "epoch": 0.35887850467289717, "grad_norm": 0.03933979198336601, "learning_rate": 0.0002, "loss": 0.572324275970459, "mean_token_accuracy": 0.7692963778972626, "num_tokens": 1563979.0, "step": 96 }, { "entropy": 0.556370198726654, "epoch": 0.36261682242990656, "grad_norm": 0.03271894529461861, "learning_rate": 0.0002, "loss": 0.5558284521102905, "mean_token_accuracy": 0.7744213789701462, "num_tokens": 1580311.0, "step": 97 }, { "entropy": 0.5528354942798615, "epoch": 0.3663551401869159, "grad_norm": 0.03302107751369476, "learning_rate": 0.0002, "loss": 0.5553282499313354, "mean_token_accuracy": 0.77690689265728, "num_tokens": 1596402.0, "step": 98 }, { "entropy": 0.5531659126281738, "epoch": 0.37009345794392523, "grad_norm": 0.03468908742070198, "learning_rate": 0.0002, "loss": 0.5576953887939453, "mean_token_accuracy": 0.7762762904167175, "num_tokens": 1612430.0, "step": 99 }, { "entropy": 0.5810890346765518, "epoch": 0.37383177570093457, "grad_norm": 0.03342665359377861, "learning_rate": 0.0002, "loss": 0.5769139528274536, "mean_token_accuracy": 0.7672095000743866, "num_tokens": 1628891.0, "step": 100 }, { "entropy": 0.5750298053026199, "epoch": 0.3775700934579439, "grad_norm": 0.03441772237420082, "learning_rate": 0.0002, "loss": 0.5772010087966919, "mean_token_accuracy": 0.7646144926548004, "num_tokens": 1645047.0, "step": 101 }, { "entropy": 0.5650183409452438, "epoch": 0.3813084112149533, "grad_norm": 0.03096170350909233, "learning_rate": 0.0002, "loss": 0.5606149435043335, "mean_token_accuracy": 0.7738576829433441, "num_tokens": 1661380.0, "step": 102 }, { "entropy": 0.5494536608457565, "epoch": 0.3850467289719626, "grad_norm": 0.03677360713481903, "learning_rate": 0.0002, "loss": 0.5568496584892273, "mean_token_accuracy": 0.775225818157196, "num_tokens": 1677541.0, "step": 103 }, { "entropy": 0.5550926774740219, "epoch": 0.38878504672897196, "grad_norm": 0.03032948076725006, "learning_rate": 0.0002, "loss": 0.558656632900238, "mean_token_accuracy": 0.7753722071647644, "num_tokens": 1693849.0, "step": 104 }, { "entropy": 0.5538856834173203, "epoch": 0.3925233644859813, "grad_norm": 0.033197011798620224, "learning_rate": 0.0002, "loss": 0.5585562586784363, "mean_token_accuracy": 0.7750265747308731, "num_tokens": 1710410.0, "step": 105 }, { "entropy": 0.557091012597084, "epoch": 0.39626168224299063, "grad_norm": 0.03343191742897034, "learning_rate": 0.0002, "loss": 0.5658184885978699, "mean_token_accuracy": 0.7713737785816193, "num_tokens": 1726519.0, "step": 106 }, { "entropy": 0.573070839047432, "epoch": 0.4, "grad_norm": 0.03520960360765457, "learning_rate": 0.0002, "loss": 0.5683936476707458, "mean_token_accuracy": 0.7706228792667389, "num_tokens": 1742802.0, "step": 107 }, { "entropy": 0.5730053037405014, "epoch": 0.40373831775700936, "grad_norm": 0.032127268612384796, "learning_rate": 0.0002, "loss": 0.5697438716888428, "mean_token_accuracy": 0.7664725631475449, "num_tokens": 1759059.0, "step": 108 }, { "entropy": 0.5633453279733658, "epoch": 0.4074766355140187, "grad_norm": 0.03088793158531189, "learning_rate": 0.0002, "loss": 0.5599843263626099, "mean_token_accuracy": 0.7760611772537231, "num_tokens": 1775536.0, "step": 109 }, { "entropy": 0.550876572728157, "epoch": 0.411214953271028, "grad_norm": 0.032173894345760345, "learning_rate": 0.0002, "loss": 0.552717387676239, "mean_token_accuracy": 0.7752785235643387, "num_tokens": 1792019.0, "step": 110 }, { "entropy": 0.5721830427646637, "epoch": 0.41495327102803736, "grad_norm": 0.033584315329790115, "learning_rate": 0.0002, "loss": 0.5759853720664978, "mean_token_accuracy": 0.7664880454540253, "num_tokens": 1808419.0, "step": 111 }, { "entropy": 0.5759546905755997, "epoch": 0.41869158878504675, "grad_norm": 0.03846940025687218, "learning_rate": 0.0002, "loss": 0.5841522216796875, "mean_token_accuracy": 0.7626957893371582, "num_tokens": 1824543.0, "step": 112 }, { "entropy": 0.5635320693254471, "epoch": 0.4224299065420561, "grad_norm": 0.03328083083033562, "learning_rate": 0.0002, "loss": 0.5629671812057495, "mean_token_accuracy": 0.7737283408641815, "num_tokens": 1840757.0, "step": 113 }, { "entropy": 0.5591580420732498, "epoch": 0.4261682242990654, "grad_norm": 0.0327068492770195, "learning_rate": 0.0002, "loss": 0.551184356212616, "mean_token_accuracy": 0.7753513604402542, "num_tokens": 1857132.0, "step": 114 }, { "entropy": 0.5579714924097061, "epoch": 0.42990654205607476, "grad_norm": 0.0334380678832531, "learning_rate": 0.0002, "loss": 0.5555400252342224, "mean_token_accuracy": 0.7759147882461548, "num_tokens": 1873360.0, "step": 115 }, { "entropy": 0.5697025954723358, "epoch": 0.4336448598130841, "grad_norm": 0.03651506081223488, "learning_rate": 0.0002, "loss": 0.568575382232666, "mean_token_accuracy": 0.7692690938711166, "num_tokens": 1889933.0, "step": 116 }, { "entropy": 0.5710670948028564, "epoch": 0.4373831775700935, "grad_norm": 0.03260137885808945, "learning_rate": 0.0002, "loss": 0.5754102468490601, "mean_token_accuracy": 0.7645916491746902, "num_tokens": 1906415.0, "step": 117 }, { "entropy": 0.5612241625785828, "epoch": 0.4411214953271028, "grad_norm": 0.030186068266630173, "learning_rate": 0.0002, "loss": 0.5625964403152466, "mean_token_accuracy": 0.7733658254146576, "num_tokens": 1922692.0, "step": 118 }, { "entropy": 0.5558670610189438, "epoch": 0.44485981308411215, "grad_norm": 0.0367811918258667, "learning_rate": 0.0002, "loss": 0.5577695965766907, "mean_token_accuracy": 0.772549107670784, "num_tokens": 1939001.0, "step": 119 }, { "entropy": 0.5691811889410019, "epoch": 0.4485981308411215, "grad_norm": 0.03843454644083977, "learning_rate": 0.0002, "loss": 0.5703588128089905, "mean_token_accuracy": 0.7689766734838486, "num_tokens": 1955537.0, "step": 120 }, { "entropy": 0.5652327984571457, "epoch": 0.4523364485981308, "grad_norm": 0.032110750675201416, "learning_rate": 0.0002, "loss": 0.5627662539482117, "mean_token_accuracy": 0.7731665819883347, "num_tokens": 1971820.0, "step": 121 }, { "entropy": 0.5414326637983322, "epoch": 0.45607476635514016, "grad_norm": 0.031934358179569244, "learning_rate": 0.0002, "loss": 0.5432534217834473, "mean_token_accuracy": 0.7791064232587814, "num_tokens": 1988118.0, "step": 122 }, { "entropy": 0.5502553433179855, "epoch": 0.45981308411214955, "grad_norm": 0.035253144800662994, "learning_rate": 0.0002, "loss": 0.5521403551101685, "mean_token_accuracy": 0.7760459184646606, "num_tokens": 2004642.0, "step": 123 }, { "entropy": 0.5582242012023926, "epoch": 0.4635514018691589, "grad_norm": 0.035558655858039856, "learning_rate": 0.0002, "loss": 0.5682451725006104, "mean_token_accuracy": 0.7699540108442307, "num_tokens": 2020965.0, "step": 124 }, { "entropy": 0.5626089125871658, "epoch": 0.4672897196261682, "grad_norm": 0.028148163110017776, "learning_rate": 0.0002, "loss": 0.5638797283172607, "mean_token_accuracy": 0.7697459608316422, "num_tokens": 2037202.0, "step": 125 }, { "entropy": 0.5653271377086639, "epoch": 0.47102803738317756, "grad_norm": 0.03597045689821243, "learning_rate": 0.0002, "loss": 0.5635451078414917, "mean_token_accuracy": 0.7696232795715332, "num_tokens": 2053309.0, "step": 126 }, { "entropy": 0.560562789440155, "epoch": 0.4747663551401869, "grad_norm": 0.03047817200422287, "learning_rate": 0.0002, "loss": 0.5625080466270447, "mean_token_accuracy": 0.7718035280704498, "num_tokens": 2069535.0, "step": 127 }, { "entropy": 0.5554249584674835, "epoch": 0.4785046728971963, "grad_norm": 0.028741145506501198, "learning_rate": 0.0002, "loss": 0.5504335165023804, "mean_token_accuracy": 0.7771810442209244, "num_tokens": 2085763.0, "step": 128 }, { "entropy": 0.5567069947719574, "epoch": 0.4822429906542056, "grad_norm": 0.031639862805604935, "learning_rate": 0.0002, "loss": 0.5562032461166382, "mean_token_accuracy": 0.7760691046714783, "num_tokens": 2102046.0, "step": 129 }, { "entropy": 0.5418022572994232, "epoch": 0.48598130841121495, "grad_norm": 0.03434485197067261, "learning_rate": 0.0002, "loss": 0.5446175932884216, "mean_token_accuracy": 0.7789350152015686, "num_tokens": 2118239.0, "step": 130 }, { "entropy": 0.5367967188358307, "epoch": 0.4897196261682243, "grad_norm": 0.03757743164896965, "learning_rate": 0.0002, "loss": 0.5414644479751587, "mean_token_accuracy": 0.7816939055919647, "num_tokens": 2134627.0, "step": 131 }, { "entropy": 0.5399434715509415, "epoch": 0.4934579439252336, "grad_norm": 0.03444533050060272, "learning_rate": 0.0002, "loss": 0.5489372611045837, "mean_token_accuracy": 0.7746081054210663, "num_tokens": 2150944.0, "step": 132 }, { "entropy": 0.5634311139583588, "epoch": 0.497196261682243, "grad_norm": 0.028091201558709145, "learning_rate": 0.0002, "loss": 0.5653581619262695, "mean_token_accuracy": 0.7713855057954788, "num_tokens": 2167218.0, "step": 133 }, { "entropy": 0.5568374693393707, "epoch": 0.5009345794392523, "grad_norm": 0.029833409935235977, "learning_rate": 0.0002, "loss": 0.5585245490074158, "mean_token_accuracy": 0.7745143622159958, "num_tokens": 2183449.0, "step": 134 }, { "entropy": 0.5839870423078537, "epoch": 0.5046728971962616, "grad_norm": 0.03770853579044342, "learning_rate": 0.0002, "loss": 0.5719978213310242, "mean_token_accuracy": 0.7675238102674484, "num_tokens": 2199875.0, "step": 135 }, { "entropy": 0.5689375847578049, "epoch": 0.508411214953271, "grad_norm": 0.03635553643107414, "learning_rate": 0.0002, "loss": 0.5626992583274841, "mean_token_accuracy": 0.7723798751831055, "num_tokens": 2216163.0, "step": 136 }, { "entropy": 0.5507294833660126, "epoch": 0.5121495327102804, "grad_norm": 0.03596559911966324, "learning_rate": 0.0002, "loss": 0.5608267188072205, "mean_token_accuracy": 0.7710549086332321, "num_tokens": 2232636.0, "step": 137 }, { "entropy": 0.5623424351215363, "epoch": 0.5158878504672897, "grad_norm": 0.033818867057561874, "learning_rate": 0.0002, "loss": 0.5718593597412109, "mean_token_accuracy": 0.7696182578802109, "num_tokens": 2248825.0, "step": 138 }, { "entropy": 0.5675409585237503, "epoch": 0.5196261682242991, "grad_norm": 0.03331133350729942, "learning_rate": 0.0002, "loss": 0.5714356899261475, "mean_token_accuracy": 0.7693182229995728, "num_tokens": 2265359.0, "step": 139 }, { "entropy": 0.5522013902664185, "epoch": 0.5233644859813084, "grad_norm": 0.03208749741315842, "learning_rate": 0.0002, "loss": 0.5529259443283081, "mean_token_accuracy": 0.7765516042709351, "num_tokens": 2281629.0, "step": 140 }, { "entropy": 0.5493837893009186, "epoch": 0.5271028037383177, "grad_norm": 0.0305814016610384, "learning_rate": 0.0002, "loss": 0.5490883588790894, "mean_token_accuracy": 0.7763204425573349, "num_tokens": 2297908.0, "step": 141 }, { "entropy": 0.5564678907394409, "epoch": 0.5308411214953271, "grad_norm": 0.034225739538669586, "learning_rate": 0.0002, "loss": 0.5602461099624634, "mean_token_accuracy": 0.7709554880857468, "num_tokens": 2314115.0, "step": 142 }, { "entropy": 0.5697164833545685, "epoch": 0.5345794392523364, "grad_norm": 0.03395864740014076, "learning_rate": 0.0002, "loss": 0.5692602396011353, "mean_token_accuracy": 0.766906350851059, "num_tokens": 2330462.0, "step": 143 }, { "entropy": 0.5691278576850891, "epoch": 0.5383177570093458, "grad_norm": 0.03194013983011246, "learning_rate": 0.0002, "loss": 0.562545657157898, "mean_token_accuracy": 0.7723768651485443, "num_tokens": 2346630.0, "step": 144 }, { "entropy": 0.558807983994484, "epoch": 0.5420560747663551, "grad_norm": 0.036789294332265854, "learning_rate": 0.0002, "loss": 0.5632457733154297, "mean_token_accuracy": 0.772635355591774, "num_tokens": 2362732.0, "step": 145 }, { "entropy": 0.5582777112722397, "epoch": 0.5457943925233645, "grad_norm": 0.02997492626309395, "learning_rate": 0.0002, "loss": 0.5614091753959656, "mean_token_accuracy": 0.7702963054180145, "num_tokens": 2379199.0, "step": 146 }, { "entropy": 0.5584180504083633, "epoch": 0.5495327102803739, "grad_norm": 0.033580392599105835, "learning_rate": 0.0002, "loss": 0.5605478286743164, "mean_token_accuracy": 0.7730905264616013, "num_tokens": 2395497.0, "step": 147 }, { "entropy": 0.5477179437875748, "epoch": 0.5532710280373832, "grad_norm": 0.03941367194056511, "learning_rate": 0.0002, "loss": 0.5504173636436462, "mean_token_accuracy": 0.77938412129879, "num_tokens": 2411648.0, "step": 148 }, { "entropy": 0.5601572096347809, "epoch": 0.5570093457943925, "grad_norm": 0.030582338571548462, "learning_rate": 0.0002, "loss": 0.5634943246841431, "mean_token_accuracy": 0.7728341221809387, "num_tokens": 2427925.0, "step": 149 }, { "entropy": 0.5869706571102142, "epoch": 0.5607476635514018, "grad_norm": 0.036973923444747925, "learning_rate": 0.0002, "loss": 0.5785589218139648, "mean_token_accuracy": 0.765045240521431, "num_tokens": 2444416.0, "step": 150 }, { "entropy": 0.5624907165765762, "epoch": 0.5644859813084112, "grad_norm": 0.036355964839458466, "learning_rate": 0.0002, "loss": 0.5561196208000183, "mean_token_accuracy": 0.7752401679754257, "num_tokens": 2460808.0, "step": 151 }, { "entropy": 0.5570034384727478, "epoch": 0.5682242990654206, "grad_norm": 0.027923110872507095, "learning_rate": 0.0002, "loss": 0.5550441145896912, "mean_token_accuracy": 0.7757884711027145, "num_tokens": 2477437.0, "step": 152 }, { "entropy": 0.5643865615129471, "epoch": 0.5719626168224299, "grad_norm": 0.0321192592382431, "learning_rate": 0.0002, "loss": 0.5707546472549438, "mean_token_accuracy": 0.7692134529352188, "num_tokens": 2493966.0, "step": 153 }, { "entropy": 0.5535547733306885, "epoch": 0.5757009345794393, "grad_norm": 0.03465733677148819, "learning_rate": 0.0002, "loss": 0.5610126256942749, "mean_token_accuracy": 0.7733882069587708, "num_tokens": 2510442.0, "step": 154 }, { "entropy": 0.5411207228899002, "epoch": 0.5794392523364486, "grad_norm": 0.03268473595380783, "learning_rate": 0.0002, "loss": 0.5444988012313843, "mean_token_accuracy": 0.7791947424411774, "num_tokens": 2526738.0, "step": 155 }, { "entropy": 0.5539679378271103, "epoch": 0.5831775700934579, "grad_norm": 0.03345946595072746, "learning_rate": 0.0002, "loss": 0.5571167469024658, "mean_token_accuracy": 0.7733618319034576, "num_tokens": 2543004.0, "step": 156 }, { "entropy": 0.547135517001152, "epoch": 0.5869158878504673, "grad_norm": 0.03414901718497276, "learning_rate": 0.0002, "loss": 0.5551236867904663, "mean_token_accuracy": 0.7734578996896744, "num_tokens": 2559150.0, "step": 157 }, { "entropy": 0.5595978051424026, "epoch": 0.5906542056074766, "grad_norm": 0.03502917289733887, "learning_rate": 0.0002, "loss": 0.5722506046295166, "mean_token_accuracy": 0.7680937796831131, "num_tokens": 2575360.0, "step": 158 }, { "entropy": 0.56221604347229, "epoch": 0.594392523364486, "grad_norm": 0.036693476140499115, "learning_rate": 0.0002, "loss": 0.5663124918937683, "mean_token_accuracy": 0.7699347287416458, "num_tokens": 2591749.0, "step": 159 }, { "entropy": 0.5489411354064941, "epoch": 0.5981308411214953, "grad_norm": 0.029823357239365578, "learning_rate": 0.0002, "loss": 0.5525665879249573, "mean_token_accuracy": 0.7778102308511734, "num_tokens": 2608011.0, "step": 160 }, { "entropy": 0.5679098963737488, "epoch": 0.6018691588785047, "grad_norm": 0.03129269927740097, "learning_rate": 0.0002, "loss": 0.5632325410842896, "mean_token_accuracy": 0.7711086720228195, "num_tokens": 2624110.0, "step": 161 }, { "entropy": 0.5759385228157043, "epoch": 0.6056074766355141, "grad_norm": 0.03027232177555561, "learning_rate": 0.0002, "loss": 0.566430926322937, "mean_token_accuracy": 0.7684105038642883, "num_tokens": 2640619.0, "step": 162 }, { "entropy": 0.5755711048841476, "epoch": 0.6093457943925233, "grad_norm": 0.02997921220958233, "learning_rate": 0.0002, "loss": 0.5693614482879639, "mean_token_accuracy": 0.7678638249635696, "num_tokens": 2656816.0, "step": 163 }, { "entropy": 0.5675656646490097, "epoch": 0.6130841121495327, "grad_norm": 0.02925792895257473, "learning_rate": 0.0002, "loss": 0.5620183348655701, "mean_token_accuracy": 0.7710973769426346, "num_tokens": 2673238.0, "step": 164 }, { "entropy": 0.5436252653598785, "epoch": 0.616822429906542, "grad_norm": 0.030324436724185944, "learning_rate": 0.0002, "loss": 0.5462444424629211, "mean_token_accuracy": 0.779330775141716, "num_tokens": 2689740.0, "step": 165 }, { "entropy": 0.5572406202554703, "epoch": 0.6205607476635514, "grad_norm": 0.03400828689336777, "learning_rate": 0.0002, "loss": 0.5641958713531494, "mean_token_accuracy": 0.7692032605409622, "num_tokens": 2706162.0, "step": 166 }, { "entropy": 0.554596871137619, "epoch": 0.6242990654205608, "grad_norm": 0.03054538182914257, "learning_rate": 0.0002, "loss": 0.556669294834137, "mean_token_accuracy": 0.7765887379646301, "num_tokens": 2722464.0, "step": 167 }, { "entropy": 0.5644665658473969, "epoch": 0.6280373831775701, "grad_norm": 0.03194966912269592, "learning_rate": 0.0002, "loss": 0.5671694278717041, "mean_token_accuracy": 0.7694765031337738, "num_tokens": 2738958.0, "step": 168 }, { "entropy": 0.5491771847009659, "epoch": 0.6317757009345795, "grad_norm": 0.03178941458463669, "learning_rate": 0.0002, "loss": 0.5497304797172546, "mean_token_accuracy": 0.7750105261802673, "num_tokens": 2755355.0, "step": 169 }, { "entropy": 0.5742185562849045, "epoch": 0.6355140186915887, "grad_norm": 0.027454091235995293, "learning_rate": 0.0002, "loss": 0.5754401683807373, "mean_token_accuracy": 0.7658552527427673, "num_tokens": 2771556.0, "step": 170 }, { "entropy": 0.5589788407087326, "epoch": 0.6392523364485981, "grad_norm": 0.029149651527404785, "learning_rate": 0.0002, "loss": 0.554992139339447, "mean_token_accuracy": 0.7758396863937378, "num_tokens": 2787760.0, "step": 171 }, { "entropy": 0.5677189081907272, "epoch": 0.6429906542056075, "grad_norm": 0.03037264011800289, "learning_rate": 0.0002, "loss": 0.5637961626052856, "mean_token_accuracy": 0.7705356478691101, "num_tokens": 2803802.0, "step": 172 }, { "entropy": 0.5565283447504044, "epoch": 0.6467289719626168, "grad_norm": 0.03331301361322403, "learning_rate": 0.0002, "loss": 0.5568943023681641, "mean_token_accuracy": 0.77414271235466, "num_tokens": 2820371.0, "step": 173 }, { "entropy": 0.5312813073396683, "epoch": 0.6504672897196262, "grad_norm": 0.03152315691113472, "learning_rate": 0.0002, "loss": 0.5355879664421082, "mean_token_accuracy": 0.785700336098671, "num_tokens": 2836694.0, "step": 174 }, { "entropy": 0.5379063338041306, "epoch": 0.6542056074766355, "grad_norm": 0.037841469049453735, "learning_rate": 0.0002, "loss": 0.5525423288345337, "mean_token_accuracy": 0.7756439745426178, "num_tokens": 2852864.0, "step": 175 }, { "entropy": 0.5613906681537628, "epoch": 0.6579439252336449, "grad_norm": 0.035853054374456406, "learning_rate": 0.0002, "loss": 0.5655968189239502, "mean_token_accuracy": 0.7716417163610458, "num_tokens": 2869313.0, "step": 176 }, { "entropy": 0.5639201551675797, "epoch": 0.6616822429906543, "grad_norm": 0.026397736743092537, "learning_rate": 0.0002, "loss": 0.5627295970916748, "mean_token_accuracy": 0.7704634070396423, "num_tokens": 2885495.0, "step": 177 }, { "entropy": 0.5702281445264816, "epoch": 0.6654205607476635, "grad_norm": 0.03206147998571396, "learning_rate": 0.0002, "loss": 0.5647550821304321, "mean_token_accuracy": 0.7702795714139938, "num_tokens": 2901765.0, "step": 178 }, { "entropy": 0.5528819262981415, "epoch": 0.6691588785046729, "grad_norm": 0.03629858419299126, "learning_rate": 0.0002, "loss": 0.5473611950874329, "mean_token_accuracy": 0.7778798639774323, "num_tokens": 2918124.0, "step": 179 }, { "entropy": 0.5617557764053345, "epoch": 0.6728971962616822, "grad_norm": 0.03116736188530922, "learning_rate": 0.0002, "loss": 0.5709046721458435, "mean_token_accuracy": 0.7677187621593475, "num_tokens": 2934418.0, "step": 180 }, { "entropy": 0.544835090637207, "epoch": 0.6766355140186916, "grad_norm": 0.03548549860715866, "learning_rate": 0.0002, "loss": 0.5551706552505493, "mean_token_accuracy": 0.7762557417154312, "num_tokens": 2951100.0, "step": 181 }, { "entropy": 0.5660403668880463, "epoch": 0.680373831775701, "grad_norm": 0.03100365214049816, "learning_rate": 0.0002, "loss": 0.5729965567588806, "mean_token_accuracy": 0.7690318375825882, "num_tokens": 2967440.0, "step": 182 }, { "entropy": 0.5780525356531143, "epoch": 0.6841121495327103, "grad_norm": 0.03490225970745087, "learning_rate": 0.0002, "loss": 0.5734487771987915, "mean_token_accuracy": 0.7699766159057617, "num_tokens": 2983954.0, "step": 183 }, { "entropy": 0.5722559094429016, "epoch": 0.6878504672897197, "grad_norm": 0.031209329143166542, "learning_rate": 0.0002, "loss": 0.5663836002349854, "mean_token_accuracy": 0.7720828950405121, "num_tokens": 3000256.0, "step": 184 }, { "entropy": 0.5506948530673981, "epoch": 0.6915887850467289, "grad_norm": 0.029818221926689148, "learning_rate": 0.0002, "loss": 0.5445064306259155, "mean_token_accuracy": 0.7804610878229141, "num_tokens": 3016740.0, "step": 185 }, { "entropy": 0.5661566108465195, "epoch": 0.6953271028037383, "grad_norm": 0.03627892956137657, "learning_rate": 0.0002, "loss": 0.5731881260871887, "mean_token_accuracy": 0.7681418061256409, "num_tokens": 3033200.0, "step": 186 }, { "entropy": 0.5561655461788177, "epoch": 0.6990654205607477, "grad_norm": 0.028912672773003578, "learning_rate": 0.0002, "loss": 0.559117317199707, "mean_token_accuracy": 0.7737248986959457, "num_tokens": 3049728.0, "step": 187 }, { "entropy": 0.5450099408626556, "epoch": 0.702803738317757, "grad_norm": 0.03303583338856697, "learning_rate": 0.0002, "loss": 0.5467768907546997, "mean_token_accuracy": 0.7775131165981293, "num_tokens": 3066007.0, "step": 188 }, { "entropy": 0.5617918968200684, "epoch": 0.7065420560747664, "grad_norm": 0.035768017172813416, "learning_rate": 0.0002, "loss": 0.563019871711731, "mean_token_accuracy": 0.770862489938736, "num_tokens": 3082324.0, "step": 189 }, { "entropy": 0.5339331775903702, "epoch": 0.7102803738317757, "grad_norm": 0.031208420172333717, "learning_rate": 0.0002, "loss": 0.547924280166626, "mean_token_accuracy": 0.7771021723747253, "num_tokens": 3098546.0, "step": 190 }, { "entropy": 0.5686406493186951, "epoch": 0.7140186915887851, "grad_norm": 0.028388923034071922, "learning_rate": 0.0002, "loss": 0.5657324194908142, "mean_token_accuracy": 0.772287517786026, "num_tokens": 3114868.0, "step": 191 }, { "entropy": 0.5583553314208984, "epoch": 0.7177570093457943, "grad_norm": 0.027447570115327835, "learning_rate": 0.0002, "loss": 0.5535703897476196, "mean_token_accuracy": 0.7759178727865219, "num_tokens": 3131210.0, "step": 192 }, { "entropy": 0.5578874051570892, "epoch": 0.7214953271028037, "grad_norm": 0.033130839467048645, "learning_rate": 0.0002, "loss": 0.5513507723808289, "mean_token_accuracy": 0.7747978419065475, "num_tokens": 3147445.0, "step": 193 }, { "entropy": 0.5491522252559662, "epoch": 0.7252336448598131, "grad_norm": 0.030513031408190727, "learning_rate": 0.0002, "loss": 0.5503372550010681, "mean_token_accuracy": 0.7780584990978241, "num_tokens": 3163723.0, "step": 194 }, { "entropy": 0.5677588433027267, "epoch": 0.7289719626168224, "grad_norm": 0.030064091086387634, "learning_rate": 0.0002, "loss": 0.5684211850166321, "mean_token_accuracy": 0.7694611251354218, "num_tokens": 3180127.0, "step": 195 }, { "entropy": 0.5523021966218948, "epoch": 0.7327102803738318, "grad_norm": 0.028454501181840897, "learning_rate": 0.0002, "loss": 0.5564773082733154, "mean_token_accuracy": 0.7736252546310425, "num_tokens": 3196384.0, "step": 196 }, { "entropy": 0.5594403147697449, "epoch": 0.7364485981308411, "grad_norm": 0.031159594655036926, "learning_rate": 0.0002, "loss": 0.5678831934928894, "mean_token_accuracy": 0.7687141001224518, "num_tokens": 3212579.0, "step": 197 }, { "entropy": 0.5670231431722641, "epoch": 0.7401869158878505, "grad_norm": 0.026576390489935875, "learning_rate": 0.0002, "loss": 0.5695415735244751, "mean_token_accuracy": 0.7709443867206573, "num_tokens": 3229005.0, "step": 198 }, { "entropy": 0.5550480484962463, "epoch": 0.7439252336448599, "grad_norm": 0.030606523156166077, "learning_rate": 0.0002, "loss": 0.5502464771270752, "mean_token_accuracy": 0.7791616022586823, "num_tokens": 3245287.0, "step": 199 }, { "entropy": 0.5619281828403473, "epoch": 0.7476635514018691, "grad_norm": 0.030474133789539337, "learning_rate": 0.0002, "loss": 0.5586714148521423, "mean_token_accuracy": 0.7734764218330383, "num_tokens": 3261691.0, "step": 200 }, { "entropy": 0.5405223369598389, "epoch": 0.7514018691588785, "grad_norm": 0.032003577798604965, "learning_rate": 0.0002, "loss": 0.5496760010719299, "mean_token_accuracy": 0.7761346995830536, "num_tokens": 3277743.0, "step": 201 }, { "entropy": 0.5539799779653549, "epoch": 0.7551401869158878, "grad_norm": 0.026676569133996964, "learning_rate": 0.0002, "loss": 0.5552941560745239, "mean_token_accuracy": 0.7729017436504364, "num_tokens": 3293921.0, "step": 202 }, { "entropy": 0.5504231303930283, "epoch": 0.7588785046728972, "grad_norm": 0.02650677040219307, "learning_rate": 0.0002, "loss": 0.5463041663169861, "mean_token_accuracy": 0.7773067653179169, "num_tokens": 3310038.0, "step": 203 }, { "entropy": 0.5567349493503571, "epoch": 0.7626168224299066, "grad_norm": 0.028487270697951317, "learning_rate": 0.0002, "loss": 0.5550260543823242, "mean_token_accuracy": 0.7747003883123398, "num_tokens": 3326542.0, "step": 204 }, { "entropy": 0.5515165776014328, "epoch": 0.7663551401869159, "grad_norm": 0.02944660186767578, "learning_rate": 0.0002, "loss": 0.5483176708221436, "mean_token_accuracy": 0.7772196680307388, "num_tokens": 3342960.0, "step": 205 }, { "entropy": 0.5516369044780731, "epoch": 0.7700934579439253, "grad_norm": 0.02446347288787365, "learning_rate": 0.0002, "loss": 0.5510342121124268, "mean_token_accuracy": 0.7753156870603561, "num_tokens": 3359361.0, "step": 206 }, { "entropy": 0.562598317861557, "epoch": 0.7738317757009345, "grad_norm": 0.032002996653318405, "learning_rate": 0.0002, "loss": 0.5551044344902039, "mean_token_accuracy": 0.7748953849077225, "num_tokens": 3375695.0, "step": 207 }, { "entropy": 0.5636338144540787, "epoch": 0.7775700934579439, "grad_norm": 0.032179221510887146, "learning_rate": 0.0002, "loss": 0.564883291721344, "mean_token_accuracy": 0.7722733914852142, "num_tokens": 3391711.0, "step": 208 }, { "entropy": 0.5475672632455826, "epoch": 0.7813084112149533, "grad_norm": 0.03206668421626091, "learning_rate": 0.0002, "loss": 0.5551382899284363, "mean_token_accuracy": 0.7726904302835464, "num_tokens": 3407951.0, "step": 209 }, { "entropy": 0.540259450674057, "epoch": 0.7850467289719626, "grad_norm": 0.02936564013361931, "learning_rate": 0.0002, "loss": 0.5508178472518921, "mean_token_accuracy": 0.7771763801574707, "num_tokens": 3424278.0, "step": 210 }, { "entropy": 0.5564334988594055, "epoch": 0.788785046728972, "grad_norm": 0.03052506223320961, "learning_rate": 0.0002, "loss": 0.5652161240577698, "mean_token_accuracy": 0.770373746752739, "num_tokens": 3440796.0, "step": 211 }, { "entropy": 0.5524326264858246, "epoch": 0.7925233644859813, "grad_norm": 0.025716882199048996, "learning_rate": 0.0002, "loss": 0.5483862161636353, "mean_token_accuracy": 0.778383657336235, "num_tokens": 3457162.0, "step": 212 }, { "entropy": 0.5574807077646255, "epoch": 0.7962616822429907, "grad_norm": 0.026924515143036842, "learning_rate": 0.0002, "loss": 0.5535562634468079, "mean_token_accuracy": 0.7756220400333405, "num_tokens": 3473707.0, "step": 213 }, { "entropy": 0.558317020535469, "epoch": 0.8, "grad_norm": 0.025764374062418938, "learning_rate": 0.0002, "loss": 0.560704231262207, "mean_token_accuracy": 0.7712857127189636, "num_tokens": 3490125.0, "step": 214 }, { "entropy": 0.5554333925247192, "epoch": 0.8037383177570093, "grad_norm": 0.028298519551753998, "learning_rate": 0.0002, "loss": 0.5522173643112183, "mean_token_accuracy": 0.7743871361017227, "num_tokens": 3506505.0, "step": 215 }, { "entropy": 0.5587067157030106, "epoch": 0.8074766355140187, "grad_norm": 0.02431626431643963, "learning_rate": 0.0002, "loss": 0.5544553995132446, "mean_token_accuracy": 0.7743324339389801, "num_tokens": 3522958.0, "step": 216 }, { "entropy": 0.5645765364170074, "epoch": 0.811214953271028, "grad_norm": 0.02611798420548439, "learning_rate": 0.0002, "loss": 0.5644361972808838, "mean_token_accuracy": 0.7711465507745743, "num_tokens": 3539490.0, "step": 217 }, { "entropy": 0.5525356978178024, "epoch": 0.8149532710280374, "grad_norm": 0.03383297845721245, "learning_rate": 0.0002, "loss": 0.5598211884498596, "mean_token_accuracy": 0.7742004096508026, "num_tokens": 3555746.0, "step": 218 }, { "entropy": 0.5621150583028793, "epoch": 0.8186915887850468, "grad_norm": 0.030269736424088478, "learning_rate": 0.0002, "loss": 0.5634778738021851, "mean_token_accuracy": 0.7692747861146927, "num_tokens": 3572256.0, "step": 219 }, { "entropy": 0.5514157265424728, "epoch": 0.822429906542056, "grad_norm": 0.028750412166118622, "learning_rate": 0.0002, "loss": 0.5467870831489563, "mean_token_accuracy": 0.7769519984722137, "num_tokens": 3588550.0, "step": 220 }, { "entropy": 0.5368104577064514, "epoch": 0.8261682242990654, "grad_norm": 0.03091045655310154, "learning_rate": 0.0002, "loss": 0.5372405648231506, "mean_token_accuracy": 0.7840253859758377, "num_tokens": 3604659.0, "step": 221 }, { "entropy": 0.5409716814756393, "epoch": 0.8299065420560747, "grad_norm": 0.03386515751481056, "learning_rate": 0.0002, "loss": 0.548212468624115, "mean_token_accuracy": 0.7736510932445526, "num_tokens": 3620843.0, "step": 222 }, { "entropy": 0.5629084706306458, "epoch": 0.8336448598130841, "grad_norm": 0.040728501975536346, "learning_rate": 0.0002, "loss": 0.5746021270751953, "mean_token_accuracy": 0.7647373080253601, "num_tokens": 3637324.0, "step": 223 }, { "entropy": 0.5369234085083008, "epoch": 0.8373831775700935, "grad_norm": 0.029392162337899208, "learning_rate": 0.0002, "loss": 0.5397970080375671, "mean_token_accuracy": 0.7819121479988098, "num_tokens": 3653633.0, "step": 224 }, { "entropy": 0.5768532902002335, "epoch": 0.8411214953271028, "grad_norm": 0.033986181020736694, "learning_rate": 0.0002, "loss": 0.5701450109481812, "mean_token_accuracy": 0.7669256031513214, "num_tokens": 3670158.0, "step": 225 }, { "entropy": 0.5465534925460815, "epoch": 0.8448598130841122, "grad_norm": 0.034689608961343765, "learning_rate": 0.0002, "loss": 0.539010226726532, "mean_token_accuracy": 0.7829751968383789, "num_tokens": 3686415.0, "step": 226 }, { "entropy": 0.5669656842947006, "epoch": 0.8485981308411215, "grad_norm": 0.029157601296901703, "learning_rate": 0.0002, "loss": 0.5645594596862793, "mean_token_accuracy": 0.7721282690763474, "num_tokens": 3702620.0, "step": 227 }, { "entropy": 0.5713803917169571, "epoch": 0.8523364485981308, "grad_norm": 0.032975275069475174, "learning_rate": 0.0002, "loss": 0.5758609771728516, "mean_token_accuracy": 0.7657817453145981, "num_tokens": 3719219.0, "step": 228 }, { "entropy": 0.5463247001171112, "epoch": 0.8560747663551402, "grad_norm": 0.039444658905267715, "learning_rate": 0.0002, "loss": 0.5534209609031677, "mean_token_accuracy": 0.7726487815380096, "num_tokens": 3735438.0, "step": 229 }, { "entropy": 0.556586429476738, "epoch": 0.8598130841121495, "grad_norm": 0.02616702765226364, "learning_rate": 0.0002, "loss": 0.5549170970916748, "mean_token_accuracy": 0.7752689123153687, "num_tokens": 3751785.0, "step": 230 }, { "entropy": 0.5389135032892227, "epoch": 0.8635514018691589, "grad_norm": 0.03276278078556061, "learning_rate": 0.0002, "loss": 0.5399537086486816, "mean_token_accuracy": 0.781702533364296, "num_tokens": 3767826.0, "step": 231 }, { "entropy": 0.5364359021186829, "epoch": 0.8672897196261682, "grad_norm": 0.026118800044059753, "learning_rate": 0.0002, "loss": 0.5382952094078064, "mean_token_accuracy": 0.780514121055603, "num_tokens": 3783919.0, "step": 232 }, { "entropy": 0.5687360912561417, "epoch": 0.8710280373831776, "grad_norm": 0.03209976479411125, "learning_rate": 0.0002, "loss": 0.5756676197052002, "mean_token_accuracy": 0.7664439678192139, "num_tokens": 3800454.0, "step": 233 }, { "entropy": 0.5679410099983215, "epoch": 0.874766355140187, "grad_norm": 0.025931114330887794, "learning_rate": 0.0002, "loss": 0.5656247138977051, "mean_token_accuracy": 0.7693636864423752, "num_tokens": 3816747.0, "step": 234 }, { "entropy": 0.557420089840889, "epoch": 0.8785046728971962, "grad_norm": 0.02894972637295723, "learning_rate": 0.0002, "loss": 0.5490383505821228, "mean_token_accuracy": 0.7750599384307861, "num_tokens": 3833058.0, "step": 235 }, { "entropy": 0.560372844338417, "epoch": 0.8822429906542056, "grad_norm": 0.03646957501769066, "learning_rate": 0.0002, "loss": 0.5596282482147217, "mean_token_accuracy": 0.7726272940635681, "num_tokens": 3849415.0, "step": 236 }, { "entropy": 0.5550010055303574, "epoch": 0.8859813084112149, "grad_norm": 0.026594942435622215, "learning_rate": 0.0002, "loss": 0.5539083480834961, "mean_token_accuracy": 0.7734427750110626, "num_tokens": 3865776.0, "step": 237 }, { "entropy": 0.5347648710012436, "epoch": 0.8897196261682243, "grad_norm": 0.03385410085320473, "learning_rate": 0.0002, "loss": 0.5472573041915894, "mean_token_accuracy": 0.7766564786434174, "num_tokens": 3882018.0, "step": 238 }, { "entropy": 0.5376404300332069, "epoch": 0.8934579439252337, "grad_norm": 0.040597062557935715, "learning_rate": 0.0002, "loss": 0.5544540286064148, "mean_token_accuracy": 0.7728734314441681, "num_tokens": 3898287.0, "step": 239 }, { "entropy": 0.5667798519134521, "epoch": 0.897196261682243, "grad_norm": 0.027665674686431885, "learning_rate": 0.0002, "loss": 0.5663026571273804, "mean_token_accuracy": 0.770405575633049, "num_tokens": 3914775.0, "step": 240 }, { "entropy": 0.550272524356842, "epoch": 0.9009345794392524, "grad_norm": 0.029484877362847328, "learning_rate": 0.0002, "loss": 0.5427078008651733, "mean_token_accuracy": 0.7818168848752975, "num_tokens": 3930889.0, "step": 241 }, { "entropy": 0.5710694193840027, "epoch": 0.9046728971962616, "grad_norm": 0.027631685137748718, "learning_rate": 0.0002, "loss": 0.561673641204834, "mean_token_accuracy": 0.7728846818208694, "num_tokens": 3947233.0, "step": 242 }, { "entropy": 0.5513755828142166, "epoch": 0.908411214953271, "grad_norm": 0.030272630974650383, "learning_rate": 0.0002, "loss": 0.5467454195022583, "mean_token_accuracy": 0.7779553532600403, "num_tokens": 3963468.0, "step": 243 }, { "entropy": 0.5469895005226135, "epoch": 0.9121495327102803, "grad_norm": 0.03090892918407917, "learning_rate": 0.0002, "loss": 0.5560286045074463, "mean_token_accuracy": 0.7723891735076904, "num_tokens": 3979910.0, "step": 244 }, { "entropy": 0.5544413626194, "epoch": 0.9158878504672897, "grad_norm": 0.041499219834804535, "learning_rate": 0.0002, "loss": 0.5768874883651733, "mean_token_accuracy": 0.7659346610307693, "num_tokens": 3996196.0, "step": 245 }, { "entropy": 0.5447600036859512, "epoch": 0.9196261682242991, "grad_norm": 0.03076878748834133, "learning_rate": 0.0002, "loss": 0.5456743836402893, "mean_token_accuracy": 0.7770105451345444, "num_tokens": 4012511.0, "step": 246 }, { "entropy": 0.5538895577192307, "epoch": 0.9233644859813084, "grad_norm": 0.03173721581697464, "learning_rate": 0.0002, "loss": 0.5483969449996948, "mean_token_accuracy": 0.7781166434288025, "num_tokens": 4028651.0, "step": 247 }, { "entropy": 0.5794132798910141, "epoch": 0.9271028037383178, "grad_norm": 0.0297909714281559, "learning_rate": 0.0002, "loss": 0.5648066401481628, "mean_token_accuracy": 0.7718619257211685, "num_tokens": 4045251.0, "step": 248 }, { "entropy": 0.5547907501459122, "epoch": 0.930841121495327, "grad_norm": 0.03679649531841278, "learning_rate": 0.0002, "loss": 0.5462634563446045, "mean_token_accuracy": 0.7801699191331863, "num_tokens": 4061348.0, "step": 249 }, { "entropy": 0.5539078116416931, "epoch": 0.9345794392523364, "grad_norm": 0.02851703390479088, "learning_rate": 0.0002, "loss": 0.5593677163124084, "mean_token_accuracy": 0.7756806910037994, "num_tokens": 4077453.0, "step": 250 }, { "entropy": 0.5443865954875946, "epoch": 0.9383177570093458, "grad_norm": 0.030135581269860268, "learning_rate": 0.0002, "loss": 0.5505210161209106, "mean_token_accuracy": 0.7767539322376251, "num_tokens": 4093944.0, "step": 251 }, { "entropy": 0.5541698932647705, "epoch": 0.9420560747663551, "grad_norm": 0.03800193592905998, "learning_rate": 0.0002, "loss": 0.5603746175765991, "mean_token_accuracy": 0.7716375887393951, "num_tokens": 4110397.0, "step": 252 }, { "entropy": 0.5497024953365326, "epoch": 0.9457943925233645, "grad_norm": 0.030841615051031113, "learning_rate": 0.0002, "loss": 0.5577483177185059, "mean_token_accuracy": 0.776105210185051, "num_tokens": 4126788.0, "step": 253 }, { "entropy": 0.5452855974435806, "epoch": 0.9495327102803738, "grad_norm": 0.027110353112220764, "learning_rate": 0.0002, "loss": 0.5468145608901978, "mean_token_accuracy": 0.7746452689170837, "num_tokens": 4143252.0, "step": 254 }, { "entropy": 0.5483012199401855, "epoch": 0.9532710280373832, "grad_norm": 0.02763090282678604, "learning_rate": 0.0002, "loss": 0.542940616607666, "mean_token_accuracy": 0.7776369601488113, "num_tokens": 4159556.0, "step": 255 }, { "entropy": 0.5598485320806503, "epoch": 0.9570093457943926, "grad_norm": 0.02750120870769024, "learning_rate": 0.0002, "loss": 0.5518869161605835, "mean_token_accuracy": 0.7762151658535004, "num_tokens": 4175947.0, "step": 256 }, { "entropy": 0.5783872008323669, "epoch": 0.9607476635514018, "grad_norm": 0.03151006996631622, "learning_rate": 0.0002, "loss": 0.5734107494354248, "mean_token_accuracy": 0.7695904821157455, "num_tokens": 4192348.0, "step": 257 }, { "entropy": 0.5653168857097626, "epoch": 0.9644859813084112, "grad_norm": 0.03166348114609718, "learning_rate": 0.0002, "loss": 0.5732910633087158, "mean_token_accuracy": 0.7679464519023895, "num_tokens": 4208898.0, "step": 258 }, { "entropy": 0.5390284806489944, "epoch": 0.9682242990654205, "grad_norm": 0.026950784027576447, "learning_rate": 0.0002, "loss": 0.5455009937286377, "mean_token_accuracy": 0.7775461375713348, "num_tokens": 4225149.0, "step": 259 }, { "entropy": 0.565416008234024, "epoch": 0.9719626168224299, "grad_norm": 0.030768675729632378, "learning_rate": 0.0002, "loss": 0.5689860582351685, "mean_token_accuracy": 0.7684348970651627, "num_tokens": 4241389.0, "step": 260 }, { "entropy": 0.5577588826417923, "epoch": 0.9757009345794393, "grad_norm": 0.02680326998233795, "learning_rate": 0.0002, "loss": 0.5625928640365601, "mean_token_accuracy": 0.7695075571537018, "num_tokens": 4257979.0, "step": 261 }, { "entropy": 0.55104960501194, "epoch": 0.9794392523364486, "grad_norm": 0.027646353468298912, "learning_rate": 0.0002, "loss": 0.5484559535980225, "mean_token_accuracy": 0.7766857743263245, "num_tokens": 4274290.0, "step": 262 }, { "entropy": 0.5638265609741211, "epoch": 0.983177570093458, "grad_norm": 0.02871805429458618, "learning_rate": 0.0002, "loss": 0.5657901167869568, "mean_token_accuracy": 0.7715673297643661, "num_tokens": 4290725.0, "step": 263 }, { "entropy": 0.547324076294899, "epoch": 0.9869158878504672, "grad_norm": 0.02937854453921318, "learning_rate": 0.0002, "loss": 0.55534827709198, "mean_token_accuracy": 0.7751762270927429, "num_tokens": 4307326.0, "step": 264 }, { "entropy": 0.5487106442451477, "epoch": 0.9906542056074766, "grad_norm": 0.02548016607761383, "learning_rate": 0.0002, "loss": 0.5505661964416504, "mean_token_accuracy": 0.7752106785774231, "num_tokens": 4323823.0, "step": 265 }, { "entropy": 0.5634673833847046, "epoch": 0.994392523364486, "grad_norm": 0.026015356183052063, "learning_rate": 0.0002, "loss": 0.5634418725967407, "mean_token_accuracy": 0.7709382921457291, "num_tokens": 4340138.0, "step": 266 }, { "entropy": 0.5507746189832687, "epoch": 0.9981308411214953, "grad_norm": 0.026798918843269348, "learning_rate": 0.0002, "loss": 0.5513297915458679, "mean_token_accuracy": 0.7769380956888199, "num_tokens": 4356482.0, "step": 267 }, { "entropy": 0.5597052276134491, "epoch": 1.0, "grad_norm": 0.0342809222638607, "learning_rate": 0.0002, "loss": 0.5571821331977844, "mean_token_accuracy": 0.774641364812851, "num_tokens": 4364744.0, "step": 268 }, { "entropy": 0.557921290397644, "epoch": 1.0037383177570094, "grad_norm": 0.029891351237893105, "learning_rate": 0.0002, "loss": 0.5539438128471375, "mean_token_accuracy": 0.7773818224668503, "num_tokens": 4380930.0, "step": 269 }, { "entropy": 0.5416439026594162, "epoch": 1.0074766355140188, "grad_norm": 0.02803446725010872, "learning_rate": 0.0002, "loss": 0.5438423752784729, "mean_token_accuracy": 0.7798180431127548, "num_tokens": 4397244.0, "step": 270 }, { "entropy": 0.5285164415836334, "epoch": 1.011214953271028, "grad_norm": 0.03023347444832325, "learning_rate": 0.0002, "loss": 0.5358922481536865, "mean_token_accuracy": 0.7807245850563049, "num_tokens": 4413671.0, "step": 271 }, { "entropy": 0.5514080822467804, "epoch": 1.0149532710280373, "grad_norm": 0.027458516880869865, "learning_rate": 0.0002, "loss": 0.552421510219574, "mean_token_accuracy": 0.7761755585670471, "num_tokens": 4430035.0, "step": 272 }, { "entropy": 0.5706226229667664, "epoch": 1.0186915887850467, "grad_norm": 0.030846886336803436, "learning_rate": 0.0002, "loss": 0.5667564272880554, "mean_token_accuracy": 0.7689130008220673, "num_tokens": 4446382.0, "step": 273 }, { "entropy": 0.5511225461959839, "epoch": 1.0224299065420561, "grad_norm": 0.029439929872751236, "learning_rate": 0.0002, "loss": 0.5465920567512512, "mean_token_accuracy": 0.7808292508125305, "num_tokens": 4462677.0, "step": 274 }, { "entropy": 0.5416547358036041, "epoch": 1.0261682242990655, "grad_norm": 0.02822115644812584, "learning_rate": 0.0002, "loss": 0.5419396758079529, "mean_token_accuracy": 0.7816834002733231, "num_tokens": 4479083.0, "step": 275 }, { "entropy": 0.5574266612529755, "epoch": 1.0299065420560747, "grad_norm": 0.0327095128595829, "learning_rate": 0.0002, "loss": 0.5565608739852905, "mean_token_accuracy": 0.7745349258184433, "num_tokens": 4495797.0, "step": 276 }, { "entropy": 0.5387104451656342, "epoch": 1.033644859813084, "grad_norm": 0.03164896368980408, "learning_rate": 0.0002, "loss": 0.5406032800674438, "mean_token_accuracy": 0.7823146730661392, "num_tokens": 4512262.0, "step": 277 }, { "entropy": 0.5471370071172714, "epoch": 1.0373831775700935, "grad_norm": 0.03483380377292633, "learning_rate": 0.0002, "loss": 0.5550093054771423, "mean_token_accuracy": 0.7783246338367462, "num_tokens": 4528616.0, "step": 278 }, { "entropy": 0.5368807017803192, "epoch": 1.0411214953271029, "grad_norm": 0.03120633400976658, "learning_rate": 0.0002, "loss": 0.5417410731315613, "mean_token_accuracy": 0.7802102267742157, "num_tokens": 4544882.0, "step": 279 }, { "entropy": 0.5481929332017899, "epoch": 1.0448598130841122, "grad_norm": 0.029517389833927155, "learning_rate": 0.0002, "loss": 0.5472978353500366, "mean_token_accuracy": 0.7788140177726746, "num_tokens": 4561427.0, "step": 280 }, { "entropy": 0.5531918853521347, "epoch": 1.0485981308411214, "grad_norm": 0.03256995975971222, "learning_rate": 0.0002, "loss": 0.5502868890762329, "mean_token_accuracy": 0.7784827798604965, "num_tokens": 4577723.0, "step": 281 }, { "entropy": 0.5540415197610855, "epoch": 1.0523364485981308, "grad_norm": 0.026578353717923164, "learning_rate": 0.0002, "loss": 0.555966854095459, "mean_token_accuracy": 0.775706946849823, "num_tokens": 4594128.0, "step": 282 }, { "entropy": 0.5517027229070663, "epoch": 1.0560747663551402, "grad_norm": 0.030103787779808044, "learning_rate": 0.0002, "loss": 0.5502108931541443, "mean_token_accuracy": 0.7753856778144836, "num_tokens": 4610255.0, "step": 283 }, { "entropy": 0.5304621160030365, "epoch": 1.0598130841121496, "grad_norm": 0.029368899762630463, "learning_rate": 0.0002, "loss": 0.5297666788101196, "mean_token_accuracy": 0.7840214222669601, "num_tokens": 4626599.0, "step": 284 }, { "entropy": 0.5305260270833969, "epoch": 1.063551401869159, "grad_norm": 0.029124870896339417, "learning_rate": 0.0002, "loss": 0.5363407135009766, "mean_token_accuracy": 0.7847000658512115, "num_tokens": 4642927.0, "step": 285 }, { "entropy": 0.5300263911485672, "epoch": 1.0672897196261681, "grad_norm": 0.028800450265407562, "learning_rate": 0.0002, "loss": 0.52923583984375, "mean_token_accuracy": 0.7828178703784943, "num_tokens": 4659455.0, "step": 286 }, { "entropy": 0.5497115254402161, "epoch": 1.0710280373831775, "grad_norm": 0.03032800555229187, "learning_rate": 0.0002, "loss": 0.5526697039604187, "mean_token_accuracy": 0.7718490660190582, "num_tokens": 4675747.0, "step": 287 }, { "entropy": 0.5266695320606232, "epoch": 1.074766355140187, "grad_norm": 0.02653171867132187, "learning_rate": 0.0002, "loss": 0.5255345702171326, "mean_token_accuracy": 0.7853638082742691, "num_tokens": 4691992.0, "step": 288 }, { "entropy": 0.5461495667695999, "epoch": 1.0785046728971963, "grad_norm": 0.025956284254789352, "learning_rate": 0.0002, "loss": 0.5439239740371704, "mean_token_accuracy": 0.7808811217546463, "num_tokens": 4708487.0, "step": 289 }, { "entropy": 0.5421788841485977, "epoch": 1.0822429906542057, "grad_norm": 0.02735847234725952, "learning_rate": 0.0002, "loss": 0.5411931872367859, "mean_token_accuracy": 0.7771425247192383, "num_tokens": 4724824.0, "step": 290 }, { "entropy": 0.5556438118219376, "epoch": 1.0859813084112149, "grad_norm": 0.026816118508577347, "learning_rate": 0.0002, "loss": 0.5484311580657959, "mean_token_accuracy": 0.7775956392288208, "num_tokens": 4741264.0, "step": 291 }, { "entropy": 0.5614602714776993, "epoch": 1.0897196261682243, "grad_norm": 0.03428835794329643, "learning_rate": 0.0002, "loss": 0.5635286569595337, "mean_token_accuracy": 0.7734779864549637, "num_tokens": 4757630.0, "step": 292 }, { "entropy": 0.5510146170854568, "epoch": 1.0934579439252337, "grad_norm": 0.030845943838357925, "learning_rate": 0.0002, "loss": 0.5562302470207214, "mean_token_accuracy": 0.773259237408638, "num_tokens": 4773723.0, "step": 293 }, { "entropy": 0.5555125325918198, "epoch": 1.097196261682243, "grad_norm": 0.028586354106664658, "learning_rate": 0.0002, "loss": 0.5588027834892273, "mean_token_accuracy": 0.7723042815923691, "num_tokens": 4790204.0, "step": 294 }, { "entropy": 0.53548863530159, "epoch": 1.1009345794392524, "grad_norm": 0.032421719282865524, "learning_rate": 0.0002, "loss": 0.5428792238235474, "mean_token_accuracy": 0.780792623758316, "num_tokens": 4806715.0, "step": 295 }, { "entropy": 0.5266362577676773, "epoch": 1.1046728971962616, "grad_norm": 0.044794633984565735, "learning_rate": 0.0002, "loss": 0.5296044945716858, "mean_token_accuracy": 0.7850557416677475, "num_tokens": 4822693.0, "step": 296 }, { "entropy": 0.547786682844162, "epoch": 1.108411214953271, "grad_norm": 0.03065192885696888, "learning_rate": 0.0002, "loss": 0.545957088470459, "mean_token_accuracy": 0.7773084342479706, "num_tokens": 4838834.0, "step": 297 }, { "entropy": 0.5526397377252579, "epoch": 1.1121495327102804, "grad_norm": 0.03121815249323845, "learning_rate": 0.0002, "loss": 0.5505586862564087, "mean_token_accuracy": 0.7751570343971252, "num_tokens": 4854891.0, "step": 298 }, { "entropy": 0.556088924407959, "epoch": 1.1158878504672898, "grad_norm": 0.03519770875573158, "learning_rate": 0.0002, "loss": 0.5572479367256165, "mean_token_accuracy": 0.7747550010681152, "num_tokens": 4871140.0, "step": 299 }, { "entropy": 0.5376470685005188, "epoch": 1.1196261682242992, "grad_norm": 0.03193943575024605, "learning_rate": 0.0002, "loss": 0.5455138087272644, "mean_token_accuracy": 0.7797031998634338, "num_tokens": 4887274.0, "step": 300 }, { "entropy": 0.5635453760623932, "epoch": 1.1233644859813083, "grad_norm": 0.041273750364780426, "learning_rate": 0.0002, "loss": 0.5696390867233276, "mean_token_accuracy": 0.76914082467556, "num_tokens": 4903573.0, "step": 301 }, { "entropy": 0.5702975988388062, "epoch": 1.1271028037383177, "grad_norm": 0.03010556660592556, "learning_rate": 0.0002, "loss": 0.5622550845146179, "mean_token_accuracy": 0.7727158814668655, "num_tokens": 4919926.0, "step": 302 }, { "entropy": 0.5415271073579788, "epoch": 1.1308411214953271, "grad_norm": 0.0310966819524765, "learning_rate": 0.0002, "loss": 0.5458844900131226, "mean_token_accuracy": 0.776058241724968, "num_tokens": 4936123.0, "step": 303 }, { "entropy": 0.5403020679950714, "epoch": 1.1345794392523365, "grad_norm": 0.04535767808556557, "learning_rate": 0.0002, "loss": 0.5387758612632751, "mean_token_accuracy": 0.7784536480903625, "num_tokens": 4952502.0, "step": 304 }, { "entropy": 0.5479062646627426, "epoch": 1.1383177570093457, "grad_norm": 0.028153905645012856, "learning_rate": 0.0002, "loss": 0.5478588938713074, "mean_token_accuracy": 0.7770532369613647, "num_tokens": 4968823.0, "step": 305 }, { "entropy": 0.5423109382390976, "epoch": 1.142056074766355, "grad_norm": 0.03606940805912018, "learning_rate": 0.0002, "loss": 0.5508921146392822, "mean_token_accuracy": 0.7769752442836761, "num_tokens": 4985183.0, "step": 306 }, { "entropy": 0.5484813451766968, "epoch": 1.1457943925233645, "grad_norm": 0.02960861474275589, "learning_rate": 0.0002, "loss": 0.5549089312553406, "mean_token_accuracy": 0.7753880023956299, "num_tokens": 5001335.0, "step": 307 }, { "entropy": 0.5498395711183548, "epoch": 1.1495327102803738, "grad_norm": 0.036366142332553864, "learning_rate": 0.0002, "loss": 0.5471988916397095, "mean_token_accuracy": 0.7787120938301086, "num_tokens": 5017387.0, "step": 308 }, { "entropy": 0.5530393719673157, "epoch": 1.1532710280373832, "grad_norm": 0.029028775170445442, "learning_rate": 0.0002, "loss": 0.5492241978645325, "mean_token_accuracy": 0.7761663198471069, "num_tokens": 5033567.0, "step": 309 }, { "entropy": 0.5492727905511856, "epoch": 1.1570093457943926, "grad_norm": 0.03352445736527443, "learning_rate": 0.0002, "loss": 0.5540640354156494, "mean_token_accuracy": 0.7749823033809662, "num_tokens": 5049801.0, "step": 310 }, { "entropy": 0.5666168481111526, "epoch": 1.1607476635514018, "grad_norm": 0.035840339958667755, "learning_rate": 0.0002, "loss": 0.5706231594085693, "mean_token_accuracy": 0.7669289708137512, "num_tokens": 5066204.0, "step": 311 }, { "entropy": 0.5425457805395126, "epoch": 1.1644859813084112, "grad_norm": 0.03181692957878113, "learning_rate": 0.0002, "loss": 0.5458914041519165, "mean_token_accuracy": 0.7774879634380341, "num_tokens": 5082493.0, "step": 312 }, { "entropy": 0.5557267963886261, "epoch": 1.1682242990654206, "grad_norm": 0.035230670124292374, "learning_rate": 0.0002, "loss": 0.5475496053695679, "mean_token_accuracy": 0.7787989675998688, "num_tokens": 5098639.0, "step": 313 }, { "entropy": 0.5714587569236755, "epoch": 1.17196261682243, "grad_norm": 0.03392059728503227, "learning_rate": 0.0002, "loss": 0.5622156262397766, "mean_token_accuracy": 0.7719752937555313, "num_tokens": 5114831.0, "step": 314 }, { "entropy": 0.5439812690019608, "epoch": 1.1757009345794391, "grad_norm": 0.027537284418940544, "learning_rate": 0.0002, "loss": 0.5427182912826538, "mean_token_accuracy": 0.7786365002393723, "num_tokens": 5131121.0, "step": 315 }, { "entropy": 0.5388712882995605, "epoch": 1.1794392523364485, "grad_norm": 0.03216094896197319, "learning_rate": 0.0002, "loss": 0.5446818470954895, "mean_token_accuracy": 0.7791234254837036, "num_tokens": 5147422.0, "step": 316 }, { "entropy": 0.53206005692482, "epoch": 1.183177570093458, "grad_norm": 0.032054752111434937, "learning_rate": 0.0002, "loss": 0.5439627170562744, "mean_token_accuracy": 0.7801449149847031, "num_tokens": 5163884.0, "step": 317 }, { "entropy": 0.5308776497840881, "epoch": 1.1869158878504673, "grad_norm": 0.032574739307165146, "learning_rate": 0.0002, "loss": 0.5392112731933594, "mean_token_accuracy": 0.777498260140419, "num_tokens": 5180398.0, "step": 318 }, { "entropy": 0.5427455455064774, "epoch": 1.1906542056074767, "grad_norm": 0.03152874857187271, "learning_rate": 0.0002, "loss": 0.5452929139137268, "mean_token_accuracy": 0.7787911593914032, "num_tokens": 5196640.0, "step": 319 }, { "entropy": 0.570340633392334, "epoch": 1.194392523364486, "grad_norm": 0.03098403289914131, "learning_rate": 0.0002, "loss": 0.5688466429710388, "mean_token_accuracy": 0.7672817558050156, "num_tokens": 5212767.0, "step": 320 }, { "entropy": 0.5646504908800125, "epoch": 1.1981308411214953, "grad_norm": 0.032602474093437195, "learning_rate": 0.0002, "loss": 0.5595831274986267, "mean_token_accuracy": 0.7738354504108429, "num_tokens": 5229143.0, "step": 321 }, { "entropy": 0.541440024971962, "epoch": 1.2018691588785047, "grad_norm": 0.0346127450466156, "learning_rate": 0.0002, "loss": 0.5328572988510132, "mean_token_accuracy": 0.7842471748590469, "num_tokens": 5245349.0, "step": 322 }, { "entropy": 0.5371421873569489, "epoch": 1.205607476635514, "grad_norm": 0.030524473637342453, "learning_rate": 0.0002, "loss": 0.5316073894500732, "mean_token_accuracy": 0.7839267402887344, "num_tokens": 5261740.0, "step": 323 }, { "entropy": 0.5501479953527451, "epoch": 1.2093457943925234, "grad_norm": 0.04006117209792137, "learning_rate": 0.0002, "loss": 0.5546258687973022, "mean_token_accuracy": 0.7740581333637238, "num_tokens": 5278402.0, "step": 324 }, { "entropy": 0.5427927225828171, "epoch": 1.2130841121495326, "grad_norm": 0.028997933492064476, "learning_rate": 0.0002, "loss": 0.546272873878479, "mean_token_accuracy": 0.77626071870327, "num_tokens": 5295096.0, "step": 325 }, { "entropy": 0.5374629199504852, "epoch": 1.216822429906542, "grad_norm": 0.031449392437934875, "learning_rate": 0.0002, "loss": 0.5484204292297363, "mean_token_accuracy": 0.7783177495002747, "num_tokens": 5311451.0, "step": 326 }, { "entropy": 0.5593861639499664, "epoch": 1.2205607476635514, "grad_norm": 0.033892612904310226, "learning_rate": 0.0002, "loss": 0.5527151823043823, "mean_token_accuracy": 0.7769543379545212, "num_tokens": 5327705.0, "step": 327 }, { "entropy": 0.5403755158185959, "epoch": 1.2242990654205608, "grad_norm": 0.029873648658394814, "learning_rate": 0.0002, "loss": 0.5416997075080872, "mean_token_accuracy": 0.7783119082450867, "num_tokens": 5344110.0, "step": 328 }, { "entropy": 0.5473423600196838, "epoch": 1.2280373831775702, "grad_norm": 0.028266677632927895, "learning_rate": 0.0002, "loss": 0.5524438619613647, "mean_token_accuracy": 0.7769231647253036, "num_tokens": 5360378.0, "step": 329 }, { "entropy": 0.5364970713853836, "epoch": 1.2317757009345796, "grad_norm": 0.03534099832177162, "learning_rate": 0.0002, "loss": 0.5341481566429138, "mean_token_accuracy": 0.783685103058815, "num_tokens": 5376600.0, "step": 330 }, { "entropy": 0.5472245216369629, "epoch": 1.2355140186915887, "grad_norm": 0.030261849984526634, "learning_rate": 0.0002, "loss": 0.5478684306144714, "mean_token_accuracy": 0.7797873020172119, "num_tokens": 5392761.0, "step": 331 }, { "entropy": 0.545607790350914, "epoch": 1.2392523364485981, "grad_norm": 0.029436452314257622, "learning_rate": 0.0002, "loss": 0.546855628490448, "mean_token_accuracy": 0.7786357402801514, "num_tokens": 5409133.0, "step": 332 }, { "entropy": 0.5291889756917953, "epoch": 1.2429906542056075, "grad_norm": 0.03353505581617355, "learning_rate": 0.0002, "loss": 0.5353861451148987, "mean_token_accuracy": 0.7811570167541504, "num_tokens": 5425384.0, "step": 333 }, { "entropy": 0.5578002631664276, "epoch": 1.246728971962617, "grad_norm": 0.03168244659900665, "learning_rate": 0.0002, "loss": 0.5618013143539429, "mean_token_accuracy": 0.7705619186162949, "num_tokens": 5441708.0, "step": 334 }, { "entropy": 0.555315688252449, "epoch": 1.250467289719626, "grad_norm": 0.03206615522503853, "learning_rate": 0.0002, "loss": 0.5600447654724121, "mean_token_accuracy": 0.7714688628911972, "num_tokens": 5457884.0, "step": 335 }, { "entropy": 0.5601648688316345, "epoch": 1.2542056074766355, "grad_norm": 0.03804044798016548, "learning_rate": 0.0002, "loss": 0.5550276637077332, "mean_token_accuracy": 0.7733457237482071, "num_tokens": 5474231.0, "step": 336 }, { "entropy": 0.542451411485672, "epoch": 1.2579439252336448, "grad_norm": 0.029554393142461777, "learning_rate": 0.0002, "loss": 0.5353547930717468, "mean_token_accuracy": 0.7827602028846741, "num_tokens": 5490557.0, "step": 337 }, { "entropy": 0.5396464318037033, "epoch": 1.2616822429906542, "grad_norm": 0.02930438332259655, "learning_rate": 0.0002, "loss": 0.5352525115013123, "mean_token_accuracy": 0.782452329993248, "num_tokens": 5506827.0, "step": 338 }, { "entropy": 0.551433265209198, "epoch": 1.2654205607476636, "grad_norm": 0.03803868591785431, "learning_rate": 0.0002, "loss": 0.5564743280410767, "mean_token_accuracy": 0.7742451429367065, "num_tokens": 5523197.0, "step": 339 }, { "entropy": 0.5405130237340927, "epoch": 1.269158878504673, "grad_norm": 0.03335575759410858, "learning_rate": 0.0002, "loss": 0.5447483062744141, "mean_token_accuracy": 0.777386024594307, "num_tokens": 5539570.0, "step": 340 }, { "entropy": 0.5281671732664108, "epoch": 1.2728971962616822, "grad_norm": 0.03668655455112457, "learning_rate": 0.0002, "loss": 0.5369662642478943, "mean_token_accuracy": 0.7818697243928909, "num_tokens": 5556018.0, "step": 341 }, { "entropy": 0.5445946455001831, "epoch": 1.2766355140186916, "grad_norm": 0.03418565168976784, "learning_rate": 0.0002, "loss": 0.5481922626495361, "mean_token_accuracy": 0.7817248553037643, "num_tokens": 5571921.0, "step": 342 }, { "entropy": 0.5692614763975143, "epoch": 1.280373831775701, "grad_norm": 0.032861191779375076, "learning_rate": 0.0002, "loss": 0.5536470413208008, "mean_token_accuracy": 0.7768330574035645, "num_tokens": 5588242.0, "step": 343 }, { "entropy": 0.5534744560718536, "epoch": 1.2841121495327104, "grad_norm": 0.02994309738278389, "learning_rate": 0.0002, "loss": 0.5490615367889404, "mean_token_accuracy": 0.7776058167219162, "num_tokens": 5604646.0, "step": 344 }, { "entropy": 0.5477103441953659, "epoch": 1.2878504672897195, "grad_norm": 0.0329648032784462, "learning_rate": 0.0002, "loss": 0.5608856678009033, "mean_token_accuracy": 0.769044816493988, "num_tokens": 5620822.0, "step": 345 }, { "entropy": 0.5447603911161423, "epoch": 1.291588785046729, "grad_norm": 0.038630835711956024, "learning_rate": 0.0002, "loss": 0.5517427921295166, "mean_token_accuracy": 0.776050254702568, "num_tokens": 5637254.0, "step": 346 }, { "entropy": 0.5543326735496521, "epoch": 1.2953271028037383, "grad_norm": 0.03234436735510826, "learning_rate": 0.0002, "loss": 0.5605747103691101, "mean_token_accuracy": 0.7735925763845444, "num_tokens": 5653687.0, "step": 347 }, { "entropy": 0.5351574122905731, "epoch": 1.2990654205607477, "grad_norm": 0.03387833759188652, "learning_rate": 0.0002, "loss": 0.5403937697410583, "mean_token_accuracy": 0.7819892168045044, "num_tokens": 5670152.0, "step": 348 }, { "entropy": 0.5567533820867538, "epoch": 1.302803738317757, "grad_norm": 0.0311372522264719, "learning_rate": 0.0002, "loss": 0.5512552261352539, "mean_token_accuracy": 0.7762364596128464, "num_tokens": 5686422.0, "step": 349 }, { "entropy": 0.5508190989494324, "epoch": 1.3065420560747665, "grad_norm": 0.027689168229699135, "learning_rate": 0.0002, "loss": 0.5455954074859619, "mean_token_accuracy": 0.7787918448448181, "num_tokens": 5702832.0, "step": 350 }, { "entropy": 0.5493623167276382, "epoch": 1.3102803738317756, "grad_norm": 0.03188028931617737, "learning_rate": 0.0002, "loss": 0.5508118867874146, "mean_token_accuracy": 0.7741293609142303, "num_tokens": 5719201.0, "step": 351 }, { "entropy": 0.5517994910478592, "epoch": 1.314018691588785, "grad_norm": 0.03255178779363632, "learning_rate": 0.0002, "loss": 0.5581218004226685, "mean_token_accuracy": 0.7717841118574142, "num_tokens": 5735507.0, "step": 352 }, { "entropy": 0.5363009721040726, "epoch": 1.3177570093457944, "grad_norm": 0.0318707600235939, "learning_rate": 0.0002, "loss": 0.5422943234443665, "mean_token_accuracy": 0.7783725261688232, "num_tokens": 5751653.0, "step": 353 }, { "entropy": 0.5449318736791611, "epoch": 1.3214953271028038, "grad_norm": 0.028741504997015, "learning_rate": 0.0002, "loss": 0.539950966835022, "mean_token_accuracy": 0.7803268283605576, "num_tokens": 5768167.0, "step": 354 }, { "entropy": 0.5602855980396271, "epoch": 1.325233644859813, "grad_norm": 0.030420802533626556, "learning_rate": 0.0002, "loss": 0.554990291595459, "mean_token_accuracy": 0.7761643081903458, "num_tokens": 5784542.0, "step": 355 }, { "entropy": 0.56887586414814, "epoch": 1.3289719626168224, "grad_norm": 0.03126989305019379, "learning_rate": 0.0002, "loss": 0.5672231912612915, "mean_token_accuracy": 0.7678193151950836, "num_tokens": 5801095.0, "step": 356 }, { "entropy": 0.5738541036844254, "epoch": 1.3327102803738318, "grad_norm": 0.03625823184847832, "learning_rate": 0.0002, "loss": 0.5728395581245422, "mean_token_accuracy": 0.7666806429624557, "num_tokens": 5817738.0, "step": 357 }, { "entropy": 0.5436241179704666, "epoch": 1.3364485981308412, "grad_norm": 0.03443320468068123, "learning_rate": 0.0002, "loss": 0.5367251634597778, "mean_token_accuracy": 0.7828597128391266, "num_tokens": 5834159.0, "step": 358 }, { "entropy": 0.5450441539287567, "epoch": 1.3401869158878505, "grad_norm": 0.02960045635700226, "learning_rate": 0.0002, "loss": 0.5478132963180542, "mean_token_accuracy": 0.7773353010416031, "num_tokens": 5850353.0, "step": 359 }, { "entropy": 0.559371218085289, "epoch": 1.34392523364486, "grad_norm": 0.043439071625471115, "learning_rate": 0.0002, "loss": 0.5704307556152344, "mean_token_accuracy": 0.7674223929643631, "num_tokens": 5866661.0, "step": 360 }, { "entropy": 0.5383078157901764, "epoch": 1.347663551401869, "grad_norm": 0.031151141971349716, "learning_rate": 0.0002, "loss": 0.5475639700889587, "mean_token_accuracy": 0.7764850705862045, "num_tokens": 5883147.0, "step": 361 }, { "entropy": 0.5361460000276566, "epoch": 1.3514018691588785, "grad_norm": 0.0367986336350441, "learning_rate": 0.0002, "loss": 0.5413030385971069, "mean_token_accuracy": 0.7792898863554001, "num_tokens": 5899337.0, "step": 362 }, { "entropy": 0.5393686443567276, "epoch": 1.355140186915888, "grad_norm": 0.032062407582998276, "learning_rate": 0.0002, "loss": 0.5485578775405884, "mean_token_accuracy": 0.7746371626853943, "num_tokens": 5915592.0, "step": 363 }, { "entropy": 0.5442528575658798, "epoch": 1.358878504672897, "grad_norm": 0.030468052253127098, "learning_rate": 0.0002, "loss": 0.5427553653717041, "mean_token_accuracy": 0.7785662263631821, "num_tokens": 5931951.0, "step": 364 }, { "entropy": 0.5824908316135406, "epoch": 1.3626168224299064, "grad_norm": 0.037210624665021896, "learning_rate": 0.0002, "loss": 0.5697020292282104, "mean_token_accuracy": 0.7692236304283142, "num_tokens": 5948490.0, "step": 365 }, { "entropy": 0.5620522499084473, "epoch": 1.3663551401869158, "grad_norm": 0.0335218720138073, "learning_rate": 0.0002, "loss": 0.5542594194412231, "mean_token_accuracy": 0.7753977477550507, "num_tokens": 5964660.0, "step": 366 }, { "entropy": 0.5603572577238083, "epoch": 1.3700934579439252, "grad_norm": 0.031322672963142395, "learning_rate": 0.0002, "loss": 0.5575450658798218, "mean_token_accuracy": 0.7735055536031723, "num_tokens": 5981101.0, "step": 367 }, { "entropy": 0.5505388826131821, "epoch": 1.3738317757009346, "grad_norm": 0.030650589615106583, "learning_rate": 0.0002, "loss": 0.5557997822761536, "mean_token_accuracy": 0.7740475237369537, "num_tokens": 5997642.0, "step": 368 }, { "entropy": 0.5392187088727951, "epoch": 1.377570093457944, "grad_norm": 0.030460603535175323, "learning_rate": 0.0002, "loss": 0.5474120378494263, "mean_token_accuracy": 0.7756936997175217, "num_tokens": 6013826.0, "step": 369 }, { "entropy": 0.5465079843997955, "epoch": 1.3813084112149534, "grad_norm": 0.03873775899410248, "learning_rate": 0.0002, "loss": 0.5496590733528137, "mean_token_accuracy": 0.7778041809797287, "num_tokens": 6030111.0, "step": 370 }, { "entropy": 0.5502425879240036, "epoch": 1.3850467289719626, "grad_norm": 0.027835069224238396, "learning_rate": 0.0002, "loss": 0.5515455007553101, "mean_token_accuracy": 0.7742271274328232, "num_tokens": 6046613.0, "step": 371 }, { "entropy": 0.5496622025966644, "epoch": 1.388785046728972, "grad_norm": 0.02913137525320053, "learning_rate": 0.0002, "loss": 0.5523219108581543, "mean_token_accuracy": 0.7767279595136642, "num_tokens": 6062935.0, "step": 372 }, { "entropy": 0.5480591654777527, "epoch": 1.3925233644859814, "grad_norm": 0.028895994648337364, "learning_rate": 0.0002, "loss": 0.5464932918548584, "mean_token_accuracy": 0.7779257446527481, "num_tokens": 6079276.0, "step": 373 }, { "entropy": 0.5592564791440964, "epoch": 1.3962616822429905, "grad_norm": 0.030813386663794518, "learning_rate": 0.0002, "loss": 0.5641001462936401, "mean_token_accuracy": 0.7706102132797241, "num_tokens": 6095477.0, "step": 374 }, { "entropy": 0.5482244938611984, "epoch": 1.4, "grad_norm": 0.034681808203458786, "learning_rate": 0.0002, "loss": 0.5535820722579956, "mean_token_accuracy": 0.7740350067615509, "num_tokens": 6111503.0, "step": 375 }, { "entropy": 0.5437954962253571, "epoch": 1.4037383177570093, "grad_norm": 0.029899772256612778, "learning_rate": 0.0002, "loss": 0.5384761691093445, "mean_token_accuracy": 0.7813697308301926, "num_tokens": 6127666.0, "step": 376 }, { "entropy": 0.5516242235898972, "epoch": 1.4074766355140187, "grad_norm": 0.03098697029054165, "learning_rate": 0.0002, "loss": 0.5510317087173462, "mean_token_accuracy": 0.7748206406831741, "num_tokens": 6143974.0, "step": 377 }, { "entropy": 0.5456867665052414, "epoch": 1.411214953271028, "grad_norm": 0.03481059893965721, "learning_rate": 0.0002, "loss": 0.5417442917823792, "mean_token_accuracy": 0.7805673629045486, "num_tokens": 6160284.0, "step": 378 }, { "entropy": 0.5566543191671371, "epoch": 1.4149532710280375, "grad_norm": 0.03302835300564766, "learning_rate": 0.0002, "loss": 0.5596388578414917, "mean_token_accuracy": 0.7757162600755692, "num_tokens": 6176900.0, "step": 379 }, { "entropy": 0.5518665462732315, "epoch": 1.4186915887850469, "grad_norm": 0.042512837797403336, "learning_rate": 0.0002, "loss": 0.554313600063324, "mean_token_accuracy": 0.7725758254528046, "num_tokens": 6193295.0, "step": 380 }, { "entropy": 0.5387768298387527, "epoch": 1.422429906542056, "grad_norm": 0.031335704028606415, "learning_rate": 0.0002, "loss": 0.5456656813621521, "mean_token_accuracy": 0.7767685800790787, "num_tokens": 6209473.0, "step": 381 }, { "entropy": 0.552179217338562, "epoch": 1.4261682242990654, "grad_norm": 0.03560006618499756, "learning_rate": 0.0002, "loss": 0.5536052584648132, "mean_token_accuracy": 0.7741381675004959, "num_tokens": 6225795.0, "step": 382 }, { "entropy": 0.5529111623764038, "epoch": 1.4299065420560748, "grad_norm": 0.03298206627368927, "learning_rate": 0.0002, "loss": 0.5456759929656982, "mean_token_accuracy": 0.7785012274980545, "num_tokens": 6241738.0, "step": 383 }, { "entropy": 0.5528014451265335, "epoch": 1.433644859813084, "grad_norm": 0.02689899317920208, "learning_rate": 0.0002, "loss": 0.5489047765731812, "mean_token_accuracy": 0.7755105197429657, "num_tokens": 6258266.0, "step": 384 }, { "entropy": 0.5488691926002502, "epoch": 1.4373831775700934, "grad_norm": 0.03345772624015808, "learning_rate": 0.0002, "loss": 0.5473658442497253, "mean_token_accuracy": 0.776367112994194, "num_tokens": 6274629.0, "step": 385 }, { "entropy": 0.5326814502477646, "epoch": 1.4411214953271028, "grad_norm": 0.0327431820333004, "learning_rate": 0.0002, "loss": 0.5437192916870117, "mean_token_accuracy": 0.7790791392326355, "num_tokens": 6290843.0, "step": 386 }, { "entropy": 0.5463947802782059, "epoch": 1.4448598130841122, "grad_norm": 0.029317917302250862, "learning_rate": 0.0002, "loss": 0.5482510924339294, "mean_token_accuracy": 0.7787915766239166, "num_tokens": 6307390.0, "step": 387 }, { "entropy": 0.5279744416475296, "epoch": 1.4485981308411215, "grad_norm": 0.032164428383111954, "learning_rate": 0.0002, "loss": 0.5396722555160522, "mean_token_accuracy": 0.7793098241090775, "num_tokens": 6323780.0, "step": 388 }, { "entropy": 0.5401588678359985, "epoch": 1.452336448598131, "grad_norm": 0.029884206131100655, "learning_rate": 0.0002, "loss": 0.5457247495651245, "mean_token_accuracy": 0.7772396057844162, "num_tokens": 6340075.0, "step": 389 }, { "entropy": 0.5614192336797714, "epoch": 1.45607476635514, "grad_norm": 0.031751908361911774, "learning_rate": 0.0002, "loss": 0.5567028522491455, "mean_token_accuracy": 0.7716124802827835, "num_tokens": 6356186.0, "step": 390 }, { "entropy": 0.5345210433006287, "epoch": 1.4598130841121495, "grad_norm": 0.030872350558638573, "learning_rate": 0.0002, "loss": 0.5334336757659912, "mean_token_accuracy": 0.7826623171567917, "num_tokens": 6372159.0, "step": 391 }, { "entropy": 0.5622972398996353, "epoch": 1.4635514018691589, "grad_norm": 0.0314875952899456, "learning_rate": 0.0002, "loss": 0.5557999610900879, "mean_token_accuracy": 0.7731751799583435, "num_tokens": 6388490.0, "step": 392 }, { "entropy": 0.5456393212080002, "epoch": 1.4672897196261683, "grad_norm": 0.030306922271847725, "learning_rate": 0.0002, "loss": 0.5478385090827942, "mean_token_accuracy": 0.7785396575927734, "num_tokens": 6404875.0, "step": 393 }, { "entropy": 0.553615927696228, "epoch": 1.4710280373831774, "grad_norm": 0.03159041702747345, "learning_rate": 0.0002, "loss": 0.5525414347648621, "mean_token_accuracy": 0.7762843668460846, "num_tokens": 6421373.0, "step": 394 }, { "entropy": 0.54654960334301, "epoch": 1.4747663551401868, "grad_norm": 0.041343770921230316, "learning_rate": 0.0002, "loss": 0.5578322410583496, "mean_token_accuracy": 0.7733658850193024, "num_tokens": 6437609.0, "step": 395 }, { "entropy": 0.531049445271492, "epoch": 1.4785046728971962, "grad_norm": 0.029535705223679543, "learning_rate": 0.0002, "loss": 0.5336673855781555, "mean_token_accuracy": 0.7787897735834122, "num_tokens": 6453830.0, "step": 396 }, { "entropy": 0.5598567724227905, "epoch": 1.4822429906542056, "grad_norm": 0.030157895758748055, "learning_rate": 0.0002, "loss": 0.558460533618927, "mean_token_accuracy": 0.7739997208118439, "num_tokens": 6469831.0, "step": 397 }, { "entropy": 0.5455051362514496, "epoch": 1.485981308411215, "grad_norm": 0.02824362926185131, "learning_rate": 0.0002, "loss": 0.5309131145477295, "mean_token_accuracy": 0.7840657532215118, "num_tokens": 6485983.0, "step": 398 }, { "entropy": 0.5548417568206787, "epoch": 1.4897196261682244, "grad_norm": 0.028244182467460632, "learning_rate": 0.0002, "loss": 0.5448263883590698, "mean_token_accuracy": 0.7788312286138535, "num_tokens": 6502375.0, "step": 399 }, { "entropy": 0.5614428222179413, "epoch": 1.4934579439252336, "grad_norm": 0.029092902317643166, "learning_rate": 0.0002, "loss": 0.5640357732772827, "mean_token_accuracy": 0.7694920003414154, "num_tokens": 6518515.0, "step": 400 }, { "entropy": 0.5202381461858749, "epoch": 1.497196261682243, "grad_norm": 0.0347515270113945, "learning_rate": 0.0002, "loss": 0.5334154963493347, "mean_token_accuracy": 0.7812663912773132, "num_tokens": 6534874.0, "step": 401 }, { "entropy": 0.5337788164615631, "epoch": 1.5009345794392523, "grad_norm": 0.036383189260959625, "learning_rate": 0.0002, "loss": 0.5497745871543884, "mean_token_accuracy": 0.778416782617569, "num_tokens": 6551531.0, "step": 402 }, { "entropy": 0.5441624820232391, "epoch": 1.5046728971962615, "grad_norm": 0.029430663213133812, "learning_rate": 0.0002, "loss": 0.5452989935874939, "mean_token_accuracy": 0.7810618728399277, "num_tokens": 6568009.0, "step": 403 }, { "entropy": 0.5418661385774612, "epoch": 1.508411214953271, "grad_norm": 0.030562201514840126, "learning_rate": 0.0002, "loss": 0.5342137813568115, "mean_token_accuracy": 0.7829063683748245, "num_tokens": 6584207.0, "step": 404 }, { "entropy": 0.5485459864139557, "epoch": 1.5121495327102803, "grad_norm": 0.03423624485731125, "learning_rate": 0.0002, "loss": 0.5410490036010742, "mean_token_accuracy": 0.7787354588508606, "num_tokens": 6600370.0, "step": 405 }, { "entropy": 0.5426456183195114, "epoch": 1.5158878504672897, "grad_norm": 0.02885623089969158, "learning_rate": 0.0002, "loss": 0.5436002612113953, "mean_token_accuracy": 0.7796245515346527, "num_tokens": 6616756.0, "step": 406 }, { "entropy": 0.5356003642082214, "epoch": 1.519626168224299, "grad_norm": 0.03115919418632984, "learning_rate": 0.0002, "loss": 0.5386699438095093, "mean_token_accuracy": 0.7803057432174683, "num_tokens": 6632844.0, "step": 407 }, { "entropy": 0.5387707352638245, "epoch": 1.5233644859813085, "grad_norm": 0.039791349321603775, "learning_rate": 0.0002, "loss": 0.5529868006706238, "mean_token_accuracy": 0.7759213447570801, "num_tokens": 6649378.0, "step": 408 }, { "entropy": 0.5559847801923752, "epoch": 1.5271028037383179, "grad_norm": 0.02880096808075905, "learning_rate": 0.0002, "loss": 0.5526622533798218, "mean_token_accuracy": 0.7757584452629089, "num_tokens": 6665680.0, "step": 409 }, { "entropy": 0.5568434447050095, "epoch": 1.5308411214953273, "grad_norm": 0.03131592646241188, "learning_rate": 0.0002, "loss": 0.5511536002159119, "mean_token_accuracy": 0.7751762717962265, "num_tokens": 6682037.0, "step": 410 }, { "entropy": 0.5535785406827927, "epoch": 1.5345794392523364, "grad_norm": 0.027654770761728287, "learning_rate": 0.0002, "loss": 0.5505651831626892, "mean_token_accuracy": 0.7777209877967834, "num_tokens": 6698293.0, "step": 411 }, { "entropy": 0.5670723766088486, "epoch": 1.5383177570093458, "grad_norm": 0.028583014383912086, "learning_rate": 0.0002, "loss": 0.562312662601471, "mean_token_accuracy": 0.7695807963609695, "num_tokens": 6714701.0, "step": 412 }, { "entropy": 0.5622154772281647, "epoch": 1.542056074766355, "grad_norm": 0.02976270206272602, "learning_rate": 0.0002, "loss": 0.5625367164611816, "mean_token_accuracy": 0.7716499269008636, "num_tokens": 6731185.0, "step": 413 }, { "entropy": 0.5430750995874405, "epoch": 1.5457943925233644, "grad_norm": 0.033997952938079834, "learning_rate": 0.0002, "loss": 0.5533574819564819, "mean_token_accuracy": 0.7739907056093216, "num_tokens": 6747611.0, "step": 414 }, { "entropy": 0.5383965522050858, "epoch": 1.5495327102803738, "grad_norm": 0.030417680740356445, "learning_rate": 0.0002, "loss": 0.5392584204673767, "mean_token_accuracy": 0.781003326177597, "num_tokens": 6764041.0, "step": 415 }, { "entropy": 0.5423173159360886, "epoch": 1.5532710280373832, "grad_norm": 0.03076282888650894, "learning_rate": 0.0002, "loss": 0.5466949343681335, "mean_token_accuracy": 0.7772891670465469, "num_tokens": 6780355.0, "step": 416 }, { "entropy": 0.5329848676919937, "epoch": 1.5570093457943925, "grad_norm": 0.031416404992341995, "learning_rate": 0.0002, "loss": 0.5372002720832825, "mean_token_accuracy": 0.7831790894269943, "num_tokens": 6796818.0, "step": 417 }, { "entropy": 0.5694616734981537, "epoch": 1.560747663551402, "grad_norm": 0.03140864148736, "learning_rate": 0.0002, "loss": 0.5736896395683289, "mean_token_accuracy": 0.7680276483297348, "num_tokens": 6813313.0, "step": 418 }, { "entropy": 0.5422861874103546, "epoch": 1.5644859813084113, "grad_norm": 0.029503118246793747, "learning_rate": 0.0002, "loss": 0.5412414073944092, "mean_token_accuracy": 0.7787739634513855, "num_tokens": 6829806.0, "step": 419 }, { "entropy": 0.5583456158638, "epoch": 1.5682242990654207, "grad_norm": 0.02907589264214039, "learning_rate": 0.0002, "loss": 0.5538471937179565, "mean_token_accuracy": 0.7733865231275558, "num_tokens": 6846001.0, "step": 420 }, { "entropy": 0.541300505399704, "epoch": 1.5719626168224299, "grad_norm": 0.030364159494638443, "learning_rate": 0.0002, "loss": 0.5440077781677246, "mean_token_accuracy": 0.7778935730457306, "num_tokens": 6862199.0, "step": 421 }, { "entropy": 0.5432893335819244, "epoch": 1.5757009345794393, "grad_norm": 0.030575595796108246, "learning_rate": 0.0002, "loss": 0.5458940267562866, "mean_token_accuracy": 0.7759649753570557, "num_tokens": 6878579.0, "step": 422 }, { "entropy": 0.5597539693117142, "epoch": 1.5794392523364484, "grad_norm": 0.03023570403456688, "learning_rate": 0.0002, "loss": 0.5611036419868469, "mean_token_accuracy": 0.771359458565712, "num_tokens": 6895118.0, "step": 423 }, { "entropy": 0.5647385269403458, "epoch": 1.5831775700934578, "grad_norm": 0.03682006523013115, "learning_rate": 0.0002, "loss": 0.5706467032432556, "mean_token_accuracy": 0.7648251056671143, "num_tokens": 6911258.0, "step": 424 }, { "entropy": 0.5421442538499832, "epoch": 1.5869158878504672, "grad_norm": 0.02758963778614998, "learning_rate": 0.0002, "loss": 0.540165364742279, "mean_token_accuracy": 0.7803500890731812, "num_tokens": 6927685.0, "step": 425 }, { "entropy": 0.529248058795929, "epoch": 1.5906542056074766, "grad_norm": 0.03153234347701073, "learning_rate": 0.0002, "loss": 0.5238373875617981, "mean_token_accuracy": 0.7865803390741348, "num_tokens": 6944032.0, "step": 426 }, { "entropy": 0.575338825583458, "epoch": 1.594392523364486, "grad_norm": 0.038368549197912216, "learning_rate": 0.0002, "loss": 0.5686851739883423, "mean_token_accuracy": 0.7687085419893265, "num_tokens": 6960292.0, "step": 427 }, { "entropy": 0.5576592534780502, "epoch": 1.5981308411214954, "grad_norm": 0.028228625655174255, "learning_rate": 0.0002, "loss": 0.5487405061721802, "mean_token_accuracy": 0.7753542214632034, "num_tokens": 6976714.0, "step": 428 }, { "entropy": 0.5344701558351517, "epoch": 1.6018691588785048, "grad_norm": 0.04058045893907547, "learning_rate": 0.0002, "loss": 0.5446043014526367, "mean_token_accuracy": 0.7796988487243652, "num_tokens": 6993050.0, "step": 429 }, { "entropy": 0.5357878655195236, "epoch": 1.6056074766355142, "grad_norm": 0.03584378957748413, "learning_rate": 0.0002, "loss": 0.5503512620925903, "mean_token_accuracy": 0.7766520529985428, "num_tokens": 7009209.0, "step": 430 }, { "entropy": 0.5416888147592545, "epoch": 1.6093457943925233, "grad_norm": 0.035834796726703644, "learning_rate": 0.0002, "loss": 0.5537422895431519, "mean_token_accuracy": 0.7721364051103592, "num_tokens": 7025449.0, "step": 431 }, { "entropy": 0.5495986640453339, "epoch": 1.6130841121495327, "grad_norm": 0.032027650624513626, "learning_rate": 0.0002, "loss": 0.5545753836631775, "mean_token_accuracy": 0.7711912095546722, "num_tokens": 7041746.0, "step": 432 }, { "entropy": 0.545868456363678, "epoch": 1.616822429906542, "grad_norm": 0.03172159940004349, "learning_rate": 0.0002, "loss": 0.5401636958122253, "mean_token_accuracy": 0.7796500027179718, "num_tokens": 7057795.0, "step": 433 }, { "entropy": 0.5575663447380066, "epoch": 1.6205607476635513, "grad_norm": 0.033373311161994934, "learning_rate": 0.0002, "loss": 0.5508802533149719, "mean_token_accuracy": 0.776265561580658, "num_tokens": 7074106.0, "step": 434 }, { "entropy": 0.552743598818779, "epoch": 1.6242990654205607, "grad_norm": 0.028903203085064888, "learning_rate": 0.0002, "loss": 0.5493654012680054, "mean_token_accuracy": 0.7769621759653091, "num_tokens": 7090537.0, "step": 435 }, { "entropy": 0.5319768935441971, "epoch": 1.62803738317757, "grad_norm": 0.034539636224508286, "learning_rate": 0.0002, "loss": 0.5467936396598816, "mean_token_accuracy": 0.7773739099502563, "num_tokens": 7106864.0, "step": 436 }, { "entropy": 0.5451867878437042, "epoch": 1.6317757009345795, "grad_norm": 0.03423994407057762, "learning_rate": 0.0002, "loss": 0.5547507405281067, "mean_token_accuracy": 0.7716930210590363, "num_tokens": 7123027.0, "step": 437 }, { "entropy": 0.5614334046840668, "epoch": 1.6355140186915889, "grad_norm": 0.030570637434720993, "learning_rate": 0.0002, "loss": 0.5614769458770752, "mean_token_accuracy": 0.772892951965332, "num_tokens": 7139089.0, "step": 438 }, { "entropy": 0.5780467242002487, "epoch": 1.6392523364485982, "grad_norm": 0.028702719137072563, "learning_rate": 0.0002, "loss": 0.5703617334365845, "mean_token_accuracy": 0.7703514397144318, "num_tokens": 7155613.0, "step": 439 }, { "entropy": 0.5620117634534836, "epoch": 1.6429906542056076, "grad_norm": 0.032911110669374466, "learning_rate": 0.0002, "loss": 0.5519667863845825, "mean_token_accuracy": 0.776491329073906, "num_tokens": 7171940.0, "step": 440 }, { "entropy": 0.5613545030355453, "epoch": 1.6467289719626168, "grad_norm": 0.02767273783683777, "learning_rate": 0.0002, "loss": 0.5548912286758423, "mean_token_accuracy": 0.7774568051099777, "num_tokens": 7188459.0, "step": 441 }, { "entropy": 0.5349740386009216, "epoch": 1.6504672897196262, "grad_norm": 0.03398311510682106, "learning_rate": 0.0002, "loss": 0.5359267592430115, "mean_token_accuracy": 0.7792400866746902, "num_tokens": 7204742.0, "step": 442 }, { "entropy": 0.5435358434915543, "epoch": 1.6542056074766354, "grad_norm": 0.03121669590473175, "learning_rate": 0.0002, "loss": 0.5480291247367859, "mean_token_accuracy": 0.7757425308227539, "num_tokens": 7220970.0, "step": 443 }, { "entropy": 0.5408525168895721, "epoch": 1.6579439252336448, "grad_norm": 0.03187638521194458, "learning_rate": 0.0002, "loss": 0.5458962321281433, "mean_token_accuracy": 0.7777377218008041, "num_tokens": 7237303.0, "step": 444 }, { "entropy": 0.5296604186296463, "epoch": 1.6616822429906541, "grad_norm": 0.033922888338565826, "learning_rate": 0.0002, "loss": 0.5350003242492676, "mean_token_accuracy": 0.7817184776067734, "num_tokens": 7253313.0, "step": 445 }, { "entropy": 0.5386542528867722, "epoch": 1.6654205607476635, "grad_norm": 0.03487584367394447, "learning_rate": 0.0002, "loss": 0.5504403710365295, "mean_token_accuracy": 0.7764954715967178, "num_tokens": 7269689.0, "step": 446 }, { "entropy": 0.5447485446929932, "epoch": 1.669158878504673, "grad_norm": 0.028691545128822327, "learning_rate": 0.0002, "loss": 0.5440992712974548, "mean_token_accuracy": 0.7813538759946823, "num_tokens": 7286072.0, "step": 447 }, { "entropy": 0.5479656606912613, "epoch": 1.6728971962616823, "grad_norm": 0.02881709486246109, "learning_rate": 0.0002, "loss": 0.5415880084037781, "mean_token_accuracy": 0.7795199900865555, "num_tokens": 7302255.0, "step": 448 }, { "entropy": 0.5570111870765686, "epoch": 1.6766355140186917, "grad_norm": 0.028915997594594955, "learning_rate": 0.0002, "loss": 0.5533952713012695, "mean_token_accuracy": 0.7753083109855652, "num_tokens": 7318517.0, "step": 449 }, { "entropy": 0.5548125952482224, "epoch": 1.680373831775701, "grad_norm": 0.029765961691737175, "learning_rate": 0.0002, "loss": 0.5539486408233643, "mean_token_accuracy": 0.7759220153093338, "num_tokens": 7334708.0, "step": 450 }, { "entropy": 0.5474168807268143, "epoch": 1.6841121495327103, "grad_norm": 0.028495540842413902, "learning_rate": 0.0002, "loss": 0.542155921459198, "mean_token_accuracy": 0.7810131311416626, "num_tokens": 7351081.0, "step": 451 }, { "entropy": 0.5660932809114456, "epoch": 1.6878504672897197, "grad_norm": 0.029109494760632515, "learning_rate": 0.0002, "loss": 0.5608826279640198, "mean_token_accuracy": 0.7715775072574615, "num_tokens": 7367731.0, "step": 452 }, { "entropy": 0.5341303050518036, "epoch": 1.6915887850467288, "grad_norm": 0.0320415273308754, "learning_rate": 0.0002, "loss": 0.5458233952522278, "mean_token_accuracy": 0.7763672173023224, "num_tokens": 7383855.0, "step": 453 }, { "entropy": 0.5321396738290787, "epoch": 1.6953271028037382, "grad_norm": 0.02727021649479866, "learning_rate": 0.0002, "loss": 0.5336453318595886, "mean_token_accuracy": 0.7841753661632538, "num_tokens": 7400413.0, "step": 454 }, { "entropy": 0.5274764150381088, "epoch": 1.6990654205607476, "grad_norm": 0.03324299305677414, "learning_rate": 0.0002, "loss": 0.5358706116676331, "mean_token_accuracy": 0.7782862633466721, "num_tokens": 7416652.0, "step": 455 }, { "entropy": 0.5659113973379135, "epoch": 1.702803738317757, "grad_norm": 0.02792423591017723, "learning_rate": 0.0002, "loss": 0.5652596354484558, "mean_token_accuracy": 0.7699151486158371, "num_tokens": 7433182.0, "step": 456 }, { "entropy": 0.5379252284765244, "epoch": 1.7065420560747664, "grad_norm": 0.029364224523305893, "learning_rate": 0.0002, "loss": 0.5403070449829102, "mean_token_accuracy": 0.780923143029213, "num_tokens": 7449489.0, "step": 457 }, { "entropy": 0.5333061218261719, "epoch": 1.7102803738317758, "grad_norm": 0.03605153039097786, "learning_rate": 0.0002, "loss": 0.5397148728370667, "mean_token_accuracy": 0.7807264924049377, "num_tokens": 7465639.0, "step": 458 }, { "entropy": 0.5705498605966568, "epoch": 1.7140186915887852, "grad_norm": 0.03089967370033264, "learning_rate": 0.0002, "loss": 0.5634230375289917, "mean_token_accuracy": 0.770861804485321, "num_tokens": 7482026.0, "step": 459 }, { "entropy": 0.5468743443489075, "epoch": 1.7177570093457943, "grad_norm": 0.030453559011220932, "learning_rate": 0.0002, "loss": 0.545179545879364, "mean_token_accuracy": 0.7774305045604706, "num_tokens": 7498135.0, "step": 460 }, { "entropy": 0.5617033839225769, "epoch": 1.7214953271028037, "grad_norm": 0.03324849158525467, "learning_rate": 0.0002, "loss": 0.5638455748558044, "mean_token_accuracy": 0.7687248736619949, "num_tokens": 7514525.0, "step": 461 }, { "entropy": 0.5581229031085968, "epoch": 1.7252336448598131, "grad_norm": 0.03176411613821983, "learning_rate": 0.0002, "loss": 0.5653245449066162, "mean_token_accuracy": 0.7685625553131104, "num_tokens": 7530775.0, "step": 462 }, { "entropy": 0.5476332157850266, "epoch": 1.7289719626168223, "grad_norm": 0.02840348146855831, "learning_rate": 0.0002, "loss": 0.5459728240966797, "mean_token_accuracy": 0.7803480625152588, "num_tokens": 7547133.0, "step": 463 }, { "entropy": 0.5295307040214539, "epoch": 1.7327102803738317, "grad_norm": 0.03073256090283394, "learning_rate": 0.0002, "loss": 0.5271958708763123, "mean_token_accuracy": 0.7856812626123428, "num_tokens": 7563202.0, "step": 464 }, { "entropy": 0.5600748807191849, "epoch": 1.736448598130841, "grad_norm": 0.02645997144281864, "learning_rate": 0.0002, "loss": 0.5613283514976501, "mean_token_accuracy": 0.7728501409292221, "num_tokens": 7579316.0, "step": 465 }, { "entropy": 0.5520564913749695, "epoch": 1.7401869158878505, "grad_norm": 0.03572427108883858, "learning_rate": 0.0002, "loss": 0.5537987947463989, "mean_token_accuracy": 0.7724860310554504, "num_tokens": 7595641.0, "step": 466 }, { "entropy": 0.5529971420764923, "epoch": 1.7439252336448599, "grad_norm": 0.03125125169754028, "learning_rate": 0.0002, "loss": 0.5582661628723145, "mean_token_accuracy": 0.7737809270620346, "num_tokens": 7611643.0, "step": 467 }, { "entropy": 0.5647894889116287, "epoch": 1.7476635514018692, "grad_norm": 0.029365174472332, "learning_rate": 0.0002, "loss": 0.5628995895385742, "mean_token_accuracy": 0.770697221159935, "num_tokens": 7628011.0, "step": 468 }, { "entropy": 0.554974377155304, "epoch": 1.7514018691588786, "grad_norm": 0.03162689507007599, "learning_rate": 0.0002, "loss": 0.5540342330932617, "mean_token_accuracy": 0.7753277122974396, "num_tokens": 7644033.0, "step": 469 }, { "entropy": 0.5500662177801132, "epoch": 1.7551401869158878, "grad_norm": 0.03005298413336277, "learning_rate": 0.0002, "loss": 0.5444310307502747, "mean_token_accuracy": 0.7801364362239838, "num_tokens": 7660280.0, "step": 470 }, { "entropy": 0.5447323620319366, "epoch": 1.7588785046728972, "grad_norm": 0.03137346729636192, "learning_rate": 0.0002, "loss": 0.5573670864105225, "mean_token_accuracy": 0.7713485956192017, "num_tokens": 7676463.0, "step": 471 }, { "entropy": 0.5369779318571091, "epoch": 1.7626168224299066, "grad_norm": 0.03314938396215439, "learning_rate": 0.0002, "loss": 0.5444561839103699, "mean_token_accuracy": 0.7770639657974243, "num_tokens": 7692602.0, "step": 472 }, { "entropy": 0.5475834012031555, "epoch": 1.7663551401869158, "grad_norm": 0.02887626923620701, "learning_rate": 0.0002, "loss": 0.548475980758667, "mean_token_accuracy": 0.7783610373735428, "num_tokens": 7708846.0, "step": 473 }, { "entropy": 0.5512323975563049, "epoch": 1.7700934579439251, "grad_norm": 0.029940130189061165, "learning_rate": 0.0002, "loss": 0.5473303198814392, "mean_token_accuracy": 0.7762128710746765, "num_tokens": 7725069.0, "step": 474 }, { "entropy": 0.553005576133728, "epoch": 1.7738317757009345, "grad_norm": 0.030464377254247665, "learning_rate": 0.0002, "loss": 0.5503718852996826, "mean_token_accuracy": 0.774563655257225, "num_tokens": 7741245.0, "step": 475 }, { "entropy": 0.5530129075050354, "epoch": 1.777570093457944, "grad_norm": 0.03166594356298447, "learning_rate": 0.0002, "loss": 0.5523677468299866, "mean_token_accuracy": 0.7772203087806702, "num_tokens": 7757438.0, "step": 476 }, { "entropy": 0.5589546114206314, "epoch": 1.7813084112149533, "grad_norm": 0.031029848381876945, "learning_rate": 0.0002, "loss": 0.562568724155426, "mean_token_accuracy": 0.7697692364454269, "num_tokens": 7773613.0, "step": 477 }, { "entropy": 0.5485216081142426, "epoch": 1.7850467289719627, "grad_norm": 0.03148766979575157, "learning_rate": 0.0002, "loss": 0.5566563010215759, "mean_token_accuracy": 0.7735153138637543, "num_tokens": 7790250.0, "step": 478 }, { "entropy": 0.5454483330249786, "epoch": 1.788785046728972, "grad_norm": 0.02934390679001808, "learning_rate": 0.0002, "loss": 0.5470514297485352, "mean_token_accuracy": 0.777851864695549, "num_tokens": 7806794.0, "step": 479 }, { "entropy": 0.5577091723680496, "epoch": 1.7925233644859813, "grad_norm": 0.032060954719781876, "learning_rate": 0.0002, "loss": 0.5573920011520386, "mean_token_accuracy": 0.7715256214141846, "num_tokens": 7823378.0, "step": 480 }, { "entropy": 0.5442305952310562, "epoch": 1.7962616822429907, "grad_norm": 0.027305442839860916, "learning_rate": 0.0002, "loss": 0.5404268503189087, "mean_token_accuracy": 0.7780007869005203, "num_tokens": 7839749.0, "step": 481 }, { "entropy": 0.5555779784917831, "epoch": 1.8, "grad_norm": 0.03287232294678688, "learning_rate": 0.0002, "loss": 0.5462092161178589, "mean_token_accuracy": 0.7763689607381821, "num_tokens": 7855947.0, "step": 482 }, { "entropy": 0.5372089967131615, "epoch": 1.8037383177570092, "grad_norm": 0.031652286648750305, "learning_rate": 0.0002, "loss": 0.5363561511039734, "mean_token_accuracy": 0.7853012979030609, "num_tokens": 7872142.0, "step": 483 }, { "entropy": 0.5340928807854652, "epoch": 1.8074766355140186, "grad_norm": 0.031619228422641754, "learning_rate": 0.0002, "loss": 0.5403937697410583, "mean_token_accuracy": 0.7826676219701767, "num_tokens": 7888470.0, "step": 484 }, { "entropy": 0.5592721551656723, "epoch": 1.811214953271028, "grad_norm": 0.03946106135845184, "learning_rate": 0.0002, "loss": 0.5722806453704834, "mean_token_accuracy": 0.7665584683418274, "num_tokens": 7904942.0, "step": 485 }, { "entropy": 0.5392829775810242, "epoch": 1.8149532710280374, "grad_norm": 0.04261912405490875, "learning_rate": 0.0002, "loss": 0.5484760999679565, "mean_token_accuracy": 0.7759799510240555, "num_tokens": 7921095.0, "step": 486 }, { "entropy": 0.5537964701652527, "epoch": 1.8186915887850468, "grad_norm": 0.029489269480109215, "learning_rate": 0.0002, "loss": 0.5515441298484802, "mean_token_accuracy": 0.7770739942789078, "num_tokens": 7937493.0, "step": 487 }, { "entropy": 0.5820317566394806, "epoch": 1.8224299065420562, "grad_norm": 0.032789647579193115, "learning_rate": 0.0002, "loss": 0.5696999430656433, "mean_token_accuracy": 0.766129344701767, "num_tokens": 7953872.0, "step": 488 }, { "entropy": 0.5591157823801041, "epoch": 1.8261682242990656, "grad_norm": 0.03274792060256004, "learning_rate": 0.0002, "loss": 0.5492164492607117, "mean_token_accuracy": 0.7776104360818863, "num_tokens": 7970399.0, "step": 489 }, { "entropy": 0.5613900125026703, "epoch": 1.8299065420560747, "grad_norm": 0.03268195316195488, "learning_rate": 0.0002, "loss": 0.5613545179367065, "mean_token_accuracy": 0.7726269513368607, "num_tokens": 7986663.0, "step": 490 }, { "entropy": 0.540773555636406, "epoch": 1.8336448598130841, "grad_norm": 0.031849462538957596, "learning_rate": 0.0002, "loss": 0.5427927374839783, "mean_token_accuracy": 0.7795483022928238, "num_tokens": 8002949.0, "step": 491 }, { "entropy": 0.5281448513269424, "epoch": 1.8373831775700935, "grad_norm": 0.037760283797979355, "learning_rate": 0.0002, "loss": 0.5398802161216736, "mean_token_accuracy": 0.7793932110071182, "num_tokens": 8018924.0, "step": 492 }, { "entropy": 0.5640152990818024, "epoch": 1.8411214953271027, "grad_norm": 0.03318220004439354, "learning_rate": 0.0002, "loss": 0.5651699900627136, "mean_token_accuracy": 0.7711258381605148, "num_tokens": 8035544.0, "step": 493 }, { "entropy": 0.5498005002737045, "epoch": 1.844859813084112, "grad_norm": 0.0300876684486866, "learning_rate": 0.0002, "loss": 0.5483426451683044, "mean_token_accuracy": 0.777212604880333, "num_tokens": 8051604.0, "step": 494 }, { "entropy": 0.5553054213523865, "epoch": 1.8485981308411215, "grad_norm": 0.03142329677939415, "learning_rate": 0.0002, "loss": 0.5571571588516235, "mean_token_accuracy": 0.7740218490362167, "num_tokens": 8067812.0, "step": 495 }, { "entropy": 0.5580199360847473, "epoch": 1.8523364485981308, "grad_norm": 0.03293558582663536, "learning_rate": 0.0002, "loss": 0.5583306550979614, "mean_token_accuracy": 0.7746147364377975, "num_tokens": 8083966.0, "step": 496 }, { "entropy": 0.5503615736961365, "epoch": 1.8560747663551402, "grad_norm": 0.031184855848550797, "learning_rate": 0.0002, "loss": 0.5509845614433289, "mean_token_accuracy": 0.7762554883956909, "num_tokens": 8100276.0, "step": 497 }, { "entropy": 0.5609902739524841, "epoch": 1.8598130841121496, "grad_norm": 0.03478863090276718, "learning_rate": 0.0002, "loss": 0.5611089468002319, "mean_token_accuracy": 0.7710845172405243, "num_tokens": 8116579.0, "step": 498 }, { "entropy": 0.5358163863420486, "epoch": 1.863551401869159, "grad_norm": 0.03343072161078453, "learning_rate": 0.0002, "loss": 0.5352976322174072, "mean_token_accuracy": 0.7815191894769669, "num_tokens": 8132938.0, "step": 499 }, { "entropy": 0.5323279201984406, "epoch": 1.8672897196261682, "grad_norm": 0.030239535495638847, "learning_rate": 0.0002, "loss": 0.5383006930351257, "mean_token_accuracy": 0.7808633744716644, "num_tokens": 8149182.0, "step": 500 }, { "entropy": 0.557625338435173, "epoch": 1.8710280373831776, "grad_norm": 0.031314413994550705, "learning_rate": 0.0002, "loss": 0.5607120990753174, "mean_token_accuracy": 0.7726259678602219, "num_tokens": 8165713.0, "step": 501 }, { "entropy": 0.5501556247472763, "epoch": 1.874766355140187, "grad_norm": 0.029330939054489136, "learning_rate": 0.0002, "loss": 0.5527728796005249, "mean_token_accuracy": 0.7722220122814178, "num_tokens": 8182157.0, "step": 502 }, { "entropy": 0.5571380257606506, "epoch": 1.8785046728971961, "grad_norm": 0.027965383604168892, "learning_rate": 0.0002, "loss": 0.5537632703781128, "mean_token_accuracy": 0.7755916863679886, "num_tokens": 8198641.0, "step": 503 }, { "entropy": 0.5457630455493927, "epoch": 1.8822429906542055, "grad_norm": 0.030688611790537834, "learning_rate": 0.0002, "loss": 0.5442954897880554, "mean_token_accuracy": 0.7765072137117386, "num_tokens": 8214799.0, "step": 504 }, { "entropy": 0.5432839095592499, "epoch": 1.885981308411215, "grad_norm": 0.0319070965051651, "learning_rate": 0.0002, "loss": 0.5535275936126709, "mean_token_accuracy": 0.7709672451019287, "num_tokens": 8230973.0, "step": 505 }, { "entropy": 0.5594919174909592, "epoch": 1.8897196261682243, "grad_norm": 0.04258793592453003, "learning_rate": 0.0002, "loss": 0.5607203841209412, "mean_token_accuracy": 0.7712259739637375, "num_tokens": 8247156.0, "step": 506 }, { "entropy": 0.5589391887187958, "epoch": 1.8934579439252337, "grad_norm": 0.033864762634038925, "learning_rate": 0.0002, "loss": 0.5650033950805664, "mean_token_accuracy": 0.7718524932861328, "num_tokens": 8263441.0, "step": 507 }, { "entropy": 0.5569577813148499, "epoch": 1.897196261682243, "grad_norm": 0.03338006138801575, "learning_rate": 0.0002, "loss": 0.5555600523948669, "mean_token_accuracy": 0.7759018540382385, "num_tokens": 8279848.0, "step": 508 }, { "entropy": 0.5524785667657852, "epoch": 1.9009345794392525, "grad_norm": 0.034291088581085205, "learning_rate": 0.0002, "loss": 0.554389238357544, "mean_token_accuracy": 0.7732797265052795, "num_tokens": 8296286.0, "step": 509 }, { "entropy": 0.5341912508010864, "epoch": 1.9046728971962616, "grad_norm": 0.03332460671663284, "learning_rate": 0.0002, "loss": 0.5296705365180969, "mean_token_accuracy": 0.7850336581468582, "num_tokens": 8312462.0, "step": 510 }, { "entropy": 0.5374017357826233, "epoch": 1.908411214953271, "grad_norm": 0.029762303456664085, "learning_rate": 0.0002, "loss": 0.5377117395401001, "mean_token_accuracy": 0.7782561480998993, "num_tokens": 8328514.0, "step": 511 }, { "entropy": 0.5621481090784073, "epoch": 1.9121495327102802, "grad_norm": 0.02770383097231388, "learning_rate": 0.0002, "loss": 0.556929349899292, "mean_token_accuracy": 0.7750183939933777, "num_tokens": 8345018.0, "step": 512 }, { "entropy": 0.5308145210146904, "epoch": 1.9158878504672896, "grad_norm": 0.031799450516700745, "learning_rate": 0.0002, "loss": 0.5367879867553711, "mean_token_accuracy": 0.7811458259820938, "num_tokens": 8361450.0, "step": 513 }, { "entropy": 0.5505598485469818, "epoch": 1.919626168224299, "grad_norm": 0.030035199597477913, "learning_rate": 0.0002, "loss": 0.55583256483078, "mean_token_accuracy": 0.7735087871551514, "num_tokens": 8378205.0, "step": 514 }, { "entropy": 0.5498997569084167, "epoch": 1.9233644859813084, "grad_norm": 0.031478267163038254, "learning_rate": 0.0002, "loss": 0.554360568523407, "mean_token_accuracy": 0.7755851894617081, "num_tokens": 8394730.0, "step": 515 }, { "entropy": 0.5447141826152802, "epoch": 1.9271028037383178, "grad_norm": 0.034256696701049805, "learning_rate": 0.0002, "loss": 0.5524182915687561, "mean_token_accuracy": 0.7743232250213623, "num_tokens": 8410799.0, "step": 516 }, { "entropy": 0.5548212677240372, "epoch": 1.9308411214953272, "grad_norm": 0.0296107679605484, "learning_rate": 0.0002, "loss": 0.5498183965682983, "mean_token_accuracy": 0.7740313857793808, "num_tokens": 8427372.0, "step": 517 }, { "entropy": 0.5684213787317276, "epoch": 1.9345794392523366, "grad_norm": 0.03422481194138527, "learning_rate": 0.0002, "loss": 0.5559389591217041, "mean_token_accuracy": 0.7754881531000137, "num_tokens": 8443822.0, "step": 518 }, { "entropy": 0.5545912981033325, "epoch": 1.938317757009346, "grad_norm": 0.031684234738349915, "learning_rate": 0.0002, "loss": 0.5498573780059814, "mean_token_accuracy": 0.7783227860927582, "num_tokens": 8460032.0, "step": 519 }, { "entropy": 0.5595797300338745, "epoch": 1.9420560747663551, "grad_norm": 0.02719406597316265, "learning_rate": 0.0002, "loss": 0.5614221096038818, "mean_token_accuracy": 0.7715103030204773, "num_tokens": 8476297.0, "step": 520 }, { "entropy": 0.5345963835716248, "epoch": 1.9457943925233645, "grad_norm": 0.03023097850382328, "learning_rate": 0.0002, "loss": 0.5425735116004944, "mean_token_accuracy": 0.7805851995944977, "num_tokens": 8492637.0, "step": 521 }, { "entropy": 0.5391188263893127, "epoch": 1.9495327102803737, "grad_norm": 0.05476713180541992, "learning_rate": 0.0002, "loss": 0.5556075572967529, "mean_token_accuracy": 0.7749961167573929, "num_tokens": 8509129.0, "step": 522 }, { "entropy": 0.5553655624389648, "epoch": 1.953271028037383, "grad_norm": 0.03542236238718033, "learning_rate": 0.0002, "loss": 0.5655393004417419, "mean_token_accuracy": 0.7717009782791138, "num_tokens": 8525641.0, "step": 523 }, { "entropy": 0.5613285005092621, "epoch": 1.9570093457943925, "grad_norm": 0.06946822255849838, "learning_rate": 0.0002, "loss": 0.5717962384223938, "mean_token_accuracy": 0.7724136412143707, "num_tokens": 8542275.0, "step": 524 }, { "entropy": 0.5575561076402664, "epoch": 1.9607476635514018, "grad_norm": 0.03460278734564781, "learning_rate": 0.0002, "loss": 0.5417395830154419, "mean_token_accuracy": 0.7819567322731018, "num_tokens": 8558373.0, "step": 525 }, { "entropy": 0.5704021006822586, "epoch": 1.9644859813084112, "grad_norm": 0.030037706717848778, "learning_rate": 0.0002, "loss": 0.5573901534080505, "mean_token_accuracy": 0.7713392674922943, "num_tokens": 8574839.0, "step": 526 }, { "entropy": 0.5286285877227783, "epoch": 1.9682242990654206, "grad_norm": 0.032038215547800064, "learning_rate": 0.0002, "loss": 0.5231573581695557, "mean_token_accuracy": 0.7873097807168961, "num_tokens": 8591063.0, "step": 527 }, { "entropy": 0.535316064953804, "epoch": 1.97196261682243, "grad_norm": 0.04137961193919182, "learning_rate": 0.0002, "loss": 0.5491993427276611, "mean_token_accuracy": 0.7760031670331955, "num_tokens": 8607354.0, "step": 528 }, { "entropy": 0.5287620276212692, "epoch": 1.9757009345794394, "grad_norm": 0.03144775703549385, "learning_rate": 0.0002, "loss": 0.5313848853111267, "mean_token_accuracy": 0.784307450056076, "num_tokens": 8623542.0, "step": 529 }, { "entropy": 0.5521504878997803, "epoch": 1.9794392523364486, "grad_norm": 0.03497127816081047, "learning_rate": 0.0002, "loss": 0.5516395568847656, "mean_token_accuracy": 0.7736653387546539, "num_tokens": 8639626.0, "step": 530 }, { "entropy": 0.5580714792013168, "epoch": 1.983177570093458, "grad_norm": 0.030566083267331123, "learning_rate": 0.0002, "loss": 0.5535013675689697, "mean_token_accuracy": 0.7748955637216568, "num_tokens": 8655957.0, "step": 531 }, { "entropy": 0.5411636233329773, "epoch": 1.9869158878504671, "grad_norm": 0.03356699272990227, "learning_rate": 0.0002, "loss": 0.5376905202865601, "mean_token_accuracy": 0.7788012474775314, "num_tokens": 8672109.0, "step": 532 }, { "entropy": 0.5470294207334518, "epoch": 1.9906542056074765, "grad_norm": 0.0316782146692276, "learning_rate": 0.0002, "loss": 0.5445536971092224, "mean_token_accuracy": 0.7801567167043686, "num_tokens": 8688512.0, "step": 533 }, { "entropy": 0.5573801398277283, "epoch": 1.994392523364486, "grad_norm": 0.0308368057012558, "learning_rate": 0.0002, "loss": 0.5613093376159668, "mean_token_accuracy": 0.7755008339881897, "num_tokens": 8704882.0, "step": 534 }, { "entropy": 0.5606262683868408, "epoch": 1.9981308411214953, "grad_norm": 0.033759523183107376, "learning_rate": 0.0002, "loss": 0.5673450827598572, "mean_token_accuracy": 0.7693974524736404, "num_tokens": 8721476.0, "step": 535 }, { "entropy": 0.5470572412014008, "epoch": 2.0, "grad_norm": 0.045990657061338425, "learning_rate": 0.0002, "loss": 0.5525597333908081, "mean_token_accuracy": 0.7788615226745605, "num_tokens": 8729601.0, "step": 536 }, { "entropy": 0.5381215959787369, "epoch": 2.0037383177570094, "grad_norm": 0.03212118148803711, "learning_rate": 0.0002, "loss": 0.5325874090194702, "mean_token_accuracy": 0.7825482040643692, "num_tokens": 8745950.0, "step": 537 }, { "entropy": 0.5637937486171722, "epoch": 2.007476635514019, "grad_norm": 0.036541201174259186, "learning_rate": 0.0002, "loss": 0.5618294477462769, "mean_token_accuracy": 0.773602232336998, "num_tokens": 8762499.0, "step": 538 }, { "entropy": 0.5491923093795776, "epoch": 2.011214953271028, "grad_norm": 0.033549197018146515, "learning_rate": 0.0002, "loss": 0.548430323600769, "mean_token_accuracy": 0.7764875292778015, "num_tokens": 8778855.0, "step": 539 }, { "entropy": 0.5251094102859497, "epoch": 2.0149532710280376, "grad_norm": 0.036079153418540955, "learning_rate": 0.0002, "loss": 0.5315405130386353, "mean_token_accuracy": 0.7840714603662491, "num_tokens": 8794810.0, "step": 540 }, { "entropy": 0.5423221588134766, "epoch": 2.0186915887850465, "grad_norm": 0.03329861909151077, "learning_rate": 0.0002, "loss": 0.5420343279838562, "mean_token_accuracy": 0.7797907888889313, "num_tokens": 8811426.0, "step": 541 }, { "entropy": 0.5213563144207001, "epoch": 2.022429906542056, "grad_norm": 0.03049337863922119, "learning_rate": 0.0002, "loss": 0.5193029642105103, "mean_token_accuracy": 0.7878206521272659, "num_tokens": 8827505.0, "step": 542 }, { "entropy": 0.5485236346721649, "epoch": 2.0261682242990653, "grad_norm": 0.038072168827056885, "learning_rate": 0.0002, "loss": 0.5403975248336792, "mean_token_accuracy": 0.7787782251834869, "num_tokens": 8843789.0, "step": 543 }, { "entropy": 0.5497236847877502, "epoch": 2.0299065420560747, "grad_norm": 0.037746790796518326, "learning_rate": 0.0002, "loss": 0.5424782037734985, "mean_token_accuracy": 0.7821084409952164, "num_tokens": 8860524.0, "step": 544 }, { "entropy": 0.5128878131508827, "epoch": 2.033644859813084, "grad_norm": 0.03184136748313904, "learning_rate": 0.0002, "loss": 0.5119982957839966, "mean_token_accuracy": 0.7925940603017807, "num_tokens": 8876520.0, "step": 545 }, { "entropy": 0.53415547311306, "epoch": 2.0373831775700935, "grad_norm": 0.04230194166302681, "learning_rate": 0.0002, "loss": 0.5436858534812927, "mean_token_accuracy": 0.7798719555139542, "num_tokens": 8892800.0, "step": 546 }, { "entropy": 0.527920126914978, "epoch": 2.041121495327103, "grad_norm": 0.035794876515865326, "learning_rate": 0.0002, "loss": 0.537831723690033, "mean_token_accuracy": 0.7832628786563873, "num_tokens": 8908779.0, "step": 547 }, { "entropy": 0.528620719909668, "epoch": 2.0448598130841122, "grad_norm": 0.043260980397462845, "learning_rate": 0.0002, "loss": 0.5385839343070984, "mean_token_accuracy": 0.7800839692354202, "num_tokens": 8925225.0, "step": 548 }, { "entropy": 0.5344889611005783, "epoch": 2.0485981308411216, "grad_norm": 0.03616830334067345, "learning_rate": 0.0002, "loss": 0.5279685258865356, "mean_token_accuracy": 0.7877432852983475, "num_tokens": 8941370.0, "step": 549 }, { "entropy": 0.5505447387695312, "epoch": 2.052336448598131, "grad_norm": 0.03392447903752327, "learning_rate": 0.0002, "loss": 0.5464667081832886, "mean_token_accuracy": 0.778993234038353, "num_tokens": 8957759.0, "step": 550 }, { "entropy": 0.537495419383049, "epoch": 2.05607476635514, "grad_norm": 0.03487386927008629, "learning_rate": 0.0002, "loss": 0.5327776074409485, "mean_token_accuracy": 0.7819164842367172, "num_tokens": 8974120.0, "step": 551 }, { "entropy": 0.5181033089756966, "epoch": 2.0598130841121494, "grad_norm": 0.03655601665377617, "learning_rate": 0.0002, "loss": 0.5197772979736328, "mean_token_accuracy": 0.7876780480146408, "num_tokens": 8990084.0, "step": 552 }, { "entropy": 0.5097288861870766, "epoch": 2.0635514018691588, "grad_norm": 0.04094317555427551, "learning_rate": 0.0002, "loss": 0.5214163661003113, "mean_token_accuracy": 0.7877646237611771, "num_tokens": 9006115.0, "step": 553 }, { "entropy": 0.5392448753118515, "epoch": 2.067289719626168, "grad_norm": 0.042336490005254745, "learning_rate": 0.0002, "loss": 0.5487770438194275, "mean_token_accuracy": 0.7746841162443161, "num_tokens": 9022503.0, "step": 554 }, { "entropy": 0.5353204905986786, "epoch": 2.0710280373831775, "grad_norm": 0.04751956835389137, "learning_rate": 0.0002, "loss": 0.5423939824104309, "mean_token_accuracy": 0.7819565683603287, "num_tokens": 9038587.0, "step": 555 }, { "entropy": 0.5576211661100388, "epoch": 2.074766355140187, "grad_norm": 0.034248773008584976, "learning_rate": 0.0002, "loss": 0.5450438261032104, "mean_token_accuracy": 0.7806050181388855, "num_tokens": 9054978.0, "step": 556 }, { "entropy": 0.5164358094334602, "epoch": 2.0785046728971963, "grad_norm": 0.03642895817756653, "learning_rate": 0.0002, "loss": 0.5048035979270935, "mean_token_accuracy": 0.7946237772703171, "num_tokens": 9071189.0, "step": 557 }, { "entropy": 0.5479462146759033, "epoch": 2.0822429906542057, "grad_norm": 0.03524266555905342, "learning_rate": 0.0002, "loss": 0.5424850583076477, "mean_token_accuracy": 0.7782812714576721, "num_tokens": 9087453.0, "step": 558 }, { "entropy": 0.5207670480012894, "epoch": 2.085981308411215, "grad_norm": 0.04086553677916527, "learning_rate": 0.0002, "loss": 0.5275461673736572, "mean_token_accuracy": 0.7870053201913834, "num_tokens": 9103538.0, "step": 559 }, { "entropy": 0.5350566729903221, "epoch": 2.0897196261682245, "grad_norm": 0.036386121064424515, "learning_rate": 0.0002, "loss": 0.5380175113677979, "mean_token_accuracy": 0.7814048826694489, "num_tokens": 9119858.0, "step": 560 }, { "entropy": 0.5368697345256805, "epoch": 2.0934579439252334, "grad_norm": 0.039366140961647034, "learning_rate": 0.0002, "loss": 0.5444531440734863, "mean_token_accuracy": 0.7792541682720184, "num_tokens": 9136204.0, "step": 561 }, { "entropy": 0.5295629873871803, "epoch": 2.097196261682243, "grad_norm": 0.03559441864490509, "learning_rate": 0.0002, "loss": 0.5286230444908142, "mean_token_accuracy": 0.784547358751297, "num_tokens": 9152718.0, "step": 562 }, { "entropy": 0.5568843930959702, "epoch": 2.100934579439252, "grad_norm": 0.034528154879808426, "learning_rate": 0.0002, "loss": 0.5466718077659607, "mean_token_accuracy": 0.7782703340053558, "num_tokens": 9168840.0, "step": 563 }, { "entropy": 0.5514650642871857, "epoch": 2.1046728971962616, "grad_norm": 0.034620221704244614, "learning_rate": 0.0002, "loss": 0.5481366515159607, "mean_token_accuracy": 0.7774865627288818, "num_tokens": 9185012.0, "step": 564 }, { "entropy": 0.5468508899211884, "epoch": 2.108411214953271, "grad_norm": 0.038367778062820435, "learning_rate": 0.0002, "loss": 0.5465208888053894, "mean_token_accuracy": 0.7787877917289734, "num_tokens": 9201579.0, "step": 565 }, { "entropy": 0.5365718752145767, "epoch": 2.1121495327102804, "grad_norm": 0.033649299293756485, "learning_rate": 0.0002, "loss": 0.5394605398178101, "mean_token_accuracy": 0.7824818789958954, "num_tokens": 9217958.0, "step": 566 }, { "entropy": 0.5342001020908356, "epoch": 2.1158878504672898, "grad_norm": 0.04148790240287781, "learning_rate": 0.0002, "loss": 0.541080892086029, "mean_token_accuracy": 0.7807753682136536, "num_tokens": 9234182.0, "step": 567 }, { "entropy": 0.5269056260585785, "epoch": 2.119626168224299, "grad_norm": 0.031905628740787506, "learning_rate": 0.0002, "loss": 0.529283881187439, "mean_token_accuracy": 0.7837703377008438, "num_tokens": 9250712.0, "step": 568 }, { "entropy": 0.5335036367177963, "epoch": 2.1233644859813086, "grad_norm": 0.041321150958538055, "learning_rate": 0.0002, "loss": 0.5374078154563904, "mean_token_accuracy": 0.782123014330864, "num_tokens": 9266961.0, "step": 569 }, { "entropy": 0.5442205667495728, "epoch": 2.127102803738318, "grad_norm": 0.034318044781684875, "learning_rate": 0.0002, "loss": 0.5429351329803467, "mean_token_accuracy": 0.7788351625204086, "num_tokens": 9283528.0, "step": 570 }, { "entropy": 0.5432394444942474, "epoch": 2.130841121495327, "grad_norm": 0.047397077083587646, "learning_rate": 0.0002, "loss": 0.5424203276634216, "mean_token_accuracy": 0.7810939103364944, "num_tokens": 9299837.0, "step": 571 }, { "entropy": 0.5400207340717316, "epoch": 2.1345794392523363, "grad_norm": 0.03500756248831749, "learning_rate": 0.0002, "loss": 0.5377690196037292, "mean_token_accuracy": 0.783811166882515, "num_tokens": 9315959.0, "step": 572 }, { "entropy": 0.5296697020530701, "epoch": 2.1383177570093457, "grad_norm": 0.03790782764554024, "learning_rate": 0.0002, "loss": 0.5289957523345947, "mean_token_accuracy": 0.7867159694433212, "num_tokens": 9332370.0, "step": 573 }, { "entropy": 0.5078830569982529, "epoch": 2.142056074766355, "grad_norm": 0.045958928763866425, "learning_rate": 0.0002, "loss": 0.5104236006736755, "mean_token_accuracy": 0.7909017950296402, "num_tokens": 9348594.0, "step": 574 }, { "entropy": 0.5188925862312317, "epoch": 2.1457943925233645, "grad_norm": 0.03916464373469353, "learning_rate": 0.0002, "loss": 0.5316386818885803, "mean_token_accuracy": 0.7828120291233063, "num_tokens": 9365046.0, "step": 575 }, { "entropy": 0.5045325607061386, "epoch": 2.149532710280374, "grad_norm": 0.04434382542967796, "learning_rate": 0.0002, "loss": 0.5116738080978394, "mean_token_accuracy": 0.7905466854572296, "num_tokens": 9381007.0, "step": 576 }, { "entropy": 0.5541563183069229, "epoch": 2.1532710280373832, "grad_norm": 0.038000430911779404, "learning_rate": 0.0002, "loss": 0.5551270842552185, "mean_token_accuracy": 0.7762157022953033, "num_tokens": 9397394.0, "step": 577 }, { "entropy": 0.5460502356290817, "epoch": 2.1570093457943926, "grad_norm": 0.038676705211400986, "learning_rate": 0.0002, "loss": 0.5363121032714844, "mean_token_accuracy": 0.7802022695541382, "num_tokens": 9413810.0, "step": 578 }, { "entropy": 0.5573510080575943, "epoch": 2.160747663551402, "grad_norm": 0.03721381351351738, "learning_rate": 0.0002, "loss": 0.5444300174713135, "mean_token_accuracy": 0.7804805636405945, "num_tokens": 9430091.0, "step": 579 }, { "entropy": 0.5371396392583847, "epoch": 2.1644859813084114, "grad_norm": 0.04258019104599953, "learning_rate": 0.0002, "loss": 0.5351753234863281, "mean_token_accuracy": 0.7820869237184525, "num_tokens": 9446665.0, "step": 580 }, { "entropy": 0.5393694788217545, "epoch": 2.1682242990654204, "grad_norm": 0.0406467579305172, "learning_rate": 0.0002, "loss": 0.5430103540420532, "mean_token_accuracy": 0.7779065668582916, "num_tokens": 9463118.0, "step": 581 }, { "entropy": 0.5272447615861893, "epoch": 2.1719626168224297, "grad_norm": 0.04435638338327408, "learning_rate": 0.0002, "loss": 0.5354752540588379, "mean_token_accuracy": 0.7838975638151169, "num_tokens": 9479432.0, "step": 582 }, { "entropy": 0.5255759209394455, "epoch": 2.175700934579439, "grad_norm": 0.03574801981449127, "learning_rate": 0.0002, "loss": 0.531680703163147, "mean_token_accuracy": 0.7842760384082794, "num_tokens": 9495707.0, "step": 583 }, { "entropy": 0.5348410457372665, "epoch": 2.1794392523364485, "grad_norm": 0.03383009880781174, "learning_rate": 0.0002, "loss": 0.5284703373908997, "mean_token_accuracy": 0.7889558225870132, "num_tokens": 9512236.0, "step": 584 }, { "entropy": 0.5311737060546875, "epoch": 2.183177570093458, "grad_norm": 0.035349104553461075, "learning_rate": 0.0002, "loss": 0.5332157611846924, "mean_token_accuracy": 0.7814211249351501, "num_tokens": 9528589.0, "step": 585 }, { "entropy": 0.5255388617515564, "epoch": 2.1869158878504673, "grad_norm": 0.043005745857954025, "learning_rate": 0.0002, "loss": 0.5251577496528625, "mean_token_accuracy": 0.7884248644113541, "num_tokens": 9544965.0, "step": 586 }, { "entropy": 0.5347089469432831, "epoch": 2.1906542056074767, "grad_norm": 0.03752923756837845, "learning_rate": 0.0002, "loss": 0.5362472534179688, "mean_token_accuracy": 0.7811613231897354, "num_tokens": 9561276.0, "step": 587 }, { "entropy": 0.5310826078057289, "epoch": 2.194392523364486, "grad_norm": 0.05228811874985695, "learning_rate": 0.0002, "loss": 0.5329592227935791, "mean_token_accuracy": 0.7827970087528229, "num_tokens": 9577509.0, "step": 588 }, { "entropy": 0.5254483968019485, "epoch": 2.1981308411214955, "grad_norm": 0.03692999482154846, "learning_rate": 0.0002, "loss": 0.5311483144760132, "mean_token_accuracy": 0.7830882370471954, "num_tokens": 9593982.0, "step": 589 }, { "entropy": 0.5360620766878128, "epoch": 2.201869158878505, "grad_norm": 0.04609117656946182, "learning_rate": 0.0002, "loss": 0.5386216640472412, "mean_token_accuracy": 0.7802708595991135, "num_tokens": 9610311.0, "step": 590 }, { "entropy": 0.5463242679834366, "epoch": 2.205607476635514, "grad_norm": 0.03901510685682297, "learning_rate": 0.0002, "loss": 0.5447873473167419, "mean_token_accuracy": 0.7785727232694626, "num_tokens": 9626678.0, "step": 591 }, { "entropy": 0.5129301249980927, "epoch": 2.209345794392523, "grad_norm": 0.043117035180330276, "learning_rate": 0.0002, "loss": 0.5128067135810852, "mean_token_accuracy": 0.7911233007907867, "num_tokens": 9642843.0, "step": 592 }, { "entropy": 0.5312749594449997, "epoch": 2.2130841121495326, "grad_norm": 0.03675411641597748, "learning_rate": 0.0002, "loss": 0.5329593420028687, "mean_token_accuracy": 0.7832809239625931, "num_tokens": 9659218.0, "step": 593 }, { "entropy": 0.5422542840242386, "epoch": 2.216822429906542, "grad_norm": 0.036754533648490906, "learning_rate": 0.0002, "loss": 0.5398430824279785, "mean_token_accuracy": 0.7803453654050827, "num_tokens": 9675649.0, "step": 594 }, { "entropy": 0.5472271293401718, "epoch": 2.2205607476635514, "grad_norm": 0.043753694742918015, "learning_rate": 0.0002, "loss": 0.5421810150146484, "mean_token_accuracy": 0.7812557965517044, "num_tokens": 9691932.0, "step": 595 }, { "entropy": 0.5446718335151672, "epoch": 2.2242990654205608, "grad_norm": 0.0450102761387825, "learning_rate": 0.0002, "loss": 0.5450670719146729, "mean_token_accuracy": 0.7795027941465378, "num_tokens": 9708243.0, "step": 596 }, { "entropy": 0.5422708988189697, "epoch": 2.22803738317757, "grad_norm": 0.042899005115032196, "learning_rate": 0.0002, "loss": 0.5427168011665344, "mean_token_accuracy": 0.7769834697246552, "num_tokens": 9724620.0, "step": 597 }, { "entropy": 0.5316948816180229, "epoch": 2.2317757009345796, "grad_norm": 0.0438719242811203, "learning_rate": 0.0002, "loss": 0.5369054675102234, "mean_token_accuracy": 0.7818674147129059, "num_tokens": 9740813.0, "step": 598 }, { "entropy": 0.5353083610534668, "epoch": 2.235514018691589, "grad_norm": 0.045174483209848404, "learning_rate": 0.0002, "loss": 0.535564124584198, "mean_token_accuracy": 0.7826817184686661, "num_tokens": 9757081.0, "step": 599 }, { "entropy": 0.53409144282341, "epoch": 2.2392523364485983, "grad_norm": 0.046971406787633896, "learning_rate": 0.0002, "loss": 0.5388940572738647, "mean_token_accuracy": 0.7797097563743591, "num_tokens": 9773286.0, "step": 600 }, { "entropy": 0.5229181125760078, "epoch": 2.2429906542056073, "grad_norm": 0.04818117991089821, "learning_rate": 0.0002, "loss": 0.5283955931663513, "mean_token_accuracy": 0.7855319827795029, "num_tokens": 9789231.0, "step": 601 }, { "entropy": 0.5502548068761826, "epoch": 2.2467289719626167, "grad_norm": 0.041451770812273026, "learning_rate": 0.0002, "loss": 0.5441420078277588, "mean_token_accuracy": 0.7805446833372116, "num_tokens": 9805737.0, "step": 602 }, { "entropy": 0.5555277764797211, "epoch": 2.250467289719626, "grad_norm": 0.03888588771224022, "learning_rate": 0.0002, "loss": 0.5571548938751221, "mean_token_accuracy": 0.7741208076477051, "num_tokens": 9822370.0, "step": 603 }, { "entropy": 0.5331219285726547, "epoch": 2.2542056074766355, "grad_norm": 0.050726499408483505, "learning_rate": 0.0002, "loss": 0.5355172157287598, "mean_token_accuracy": 0.7803194671869278, "num_tokens": 9838846.0, "step": 604 }, { "entropy": 0.5391329601407051, "epoch": 2.257943925233645, "grad_norm": 0.03473533317446709, "learning_rate": 0.0002, "loss": 0.5380818843841553, "mean_token_accuracy": 0.7837731093168259, "num_tokens": 9855269.0, "step": 605 }, { "entropy": 0.5419459789991379, "epoch": 2.2616822429906542, "grad_norm": 0.04428257793188095, "learning_rate": 0.0002, "loss": 0.5402700304985046, "mean_token_accuracy": 0.7803330719470978, "num_tokens": 9871498.0, "step": 606 }, { "entropy": 0.5475794821977615, "epoch": 2.2654205607476636, "grad_norm": 0.03847254440188408, "learning_rate": 0.0002, "loss": 0.5443584322929382, "mean_token_accuracy": 0.7776888459920883, "num_tokens": 9887880.0, "step": 607 }, { "entropy": 0.5413693785667419, "epoch": 2.269158878504673, "grad_norm": 0.03769246116280556, "learning_rate": 0.0002, "loss": 0.5448262095451355, "mean_token_accuracy": 0.7788306772708893, "num_tokens": 9904482.0, "step": 608 }, { "entropy": 0.5233470648527145, "epoch": 2.2728971962616824, "grad_norm": 0.041845668107271194, "learning_rate": 0.0002, "loss": 0.5302014946937561, "mean_token_accuracy": 0.7834525555372238, "num_tokens": 9920720.0, "step": 609 }, { "entropy": 0.526485301554203, "epoch": 2.2766355140186914, "grad_norm": 0.04298217222094536, "learning_rate": 0.0002, "loss": 0.5376767516136169, "mean_token_accuracy": 0.7815933078527451, "num_tokens": 9936855.0, "step": 610 }, { "entropy": 0.5407330542802811, "epoch": 2.2803738317757007, "grad_norm": 0.03829406201839447, "learning_rate": 0.0002, "loss": 0.5375736951828003, "mean_token_accuracy": 0.7817153483629227, "num_tokens": 9953359.0, "step": 611 }, { "entropy": 0.557465985417366, "epoch": 2.28411214953271, "grad_norm": 0.0430569127202034, "learning_rate": 0.0002, "loss": 0.5485789775848389, "mean_token_accuracy": 0.7774669080972672, "num_tokens": 9969809.0, "step": 612 }, { "entropy": 0.5491045266389847, "epoch": 2.2878504672897195, "grad_norm": 0.04154661297798157, "learning_rate": 0.0002, "loss": 0.5452516078948975, "mean_token_accuracy": 0.7782464772462845, "num_tokens": 9986122.0, "step": 613 }, { "entropy": 0.5396340191364288, "epoch": 2.291588785046729, "grad_norm": 0.03867339715361595, "learning_rate": 0.0002, "loss": 0.5436422228813171, "mean_token_accuracy": 0.7793163359165192, "num_tokens": 10002373.0, "step": 614 }, { "entropy": 0.5227179303765297, "epoch": 2.2953271028037383, "grad_norm": 0.055158648639917374, "learning_rate": 0.0002, "loss": 0.5356475710868835, "mean_token_accuracy": 0.7828944474458694, "num_tokens": 10018532.0, "step": 615 }, { "entropy": 0.5101833418011665, "epoch": 2.2990654205607477, "grad_norm": 0.04139378294348717, "learning_rate": 0.0002, "loss": 0.5111054182052612, "mean_token_accuracy": 0.7948217988014221, "num_tokens": 10034449.0, "step": 616 }, { "entropy": 0.5332518517971039, "epoch": 2.302803738317757, "grad_norm": 0.042138371616601944, "learning_rate": 0.0002, "loss": 0.5291332602500916, "mean_token_accuracy": 0.7875723540782928, "num_tokens": 10050791.0, "step": 617 }, { "entropy": 0.5545465350151062, "epoch": 2.3065420560747665, "grad_norm": 0.04594315588474274, "learning_rate": 0.0002, "loss": 0.5547114610671997, "mean_token_accuracy": 0.7752625793218613, "num_tokens": 10067160.0, "step": 618 }, { "entropy": 0.538428008556366, "epoch": 2.310280373831776, "grad_norm": 0.038197144865989685, "learning_rate": 0.0002, "loss": 0.5356147885322571, "mean_token_accuracy": 0.7812609076499939, "num_tokens": 10083623.0, "step": 619 }, { "entropy": 0.515357218682766, "epoch": 2.3140186915887853, "grad_norm": 0.04305245727300644, "learning_rate": 0.0002, "loss": 0.5182097554206848, "mean_token_accuracy": 0.7897254973649979, "num_tokens": 10099734.0, "step": 620 }, { "entropy": 0.5176303833723068, "epoch": 2.317757009345794, "grad_norm": 0.040814559906721115, "learning_rate": 0.0002, "loss": 0.5241186618804932, "mean_token_accuracy": 0.7862492203712463, "num_tokens": 10115923.0, "step": 621 }, { "entropy": 0.5319753438234329, "epoch": 2.3214953271028036, "grad_norm": 0.038612622767686844, "learning_rate": 0.0002, "loss": 0.5332948565483093, "mean_token_accuracy": 0.7826831489801407, "num_tokens": 10132186.0, "step": 622 }, { "entropy": 0.5231878906488419, "epoch": 2.325233644859813, "grad_norm": 0.04399793595075607, "learning_rate": 0.0002, "loss": 0.5220815539360046, "mean_token_accuracy": 0.7883405387401581, "num_tokens": 10148176.0, "step": 623 }, { "entropy": 0.5503655076026917, "epoch": 2.3289719626168224, "grad_norm": 0.03310840204358101, "learning_rate": 0.0002, "loss": 0.5424314737319946, "mean_token_accuracy": 0.7791298031806946, "num_tokens": 10164602.0, "step": 624 }, { "entropy": 0.5562791079282761, "epoch": 2.3327102803738318, "grad_norm": 0.046219419687986374, "learning_rate": 0.0002, "loss": 0.5487840175628662, "mean_token_accuracy": 0.7803521156311035, "num_tokens": 10180910.0, "step": 625 }, { "entropy": 0.536386102437973, "epoch": 2.336448598130841, "grad_norm": 0.038521721959114075, "learning_rate": 0.0002, "loss": 0.5320638418197632, "mean_token_accuracy": 0.7856791615486145, "num_tokens": 10197138.0, "step": 626 }, { "entropy": 0.5220321416854858, "epoch": 2.3401869158878505, "grad_norm": 0.046215180307626724, "learning_rate": 0.0002, "loss": 0.5289742946624756, "mean_token_accuracy": 0.784678503870964, "num_tokens": 10213246.0, "step": 627 }, { "entropy": 0.5178990513086319, "epoch": 2.34392523364486, "grad_norm": 0.04778464511036873, "learning_rate": 0.0002, "loss": 0.522329568862915, "mean_token_accuracy": 0.7881183475255966, "num_tokens": 10229431.0, "step": 628 }, { "entropy": 0.5353438407182693, "epoch": 2.3476635514018693, "grad_norm": 0.04080234467983246, "learning_rate": 0.0002, "loss": 0.5433787107467651, "mean_token_accuracy": 0.7780589759349823, "num_tokens": 10245684.0, "step": 629 }, { "entropy": 0.5368916243314743, "epoch": 2.3514018691588783, "grad_norm": 0.043697554618120193, "learning_rate": 0.0002, "loss": 0.541444718837738, "mean_token_accuracy": 0.7807413637638092, "num_tokens": 10262210.0, "step": 630 }, { "entropy": 0.5506647378206253, "epoch": 2.3551401869158877, "grad_norm": 0.038478951901197433, "learning_rate": 0.0002, "loss": 0.5461610555648804, "mean_token_accuracy": 0.7788456082344055, "num_tokens": 10278611.0, "step": 631 }, { "entropy": 0.5395764261484146, "epoch": 2.358878504672897, "grad_norm": 0.03904217854142189, "learning_rate": 0.0002, "loss": 0.5317508578300476, "mean_token_accuracy": 0.7833081781864166, "num_tokens": 10294800.0, "step": 632 }, { "entropy": 0.5478651374578476, "epoch": 2.3626168224299064, "grad_norm": 0.048824410885572433, "learning_rate": 0.0002, "loss": 0.5395293831825256, "mean_token_accuracy": 0.783235713839531, "num_tokens": 10311090.0, "step": 633 }, { "entropy": 0.5332029610872269, "epoch": 2.366355140186916, "grad_norm": 0.04313044250011444, "learning_rate": 0.0002, "loss": 0.5401085615158081, "mean_token_accuracy": 0.778812825679779, "num_tokens": 10327348.0, "step": 634 }, { "entropy": 0.5406146496534348, "epoch": 2.3700934579439252, "grad_norm": 0.04600725322961807, "learning_rate": 0.0002, "loss": 0.5516705513000488, "mean_token_accuracy": 0.7761097699403763, "num_tokens": 10343800.0, "step": 635 }, { "entropy": 0.5261052846908569, "epoch": 2.3738317757009346, "grad_norm": 0.045134712010622025, "learning_rate": 0.0002, "loss": 0.5412300825119019, "mean_token_accuracy": 0.7802619636058807, "num_tokens": 10360082.0, "step": 636 }, { "entropy": 0.5589279979467392, "epoch": 2.377570093457944, "grad_norm": 0.041725922375917435, "learning_rate": 0.0002, "loss": 0.5517748594284058, "mean_token_accuracy": 0.778441995382309, "num_tokens": 10376345.0, "step": 637 }, { "entropy": 0.5504082888364792, "epoch": 2.3813084112149534, "grad_norm": 0.03725145012140274, "learning_rate": 0.0002, "loss": 0.5404931306838989, "mean_token_accuracy": 0.7776447534561157, "num_tokens": 10392870.0, "step": 638 }, { "entropy": 0.5359382033348083, "epoch": 2.385046728971963, "grad_norm": 0.0364760085940361, "learning_rate": 0.0002, "loss": 0.533162534236908, "mean_token_accuracy": 0.7851890027523041, "num_tokens": 10409256.0, "step": 639 }, { "entropy": 0.5336398631334305, "epoch": 2.388785046728972, "grad_norm": 0.036078356206417084, "learning_rate": 0.0002, "loss": 0.5374175906181335, "mean_token_accuracy": 0.7814856320619583, "num_tokens": 10425831.0, "step": 640 }, { "entropy": 0.5284569710493088, "epoch": 2.392523364485981, "grad_norm": 0.04704172909259796, "learning_rate": 0.0002, "loss": 0.5387214422225952, "mean_token_accuracy": 0.7815752625465393, "num_tokens": 10442382.0, "step": 641 }, { "entropy": 0.5344073623418808, "epoch": 2.3962616822429905, "grad_norm": 0.0398792028427124, "learning_rate": 0.0002, "loss": 0.5398225784301758, "mean_token_accuracy": 0.7818136066198349, "num_tokens": 10458810.0, "step": 642 }, { "entropy": 0.5323895663022995, "epoch": 2.4, "grad_norm": 0.037454817444086075, "learning_rate": 0.0002, "loss": 0.5368887782096863, "mean_token_accuracy": 0.7800801247358322, "num_tokens": 10474692.0, "step": 643 }, { "entropy": 0.5394662618637085, "epoch": 2.4037383177570093, "grad_norm": 0.03576047718524933, "learning_rate": 0.0002, "loss": 0.5351858735084534, "mean_token_accuracy": 0.7815855145454407, "num_tokens": 10491015.0, "step": 644 }, { "entropy": 0.547369509935379, "epoch": 2.4074766355140187, "grad_norm": 0.0398087315261364, "learning_rate": 0.0002, "loss": 0.5397285223007202, "mean_token_accuracy": 0.7805114239454269, "num_tokens": 10507366.0, "step": 645 }, { "entropy": 0.5508280843496323, "epoch": 2.411214953271028, "grad_norm": 0.03709566593170166, "learning_rate": 0.0002, "loss": 0.5448777675628662, "mean_token_accuracy": 0.7763405591249466, "num_tokens": 10523374.0, "step": 646 }, { "entropy": 0.5248509049415588, "epoch": 2.4149532710280375, "grad_norm": 0.03418833017349243, "learning_rate": 0.0002, "loss": 0.5208706855773926, "mean_token_accuracy": 0.7874817848205566, "num_tokens": 10539624.0, "step": 647 }, { "entropy": 0.5466809421777725, "epoch": 2.418691588785047, "grad_norm": 0.039764732122421265, "learning_rate": 0.0002, "loss": 0.5513855218887329, "mean_token_accuracy": 0.776073694229126, "num_tokens": 10556212.0, "step": 648 }, { "entropy": 0.5117013603448868, "epoch": 2.4224299065420563, "grad_norm": 0.04086057096719742, "learning_rate": 0.0002, "loss": 0.5219972729682922, "mean_token_accuracy": 0.7889275252819061, "num_tokens": 10572323.0, "step": 649 }, { "entropy": 0.5393745452165604, "epoch": 2.426168224299065, "grad_norm": 0.037193622440099716, "learning_rate": 0.0002, "loss": 0.5456075668334961, "mean_token_accuracy": 0.7753270417451859, "num_tokens": 10588533.0, "step": 650 }, { "entropy": 0.5517471730709076, "epoch": 2.4299065420560746, "grad_norm": 0.04061353579163551, "learning_rate": 0.0002, "loss": 0.5480504035949707, "mean_token_accuracy": 0.7777185589075089, "num_tokens": 10604736.0, "step": 651 }, { "entropy": 0.5332285165786743, "epoch": 2.433644859813084, "grad_norm": 0.037262339144945145, "learning_rate": 0.0002, "loss": 0.52723628282547, "mean_token_accuracy": 0.7820963263511658, "num_tokens": 10621005.0, "step": 652 }, { "entropy": 0.5427125096321106, "epoch": 2.4373831775700934, "grad_norm": 0.038290560245513916, "learning_rate": 0.0002, "loss": 0.5433245897293091, "mean_token_accuracy": 0.7764440774917603, "num_tokens": 10637274.0, "step": 653 }, { "entropy": 0.515294149518013, "epoch": 2.4411214953271028, "grad_norm": 0.07859813421964645, "learning_rate": 0.0002, "loss": 0.5192139744758606, "mean_token_accuracy": 0.7903406471014023, "num_tokens": 10653571.0, "step": 654 }, { "entropy": 0.5411062091588974, "epoch": 2.444859813084112, "grad_norm": 0.04054918885231018, "learning_rate": 0.0002, "loss": 0.5439664721488953, "mean_token_accuracy": 0.7815183401107788, "num_tokens": 10670139.0, "step": 655 }, { "entropy": 0.5487605780363083, "epoch": 2.4485981308411215, "grad_norm": 0.04026317596435547, "learning_rate": 0.0002, "loss": 0.5495845675468445, "mean_token_accuracy": 0.7765460163354874, "num_tokens": 10686846.0, "step": 656 }, { "entropy": 0.5351516157388687, "epoch": 2.452336448598131, "grad_norm": 0.040862392634153366, "learning_rate": 0.0002, "loss": 0.5336912870407104, "mean_token_accuracy": 0.7818685173988342, "num_tokens": 10703200.0, "step": 657 }, { "entropy": 0.5463723838329315, "epoch": 2.4560747663551403, "grad_norm": 0.03873393312096596, "learning_rate": 0.0002, "loss": 0.5465680360794067, "mean_token_accuracy": 0.7760122418403625, "num_tokens": 10719561.0, "step": 658 }, { "entropy": 0.5416133552789688, "epoch": 2.4598130841121497, "grad_norm": 0.044795434921979904, "learning_rate": 0.0002, "loss": 0.5411824584007263, "mean_token_accuracy": 0.7804904133081436, "num_tokens": 10735767.0, "step": 659 }, { "entropy": 0.5494029372930527, "epoch": 2.463551401869159, "grad_norm": 0.04379895702004433, "learning_rate": 0.0002, "loss": 0.5456870198249817, "mean_token_accuracy": 0.7755402028560638, "num_tokens": 10751886.0, "step": 660 }, { "entropy": 0.5367189347743988, "epoch": 2.467289719626168, "grad_norm": 0.03852448984980583, "learning_rate": 0.0002, "loss": 0.5393000841140747, "mean_token_accuracy": 0.7800532579421997, "num_tokens": 10768210.0, "step": 661 }, { "entropy": 0.5270116031169891, "epoch": 2.4710280373831774, "grad_norm": 0.03792192041873932, "learning_rate": 0.0002, "loss": 0.5289605259895325, "mean_token_accuracy": 0.7838020473718643, "num_tokens": 10784434.0, "step": 662 }, { "entropy": 0.5338448286056519, "epoch": 2.474766355140187, "grad_norm": 0.0350453220307827, "learning_rate": 0.0002, "loss": 0.5380920767784119, "mean_token_accuracy": 0.7818057388067245, "num_tokens": 10800619.0, "step": 663 }, { "entropy": 0.5228566378355026, "epoch": 2.4785046728971962, "grad_norm": 0.046152058988809586, "learning_rate": 0.0002, "loss": 0.5300622582435608, "mean_token_accuracy": 0.7793385684490204, "num_tokens": 10816801.0, "step": 664 }, { "entropy": 0.5290849655866623, "epoch": 2.4822429906542056, "grad_norm": 0.03659910336136818, "learning_rate": 0.0002, "loss": 0.5329374074935913, "mean_token_accuracy": 0.7838267683982849, "num_tokens": 10833095.0, "step": 665 }, { "entropy": 0.545561358332634, "epoch": 2.485981308411215, "grad_norm": 0.04097100347280502, "learning_rate": 0.0002, "loss": 0.5479649901390076, "mean_token_accuracy": 0.7784263789653778, "num_tokens": 10849473.0, "step": 666 }, { "entropy": 0.5502291470766068, "epoch": 2.4897196261682244, "grad_norm": 0.04253846034407616, "learning_rate": 0.0002, "loss": 0.5466883182525635, "mean_token_accuracy": 0.7778628617525101, "num_tokens": 10865837.0, "step": 667 }, { "entropy": 0.5474338084459305, "epoch": 2.493457943925234, "grad_norm": 0.037734732031822205, "learning_rate": 0.0002, "loss": 0.5415964126586914, "mean_token_accuracy": 0.7777974009513855, "num_tokens": 10882273.0, "step": 668 }, { "entropy": 0.5401993542909622, "epoch": 2.497196261682243, "grad_norm": 0.039542876183986664, "learning_rate": 0.0002, "loss": 0.5339391231536865, "mean_token_accuracy": 0.784349262714386, "num_tokens": 10898780.0, "step": 669 }, { "entropy": 0.5420306771993637, "epoch": 2.500934579439252, "grad_norm": 0.049927666783332825, "learning_rate": 0.0002, "loss": 0.5389054417610168, "mean_token_accuracy": 0.7841761559247971, "num_tokens": 10915059.0, "step": 670 }, { "entropy": 0.5333422720432281, "epoch": 2.5046728971962615, "grad_norm": 0.042702775448560715, "learning_rate": 0.0002, "loss": 0.5403023958206177, "mean_token_accuracy": 0.7792320251464844, "num_tokens": 10931718.0, "step": 671 }, { "entropy": 0.5289912968873978, "epoch": 2.508411214953271, "grad_norm": 0.050530027598142624, "learning_rate": 0.0002, "loss": 0.5404794216156006, "mean_token_accuracy": 0.7815851122140884, "num_tokens": 10948084.0, "step": 672 }, { "entropy": 0.5341697633266449, "epoch": 2.5121495327102803, "grad_norm": 0.04310121387243271, "learning_rate": 0.0002, "loss": 0.5389139652252197, "mean_token_accuracy": 0.778786912560463, "num_tokens": 10964373.0, "step": 673 }, { "entropy": 0.5569636076688766, "epoch": 2.5158878504672897, "grad_norm": 0.03820215165615082, "learning_rate": 0.0002, "loss": 0.5578426122665405, "mean_token_accuracy": 0.7730483710765839, "num_tokens": 10980732.0, "step": 674 }, { "entropy": 0.5347766578197479, "epoch": 2.519626168224299, "grad_norm": 0.04349920526146889, "learning_rate": 0.0002, "loss": 0.5336275100708008, "mean_token_accuracy": 0.7815207839012146, "num_tokens": 10997005.0, "step": 675 }, { "entropy": 0.5299794673919678, "epoch": 2.5233644859813085, "grad_norm": 0.04003509134054184, "learning_rate": 0.0002, "loss": 0.5294742584228516, "mean_token_accuracy": 0.7869250029325485, "num_tokens": 11013055.0, "step": 676 }, { "entropy": 0.5352783799171448, "epoch": 2.527102803738318, "grad_norm": 0.054121218621730804, "learning_rate": 0.0002, "loss": 0.5448738932609558, "mean_token_accuracy": 0.7791888117790222, "num_tokens": 11029266.0, "step": 677 }, { "entropy": 0.5354646146297455, "epoch": 2.5308411214953273, "grad_norm": 0.03573855757713318, "learning_rate": 0.0002, "loss": 0.5352723002433777, "mean_token_accuracy": 0.7825258523225784, "num_tokens": 11045806.0, "step": 678 }, { "entropy": 0.556391716003418, "epoch": 2.5345794392523366, "grad_norm": 0.04871753975749016, "learning_rate": 0.0002, "loss": 0.5602859258651733, "mean_token_accuracy": 0.7722157090902328, "num_tokens": 11062035.0, "step": 679 }, { "entropy": 0.5508870929479599, "epoch": 2.538317757009346, "grad_norm": 0.03932088986039162, "learning_rate": 0.0002, "loss": 0.5469393730163574, "mean_token_accuracy": 0.7782620638608932, "num_tokens": 11078375.0, "step": 680 }, { "entropy": 0.5481788516044617, "epoch": 2.542056074766355, "grad_norm": 0.04463294520974159, "learning_rate": 0.0002, "loss": 0.5469505190849304, "mean_token_accuracy": 0.7766976356506348, "num_tokens": 11094977.0, "step": 681 }, { "entropy": 0.5154567137360573, "epoch": 2.5457943925233644, "grad_norm": 0.044517725706100464, "learning_rate": 0.0002, "loss": 0.5210436582565308, "mean_token_accuracy": 0.7881979048252106, "num_tokens": 11110907.0, "step": 682 }, { "entropy": 0.5250661969184875, "epoch": 2.5495327102803738, "grad_norm": 0.03574059158563614, "learning_rate": 0.0002, "loss": 0.5239285826683044, "mean_token_accuracy": 0.7901371419429779, "num_tokens": 11127432.0, "step": 683 }, { "entropy": 0.541177287697792, "epoch": 2.553271028037383, "grad_norm": 0.03583724424242973, "learning_rate": 0.0002, "loss": 0.5399287343025208, "mean_token_accuracy": 0.7795550227165222, "num_tokens": 11143788.0, "step": 684 }, { "entropy": 0.5319067388772964, "epoch": 2.5570093457943925, "grad_norm": 0.038700610399246216, "learning_rate": 0.0002, "loss": 0.5372647047042847, "mean_token_accuracy": 0.7816288769245148, "num_tokens": 11160145.0, "step": 685 }, { "entropy": 0.5243031531572342, "epoch": 2.560747663551402, "grad_norm": 0.0457780659198761, "learning_rate": 0.0002, "loss": 0.5248138308525085, "mean_token_accuracy": 0.7840212136507034, "num_tokens": 11176075.0, "step": 686 }, { "entropy": 0.5483701825141907, "epoch": 2.5644859813084113, "grad_norm": 0.0399782694876194, "learning_rate": 0.0002, "loss": 0.5485758185386658, "mean_token_accuracy": 0.7779590934514999, "num_tokens": 11192293.0, "step": 687 }, { "entropy": 0.5290739685297012, "epoch": 2.5682242990654207, "grad_norm": 0.056546278297901154, "learning_rate": 0.0002, "loss": 0.5325236320495605, "mean_token_accuracy": 0.7835103422403336, "num_tokens": 11208542.0, "step": 688 }, { "entropy": 0.5161010921001434, "epoch": 2.5719626168224297, "grad_norm": 0.042589396238327026, "learning_rate": 0.0002, "loss": 0.5185222625732422, "mean_token_accuracy": 0.7873405963182449, "num_tokens": 11224578.0, "step": 689 }, { "entropy": 0.5410270541906357, "epoch": 2.575700934579439, "grad_norm": 0.05106229707598686, "learning_rate": 0.0002, "loss": 0.5452054142951965, "mean_token_accuracy": 0.7787328362464905, "num_tokens": 11240887.0, "step": 690 }, { "entropy": 0.5375277251005173, "epoch": 2.5794392523364484, "grad_norm": 0.03891480341553688, "learning_rate": 0.0002, "loss": 0.5347110033035278, "mean_token_accuracy": 0.7833239287137985, "num_tokens": 11256921.0, "step": 691 }, { "entropy": 0.5428935289382935, "epoch": 2.583177570093458, "grad_norm": 0.04642964154481888, "learning_rate": 0.0002, "loss": 0.5380253195762634, "mean_token_accuracy": 0.7818872332572937, "num_tokens": 11273253.0, "step": 692 }, { "entropy": 0.5503559708595276, "epoch": 2.586915887850467, "grad_norm": 0.04631572589278221, "learning_rate": 0.0002, "loss": 0.5499509572982788, "mean_token_accuracy": 0.7778131514787674, "num_tokens": 11289524.0, "step": 693 }, { "entropy": 0.5296535789966583, "epoch": 2.5906542056074766, "grad_norm": 0.04232152923941612, "learning_rate": 0.0002, "loss": 0.5292780995368958, "mean_token_accuracy": 0.7848498374223709, "num_tokens": 11305878.0, "step": 694 }, { "entropy": 0.5324369296431541, "epoch": 2.594392523364486, "grad_norm": 0.04305447265505791, "learning_rate": 0.0002, "loss": 0.5328658223152161, "mean_token_accuracy": 0.7839655876159668, "num_tokens": 11322266.0, "step": 695 }, { "entropy": 0.5353843569755554, "epoch": 2.5981308411214954, "grad_norm": 0.04098288714885712, "learning_rate": 0.0002, "loss": 0.5361748933792114, "mean_token_accuracy": 0.7821073234081268, "num_tokens": 11338684.0, "step": 696 }, { "entropy": 0.5268280059099197, "epoch": 2.601869158878505, "grad_norm": 0.05113406851887703, "learning_rate": 0.0002, "loss": 0.5360528230667114, "mean_token_accuracy": 0.7813736945390701, "num_tokens": 11354924.0, "step": 697 }, { "entropy": 0.5334519147872925, "epoch": 2.605607476635514, "grad_norm": 0.036048226058483124, "learning_rate": 0.0002, "loss": 0.5367494225502014, "mean_token_accuracy": 0.782368615269661, "num_tokens": 11371138.0, "step": 698 }, { "entropy": 0.5625623911619186, "epoch": 2.6093457943925236, "grad_norm": 0.04338160157203674, "learning_rate": 0.0002, "loss": 0.5562830567359924, "mean_token_accuracy": 0.7749900668859482, "num_tokens": 11387674.0, "step": 699 }, { "entropy": 0.5387382507324219, "epoch": 2.613084112149533, "grad_norm": 0.04549875482916832, "learning_rate": 0.0002, "loss": 0.5360974073410034, "mean_token_accuracy": 0.781986802816391, "num_tokens": 11403934.0, "step": 700 }, { "entropy": 0.5418427735567093, "epoch": 2.616822429906542, "grad_norm": 0.04425078630447388, "learning_rate": 0.0002, "loss": 0.5500712990760803, "mean_token_accuracy": 0.7762207537889481, "num_tokens": 11420207.0, "step": 701 }, { "entropy": 0.5345925241708755, "epoch": 2.6205607476635513, "grad_norm": 0.0503389798104763, "learning_rate": 0.0002, "loss": 0.5410506129264832, "mean_token_accuracy": 0.7824158221483231, "num_tokens": 11436366.0, "step": 702 }, { "entropy": 0.5293083861470222, "epoch": 2.6242990654205607, "grad_norm": 0.03849806264042854, "learning_rate": 0.0002, "loss": 0.5313189625740051, "mean_token_accuracy": 0.7851823717355728, "num_tokens": 11452692.0, "step": 703 }, { "entropy": 0.5381535738706589, "epoch": 2.62803738317757, "grad_norm": 0.04830117151141167, "learning_rate": 0.0002, "loss": 0.5306882262229919, "mean_token_accuracy": 0.7875523120164871, "num_tokens": 11468948.0, "step": 704 }, { "entropy": 0.5537677556276321, "epoch": 2.6317757009345795, "grad_norm": 0.03648355230689049, "learning_rate": 0.0002, "loss": 0.549413800239563, "mean_token_accuracy": 0.7742456942796707, "num_tokens": 11485304.0, "step": 705 }, { "entropy": 0.5376065969467163, "epoch": 2.635514018691589, "grad_norm": 0.03775647282600403, "learning_rate": 0.0002, "loss": 0.5347313284873962, "mean_token_accuracy": 0.7820166647434235, "num_tokens": 11501515.0, "step": 706 }, { "entropy": 0.5389592945575714, "epoch": 2.6392523364485982, "grad_norm": 0.03849456459283829, "learning_rate": 0.0002, "loss": 0.542040228843689, "mean_token_accuracy": 0.7777668088674545, "num_tokens": 11517823.0, "step": 707 }, { "entropy": 0.5297961235046387, "epoch": 2.6429906542056076, "grad_norm": 0.03884672373533249, "learning_rate": 0.0002, "loss": 0.5295203924179077, "mean_token_accuracy": 0.7848687022924423, "num_tokens": 11534089.0, "step": 708 }, { "entropy": 0.5374749451875687, "epoch": 2.6467289719626166, "grad_norm": 0.040985025465488434, "learning_rate": 0.0002, "loss": 0.5486632585525513, "mean_token_accuracy": 0.7780227363109589, "num_tokens": 11550404.0, "step": 709 }, { "entropy": 0.5216163545846939, "epoch": 2.650467289719626, "grad_norm": 0.041445303708314896, "learning_rate": 0.0002, "loss": 0.5271479487419128, "mean_token_accuracy": 0.7851904779672623, "num_tokens": 11566700.0, "step": 710 }, { "entropy": 0.548863023519516, "epoch": 2.6542056074766354, "grad_norm": 0.03768117353320122, "learning_rate": 0.0002, "loss": 0.5421991944313049, "mean_token_accuracy": 0.7786275446414948, "num_tokens": 11583296.0, "step": 711 }, { "entropy": 0.5540084540843964, "epoch": 2.6579439252336448, "grad_norm": 0.03594231605529785, "learning_rate": 0.0002, "loss": 0.5558887720108032, "mean_token_accuracy": 0.775081142783165, "num_tokens": 11599637.0, "step": 712 }, { "entropy": 0.528472974896431, "epoch": 2.661682242990654, "grad_norm": 0.03718520700931549, "learning_rate": 0.0002, "loss": 0.5246076583862305, "mean_token_accuracy": 0.7852199673652649, "num_tokens": 11615767.0, "step": 713 }, { "entropy": 0.546594500541687, "epoch": 2.6654205607476635, "grad_norm": 0.042944129556417465, "learning_rate": 0.0002, "loss": 0.5401133298873901, "mean_token_accuracy": 0.7802519649267197, "num_tokens": 11632056.0, "step": 714 }, { "entropy": 0.5382472574710846, "epoch": 2.669158878504673, "grad_norm": 0.04242360591888428, "learning_rate": 0.0002, "loss": 0.5468363761901855, "mean_token_accuracy": 0.7763016223907471, "num_tokens": 11648587.0, "step": 715 }, { "entropy": 0.5384316891431808, "epoch": 2.6728971962616823, "grad_norm": 0.04231888800859451, "learning_rate": 0.0002, "loss": 0.5447696447372437, "mean_token_accuracy": 0.7771705389022827, "num_tokens": 11665216.0, "step": 716 }, { "entropy": 0.536566972732544, "epoch": 2.6766355140186917, "grad_norm": 0.051330000162124634, "learning_rate": 0.0002, "loss": 0.5337138175964355, "mean_token_accuracy": 0.7841814905405045, "num_tokens": 11681565.0, "step": 717 }, { "entropy": 0.5605298280715942, "epoch": 2.680373831775701, "grad_norm": 0.04393962025642395, "learning_rate": 0.0002, "loss": 0.5522550344467163, "mean_token_accuracy": 0.7745645940303802, "num_tokens": 11697734.0, "step": 718 }, { "entropy": 0.5421400368213654, "epoch": 2.6841121495327105, "grad_norm": 0.04087737947702408, "learning_rate": 0.0002, "loss": 0.5356095433235168, "mean_token_accuracy": 0.7823581695556641, "num_tokens": 11714256.0, "step": 719 }, { "entropy": 0.5455932766199112, "epoch": 2.68785046728972, "grad_norm": 0.04586983844637871, "learning_rate": 0.0002, "loss": 0.5500515699386597, "mean_token_accuracy": 0.7770348936319351, "num_tokens": 11730670.0, "step": 720 }, { "entropy": 0.521054208278656, "epoch": 2.691588785046729, "grad_norm": 0.04511021822690964, "learning_rate": 0.0002, "loss": 0.5274732112884521, "mean_token_accuracy": 0.7863785922527313, "num_tokens": 11747011.0, "step": 721 }, { "entropy": 0.5369152277708054, "epoch": 2.695327102803738, "grad_norm": 0.04111414775252342, "learning_rate": 0.0002, "loss": 0.5466327667236328, "mean_token_accuracy": 0.7800845950841904, "num_tokens": 11763325.0, "step": 722 }, { "entropy": 0.5467284768819809, "epoch": 2.6990654205607476, "grad_norm": 0.04847726225852966, "learning_rate": 0.0002, "loss": 0.5574571490287781, "mean_token_accuracy": 0.7709622234106064, "num_tokens": 11779629.0, "step": 723 }, { "entropy": 0.556825578212738, "epoch": 2.702803738317757, "grad_norm": 0.04135042428970337, "learning_rate": 0.0002, "loss": 0.5567163228988647, "mean_token_accuracy": 0.773699164390564, "num_tokens": 11795735.0, "step": 724 }, { "entropy": 0.5429602861404419, "epoch": 2.7065420560747664, "grad_norm": 0.0402897410094738, "learning_rate": 0.0002, "loss": 0.5313383936882019, "mean_token_accuracy": 0.7854284048080444, "num_tokens": 11812127.0, "step": 725 }, { "entropy": 0.5411138385534286, "epoch": 2.710280373831776, "grad_norm": 0.04476531967520714, "learning_rate": 0.0002, "loss": 0.5395961403846741, "mean_token_accuracy": 0.7811660319566727, "num_tokens": 11828424.0, "step": 726 }, { "entropy": 0.5500029474496841, "epoch": 2.714018691588785, "grad_norm": 0.03904065489768982, "learning_rate": 0.0002, "loss": 0.5481054186820984, "mean_token_accuracy": 0.7797027230262756, "num_tokens": 11844904.0, "step": 727 }, { "entropy": 0.5594752728939056, "epoch": 2.717757009345794, "grad_norm": 0.04920347407460213, "learning_rate": 0.0002, "loss": 0.5654065012931824, "mean_token_accuracy": 0.7703305035829544, "num_tokens": 11861341.0, "step": 728 }, { "entropy": 0.5409399420022964, "epoch": 2.7214953271028035, "grad_norm": 0.04093843698501587, "learning_rate": 0.0002, "loss": 0.5432956218719482, "mean_token_accuracy": 0.7790299355983734, "num_tokens": 11877689.0, "step": 729 }, { "entropy": 0.5429576933383942, "epoch": 2.725233644859813, "grad_norm": 0.049346111714839935, "learning_rate": 0.0002, "loss": 0.55011385679245, "mean_token_accuracy": 0.77861687541008, "num_tokens": 11893814.0, "step": 730 }, { "entropy": 0.5407661944627762, "epoch": 2.7289719626168223, "grad_norm": 0.0420721061527729, "learning_rate": 0.0002, "loss": 0.5426504015922546, "mean_token_accuracy": 0.7803787589073181, "num_tokens": 11910096.0, "step": 731 }, { "entropy": 0.5468227863311768, "epoch": 2.7327102803738317, "grad_norm": 0.0373503714799881, "learning_rate": 0.0002, "loss": 0.5417306423187256, "mean_token_accuracy": 0.782159686088562, "num_tokens": 11926285.0, "step": 732 }, { "entropy": 0.5427874177694321, "epoch": 2.736448598130841, "grad_norm": 0.041012153029441833, "learning_rate": 0.0002, "loss": 0.5334447622299194, "mean_token_accuracy": 0.7827651649713516, "num_tokens": 11942656.0, "step": 733 }, { "entropy": 0.5550535768270493, "epoch": 2.7401869158878505, "grad_norm": 0.03842266649007797, "learning_rate": 0.0002, "loss": 0.5497796535491943, "mean_token_accuracy": 0.7729970514774323, "num_tokens": 11959059.0, "step": 734 }, { "entropy": 0.5359070003032684, "epoch": 2.74392523364486, "grad_norm": 0.039268966764211655, "learning_rate": 0.0002, "loss": 0.5411967039108276, "mean_token_accuracy": 0.7831978797912598, "num_tokens": 11975265.0, "step": 735 }, { "entropy": 0.5536347031593323, "epoch": 2.7476635514018692, "grad_norm": 0.045411862432956696, "learning_rate": 0.0002, "loss": 0.5618187189102173, "mean_token_accuracy": 0.7741181403398514, "num_tokens": 11991498.0, "step": 736 }, { "entropy": 0.5233520418405533, "epoch": 2.7514018691588786, "grad_norm": 0.040144748985767365, "learning_rate": 0.0002, "loss": 0.5300607681274414, "mean_token_accuracy": 0.7847813218832016, "num_tokens": 12007487.0, "step": 737 }, { "entropy": 0.5281567052006721, "epoch": 2.755140186915888, "grad_norm": 0.04088376462459564, "learning_rate": 0.0002, "loss": 0.5294374823570251, "mean_token_accuracy": 0.7852809429168701, "num_tokens": 12023900.0, "step": 738 }, { "entropy": 0.5510239601135254, "epoch": 2.7588785046728974, "grad_norm": 0.04011458903551102, "learning_rate": 0.0002, "loss": 0.5465855002403259, "mean_token_accuracy": 0.7779260277748108, "num_tokens": 12040338.0, "step": 739 }, { "entropy": 0.57439024746418, "epoch": 2.762616822429907, "grad_norm": 0.036590199917554855, "learning_rate": 0.0002, "loss": 0.5653122663497925, "mean_token_accuracy": 0.7694305032491684, "num_tokens": 12056958.0, "step": 740 }, { "entropy": 0.5615127831697464, "epoch": 2.7663551401869158, "grad_norm": 0.036815449595451355, "learning_rate": 0.0002, "loss": 0.550983190536499, "mean_token_accuracy": 0.7743483930826187, "num_tokens": 12073644.0, "step": 741 }, { "entropy": 0.5349987298250198, "epoch": 2.770093457943925, "grad_norm": 0.03783464804291725, "learning_rate": 0.0002, "loss": 0.5378219485282898, "mean_token_accuracy": 0.7834212332963943, "num_tokens": 12090085.0, "step": 742 }, { "entropy": 0.5288607105612755, "epoch": 2.7738317757009345, "grad_norm": 0.047371115535497665, "learning_rate": 0.0002, "loss": 0.5444093346595764, "mean_token_accuracy": 0.7794700562953949, "num_tokens": 12106341.0, "step": 743 }, { "entropy": 0.5414262413978577, "epoch": 2.777570093457944, "grad_norm": 0.04306622967123985, "learning_rate": 0.0002, "loss": 0.548575222492218, "mean_token_accuracy": 0.7780982106924057, "num_tokens": 12122689.0, "step": 744 }, { "entropy": 0.5265444070100784, "epoch": 2.7813084112149533, "grad_norm": 0.038641780614852905, "learning_rate": 0.0002, "loss": 0.5287938117980957, "mean_token_accuracy": 0.7837643325328827, "num_tokens": 12138802.0, "step": 745 }, { "entropy": 0.5466189384460449, "epoch": 2.7850467289719627, "grad_norm": 0.0338594987988472, "learning_rate": 0.0002, "loss": 0.5439702272415161, "mean_token_accuracy": 0.7782793641090393, "num_tokens": 12154981.0, "step": 746 }, { "entropy": 0.5158288925886154, "epoch": 2.788785046728972, "grad_norm": 0.040148280560970306, "learning_rate": 0.0002, "loss": 0.5098775625228882, "mean_token_accuracy": 0.7936903238296509, "num_tokens": 12171278.0, "step": 747 }, { "entropy": 0.5605306029319763, "epoch": 2.792523364485981, "grad_norm": 0.03989556431770325, "learning_rate": 0.0002, "loss": 0.5507832169532776, "mean_token_accuracy": 0.7760983258485794, "num_tokens": 12187732.0, "step": 748 }, { "entropy": 0.561933159828186, "epoch": 2.7962616822429904, "grad_norm": 0.04341628775000572, "learning_rate": 0.0002, "loss": 0.5628443956375122, "mean_token_accuracy": 0.7725982367992401, "num_tokens": 12204073.0, "step": 749 }, { "entropy": 0.5275013446807861, "epoch": 2.8, "grad_norm": 0.04758904501795769, "learning_rate": 0.0002, "loss": 0.5401396751403809, "mean_token_accuracy": 0.7802035212516785, "num_tokens": 12220319.0, "step": 750 }, { "entropy": 0.5415465384721756, "epoch": 2.803738317757009, "grad_norm": 0.04323052614927292, "learning_rate": 0.0002, "loss": 0.5467565059661865, "mean_token_accuracy": 0.7801296561956406, "num_tokens": 12236798.0, "step": 751 }, { "entropy": 0.5384011566638947, "epoch": 2.8074766355140186, "grad_norm": 0.04094940423965454, "learning_rate": 0.0002, "loss": 0.5408844947814941, "mean_token_accuracy": 0.7790292948484421, "num_tokens": 12253226.0, "step": 752 }, { "entropy": 0.5556510388851166, "epoch": 2.811214953271028, "grad_norm": 0.037975817918777466, "learning_rate": 0.0002, "loss": 0.5480787754058838, "mean_token_accuracy": 0.7771931290626526, "num_tokens": 12269489.0, "step": 753 }, { "entropy": 0.5475790053606033, "epoch": 2.8149532710280374, "grad_norm": 0.041421882808208466, "learning_rate": 0.0002, "loss": 0.5383135676383972, "mean_token_accuracy": 0.7827092558145523, "num_tokens": 12285892.0, "step": 754 }, { "entropy": 0.5555797815322876, "epoch": 2.8186915887850468, "grad_norm": 0.03941413015127182, "learning_rate": 0.0002, "loss": 0.552151083946228, "mean_token_accuracy": 0.7751595675945282, "num_tokens": 12302269.0, "step": 755 }, { "entropy": 0.5256431847810745, "epoch": 2.822429906542056, "grad_norm": 0.040782686322927475, "learning_rate": 0.0002, "loss": 0.5262829661369324, "mean_token_accuracy": 0.7846409976482391, "num_tokens": 12318521.0, "step": 756 }, { "entropy": 0.538894459605217, "epoch": 2.8261682242990656, "grad_norm": 0.052266813814640045, "learning_rate": 0.0002, "loss": 0.5539013147354126, "mean_token_accuracy": 0.7756392508745193, "num_tokens": 12334819.0, "step": 757 }, { "entropy": 0.5483682453632355, "epoch": 2.829906542056075, "grad_norm": 0.04095127433538437, "learning_rate": 0.0002, "loss": 0.5520408749580383, "mean_token_accuracy": 0.7747367471456528, "num_tokens": 12351218.0, "step": 758 }, { "entropy": 0.5276503935456276, "epoch": 2.8336448598130843, "grad_norm": 0.04603305831551552, "learning_rate": 0.0002, "loss": 0.5317422151565552, "mean_token_accuracy": 0.780977338552475, "num_tokens": 12367390.0, "step": 759 }, { "entropy": 0.5502448529005051, "epoch": 2.8373831775700937, "grad_norm": 0.04640703275799751, "learning_rate": 0.0002, "loss": 0.5535072684288025, "mean_token_accuracy": 0.7761691957712173, "num_tokens": 12383960.0, "step": 760 }, { "entropy": 0.547056645154953, "epoch": 2.8411214953271027, "grad_norm": 0.033438824117183685, "learning_rate": 0.0002, "loss": 0.5412831902503967, "mean_token_accuracy": 0.7795712947845459, "num_tokens": 12400550.0, "step": 761 }, { "entropy": 0.5364657193422318, "epoch": 2.844859813084112, "grad_norm": 0.04271340370178223, "learning_rate": 0.0002, "loss": 0.5346530079841614, "mean_token_accuracy": 0.7835509330034256, "num_tokens": 12417061.0, "step": 762 }, { "entropy": 0.5455985963344574, "epoch": 2.8485981308411215, "grad_norm": 0.03856063261628151, "learning_rate": 0.0002, "loss": 0.5402116179466248, "mean_token_accuracy": 0.7816472351551056, "num_tokens": 12433548.0, "step": 763 }, { "entropy": 0.532633364200592, "epoch": 2.852336448598131, "grad_norm": 0.039442550390958786, "learning_rate": 0.0002, "loss": 0.5322520732879639, "mean_token_accuracy": 0.783360943198204, "num_tokens": 12449702.0, "step": 764 }, { "entropy": 0.5533113479614258, "epoch": 2.8560747663551402, "grad_norm": 0.03981044888496399, "learning_rate": 0.0002, "loss": 0.5526716113090515, "mean_token_accuracy": 0.7752720266580582, "num_tokens": 12465797.0, "step": 765 }, { "entropy": 0.5458943992853165, "epoch": 2.8598130841121496, "grad_norm": 0.043415430933237076, "learning_rate": 0.0002, "loss": 0.5514388084411621, "mean_token_accuracy": 0.7782578617334366, "num_tokens": 12482100.0, "step": 766 }, { "entropy": 0.5316417217254639, "epoch": 2.863551401869159, "grad_norm": 0.03658653050661087, "learning_rate": 0.0002, "loss": 0.5376189947128296, "mean_token_accuracy": 0.7812371999025345, "num_tokens": 12498442.0, "step": 767 }, { "entropy": 0.5365964025259018, "epoch": 2.867289719626168, "grad_norm": 0.04015335068106651, "learning_rate": 0.0002, "loss": 0.5381023287773132, "mean_token_accuracy": 0.7802128046751022, "num_tokens": 12514722.0, "step": 768 }, { "entropy": 0.5392501503229141, "epoch": 2.8710280373831774, "grad_norm": 0.04526032134890556, "learning_rate": 0.0002, "loss": 0.5440354347229004, "mean_token_accuracy": 0.7788137197494507, "num_tokens": 12531173.0, "step": 769 }, { "entropy": 0.5416650772094727, "epoch": 2.8747663551401867, "grad_norm": 0.03573603555560112, "learning_rate": 0.0002, "loss": 0.5344440340995789, "mean_token_accuracy": 0.782467320561409, "num_tokens": 12547297.0, "step": 770 }, { "entropy": 0.537946805357933, "epoch": 2.878504672897196, "grad_norm": 0.043754760175943375, "learning_rate": 0.0002, "loss": 0.5369762778282166, "mean_token_accuracy": 0.7813331335783005, "num_tokens": 12563639.0, "step": 771 }, { "entropy": 0.5417525321245193, "epoch": 2.8822429906542055, "grad_norm": 0.03892975300550461, "learning_rate": 0.0002, "loss": 0.5408830642700195, "mean_token_accuracy": 0.7807131111621857, "num_tokens": 12579951.0, "step": 772 }, { "entropy": 0.5286070853471756, "epoch": 2.885981308411215, "grad_norm": 0.041709210723638535, "learning_rate": 0.0002, "loss": 0.5315775275230408, "mean_token_accuracy": 0.7836516797542572, "num_tokens": 12596427.0, "step": 773 }, { "entropy": 0.5347200036048889, "epoch": 2.8897196261682243, "grad_norm": 0.04162106290459633, "learning_rate": 0.0002, "loss": 0.5488803386688232, "mean_token_accuracy": 0.7781624644994736, "num_tokens": 12612693.0, "step": 774 }, { "entropy": 0.5630818009376526, "epoch": 2.8934579439252337, "grad_norm": 0.03779264912009239, "learning_rate": 0.0002, "loss": 0.5618957281112671, "mean_token_accuracy": 0.7714088261127472, "num_tokens": 12629093.0, "step": 775 }, { "entropy": 0.5579015165567398, "epoch": 2.897196261682243, "grad_norm": 0.04071388393640518, "learning_rate": 0.0002, "loss": 0.5509809255599976, "mean_token_accuracy": 0.7759078145027161, "num_tokens": 12645440.0, "step": 776 }, { "entropy": 0.5593527257442474, "epoch": 2.9009345794392525, "grad_norm": 0.041921358555555344, "learning_rate": 0.0002, "loss": 0.5505045056343079, "mean_token_accuracy": 0.7758798003196716, "num_tokens": 12661819.0, "step": 777 }, { "entropy": 0.5402603298425674, "epoch": 2.904672897196262, "grad_norm": 0.03740124776959419, "learning_rate": 0.0002, "loss": 0.5350624322891235, "mean_token_accuracy": 0.7829450070858002, "num_tokens": 12678029.0, "step": 778 }, { "entropy": 0.5501836538314819, "epoch": 2.9084112149532713, "grad_norm": 0.03699700906872749, "learning_rate": 0.0002, "loss": 0.5496166944503784, "mean_token_accuracy": 0.7787871360778809, "num_tokens": 12694566.0, "step": 779 }, { "entropy": 0.5449737459421158, "epoch": 2.91214953271028, "grad_norm": 0.03947729989886284, "learning_rate": 0.0002, "loss": 0.5487996935844421, "mean_token_accuracy": 0.7771195471286774, "num_tokens": 12711096.0, "step": 780 }, { "entropy": 0.509773313999176, "epoch": 2.9158878504672896, "grad_norm": 0.04015858471393585, "learning_rate": 0.0002, "loss": 0.5180044174194336, "mean_token_accuracy": 0.7871870398521423, "num_tokens": 12727181.0, "step": 781 }, { "entropy": 0.5145790874958038, "epoch": 2.919626168224299, "grad_norm": 0.04480452463030815, "learning_rate": 0.0002, "loss": 0.517657995223999, "mean_token_accuracy": 0.7905906438827515, "num_tokens": 12743263.0, "step": 782 }, { "entropy": 0.536189079284668, "epoch": 2.9233644859813084, "grad_norm": 0.0368233323097229, "learning_rate": 0.0002, "loss": 0.5374237895011902, "mean_token_accuracy": 0.7814907878637314, "num_tokens": 12759582.0, "step": 783 }, { "entropy": 0.5301052629947662, "epoch": 2.9271028037383178, "grad_norm": 0.036369625478982925, "learning_rate": 0.0002, "loss": 0.5254780054092407, "mean_token_accuracy": 0.7876885831356049, "num_tokens": 12775680.0, "step": 784 }, { "entropy": 0.5395437628030777, "epoch": 2.930841121495327, "grad_norm": 0.037106823176145554, "learning_rate": 0.0002, "loss": 0.5353831648826599, "mean_token_accuracy": 0.7856823652982712, "num_tokens": 12791849.0, "step": 785 }, { "entropy": 0.5460378974676132, "epoch": 2.9345794392523366, "grad_norm": 0.0374838188290596, "learning_rate": 0.0002, "loss": 0.5441444516181946, "mean_token_accuracy": 0.7800013571977615, "num_tokens": 12808470.0, "step": 786 }, { "entropy": 0.5510992407798767, "epoch": 2.938317757009346, "grad_norm": 0.03663073852658272, "learning_rate": 0.0002, "loss": 0.5466246604919434, "mean_token_accuracy": 0.7789618521928787, "num_tokens": 12824709.0, "step": 787 }, { "entropy": 0.5445446521043777, "epoch": 2.942056074766355, "grad_norm": 0.03850307688117027, "learning_rate": 0.0002, "loss": 0.5457326769828796, "mean_token_accuracy": 0.779052123427391, "num_tokens": 12841079.0, "step": 788 }, { "entropy": 0.5365033894777298, "epoch": 2.9457943925233643, "grad_norm": 0.04035929962992668, "learning_rate": 0.0002, "loss": 0.5459482073783875, "mean_token_accuracy": 0.7797062546014786, "num_tokens": 12857523.0, "step": 789 }, { "entropy": 0.535067155957222, "epoch": 2.9495327102803737, "grad_norm": 0.04887193441390991, "learning_rate": 0.0002, "loss": 0.5398947596549988, "mean_token_accuracy": 0.7823842316865921, "num_tokens": 12874241.0, "step": 790 }, { "entropy": 0.5346145331859589, "epoch": 2.953271028037383, "grad_norm": 0.03713555634021759, "learning_rate": 0.0002, "loss": 0.5383285880088806, "mean_token_accuracy": 0.7822743952274323, "num_tokens": 12890347.0, "step": 791 }, { "entropy": 0.5538973659276962, "epoch": 2.9570093457943925, "grad_norm": 0.042103007435798645, "learning_rate": 0.0002, "loss": 0.5548110604286194, "mean_token_accuracy": 0.7737681418657303, "num_tokens": 12906728.0, "step": 792 }, { "entropy": 0.5500922650098801, "epoch": 2.960747663551402, "grad_norm": 0.03705638647079468, "learning_rate": 0.0002, "loss": 0.5455094575881958, "mean_token_accuracy": 0.7803948670625687, "num_tokens": 12923166.0, "step": 793 }, { "entropy": 0.562080979347229, "epoch": 2.9644859813084112, "grad_norm": 0.045153554528951645, "learning_rate": 0.0002, "loss": 0.5568199157714844, "mean_token_accuracy": 0.7736331224441528, "num_tokens": 12939504.0, "step": 794 }, { "entropy": 0.5559557229280472, "epoch": 2.9682242990654206, "grad_norm": 0.04255378246307373, "learning_rate": 0.0002, "loss": 0.5531718134880066, "mean_token_accuracy": 0.7762871235609055, "num_tokens": 12955898.0, "step": 795 }, { "entropy": 0.5435759872198105, "epoch": 2.97196261682243, "grad_norm": 0.03799128159880638, "learning_rate": 0.0002, "loss": 0.5441620349884033, "mean_token_accuracy": 0.7793318778276443, "num_tokens": 12972346.0, "step": 796 }, { "entropy": 0.5359157919883728, "epoch": 2.9757009345794394, "grad_norm": 0.05715997889637947, "learning_rate": 0.0002, "loss": 0.5515891909599304, "mean_token_accuracy": 0.7771831452846527, "num_tokens": 12988848.0, "step": 797 }, { "entropy": 0.5230652317404747, "epoch": 2.979439252336449, "grad_norm": 0.04036436975002289, "learning_rate": 0.0002, "loss": 0.5234889388084412, "mean_token_accuracy": 0.7856348752975464, "num_tokens": 13004832.0, "step": 798 }, { "entropy": 0.5457260459661484, "epoch": 2.983177570093458, "grad_norm": 0.04120893031358719, "learning_rate": 0.0002, "loss": 0.5378625392913818, "mean_token_accuracy": 0.7840824872255325, "num_tokens": 13021226.0, "step": 799 }, { "entropy": 0.5480275601148605, "epoch": 2.986915887850467, "grad_norm": 0.050067413598299026, "learning_rate": 0.0002, "loss": 0.5414943099021912, "mean_token_accuracy": 0.7796735763549805, "num_tokens": 13037664.0, "step": 800 }, { "entropy": 0.5385295897722244, "epoch": 2.9906542056074765, "grad_norm": 0.03477542847394943, "learning_rate": 0.0002, "loss": 0.5353237390518188, "mean_token_accuracy": 0.7814339101314545, "num_tokens": 13053836.0, "step": 801 }, { "entropy": 0.5408166199922562, "epoch": 2.994392523364486, "grad_norm": 0.038822371512651443, "learning_rate": 0.0002, "loss": 0.5407392382621765, "mean_token_accuracy": 0.7796344310045242, "num_tokens": 13070132.0, "step": 802 }, { "entropy": 0.533338338136673, "epoch": 2.9981308411214953, "grad_norm": 0.04834038019180298, "learning_rate": 0.0002, "loss": 0.5456323027610779, "mean_token_accuracy": 0.7770627439022064, "num_tokens": 13086317.0, "step": 803 }, { "entropy": 0.520211398601532, "epoch": 3.0, "grad_norm": 0.04815197363495827, "learning_rate": 0.0002, "loss": 0.5207195281982422, "mean_token_accuracy": 0.7871742844581604, "num_tokens": 13094581.0, "step": 804 } ], "logging_steps": 1, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2192829660484076e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }